diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 85050a84..6f927a59 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -89,9 +89,10 @@ jobs:
                 . ./ci-support-v0
 
                 export PYTEST_ADDOPTS="-k 'not slowtest'"
-                
-                if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "rename-nterms" ]]; then
-                   DOWNSTREAM_PROJECT=https://github.com/gaohao95/pytential.git@rename-nterms
+
+                if [[ "$GITHUB_HEAD_REF" == "towards-array-context" ]]; then
+                   DOWNSTREAM_PROJECT=https://github.com/alexfikl/${DOWNSTREAM_PROJECT}.git@towards-array-context
                 fi
                 test_downstream "$DOWNSTREAM_PROJECT"
+
 # vim: sw=4
diff --git a/.pylintrc-local.yml b/.pylintrc-local.yml
index 3d83b68d..dc1459d9 100644
--- a/.pylintrc-local.yml
+++ b/.pylintrc-local.yml
@@ -1,6 +1,2 @@
 - arg: extension-pkg-whitelist
   val: pyfmmlib
-
-# Needed for boxtree.tools
-- arg: init-hook
-  val: import sys; sys.setrecursionlimit(2000)
diff --git a/boxtree/__init__.py b/boxtree/__init__.py
index 85080390..819206c6 100644
--- a/boxtree/__init__.py
+++ b/boxtree/__init__.py
@@ -21,22 +21,22 @@
 """
 
 from boxtree.tree import Tree, TreeWithLinkedPointSources, box_flags_enum
-from boxtree.tree_build import TreeBuilder
+from boxtree.tree_build import TreeBuilder, build_tree
 
 __all__ = [
     "Tree", "TreeWithLinkedPointSources",
-    "TreeBuilder", "box_flags_enum"]
+    "TreeBuilder", "build_tree", "box_flags_enum"]
 
 __doc__ = r"""
 :mod:`boxtree` can do three main things:
 
 * it can sort particles into an adaptively refined quad/octree,
-  see :class:`boxtree.Tree` and :class:`boxtree.TreeBuilder`.
+  see :class:`boxtree.Tree` and :func:`boxtree.build_tree`.
 
 * it can compute fast-multipole-like interaction lists on this tree structure,
-  see :mod:`boxtree.traversal`. Note that while this traversal generation
-  builds on the result of particle sorting,
-  it is completely distinct in the software sense.
+  see :mod:`boxtree.traversal`. Note that, while this traversal generation
+  builds on the result of particle sorting, it is completely distinct in the
+  software sense.
 
 * It can compute geometric lookup structures based on a :class:`boxtree.Tree`,
   see :mod:`boxtree.area_query`.
@@ -48,16 +48,16 @@
 
 * one where no distinction is made between sources and targets. In this mode,
   all participants in the interaction are called 'particles'.
-  (``targets is None`` in the call to :meth:`boxtree.TreeBuilder.__call__`)
+  (``targets`` is *None* in the call to :func:`boxtree.build_tree`)
 
 * one where a distinction between sources and targets is made.
-  (``targets is not None`` in the call to :meth:`boxtree.TreeBuilder.__call__`)
+  (``targets`` is not *None* in the call to :func:`boxtree.build_tree`)
 
 * one where a distinction between sources and targets is made,
   and where sources and/or targets are considered to have an extent, given by an
-  :math:`l^\infty` radius.
-  (``targets is not None`` and ``source_radii is not None or target_radii is
-  not None`` in the call to :meth:`boxtree.TreeBuilder.__call__`)
+  :math:`\ell^p` radius.
+  (``targets`` is not *None* and ``source_radii`` is not *None* or
+  ``target_radii`` is not *None* in the call to :func:`boxtree.build_tree`)
 
   If sources have an extent, it is possible to 'link' each source with a number
   of point sources. For this case, it is important to internalize this bit of
@@ -69,9 +69,8 @@
 -------------------------------
 
 :attr:`Tree.source_radii` and :attr:`Tree.target_radii` specify the
-radii of of :math:`l^\infty` 'circles' (that is, squares) centered at
-:attr:`Tree.sources` and :attr:`Tree.targets` that contain the entire
-extent of that source or target.
+radii of :math:`\ell^p` 'circles' centered at :attr:`Tree.sources` and
+:attr:`Tree.targets` that contain the entire extent of that source or target.
 
 :mod:`boxtree.traversal` guarantees that, in generating traversals, all
 interactions to targets within the source extent and from sources within the
@@ -89,9 +88,9 @@
 * **user target order**
 * **tree target order** (tree/box-sorted)
 
-:attr:`Tree.user_source_ids` helps translate source arrays into
-tree order for processing. :attr:`Tree.sorted_target_ids`
-helps translate potentials back into user target order for output.
+:attr:`Tree.user_source_ids` helps translate source arrays into tree order for
+processing. :attr:`Tree.sorted_target_ids` helps translate potentials back into
+user target order for output.
 
 If each 'original' source above is linked to a number of point sources,
 the point sources have their own orderings:
@@ -107,24 +106,23 @@
 CSR-like interaction list storage
 ---------------------------------
 
-Many list-like data structures in :mod:`boxtree` consists of
-two arrays, one whose name ends in ``_starts``, and another whose
-name ends in ``_lists``. For example,
-suppose we would like to find the colleagues of box #17 using
-:attr:`boxtree.traversal.FMMTraversalInfo.colleagues_starts`
+Many list-like data structures in :mod:`boxtree` consist of two arrays, one
+whose name ends in ``_starts``, and another whose name ends in ``_lists``. For
+example, suppose we would like to find the colleagues of box #17 using
+:attr:`boxtree.traversal.FMMTraversalInfo.same_level_non_well_sep_boxes_starts`
 and
-:attr:`boxtree.traversal.FMMTraversalInfo.colleagues_lists`.
+:attr:`boxtree.traversal.FMMTraversalInfo.same_level_non_well_sep_boxes_lists`.
 
 The following snippet of code achieves this::
 
     ibox = 17
-    start, end = colleagues_starts[ibox:ibox+2]
-    ibox_colleagues = colleagues_lists[start:end]
+    start, end = same_level_non_well_sep_boxes_starts[ibox:ibox+2]
+    ibox_colleagues = same_level_non_well_sep_boxes_lists[start:end]
 
 This indexing scheme has the following properties:
 
 * If the underlying indexing array (say the list of all boxes) has *n* entries,
-  then the ``_starts`` array has *n+1* entries. The very last entry determines
+  then the ``_starts`` array has *n + 1* entries. The very last entry determines
   the length of the last list.
 
 * The lists in ``_lists`` are stored contiguously. The start of the next list
diff --git a/boxtree/area_query.py b/boxtree/area_query.py
index c49b4253..83f258d5 100644
--- a/boxtree/area_query.py
+++ b/boxtree/area_query.py
@@ -29,12 +29,12 @@
 from pyopencl.elementwise import ElementwiseTemplate
 
 from arraycontext import Array
-from pytools import memoize_method, ProcessLogger
+from pytools import ProcessLogger, memoize_on_first_arg
 from mako.template import Template
 
 from boxtree.tree import Tree
 from boxtree.tools import (
-    InlineBinarySearch, get_coord_vec_dtype, coord_vec_subscript_code)
+    inline_binary_search_for_type, get_coord_vec_dtype, coord_vec_subscript_code)
 from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
 
 import logging
@@ -45,116 +45,31 @@
 Area queries (Balls -> overlapping leaves)
 ------------------------------------------
 
-.. autoclass:: AreaQueryBuilder
-
 .. autoclass:: AreaQueryResult
+.. autofunction:: build_area_query
 
 
 Inverse of area query (Leaves -> overlapping balls)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. autoclass:: LeavesToBallsLookupBuilder
-
 .. autoclass:: LeavesToBallsLookup
-
+.. autofunction:: build_leaves_to_balls_lookup
 
 Space invader queries
 ^^^^^^^^^^^^^^^^^^^^^
 
-.. autoclass:: SpaceInvaderQueryBuilder
-
+.. autofunction:: build_space_invader_query
 
 Peer Lists
 ^^^^^^^^^^
 
 Area queries are implemented using peer lists.
 
-.. autoclass:: PeerListFinder
-
 .. autoclass:: PeerListLookup
-
+.. autofunction:: build_peer_list
 """
 
 
-# {{{ output
-
-@dataclass_array_container
-@dataclass(frozen=True)
-class PeerListLookup:
-    """
-    .. attribute:: tree
-
-        The :class:`boxtree.Tree` instance used to build this lookup.
-
-    .. attribute:: peer_list_starts
-
-        Indices into :attr:`peer_lists`.
-        ``peer_lists[peer_list_starts[box_id]:peer_list_starts[box_id]+1]``
-        contains the list of peer boxes of box `box_id`.
-
-    .. attribute:: peer_lists
-
-    .. versionadded:: 2016.1
-    """
-
-    tree: Tree
-    peer_list_starts: Array
-    peer_lists: Array
-
-
-@dataclass_array_container
-@dataclass(frozen=True)
-class AreaQueryResult:
-    """
-    .. attribute:: tree
-
-        The :class:`boxtree.Tree` instance used to build this lookup.
-
-    .. attribute:: leaves_near_ball_starts
-
-        Indices into :attr:`leaves_near_ball_lists`.
-        ``leaves_near_ball_lists[leaves_near_ball_starts[ball_nr]:
-        leaves_near_ball_starts[ball_nr]+1]``
-        results in a list of leaf boxes that intersect `ball_nr`.
-
-    .. attribute:: leaves_near_ball_lists
-
-    .. versionadded:: 2016.1
-    """
-
-    tree: Tree
-    leaves_near_ball_starts: Array
-    leaves_near_ball_lists: Array
-
-
-@dataclass_array_container
-@dataclass(frozen=True)
-class LeavesToBallsLookup:
-    """
-    .. attribute:: tree
-
-        The :class:`boxtree.Tree` instance used to build this lookup.
-
-    .. attribute:: balls_near_box_starts
-
-        Indices into :attr:`balls_near_box_lists`.
-        ``balls_near_box_lists[balls_near_box_starts[ibox]:
-        balls_near_box_starts[ibox]+1]``
-        results in a list of balls that overlap leaf box *ibox*.
-
-        .. note:: Only leaf boxes have non-empty entries in this table. Nonetheless,
-            this list is indexed by the global box index.
-
-    .. attribute:: balls_near_box_lists
-    """
-
-    tree: Tree
-    balls_near_box_starts: Array
-    balls_near_box_lists: Array
-
-# }}}
-
-
 # {{{ kernel templates
 
 GUIDING_BOX_FINDER_MACRO = r"""//CL:mako//
@@ -471,7 +386,7 @@ class LeavesToBallsLookup:
     dst[i] = bsearch(starts, starts_len, i);
     """,
     name="starts_expander",
-    preamble=str(InlineBinarySearch("idx_t")))
+    preamble=inline_binary_search_for_type("idx_t"))
 
 # }}}
 
@@ -538,13 +453,13 @@ def generate(self, context,
                  dimensions, coord_dtype, box_id_dtype,
                  peer_list_idx_dtype, max_levels,
                  extra_var_values=(), extra_type_aliases=(),
-                 extra_preamble=""):
+                 extra_preamble="",
+                 root_extent_stretch_factor=1.0e-4):
         from pyopencl.tools import dtype_to_ctype
 
         from boxtree import box_flags_enum
         from boxtree.tools import AXIS_NAMES
         from boxtree.traversal import TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES
-        from boxtree.tree_build import TreeBuilder
 
         from pyopencl.cltypes import vec_types
         render_vars = (
@@ -561,7 +476,7 @@ def generate(self, context,
             ("box_flags_enum", box_flags_enum),
             ("peer_list_idx_dtype", peer_list_idx_dtype),
             ("debug", False),
-            ("root_extent_stretch_factor", TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR),
+            ("root_extent_stretch_factor", root_extent_stretch_factor),
 
             # FIXME This gets used in pytential with a template that still uses this:
             ("vec_types", tuple(vec_types.items())),
@@ -638,153 +553,186 @@ def generate(self, context,
 # {{{ area query build
 
 class AreaQueryBuilder:
-    r"""Given a set of :math:`l^\infty` "balls", this class helps build a
-    look-up table from ball to leaf boxes that intersect with the ball.
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
+                 ball_centers, ball_radii, peer_lists=None,
+                 wait_for=None):
+        from warnings import warn
+        warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. "
+            "Use 'build_area_query' instead.",
+            DeprecationWarning, stacklevel=2)
+
+        result = build_area_query(
+            actx, tree, ball_centers, ball_radii, peer_lists)
+
+        return result, None
+
+
+@dataclass_array_container
+@dataclass(frozen=True)
+class AreaQueryResult:
+    """
+    .. attribute:: tree
+
+        The :class:`boxtree.Tree` instance used to build this lookup.
+
+    .. attribute:: leaves_near_ball_starts
+
+        Indices into :attr:`leaves_near_ball_lists`.
+        ``leaves_near_ball_lists[leaves_near_ball_starts[ball_nr]:
+        leaves_near_ball_starts[ball_nr+1]]``
+        results in a list of leaf boxes that intersect `ball_nr`.
+
+    .. attribute:: leaves_near_ball_lists
 
     .. versionadded:: 2016.1
+    """
 
-    .. automethod:: __init__
-    .. automethod:: __call__
+    tree: Tree
+    leaves_near_ball_starts: Array
+    leaves_near_ball_lists: Array
+
+
+@memoize_on_first_arg
+def get_area_query_kernel(
+        actx: PyOpenCLArrayContext,
+        dimensions: int,
+        coord_dtype: "np.dtype",
+        box_id_dtype: "np.dtype",
+        ball_id_dtype: "np.dtype",
+        peer_list_idx_dtype: "np.dtype",
+        max_levels: int,
+        root_extent_stretch_factor: float):
+    from pyopencl.tools import dtype_to_ctype
+
+    from boxtree import box_flags_enum
+    from boxtree.tools import AXIS_NAMES
+    from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE
+
+    logger.debug("start building area query kernel")
+
+    template = Template(
+        TRAVERSAL_PREAMBLE_TEMPLATE
+        + AREA_QUERY_TEMPLATE,
+        strict_undefined=True)
+
+    render_vars = dict(
+        np=np,
+        dimensions=dimensions,
+        dtype_to_ctype=dtype_to_ctype,
+        box_id_dtype=box_id_dtype,
+        particle_id_dtype=None,
+        coord_dtype=coord_dtype,
+        get_coord_vec_dtype=get_coord_vec_dtype,
+        cvec_sub=partial(coord_vec_subscript_code, dimensions),
+        max_levels=max_levels,
+        AXIS_NAMES=AXIS_NAMES,
+        box_flags_enum=box_flags_enum,
+        peer_list_idx_dtype=peer_list_idx_dtype,
+        ball_id_dtype=ball_id_dtype,
+        debug=False,
+        root_extent_stretch_factor=root_extent_stretch_factor)
+
+    from boxtree.tools import VectorArg, ScalarArg
+    arg_decls = [
+        VectorArg(coord_dtype, "box_centers", with_offset=False),
+        ScalarArg(coord_dtype, "root_extent"),
+        VectorArg(np.uint8, "box_levels"),
+        ScalarArg(box_id_dtype, "aligned_nboxes"),
+        VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
+        VectorArg(box_flags_enum.dtype, "box_flags"),
+        VectorArg(peer_list_idx_dtype, "peer_list_starts"),
+        VectorArg(box_id_dtype, "peer_lists"),
+        VectorArg(coord_dtype, "ball_radii"),
+        ] + [
+        ScalarArg(coord_dtype, "bbox_min_"+ax)
+        for ax in AXIS_NAMES[:dimensions]
+        ] + [
+        VectorArg(coord_dtype, "ball_"+ax)
+        for ax in AXIS_NAMES[:dimensions]]
+
+    from pyopencl.algorithm import ListOfListsBuilder
+    area_query_knl = ListOfListsBuilder(
+        actx.context,
+        [("leaves", box_id_dtype)],
+        str(template.render(**render_vars)),
+        arg_decls=arg_decls,
+        name_prefix="area_query",
+        count_sharing={},
+        complex_kernel=True)
+
+    logger.debug("done building area query kernel")
+    return area_query_knl
+
+
+def build_area_query(
+        actx: PyOpenCLArrayContext, tree: Tree,
+        ball_centers, ball_radii, peer_lists=None) -> AreaQueryResult:
+    r"""Given a set of :math:`l^\infty` "balls", this function builds a
+    look-up table from ball to leaf boxes that intersect with the ball.
+
+    :arg ball_centers: an object array of coordinates. Their *dtype* must
+        match *tree*'s :attr:`boxtree.Tree.coord_dtype`.
+    :arg ball_radii: an array of positive numbers. Its *dtype* must match
+        *tree*'s :attr:`boxtree.Tree.coord_dtype`.
+    :arg peer_lists: may either be *None* or an instance of
+        :class:`PeerListLookup` associated with `tree`.
     """
-    def __init__(self, array_context: PyOpenCLArrayContext):
-        self._setup_actx = array_context
-        self.peer_list_finder = PeerListFinder(array_context)
 
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
+    # {{{ input check
 
-    # {{{ Kernel generation
+    from pytools import single_valued
+    if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
+        raise TypeError("ball_centers dtype must match tree.coord_dtype")
 
-    @memoize_method
-    def get_area_query_kernel(self, dimensions, coord_dtype, box_id_dtype,
-                              ball_id_dtype, peer_list_idx_dtype, max_levels):
-        from pyopencl.tools import dtype_to_ctype
+    if ball_radii.dtype != tree.coord_dtype:
+        raise TypeError("ball_radii dtype must match tree.coord_dtype")
 
-        from boxtree import box_flags_enum
-        from boxtree.tools import AXIS_NAMES
-        from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE
-        from boxtree.tree_build import TreeBuilder
-
-        logger.debug("start building area query kernel")
-
-        template = Template(
-            TRAVERSAL_PREAMBLE_TEMPLATE
-            + AREA_QUERY_TEMPLATE,
-            strict_undefined=True)
-
-        render_vars = dict(
-            np=np,
-            dimensions=dimensions,
-            dtype_to_ctype=dtype_to_ctype,
-            box_id_dtype=box_id_dtype,
-            particle_id_dtype=None,
-            coord_dtype=coord_dtype,
-            get_coord_vec_dtype=get_coord_vec_dtype,
-            cvec_sub=partial(coord_vec_subscript_code, dimensions),
-            max_levels=max_levels,
-            AXIS_NAMES=AXIS_NAMES,
-            box_flags_enum=box_flags_enum,
-            peer_list_idx_dtype=peer_list_idx_dtype,
-            ball_id_dtype=ball_id_dtype,
-            debug=False,
-            root_extent_stretch_factor=TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR)
-
-        from boxtree.tools import VectorArg, ScalarArg
-        arg_decls = [
-            VectorArg(coord_dtype, "box_centers", with_offset=False),
-            ScalarArg(coord_dtype, "root_extent"),
-            VectorArg(np.uint8, "box_levels"),
-            ScalarArg(box_id_dtype, "aligned_nboxes"),
-            VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
-            VectorArg(box_flags_enum.dtype, "box_flags"),
-            VectorArg(peer_list_idx_dtype, "peer_list_starts"),
-            VectorArg(box_id_dtype, "peer_lists"),
-            VectorArg(coord_dtype, "ball_radii"),
-            ] + [
-            ScalarArg(coord_dtype, "bbox_min_"+ax)
-            for ax in AXIS_NAMES[:dimensions]
-            ] + [
-            VectorArg(coord_dtype, "ball_"+ax)
-            for ax in AXIS_NAMES[:dimensions]]
-
-        from pyopencl.algorithm import ListOfListsBuilder
-        area_query_kernel = ListOfListsBuilder(
-            self.context,
-            [("leaves", box_id_dtype)],
-            str(template.render(**render_vars)),
-            arg_decls=arg_decls,
-            name_prefix="area_query",
-            count_sharing={},
-            complex_kernel=True)
-
-        logger.debug("done building area query kernel")
-        return area_query_kernel
+    from pytools import div_ceil
+    # Avoid generating too many kernels.
+    max_levels = div_ceil(tree.nlevels, 10) * 10
+
+    if peer_lists is None:
+        peer_lists = build_peer_list(actx, tree)
+
+    if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
+        raise ValueError("size of peer lists must match with number of boxes")
+
+    ball_id_dtype = tree.particle_id_dtype
+    peer_list_idx_dtype = peer_lists.peer_list_starts.dtype
 
     # }}}
 
-    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
-                 ball_centers, ball_radii, peer_lists=None,
-                 wait_for=None):
-        """
-        :arg ball_centers: an object array of coordinates. Their *dtype* must
-            match *tree*'s :attr:`boxtree.Tree.coord_dtype`.
-        :arg ball_radii: an array of positive numbers. Its *dtype* must match
-            *tree*'s :attr:`boxtree.Tree.coord_dtype`.
-        :arg peer_lists: may either be *None* or an instance of
-            :class:`PeerListLookup` associated with `tree`.
-        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
-            instances for whose completion this command waits before starting
-            exeuction.
-        :returns: a tuple *(aq, event)*, where *aq* is an instance of
-            :class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
-            for dependency management.
-        """
-
-        from pytools import single_valued
-        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
-            raise TypeError("ball_centers dtype must match tree.coord_dtype")
-        if ball_radii.dtype != tree.coord_dtype:
-            raise TypeError("ball_radii dtype must match tree.coord_dtype")
-
-        ball_id_dtype = tree.particle_id_dtype  # ?
-
-        from pytools import div_ceil
-        # Avoid generating too many kernels.
-        max_levels = div_ceil(tree.nlevels, 10) * 10
-
-        if peer_lists is None:
-            peer_lists, evt = self.peer_list_finder(actx, tree, wait_for=wait_for)
-            wait_for = [evt]
-
-        if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
-            raise ValueError("size of peer lists must match with number of boxes")
-
-        area_query_kernel = self.get_area_query_kernel(tree.dimensions,
-            tree.coord_dtype, tree.box_id_dtype, ball_id_dtype,
-            peer_lists.peer_list_starts.dtype, max_levels)
-
-        aq_plog = ProcessLogger(logger, "area query")
-
-        result, evt = area_query_kernel(
+    # {{{ area query
+
+    area_query_knl = get_area_query_kernel(
+        actx,
+        tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
+        ball_id_dtype, peer_list_idx_dtype, max_levels,
+        tree.root_extent_stretch_factor)
+
+    with ProcessLogger(logger, "area query"):
+        result, _ = area_query_knl(
                 actx.queue, len(ball_radii),
                 tree.box_centers.data, tree.root_extent,
                 tree.box_levels, tree.aligned_nboxes,
                 tree.box_child_ids.data, tree.box_flags,
                 peer_lists.peer_list_starts,
                 peer_lists.peer_lists, ball_radii,
-                *(tuple(tree.bounding_box[0])
-                    + tuple(bc for bc in ball_centers)),
-                wait_for=wait_for)
+                *(tuple(tree.bounding_box[0]) + tuple(bc for bc in ball_centers)),
+                allocator=actx.allocator,
+                )
 
-        aq_plog.done()
+    # }}}
 
-        result = AreaQueryResult(
-                tree=tree,
-                leaves_near_ball_starts=result["leaves"].starts,
-                leaves_near_ball_lists=result["leaves"].lists)
+    result = AreaQueryResult(
+            tree=tree,
+            leaves_near_ball_starts=result["leaves"].starts,
+            leaves_near_ball_lists=result["leaves"].lists)
 
-        return actx.freeze(result), evt
+    return actx.freeze(result)
 
 # }}}
 
@@ -792,66 +740,92 @@ def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
 # {{{ area query transpose (leaves-to-balls) lookup build
 
 class LeavesToBallsLookupBuilder:
-    r"""Given a set of :math:`l^\infty` "balls", this class helps build a
-    look-up table from leaf boxes to balls that overlap with each leaf box.
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
+                 ball_centers, ball_radii, peer_lists=None,
+                 wait_for=None):
+        from warnings import warn
+        warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. "
+            "Use 'build_leaves_to_balls_lookup' instead.",
+            DeprecationWarning, stacklevel=2)
+
+        result = build_leaves_to_balls_lookup(
+            actx, tree, ball_centers, ball_radii, peer_lists)
+
+        return result, None
+
+
+@dataclass_array_container
+@dataclass(frozen=True)
+class LeavesToBallsLookup:
+    """
+    .. attribute:: tree
+
+        The :class:`boxtree.Tree` instance used to build this lookup.
+
+    .. attribute:: balls_near_box_starts
+
+        Indices into :attr:`balls_near_box_lists`.
+        ``balls_near_box_lists[balls_near_box_starts[ibox]:
+        balls_near_box_starts[ibox+1]]``
+        results in a list of balls that overlap leaf box *ibox*.
 
-    .. automethod:: __init__
-    .. automethod:: __call__
+        .. note:: Only leaf boxes have non-empty entries in this table. Nonetheless,
+            this list is indexed by the global box index.
 
+    .. attribute:: balls_near_box_lists
     """
-    def __init__(self, array_context: PyOpenCLArrayContext):
-        from pyopencl.algorithm import KeyValueSorter
 
-        self._setup_actx = array_context
-        self.key_value_sorter = KeyValueSorter(self.context)
-        self.area_query_builder = AreaQueryBuilder(array_context)
+    tree: Tree
+    balls_near_box_starts: Array
+    balls_near_box_lists: Array
+
+
+def build_leaves_to_balls_lookup(
+        actx: PyOpenCLArrayContext, tree: Tree,
+        ball_centers, ball_radii, peer_lists=None) -> LeavesToBallsLookup:
+    r"""Given a set of :math:`l^\infty` "balls", this builds a
+    look-up table from leaf boxes to balls that overlap with each leaf box.
+
+    :arg ball_centers: an object array of coordinates. Their *dtype* must
+        match *tree*'s :attr:`boxtree.Tree.coord_dtype`.
+    :arg ball_radii: an array of positive numbers. Its *dtype* must match
+        *tree*'s :attr:`boxtree.Tree.coord_dtype`.
+    :arg peer_lists: may either be *None* or an instance of
+        :class:`PeerListLookup` associated with `tree`.
+    """
+
+    # {{{ check inputs
+
+    from pytools import single_valued
+    if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
+        raise TypeError("ball_centers dtype must match tree.coord_dtype")
 
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
+    if ball_radii.dtype != tree.coord_dtype:
+        raise TypeError("ball_radii dtype must match tree.coord_dtype")
 
-    @memoize_method
-    def get_starts_expander_kernel(self, idx_dtype):
-        """
-        Expands a "starts" array into a length starts[-1] array of increasing
-        indices:
+    # }}}
+
+    # {{{ build lookup
 
-        Eg: [0 2 5 6] => [0 0 1 1 1 2]
+    from pytools import memoize_in
 
-        """
+    @memoize_in(actx, (build_leaves_to_balls_lookup, tree.box_id_dtype))
+    def get_starts_expander_kernel():
         return STARTS_EXPANDER_TEMPLATE.build(
-                self.context,
-                type_aliases=(("idx_t", idx_dtype),))
+                actx.context,
+                type_aliases=(("idx_t", tree.box_id_dtype),))
 
-    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
-                 ball_centers, ball_radii, peer_lists=None,
-                 wait_for=None):
-        """
-        :arg ball_centers: an object array of coordinates. Their *dtype* must
-            match *tree*'s :attr:`boxtree.Tree.coord_dtype`.
-        :arg ball_radii: an array of positive numbers. Its *dtype* must match
-            *tree*'s :attr:`boxtree.Tree.coord_dtype`.
-        :arg peer_lists: may either be *None* or an instance of
-            :class:`PeerListLookup` associated with `tree`.
-        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
-            instances for whose completion this command waits before starting
-            execution.
-        :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
-            :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
-            for dependency management.
-        """
-
-        from pytools import single_valued
-        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
-            raise TypeError("ball_centers dtype must match tree.coord_dtype")
-        if ball_radii.dtype != tree.coord_dtype:
-            raise TypeError("ball_radii dtype must match tree.coord_dtype")
-
-        ltb_plog = ProcessLogger(logger, "leaves-to-balls lookup: run area query")
-
-        area_query, evt = self.area_query_builder(
-                actx, tree, ball_centers, ball_radii, peer_lists, wait_for)
-        wait_for = [evt]
+    @memoize_in(actx, (build_leaves_to_balls_lookup, "key_value_sorter"))
+    def get_key_value_sorter_kernel():
+        from pyopencl.algorithm import KeyValueSorter
+        return KeyValueSorter(actx.context)
+
+    with ProcessLogger(logger, "leaves-to-balls lookup: run area query"):
+        area_query = build_area_query(
+            actx, tree, ball_centers, ball_radii, peer_lists)
 
         logger.debug("leaves-to-balls lookup: expand starts")
 
@@ -866,34 +840,38 @@ def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
         #
         # 2. Key-value sort the (ball number, box number) pairs by box number.
 
-        starts_expander_knl = self.get_starts_expander_kernel(tree.box_id_dtype)
+        starts_expander_knl = get_starts_expander_kernel()
         expanded_starts = actx.empty(
                 len(area_query.leaves_near_ball_lists), tree.box_id_dtype)
         evt = starts_expander_knl(
                 expanded_starts,
                 area_query.leaves_near_ball_starts,
-                nballs_p_1)
-        wait_for = [evt]
+                nballs_p_1,
+                queue=actx.queue,
+                )
+        expanded_starts.add_event(evt)
 
         logger.debug("leaves-to-balls lookup: key-value sort")
 
-        balls_near_box_starts, balls_near_box_lists, evt \
-                = self.key_value_sorter(
-                        actx.queue,
-                        # keys
-                        area_query.leaves_near_ball_lists,
-                        # values
-                        expanded_starts,
-                        nkeys, starts_dtype=tree.box_id_dtype,
-                        wait_for=wait_for)
-        ltb_plog.done()
+        sorter_knl = get_key_value_sorter_kernel()
+        balls_near_box_starts, balls_near_box_lists, _ = sorter_knl(
+                actx.queue,
+                # keys
+                area_query.leaves_near_ball_lists,
+                # values
+                expanded_starts,
+                nkeys, starts_dtype=tree.box_id_dtype,
+                allocator=actx.allocator,
+                )
+
+    # }}}
 
-        lookup = LeavesToBallsLookup(
-                tree=tree,
-                balls_near_box_starts=balls_near_box_starts,
-                balls_near_box_lists=balls_near_box_lists)
+    lookup = LeavesToBallsLookup(
+            tree=tree,
+            balls_near_box_starts=balls_near_box_starts,
+            balls_near_box_lists=balls_near_box_lists)
 
-        return actx.freeze(lookup), evt
+    return actx.freeze(lookup)
 
 # }}}
 
@@ -901,6 +879,45 @@ def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
 # {{{ space invader query build
 
 class SpaceInvaderQueryBuilder:
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def __call__(self,
+            actx: PyOpenCLArrayContext, tree: Tree,
+            ball_centers, ball_radii, peer_lists=None, wait_for=None):
+        from warnings import warn
+        warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. "
+            "Use 'build_space_invader_query' instead.",
+            DeprecationWarning, stacklevel=2)
+
+        result = build_space_invader_query(
+            actx, tree, ball_centers, ball_radii, peer_lists)
+
+        return result, None
+
+
+@memoize_on_first_arg
+def get_space_invader_query_kernel(
+        actx: PyOpenCLArrayContext,
+        dimensions: int,
+        coord_dtype: "np.dtype",
+        box_id_dtype: "np.dtype",
+        peer_list_starts_dtype: "np.dtype",
+        max_levels: int,
+        root_extent_stretch_factor: float):
+    return SPACE_INVADER_QUERY_TEMPLATE.generate(
+            actx.context,
+            dimensions,
+            coord_dtype,
+            box_id_dtype,
+            peer_list_starts_dtype,
+            max_levels,
+            root_extent_stretch_factor=root_extent_stretch_factor)
+
+
+def build_space_invader_query(
+        actx: PyOpenCLArrayContext, tree: Tree,
+        ball_centers, ball_radii, peer_lists=None) -> Array:
     r"""
     Given a set of :math:`l^\infty` "balls", this class helps build a look-up
     table which maps leaf boxes to the *outer space invader distance*.
@@ -916,247 +933,229 @@ class SpaceInvaderQueryBuilder:
 
         \max \left( \{ d_{\infty}(\text{center}(b), \text{center}(b^*))
         : b^* \text{ is a ball}, b^* \cap b \neq \varnothing \}
-        \cup \{ 0 \} \right)
+        \cup \{ 0 \} \right).
+
+    :arg ball_centers: an object array of coordinates. Their *dtype* must
+        match *tree*'s :attr:`boxtree.Tree.coord_dtype`.
+    :arg ball_radii: an array of positive numbers. Its *dtype* must match
+        *tree*'s :attr:`boxtree.Tree.coord_dtype`.
+    :arg peer_lists: may either be *None* or an instance of
+        :class:`PeerListLookup` associated with *tree*.
+
+    :returns: an array *sqi* whose *dtype* is the *tree*'s
+        :attr:`boxtree.Tree.coord_dtype` and whose shape is *(tree.nboxes,)*
+        (see :attr:`boxtree.Tree.nboxes`). The entries of *sqi* are
+        indexed by the global box index and are as follows:
+
+        * if *i* is not the index of a leaf box, *sqi[i] = 0*.
+        * if *i* is the index of a leaf box, *sqi[i]* is the
+          outer space invader distance for *i*.
+    """
+    # {{{ check inputs
 
-    .. automethod:: __init__
-    .. automethod:: __call__
+    from pytools import single_valued
 
-    """
-    def __init__(self, array_context: PyOpenCLArrayContext) -> None:
-        self._setup_actx = array_context
-        self.peer_list_finder = PeerListFinder(array_context)
-
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
-
-    # {{{ Kernel generation
-
-    @memoize_method
-    def get_space_invader_query_kernel(self, dimensions, coord_dtype,
-                box_id_dtype, peer_list_idx_dtype, max_levels):
-        return SPACE_INVADER_QUERY_TEMPLATE.generate(
-                self.context,
-                dimensions,
-                coord_dtype,
-                box_id_dtype,
-                peer_list_idx_dtype,
-                max_levels)
+    if single_valued([bc.dtype for bc in ball_centers]) != tree.coord_dtype:
+        raise TypeError("ball_centers dtype must match tree.coord_dtype")
+
+    if ball_radii.dtype != tree.coord_dtype:
+        raise TypeError("ball_radii dtype must match tree.coord_dtype")
+
+    from pytools import div_ceil
+    # Avoid generating too many kernels.
+    max_levels = div_ceil(tree.nlevels, 10) * 10
+
+    if peer_lists is None:
+        peer_lists = build_peer_list(actx, tree)
+
+    if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
+        raise ValueError("size of peer lists must match with number of boxes")
 
     # }}}
 
-    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
-                 ball_centers, ball_radii, peer_lists=None,
-                 wait_for=None):
-        """
-        :arg ball_centers: an object array of coordinates. Their *dtype* must
-            match *tree*'s :attr:`boxtree.Tree.coord_dtype`.
-        :arg ball_radii: an array of positive numbers. Its *dtype* must match
-            *tree*'s :attr:`boxtree.Tree.coord_dtype`.
-        :arg peer_lists: may either be *None* or an instance of
-            :class:`PeerListLookup` associated with *tree*.
-        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
-            instances for whose completion this command waits before starting
-            execution.
-        :returns: a tuple *(sqi, event)*, where *sqi* is an array and *event*
-            is a :class:`pyopencl.Event` for dependency management. The *dtype*
-            of *sqi* is *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape
-            is *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
-            The entries of *sqi* are indexed by the global box index and are
-            as follows:
-
-            * if *i* is not the index of a leaf box, *sqi[i] = 0*.
-            * if *i* is the index of a leaf box, *sqi[i]* is the
-              outer space invader distance for *i*.
-        """
-
-        from pytools import single_valued
-        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
-            raise TypeError("ball_centers dtype must match tree.coord_dtype")
-        if ball_radii.dtype != tree.coord_dtype:
-            raise TypeError("ball_radii dtype must match tree.coord_dtype")
-
-        from pytools import div_ceil
-        # Avoid generating too many kernels.
-        max_levels = div_ceil(tree.nlevels, 10) * 10
-
-        if peer_lists is None:
-            peer_lists, evt = self.peer_list_finder(actx, tree, wait_for=wait_for)
-            wait_for = [evt]
-
-        if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
-            raise ValueError("size of peer lists must match with number of boxes")
-
-        space_invader_query_kernel = self.get_space_invader_query_kernel(
-            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
-            peer_lists.peer_list_starts.dtype, max_levels)
-
-        si_plog = ProcessLogger(logger, "space invader query")
+    # {{{ build query
+
+    space_invader_query_knl = get_space_invader_query_kernel(
+        actx,
+        tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
+        peer_lists.peer_list_starts.dtype,
+        max_levels, tree.root_extent_stretch_factor,
+        )
 
+    with ProcessLogger(logger, "space invader query"):
         outer_space_invader_dists = actx.zeros(tree.nboxes, np.float32)
-        if not wait_for:
-            wait_for = []
-        wait_for = (wait_for
-                + outer_space_invader_dists.events
-                + ball_radii.events
-                + [evt for bc in ball_centers for evt in bc.events])
-
-        evt = space_invader_query_kernel(
+        evt = space_invader_query_knl(
                 *SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
                     tree, peer_lists,
                     ball_radii,
                     outer_space_invader_dists,
                     *tuple(bc for bc in ball_centers)),
-                wait_for=wait_for,
                 queue=actx.queue,
-                range=slice(len(ball_radii)))
+                range=slice(len(ball_radii)),
+                )
+        outer_space_invader_dists.add_event(evt)
 
         if tree.coord_dtype != np.dtype(np.float32):
             # The kernel output is always an array of float32 due to limited
             # support for atomic operations with float64 in OpenCL.
             # Here the output is cast to match the coord dtype.
-            outer_space_invader_dists.finish()
-            outer_space_invader_dists = outer_space_invader_dists.astype(
-                    tree.coord_dtype)
-            evt, = outer_space_invader_dists.events
+            outer_space_invader_dists = (
+                outer_space_invader_dists.astype(tree.coord_dtype))
 
-        si_plog.done()
+    # }}}
 
-        return outer_space_invader_dists, evt
+    return actx.freeze(outer_space_invader_dists)
 
 # }}}
 
 
 # {{{ peer list build
 
-
 class PeerListFinder:
-    """This class builds a look-up table from box numbers to peer boxes. The
-    full definition [1]_ of a peer box is as follows:
+    def __init__(self, *args, **kwargs):
+        """Accept and ignore the arguments of the former constructor."""
+
+    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, wait_for=None):
+        from warnings import warn
+        warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. "
+            "Use 'build_peer_list' instead.",
+            DeprecationWarning, stacklevel=2)
+        # The old API returned (lookup, event); callers unpack two values.
+        return build_peer_list(actx, tree), None
+
 
-        Given a box :math:`b_j` in a quad-tree, :math:`b_k` is a peer box of
-        :math:`b_j` if it is
+@dataclass_array_container
+@dataclass(frozen=True)
+class PeerListLookup:
+    """
+    .. attribute:: tree
 
-         1. adjacent to :math:`b_j`,
+        The :class:`boxtree.Tree` instance used to build this lookup.
 
-         2. of at least the same size as :math:`b_j` (i.e. at the same or a
-            higher level than), and
+    .. attribute:: peer_list_starts
 
-         3. no child of :math:`b_k` satisfies the above two criteria.
+        Indices into :attr:`peer_lists`.
+        ``peer_lists[peer_list_starts[box_id]:peer_list_starts[box_id+1]]``
+        contains the list of peer boxes of box `box_id`.
 
-    .. [1] Rachh, Manas, Andreas Klöckner, and Michael O'Neil. "Fast
-       algorithms for Quadrature by Expansion I: Globally valid expansions."
+    .. attribute:: peer_lists
 
     .. versionadded:: 2016.1
-
-    .. automethod:: __init__
-    .. automethod:: __call__
     """
 
-    def __init__(self, array_context: PyOpenCLArrayContext):
-        self._setup_actx = array_context
+    tree: Tree
+    peer_list_starts: Array
+    peer_lists: Array
 
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
 
-    # {{{ Kernel generation
+@memoize_on_first_arg
+def get_peer_list_finder_kernel(
+        actx: PyOpenCLArrayContext,
+        dimensions: int,
+        coord_dtype: "np.dtype",
+        box_id_dtype: "np.dtype",
+        max_levels: int):
+    from pyopencl.tools import dtype_to_ctype
+
+    from boxtree import box_flags_enum
+    from boxtree.tools import AXIS_NAMES
+    from boxtree.traversal import (
+        TRAVERSAL_PREAMBLE_TEMPLATE, HELPER_FUNCTION_TEMPLATE)
+
+    logger.debug("start building peer list finder kernel")
+    # Kernel source: the traversal preamble and helpers, then the finder itself.
+    template = Template(
+        TRAVERSAL_PREAMBLE_TEMPLATE
+        + HELPER_FUNCTION_TEMPLATE
+        + PEER_LIST_FINDER_TEMPLATE,
+        strict_undefined=True)
+    # Variables substituted into the template when it is rendered below.
+    render_vars = dict(
+        np=np,
+        dimensions=dimensions,
+        dtype_to_ctype=dtype_to_ctype,
+        box_id_dtype=box_id_dtype,
+        particle_id_dtype=None,
+        coord_dtype=coord_dtype,
+        get_coord_vec_dtype=get_coord_vec_dtype,
+        cvec_sub=partial(coord_vec_subscript_code, dimensions),
+        max_levels=max_levels,
+        AXIS_NAMES=AXIS_NAMES,
+        box_flags_enum=box_flags_enum,
+        debug=False,
+        # For calls to the helper is_adjacent_or_overlapping()
+        targets_have_extent=False,
+        sources_have_extent=False)
+    # Kernel arguments, in the order build_peer_list passes them after nboxes.
+    from boxtree.tools import VectorArg, ScalarArg
+    arg_decls = [
+        VectorArg(coord_dtype, "box_centers", with_offset=False),
+        ScalarArg(coord_dtype, "root_extent"),
+        VectorArg(np.uint8, "box_levels"),
+        ScalarArg(box_id_dtype, "aligned_nboxes"),
+        VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
+        VectorArg(box_flags_enum.dtype, "box_flags"),
+    ]
+
+    from pyopencl.algorithm import ListOfListsBuilder
+    peer_list_finder_knl = ListOfListsBuilder(
+        actx.context,
+        [("peers", box_id_dtype)],
+        str(template.render(**render_vars)),
+        arg_decls=arg_decls,
+        name_prefix="find_peer_lists",
+        count_sharing={},
+        complex_kernel=True)
+
+    logger.debug("done building peer list finder kernel")
+    return peer_list_finder_knl
+
+
+def build_peer_list(actx: PyOpenCLArrayContext, tree: Tree) -> PeerListLookup:
+    """Builds a look-up table from box numbers to peer boxes. The full definition
+    [1]_ of a peer box is as follows:
+
+    Given a box :math:`b_j` in a quad-tree, :math:`b_k` is a peer box of
+    :math:`b_j` if it is
+
+        1. adjacent to :math:`b_j`,
+
+        2. of at least the same size as :math:`b_j` (i.e. at the same or a
+           higher level than), and
+
+        3. no child of :math:`b_k` satisfies the above two criteria.
 
-    @memoize_method
-    def get_peer_list_finder_kernel(self, dimensions, coord_dtype,
-                                    box_id_dtype, max_levels):
-        from pyopencl.tools import dtype_to_ctype
+    .. [1] Rachh, Manas, Andreas Klöckner, and Michael O'Neil. "Fast
+       algorithms for Quadrature by Expansion I: Globally valid expansions."
+    """
 
-        from boxtree import box_flags_enum
-        from boxtree.tools import AXIS_NAMES
-        from boxtree.traversal import (
-            TRAVERSAL_PREAMBLE_TEMPLATE, HELPER_FUNCTION_TEMPLATE)
-
-        logger.debug("start building peer list finder kernel")
-
-        template = Template(
-            TRAVERSAL_PREAMBLE_TEMPLATE
-            + HELPER_FUNCTION_TEMPLATE
-            + PEER_LIST_FINDER_TEMPLATE,
-            strict_undefined=True)
-
-        render_vars = dict(
-            np=np,
-            dimensions=dimensions,
-            dtype_to_ctype=dtype_to_ctype,
-            box_id_dtype=box_id_dtype,
-            particle_id_dtype=None,
-            coord_dtype=coord_dtype,
-            get_coord_vec_dtype=get_coord_vec_dtype,
-            cvec_sub=partial(coord_vec_subscript_code, dimensions),
-            max_levels=max_levels,
-            AXIS_NAMES=AXIS_NAMES,
-            box_flags_enum=box_flags_enum,
-            debug=False,
-            # For calls to the helper is_adjacent_or_overlapping()
-            targets_have_extent=False,
-            sources_have_extent=False)
-
-        from boxtree.tools import VectorArg, ScalarArg
-        arg_decls = [
-            VectorArg(coord_dtype, "box_centers", with_offset=False),
-            ScalarArg(coord_dtype, "root_extent"),
-            VectorArg(np.uint8, "box_levels"),
-            ScalarArg(box_id_dtype, "aligned_nboxes"),
-            VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
-            VectorArg(box_flags_enum.dtype, "box_flags"),
-        ]
-
-        from pyopencl.algorithm import ListOfListsBuilder
-        peer_list_finder_kernel = ListOfListsBuilder(
-            self.context,
-            [("peers", box_id_dtype)],
-            str(template.render(**render_vars)),
-            arg_decls=arg_decls,
-            name_prefix="find_peer_lists",
-            count_sharing={},
-            complex_kernel=True)
-
-        logger.debug("done building peer list finder kernel")
-        return peer_list_finder_kernel
+    from pytools import div_ceil
 
-    # }}}
+    # Round up level count--this gets included in the kernel as
+    # a stack bound. Rounding avoids too many kernel versions.
+    max_levels = div_ceil(tree.nlevels, 10) * 10
 
-    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, wait_for=None):
-        """
-        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
-            instances for whose completion this command waits before starting
-            execution.
-        :returns: a tuple *(pl, event)*, where *pl* is an instance of
-            :class:`PeerListLookup`, and *event* is a :class:`pyopencl.Event`
-            for dependency management.
-        """
-        from pytools import div_ceil
-
-        # Round up level count--this gets included in the kernel as
-        # a stack bound. Rounding avoids too many kernel versions.
-        max_levels = div_ceil(tree.nlevels, 10) * 10
-
-        peer_list_finder_kernel = self.get_peer_list_finder_kernel(
-            tree.dimensions, tree.coord_dtype, tree.box_id_dtype, max_levels)
-
-        pl_plog = ProcessLogger(logger, "find peer lists")
-
-        result, evt = peer_list_finder_kernel(
+    peer_list_finder_knl = get_peer_list_finder_kernel(
+        actx,
+        tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
+        max_levels,
+        )
+
+    with ProcessLogger(logger, "find peer lists"):
+        result, evt = peer_list_finder_knl(
                 actx.queue, tree.nboxes,
                 tree.box_centers.data, tree.root_extent,
                 tree.box_levels, tree.aligned_nboxes,
                 tree.box_child_ids.data, tree.box_flags,
-                wait_for=wait_for)
-
-        pl_plog.done()
+                allocator=actx.allocator,
+                )
 
-        lookup = PeerListLookup(
-                tree=tree,
-                peer_list_starts=result["peers"].starts,
-                peer_lists=result["peers"].lists)
+    lookup = PeerListLookup(
+            tree=tree,
+            peer_list_starts=result["peers"].starts,
+            peer_lists=result["peers"].lists)
 
-        return actx.freeze(lookup), evt
+    return actx.freeze(lookup)
 
 # }}}
 
diff --git a/boxtree/array_context.py b/boxtree/array_context.py
index ae566774..118ec27b 100644
--- a/boxtree/array_context.py
+++ b/boxtree/array_context.py
@@ -20,7 +20,17 @@
 THE SOFTWARE.
 """
 
-from arraycontext import PyOpenCLArrayContext as PyOpenCLArrayContextBase
+from typing import Any, List, Optional, Union
+
+import numpy as np
+
+from pyopencl.algorithm import BuiltList
+from pytools.tag import ToTagSetConvertible
+
+from arraycontext import (          # noqa: F401
+        PyOpenCLArrayContext as PyOpenCLArrayContextBase,
+        with_array_context, serialize_container, deserialize_container,
+        rec_map_array_container)
 from arraycontext.pytest import (
         _PytestPyOpenCLArrayContextFactoryWithClass,
         register_pytest_array_context_factory)
@@ -30,13 +40,38 @@
 """
 
 
-def _acf():
-    import pyopencl as cl
-    ctx = cl.create_some_context()
-    queue = cl.CommandQueue(ctx)
+# {{{ make_loopy_program
 
-    return PyOpenCLArrayContext(queue, force_device_scalars=True)
+def make_loopy_program(
+        domains, statements,
+        kernel_data: Optional[List[Any]] = None, *,
+        name: str = "boxtree_loopy_kernel",
+        assumptions: Optional[Union[List[str], str]] = None,
+        tags: ToTagSetConvertible = None):
+    """Return a :class:`loopy.LoopKernel` suitable for use with
+    :meth:`arraycontext.ArrayContext.call_loopy`. Kernels are named after
+    this package by default; *kernel_data* defaults to ``[...]``.
+    """
+    if kernel_data is None:
+        kernel_data = [...]
+    import loopy as lp
+    from arraycontext.loopy import _DEFAULT_LOOPY_OPTIONS
+
+    return lp.make_kernel(
+            domains,
+            statements,
+            kernel_data=kernel_data,
+            options=_DEFAULT_LOOPY_OPTIONS,
+            default_offset=lp.auto,
+            name=name,
+            lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
+            assumptions=assumptions,
+            tags=tags)
 
+# }}}
+
+
+# {{{ array context
 
 class PyOpenCLArrayContext(PyOpenCLArrayContextBase):
     def transform_loopy_program(self, t_unit):
@@ -49,7 +84,137 @@ def transform_loopy_program(self, t_unit):
                     "Did you use arraycontext.make_loopy_program "
                     "to create this kernel?")
 
-        return super().transform_loopy_program(t_unit)
+        return t_unit
+
+    # NOTE: _rec_map_container is copied from arraycontext wholesale and should
+    # be kept in sync as much as possible!
+
+    def _rec_map_container(self, func, array, allowed_types=None, *,
+            default_scalar=None, strict=False):
+        import arraycontext.impl.pyopencl.taggable_cl_array as tga
+
+        if allowed_types is None:
+            allowed_types = (tga.TaggableCLArray,)
+
+        def _wrapper(ary):
+            # NOTE: the rest is copied verbatim from arraycontext; this check is
+            # the only change and allows optional (None) fields inside containers
+            if ary is None:
+                return ary
+
+            if isinstance(ary, allowed_types):
+                return func(ary)
+            elif not strict and isinstance(ary, self.array_types):
+                from warnings import warn
+                warn(f"Invoking {type(self).__name__}.{func.__name__[1:]} with "
+                    f"{type(ary).__name__} will be unsupported in 2023. Use "
+                    "'to_tagged_cl_array' to convert instances to TaggableCLArray.",
+                    DeprecationWarning, stacklevel=2)
+                return func(tga.to_tagged_cl_array(ary))
+            elif np.isscalar(ary):
+                if default_scalar is None:
+                    return ary
+                else:
+                    return np.array(ary).dtype.type(default_scalar)
+            else:
+                raise TypeError(
+                    f"{type(self).__name__}.{func.__name__[1:]} invoked with "
+                    f"an unsupported array type: got '{type(ary).__name__}', "
+                    f"but expected one of {allowed_types}")
+
+        return rec_map_array_container(_wrapper, array)
+
+# }}}
+
+
+# {{{ dataclass array container
+
+def dataclass_array_container(cls: type) -> type:
+    """A decorator based on :func:`arraycontext.dataclass_array_container`
+    that allows :class:`typing.Optional` containers.
+    """
+
+    from dataclasses import Field, fields, is_dataclass
+    from typing import Union
+    try:
+        # NOTE: only available in python >= 3.8
+        from typing import get_args, get_origin
+    except ImportError:
+        from typing_extensions import get_args, get_origin
+
+    from arraycontext.container.dataclass import (
+        is_array_type, inject_dataclass_serialization)
+
+    assert is_dataclass(cls)
+
+    def is_array_field(f: Field) -> bool:
+        if __debug__:
+            if not f.init:
+                raise ValueError(
+                        f"Fields with 'init=False' not allowed: '{f.name}'")
+
+            if isinstance(f.type, str):
+                raise TypeError(
+                        f"String annotation on field '{f.name}' not supported")
+
+        origin = get_origin(f.type)
+        if origin is Union:
+            # NOTE: `Optional[X]` is `Union[X, None]`, whose args contain `NoneType`
+            return all(
+                arg is type(None) or is_array_type(arg)
+                for arg in get_args(f.type))
+
+        from typing import _GenericAlias, _SpecialForm  # type: ignore[attr-defined]
+        if isinstance(f.type, (_GenericAlias, _SpecialForm)):
+            return False
+
+        return is_array_type(f.type)
+
+    from pytools import partition
+    array_fields, non_array_fields = partition(is_array_field, fields(cls))
+
+    if not array_fields:
+        raise ValueError(f"'{cls}' must have fields with array container type "
+                "in order to use the 'dataclass_array_container' decorator")
+
+    return inject_dataclass_serialization(cls, array_fields, non_array_fields)
+
+# }}}
+
+
+# {{{ serialization
+
+# NOTE: BuiltList is serialized explicitly here to avoid monkeypatching the
+# version in pyopencl (dataclass_array_container modifies the class)
+
+@serialize_container.register(BuiltList)
+def _serialize_built_list(obj: BuiltList):
+    # Only these four attributes are serialized; *count* and
+    # *num_nonempty_lists* are taken from the template when deserializing.
+    return (
+        ("starts", obj.starts), ("lists", obj.lists),
+        ("nonempty_indices", obj.nonempty_indices),
+        ("compressed_indices", obj.compressed_indices))
+
+
+@deserialize_container.register(BuiltList)
+def _deserialize_built_list(template: BuiltList, iterable):
+    built_list_cls = type(template)
+    return built_list_cls(
+        count=template.count, num_nonempty_lists=template.num_nonempty_lists,
+        **dict(iterable))
+
+# }}}
+
+
+# {{{ pytest
+
+def _acf():
+    """Create an array context on a queue for some OpenCL context."""
+    import pyopencl as cl
+
+    queue = cl.CommandQueue(cl.create_some_context())
+    return PyOpenCLArrayContext(queue, force_device_scalars=True)
 
 
 class PytestPyOpenCLArrayContextFactory(
@@ -59,3 +224,5 @@ class PytestPyOpenCLArrayContextFactory(
 
 register_pytest_array_context_factory("boxtree.pyopencl",
         PytestPyOpenCLArrayContextFactory)
+
+# }}}
diff --git a/boxtree/bounding_box.py b/boxtree/bounding_box.py
index 236cdf0e..9f44de0f 100644
--- a/boxtree/bounding_box.py
+++ b/boxtree/bounding_box.py
@@ -23,23 +23,25 @@
 import numpy as np
 from pyopencl.reduction import ReductionTemplate
 
-from pytools import memoize, memoize_method
+from pytools import memoize, memoize_on_first_arg
 
 from boxtree.tools import get_type_moniker
 from boxtree.array_context import PyOpenCLArrayContext
 
 
+# {{{ kernel template
+
 @memoize
 def make_bounding_box_dtype(device, dimensions, coord_dtype):
     from boxtree.tools import AXIS_NAMES
     fields = []
-    for i in range(dimensions):
-        fields.append(("min_%s" % AXIS_NAMES[i], coord_dtype))
-        fields.append(("max_%s" % AXIS_NAMES[i], coord_dtype))
+    for ax in AXIS_NAMES[:dimensions]:
+        fields.append((f"min_{ax}", coord_dtype))
+        fields.append((f"max_{ax}", coord_dtype))
 
     dtype = np.dtype(fields)
 
-    name = "boxtree_bbox_%dd_%s_t" % (dimensions, get_type_moniker(coord_dtype))
+    name = "boxtree_bbox_{}d_{}_t".format(dimensions, get_type_moniker(coord_dtype))
 
     from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
     dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
@@ -118,60 +120,62 @@ def make_bounding_box_dtype(device, dimensions, coord_dtype):
             """,
     name_prefix="bounding_box")
 
+# }}}
+
 
-class BoundingBoxFinder:
-    def __init__(self, array_context: PyOpenCLArrayContext):
-        self._setup_actx = array_context
-
-        for dev in self.context.devices:
-            if (dev.vendor == "Intel(R) Corporation"
-                    and dev.version == "OpenCL 1.2 (Build 56860)"):
-                raise RuntimeError("bounding box finder does not work "
-                        "properly with this CL runtime.")
-
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
-
-    @memoize_method
-    def get_kernel(self, dimensions, coord_dtype, have_radii):
-        # FIXME: Why does this just use `devices[0]`?
-        bbox_dtype, bbox_cdecl = make_bounding_box_dtype(
-                self.context.devices[0], dimensions, coord_dtype)
-
-        from boxtree.tools import AXIS_NAMES
-        return BBOX_REDUCTION_TPL.build(
-                self.context,
-                type_aliases=(
-                    ("reduction_t", bbox_dtype),
-                    ("bbox_t", bbox_dtype),
-                    ("coord_t", coord_dtype),
-                    ),
-                var_values=(
-                    ("axis_names", AXIS_NAMES[:dimensions]),
-                    ("dimensions", dimensions),
-                    ("coord_dtype", coord_dtype),
-                    ("have_radii", have_radii),
-                    ("np", np),
-                    )
-                )
+# {{{ find_bounding_box
 
-    def __call__(self, actx, particles, radii, wait_for=None):
-        dimensions = len(particles)
+@memoize_on_first_arg
+def get_bounding_box_kernel(
+        actx: PyOpenCLArrayContext,
+        dimensions: int,
+        coord_dtype: "np.dtype",
+        have_radii: bool):
+    bbox_dtype, bbox_cdecl = make_bounding_box_dtype(
+            actx.queue.device, dimensions, coord_dtype)
 
-        from pytools import single_valued
-        coord_dtype = single_valued(coord.dtype for coord in particles)
+    from boxtree.tools import AXIS_NAMES
+    return BBOX_REDUCTION_TPL.build(
+            actx.context,
+            type_aliases=(
+                ("reduction_t", bbox_dtype),
+                ("bbox_t", bbox_dtype),
+                ("coord_t", coord_dtype),
+                ),
+            var_values=(
+                ("axis_names", AXIS_NAMES[:dimensions]),
+                ("dimensions", dimensions),
+                ("coord_dtype", coord_dtype),
+                ("have_radii", have_radii),
+                ("np", np),
+                )
+            )
 
-        if radii is None:
-            radii_tuple = ()
-        else:
-            radii_tuple = (radii,)
 
-        knl = self.get_kernel(dimensions, coord_dtype, have_radii=radii is not None)
-        return knl(
-            *(tuple(particles) + radii_tuple),
-            queue=actx.queue,
-            wait_for=wait_for, return_event=True)
+def find_bounding_box(actx: PyOpenCLArrayContext, particles, radii):
+    # This specific Intel runtime is known not to work: fail early.
+    dev = actx.queue.device
+    is_broken_runtime = (
+            dev.vendor == "Intel(R) Corporation"
+            and dev.version == "OpenCL 1.2 (Build 56860)")
+    if is_broken_runtime:
+        raise RuntimeError(
+            f"'find_bounding_box' does not work properly with "
+            f"this CL runtime: {dev}")
+
+    from pytools import single_valued
+    dimensions = len(particles)
+    coord_dtype = single_valued(coord.dtype for coord in particles)
+    have_radii = radii is not None
+    # The radii, when given, are passed as one trailing kernel argument.
+    kernel_args = tuple(particles) + ((radii,) if have_radii else ())
+
+    knl = get_bounding_box_kernel(actx, dimensions, coord_dtype, have_radii)
+    return knl(
+        *kernel_args,
+        queue=actx.queue,
+        allocator=actx.allocator,
+        )
 
 # }}}
 
diff --git a/boxtree/constant_one.py b/boxtree/constant_one.py
index 4e61f736..958989a1 100644
--- a/boxtree/constant_one.py
+++ b/boxtree/constant_one.py
@@ -26,6 +26,8 @@
 """
 
 import numpy as np
+
+from boxtree.array_context import PyOpenCLArrayContext
 from boxtree.fmm import TreeIndependentDataForWrangler, ExpansionWranglerInterface
 from boxtree.timing import DummyTimingFuture
 
@@ -83,7 +85,9 @@ def local_expansions_view(self, local_exps, level):
     def timing_future(ops):
         return DummyTimingFuture.from_op_count(ops)
 
-    def form_multipoles(self, level_start_source_box_nrs, source_boxes,
+    def form_multipoles(self, actx: PyOpenCLArrayContext,
+            level_start_source_box_nrs,
+            source_boxes,
             src_weight_vecs):
         src_weights, = src_weight_vecs
         mpoles = self.multipole_expansion_zeros()
@@ -96,8 +100,10 @@ def form_multipoles(self, level_start_source_box_nrs, source_boxes,
 
         return mpoles, self.timing_future(ops)
 
-    def coarsen_multipoles(self, level_start_source_parent_box_nrs,
-            source_parent_boxes, mpoles):
+    def coarsen_multipoles(self, actx: PyOpenCLArrayContext,
+            level_start_source_parent_box_nrs,
+            source_parent_boxes,
+            mpoles):
         tree = self.tree
         ops = 0
 
@@ -119,7 +125,8 @@ def coarsen_multipoles(self, level_start_source_parent_box_nrs,
 
         return mpoles, self.timing_future(ops)
 
-    def eval_direct(self, target_boxes, neighbor_sources_starts,
+    def eval_direct(self, actx: PyOpenCLArrayContext,
+            target_boxes, neighbor_sources_starts,
             neighbor_sources_lists, src_weight_vecs):
         src_weights, = src_weight_vecs
         pot = self.output_zeros()
@@ -144,6 +151,7 @@ def eval_direct(self, target_boxes, neighbor_sources_starts,
         return pot, self.timing_future(ops)
 
     def multipole_to_local(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes,
             starts, lists, mpole_exps):
@@ -164,7 +172,9 @@ def multipole_to_local(self,
         return local_exps, self.timing_future(ops)
 
     def eval_multipoles(self,
-            target_boxes_by_source_level, from_sep_smaller_nonsiblings_by_level,
+            actx: PyOpenCLArrayContext,
+            target_boxes_by_source_level,
+            from_sep_smaller_nonsiblings_by_level,
             mpole_exps):
         pot = self.output_zeros()
         ops = 0
@@ -186,8 +196,10 @@ def eval_multipoles(self,
         return pot, self.timing_future(ops)
 
     def form_locals(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
-            target_or_target_parent_boxes, starts, lists, src_weight_vecs):
+            target_or_target_parent_boxes,
+            starts, lists, src_weight_vecs):
         src_weights, = src_weight_vecs
         local_exps = self.local_expansion_zeros()
         ops = 0
@@ -209,7 +221,9 @@ def form_locals(self,
 
         return local_exps, self.timing_future(ops)
 
-    def refine_locals(self, level_start_target_or_target_parent_box_nrs,
+    def refine_locals(self,
+            actx: PyOpenCLArrayContext,
+            level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes, local_exps):
         ops = 0
 
@@ -222,7 +236,10 @@ def refine_locals(self, level_start_target_or_target_parent_box_nrs,
 
         return local_exps, self.timing_future(ops)
 
-    def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
+    def eval_locals(self,
+            actx: PyOpenCLArrayContext,
+            level_start_target_box_nrs,
+            target_boxes, local_exps):
         pot = self.output_zeros()
         ops = 0
 
@@ -233,7 +250,11 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
 
         return pot, self.timing_future(ops)
 
-    def finalize_potentials(self, potentials, template_ary):
+    def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials):
+        """Return *potentials* unchanged.
+
+        *actx* is not used by this implementation.
+        """
         return potentials
 
 # }}}
diff --git a/boxtree/cost.py b/boxtree/cost.py
index e6d43e63..1e026b52 100644
--- a/boxtree/cost.py
+++ b/boxtree/cost.py
@@ -69,7 +69,7 @@
 from mako.template import Template
 
 from pymbolic import var, evaluate
-from pytools import memoize_method
+from pytools import memoize_in
 
 from boxtree.array_context import PyOpenCLArrayContext
 
@@ -242,7 +242,6 @@ def process_form_multipoles(self, actx: PyOpenCLArrayContext,
         :return: an array of shape (nsource_boxes,), with each entry represents
             the cost of the box.
         """
-        pass
 
     @abstractmethod
     def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext,
@@ -259,7 +258,6 @@ def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext,
             immediate clear how per-box cost of upward propagation will be useful for
             distributed load balancing.
         """
-        pass
 
     @abstractmethod
     def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext,
@@ -271,7 +269,6 @@ def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext,
         :return: an array of shape (ntarget_boxes,), with each entry representing
             the number of direct evaluation sources for that target box.
         """
-        pass
 
     @abstractmethod
     def process_direct(self, actx: PyOpenCLArrayContext,
@@ -292,7 +289,6 @@ def process_direct(self, actx: PyOpenCLArrayContext,
         :return: an array of shape (ntarget_boxes,), with each entry represents
             the cost of the box.
         """
-        pass
 
     @abstractmethod
     def process_list2(self, actx: PyOpenCLArrayContext, traversal, m2l_cost):
@@ -304,7 +300,6 @@ def process_list2(self, actx: PyOpenCLArrayContext, traversal, m2l_cost):
             each entry representing the cost of multipole-to-local
             translations to this box.
         """
-        pass
 
     @abstractmethod
     def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost,
@@ -322,7 +317,6 @@ def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost,
             cost of evaluating all targets inside this box from multipole
             expansions of list-3 boxes.
         """
-        pass
 
     @abstractmethod
     def process_list4(self, actx: PyOpenCLArrayContext, traversal, p2l_cost):
@@ -335,7 +329,6 @@ def process_list4(self, actx: PyOpenCLArrayContext, traversal, p2l_cost):
             each entry representing the cost of point-to-local translations to
             this box.
         """
-        pass
 
     @abstractmethod
     def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost,
@@ -352,7 +345,6 @@ def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost,
         :return: an array of shape (ntarget_boxes,), the cost of evaluating the
             potentials of all targets inside this box from its local expansion.
         """
-        pass
 
     @abstractmethod
     def process_refine_locals(self, actx: PyOpenCLArrayContext, traversal, l2l_cost):
@@ -368,7 +360,6 @@ def process_refine_locals(self, actx: PyOpenCLArrayContext, traversal, l2l_cost)
             immediate clear how per-box cost of downward propagation will be useful
             for distributed load balancing.
         """
-        pass
 
     @abstractmethod
     def aggregate_over_boxes(self, actx: PyOpenCLArrayContext, per_box_result):
@@ -377,7 +368,6 @@ def aggregate_over_boxes(self, actx: PyOpenCLArrayContext, per_box_result):
         :arg per_box_result: an array to be sumed.
         :return: a :class:`float`, the result of the sum.
         """
-        pass
 
     @staticmethod
     def cost_factors_to_dev(cost_factors, actx: Optional[PyOpenCLArrayContext]):
@@ -449,7 +439,6 @@ def zero_cost_per_box(self, actx: PyOpenCLArrayContext, nboxes):
         :param nboxes: the number of boxes
         :return: an array of shape (*nboxes*,), representing the zero per-box cost.
         """
-        pass
 
     def cost_per_box(self, actx: PyOpenCLArrayContext, traversal, level_to_order,
                      calibration_params,
@@ -730,41 +719,54 @@ class FMMCostModel(AbstractFMMCostModel):
 
     # {{{ form multipoles
 
-    @memoize_method
-    def process_form_multipoles_knl(self, actx: PyOpenCLArrayContext,
+    def process_form_multipoles_kernel(self, actx: PyOpenCLArrayContext,
                                     box_id_dtype, particle_id_dtype,
                                     box_level_dtype):
-        return ElementwiseKernel(
-            actx.context,
-            Template(r"""
-                double *np2m,
-                ${box_id_t} *source_boxes,
-                ${particle_id_t} *box_source_counts_nonchild,
-                ${box_level_t} *box_levels,
-                double *p2m_cost
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                particle_id_t=dtype_to_ctype(particle_id_dtype),
-                box_level_t=dtype_to_ctype(box_level_dtype)
-            ),
-            Template(r"""
-                ${box_id_t} box_idx = source_boxes[i];
-                ${particle_id_t} nsources = box_source_counts_nonchild[box_idx];
-                ${box_level_t} ilevel = box_levels[box_idx];
-                np2m[i] = nsources * p2m_cost[ilevel];
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                particle_id_t=dtype_to_ctype(particle_id_dtype),
-                box_level_t=dtype_to_ctype(box_level_dtype)
-            ),
-            name="process_form_multipoles"
-        )
+        """Return the elementwise kernel behind :meth:`process_form_multipoles`.
+
+        For each entry ``i`` of ``source_boxes`` the kernel stores
+        ``box_source_counts_nonchild[box] * p2m_cost[box_levels[box]]`` in
+        ``np2m[i]``. The kernel is cached on *actx* via
+        :func:`pytools.memoize_in`, keyed on this method and the dtypes.
+        """
+        cache_key = (
+            FMMCostModel.process_form_multipoles_kernel,
+            box_id_dtype, particle_id_dtype, box_level_dtype)
+
+        @memoize_in(actx, cache_key)
+        def build_kernel():
+            # The same C type names are substituted into both templates.
+            template_vars = {
+                "box_id_t": dtype_to_ctype(box_id_dtype),
+                "particle_id_t": dtype_to_ctype(particle_id_dtype),
+                "box_level_t": dtype_to_ctype(box_level_dtype),
+            }
+
+            arguments = Template(r"""
+                    double *np2m,
+                    ${box_id_t} *source_boxes,
+                    ${particle_id_t} *box_source_counts_nonchild,
+                    ${box_level_t} *box_levels,
+                    double *p2m_cost
+                """).render(**template_vars)
+            operation = Template(r"""
+                    ${box_id_t} box_idx = source_boxes[i];
+                    ${particle_id_t} nsources = box_source_counts_nonchild[box_idx];
+                    ${box_level_t} ilevel = box_levels[box_idx];
+                    np2m[i] = nsources * p2m_cost[ilevel];
+                """).render(**template_vars)
+
+            return ElementwiseKernel(
+                actx.context, arguments, operation,
+                name="process_form_multipoles")
+
+        return build_kernel()
 
     def process_form_multipoles(self, actx, traversal, p2m_cost):
         tree = traversal.tree
         np2m = actx.zeros(len(traversal.source_boxes), dtype=np.float64)
 
-        process_form_multipoles_knl = self.process_form_multipoles_knl(
+        process_form_multipoles_knl = self.process_form_multipoles_kernel(
             actx, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
         )
 
@@ -783,57 +777,74 @@ def process_form_multipoles(self, actx, traversal, p2m_cost):
 
     # {{{ propagate multipoles upward
 
-    @memoize_method
-    def process_coarsen_multipoles_knl(self, actx: PyOpenCLArrayContext,
-                                       ndimensions, box_id_dtype,
-                                       box_level_dtype, nlevels):
-        return ElementwiseKernel(
-            actx.context,
-            Template(r"""
-                ${box_id_t} *source_parent_boxes,
-                ${box_level_t} *box_levels,
-                double *m2m_cost,
-                double *nm2m,
-                % for i in range(2**ndimensions):
-                    % if i == 2**ndimensions - 1:
-                        ${box_id_t} *box_child_ids_${i}
-                    % else:
-                        ${box_id_t} *box_child_ids_${i},
-                    % endif
-                % endfor
-            """).render(
-                ndimensions=ndimensions,
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                box_level_t=dtype_to_ctype(box_level_dtype)
-            ),
-            Template(r"""
-                ${box_id_t} box_idx = source_parent_boxes[i];
-                ${box_level_t} target_level = box_levels[box_idx];
-                if(target_level <= 1) {
-                    nm2m[i] = 0.0;
-                } else {
-                    int nchild = 0;
+    def process_coarsen_multipoles_kernel(self, actx: PyOpenCLArrayContext,
+                                          ndimensions, box_id_dtype,
+                                          box_level_dtype, nlevels):
+        """Return the kernel behind :meth:`process_coarsen_multipoles`.
+
+        For each entry ``i`` of ``source_parent_boxes`` the kernel stores in
+        ``nm2m[i]`` either zero (box level at most 1) or the number of
+        nonzero ``box_child_ids_<k>`` entries of the box times ``m2m_cost``
+        at the box's level. One ``box_child_ids_<k>`` argument is generated
+        for each ``k`` in ``range(2**ndimensions)``.
+
+        The kernel is cached on *actx* via :func:`pytools.memoize_in`.
+        *nlevels* does not occur in the generated source, so it stays in the
+        signature for existing callers but is left out of the cache key.
+        """
+        cache_key = (
+            FMMCostModel.process_coarsen_multipoles_kernel,
+            ndimensions, box_id_dtype, box_level_dtype)
+
+        @memoize_in(actx, cache_key)
+        def build_kernel():
+            # Both templates draw on the same substitution values.
+            template_vars = {
+                "ndimensions": ndimensions,
+                "box_id_t": dtype_to_ctype(box_id_dtype),
+                "box_level_t": dtype_to_ctype(box_level_dtype),
+            }
+
+            arguments = Template(r"""
+                    ${box_id_t} *source_parent_boxes,
+                    ${box_level_t} *box_levels,
+                    double *m2m_cost,
+                    double *nm2m,
                     % for i in range(2**ndimensions):
-                        if(box_child_ids_${i}[box_idx])
-                            nchild += 1;
+                        % if i == 2**ndimensions - 1:
+                            ${box_id_t} *box_child_ids_${i}
+                        % else:
+                            ${box_id_t} *box_child_ids_${i},
+                        % endif
                     % endfor
-                    nm2m[i] = nchild * m2m_cost[target_level];
-                }
-            """).render(
-                ndimensions=ndimensions,
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                box_level_t=dtype_to_ctype(box_level_dtype),
-                nlevels=nlevels
-            ),
-            name="process_coarsen_multipoles"
-        )
+                """).render(**template_vars)
+            operation = Template(r"""
+                    ${box_id_t} box_idx = source_parent_boxes[i];
+                    ${box_level_t} target_level = box_levels[box_idx];
+                    if(target_level <= 1) {
+                        nm2m[i] = 0.0;
+                    } else {
+                        int nchild = 0;
+                        % for i in range(2**ndimensions):
+                            if(box_child_ids_${i}[box_idx])
+                                nchild += 1;
+                        % endfor
+                        nm2m[i] = nchild * m2m_cost[target_level];
+                    }
+                """).render(**template_vars)
+
+            return ElementwiseKernel(
+                actx.context, arguments, operation,
+                name="process_coarsen_multipoles")
+
+        return build_kernel()
 
     def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext,
                                    traversal, m2m_cost):
         tree = traversal.tree
         nm2m = actx.zeros(len(traversal.source_parent_boxes), dtype=np.float64)
 
-        process_coarsen_multipoles_knl = self.process_coarsen_multipoles_knl(
+        process_coarsen_multipoles_knl = self.process_coarsen_multipoles_kernel(
             actx,
             tree.dimensions, tree.box_id_dtype, tree.box_level_dtype, tree.nlevels
         )
@@ -844,7 +843,7 @@ def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext,
             m2m_cost,
             nm2m,
             *tree.box_child_ids,
-            queue=actx.queue
+            queue=actx.queue,
         )
 
         return self.aggregate_over_boxes(actx, nm2m)
@@ -853,42 +852,57 @@ def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext,
 
     # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close)
 
-    @memoize_method
-    def _get_ndirect_sources_knl(self, actx: PyOpenCLArrayContext,
+    def _get_ndirect_sources_kernel(self, actx: PyOpenCLArrayContext,
                                  particle_id_dtype, box_id_dtype):
-        return ElementwiseKernel(
-            actx.context,
-            Template("""
-                ${particle_id_t} *ndirect_sources_by_itgt_box,
-                ${box_id_t} *source_boxes_starts,
-                ${box_id_t} *source_boxes_lists,
-                ${particle_id_t} *box_source_counts_nonchild
-            """).render(
-                particle_id_t=dtype_to_ctype(particle_id_dtype),
-                box_id_t=dtype_to_ctype(box_id_dtype)
-            ),
-            Template(r"""
-                ${particle_id_t} nsources = 0;
-                ${box_id_t} source_boxes_start_idx = source_boxes_starts[i];
-                ${box_id_t} source_boxes_end_idx = source_boxes_starts[i + 1];
-
-                for(${box_id_t} cur_source_boxes_idx = source_boxes_start_idx;
-                    cur_source_boxes_idx < source_boxes_end_idx;
-                    cur_source_boxes_idx++)
-                {
-                    ${box_id_t} cur_source_box = source_boxes_lists[
-                        cur_source_boxes_idx
-                    ];
-                    nsources += box_source_counts_nonchild[cur_source_box];
-                }
-
-                ndirect_sources_by_itgt_box[i] += nsources;
-            """).render(
-                particle_id_t=dtype_to_ctype(particle_id_dtype),
-                box_id_t=dtype_to_ctype(box_id_dtype)
-            ),
-            name="get_ndirect_sources"
-        )
+        """Return the elementwise kernel that counts direct-evaluation sources.
+
+        For each entry ``i`` the kernel sums ``box_source_counts_nonchild``
+        over the boxes in its ``source_boxes_starts``/``source_boxes_lists``
+        range and adds the total to ``ndirect_sources_by_itgt_box[i]``. The
+        kernel is cached on *actx* via :func:`pytools.memoize_in`, keyed on
+        this method and the dtypes.
+        """
+        cache_key = (
+            FMMCostModel._get_ndirect_sources_kernel,
+            particle_id_dtype, box_id_dtype)
+
+        @memoize_in(actx, cache_key)
+        def build_kernel():
+            # The same C type names are substituted into both templates.
+            template_vars = {
+                "particle_id_t": dtype_to_ctype(particle_id_dtype),
+                "box_id_t": dtype_to_ctype(box_id_dtype),
+            }
+
+            arguments = Template(r"""
+                    ${particle_id_t} *ndirect_sources_by_itgt_box,
+                    ${box_id_t} *source_boxes_starts,
+                    ${box_id_t} *source_boxes_lists,
+                    ${particle_id_t} *box_source_counts_nonchild
+                """).render(**template_vars)
+            operation = Template(r"""
+                    ${particle_id_t} nsources = 0;
+                    ${box_id_t} source_boxes_start_idx = source_boxes_starts[i];
+                    ${box_id_t} source_boxes_end_idx = source_boxes_starts[i + 1];
+
+                    for(${box_id_t} cur_source_boxes_idx = source_boxes_start_idx;
+                        cur_source_boxes_idx < source_boxes_end_idx;
+                        cur_source_boxes_idx++)
+                    {
+                        ${box_id_t} cur_source_box = source_boxes_lists[
+                            cur_source_boxes_idx
+                        ];
+                        nsources += box_source_counts_nonchild[cur_source_box];
+                    }
+
+                    ndirect_sources_by_itgt_box[i] += nsources;
+                """).render(**template_vars)
+
+            return ElementwiseKernel(
+                actx.context, arguments, operation,
+                name="get_ndirect_sources")
+
+        return build_kernel()
 
     def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext,
                                            traversal):
@@ -897,7 +901,7 @@ def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext,
         particle_id_dtype = tree.particle_id_dtype
         box_id_dtype = tree.box_id_dtype
 
-        get_ndirect_sources_knl = self._get_ndirect_sources_knl(
+        get_ndirect_sources_knl = self._get_ndirect_sources_kernel(
             actx, particle_id_dtype, box_id_dtype
         )
 
@@ -910,7 +914,8 @@ def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext,
             ndirect_sources_by_itgt_box,
             traversal.neighbor_source_boxes_starts,
             traversal.neighbor_source_boxes_lists,
-            tree.box_source_counts_nonchild
+            tree.box_source_counts_nonchild,
+            queue=actx.queue,
         )
 
         # List 3 close
@@ -920,7 +925,8 @@ def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext,
                 ndirect_sources_by_itgt_box,
                 traversal.from_sep_close_smaller_starts,
                 traversal.from_sep_close_smaller_lists,
-                tree.box_source_counts_nonchild
+                tree.box_source_counts_nonchild,
+                queue=actx.queue,
             )
 
         # List 4 close
@@ -930,7 +936,8 @@ def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext,
                 ndirect_sources_by_itgt_box,
                 traversal.from_sep_close_bigger_starts,
                 traversal.from_sep_close_bigger_lists,
-                tree.box_source_counts_nonchild
+                tree.box_source_counts_nonchild,
+                queue=actx.queue,
             )
 
         return ndirect_sources_by_itgt_box
@@ -950,33 +957,47 @@ def process_direct(self, actx: PyOpenCLArrayContext,
 
     # {{{ translate separated siblings' ("list 2") mpoles to local
 
-    @memoize_method
-    def process_list2_knl(self, actx: PyOpenCLArrayContext,
-                          box_id_dtype, box_level_dtype):
-        return ElementwiseKernel(
-            actx.context,
-            Template(r"""
-                double *nm2l,
-                ${box_id_t} *target_or_target_parent_boxes,
-                ${box_id_t} *from_sep_siblings_starts,
-                ${box_level_t} *box_levels,
-                double *m2l_cost
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                box_level_t=dtype_to_ctype(box_level_dtype)
-            ),
-            Template(r"""
-                ${box_id_t} start = from_sep_siblings_starts[i];
-                ${box_id_t} end = from_sep_siblings_starts[i+1];
-                ${box_level_t} ilevel = box_levels[target_or_target_parent_boxes[i]];
-
-                nm2l[i] = (end - start) * m2l_cost[ilevel];
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                box_level_t=dtype_to_ctype(box_level_dtype)
-            ),
-            name="process_list2"
-        )
+    def process_list2_kernel(self, actx: PyOpenCLArrayContext,
+                             box_id_dtype, box_level_dtype):
+        """Return the elementwise kernel behind :meth:`process_list2`.
+
+        For each entry ``i`` of ``target_or_target_parent_boxes`` the kernel
+        stores the length of its ``from_sep_siblings_starts`` range times
+        ``m2l_cost`` at the box's level in ``nm2l[i]``. The kernel is cached
+        on *actx* via :func:`pytools.memoize_in`, keyed on this method and
+        the dtypes.
+        """
+        cache_key = (
+            FMMCostModel.process_list2_kernel, box_id_dtype, box_level_dtype)
+
+        @memoize_in(actx, cache_key)
+        def build_kernel():
+            # The same C type names are substituted into both templates.
+            template_vars = {
+                "box_id_t": dtype_to_ctype(box_id_dtype),
+                "box_level_t": dtype_to_ctype(box_level_dtype),
+            }
+
+            arguments = Template(r"""
+                    double *nm2l,
+                    ${box_id_t} *target_or_target_parent_boxes,
+                    ${box_id_t} *from_sep_siblings_starts,
+                    ${box_level_t} *box_levels,
+                    double *m2l_cost
+                """).render(**template_vars)
+            operation = Template(r"""
+                    ${box_id_t} start = from_sep_siblings_starts[i];
+                    ${box_id_t} end = from_sep_siblings_starts[i+1];
+                    ${box_level_t} ilevel = box_levels[target_or_target_parent_boxes[i]];
+
+                    nm2l[i] = (end - start) * m2l_cost[ilevel];
+                """).render(**template_vars)    # noqa: E501
+
+            return ElementwiseKernel(
+                actx.context, arguments, operation,
+                name="process_list2")
+
+        return build_kernel()
 
     def process_list2(self, actx, traversal, m2l_cost):
         tree = traversal.tree
@@ -986,7 +997,7 @@ def process_list2(self, actx, traversal, m2l_cost):
         ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes)
         nm2l = actx.zeros((ntarget_or_target_parent_boxes,), dtype=np.float64)
 
-        process_list2_knl = self.process_list2_knl(
+        process_list2_knl = self.process_list2_kernel(
             actx, box_id_dtype, box_level_dtype
         )
         process_list2_knl(
@@ -1004,35 +1015,51 @@ def process_list2(self, actx, traversal, m2l_cost):
 
     # {{{ evaluate sep. smaller mpoles ("list 3") at particles
 
-    @memoize_method
-    def process_list3_knl(self, actx: PyOpenCLArrayContext,
-                          box_id_dtype, particle_id_dtype):
-        return ElementwiseKernel(
-            actx.context,
-            Template(r"""
-                ${box_id_t} *target_boxes_sep_smaller,
-                ${box_id_t} *sep_smaller_start,
-                ${particle_id_t} *box_target_counts_nonchild,
-                double m2p_cost_current_level,
-                double *nm2p
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                particle_id_t=dtype_to_ctype(particle_id_dtype)
-            ),
-            Template(r"""
-                ${box_id_t} target_box = target_boxes_sep_smaller[i];
-                ${box_id_t} start = sep_smaller_start[i];
-                ${box_id_t} end = sep_smaller_start[i+1];
-                ${particle_id_t} ntargets = box_target_counts_nonchild[target_box];
-                nm2p[target_box] += (
-                    ntargets * (end - start) * m2p_cost_current_level
-                );
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                particle_id_t=dtype_to_ctype(particle_id_dtype)
-            ),
-            name="process_list3"
-        )
+    def process_list3_kernel(self, actx: PyOpenCLArrayContext,
+                             box_id_dtype, particle_id_dtype):
+        """Return the elementwise kernel behind :meth:`process_list3`.
+
+        For each entry ``i`` of ``target_boxes_sep_smaller`` the kernel adds
+        ``ntargets * (end - start) * m2p_cost_current_level`` to
+        ``nm2p[target_box]``, where *ntargets* is the box's
+        ``box_target_counts_nonchild`` entry and *start*/*end* delimit its
+        ``sep_smaller_start`` range. The kernel is cached on *actx* via
+        :func:`pytools.memoize_in`, keyed on this method and the dtypes.
+        """
+        cache_key = (
+            FMMCostModel.process_list3_kernel,
+            box_id_dtype, particle_id_dtype)
+
+        @memoize_in(actx, cache_key)
+        def build_kernel():
+            # The same C type names are substituted into both templates.
+            template_vars = {
+                "box_id_t": dtype_to_ctype(box_id_dtype),
+                "particle_id_t": dtype_to_ctype(particle_id_dtype),
+            }
+
+            arguments = Template(r"""
+                    ${box_id_t} *target_boxes_sep_smaller,
+                    ${box_id_t} *sep_smaller_start,
+                    ${particle_id_t} *box_target_counts_nonchild,
+                    double m2p_cost_current_level,
+                    double *nm2p
+                """).render(**template_vars)
+            operation = Template(r"""
+                    ${box_id_t} target_box = target_boxes_sep_smaller[i];
+                    ${box_id_t} start = sep_smaller_start[i];
+                    ${box_id_t} end = sep_smaller_start[i+1];
+                    ${particle_id_t} ntargets = box_target_counts_nonchild[target_box];
+                    nm2p[target_box] += (
+                        ntargets * (end - start) * m2p_cost_current_level
+                    );
+                """).render(**template_vars)    # noqa: E501
+
+            return ElementwiseKernel(
+                actx.context, arguments, operation,
+                name="process_list3")
+
+        return build_kernel()
 
     def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost,
                       box_target_counts_nonchild=None):
@@ -1042,7 +1058,7 @@ def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost,
         if box_target_counts_nonchild is None:
             box_target_counts_nonchild = tree.box_target_counts_nonchild
 
-        process_list3_knl = self.process_list3_knl(
+        process_list3_knl = self.process_list3_kernel(
             actx, tree.box_id_dtype, tree.particle_id_dtype
         )
 
@@ -1054,7 +1070,7 @@ def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost,
                 box_target_counts_nonchild,
                 actx.to_numpy(m2p_cost[ilevel]).reshape(-1)[0],
                 nm2p,
-                queue=actx.queue
+                queue=actx.queue,
             )
 
         return nm2p
@@ -1063,46 +1079,61 @@ def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost,
 
     # {{{ form locals for separated bigger source boxes ("list 4")
 
-    @memoize_method
-    def process_list4_knl(self, actx: PyOpenCLArrayContext,
-                          box_id_dtype, particle_id_dtype, box_level_dtype):
-        return ElementwiseKernel(
-            actx.context,
-            Template(r"""
-                double *nm2p,
-                ${box_id_t} *from_sep_bigger_starts,
-                ${box_id_t} *from_sep_bigger_lists,
-                ${particle_id_t} *box_source_counts_nonchild,
-                ${box_level_t} *box_levels,
-                double *p2l_cost
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                particle_id_t=dtype_to_ctype(particle_id_dtype),
-                box_level_t=dtype_to_ctype(box_level_dtype)
-            ),
-            Template(r"""
-                ${box_id_t} start = from_sep_bigger_starts[i];
-                ${box_id_t} end = from_sep_bigger_starts[i+1];
-                for(${box_id_t} idx=start; idx < end; idx++) {
-                    ${box_id_t} src_ibox = from_sep_bigger_lists[idx];
-                    ${particle_id_t} nsources = box_source_counts_nonchild[src_ibox];
-                    ${box_level_t} ilevel = box_levels[src_ibox];
-                    nm2p[i] += nsources * p2l_cost[ilevel];
-                }
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                particle_id_t=dtype_to_ctype(particle_id_dtype),
-                box_level_t=dtype_to_ctype(box_level_dtype)
-            ),
-            name="process_list4"
-        )
+    def process_list4_kernel(self, actx: PyOpenCLArrayContext,
+                             box_id_dtype, particle_id_dtype, box_level_dtype):
+        """Return the elementwise kernel behind :meth:`process_list4`.
+
+        For each entry ``i`` the kernel visits the source boxes in its
+        ``from_sep_bigger_starts``/``from_sep_bigger_lists`` range and adds
+        ``box_source_counts_nonchild[src] * p2l_cost[box_levels[src]]`` for
+        each of them to the output, which the kernel source names ``nm2p``.
+        The kernel is cached on *actx* via :func:`pytools.memoize_in`, keyed
+        on this method and the dtypes.
+        """
+        cache_key = (
+            FMMCostModel.process_list4_kernel,
+            box_id_dtype, particle_id_dtype, box_level_dtype)
+
+        @memoize_in(actx, cache_key)
+        def build_kernel():
+            # The same C type names are substituted into both templates.
+            template_vars = {
+                "box_id_t": dtype_to_ctype(box_id_dtype),
+                "particle_id_t": dtype_to_ctype(particle_id_dtype),
+                "box_level_t": dtype_to_ctype(box_level_dtype),
+            }
+
+            arguments = Template(r"""
+                    double *nm2p,
+                    ${box_id_t} *from_sep_bigger_starts,
+                    ${box_id_t} *from_sep_bigger_lists,
+                    ${particle_id_t} *box_source_counts_nonchild,
+                    ${box_level_t} *box_levels,
+                    double *p2l_cost
+                """).render(**template_vars)
+            operation = Template(r"""
+                    ${box_id_t} start = from_sep_bigger_starts[i];
+                    ${box_id_t} end = from_sep_bigger_starts[i+1];
+                    for(${box_id_t} idx=start; idx < end; idx++) {
+                        ${box_id_t} src_ibox = from_sep_bigger_lists[idx];
+                        ${particle_id_t} nsources = box_source_counts_nonchild[src_ibox];
+                        ${box_level_t} ilevel = box_levels[src_ibox];
+                        nm2p[i] += nsources * p2l_cost[ilevel];
+                    }
+                """).render(**template_vars)    # noqa: E501
+
+            return ElementwiseKernel(
+                actx.context, arguments, operation,
+                name="process_list4")
+
+        return build_kernel()
 
     def process_list4(self, actx, traversal, p2l_cost):
         tree = traversal.tree
         target_or_target_parent_boxes = traversal.target_or_target_parent_boxes
         nm2p = actx.zeros(len(target_or_target_parent_boxes), dtype=np.float64)
 
-        process_list4_knl = self.process_list4_knl(
+        process_list4_knl = self.process_list4_kernel(
             actx,
             tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
         )
@@ -1114,7 +1135,7 @@ def process_list4(self, actx, traversal, p2l_cost):
             tree.box_source_counts_nonchild,
             tree.box_levels,
             p2l_cost,
-            queue=actx.queue
+            queue=actx.queue,
         )
 
         return nm2p
@@ -1123,34 +1144,47 @@ def process_list4(self, actx, traversal, p2l_cost):
 
     # {{{ evaluate local expansions at targets
 
-    @memoize_method
-    def process_eval_locals_knl(self, actx: PyOpenCLArrayContext,
-                                box_id_dtype, particle_id_dtype, box_level_dtype):
-        return ElementwiseKernel(
-            actx.context,
-            Template(r"""
-                double *neval_locals,
-                ${box_id_t} *target_boxes,
-                ${particle_id_t} *box_target_counts_nonchild,
-                ${box_level_t} *box_levels,
-                double *l2p_cost
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                particle_id_t=dtype_to_ctype(particle_id_dtype),
-                box_level_t=dtype_to_ctype(box_level_dtype)
-            ),
-            Template(r"""
-                ${box_id_t} box_idx = target_boxes[i];
-                ${particle_id_t} ntargets = box_target_counts_nonchild[box_idx];
-                ${box_level_t} ilevel = box_levels[box_idx];
-                neval_locals[i] = ntargets * l2p_cost[ilevel];
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-                particle_id_t=dtype_to_ctype(particle_id_dtype),
-                box_level_t=dtype_to_ctype(box_level_dtype)
-            ),
-            name="process_eval_locals"
-        )
+    def process_eval_locals_kernel(self, actx: PyOpenCLArrayContext,
+                                   box_id_dtype, particle_id_dtype, box_level_dtype):
+        """Return the elementwise kernel behind :meth:`process_eval_locals`.
+
+        For each entry ``i`` of ``target_boxes`` the kernel stores
+        ``box_target_counts_nonchild[box] * l2p_cost[box_levels[box]]`` in
+        ``neval_locals[i]``. The kernel is cached on *actx* via
+        :func:`pytools.memoize_in`, keyed on this method and the dtypes.
+        """
+        cache_key = (
+            FMMCostModel.process_eval_locals_kernel,
+            box_id_dtype, particle_id_dtype, box_level_dtype)
+
+        @memoize_in(actx, cache_key)
+        def build_kernel():
+            # The same C type names are substituted into both templates.
+            template_vars = {
+                "box_id_t": dtype_to_ctype(box_id_dtype),
+                "particle_id_t": dtype_to_ctype(particle_id_dtype),
+                "box_level_t": dtype_to_ctype(box_level_dtype),
+            }
+
+            arguments = Template(r"""
+                    double *neval_locals,
+                    ${box_id_t} *target_boxes,
+                    ${particle_id_t} *box_target_counts_nonchild,
+                    ${box_level_t} *box_levels,
+                    double *l2p_cost
+                """).render(**template_vars)
+            operation = Template(r"""
+                    ${box_id_t} box_idx = target_boxes[i];
+                    ${particle_id_t} ntargets = box_target_counts_nonchild[box_idx];
+                    ${box_level_t} ilevel = box_levels[box_idx];
+                    neval_locals[i] = ntargets * l2p_cost[ilevel];
+                """).render(**template_vars)
+
+            return ElementwiseKernel(
+                actx.context, arguments, operation,
+                name="process_eval_locals")
+
+        return build_kernel()
 
     def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost,
                             box_target_counts_nonchild=None):
@@ -1161,7 +1188,7 @@ def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost,
         if box_target_counts_nonchild is None:
             box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
 
-        process_eval_locals_knl = self.process_eval_locals_knl(
+        process_eval_locals_knl = self.process_eval_locals_kernel(
             actx, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
         )
 
@@ -1170,7 +1197,8 @@ def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost,
             traversal.target_boxes,
             box_target_counts_nonchild,
             tree.box_levels,
-            l2p_cost
+            l2p_cost,
+            queue=actx.queue,
         )
 
         return neval_locals
@@ -1179,32 +1207,36 @@ def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost,
 
     # {{{ propagate locals downward
 
-    @memoize_method
-    def process_refine_locals_knl(self, actx: PyOpenCLArrayContext, box_id_dtype):
-        from pyopencl.reduction import ReductionKernel
-        return ReductionKernel(
-            actx.context,
-            np.float64,
-            neutral="0.0",
-            reduce_expr="a+b",
-            map_expr=r"""
-                (level_start_target_or_target_parent_box_nrs[i + 1]
-                 - level_start_target_or_target_parent_box_nrs[i])
-                 * l2l_cost[i - 1]
-            """,
-            arguments=Template(r"""
-                ${box_id_t} *level_start_target_or_target_parent_box_nrs,
-                double *l2l_cost
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype)
-            ),
-            name="process_refine_locals"
-        )
+    def process_refine_locals_kernel(self, actx: PyOpenCLArrayContext, box_id_dtype):
+        @memoize_in(actx, (
+            FMMCostModel.process_refine_locals_kernel, box_id_dtype))
+        def get_kernel():
+            from pyopencl.reduction import ReductionKernel
+            return ReductionKernel(
+                actx.context,
+                np.float64,
+                neutral="0.0",
+                reduce_expr="a+b",
+                map_expr=r"""
+                    (level_start_target_or_target_parent_box_nrs[i + 1]
+                    - level_start_target_or_target_parent_box_nrs[i])
+                    * l2l_cost[i - 1]
+                """,
+                arguments=Template(r"""
+                    ${box_id_t} *level_start_target_or_target_parent_box_nrs,
+                    double *l2l_cost
+                """).render(
+                    box_id_t=dtype_to_ctype(box_id_dtype)
+                ),
+                name="process_refine_locals"
+            )
+
+        return get_kernel()
 
     def process_refine_locals(self, actx: PyOpenCLArrayContext,
                               traversal, l2l_cost):
         tree = traversal.tree
-        process_refine_locals_knl = self.process_refine_locals_knl(
+        process_refine_locals_knl = self.process_refine_locals_kernel(
             actx, tree.box_id_dtype
         )
 
@@ -1215,7 +1247,9 @@ def process_refine_locals(self, actx: PyOpenCLArrayContext,
         cost = process_refine_locals_knl(
             level_start_target_or_target_parent_box_nrs,
             l2l_cost,
-            range=slice(1, tree.nlevels)
+            range=slice(1, tree.nlevels),
+            queue=actx.queue,
+            allocator=actx.allocator,
         )
 
         return actx.to_numpy(cost).reshape(-1)[0]
diff --git a/boxtree/distributed/__init__.py b/boxtree/distributed/__init__.py
index a1f3606b..bc747d66 100644
--- a/boxtree/distributed/__init__.py
+++ b/boxtree/distributed/__init__.py
@@ -88,7 +88,7 @@
 Distributed Wrangler
 --------------------
 
-.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWrangler
+.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWranglerMixin
 
 .. _distributed-fmm-evaluation:
 
@@ -97,23 +97,27 @@
 
 The distributed version of the FMM evaluation shares the same interface as the
 shared-memory version. To evaluate FMM in a distributed manner, use a subclass
-of :class:`boxtree.distributed.calculation.DistributedExpansionWrangler` in
-:func:`boxtree.fmm.drive_fmm`.
+of :class:`boxtree.distributed.calculation.DistributedExpansionWranglerMixin`
+in :func:`boxtree.fmm.drive_fmm`.
 
 """
 
-from mpi4py import MPI
-import numpy as np
-import pyopencl as cl
-import pyopencl.array
-from enum import IntEnum
+import enum
 import warnings
+
+import numpy as np
+from mpi4py import MPI
+
 from boxtree.cost import FMMCostModel
+from boxtree.array_context import PyOpenCLArrayContext
 
 __all__ = ["DistributedFMMRunner"]
 
 
-class MPITags(IntEnum):
+# {{{ MPI
+
+@enum.unique
+class MPITags(enum.IntEnum):
     DIST_WEIGHT = 1
     GATHER_POTENTIALS = 2
     REDUCE_POTENTIALS = 3
@@ -121,27 +125,36 @@ class MPITags(IntEnum):
 
 
 def dtype_to_mpi(dtype):
-    """ This function translates a numpy datatype into the corresponding type used in
+    """This function translates a numpy datatype into the corresponding type used in
     mpi4py.
     """
+
     if hasattr(MPI, "_typedict"):
-        mpi_type = MPI._typedict[np.dtype(dtype).char]
+        typedict = MPI._typedict
     elif hasattr(MPI, "__TypeDict__"):
-        mpi_type = MPI.__TypeDict__[np.dtype(dtype).char]
+        typedict = MPI.__TypeDict__
     else:
-        raise RuntimeError("There is no dictionary to translate from Numpy dtype to "
-                           "MPI type")
+        raise RuntimeError(
+            "There is no dictionary to translate from np.dtype to an MPI datatype")
+
+    mpi_type = typedict.get(np.dtype(dtype).char, None)
+    if mpi_type is None:
+        raise ValueError(f"Could not convert '{dtype}' to an MPI datatype")
+
     return mpi_type
 
+# }}}
+
+
+# {{{ DistributedFMMRunner
 
 def construct_distributed_wrangler(
-        queue, global_tree, traversal_builder, wrangler_factory,
+        actx: PyOpenCLArrayContext, global_tree, traversal_builder, wrangler_factory,
         calibration_params, comm):
     """Helper function for constructing the distributed wrangler on each rank.
 
     Note: This function needs to be called collectively on all ranks.
     """
-
     mpi_rank = comm.Get_rank()
 
     # `tree_in_device_memory` is True if the global tree is in the device memory
@@ -152,7 +165,7 @@ def construct_distributed_wrangler(
     # worker ranks.
     tree_in_device_memory = None
     if mpi_rank == 0:
-        tree_in_device_memory = isinstance(global_tree.targets[0], cl.array.Array)
+        tree_in_device_memory = isinstance(global_tree.targets[0], actx.array_types)
     tree_in_device_memory = comm.bcast(tree_in_device_memory, root=0)
 
     # {{{ Broadcast the global tree
@@ -160,7 +173,7 @@ def construct_distributed_wrangler(
     global_tree_host = None
     if mpi_rank == 0:
         if tree_in_device_memory:
-            global_tree_host = global_tree.get(queue)
+            global_tree_host = actx.to_numpy(global_tree)
         else:
             global_tree_host = global_tree
 
@@ -170,11 +183,11 @@ def construct_distributed_wrangler(
     if mpi_rank == 0 and tree_in_device_memory:
         global_tree_dev = global_tree
     else:
-        global_tree_dev = global_tree_host.to_device(queue)
-    global_tree_dev = global_tree_dev.with_queue(queue)
+        global_tree_dev = actx.from_numpy(global_tree_host)
+    global_tree_dev = actx.thaw(global_tree_dev)
 
-    global_trav_dev, _ = traversal_builder(queue, global_tree_dev)
-    global_trav_host = global_trav_dev.get(queue)
+    global_trav_dev = traversal_builder(actx, global_tree_dev)
+    global_trav_host = actx.to_numpy(global_trav_dev)
 
     if tree_in_device_memory:
         global_trav = global_trav_dev
@@ -196,16 +209,16 @@ def construct_distributed_wrangler(
             # accurate one
             warnings.warn("Calibration parameters for the cost model are not "
                         "supplied. The default one will be used.")
-            calibration_params = \
-                FMMCostModel.get_unit_calibration_params()
+            calibration_params = FMMCostModel.get_unit_calibration_params()
 
         # We need to construct a wrangler in order to access `level_orders`
         global_wrangler = wrangler_factory(global_trav, global_trav)
 
         cost_per_box = cost_model.cost_per_box(
-            queue, global_trav_dev, global_wrangler.level_orders,
+            actx, global_trav_dev, global_wrangler.level_orders,
             calibration_params
-        ).get()
+        )
+        cost_per_box = actx.to_numpy(cost_per_box)
 
     from boxtree.distributed.partition import partition_work
     responsible_boxes_list = partition_work(cost_per_box, global_trav_host, comm)
@@ -216,7 +229,7 @@ def construct_distributed_wrangler(
 
     from boxtree.distributed.local_tree import generate_local_tree
     local_tree, src_idx, tgt_idx = generate_local_tree(
-        queue, global_trav_host, responsible_boxes_list, comm)
+        actx, global_trav_dev, actx.from_numpy(responsible_boxes_list), comm)
 
     # }}}
 
@@ -230,12 +243,12 @@ def construct_distributed_wrangler(
     # {{{ Compute traversal object on each rank
 
     from boxtree.distributed.local_traversal import generate_local_travs
-    local_trav_dev = generate_local_travs(queue, local_tree, traversal_builder)
+    local_trav_dev = generate_local_travs(actx, local_tree, traversal_builder)
 
     if not tree_in_device_memory:
-        local_trav = local_trav_dev.get(queue=queue)
+        local_trav = actx.to_numpy(local_trav_dev)
     else:
-        local_trav = local_trav_dev.with_queue(None)
+        local_trav = actx.freeze(local_trav_dev)
 
     # }}}
 
@@ -250,7 +263,7 @@ class DistributedFMMRunner:
     .. automethod:: __init__
     .. automethod:: drive_dfmm
     """
-    def __init__(self, queue, global_tree,
+    def __init__(self, array_context: PyOpenCLArrayContext, global_tree,
                  traversal_builder,
                  wrangler_factory,
                  calibration_params=None, comm=MPI.COMM_WORLD):
@@ -273,15 +286,22 @@ def __init__(self, queue, global_tree,
         """
         self.wrangler, self.src_idx_all_ranks, self.tgt_idx_all_ranks = \
             construct_distributed_wrangler(
-                queue, global_tree, traversal_builder, wrangler_factory,
+                array_context, global_tree, traversal_builder, wrangler_factory,
                 calibration_params, comm)
 
-    def drive_dfmm(self, source_weights, timing_data=None):
-        """Calculate potentials at target points.
-        """
+    def drive_dfmm(self,
+            actx: PyOpenCLArrayContext,
+            source_weights,
+            timing_data=None):
+        """Calculate potentials at target points."""
         from boxtree.fmm import drive_fmm
         return drive_fmm(
+            actx,
             self.wrangler, source_weights,
             timing_data=timing_data,
             global_src_idx_all_ranks=self.src_idx_all_ranks,
             global_tgt_idx_all_ranks=self.tgt_idx_all_ranks)
+
+# }}}
+
+# vim: fdm=marker
diff --git a/boxtree/distributed/calculation.py b/boxtree/distributed/calculation.py
index 0fcf3aec..bfd41c65 100644
--- a/boxtree/distributed/calculation.py
+++ b/boxtree/distributed/calculation.py
@@ -24,51 +24,97 @@
 """
 
 import numpy as np
-import pyopencl as cl
-from boxtree.distributed import MPITags
 from mpi4py import MPI
-from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
-from boxtree.fmm import ExpansionWranglerInterface
-from pytools import memoize_method
+
 from pyopencl.tools import dtype_to_ctype
 from pyopencl.elementwise import ElementwiseKernel
+
+from pytools import memoize_method, memoize_on_first_arg
 from mako.template import Template
 
+from boxtree.distributed import MPITags
+from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
+from boxtree.array_context import PyOpenCLArrayContext
+
 import logging
 logger = logging.getLogger(__name__)
 
 
 # {{{ Distributed FMM wrangler
 
-class DistributedExpansionWrangler(ExpansionWranglerInterface):
-    """Distributed expansion wrangler base class.
+@memoize_on_first_arg
+def get_find_boxes_used_by_subrange_kernel(
+        actx: PyOpenCLArrayContext,
+        box_id_dtype: "np.dtype"):
+    return ElementwiseKernel(
+        actx.context,
+        Template(r"""
+            ${box_id_t} *contributing_boxes_list,
+            int subrange_start,
+            int subrange_end,
+            ${box_id_t} *box_to_user_rank_starts,
+            int *box_to_user_rank_lists,
+            char *box_in_subrange
+        """).render(
+            box_id_t=dtype_to_ctype(box_id_dtype),
+        ),
+        Template(r"""
+            ${box_id_t} ibox = contributing_boxes_list[i];
+            ${box_id_t} iuser_start = box_to_user_rank_starts[ibox];
+            ${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1];
+            for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) {
+                int useri = box_to_user_rank_lists[iuser];
+                if(subrange_start <= useri && useri < subrange_end) {
+                    box_in_subrange[i] = 1;
+                }
+            }
+        """).render(    # noqa: E501
+            box_id_t=dtype_to_ctype(box_id_dtype)
+        ),
+        "find_boxes_used_by_subrange"
+    )
+
 
-    This is an abstract class and should not be directly instantiated. Instead, it is
-    expected that all distributed wranglers should be subclasses of this class.
+class DistributedExpansionWranglerMixin:
+    """Distributed expansion wrangler helper class.
+
+    This class is meant to aid in adding distributed capabilities to wranglers.
+    All distributed wranglers should inherit from this class.
+
+    .. attribute:: comm
+    .. attribute:: global_traversal
+    .. attribute:: communicate_mpoles_via_allreduce
 
-    .. automethod:: __init__
     .. automethod:: distribute_source_weights
     .. automethod:: gather_potential_results
     .. automethod:: communicate_mpoles
     """
-    def __init__(self, context, comm, global_traversal,
-                 traversal_in_device_memory,
-                 communicate_mpoles_via_allreduce=False):
-        self.context = context
-        self.comm = comm
-        self.global_traversal = global_traversal
-        self.traversal_in_device_memory = traversal_in_device_memory
-        self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce
 
-    def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
-        mpi_rank = self.comm.Get_rank()
-        mpi_size = self.comm.Get_size()
+    @property
+    def context(self):
+        return self._setup_actx.context
+
+    @property
+    @memoize_method
+    def mpi_rank(self):
+        return self.comm.Get_rank()
+
+    @property
+    @memoize_method
+    def mpi_size(self):
+        return self.comm.Get_size()
 
-        if mpi_rank == 0:
+    @property
+    def is_mpi_root(self):
+        return self.mpi_rank == 0
+
+    def distribute_source_weights(self,
+            actx: PyOpenCLArrayContext, src_weight_vecs, src_idx_all_ranks):
+        if self.is_mpi_root:
             distribute_weight_req = []
-            local_src_weight_vecs = np.empty((mpi_size,), dtype=object)
+            local_src_weight_vecs = np.empty((self.mpi_size,), dtype=object)
 
-            for irank in range(mpi_size):
+            for irank in range(self.mpi_size):
                 local_src_weight_vecs[irank] = [
                     source_weights[src_idx_all_ranks[irank]]
                     for source_weights in src_weight_vecs]
@@ -86,23 +132,20 @@ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
 
         return local_src_weight_vecs
 
-    def gather_potential_results(self, potentials, tgt_idx_all_ranks):
-        mpi_rank = self.comm.Get_rank()
-        mpi_size = self.comm.Get_size()
-
+    def gather_potential_results(self,
+            actx: PyOpenCLArrayContext, potentials, tgt_idx_all_ranks):
         from boxtree.distributed import dtype_to_mpi
         potentials_mpi_type = dtype_to_mpi(potentials.dtype)
-
         gathered_potentials = None
 
-        if mpi_rank == 0:
+        if self.is_mpi_root:
             # The root rank received calculated potentials from all worker ranks
-            potentials_all_ranks = np.empty((mpi_size,), dtype=object)
+            potentials_all_ranks = np.empty((self.mpi_size,), dtype=object)
             potentials_all_ranks[0] = potentials
 
             recv_reqs = []
 
-            for irank in range(1, mpi_size):
+            for irank in range(1, self.mpi_size):
                 potentials_all_ranks[irank] = np.empty(
                     tgt_idx_all_ranks[irank].shape, dtype=potentials.dtype)
 
@@ -117,7 +160,7 @@ def gather_potential_results(self, potentials, tgt_idx_all_ranks):
             gathered_potentials = np.empty(
                 self.global_traversal.tree.ntargets, dtype=potentials.dtype)
 
-            for irank in range(mpi_size):
+            for irank in range(self.mpi_size):
                 gathered_potentials[tgt_idx_all_ranks[irank]] = (
                     potentials_all_ranks[irank])
         else:
@@ -131,8 +174,13 @@ def _slice_mpoles(self, mpoles, slice_indices):
         if len(slice_indices) == 0:
             return np.empty((0,), dtype=mpoles.dtype)
 
+        level_start_box_nrs = self.traversal.tree.level_start_box_nrs
+        if not isinstance(level_start_box_nrs, np.ndarray):
+            level_start_box_nrs = self._setup_actx.to_numpy(
+                level_start_box_nrs)
+
         level_start_slice_indices = np.searchsorted(
-            slice_indices, self.traversal.tree.level_start_box_nrs)
+            slice_indices, level_start_box_nrs)
         mpoles_list = []
 
         for ilevel in range(self.traversal.tree.nlevels):
@@ -152,8 +200,13 @@ def _update_mpoles(self, mpoles, mpole_updates, slice_indices):
         if len(slice_indices) == 0:
             return
 
+        level_start_box_nrs = self.traversal.tree.level_start_box_nrs
+        if not isinstance(level_start_box_nrs, np.ndarray):
+            level_start_box_nrs = self._setup_actx.to_numpy(
+                level_start_box_nrs)
+
         level_start_slice_indices = np.searchsorted(
-            slice_indices, self.traversal.tree.level_start_box_nrs)
+            slice_indices, level_start_box_nrs)
         mpole_updates_start = 0
 
         for ilevel in range(self.traversal.tree.nlevels):
@@ -174,60 +227,26 @@ def _update_mpoles(self, mpoles, mpole_updates, slice_indices):
 
                 mpole_updates_start = mpole_updates_end
 
-    @memoize_method
-    def find_boxes_used_by_subrange_kernel(self, box_id_dtype):
-        return ElementwiseKernel(
-            self.context,
-            Template(r"""
-                ${box_id_t} *contributing_boxes_list,
-                int subrange_start,
-                int subrange_end,
-                ${box_id_t} *box_to_user_rank_starts,
-                int *box_to_user_rank_lists,
-                char *box_in_subrange
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-            ),
-            Template(r"""
-                ${box_id_t} ibox = contributing_boxes_list[i];
-                ${box_id_t} iuser_start = box_to_user_rank_starts[ibox];
-                ${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1];
-                for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) {
-                    int useri = box_to_user_rank_lists[iuser];
-                    if(subrange_start <= useri && useri < subrange_end) {
-                        box_in_subrange[i] = 1;
-                    }
-                }
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype)
-            ),
-            "find_boxes_used_by_subrange"
-        )
-
     def find_boxes_used_by_subrange(
-            self, subrange, box_to_user_rank_starts, box_to_user_rank_lists,
+            self, actx: PyOpenCLArrayContext,
+            subrange, box_to_user_rank_starts, box_to_user_rank_lists,
             contributing_boxes_list):
         """Test whether the multipole expansions of the contributing boxes are used
         by at least one box in a range.
 
         :arg subrange: the range is represented by ``(subrange[0], subrange[1])``.
-        :arg box_to_user_rank_starts: a :class:`pyopencl.array.Array` object
-            indicating the start and end index in *box_to_user_rank_lists* for each
+        :arg box_to_user_rank_starts: an array object indicating the start and
+            end index in *box_to_user_rank_lists* for each box in
+            *contributing_boxes_list*.
+        :arg box_to_user_rank_lists: an array object storing the users of each
             box in *contributing_boxes_list*.
-        :arg box_to_user_rank_lists: a :class:`pyopencl.array.Array` object storing
-            the users of each box in *contributing_boxes_list*.
-        :returns: a :class:`pyopencl.array.Array` object with the same shape as
-            *contributing_boxes_list*, where the i-th entry is 1 if
-            ``contributing_boxes_list[i]`` is used by at least on box in the
-            subrange specified.
+        :returns: an array object with the same shape as *contributing_boxes_list*,
+            where the i-th entry is 1 if ``contributing_boxes_list[i]`` is used
+            by at least one box in the subrange specified.
         """
-        box_in_subrange = cl.array.zeros(
-            contributing_boxes_list.queue,
-            contributing_boxes_list.shape[0],
-            dtype=np.int8
-        )
-        knl = self.find_boxes_used_by_subrange_kernel(
-                self.traversal.tree.box_id_dtype)
+        box_in_subrange = actx.zeros(contributing_boxes_list.shape[0], dtype=np.int8)
+        knl = get_find_boxes_used_by_subrange_kernel(
+            actx, self.traversal.tree.box_id_dtype)
 
         knl(
             contributing_boxes_list,
@@ -240,7 +259,8 @@ def find_boxes_used_by_subrange(
 
         return box_in_subrange
 
-    def communicate_mpoles(self, mpole_exps, return_stats=False):
+    def communicate_mpoles(self,
+            actx: PyOpenCLArrayContext, mpole_exps, return_stats=False):
         """Based on Algorithm 3: Reduce and Scatter in Lashuk et al. [1]_.
 
         The main idea is to mimic an allreduce as done on a hypercube network, but to
@@ -249,12 +269,12 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
 
         .. [1] Lashuk, Ilya, Aparna Chandramowlishwaran, Harper Langston,
             Tuan-Anh Nguyen, Rahul Sampath, Aashay Shringarpure, Richard Vuduc,
-            Lexing Ying, Denis Zorin, and George Biros. “A massively parallel
-            adaptive fast multipole method on heterogeneous architectures."
-            Communications of the ACM 55, no. 5 (2012): 101-109.
+            Lexing Ying, Denis Zorin, and George Biros. "A massively parallel
+            adaptive fast multipole method on heterogeneous architectures",
+            Communications of the ACM 55, no. 5 (2012): 101-109,
+            `DOI <https://doi.org/10.1145/1654059.1654118>`__.
         """
-        mpi_rank = self.comm.Get_rank()
-        mpi_size = self.comm.Get_size()
+        actx = self._setup_actx
         tree = self.traversal.tree
 
         if self.communicate_mpoles_via_allreduce:
@@ -280,16 +300,15 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
         # Initially, this set consists of the boxes satisfying condition (a), which
         # are precisely the boxes owned by this process and their ancestors.
         if self.traversal_in_device_memory:
-            with cl.CommandQueue(self.context) as queue:
-                contributing_boxes = tree.ancestor_mask.get(queue=queue)
-                responsible_boxes_list = tree.responsible_boxes_list.get(queue=queue)
+            contributing_boxes = actx.to_numpy(tree.ancestor_mask)
+            responsible_boxes_list = actx.to_numpy(tree.responsible_boxes_list)
         else:
-            contributing_boxes = tree.ancestor_mask.copy()
+            contributing_boxes = np.copy(tree.ancestor_mask)
             responsible_boxes_list = tree.responsible_boxes_list
         contributing_boxes[responsible_boxes_list] = 1
 
         from boxtree.tools import AllReduceCommPattern
-        comm_pattern = AllReduceCommPattern(mpi_rank, mpi_size)
+        comm_pattern = AllReduceCommPattern(self.mpi_rank, self.mpi_size)
 
         # Temporary buffers for receiving data
         mpole_exps_buf = np.empty(mpole_exps.shape, dtype=mpole_exps.dtype)
@@ -299,15 +318,13 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
         stats["bytes_recvd_by_stage"] = []
 
         if self.traversal_in_device_memory:
-            box_to_user_rank_starts_dev = \
-                tree.box_to_user_rank_starts.with_queue(None)
-            box_to_user_rank_lists_dev = tree.box_to_user_rank_lists.with_queue(None)
+            box_to_user_rank_starts_dev = actx.freeze(tree.box_to_user_rank_starts)
+            box_to_user_rank_lists_dev = actx.freeze(tree.box_to_user_rank_lists)
         else:
-            with cl.CommandQueue(self.context) as queue:
-                box_to_user_rank_starts_dev = cl.array.to_device(
-                    queue, tree.box_to_user_rank_starts).with_queue(None)
-                box_to_user_rank_lists_dev = cl.array.to_device(
-                    queue, tree.box_to_user_rank_lists).with_queue(None)
+            box_to_user_rank_starts_dev = actx.freeze(
+                actx.from_numpy(tree.box_to_user_rank_starts))
+            box_to_user_rank_lists_dev = actx.freeze(
+                actx.from_numpy(tree.box_to_user_rank_lists))
 
         while not comm_pattern.done():
             send_requests = []
@@ -321,18 +338,15 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
                     tree.box_id_dtype
                 )
 
-                with cl.CommandQueue(self.context) as queue:
-                    contributing_boxes_list_dev = cl.array.to_device(
-                        queue, contributing_boxes_list)
-
-                    box_in_subrange = self.find_boxes_used_by_subrange(
-                        message_subrange,
-                        box_to_user_rank_starts_dev, box_to_user_rank_lists_dev,
-                        contributing_boxes_list_dev
-                    )
-
-                    box_in_subrange_host = box_in_subrange.get().astype(bool)
+                contributing_boxes_list_dev = actx.from_numpy(
+                    contributing_boxes_list)
+                box_in_subrange = self.find_boxes_used_by_subrange(
+                    actx, message_subrange,
+                    box_to_user_rank_starts_dev, box_to_user_rank_lists_dev,
+                    contributing_boxes_list_dev
+                )
 
+                box_in_subrange_host = actx.to_numpy(box_in_subrange).astype(bool)
                 relevant_boxes_list = contributing_boxes_list[
                     box_in_subrange_host
                 ].astype(tree.box_id_dtype)
@@ -381,7 +395,7 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
 
                 # Update data structures.
                 self._update_mpoles(
-                        mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes])
+                    mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes])
 
                 contributing_boxes[boxes_list_buf[:nboxes]] = 1
 
@@ -393,38 +407,41 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
         if return_stats:
             return stats
 
-    def finalize_potentials(self, potentials, template_ary):
-        if self.comm.Get_rank() == 0:
-            return super().finalize_potentials(potentials, template_ary)
-        else:
-            return None
-
 
 class DistributedFMMLibExpansionWrangler(
-        DistributedExpansionWrangler, FMMLibExpansionWrangler):
+            DistributedExpansionWranglerMixin,
+            FMMLibExpansionWrangler):
     def __init__(
-            self, context, comm, tree_indep, local_traversal, global_traversal,
+            self, array_context, comm, tree_indep, local_traversal, global_traversal,
             fmm_level_to_order=None,
             communicate_mpoles_via_allreduce=False,
             **kwargs):
-        DistributedExpansionWrangler.__init__(
-            self, context, comm, global_traversal, False,
-            communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
         FMMLibExpansionWrangler.__init__(
             self, tree_indep, local_traversal,
             fmm_level_to_order=fmm_level_to_order, **kwargs)
 
-    #TODO: use log_process like FMMLibExpansionWrangler?
+        self._setup_actx = array_context
+        self.comm = comm
+        self.traversal_in_device_memory = False
+        self.global_traversal = global_traversal
+        self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce
+
     def reorder_sources(self, source_array):
-        if self.comm.Get_rank() == 0:
+        if self.is_mpi_root:
             return source_array[..., self.global_traversal.tree.user_source_ids]
         else:
             return None
 
     def reorder_potentials(self, potentials):
-        if self.comm.Get_rank() == 0:
+        if self.is_mpi_root:
             return potentials[self.global_traversal.tree.sorted_target_ids]
         else:
             return None
 
+    def finalize_potentials(self, potentials, template_ary):
+        if self.is_mpi_root:
+            return super().finalize_potentials(potentials, template_ary)
+        else:
+            return None
+
 # }}}
diff --git a/boxtree/distributed/local_traversal.py b/boxtree/distributed/local_traversal.py
index 60eb3689..4752ee37 100644
--- a/boxtree/distributed/local_traversal.py
+++ b/boxtree/distributed/local_traversal.py
@@ -28,34 +28,30 @@
 
 
 def generate_local_travs(
-        queue, local_tree, traversal_builder, merge_close_lists=False):
+        actx, local_tree, traversal_builder, merge_close_lists=False):
     """Generate local traversal from local tree.
 
-    :arg queue: a :class:`pyopencl.CommandQueue` object.
-    :arg local_tree: the local tree of class
-        `boxtree.tools.ImmutableHostDeviceArray` on which the local traversal
-        object will be constructed.
-    :arg traversal_builder: a function, taken a :class:`pyopencl.CommandQueue` and
-        a tree, returns the traversal object based on the tree.
+    :arg local_tree: the local tree on which the local traversal object will
+        be constructed.
+    :arg traversal_builder: a callable taking an :class:`arraycontext.ArrayContext`
+        and a tree, and returning the traversal object based on the tree.
 
     :return: generated local traversal object in device memory
     """
     start_time = time.time()
 
-    local_tree.with_queue(queue)
-
     # We need `source_boxes_mask` and `source_parent_boxes_mask` here to restrict the
     # multipole formation and upward propagation within the rank's responsible boxes
     # region. Had there not been such restrictions, some sources might be distributed
     # to more than 1 rank and counted multiple times.
-    local_trav, _ = traversal_builder(
-        queue, local_tree.to_device(queue),
-        source_boxes_mask=local_tree.responsible_boxes_mask.device,
-        source_parent_boxes_mask=local_tree.ancestor_mask.device
+    local_trav = traversal_builder(
+        actx, local_tree,
+        source_boxes_mask=local_tree.responsible_boxes_mask,
+        source_parent_boxes_mask=local_tree.ancestor_mask
     )
 
     if merge_close_lists and local_tree.targets_have_extent:
-        local_trav = local_trav.merge_close_lists(queue)
+        local_trav = local_trav.merge_close_lists(actx)
 
     logger.info("Generate local traversal in {} sec.".format(
         str(time.time() - start_time))
diff --git a/boxtree/distributed/local_tree.py b/boxtree/distributed/local_tree.py
index 32e878ba..5ce6cfd5 100644
--- a/boxtree/distributed/local_tree.py
+++ b/boxtree/distributed/local_tree.py
@@ -21,15 +21,21 @@
 THE SOFTWARE.
 """
 
-from boxtree import Tree
-from mako.template import Template
-from pyopencl.tools import dtype_to_ctype
-from pytools import memoize_method
-import numpy as np
-import pyopencl as cl
+import time
 from dataclasses import dataclass
 from typing import Optional
-import time
+
+import numpy as np
+
+from pyopencl.tools import dtype_to_ctype
+from pyopencl.elementwise import ElementwiseKernel
+
+from arraycontext import Array, ArrayOrContainer
+from pytools import memoize_on_first_arg
+from mako.template import Template
+
+from boxtree import Tree
+from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
 
 import logging
 logger = logging.getLogger(__name__)
@@ -40,154 +46,156 @@
 # We should refactor this to make use of this commonality.
 # https://documen.tician.de/boxtree/tree.html#filtering-the-lists-of-targets
 
-
-class LocalTreeGeneratorCodeContainer:
-    """Objects of this type serve as a place to keep the code needed for
-    :func:`generate_local_tree`.
-    """
-    def __init__(self, cl_context, dimensions, particle_id_dtype, coord_dtype):
-        self.cl_context = cl_context
-        self.dimensions = dimensions
-        self.particle_id_dtype = particle_id_dtype
-        self.coord_dtype = coord_dtype
-
-    @memoize_method
-    def particle_mask_kernel(self):
-        return cl.elementwise.ElementwiseKernel(
-            self.cl_context,
-            arguments=Template("""
-                __global char *responsible_boxes,
-                __global ${particle_id_t} *box_particle_starts,
-                __global ${particle_id_t} *box_particle_counts_nonchild,
-                __global ${particle_id_t} *particle_mask
-            """, strict_undefined=True).render(
-                particle_id_t=dtype_to_ctype(self.particle_id_dtype)
-            ),
-            operation=Template("""
-                if(responsible_boxes[i]) {
-                    for(${particle_id_t} pid = box_particle_starts[i];
-                        pid < box_particle_starts[i]
-                              + box_particle_counts_nonchild[i];
-                        ++pid) {
-                        particle_mask[pid] = 1;
-                    }
+# {{{ kernels
+
+FETCH_LOCAL_PARTICLES_ARGUMENTS_TPL = Template("""
+    __global const ${mask_t} *particle_mask,
+    __global const ${mask_t} *particle_scan
+    % for dim in range(ndims):
+        , __global const ${coord_t} *particles_${dim}
+    % endfor
+    % for dim in range(ndims):
+        , __global ${coord_t} *local_particles_${dim}
+    % endfor
+    % if particles_have_extent:
+        , __global const ${coord_t} *particle_radii
+        , __global ${coord_t} *local_particle_radii
+    % endif
+""", strict_undefined=True)
+
+FETCH_LOCAL_PARTICLES_PRG_TPL = Template("""
+    if(particle_mask[i]) {
+        ${particle_id_t} des = particle_scan[i];
+        % for dim in range(ndims):
+            local_particles_${dim}[des] = particles_${dim}[i];
+        % endfor
+        % if particles_have_extent:
+            local_particle_radii[des] = particle_radii[i];
+        % endif
+    }
+""", strict_undefined=True)
+
+
+@memoize_on_first_arg
+def get_particle_mask_kernel(
+        actx: PyOpenCLArrayContext,
+        particle_id_dtype: "np.dtype"):
+    return ElementwiseKernel(
+        actx.context,
+        arguments=Template("""
+            __global char *responsible_boxes,
+            __global ${particle_id_t} *box_particle_starts,
+            __global ${particle_id_t} *box_particle_counts_nonchild,
+            __global ${particle_id_t} *particle_mask
+        """, strict_undefined=True).render(
+            particle_id_t=dtype_to_ctype(particle_id_dtype)
+        ),
+        operation=Template("""
+            if(responsible_boxes[i]) {
+                for(${particle_id_t} pid = box_particle_starts[i];
+                    pid < box_particle_starts[i]
+                            + box_particle_counts_nonchild[i];
+                    ++pid) {
+                    particle_mask[pid] = 1;
                 }
-            """).render(particle_id_t=dtype_to_ctype(self.particle_id_dtype))
+            }
+        """).render(particle_id_t=dtype_to_ctype(particle_id_dtype))
         )
 
-    @memoize_method
-    def mask_scan_kernel(self):
-        from pyopencl.scan import GenericScanKernel
-        return GenericScanKernel(
-            self.cl_context, self.particle_id_dtype,
-            arguments=Template("""
-                __global ${mask_t} *ary,
-                __global ${mask_t} *scan
-                """, strict_undefined=True).render(
-                mask_t=dtype_to_ctype(self.particle_id_dtype)
-            ),
-            input_expr="ary[i]",
-            scan_expr="a+b", neutral="0",
-            output_statement="scan[i + 1] = item;"
+
+@memoize_on_first_arg
+def get_mask_scan_kernel(
+        actx: PyOpenCLArrayContext,
+        particle_id_dtype: "np.dtype"):
+    from pyopencl.scan import GenericScanKernel
+    return GenericScanKernel(
+        actx.context, particle_id_dtype,
+        arguments=Template("""
+            __global ${mask_t} *ary,
+            __global ${mask_t} *scan
+            """, strict_undefined=True).render(
+            mask_t=dtype_to_ctype(particle_id_dtype)
+        ),
+        input_expr="ary[i]",
+        scan_expr="a+b", neutral="0",
+        output_statement="scan[i + 1] = item;"
         )
 
-    fetch_local_paticles_arguments = Template("""
-        __global const ${mask_t} *particle_mask,
-        __global const ${mask_t} *particle_scan
-        % for dim in range(ndims):
-            , __global const ${coord_t} *particles_${dim}
-        % endfor
-        % for dim in range(ndims):
-            , __global ${coord_t} *local_particles_${dim}
-        % endfor
-        % if particles_have_extent:
-            , __global const ${coord_t} *particle_radii
-            , __global ${coord_t} *local_particle_radii
-        % endif
-    """, strict_undefined=True)
-
-    fetch_local_particles_prg = Template("""
-        if(particle_mask[i]) {
-            ${particle_id_t} des = particle_scan[i];
-            % for dim in range(ndims):
-                local_particles_${dim}[des] = particles_${dim}[i];
-            % endfor
-            % if particles_have_extent:
-                local_particle_radii[des] = particle_radii[i];
-            % endif
-        }
-    """, strict_undefined=True)
-
-    @memoize_method
-    def fetch_local_particles_kernel(self, particles_have_extent):
-        return cl.elementwise.ElementwiseKernel(
-            self.cl_context,
-            self.fetch_local_paticles_arguments.render(
-                mask_t=dtype_to_ctype(self.particle_id_dtype),
-                coord_t=dtype_to_ctype(self.coord_dtype),
-                ndims=self.dimensions,
-                particles_have_extent=particles_have_extent
-            ),
-            self.fetch_local_particles_prg.render(
-                particle_id_t=dtype_to_ctype(self.particle_id_dtype),
-                ndims=self.dimensions,
-                particles_have_extent=particles_have_extent
-            )
+
+@memoize_on_first_arg
+def get_fetch_local_particles_kernel(
+        actx: PyOpenCLArrayContext,
+        dimensions: int,
+        particle_id_dtype: "np.dtype",
+        coord_dtype: "np.dtype",
+        particles_have_extent: bool):
+    return ElementwiseKernel(
+        actx.context,
+        FETCH_LOCAL_PARTICLES_ARGUMENTS_TPL.render(
+            mask_t=dtype_to_ctype(particle_id_dtype),
+            coord_t=dtype_to_ctype(coord_dtype),
+            ndims=dimensions,
+            particles_have_extent=particles_have_extent
+        ),
+        FETCH_LOCAL_PARTICLES_PRG_TPL.render(
+            particle_id_t=dtype_to_ctype(particle_id_dtype),
+            ndims=dimensions,
+            particles_have_extent=particles_have_extent
         )
+    )
 
-    @memoize_method
-    def mask_compressor_kernel(self):
-        from boxtree.tools import MaskCompressorKernel
-        return MaskCompressorKernel(self.cl_context)
-
-    @memoize_method
-    def modify_target_flags_kernel(self):
-        from boxtree import box_flags_enum
-        box_flag_t = dtype_to_ctype(box_flags_enum.dtype)
-
-        return cl.elementwise.ElementwiseKernel(
-            self.cl_context,
-            Template("""
-                __global ${particle_id_t} *box_target_counts_nonchild,
-                __global ${particle_id_t} *box_target_counts_cumul,
-                __global ${box_flag_t} *box_flags
-            """).render(
-                particle_id_t=dtype_to_ctype(self.particle_id_dtype),
-                box_flag_t=box_flag_t
+
+@memoize_on_first_arg
+def get_modify_target_flags_kernel(
+        actx: PyOpenCLArrayContext,
+        particle_id_dtype: "np.dtype"):
+    from boxtree import box_flags_enum
+    box_flag_t = dtype_to_ctype(box_flags_enum.dtype)
+
+    return ElementwiseKernel(
+        actx.context,
+        Template("""
+            __global ${particle_id_t} *box_target_counts_nonchild,
+            __global ${particle_id_t} *box_target_counts_cumul,
+            __global ${box_flag_t} *box_flags
+        """).render(
+            particle_id_t=dtype_to_ctype(particle_id_dtype),
+            box_flag_t=box_flag_t
+        ),
+        Template(r"""
+            // reset HAS_OWN_TARGETS and HAS_CHILD_TARGETS bits in the flag of
+            // each box
+            box_flags[i] &= (~${HAS_OWN_TARGETS});
+            box_flags[i] &= (~${HAS_CHILD_TARGETS});
+
+            // rebuild HAS_OWN_TARGETS and HAS_CHILD_TARGETS bits
+            if(box_target_counts_nonchild[i]) box_flags[i] |= ${HAS_OWN_TARGETS};
+            if(box_target_counts_nonchild[i] < box_target_counts_cumul[i])
+                box_flags[i] |= ${HAS_CHILD_TARGETS};
+        """).render(
+            HAS_OWN_TARGETS=(
+                "(" + box_flag_t + ") " + str(box_flags_enum.HAS_OWN_TARGETS)
             ),
-            Template(r"""
-                // reset HAS_OWN_TARGETS and HAS_CHILD_TARGETS bits in the flag of
-                // each box
-                box_flags[i] &= (~${HAS_OWN_TARGETS});
-                box_flags[i] &= (~${HAS_CHILD_TARGETS});
-
-                // rebuild HAS_OWN_TARGETS and HAS_CHILD_TARGETS bits
-                if(box_target_counts_nonchild[i]) box_flags[i] |= ${HAS_OWN_TARGETS};
-                if(box_target_counts_nonchild[i] < box_target_counts_cumul[i])
-                    box_flags[i] |= ${HAS_CHILD_TARGETS};
-            """).render(
-                HAS_OWN_TARGETS=(
-                    "(" + box_flag_t + ") " + str(box_flags_enum.HAS_OWN_TARGETS)
-                ),
-                HAS_CHILD_TARGETS=(
-                    "(" + box_flag_t + ") " + str(box_flags_enum.HAS_CHILD_TARGETS)
-                )
+            HAS_CHILD_TARGETS=(
+                "(" + box_flag_t + ") " + str(box_flags_enum.HAS_CHILD_TARGETS)
             )
         )
+    )
 
 
-@dataclass
+@dataclass(frozen=True)
 class LocalParticlesAndLists:
-    particles: np.ndarray
-    particle_radii: Optional[cl.array.Array]
-    box_particle_starts: cl.array.Array
-    box_particle_counts_nonchild: cl.array.Array
-    box_particle_counts_cumul: cl.array.Array
+    particles: ArrayOrContainer
+    particle_radii: Optional[Array]
+    box_particle_starts: Array
+    box_particle_counts_nonchild: Array
+    box_particle_counts_cumul: Array
     particle_idx: np.ndarray
 
 
 def construct_local_particles_and_lists(
-        queue, code, dimensions, num_boxes, num_global_particles,
+        actx: PyOpenCLArrayContext,
+        dimensions, num_boxes, num_global_particles,
         particle_id_dtype, coord_dtype, particles_have_extent,
         box_mask,
         global_particles, global_particle_radii,
@@ -198,59 +206,74 @@ def construct_local_particles_and_lists(
     """
     # {{{ calculate the particle mask
 
-    particle_mask = cl.array.zeros(
-        queue, num_global_particles, dtype=particle_id_dtype)
-
-    code.particle_mask_kernel()(
-        box_mask, box_particle_starts, box_particle_counts_nonchild, particle_mask)
+    particle_mask = actx.zeros(num_global_particles, dtype=particle_id_dtype)
+    knl = get_particle_mask_kernel(actx, particle_id_dtype)
+    knl(box_mask,
+        box_particle_starts,
+        box_particle_counts_nonchild,
+        particle_mask,
+        queue=actx.queue,
+        )
 
     # }}}
 
     # {{{ calculate the scan of the particle mask
 
-    global_to_local_particle_index = cl.array.empty(
-        queue, num_global_particles + 1, dtype=particle_id_dtype)
+    global_to_local_particle_index = actx.empty(
+        num_global_particles + 1, dtype=particle_id_dtype)
 
     global_to_local_particle_index[0] = 0
-    code.mask_scan_kernel()(particle_mask, global_to_local_particle_index)
+    knl = get_mask_scan_kernel(actx, particle_id_dtype)
+    knl(particle_mask, global_to_local_particle_index,
+        queue=actx.queue,
+        allocator=actx.allocator,
+        )
 
     # }}}
 
     # {{{ fetch the local particles
 
-    num_local_particles = global_to_local_particle_index[-1].get(queue).item()
-
-    local_particles = [
-        cl.array.empty(queue, num_local_particles, dtype=coord_dtype)
-        for _ in range(dimensions)]
+    num_local_particles = actx.to_numpy(global_to_local_particle_index[-1]).item()
+
+    # plain list: the statements right below wrap it with make_obj_array
+    local_particles = [
+        actx.zeros(num_local_particles, coord_dtype)
+        for _ in range(dimensions)]
 
     from pytools.obj_array import make_obj_array
     local_particles = make_obj_array(local_particles)
 
-    local_particle_radii = None
-    if particles_have_extent:
-        local_particle_radii = cl.array.empty(
-            queue, num_local_particles, dtype=coord_dtype)
+    knl = get_fetch_local_particles_kernel(
+        actx, dimensions, particle_id_dtype, coord_dtype,
+        particles_have_extent=particles_have_extent,
+        )
 
-        code.fetch_local_particles_kernel(True)(
+    if particles_have_extent:
+        local_particle_radii = actx.empty(num_local_particles, dtype=coord_dtype)
+        knl(
             particle_mask, global_to_local_particle_index,
             *global_particles.tolist(),
             *local_particles,
             global_particle_radii,
-            local_particle_radii)
+            local_particle_radii,
+            queue=actx.queue,
+            )
     else:
-        code.fetch_local_particles_kernel(False)(
+        local_particle_radii = None
+        knl(
             particle_mask, global_to_local_particle_index,
             *global_particles.tolist(),
-            *local_particles)
+            *local_particles,
+            queue=actx.queue,
+            )
 
     # {{{ construct the list of list indices
 
     local_box_particle_starts = global_to_local_particle_index[box_particle_starts]
 
-    box_counts_all_zeros = cl.array.zeros(queue, num_boxes, dtype=particle_id_dtype)
+    box_counts_all_zeros = actx.zeros(num_boxes, dtype=particle_id_dtype)
 
-    local_box_particle_counts_nonchild = cl.array.if_positive(
+    local_box_particle_counts_nonchild = actx.np.where(
         box_mask, box_particle_counts_nonchild, box_counts_all_zeros)
 
     box_particle_ends_cumul = box_particle_starts + box_particle_counts_cumul
@@ -261,18 +284,20 @@ def construct_local_particles_and_lists(
 
     # }}}
 
-    particle_mask = particle_mask.get(queue=queue).astype(bool)
+    particle_mask = actx.to_numpy(particle_mask).astype(bool)
     particle_idx = np.arange(num_global_particles)[particle_mask]
 
     return LocalParticlesAndLists(
-        local_particles,
-        local_particle_radii,
-        local_box_particle_starts,
-        local_box_particle_counts_nonchild,
-        local_box_particle_counts_cumul,
-        particle_idx)
+        particles=local_particles,
+        particle_radii=local_particle_radii,
+        box_particle_starts=local_box_particle_starts,
+        box_particle_counts_nonchild=local_box_particle_counts_nonchild,
+        box_particle_counts_cumul=local_box_particle_counts_cumul,
+        particle_idx=particle_idx)
 
 
+@dataclass_array_container
+@dataclass(frozen=True)
 class LocalTree(Tree):
     """
     Inherits from :class:`boxtree.Tree`.
@@ -291,13 +316,22 @@ class LocalTree(Tree):
         propagated from an ancestor) List 2.
     """
 
+    box_to_user_rank_starts: Array
+    box_to_user_rank_lists: Array
+
+    responsible_boxes_list: Array
+    responsible_boxes_mask: Array
+    ancestor_mask: Array
 
-def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
+
+def generate_local_tree(
+        actx: PyOpenCLArrayContext,
+        global_traversal, responsible_boxes_list, comm,
+        root_extent_stretch_factor: float = 1.0e-4) -> tuple:
     """Generate the local tree for the current rank.
 
     This is an MPI-collective routine on *comm*.
 
-    :arg queue: a :class:`pyopencl.CommandQueue` object.
     :arg global_traversal: Global :class:`boxtree.traversal.FMMTraversalInfo` object
         on host memory.
     :arg responsible_boxes_list: a :class:`numpy.ndarray` object containing the
@@ -310,10 +344,7 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
         global tree. ``src_idx`` and ``tgt_idx`` are needed for distributing source
         weights from root rank and assembling calculated potentials on the root rank.
     """
-    global_tree = global_traversal.tree
-    code = LocalTreeGeneratorCodeContainer(
-            queue.context, global_tree.dimensions,
-            global_tree.particle_id_dtype, global_tree.coord_dtype)
+    global_tree = actx.thaw(global_traversal.tree)
 
     mpi_rank = comm.Get_rank()
     mpi_size = comm.Get_size()
@@ -321,33 +352,31 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
     start_time = time.time()
 
     from boxtree.distributed.partition import get_box_masks
-    box_masks = get_box_masks(queue, global_traversal, responsible_boxes_list)
-
-    global_tree_dev = global_tree.to_device(queue).with_queue(queue)
+    box_masks = get_box_masks(actx, global_traversal, responsible_boxes_list)
 
     local_sources_and_lists = construct_local_particles_and_lists(
-        queue, code, global_tree.dimensions, global_tree.nboxes,
+        actx, global_tree.dimensions, global_tree.nboxes,
         global_tree.nsources,
         global_tree.particle_id_dtype, global_tree.coord_dtype,
         global_tree.sources_have_extent,
         box_masks.point_src_boxes,
-        global_tree_dev.sources,
-        global_tree_dev.sources_radii if global_tree.sources_have_extent else None,
-        global_tree_dev.box_source_starts,
-        global_tree_dev.box_source_counts_nonchild,
-        global_tree_dev.box_source_counts_cumul)
+        global_tree.sources,
+        global_tree.sources_radii if global_tree.sources_have_extent else None,
+        global_tree.box_source_starts,
+        global_tree.box_source_counts_nonchild,
+        global_tree.box_source_counts_cumul)
 
     local_targets_and_lists = construct_local_particles_and_lists(
-        queue, code, global_tree.dimensions, global_tree.nboxes,
+        actx, global_tree.dimensions, global_tree.nboxes,
         global_tree.ntargets,
         global_tree.particle_id_dtype, global_tree.coord_dtype,
         global_tree.targets_have_extent,
         box_masks.responsible_boxes,
-        global_tree_dev.targets,
-        global_tree_dev.target_radii if global_tree.targets_have_extent else None,
-        global_tree_dev.box_target_starts,
-        global_tree_dev.box_target_counts_nonchild,
-        global_tree_dev.box_target_counts_cumul)
+        global_tree.targets,
+        global_tree.target_radii if global_tree.targets_have_extent else None,
+        global_tree.box_target_starts,
+        global_tree.box_target_counts_nonchild,
+        global_tree.box_target_counts_cumul)
 
     # {{{ compute the users of multipole expansions of each box on the root rank
 
@@ -357,24 +386,24 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
             (mpi_size, global_tree.nboxes),
             dtype=box_masks.multipole_src_boxes.dtype)
     comm.Gather(
-        box_masks.multipole_src_boxes.get(), multipole_src_boxes_all_ranks, root=0)
+        actx.to_numpy(box_masks.multipole_src_boxes),
+        multipole_src_boxes_all_ranks, root=0)
 
     box_to_user_rank_starts = None
     box_to_user_rank_lists = None
 
     if mpi_rank == 0:
-        multipole_src_boxes_all_ranks = cl.array.to_device(
-            queue, multipole_src_boxes_all_ranks)
+        multipole_src_boxes_all_ranks = actx.from_numpy(
+            multipole_src_boxes_all_ranks)
 
-        (box_to_user_rank_starts, box_to_user_rank_lists, evt) = \
-            code.mask_compressor_kernel()(
-                queue, multipole_src_boxes_all_ranks.transpose(),
-                list_dtype=np.int32)
+        from boxtree.tools import mask_to_csr
+        (box_to_user_rank_starts, box_to_user_rank_lists) = (
+            mask_to_csr(
+                actx, multipole_src_boxes_all_ranks.transpose(),
+                list_dtype=np.int32))
 
-        cl.wait_for_events([evt])
-
-        box_to_user_rank_starts = box_to_user_rank_starts.get()
-        box_to_user_rank_lists = box_to_user_rank_lists.get()
+        box_to_user_rank_starts = actx.to_numpy(box_to_user_rank_starts)
+        box_to_user_rank_lists = actx.to_numpy(box_to_user_rank_lists)
 
         logger.debug("computing box_to_user: done")
 
@@ -391,22 +420,17 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
     # expansions formed by souces in other ranks. Modifying the source box flags
     # could result in incomplete interaction lists.
 
-    local_box_flags = global_tree_dev.box_flags.copy(queue=queue)
-    code.modify_target_flags_kernel()(
+    local_box_flags = actx.np.copy(global_tree.box_flags)
+    knl = get_modify_target_flags_kernel(actx, global_tree.particle_id_dtype)
+    knl(
         local_targets_and_lists.box_particle_counts_nonchild,
         local_targets_and_lists.box_particle_counts_cumul,
-        local_box_flags)
+        local_box_flags,
+        queue=actx.queue,
+        )
 
     # }}}
 
-    from pytools.obj_array import make_obj_array
-    local_sources = make_obj_array([
-        local_sources_idim.get(queue=queue)
-        for local_sources_idim in local_sources_and_lists.particles])
-    local_targets = make_obj_array([
-        local_target_idim.get(queue=queue)
-        for local_target_idim in local_targets_and_lists.particles])
-
     local_tree = LocalTree(
         sources_are_targets=global_tree.sources_are_targets,
         sources_have_extent=global_tree.sources_have_extent,
@@ -423,33 +447,34 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
 
         bounding_box=global_tree.bounding_box,
         level_start_box_nrs=global_tree.level_start_box_nrs,
-        level_start_box_nrs_dev=global_tree.level_start_box_nrs_dev,
 
-        sources=local_sources,
-        targets=local_targets,
-        source_radii=(local_sources_and_lists.particle_radii.get(queue=queue)
+        sources=local_sources_and_lists.particles,
+        targets=local_targets_and_lists.particles,
+        source_radii=(
+                local_sources_and_lists.particle_radii
                 if global_tree.sources_have_extent else None),
-        target_radii=(local_targets_and_lists.particle_radii.get(queue=queue)
+        target_radii=(
+                local_targets_and_lists.particle_radii
                 if global_tree.targets_have_extent else None),
 
         box_source_starts=(
-            local_sources_and_lists.box_particle_starts.get(queue=queue)),
+            local_sources_and_lists.box_particle_starts),
         box_source_counts_nonchild=(
-            local_sources_and_lists.box_particle_counts_nonchild.get(queue=queue)),
+            local_sources_and_lists.box_particle_counts_nonchild),
         box_source_counts_cumul=(
-            local_sources_and_lists.box_particle_counts_cumul.get(queue=queue)),
+            local_sources_and_lists.box_particle_counts_cumul),
         box_target_starts=(
-            local_targets_and_lists.box_particle_starts.get(queue=queue)),
+            local_targets_and_lists.box_particle_starts),
         box_target_counts_nonchild=(
-            local_targets_and_lists.box_particle_counts_nonchild.get(queue=queue)),
+            local_targets_and_lists.box_particle_counts_nonchild),
         box_target_counts_cumul=(
-            local_targets_and_lists.box_particle_counts_cumul.get(queue=queue)),
+            local_targets_and_lists.box_particle_counts_cumul),
 
         box_parent_ids=global_tree.box_parent_ids,
         box_child_ids=global_tree.box_child_ids,
         box_centers=global_tree.box_centers,
         box_levels=global_tree.box_levels,
-        box_flags=local_box_flags.get(queue=queue),
+        box_flags=local_box_flags,
 
         user_source_ids=None,
         sorted_target_ids=None,
@@ -459,23 +484,21 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
         box_target_bounding_box_min=global_tree.box_target_bounding_box_min,
         box_target_bounding_box_max=global_tree.box_target_bounding_box_max,
 
+        root_extent_stretch_factor=root_extent_stretch_factor,
         _is_pruned=global_tree._is_pruned,
 
         responsible_boxes_list=responsible_boxes_list,
-        responsible_boxes_mask=box_masks.responsible_boxes.get(),
-        ancestor_mask=box_masks.ancestor_boxes.get(),
-        box_to_user_rank_starts=box_to_user_rank_starts,
-        box_to_user_rank_lists=box_to_user_rank_lists
+        responsible_boxes_mask=box_masks.responsible_boxes,
+        ancestor_mask=box_masks.ancestor_boxes,
+        box_to_user_rank_starts=actx.from_numpy(box_to_user_rank_starts),
+        box_to_user_rank_lists=actx.from_numpy(box_to_user_rank_lists),
     )
 
-    local_tree = local_tree.to_host_device_array(queue)
-    local_tree.with_queue(None)
-
     logger.info("Generate local tree on rank {} in {} sec.".format(
         mpi_rank, str(time.time() - start_time)
     ))
 
     return (
-        local_tree,
+        actx.freeze(local_tree),
         local_sources_and_lists.particle_idx,
         local_targets_and_lists.particle_idx)
diff --git a/boxtree/distributed/partition.py b/boxtree/distributed/partition.py
index a3b2b799..569dbe10 100644
--- a/boxtree/distributed/partition.py
+++ b/boxtree/distributed/partition.py
@@ -21,14 +21,71 @@
 THE SOFTWARE.
 """
 
+from dataclasses import dataclass
+
 import numpy as np
-import pyopencl as cl
+
 from pyopencl.tools import dtype_to_ctype
+from pyopencl.elementwise import ElementwiseKernel
+
+from arraycontext import Array
+from pytools import memoize_on_first_arg
 from mako.template import Template
-from pytools import memoize_method
-from dataclasses import dataclass
+
+from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
+
+
+# {{{ kernels
+
+@memoize_on_first_arg
+def get_add_interaction_list_boxes_kernel(
+        actx: PyOpenCLArrayContext,
+        box_id_dtype: "np.dtype"):
+    """Given a ``responsible_boxes_mask`` and an interaction list, mark source
+    boxes for target boxes in ``responsible_boxes_mask`` in a new separate mask.
+    """
+    return ElementwiseKernel(
+        actx.context,
+        Template("""
+            __global ${box_id_t} *box_list,
+            __global char *responsible_boxes_mask,
+            __global ${box_id_t} *interaction_boxes_starts,
+            __global ${box_id_t} *interaction_boxes_lists,
+            __global char *src_boxes_mask
+        """, strict_undefined=True).render(
+            box_id_t=dtype_to_ctype(box_id_dtype)
+        ),
+        Template(r"""
+            typedef ${box_id_t} box_id_t;
+            box_id_t current_box = box_list[i];
+            if(responsible_boxes_mask[current_box]) {
+                for(box_id_t box_idx = interaction_boxes_starts[i];
+                    box_idx < interaction_boxes_starts[i + 1];
+                    ++box_idx)
+                    src_boxes_mask[interaction_boxes_lists[box_idx]] = 1;
+            }
+        """, strict_undefined=True).render(
+            box_id_t=dtype_to_ctype(box_id_dtype)
+        ),
+    )
+
+
+@memoize_on_first_arg
+def get_add_parent_boxes_kernel(
+        actx: PyOpenCLArrayContext,
+        box_id_dtype: "np.dtype"):
+    return ElementwiseKernel(
+        actx.context,
+        "__global char *current, __global char *parent, "
+        "__global %s *box_parent_ids" % dtype_to_ctype(box_id_dtype),
+        "if(i != 0 && current[i]) parent[box_parent_ids[i]] = 1"
+    )
+
+# }}}
 
 
+# {{{ get_box_masks
+
 def get_box_ids_dfs_order(tree):
     """Helper function for getting box ids of a tree in depth-first order.
 
@@ -115,68 +172,23 @@ def partition_work(cost_per_box, traversal, comm):
         responsible_boxes_current_rank[0]:responsible_boxes_current_rank[1]]
 
 
-class GetBoxMasksCodeContainer:
-    def __init__(self, cl_context, box_id_dtype):
-        self.cl_context = cl_context
-        self.box_id_dtype = box_id_dtype
-
-    @memoize_method
-    def add_interaction_list_boxes_kernel(self):
-        """Given a ``responsible_boxes_mask`` and an interaction list, mark source
-        boxes for target boxes in ``responsible_boxes_mask`` in a new separate mask.
-        """
-        return cl.elementwise.ElementwiseKernel(
-            self.cl_context,
-            Template("""
-                __global ${box_id_t} *box_list,
-                __global char *responsible_boxes_mask,
-                __global ${box_id_t} *interaction_boxes_starts,
-                __global ${box_id_t} *interaction_boxes_lists,
-                __global char *src_boxes_mask
-            """, strict_undefined=True).render(
-                box_id_t=dtype_to_ctype(self.box_id_dtype)
-            ),
-            Template(r"""
-                typedef ${box_id_t} box_id_t;
-                box_id_t current_box = box_list[i];
-                if(responsible_boxes_mask[current_box]) {
-                    for(box_id_t box_idx = interaction_boxes_starts[i];
-                        box_idx < interaction_boxes_starts[i + 1];
-                        ++box_idx)
-                        src_boxes_mask[interaction_boxes_lists[box_idx]] = 1;
-                }
-            """, strict_undefined=True).render(
-                box_id_t=dtype_to_ctype(self.box_id_dtype)
-            ),
-        )
-
-    @memoize_method
-    def add_parent_boxes_kernel(self):
-        return cl.elementwise.ElementwiseKernel(
-            self.cl_context,
-            "__global char *current, __global char *parent, "
-            "__global %s *box_parent_ids" % dtype_to_ctype(self.box_id_dtype),
-            "if(i != 0 && current[i]) parent[box_parent_ids[i]] = 1"
-        )
-
-
-def get_ancestor_boxes_mask(queue, code, traversal, responsible_boxes_mask):
+def get_ancestor_boxes_mask(actx, traversal, responsible_boxes_mask):
     """Query the ancestors of responsible boxes.
 
-    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
-        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
-    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose
-        i-th entry is 1 if ``i`` is an ancestor of the responsible boxes specified by
+    :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is a responsible box.
+    :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if ``i``
+        is an ancestor of the responsible boxes specified by
         *responsible_boxes_mask*.
     """
-    ancestor_boxes = cl.array.zeros(queue, (traversal.tree.nboxes,), dtype=np.int8)
+    knl = get_add_parent_boxes_kernel(actx, traversal.tree.box_id_dtype)
+
+    ancestor_boxes = actx.zeros((traversal.tree.nboxes,), dtype=np.int8)
     ancestor_boxes_last = responsible_boxes_mask.copy()
 
     while ancestor_boxes_last.any():
-        ancestor_boxes_new = cl.array.zeros(
-            queue, (traversal.tree.nboxes,), dtype=np.int8)
-        code.add_parent_boxes_kernel()(
-            ancestor_boxes_last, ancestor_boxes_new, traversal.tree.box_parent_ids)
+        ancestor_boxes_new = actx.zeros((traversal.tree.nboxes,), dtype=np.int8)
+        knl(ancestor_boxes_last, ancestor_boxes_new, traversal.tree.box_parent_ids)
         ancestor_boxes_new = ancestor_boxes_new & (~ancestor_boxes)
         ancestor_boxes = ancestor_boxes | ancestor_boxes_new
         ancestor_boxes_last = ancestor_boxes_new
@@ -185,115 +197,111 @@ def get_ancestor_boxes_mask(queue, code, traversal, responsible_boxes_mask):
 
 
 def get_point_src_boxes_mask(
-        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask):
+        actx, traversal, responsible_boxes_mask, ancestor_boxes_mask):
     """Query the boxes whose sources are needed in order to evaluate potentials
     of boxes represented by *responsible_boxes_mask*.
 
-    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
-        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
-    :param ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape
-        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box
-        or an ancestor of the responsible boxes.
-    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose
-        i-th entry is 1 if souces of box ``i`` are needed for evaluating the
-        potentials of targets in boxes represented by *responsible_boxes_mask*.
+    :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is a responsible box.
+    :arg ancestor_boxes_mask: an array of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is either a responsible box or an ancestor
+        of the responsible boxes.
+    :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if
+        sources of box ``i`` are needed for evaluating the potentials of targets
+        in boxes represented by *responsible_boxes_mask*.
     """
-
+    knl = get_add_interaction_list_boxes_kernel(actx, traversal.tree.box_id_dtype)
     src_boxes_mask = responsible_boxes_mask.copy()
 
     # Add list 1 of responsible boxes
-    code.add_interaction_list_boxes_kernel()(
+    knl(
         traversal.target_boxes, responsible_boxes_mask,
         traversal.neighbor_source_boxes_starts,
         traversal.neighbor_source_boxes_lists, src_boxes_mask,
-        queue=queue)
+        queue=actx.queue)
 
     # Add list 4 of responsible boxes or ancestor boxes
-    code.add_interaction_list_boxes_kernel()(
+    knl(
         traversal.target_or_target_parent_boxes,
         responsible_boxes_mask | ancestor_boxes_mask,
         traversal.from_sep_bigger_starts, traversal.from_sep_bigger_lists,
         src_boxes_mask,
-        queue=queue)
+        queue=actx.queue)
 
     if traversal.tree.targets_have_extent:
         # Add list 3 close of responsible boxes
         if traversal.from_sep_close_smaller_starts is not None:
-            code.add_interaction_list_boxes_kernel()(
+            knl(
                 traversal.target_boxes,
                 responsible_boxes_mask,
                 traversal.from_sep_close_smaller_starts,
                 traversal.from_sep_close_smaller_lists,
                 src_boxes_mask,
-                queue=queue
+                queue=actx.queue
             )
 
         # Add list 4 close of responsible boxes
         if traversal.from_sep_close_bigger_starts is not None:
-            code.add_interaction_list_boxes_kernel()(
+            knl(
                 traversal.target_boxes,
                 responsible_boxes_mask | ancestor_boxes_mask,
                 traversal.from_sep_close_bigger_starts,
                 traversal.from_sep_close_bigger_lists,
                 src_boxes_mask,
-                queue=queue
+                queue=actx.queue
             )
 
     return src_boxes_mask
 
 
 def get_multipole_src_boxes_mask(
-        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask):
+        actx, traversal, responsible_boxes_mask, ancestor_boxes_mask):
     """Query the boxes whose multipoles are used in order to evaluate
     potentials of targets in boxes represented by *responsible_boxes_mask*.
 
-    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
-        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
-    :arg ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape
-        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box
-        or an ancestor of the responsible boxes.
-    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)``
-        whose i-th entry is 1 if multipoles of box ``i`` are needed for evaluating
-        the potentials of targets in boxes represented by *responsible_boxes_mask*.
+    :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is a responsible box.
+    :arg ancestor_boxes_mask: an array of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is either a responsible box or an ancestor of
+        the responsible boxes.
+    :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if
+        multipoles of box ``i`` are needed for evaluating the potentials of
+        targets in boxes represented by *responsible_boxes_mask*.
     """
-
-    multipole_boxes_mask = cl.array.zeros(
-        queue, (traversal.tree.nboxes,), dtype=np.int8
-    )
+    knl = get_add_interaction_list_boxes_kernel(actx, traversal.tree.box_id_dtype)
+    multipole_boxes_mask = actx.zeros((traversal.tree.nboxes,), dtype=np.int8)
 
     # A mpole is used by process p if it is in the List 2 of either a box
     # owned by p or one of its ancestors.
-    code.add_interaction_list_boxes_kernel()(
+    knl(
         traversal.target_or_target_parent_boxes,
         responsible_boxes_mask | ancestor_boxes_mask,
         traversal.from_sep_siblings_starts,
         traversal.from_sep_siblings_lists,
         multipole_boxes_mask,
-        queue=queue
+        queue=actx.queue
     )
-    multipole_boxes_mask.finish()
 
     # A mpole is used by process p if it is in the List 3 of a box owned by p.
     for ilevel in range(traversal.tree.nlevels):
-        code.add_interaction_list_boxes_kernel()(
+        knl(
             traversal.target_boxes_sep_smaller_by_source_level[ilevel],
             responsible_boxes_mask,
             traversal.from_sep_smaller_by_level[ilevel].starts,
             traversal.from_sep_smaller_by_level[ilevel].lists,
             multipole_boxes_mask,
-            queue=queue
+            queue=actx.queue
         )
 
-        multipole_boxes_mask.finish()
-
     return multipole_boxes_mask
 
 
-@dataclass
+@dataclass_array_container
+@dataclass(frozen=True)
 class BoxMasks:
     """
-    Box masks needed for the distributed calculation. Each of these masks is a
-    PyOpenCL array with length ``tree.nboxes``, whose `i`-th entry is 1 if box `i` is
+    Box masks needed for the distributed calculation. Each of these masks is an
+    array with length ``tree.nboxes``, whose `i`-th entry is 1 if box `i` is
     set.
 
     .. attribute:: responsible_boxes
@@ -313,13 +321,13 @@ class BoxMasks:
 
         Current process needs multipole expressions in these boxes.
     """
-    responsible_boxes: cl.array.Array
-    ancestor_boxes: cl.array.Array
-    point_src_boxes: cl.array.Array
-    multipole_src_boxes: cl.array.Array
+    responsible_boxes: Array
+    ancestor_boxes: Array
+    point_src_boxes: Array
+    multipole_src_boxes: Array
 
 
-def get_box_masks(queue, traversal, responsible_boxes_list):
+def get_box_masks(actx, traversal, responsible_boxes_list):
     """Given the responsible boxes for a rank, this helper function calculates the
     relevant masks.
 
@@ -327,27 +335,23 @@ def get_box_masks(queue, traversal, responsible_boxes_list):
 
     :returns: A :class:`BoxMasks` object of the relevant masks.
     """
-    code = GetBoxMasksCodeContainer(queue.context, traversal.tree.box_id_dtype)
-
-    # FIXME: It is wasteful to copy the whole traversal object into device memory
-    # here because
-    # 1) Not all fields are needed.
-    # 2) For sumpy wrangler, a device traversal object is already available.
-    traversal = traversal.to_device(queue)
-
-    responsible_boxes_mask = np.zeros((traversal.tree.nboxes,), dtype=np.int8)
-    responsible_boxes_mask[responsible_boxes_list] = 1
-    responsible_boxes_mask = cl.array.to_device(queue, responsible_boxes_mask)
+    responsible_boxes_mask = actx.zeros((traversal.tree.nboxes,), dtype=np.int8)
+    responsible_boxes_mask[responsible_boxes_list] = (
+        1 + actx.zeros(responsible_boxes_list.shape, np.int8))
 
     ancestor_boxes_mask = get_ancestor_boxes_mask(
-        queue, code, traversal, responsible_boxes_mask)
+        actx, traversal, responsible_boxes_mask)
 
     point_src_boxes_mask = get_point_src_boxes_mask(
-        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask)
+        actx, traversal, responsible_boxes_mask, ancestor_boxes_mask)
 
     multipole_src_boxes_mask = get_multipole_src_boxes_mask(
-        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask)
+        actx, traversal, responsible_boxes_mask, ancestor_boxes_mask)
 
     return BoxMasks(
-        responsible_boxes_mask, ancestor_boxes_mask, point_src_boxes_mask,
+        responsible_boxes_mask,
+        ancestor_boxes_mask,
+        point_src_boxes_mask,
         multipole_src_boxes_mask)
+
+# }}}
diff --git a/boxtree/fmm.py b/boxtree/fmm.py
index 7644349d..83ef14cb 100644
--- a/boxtree/fmm.py
+++ b/boxtree/fmm.py
@@ -33,6 +33,7 @@
 
 from boxtree.tree import Tree
 from boxtree.traversal import FMMTraversalInfo
+from boxtree.array_context import PyOpenCLArrayContext
 
 import logging
 logger = logging.getLogger(__name__)
@@ -155,6 +156,7 @@ def local_expansions_view(self, local_exps, level):
 
     @abstractmethod
     def form_multipoles(self,
+            actx: PyOpenCLArrayContext,
             level_start_source_box_nrs, source_boxes,
             src_weight_vecs):
         """Return an expansions array
@@ -167,6 +169,7 @@ def form_multipoles(self,
 
     @abstractmethod
     def coarsen_multipoles(self,
+            actx: PyOpenCLArrayContext,
             level_start_source_parent_box_nrs,
             source_parent_boxes, mpoles):
         """For each box in *source_parent_boxes*,
@@ -179,6 +182,7 @@ def coarsen_multipoles(self,
 
     @abstractmethod
     def eval_direct(self,
+            actx: PyOpenCLArrayContext,
             target_boxes, neighbor_sources_starts,
             neighbor_sources_lists, src_weight_vecs):
         """For each box in *target_boxes*, evaluate the influence of the
@@ -191,6 +195,7 @@ def eval_direct(self,
 
     @abstractmethod
     def multipole_to_local(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes,
             starts, lists, mpole_exps):
@@ -205,6 +210,7 @@ def multipole_to_local(self,
 
     @abstractmethod
     def eval_multipoles(self,
+            actx: PyOpenCLArrayContext,
             target_boxes_by_source_level, from_sep_smaller_by_level, mpole_exps):
         """For a level *i*, each box in *target_boxes_by_source_level[i]*, evaluate
         the multipole expansion in *mpole_exps* in the nearby boxes given in
@@ -218,6 +224,7 @@ def eval_multipoles(self,
 
     @abstractmethod
     def form_locals(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes, starts, lists, src_weight_vecs):
         """For each box in *target_or_target_parent_boxes*, form local
@@ -232,6 +239,7 @@ def form_locals(self,
 
     @abstractmethod
     def refine_locals(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes, local_exps):
         """For each box in *child_boxes*,
@@ -243,6 +251,7 @@ def refine_locals(self,
 
     @abstractmethod
     def eval_locals(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_box_nrs, target_boxes, local_exps):
         """For each box in *target_boxes*, evaluate the local expansion in
         *local_exps* and return a new potential array.
@@ -254,7 +263,7 @@ def eval_locals(self,
     # }}}
 
     @abstractmethod
-    def finalize_potentials(self, potentials, template_ary):
+    def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials):
         """
         Postprocess the reordered potentials. This is where global scaling
         factors could be applied. This is distinct from :meth:`reorder_potentials`
@@ -268,7 +277,9 @@ def finalize_potentials(self, potentials, template_ary):
             type.
         """
 
-    def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
+    def distribute_source_weights(self,
+            actx: PyOpenCLArrayContext,
+            src_weight_vecs, src_idx_all_ranks):
         """Used by the distributed implementation for transferring needed source
         weights from root rank to each worker rank in the communicator.
 
@@ -288,7 +299,9 @@ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
         """
         return src_weight_vecs
 
-    def gather_potential_results(self, potentials, tgt_idx_all_ranks):
+    def gather_potential_results(self,
+            actx: PyOpenCLArrayContext,
+            potentials, tgt_idx_all_ranks):
         """Used by the distributed implementation for gathering calculated potentials
         from all worker ranks in the communicator to the root rank.
 
@@ -305,7 +318,9 @@ def gather_potential_results(self, potentials, tgt_idx_all_ranks):
         """
         return potentials
 
-    def communicate_mpoles(self, mpole_exps, return_stats=False):
+    def communicate_mpoles(self,
+            actx: PyOpenCLArrayContext,
+            mpole_exps, return_stats=False):
         """Used by the distributed implementation for forming the complete multipole
         expansions from the partial multipole expansions.
 
@@ -319,14 +334,16 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
         :returns: Statistics of the communication if *return_stats* is True. *None*
             otherwise.
         """
-        pass
 
 # }}}
 
 
-def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
+def drive_fmm(actx: PyOpenCLArrayContext,
+              wrangler: ExpansionWranglerInterface,
+              src_weight_vecs, *,
               timing_data=None,
-              global_src_idx_all_ranks=None, global_tgt_idx_all_ranks=None):
+              global_src_idx_all_ranks=None,
+              global_tgt_idx_all_ranks=None):
     """Top-level driver routine for a fast multipole calculation.
 
     In part, this is intended as a template for custom FMMs, in the sense that
@@ -340,7 +357,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     :arg expansion_wrangler: An object exhibiting the
         :class:`ExpansionWranglerInterface`. For distributed implementation, this
         wrangler should be a subclass of
-        :class:`boxtree.distributed.calculation.DistributedExpansionWrangler`.
+        :class:`boxtree.distributed.calculation.DistributedExpansionWranglerMixin`.
     :arg src_weight_vecs: A sequence of source 'density/weights/charges'.
         Passed unmodified to *expansion_wrangler*. For distributed
         implementation, this argument is only significant on the root rank, but
@@ -373,15 +390,17 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     from boxtree.timing import TimingRecorder
     recorder = TimingRecorder()
 
-    src_weight_vecs = [wrangler.reorder_sources(weight) for
-        weight in src_weight_vecs]
+    src_weight_vecs = [
+            wrangler.reorder_sources(weight) for weight in src_weight_vecs]
 
     src_weight_vecs = wrangler.distribute_source_weights(
-        src_weight_vecs, global_src_idx_all_ranks)
+            actx,
+            src_weight_vecs, global_src_idx_all_ranks)
 
     # {{{ "Step 2.1:" Construct local multipoles
 
     mpole_exps, timing_future = wrangler.form_multipoles(
+            actx,
             traversal.level_start_source_box_nrs,
             traversal.source_boxes,
             src_weight_vecs)
@@ -393,6 +412,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # {{{ "Step 2.2:" Propagate multipoles upward
 
     mpole_exps, timing_future = wrangler.coarsen_multipoles(
+            actx,
             traversal.level_start_source_parent_box_nrs,
             traversal.source_parent_boxes,
             mpole_exps)
@@ -403,11 +423,12 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
 
     # }}}
 
-    wrangler.communicate_mpoles(mpole_exps)
+    wrangler.communicate_mpoles(actx, mpole_exps)
 
     # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1")
 
     potentials, timing_future = wrangler.eval_direct(
+            actx,
             traversal.target_boxes,
             traversal.neighbor_source_boxes_starts,
             traversal.neighbor_source_boxes_lists,
@@ -422,6 +443,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local
 
     local_exps, timing_future = wrangler.multipole_to_local(
+            actx,
             traversal.level_start_target_or_target_parent_box_nrs,
             traversal.target_or_target_parent_boxes,
             traversal.from_sep_siblings_starts,
@@ -440,6 +462,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # contribution *out* of the downward-propagating local expansions)
 
     mpole_result, timing_future = wrangler.eval_multipoles(
+            actx,
             traversal.target_boxes_sep_smaller_by_source_level,
             traversal.from_sep_smaller_by_level,
             mpole_exps)
@@ -455,6 +478,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
                 "('list 3 close')")
 
         direct_result, timing_future = wrangler.eval_direct(
+                actx,
                 traversal.target_boxes,
                 traversal.from_sep_close_smaller_starts,
                 traversal.from_sep_close_smaller_lists,
@@ -469,6 +493,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4")
 
     local_result, timing_future = wrangler.form_locals(
+            actx,
             traversal.level_start_target_or_target_parent_box_nrs,
             traversal.target_or_target_parent_boxes,
             traversal.from_sep_bigger_starts,
@@ -481,6 +506,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
 
     if traversal.from_sep_close_bigger_starts is not None:
         direct_result, timing_future = wrangler.eval_direct(
+                actx,
                 traversal.target_boxes,
                 traversal.from_sep_close_bigger_starts,
                 traversal.from_sep_close_bigger_lists,
@@ -495,6 +521,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # {{{ "Stage 7:" propagate local_exps downward
 
     local_exps, timing_future = wrangler.refine_locals(
+            actx,
             traversal.level_start_target_or_target_parent_box_nrs,
             traversal.target_or_target_parent_boxes,
             local_exps)
@@ -506,6 +533,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # {{{ "Stage 8:" evaluate locals
 
     local_result, timing_future = wrangler.eval_locals(
+            actx,
             traversal.level_start_target_box_nrs,
             traversal.target_boxes,
             local_exps)
@@ -517,11 +545,11 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # }}}
 
     potentials = wrangler.gather_potential_results(
-                    potentials, global_tgt_idx_all_ranks)
+            actx,
+            potentials, global_tgt_idx_all_ranks)
 
     result = wrangler.reorder_potentials(potentials)
-
-    result = wrangler.finalize_potentials(result, template_ary=src_weight_vecs[0])
+    result = wrangler.finalize_potentials(actx, result)
 
     fmm_proc.done()
 
diff --git a/boxtree/pyfmmlib_integration.py b/boxtree/pyfmmlib_integration.py
index 4cb62171..4132cdcf 100644
--- a/boxtree/pyfmmlib_integration.py
+++ b/boxtree/pyfmmlib_integration.py
@@ -36,6 +36,7 @@
 """
 
 import enum
+from abc import ABC, abstractmethod
 
 import numpy as np
 
@@ -51,7 +52,7 @@
 
 # {{{ rotation data interface
 
-class FMMLibRotationDataInterface:
+class FMMLibRotationDataInterface(ABC):
     """Abstract interface for additional, optional data for precomputation of
     rotation matrices passed to the expansion wrangler.
 
@@ -61,16 +62,16 @@ class FMMLibRotationDataInterface:
 
     """
 
+    @abstractmethod
     def m2l_rotation_lists(self):
         """Return a :mod:`numpy` array mapping entries of List 2 to rotation classes.
         """
-        raise NotImplementedError
 
+    @abstractmethod
     def m2l_rotation_angles(self):
         """Return a :mod:`numpy` array mapping List 2 rotation classes to
         rotation angles.
         """
-        raise NotImplementedError
 
 
 class FMMLibRotationData(FMMLibRotationDataInterface):
@@ -84,17 +85,12 @@ def __init__(self, array_context: PyOpenCLArrayContext, trav):
         self.trav = trav
         self.tree = trav.tree
 
-    @property
-    @memoize_method
-    def rotation_classes_builder(self):
-        from boxtree.rotation_classes import RotationClassesBuilder
-        return RotationClassesBuilder(self._setup_actx)
-
     @memoize_method
     def build_rotation_classes_lists(self):
-        trav = self._setup_actx.from_numpy(self.trav)
-        tree = self._setup_actx.from_numpy(self.tree)
-        return self.rotation_classes_builder(self._setup_actx, trav, tree)[0]
+        from boxtree.rotation_classes import build_rotation_classes
+        actx = self._setup_actx
+        return build_rotation_classes(
+            actx, actx.from_numpy(self.trav), actx.from_numpy(self.tree))
 
     @memoize_method
     def m2l_rotation_lists(self):
@@ -684,7 +680,9 @@ def reorder_potentials(self, potentials):
 
     @log_process(logger)
     @return_timing_data
-    def form_multipoles(self, level_start_source_box_nrs, source_boxes,
+    def form_multipoles(self, actx: PyOpenCLArrayContext,
+            level_start_source_box_nrs,
+            source_boxes,
             src_weight_vecs):
         src_weights, = src_weight_vecs
         formmp = self.tree_indep.get_routine(
@@ -727,8 +725,10 @@ def form_multipoles(self, level_start_source_box_nrs, source_boxes,
 
     @log_process(logger)
     @return_timing_data
-    def coarsen_multipoles(self, level_start_source_parent_box_nrs,
-            source_parent_boxes, mpoles):
+    def coarsen_multipoles(self, actx: PyOpenCLArrayContext,
+            level_start_source_parent_box_nrs,
+            source_parent_boxes,
+            mpoles):
         tree = self.tree
 
         mpmp = self.tree_indep.get_translation_routine(self, "%ddmpmp")
@@ -783,8 +783,11 @@ def coarsen_multipoles(self, level_start_source_parent_box_nrs,
 
     @log_process(logger)
     @return_timing_data
-    def eval_direct(self, target_boxes, neighbor_sources_starts,
-            neighbor_sources_lists, src_weight_vecs):
+    def eval_direct(self, actx: PyOpenCLArrayContext,
+            target_boxes,
+            neighbor_sources_starts,
+            neighbor_sources_lists,
+            src_weight_vecs):
         src_weights, = src_weight_vecs
         output = self.output_zeros()
 
@@ -827,7 +830,7 @@ def eval_direct(self, target_boxes, neighbor_sources_starts,
 
     @log_process(logger)
     @return_timing_data
-    def multipole_to_local(self,
+    def multipole_to_local(self, actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes,
             starts, lists, mpole_exps):
@@ -942,8 +945,9 @@ def multipole_to_local(self,
 
     @log_process(logger)
     @return_timing_data
-    def eval_multipoles(self,
-            target_boxes_by_source_level, sep_smaller_nonsiblings_by_level,
+    def eval_multipoles(self, actx: PyOpenCLArrayContext,
+            target_boxes_by_source_level,
+            sep_smaller_nonsiblings_by_level,
             mpole_exps):
         output = self.output_zeros()
 
@@ -985,9 +989,10 @@ def eval_multipoles(self,
 
     @log_process(logger)
     @return_timing_data
-    def form_locals(self,
+    def form_locals(self, actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
-            target_or_target_parent_boxes, starts, lists, src_weight_vecs):
+            target_or_target_parent_boxes,
+            starts, lists, src_weight_vecs):
         src_weights, = src_weight_vecs
         local_exps = self.local_expansion_zeros()
 
@@ -1065,8 +1070,10 @@ def form_locals(self,
 
     @log_process(logger)
     @return_timing_data
-    def refine_locals(self, level_start_target_or_target_parent_box_nrs,
-            target_or_target_parent_boxes, local_exps):
+    def refine_locals(self, actx: PyOpenCLArrayContext,
+            level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes,
+            local_exps):
 
         locloc = self.tree_indep.get_translation_routine(self, "%ddlocloc")
 
@@ -1112,7 +1119,10 @@ def refine_locals(self, level_start_target_or_target_parent_box_nrs,
 
     @log_process(logger)
     @return_timing_data
-    def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
+    def eval_locals(self, actx: PyOpenCLArrayContext,
+            level_start_target_box_nrs,
+            target_boxes,
+            local_exps):
         output = self.output_zeros()
         taeval = self.tree_indep.get_expn_eval_routine("ta")
 
@@ -1147,7 +1157,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
         return output
 
     @log_process(logger)
-    def finalize_potentials(self, potential, template_ary):
+    def finalize_potentials(self, actx: PyOpenCLArrayContext, potential):
         if self.tree_indep.eqn_letter == "l" and self.dim == 2:
             scale_factor = -1/(2*np.pi)
         elif self.tree_indep.eqn_letter == "h" and self.dim == 2:
diff --git a/boxtree/rotation_classes.py b/boxtree/rotation_classes.py
index 7093bcff..6de3da6d 100644
--- a/boxtree/rotation_classes.py
+++ b/boxtree/rotation_classes.py
@@ -3,11 +3,7 @@
 -------------------------------
 
 .. autoclass:: RotationClassesInfo
-
-Build rotation classes
-----------------------
-
-.. autoclass:: RotationClassesBuilder
+.. autofunction:: build_rotation_classes
 """
 
 __copyright__ = "Copyright (C) 2019 Matt Wala"
@@ -37,18 +33,44 @@
 import numpy as np
 
 from arraycontext import Array
+from pytools import log_process
 
-from boxtree.translation_classes import TranslationClassesBuilder
+from boxtree.tree import Tree
+from boxtree.traversal import FMMTraversalInfo
 from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
 
 import logging
 logger = logging.getLogger(__name__)
 
-from pytools import log_process
+
+def vec_gcd(vec) -> int:
+    """Return the GCD of a list of integers."""
+    import math
+
+    # TODO: math.gcd accepts any number of integer arguments from Python >= 3.9
+    result = abs(vec[0])
+    for elem in vec[1:]:
+        result = math.gcd(result, abs(elem))
+
+    return result
 
 
 # {{{ rotation classes builder
 
+class RotationClassesBuilder:
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def __call__(self, actx, trav, tree, wait_for=None):
+        from warnings import warn
+        warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. "
+            "Use 'build_rotation_classes' instead.",
+            DeprecationWarning, stacklevel=2)
+
+        result = build_rotation_classes(actx, trav, tree)
+        return result, None
+
+
 @dataclass_array_container
 @dataclass(frozen=True)
 class RotationClassesInfo:
@@ -84,112 +106,95 @@ def nfrom_sep_siblings_rotation_classes(self):
         return len(self.from_sep_siblings_rotation_class_to_angle)
 
 
-class RotationClassesBuilder:
-    """Build rotation classes for List 2 translations.
-
-    .. automethod:: __init__
-    .. automethod:: __call__
-    """
-
-    def __init__(self, array_context: PyOpenCLArrayContext):
-        self._setup_actx = array_context
-        self.tcb = TranslationClassesBuilder(array_context)
-
-    @staticmethod
-    def vec_gcd(vec) -> int:
-        """Return the GCD of a list of integers."""
-        import math
-
-        # TODO: math.gcd supports a list of integers from >= 3.9
-        result = abs(vec[0])
-        for elem in vec[1:]:
-            result = math.gcd(result, abs(elem))
-
-        return result
-
-    def compute_rotation_classes(self,
-            well_sep_is_n_away: int, dimensions: int, used_translation_classes):
-        """Convert translation classes to a list of rotation classes and angles."""
-        angle_to_rot_class = {}
-        angles = []
-
-        ntranslation_classes_per_level = (
-                self.tcb.ntranslation_classes_per_level(well_sep_is_n_away,
-                    dimensions))
-
-        translation_class_to_rot_class = (
-                np.empty(ntranslation_classes_per_level, dtype=np.int32))
-
-        translation_class_to_rot_class[:] = -1
-
-        for cls in used_translation_classes:
-            vec = self.tcb.translation_class_to_normalized_vector(
-                    well_sep_is_n_away, dimensions, cls)
-
-            # Normalize the translation vector (by dividing by its GCD).
-            #
-            # We need this before computing the cosine of the rotation angle,
-            # because generally in in floating point arithmetic, if k is a
-            # positive scalar and v is a vector, we can't assume
-            #
-            #   kv[-1] / sqrt(|kv|^2) == v[-1] / sqrt(|v|^2).
-            #
-            # Normalizing ensures vectors that are positive integer multiples of
-            # each other get classified into the same equivalence class of
-            # rotations.
-            vec //= self.vec_gcd(vec)
-
-            # Compute the rotation angle for the vector.
-            norm = np.linalg.norm(vec)
-            assert norm != 0
-            angle = np.arccos(vec[-1] / norm)
-
-            # Find the rotation class.
-            if angle in angle_to_rot_class:
-                rot_class = angle_to_rot_class[angle]
-            else:
-                rot_class = len(angles)
-                angle_to_rot_class[angle] = rot_class
-                angles.append(angle)
-
-            translation_class_to_rot_class[cls] = rot_class
-
-        return translation_class_to_rot_class, angles
-
-    @log_process(logger, "build m2l rotation classes")
-    def __call__(self, actx, trav, tree, wait_for=None):
-        """Returns a pair *info*, *evt* where info is a :class:`RotationClassesInfo`.
-        """
-        evt, translation_class_is_used, translation_classes_lists = \
-            self.tcb.compute_translation_classes(actx, trav, tree, wait_for, False)
-
-        d = tree.dimensions
-        n = trav.well_sep_is_n_away
-
-        # convert translation classes to rotation classes
-
-        used_translation_classes = (
-                np.flatnonzero(actx.to_numpy(translation_class_is_used)))
-
-        translation_class_to_rotation_class, rotation_angles = (
-                self.compute_rotation_classes(n, d, used_translation_classes))
-
-        # There should be no more than 2^(d-1) * (2n+1)^d distinct rotation
-        # classes, since that is an upper bound on the number of distinct
-        # positions for list 2 boxes.
-        assert len(rotation_angles) <= 2**(d-1) * (2*n+1)**d
-
-        rotation_classes_lists = actx.from_numpy(
-            translation_class_to_rotation_class
-            )[translation_classes_lists]
-        rotation_angles = actx.from_numpy(np.array(rotation_angles))
-
-        info = RotationClassesInfo(
-                from_sep_siblings_rotation_classes=rotation_classes_lists,
-                from_sep_siblings_rotation_class_to_angle=rotation_angles,
-                )
-
-        return actx.freeze(info), evt
+def translation_classes_to_rotation_classes_and_angles(
+        used_translation_classes, well_sep_is_n_away: int, dimensions: int):
+    """Convert translation classes to a list of rotation classes and angles."""
+    angle_to_rot_class = {}
+    angles = []
+
+    from boxtree.translation_classes import ntranslation_classes_per_level
+    ntranslation_classes_per_level = (
+        ntranslation_classes_per_level(well_sep_is_n_away, dimensions))
+
+    translation_class_to_rot_class = (
+            np.empty(ntranslation_classes_per_level, dtype=np.int32))
+
+    translation_class_to_rot_class[:] = -1
+
+    from boxtree.translation_classes import translation_class_to_normalized_vector
+    for cls in used_translation_classes:
+        vec = translation_class_to_normalized_vector(
+            well_sep_is_n_away, dimensions, cls)
+
+        # Normalize the translation vector (by dividing by its GCD).
+        #
+        # We need this before computing the cosine of the rotation angle,
+        # because generally in floating point arithmetic, if k is a
+        # positive scalar and v is a vector, we can't assume
+        #
+        #   kv[-1] / sqrt(|kv|^2) == v[-1] / sqrt(|v|^2).
+        #
+        # Normalizing ensures vectors that are positive integer multiples of
+        # each other get classified into the same equivalence class of
+        # rotations.
+        vec //= vec_gcd(vec)
+
+        # Compute the rotation angle for the vector.
+        norm = np.linalg.norm(vec)
+        assert norm != 0
+        angle = np.arccos(vec[-1] / norm)
+
+        # Find the rotation class.
+        if angle in angle_to_rot_class:
+            rot_class = angle_to_rot_class[angle]
+        else:
+            rot_class = len(angles)
+            angle_to_rot_class[angle] = rot_class
+            angles.append(angle)
+
+        translation_class_to_rot_class[cls] = rot_class
+
+    return translation_class_to_rot_class, angles
+
+
+@log_process(logger, "build m2l rotation classes")
+def build_rotation_classes(
+        actx: PyOpenCLArrayContext,
+        trav: FMMTraversalInfo, tree: Tree) -> RotationClassesInfo:
+    """Build rotation classes for List 2 translations."""
+    from boxtree.translation_classes import compute_used_translation_classes
+    translation_class_is_used, translation_classes_lists = (
+        compute_used_translation_classes(actx, trav, tree,
+            is_translation_per_level=False))
+
+    d = tree.dimensions
+    n = trav.well_sep_is_n_away
+
+    # convert translation classes to rotation classes
+
+    used_translation_classes = (
+            np.flatnonzero(actx.to_numpy(translation_class_is_used)))
+
+    translation_class_to_rotation_class, rotation_angles = (
+        translation_classes_to_rotation_classes_and_angles(
+            used_translation_classes, n, d))
+
+    # There should be no more than 2^(d-1) * (2n+1)^d distinct rotation
+    # classes, since that is an upper bound on the number of distinct
+    # positions for list 2 boxes.
+    assert len(rotation_angles) <= 2**(d-1) * (2*n+1)**d
+
+    rotation_classes_lists = actx.from_numpy(
+        translation_class_to_rotation_class
+        )[translation_classes_lists]
+    rotation_angles = actx.from_numpy(np.array(rotation_angles))
+
+    info = RotationClassesInfo(
+            from_sep_siblings_rotation_classes=rotation_classes_lists,
+            from_sep_siblings_rotation_class_to_angle=rotation_angles,
+            )
+
+    return actx.freeze(info)
 
 # }}}
 
diff --git a/boxtree/tools.py b/boxtree/tools.py
index 951dea1a..da8ad00b 100644
--- a/boxtree/tools.py
+++ b/boxtree/tools.py
@@ -28,17 +28,15 @@
 
 import pyopencl as cl
 import pyopencl.array
+from pyopencl.elementwise import ElementwiseTemplate, ElementwiseKernel
 from pyopencl.tools import dtype_to_c_struct, ScalarArg, VectorArg as _VectorArg
 from mako.template import Template
 
-from pytools import Record, memoize_method
+from pytools import Record, memoize_in
 from pytools.obj_array import make_obj_array
 
 from boxtree.array_context import PyOpenCLArrayContext
 
-import loopy as lp
-from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
-
 
 # Use offsets in VectorArg by default.
 VectorArg = partial(_VectorArg, with_offset=True)
@@ -51,22 +49,19 @@ def padded_bin(i, nbits):
     return bin(i)[2:].rjust(nbits, "0")
 
 
-# NOTE: Order of positional args should match GappyCopyAndMapKernel.__call__()
-def realloc_array(actx, new_shape, ary, zero_fill=False, wait_for=None):
-    if wait_for is None:
-        wait_for = []
-
+# NOTE: Order of positional args should match copy_and_map_gappy
+def realloc_array(actx: PyOpenCLArrayContext, new_shape, ary, zero_fill=False):
     if zero_fill:
-        array_maker = actx.zeros
+        new_ary = actx.zeros(shape=new_shape, dtype=ary.dtype)
     else:
-        array_maker = actx.empty
+        new_ary = actx.empty(shape=new_shape, dtype=ary.dtype)
 
-    new_ary = array_maker(shape=new_shape, dtype=ary.dtype)
     evt = cl.enqueue_copy(actx.queue, new_ary.data, ary.data,
         byte_count=ary.nbytes,
-        wait_for=wait_for + new_ary.events)
+        wait_for=new_ary.events)
+    new_ary.add_event(evt)
 
-    return new_ary, evt
+    return new_ary
 
 
 def reverse_index_array(actx, indices, target_size=None, result_fill_value=None):
@@ -107,155 +102,155 @@ def make_normal_particle_array(actx, nparticles, dims, dtype, seed=15):
 
 
 def make_surface_particle_array(actx, nparticles, dims, dtype, seed=15):
-    if dims == 2:
-        def get_2d_knl(dtype):
-            knl = lp.make_kernel(
-                "{[i]: 0<=i<n}",
-                """
-                    for i
-                        <> phi = 2*M_PI/n * i
-                        x[i] = 0.5* (3*cos(phi) + 2*sin(3*phi))
-                        y[i] = 0.5* (1*sin(phi) + 1.5*sin(2*phi))
-                    end
-                    """,
-                [
-                    lp.GlobalArg("x,y", dtype, shape=lp.auto),
-                    lp.ValueArg("n", np.int32),
-                    ],
-                name="make_surface_dist")
-
-            knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
-
-            return knl
+    import loopy as lp
+    from boxtree.array_context import make_loopy_program
+
+    @memoize_in(actx, (make_surface_particle_array, "2d", dtype))
+    def get_2d_kernel():
+        knl = make_loopy_program(
+            "{[i]: 0 <= i < n}",
+            """
+            for i
+                <> phi = 2*M_PI / n * i
+                x0[i] = 0.5 * (3 * cos(phi) + 2.0 * sin(3 * phi))
+                x1[i] = 0.5 * (1 * sin(phi) + 1.5 * sin(2 * phi))
+            end
+            """,
+            kernel_data=[
+                lp.GlobalArg("x0,x1", dtype, shape=lp.auto),
+                lp.ValueArg("n", np.int32),
+            ],
+            name="make_surface_array_2d",
+            assumptions="n>0")
+
+        knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+        return knl
+
+    @memoize_in(actx, (make_surface_particle_array, "3d", dtype))
+    def get_3d_kernel():
+        knl = make_loopy_program(
+            "{[i, j]: 0 <= i, j <n}",
+            """
+            for i, j
+                <> phi = 2 * M_PI / n * i
+                <> theta = 2 * M_PI / n * j
+                x0[i, j] = 5 * cos(phi) * (3 + cos(theta))
+                x1[i, j] = 5 * sin(phi) * (3 + cos(theta))
+                x2[i, j] = 5 * sin(theta)
+            end
+            """,
+            kernel_data=[
+                lp.GlobalArg("x0,x1,x2", dtype, shape=lp.auto),
+                lp.ValueArg("n", np.int32),
+            ],
+            name="make_surface_array_3d",
+            assumptions="n>0")
+
+        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
+        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
+
+        return knl
 
-        evt, result = get_2d_knl(dtype)(actx.queue, n=nparticles)
-
-        result = [x.ravel() for x in result]
-
-        return make_obj_array(result)
+    if dims == 2:
+        n = nparticles
+        knl = get_2d_kernel()
     elif dims == 3:
         n = int(nparticles**0.5)
-
-        def get_3d_knl(dtype):
-            knl = lp.make_kernel(
-                "{[i,j]: 0<=i,j<n}",
-                """
-                    for i,j
-                        <> phi = 2*M_PI/n * i
-                        <> theta = 2*M_PI/n * j
-                        x[i,j] = 5*cos(phi) * (3 + cos(theta))
-                        y[i,j] = 5*sin(phi) * (3 + cos(theta))
-                        z[i,j] = 5*sin(theta)
-                    end
-                    """,
-                [
-                    lp.GlobalArg("x,y,z,", dtype, shape=lp.auto),
-                    lp.ValueArg("n", np.int32),
-                    ])
-
-            knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
-            knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
-
-            return knl
-
-        evt, result = get_3d_knl(dtype)(actx.queue, n=n)
-
-        result = [x.ravel() for x in result]
-
-        return make_obj_array(result)
+        knl = get_3d_kernel()
     else:
-        raise NotImplementedError
+        raise ValueError(f"unsupported dimensions: {dims}")
+
+    assert n > 0
+    result = actx.call_loopy(knl, n=n)
+    return make_obj_array([result[f"x{i}"].ravel() for i in range(dims)])
 
 
 def make_uniform_particle_array(actx, nparticles, dims, dtype, seed=15):
+    import loopy as lp
+    from boxtree.array_context import make_loopy_program
+
+    @memoize_in(actx, (make_uniform_particle_array, "2d", dtype))
+    def get_2d_kernel():
+        knl = make_loopy_program(
+            "{[i, j]: 0 <= i, j < n}",
+            """
+            for i, j
+                <> xx = 4 * i / (n - 1)
+                <> yy = 4 * j / (n - 1)
+                <float64> angle = 0.3
+                <> s = sin(angle)
+                <> c = cos(angle)
+                x0[i, j] = c * xx + s * yy - 2
+                x1[i, j] = -s * xx + c * yy - 2
+            end
+            """,
+            kernel_data=[
+                lp.GlobalArg("x0,x1", dtype, shape=lp.auto),
+                lp.ValueArg("n", np.int32),
+            ],
+            name="make_uniform_array_2d",
+            assumptions="n>0")
+
+        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
+        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
+
+        return knl
+
+    @memoize_in(actx, (make_uniform_particle_array, "3d", dtype))
+    def get_3d_kernel():
+        knl = make_loopy_program(
+            "{[i, j, k]: 0 <= i, j, k < n}",
+            """
+            for i, j, k
+                <> xx = i / (n - 1)
+                <> yy = j / (n - 1)
+                <> zz = k / (n - 1)
+
+                <float64> phi = 0.3
+                <> s1 = sin(phi)
+                <> c1 = cos(phi)
+
+                <> xxx = c1 * xx + s1 * yy
+                <> yyy = -s1 * xx + c1 * yy
+                <> zzz = zz
+
+                <float64> theta = 0.7
+                <> s2 = sin(theta)
+                <> c2 = cos(theta)
+
+                x0[i, j, k] = 4 * (c2 * xxx + s2 * zzz) - 2
+                x1[i, j, k] = 4 * yyy - 2
+                x2[i, j, k] = 4 * (-s2 * xxx + c2 * zzz) - 2
+            end
+            """,
+            kernel_data=[
+                lp.GlobalArg("x0,x1,x2", dtype, shape=lp.auto),
+                lp.ValueArg("n", np.int32),
+            ],
+            name="make_uniform_array_3d",
+            assumptions="n>0")
+
+        knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
+        knl = lp.split_iname(knl, "k", 16, outer_tag="g.0", inner_tag="l.0")
+
+        return knl
+
     if dims == 2:
         n = int(nparticles**0.5)
-
-        def get_2d_knl(dtype):
-            knl = lp.make_kernel(
-                "{[i,j]: 0<=i,j<n}",
-                """
-                    for i,j
-                        <> xx = 4*i/(n-1)
-                        <> yy = 4*j/(n-1)
-                        <float64> angle = 0.3
-                        <> s = sin(angle)
-                        <> c = cos(angle)
-                        x[i,j] = c*xx + s*yy - 2
-                        y[i,j] = -s*xx + c*yy - 2
-                    end
-                    """,
-                [
-                    lp.GlobalArg("x,y", dtype, shape=lp.auto),
-                    lp.ValueArg("n", np.int32),
-                    ], assumptions="n>0")
-
-            knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
-            knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
-
-            return knl
-
-        evt, result = get_2d_knl(dtype)(actx.queue, n=n)
-
-        result = [x.ravel() for x in result]
-
-        return make_obj_array(result)
+        knl = get_2d_kernel()
     elif dims == 3:
         n = int(nparticles**(1/3))
-
-        def get_3d_knl(dtype):
-            knl = lp.make_kernel(
-                "{[i,j,k]: 0<=i,j,k<n}",
-                """
-                    for i,j,k
-                        <> xx = i/(n-1)
-                        <> yy = j/(n-1)
-                        <> zz = k/(n-1)
-
-                        <float64> phi = 0.3
-                        <> s1 = sin(phi)
-                        <> c1 = cos(phi)
-
-                        <> xxx = c1*xx + s1*yy
-                        <> yyy = -s1*xx + c1*yy
-                        <> zzz = zz
-
-                        <float64> theta = 0.7
-                        <> s2 = sin(theta)
-                        <> c2 = cos(theta)
-
-                        x[i,j,k] = 4 * (c2*xxx + s2*zzz) - 2
-                        y[i,j,k] = 4 * yyy - 2
-                        z[i,j,k] = 4 * (-s2*xxx + c2*zzz) - 2
-                    end
-                    """,
-                [
-                    lp.GlobalArg("x,y,z", dtype, shape=lp.auto),
-                    lp.ValueArg("n", np.int32),
-                    ], assumptions="n>0")
-
-            knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
-            knl = lp.split_iname(knl, "k", 16, outer_tag="g.0", inner_tag="l.0")
-
-            return knl
-
-        evt, result = get_3d_knl(dtype)(actx.queue, n=n)
-
-        result = [x.ravel() for x in result]
-
-        return make_obj_array(result)
+        knl = get_3d_kernel()
     else:
-        raise NotImplementedError
-
+        raise ValueError(f"unsupported dimensions: {dims}")
 
-def make_rotated_uniform_particle_array(actx, nparticles, dims, dtype, seed=15):
-    raise NotImplementedError
+    assert n > 0
 
-# }}}
+    result = actx.call_loopy(knl, n=n)
+    return make_obj_array([result[f"x{i}"].ravel() for i in range(dims)])
 
 
-def particle_array_to_host(actx, particles):
-    return np.array([actx.to_numpy(x) for x in particles], order="F").T
+# }}}
 
 
 # {{{ host/device data storage
@@ -310,9 +305,8 @@ def transform_val(val):
     def get(self, queue, **kwargs):
         """
         :returns: a copy of *self* in which all data lives on the host, i.e.
-            all :class:`pyopencl.array.Array` and `ImmutableHostDeviceArray`
-            objects are replaced by corresponding :class:`numpy.ndarray`
-            instances on the host.
+            all :class:`pyopencl.array.Array` objects are replaced by
+            corresponding :class:`numpy.ndarray` instances on the host.
         """
         from warnings import warn
         warn(f"{type(self).__name__}.get is deprecated and will be removed "
@@ -320,9 +314,6 @@ def get(self, queue, **kwargs):
             DeprecationWarning, stacklevel=2)
 
         def try_get(attr):
-            if isinstance(attr, ImmutableHostDeviceArray):
-                return attr.host
-
             try:
                 return attr.get(queue=queue, **kwargs)
             except AttributeError:
@@ -367,8 +358,6 @@ def to_device(self, queue, exclude_fields=frozenset()):
         def _to_device(attr):
             if isinstance(attr, np.ndarray):
                 return cl.array.to_device(queue, attr).with_queue(None)
-            elif isinstance(attr, ImmutableHostDeviceArray):
-                return attr.device
             elif isinstance(attr, DeviceDataRecord):
                 return attr.to_device(queue)
             else:
@@ -376,38 +365,13 @@ def _to_device(attr):
 
         return self._transform_arrays(_to_device, exclude_fields=exclude_fields)
 
-    def to_host_device_array(self, queue, exclude_fields=frozenset()):
-        """
-        :arg exclude_fields: a :class:`frozenset` containing fields excluded
-            from transformed to `ImmutableHostDeviceArray`.
-
-        :returns: a copy of *self* where all device and host arrays are
-            transformed to `ImmutableHostDeviceArray` objects.
-        """
-        from warnings import warn
-        warn(f"{type(self).__name__}.to_host_device_array is deprecated and will "
-            "be removed in 2023. Switch from ImmutableHostDeviceArray.",
-            DeprecationWarning, stacklevel=2)
-
-        def _to_host_device_array(attr):
-            if isinstance(attr, (np.ndarray, cl.array.Array)):
-                return ImmutableHostDeviceArray(queue, attr)
-            elif isinstance(attr, DeviceDataRecord):
-                return attr.to_host_device_array(queue)
-            else:
-                return attr
-
-        return self._transform_arrays(
-            _to_host_device_array, exclude_fields=exclude_fields
-        )
-
 # }}}
 
 
 # {{{ type mangling
 
 def get_type_moniker(dtype):
-    return "%s%d" % (dtype.kind, dtype.itemsize)
+    return f"{dtype.kind}{dtype.itemsize}"
 
 # }}}
 
@@ -438,22 +402,33 @@ def get_type_moniker(dtype):
 """, strict_undefined=True)
 
 
-class GappyCopyAndMapKernel:
-    def __init__(self, array_context: PyOpenCLArrayContext):
-        self._setup_actx = array_context
-
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
-
-    @memoize_method
-    def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype,
-                    have_src_indices, have_dst_indices, map_values):
+# NOTE: Order of positional args should match realloc_array()
+def copy_and_map_gappy(
+        actx: PyOpenCLArrayContext, new_shape, ary,
+        src_indices=None, dst_indices=None, mapping=None, range=None,
+        zero_fill: bool = False,
+        debug: bool = False):
+    """Compresses box info arrays after empty leaf pruning and, optionally,
+    maps old box IDs to new box IDs (if the array being operated on contains
+    box IDs).
+    """
+    have_src_indices = src_indices is not None
+    have_dst_indices = dst_indices is not None
+    have_mapping = mapping is not None
+
+    src_index_dtype = src_indices.dtype if have_src_indices else None
+    dst_index_dtype = dst_indices.dtype if have_dst_indices else None
+
+    @memoize_in(actx, (
+        copy_and_map_gappy, ary.dtype,
+        src_index_dtype, dst_index_dtype,
+        have_src_indices, have_dst_indices, have_mapping))
+    def get_kernel():
         from boxtree.tools import VectorArg
 
         args = [
-                VectorArg(dtype, "input_ary"),
-                VectorArg(dtype, "output_ary"),
+                VectorArg(ary.dtype, "input_ary"),
+                VectorArg(ary.dtype, "output_ary"),
                ]
 
         if have_src_indices:
@@ -462,85 +437,62 @@ def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype,
         if have_dst_indices:
             args.append(VectorArg(dst_index_dtype, "to_indices"))
 
-        if map_values:
-            args.append(VectorArg(dtype, "value_map"))
+        if have_mapping:
+            args.append(VectorArg(ary.dtype, "value_map"))
 
         from pyopencl.tools import dtype_to_ctype
         src = GAPPY_COPY_TPL.render(
-                dtype=dtype,
+                dtype=ary.dtype,
                 dtype_to_ctype=dtype_to_ctype,
                 from_dtype=src_index_dtype,
                 to_dtype=dst_index_dtype,
                 from_indices=have_src_indices,
                 to_indices=have_dst_indices,
-                map_values=map_values)
+                map_values=have_mapping)
 
-        from pyopencl.elementwise import ElementwiseKernel
-        return ElementwiseKernel(self.context,
+        return ElementwiseKernel(actx.context,
                 args, str(src),
-                preamble=dtype_to_c_struct(self.context.devices[0], dtype),
+                preamble=dtype_to_c_struct(actx.queue.device, ary.dtype),
                 name="gappy_copy_and_map")
 
-    # NOTE: Order of positional args should match realloc_array()
-    def __call__(self, actx, new_shape, ary, src_indices=None,
-                 dst_indices=None, map_values=None, zero_fill=False,
-                 wait_for=None, range=None, debug=False):
-        """Compresses box info arrays after empty leaf pruning and, optionally,
-        maps old box IDs to new box IDs (if the array being operated on contains
-        box IDs).
-        """
-
-        have_src_indices = src_indices is not None
-        have_dst_indices = dst_indices is not None
-        have_map_values = map_values is not None
-
-        if not (have_src_indices or have_dst_indices):
-            raise ValueError("must specify at least one of src or dest indices")
-
-        if range is None:
-            if have_src_indices and have_dst_indices:
-                raise ValueError(
-                    "must supply range when passing both src and dest indices")
-            elif have_src_indices:
-                range = slice(src_indices.shape[0])
-                if debug:
-                    assert int(actx.to_numpy(actx.np.amax(src_indices))) < len(ary)
-            elif have_dst_indices:
-                range = slice(dst_indices.shape[0])
-                if debug:
-                    assert int(actx.to_numpy(actx.np.amax(dst_indices))) < new_shape
-
-        if zero_fill:
-            array_maker = actx.zeros
-        else:
-            array_maker = actx.empty
-
-        result = array_maker(new_shape, ary.dtype)
+    if not (have_src_indices or have_dst_indices):
+        raise ValueError("must specify at least one of src or dst indices")
+
+    if range is None:
+        if have_src_indices and have_dst_indices:
+            raise ValueError(
+                "must supply range when passing both src and dst indices")
+        elif have_src_indices:
+            range = slice(src_indices.shape[0])
+            if debug:
+                assert int(actx.to_numpy(actx.np.amax(src_indices))) < len(ary)
+        elif have_dst_indices:
+            range = slice(dst_indices.shape[0])
+            if debug:
+                assert int(actx.to_numpy(actx.np.amax(dst_indices))) < new_shape
 
-        kernel = self._get_kernel(ary.dtype,
-                                  src_indices.dtype if have_src_indices else None,
-                                  dst_indices.dtype if have_dst_indices else None,
-                                  have_src_indices,
-                                  have_dst_indices,
-                                  have_map_values)
+    if zero_fill:
+        result = actx.zeros(shape=new_shape, dtype=ary.dtype)
+    else:
+        result = actx.empty(shape=new_shape, dtype=ary.dtype)
 
-        args = (ary, result)
-        args += (src_indices,) if have_src_indices else ()
-        args += (dst_indices,) if have_dst_indices else ()
-        args += (map_values,) if have_map_values else ()
+    args = (ary, result)
+    args += (src_indices,) if have_src_indices else ()
+    args += (dst_indices,) if have_dst_indices else ()
+    args += (mapping,) if have_mapping else ()
 
-        evt = kernel(*args, queue=actx.queue, range=range, wait_for=wait_for)
+    # FIXME: avoid in-place modifications
+    kernel = get_kernel()
+    evt = kernel(*args, queue=actx.queue, range=range)
+    result.add_event(evt)
 
-        return result, evt
+    return result
 
 # }}}
 
 
 # {{{ map values through table
 
-from pyopencl.elementwise import ElementwiseTemplate
-
-
 MAP_VALUES_TPL = ElementwiseTemplate(
     arguments="""//CL//
         dst_value_t *dst,
@@ -553,43 +505,31 @@ def __call__(self, actx, new_shape, ary, src_indices=None,
     name="map_values")
 
 
-class MapValuesKernel:
-    def __init__(self, array_context: PyOpenCLArrayContext):
-        self._setup_actx = array_context
-
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
+def map_values(actx: PyOpenCLArrayContext, mapping, src, dst=None):
+    """Map the values of *src* through *mapping* as ``mapping[src[i]]``."""
+    if dst is None:
+        dst = src
 
-    @memoize_method
-    def _get_kernel(self, dst_dtype, src_dtype):
+    @memoize_in(actx, (map_values, dst.dtype, src.dtype))
+    def get_kernel():
         type_aliases = (
-            ("src_value_t", src_dtype),
-            ("dst_value_t", dst_dtype)
+            ("src_value_t", src.dtype),
+            ("dst_value_t", dst.dtype)
             )
 
-        return MAP_VALUES_TPL.build(self.context, type_aliases)
-
-    def __call__(self, map_values, src, dst=None):
-        """
-        Map the entries of the array `src` through the table `map_values`.
-        """
-        if dst is None:
-            dst = src
+        return MAP_VALUES_TPL.build(actx.context, type_aliases)
 
-        kernel = self._get_kernel(dst.dtype, src.dtype)
-        evt = kernel(dst, src, map_values)
+    # FIXME: avoid in-place modifications :(
+    evt = get_kernel()(dst, src, mapping)
+    dst.add_event(evt)
 
-        return dst, evt
+    return dst
 
 # }}}
 
 
 # {{{ binary search
 
-from mako.template import Template
-
-
 BINARY_SEARCH_TEMPLATE = Template("""
 /*
  * Returns the largest value of i such that arr[i] <= val, or (size_t) -1 if val
@@ -629,14 +569,8 @@ def __call__(self, map_values, src, dst=None):
 """)
 
 
-class InlineBinarySearch:
-
-    def __init__(self, elem_type_name):
-        self.render_vars = {"elem_t": elem_type_name}
-
-    @memoize_method
-    def __str__(self):
-        return BINARY_SEARCH_TEMPLATE.render(**self.render_vars)
+def inline_binary_search_for_type(elem_type_name: str) -> str:
+    return BINARY_SEARCH_TEMPLATE.render(**{"elem_t": elem_type_name})
 
 # }}}
 
@@ -669,82 +603,79 @@ def __str__(self):
 """
 
 
-class MaskCompressorKernel:
-    """
-    .. automethod:: __call__
-    """
-    def __init__(self, array_context: PyOpenCLArrayContext):
-        self._setup_actx = array_context
+def mask_to_csr(actx: PyOpenCLArrayContext, mask, list_dtype=None):
+    """Convert a mask to a list in :ref:`csr` format.
 
-    @property
-    def context(self):
-        return self._setup_actx.context
+    :arg mask: Either a 1D or 2D array.
+        * If *mask* is 1D, it should represent a masked list, where
+            *mask[i]* is true if and only if *i* is in the list.
+        * If *mask* is 2D, it should represent a list of masked lists,
+            so that *mask[i,j]* is true if and only if *j* is in list *i*.
 
-    @memoize_method
-    def get_list_compressor_kernel(self, mask_dtype, list_dtype):
-        from pyopencl.algorithm import ListOfListsBuilder
+    :arg list_dtype: The dtype for the output list(s). Defaults to the mask
+        dtype.
 
+    :returns: The return value depends on the type of the input.
+        * If *mask* is 1D, returns the compressed *list* array itself.
+        * If *mask* is 2D, returns a tuple *(starts, lists)*, as a :ref:`csr` list.
+    """
+    from pyopencl.algorithm import ListOfListsBuilder
+
+    if list_dtype is None:
+        list_dtype = mask.dtype
+
+    @memoize_in(actx, (mask_to_csr, "list_compressor", mask.dtype, list_dtype))
+    def get_list_compressor_kernel():
         return ListOfListsBuilder(
-                self.context,
+                actx.context,
                 [("output", list_dtype)],
                 MASK_LIST_COMPRESSOR_BODY,
                 [
-                    _VectorArg(mask_dtype, "mask"),
+                    _VectorArg(mask.dtype, "mask"),
                 ],
                 name_prefix="compress_list")
 
-    @memoize_method
-    def get_matrix_compressor_kernel(self, mask_dtype, list_dtype):
-        from pyopencl.algorithm import ListOfListsBuilder
-
+    @memoize_in(actx, (mask_to_csr, "matrix_compressor", mask.dtype, list_dtype))
+    def get_matrix_compressor_kernel():
         return ListOfListsBuilder(
-                self.context,
+                actx.context,
                 [("output", list_dtype)],
                 MASK_MATRIX_COMPRESSOR_BODY,
                 [
                     ScalarArg(np.int32, "ncols"),
                     ScalarArg(np.int32, "outer_stride"),
                     ScalarArg(np.int32, "inner_stride"),
-                    _VectorArg(mask_dtype, "mask"),
+                    _VectorArg(mask.dtype, "mask"),
                 ],
                 name_prefix="compress_matrix")
 
-    def __call__(self, actx, mask, list_dtype=None):
-        """Convert a mask to a list in :ref:`csr` format.
-
-        :arg mask: Either a 1D or 2D array.
-            * If *mask* is 1D, it should represent a masked list, where
-              *mask[i]* is true if and only if *i* is in the list.
-            * If *mask* is 2D, it should represent a list of masked lists,
-              so that *mask[i,j]* is true if and only if *j* is in list *i*.
-
-        :arg list_dtype: The dtype for the output list(s). Defaults to the mask
-            dtype.
+    if len(mask.shape) == 1:
+        knl = get_list_compressor_kernel()
+        result, evt = knl(
+            actx.queue, mask.shape[0], mask.data,
+            allocator=actx.allocator,
+            )
+        result["output"].lists.add_event(evt)
+
+        return result["output"].lists
+    elif len(mask.shape) == 2:
+        # FIXME: This is efficient for small column sizes but may not be
+        # for larger ones since the work is partitioned by row.
+        knl = get_matrix_compressor_kernel()
+        size = mask.dtype.itemsize
+        assert size > 0
+
+        result, evt = knl(
+            actx.queue, mask.shape[0], mask.shape[1],
+            mask.strides[0] // size, mask.strides[1] // size, mask.data,
+            allocator=actx.allocator,
+            )
+        result["output"].starts.add_event(evt)
+        result["output"].lists.add_event(evt)
 
-        :returns: The return value depends on the type of the input.
-            * If mask* is 1D, returns a tuple *(list, evt)*.
-            * If *mask* is 2D, returns a tuple *(starts, lists, event)*, as a
-              :ref:`csr` list.
-        """
-        if list_dtype is None:
-            list_dtype = mask.dtype
-
-        if len(mask.shape) == 1:
-            knl = self.get_list_compressor_kernel(mask.dtype, list_dtype)
-            result, evt = knl(actx.queue, mask.shape[0], mask.data)
-            return (result["output"].lists, evt)
-        elif len(mask.shape) == 2:
-            # FIXME: This is efficient for small column sizes but may not be
-            # for larger ones since the work is partitioned by row.
-            knl = self.get_matrix_compressor_kernel(mask.dtype, list_dtype)
-            size = mask.dtype.itemsize
-            assert size > 0
-            result, evt = knl(actx.queue, mask.shape[0], mask.shape[1],
-                              mask.strides[0] // size, mask.strides[1] // size,
-                              mask.data)
-            return (result["output"].starts, result["output"].lists, evt)
-        else:
-            raise ValueError("unsupported dimensionality")
+        return result["output"].starts, result["output"].lists
+    else:
+        raise ValueError("unsupported dimensionality")
 
 # }}}
 
@@ -893,58 +824,6 @@ def run_mpi(script: str, num_processes: int, env: Dict[str, Any]) -> None:
 # }}}
 
 
-# {{{ HostDeviceArray
-
-class ImmutableHostDeviceArray:
-    """Interface for arrays on both host and device.
-
-    .. note:: This interface assumes the array is immutable. The behavior of
-    modifying the content of either the host array or the device array is undefined.
-
-    @TODO: Once available, replace this implementation with PyOpenCL's in-house
-    implementation.
-    """
-    def __init__(self, queue, array):
-        self.queue = queue
-        self.shape = array.shape
-        self.host_array = None
-        self.device_array = None
-
-        if isinstance(array, np.ndarray):
-            self.host_array = array
-        elif isinstance(array, cl.array.Array):
-            self.device_array = array
-
-    def with_queue(self, queue):
-        self.queue = queue
-
-    @property
-    def svm_capable(self):
-        svm_capabilities = \
-            self.queue.device.get_info(cl.device_info.SVM_CAPABILITIES)
-        if svm_capabilities & cl.device_svm_capabilities.FINE_GRAIN_BUFFER != 0:
-            return True
-        else:
-            return False
-
-    @property
-    def host(self):
-        if self.host_array is None:
-            self.host_array = self.device_array.get(self.queue)
-        return self.host_array
-
-    @property
-    def device(self):
-        if self.device_array is None:
-            # @TODO: Use SVM
-            self.device_array = cl.array.to_device(self.queue, self.host_array)
-
-        self.device_array.with_queue(self.queue)
-        return self.device_array
-
-# }}}
-
-
 # {{{ coord_vec tools
 
 def get_coord_vec_dtype(
diff --git a/boxtree/translation_classes.py b/boxtree/translation_classes.py
index eaaf32f4..e810ed44 100644
--- a/boxtree/translation_classes.py
+++ b/boxtree/translation_classes.py
@@ -3,11 +3,7 @@
 ----------------------------------
 
 .. autoclass:: TranslationClassesInfo
-
-Build translation classes
--------------------------
-
-.. autoclass:: TranslationClassesBuilder
+.. autofunction:: build_translation_classes
 """
 
 __copyright__ = "Copyright (C) 2019 Matt Wala"
@@ -36,24 +32,23 @@
 from dataclasses import dataclass
 
 import numpy as np
-from pyopencl.elementwise import ElementwiseTemplate, ElementwiseKernel
+from pyopencl.elementwise import ElementwiseTemplate
 
 from arraycontext import Array
-from pytools import memoize_method
+from pytools import memoize_on_first_arg, log_process
 from mako.template import Template
 
-from boxtree.tools import (
-    InlineBinarySearch, get_coord_vec_dtype, coord_vec_subscript_code)
+from boxtree.tree import Tree
 from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS, FMMTraversalInfo
+from boxtree.tools import (
+    inline_binary_search_for_type, get_coord_vec_dtype, coord_vec_subscript_code)
 from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
 
 import logging
 logger = logging.getLogger(__name__)
 
-from pytools import log_process
 
-
-# {{{ translation classes builder
+# {{{ kernel templates
 
 TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE = Template(r"""//CL:mako//
     #define LEVEL_TO_RAD(level) \
@@ -114,7 +109,7 @@
         %endfor
         return result;
     }
-    """ + str(InlineBinarySearch("box_id_t")),
+    """ + inline_binary_search_for_type("box_id_t"),
     strict_undefined=True)
 
 
@@ -178,10 +173,27 @@
     atomic_or(&translation_class_is_used[translation_class], 1);
     """)
 
+# }}}
 
-@dataclass(frozen=True)
-class _KernelInfo:
-    translation_class_finder: ElementwiseKernel
+
+# {{{ translation classes builder
+
+class TranslationClassesBuilder:
+    """Deprecated callable shim; use :func:`build_translation_classes`."""
+    def __init__(self, *args, **kargs):
+        """Accept and ignore the arguments of the legacy constructor."""
+
+    def __call__(self, actx: PyOpenCLArrayContext,
+            trav, tree, wait_for=None, is_translation_per_level=True):
+        """Warn, then return ``(info, None)``; *wait_for* is not forwarded."""
+        from warnings import warn
+        warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. "
+            "Use 'build_translation_classes' instead.",
+            DeprecationWarning, stacklevel=2)
+
+        info = build_translation_classes(
+            actx, trav, tree, is_translation_per_level=is_translation_per_level)
+        return info, None
 
 
 @dataclass_array_container
@@ -233,199 +245,188 @@ def nfrom_sep_siblings_translation_classes(self):
         return len(self.from_sep_siblings_translation_class_to_distance_vector)
 
 
-class TranslationClassesBuilder:
-    """Build translation classes for List 2 translations.
-
-    .. automethod:: __init__
-    .. automethod:: __call__
-    """
-
-    def __init__(self, array_context: PyOpenCLArrayContext) -> None:
-        self._setup_actx = array_context
-
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
-
-    @memoize_method
-    def get_kernel_info(self,
-            dimensions: int,
-            well_sep_is_n_away: int,
-            box_id_dtype: np.dtype,
-            box_level_dtype: np.dtype,
-            coord_dtype: np.dtype,
-            translation_class_per_level) -> None:
-        coord_vec_dtype = get_coord_vec_dtype(coord_dtype, dimensions)
-        int_coord_vec_dtype = get_coord_vec_dtype(np.dtype(np.int32), dimensions)
-
-        num_translation_classes = \
-            self.ntranslation_classes_per_level(well_sep_is_n_away, dimensions)
-
-        # Make sure translation classes can fit inside a 32 bit integer.
-        if not num_translation_classes <= 1 + np.iinfo(np.int32).max:
-            raise ValueError("would overflow")
-
-        preamble = TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE.render(
-                dimensions=dimensions,
-                cvec_sub=partial(coord_vec_subscript_code, dimensions))
-
-        translation_class_finder = (
-                TRANSLATION_CLASS_FINDER_TEMPLATE.build(
-                    self.context,
-                    type_aliases=(
-                        ("int_coord_vec_t", int_coord_vec_dtype),
-                        ("coord_vec_t", coord_vec_dtype),
-                        ("coord_t", coord_dtype),
-                        ("box_id_t", box_id_dtype),
-                        ("box_level_t", box_level_dtype),
-                    ),
-                    var_values=(
-                        ("dimensions", dimensions),
-                        ("ntranslation_classes_per_level", num_translation_classes),
-                        ("translation_class_per_level", translation_class_per_level),
-                        ("cvec_sub", partial(
-                            coord_vec_subscript_code, dimensions)),
-                    ),
-                    more_preamble=preamble))
-
-        return _KernelInfo(translation_class_finder=translation_class_finder)
-
-    @staticmethod
-    def ntranslation_classes_per_level(
-            well_sep_is_n_away: int, dimensions: int) -> int:
-        return (4 * well_sep_is_n_away + 3) ** dimensions
-
-    def translation_class_to_normalized_vector(
-            self, well_sep_is_n_away: int, dimensions: int, cls: type
-            ) -> np.ndarray:
-        # This computes the vector for the translation class, using the inverse
-        # of the formula found in get_translation_class() defined in
-        # TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE.
-        assert 0 <= cls < self.ntranslation_classes_per_level(well_sep_is_n_away,
-                                                              dimensions)
-        result = np.zeros(dimensions, dtype=np.int32)
-        shift = 2 * well_sep_is_n_away + 1
-        base = 4 * well_sep_is_n_away + 3
-        for i in range(dimensions):
-            result[i] = cls % base - shift
-            cls //= base
-
-        return result
-
-    def compute_translation_classes(self,
-            actx: PyOpenCLArrayContext, trav, tree, wait_for,
-            is_translation_per_level):
-        """
-        :returns: a :class:`tuple` containing *evt*, *translation_class_is_used*
-            and *translation_classes_lists*.
-        """
-
-        # {{{ compute translation classes for list 2
-
-        well_sep_is_n_away = trav.well_sep_is_n_away
-        dimensions = tree.dimensions
-        coord_dtype = tree.coord_dtype
-
-        knl_info = self.get_kernel_info(
-                dimensions, well_sep_is_n_away, tree.box_id_dtype,
-                tree.box_level_dtype, coord_dtype, is_translation_per_level)
-
-        ntranslation_classes = (
-                self.ntranslation_classes_per_level(well_sep_is_n_away, dimensions))
-
-        if is_translation_per_level:
-            ntranslation_classes = ntranslation_classes * tree.nlevels
-
-        translation_classes_lists = actx.empty(
-            len(trav.from_sep_siblings_lists), dtype=np.int32)
-        translation_class_is_used = actx.zeros(ntranslation_classes, dtype=np.int32)
-
-        error_flag = actx.zeros(1, dtype=np.int32)
-        evt = knl_info.translation_class_finder(
-                trav.from_sep_siblings_lists,
-                trav.from_sep_siblings_starts,
-                trav.target_or_target_parent_boxes,
-                trav.ntarget_or_target_parent_boxes,
-                tree.box_centers,
-                tree.aligned_nboxes,
-                tree.root_extent,
-                tree.box_levels,
-                well_sep_is_n_away,
-                translation_classes_lists,
-                translation_class_is_used,
-                error_flag,
-                queue=actx.queue,
-                wait_for=wait_for)
-
-        if actx.to_numpy(error_flag)[0]:
-            raise ValueError("could not compute translation classes")
-
-        return (evt, translation_class_is_used, translation_classes_lists)
-
-        # }}}
-
-    @log_process(logger, "build m2l translation classes")
-    def __call__(self, actx: PyOpenCLArrayContext,
-            trav, tree, wait_for=None, is_translation_per_level=True):
-        """Returns a pair *info*, *evt* where info is a
-        :class:`TranslationClassesInfo`.
-        """
-        evt, translation_class_is_used, translation_classes_lists = \
-            self.compute_translation_classes(actx, trav, tree, wait_for,
-                                             is_translation_per_level)
-
-        well_sep_is_n_away = trav.well_sep_is_n_away
-        dimensions = tree.dimensions
-
-        used_translation_classes_map = np.empty(len(translation_class_is_used),
-                                                dtype=np.int32)
-        used_translation_classes_map.fill(-1)
-
-        distances = np.empty((dimensions, len(translation_class_is_used)),
-                             dtype=tree.coord_dtype)
-        num_translation_classes = \
-            self.ntranslation_classes_per_level(well_sep_is_n_away, dimensions)
-
-        nlevels = tree.nlevels
-        count = 0
-        prev_level = -1
-        from_sep_siblings_translation_classes_level_starts = \
-            np.empty(nlevels+1, dtype=np.int32)
-        for i, used in enumerate(actx.to_numpy(translation_class_is_used)):
-            cls_without_level = i % num_translation_classes
-            level = i // num_translation_classes
-            if (prev_level != level):
-                from_sep_siblings_translation_classes_level_starts[level] = count
-                prev_level = level
-
-            if not used:
-                continue
-
-            used_translation_classes_map[i] = count
-            unit_vector = self.translation_class_to_normalized_vector(
-                            well_sep_is_n_away, dimensions, cls_without_level)
-            distances[:, count] = unit_vector * tree.root_extent / (1 << level)
-            count = count + 1
-
-        from_sep_siblings_translation_classes_level_starts[nlevels] = count
-
-        translation_classes_lists = actx.from_numpy(
-            used_translation_classes_map
-            )[translation_classes_lists]
-
-        distances = actx.from_numpy(distances)
-        from_sep_siblings_translation_classes_level_starts = actx.from_numpy(
-            from_sep_siblings_translation_classes_level_starts)
-
-        info = TranslationClassesInfo(
-                traversal=trav,
-                from_sep_siblings_translation_classes=translation_classes_lists,
-                from_sep_siblings_translation_class_to_distance_vector=distances,
-                from_sep_siblings_translation_classes_level_starts=(
-                    from_sep_siblings_translation_classes_level_starts),
-                )
-
-        return actx.freeze(info), evt
+def ntranslation_classes_per_level(well_sep_is_n_away: int, dimensions: int) -> int:
+    return (4 * well_sep_is_n_away + 3) ** dimensions
+
+
+def translation_class_to_normalized_vector(
+        well_sep_is_n_away: int, dimensions: int, nclasses: int
+        ) -> np.ndarray:
+    # Decode a per-level translation class index (*nclasses*, despite its name)
+    # into its integer offset vector; this is the inverse of the formula in
+    # get_translation_class() from TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE.
+    assert 0 <= nclasses < ntranslation_classes_per_level(
+        well_sep_is_n_away, dimensions)
+    # one base-`base` digit per axis, shifted by `shift` to center offsets on 0
+    result = np.zeros(dimensions, dtype=np.int32)
+    shift = 2 * well_sep_is_n_away + 1
+    base = 4 * well_sep_is_n_away + 3
+    for i in range(dimensions):
+        result[i] = nclasses % base - shift
+        nclasses //= base
+
+    return result
+
+
+@memoize_on_first_arg
+def get_translation_class_finder_kernel(
+        actx: PyOpenCLArrayContext,
+        dimensions: int,
+        well_sep_is_n_away: int,
+        box_id_dtype: "np.dtype",
+        box_level_dtype: "np.dtype",
+        coord_dtype: "np.dtype",
+        is_translation_per_level: bool
+        ):
+    coord_vec_dtype = get_coord_vec_dtype(coord_dtype, dimensions)
+    int_coord_vec_dtype = get_coord_vec_dtype(np.dtype(np.int32), dimensions)
+    # (4n + 3) ** dimensions classes per level, with n = well_sep_is_n_away
+    num_translation_classes = (
+        ntranslation_classes_per_level(well_sep_is_n_away, dimensions))
+
+    # Make sure translation classes can fit inside a 32 bit integer.
+    if not num_translation_classes <= 1 + np.iinfo(np.int32).max:
+        raise ValueError("would overflow")
+    # device-side helpers such as get_translation_class() come from this preamble
+    preamble = TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE.render(
+            dimensions=dimensions,
+            cvec_sub=partial(coord_vec_subscript_code, dimensions))
+
+    return TRANSLATION_CLASS_FINDER_TEMPLATE.build(
+        actx.context,
+        type_aliases=(
+            ("int_coord_vec_t", int_coord_vec_dtype),
+            ("coord_vec_t", coord_vec_dtype),
+            ("coord_t", coord_dtype),
+            ("box_id_t", box_id_dtype),
+            ("box_level_t", box_level_dtype),
+        ),
+        var_values=(
+            ("dimensions", dimensions),
+            ("ntranslation_classes_per_level", num_translation_classes),
+            ("translation_class_per_level", is_translation_per_level),
+            ("cvec_sub", partial(
+                coord_vec_subscript_code, dimensions)),
+        ),
+        more_preamble=preamble)
+
+
+def compute_used_translation_classes(
+        actx: PyOpenCLArrayContext, trav: FMMTraversalInfo, tree: Tree, *,
+        is_translation_per_level: bool):
+    """Return the pair *(translation_class_is_used, translation_classes_lists)*."""
+    # {{{ compute translation classes for list 2
+    well_sep_is_n_away = trav.well_sep_is_n_away
+    dimensions = tree.dimensions
+    coord_dtype = tree.coord_dtype
+
+    ntranslation_classes = (
+        ntranslation_classes_per_level(well_sep_is_n_away, dimensions))
+    # per-level numbering reserves a separate block of classes for every level
+    if is_translation_per_level:
+        ntranslation_classes = ntranslation_classes * tree.nlevels
+
+    translation_classes_lists = actx.empty(
+        len(trav.from_sep_siblings_lists), dtype=np.int32)
+    translation_class_is_used = actx.zeros(ntranslation_classes, dtype=np.int32)
+    error_flag = actx.zeros(1, dtype=np.int32)
+    # the kernel is memoized on actx for this combination of dtypes and options
+    translation_class_finder_knl = get_translation_class_finder_kernel(
+        actx,
+        dimensions, well_sep_is_n_away,
+        tree.box_id_dtype, tree.box_level_dtype, coord_dtype,
+        is_translation_per_level,
+        )
+
+    evt = translation_class_finder_knl(
+            trav.from_sep_siblings_lists,
+            trav.from_sep_siblings_starts,
+            trav.target_or_target_parent_boxes,
+            trav.ntarget_or_target_parent_boxes,
+            tree.box_centers,
+            tree.aligned_nboxes,
+            tree.root_extent,
+            tree.box_levels,
+            well_sep_is_n_away,
+            translation_classes_lists,
+            translation_class_is_used,
+            error_flag,
+            queue=actx.queue,
+            )
+    translation_classes_lists.add_event(evt)
+    translation_class_is_used.add_event(evt)
+
+    if actx.to_numpy(error_flag):
+        raise ValueError("could not compute translation classes")
+
+    return translation_class_is_used, translation_classes_lists
+
+    # }}}
+
+
+@log_process(logger, "build m2l translation classes")
+def build_translation_classes(actx: PyOpenCLArrayContext,
+        trav: FMMTraversalInfo, tree: Tree, *,
+        is_translation_per_level: bool = True) -> TranslationClassesInfo:
+    """Build the frozen :class:`TranslationClassesInfo` for List 2 translations."""
+    translation_class_is_used, translation_classes_lists = (
+        compute_used_translation_classes(actx, trav, tree,
+            is_translation_per_level=is_translation_per_level))
+
+    well_sep_is_n_away = trav.well_sep_is_n_away
+    dimensions = tree.dimensions
+    # maps a raw class index to its compacted index; -1 marks an unused class
+    used_translation_classes_map = np.empty(
+        len(translation_class_is_used), dtype=np.int32)
+    used_translation_classes_map.fill(-1)
+
+    distances = np.empty(
+        (dimensions, len(translation_class_is_used)), dtype=tree.coord_dtype)
+    num_translation_classes = (
+        ntranslation_classes_per_level(well_sep_is_n_away, dimensions))
+
+    nlevels = tree.nlevels
+    count = 0
+    prev_level = -1
+    from_sep_siblings_translation_classes_level_starts = (
+        np.empty(nlevels + 1, dtype=np.int32))
+    # raw index i encodes (level, class) as level * num_translation_classes + class
+    for i, used in enumerate(actx.to_numpy(translation_class_is_used)):
+        cls_without_level = i % num_translation_classes
+        level = i // num_translation_classes
+        if (prev_level != level):
+            from_sep_siblings_translation_classes_level_starts[level] = count
+            prev_level = level
+
+        if not used:
+            continue
+
+        used_translation_classes_map[i] = count
+        unit_vector = translation_class_to_normalized_vector(
+            well_sep_is_n_away, dimensions, cls_without_level)
+
+        distances[:, count] = unit_vector * tree.root_extent / (1 << level)
+        count = count + 1
+
+    from_sep_siblings_translation_classes_level_starts[nlevels] = count
+    # renumber the entries of translation_classes_lists with the compacted indices
+    translation_classes_lists = actx.from_numpy(
+        used_translation_classes_map
+        )[translation_classes_lists]
+
+    distances = actx.from_numpy(distances)
+    from_sep_siblings_translation_classes_level_starts = actx.from_numpy(
+        from_sep_siblings_translation_classes_level_starts)
+
+    info = TranslationClassesInfo(
+            traversal=trav,
+            from_sep_siblings_translation_classes=translation_classes_lists,
+            from_sep_siblings_translation_class_to_distance_vector=distances,
+            from_sep_siblings_translation_classes_level_starts=(
+                from_sep_siblings_translation_classes_level_starts),
+            )
+
+    return actx.freeze(info)
 
 # }}}
 
diff --git a/boxtree/traversal.py b/boxtree/traversal.py
index 7e777150..c6ece4d2 100644
--- a/boxtree/traversal.py
+++ b/boxtree/traversal.py
@@ -3,13 +3,7 @@
 ------------------------
 
 .. autoclass:: FMMTraversalInfo
-
-Build Entrypoint
-----------------
-
-.. autoclass:: FMMTraversalBuilder
-
-    .. automethod:: __call__
+.. autofunction:: build_traversal
 """
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -37,13 +31,14 @@
 import enum
 from functools import partial
 from dataclasses import dataclass
+from typing import Optional
 
 import numpy as np
 from pyopencl.algorithm import ListOfListsBuilder
 from pyopencl.elementwise import ElementwiseTemplate, ElementwiseKernel
 
 from arraycontext import Array
-from pytools import ProcessLogger, log_process, memoize_method
+from pytools import ProcessLogger, log_process, memoize_on_first_arg
 from pytools.obj_array import make_obj_array
 from mako.template import Template
 
@@ -1180,114 +1175,126 @@ class _IndexStyle(enum.IntEnum):
 
 
 class _ListMerger:
-    """Utility class for combining box lists optionally changing indexing style."""
-
     def __init__(self, array_context: PyOpenCLArrayContext, box_id_dtype):
         self._setup_actx = array_context
         self.box_id_dtype = box_id_dtype
 
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
+    def __call__(self, actx, input_starts, input_lists, input_index_style,
+            output_index_style, target_boxes, target_or_target_parent_boxes,
+            nboxes, debug=False, wait_for=None):
+        """Deprecated shim: returns the legacy pair *(results_dict, None)*."""
+        from warnings import warn
+        warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. "
+            "Use 'merge_lists' instead.",
+            DeprecationWarning, stacklevel=2)
+        result = merge_lists(
+            actx, input_starts, input_lists, input_index_style, output_index_style,
+            target_boxes, target_or_target_parent_boxes, nboxes, self.box_id_dtype,
+            debug=debug)
+        return result, None
+
+
+def merge_lists(
+        actx: PyOpenCLArrayContext, input_starts, input_lists, input_index_style,
+        output_index_style, target_boxes, target_or_target_parent_boxes,
+        nboxes, box_id_dtype, debug: bool = False):
+    """Combine several box lists, optionally changing the indexing style.
+
+    :arg input_starts: Starts arrays of input
+    :arg input_lists: Lists arrays of input
+    :arg input_index_style: A :class:`_IndexStyle`
+    :arg output_index_style: A :class:`_IndexStyle`
+    """
+    # {{{
+
+    if (
+            output_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES
+            and input_index_style == _IndexStyle.TARGET_BOXES):
+        raise ValueError(
+                "unsupported: merging a list indexed by target boxes "
+                "into a list indexed by target or target parent boxes")
+
+    ntarget_boxes = len(target_boxes)
+    ntarget_or_ntarget_parent_boxes = len(target_or_target_parent_boxes)
+
+    noutput_boxes = (ntarget_boxes
+        if output_index_style == _IndexStyle.TARGET_BOXES
+        else ntarget_or_ntarget_parent_boxes)
+
+    if (
+            input_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES
+            and output_index_style == _IndexStyle.TARGET_BOXES):
+        from boxtree.tools import reverse_index_array
+        target_or_target_parent_boxes_from_all_boxes = reverse_index_array(
+                actx, target_or_target_parent_boxes, target_size=nboxes)
+        target_or_target_parent_boxes_from_target_boxes = (
+                target_or_target_parent_boxes_from_all_boxes[target_boxes])
+
+        output_to_input_box = target_or_target_parent_boxes_from_target_boxes
+    else:
+        output_to_input_box = actx.from_numpy(
+                np.arange(noutput_boxes, dtype=box_id_dtype))
+
+    new_counts = actx.empty(noutput_boxes + 1, box_id_dtype)
+    assert len(input_starts) == len(input_lists)
+
+    nlists = len(input_starts)
+    assert nlists >= 1
 
-    @memoize_method
-    def get_list_merger_kernel(self, nlists, write_counts):
-        """
-        :arg nlists: Number of input lists
-        :arg write_counts: A :class:`bool`, indicating whether to generate a
-            kernel that produces box counts or box lists
-        """
-        assert nlists >= 1
+    # }}}
+
+    # {{{ merge lists
 
+    from pytools import memoize_in
+
+    @memoize_in(actx, (merge_lists, box_id_dtype, nlists))
+    def get_list_merger_kernel(with_write_counts):
         return LIST_MERGER_TEMPLATE.build(
-                self.context,
+                actx.context,
                 type_aliases=(
-                    ("box_id_t", self.box_id_dtype),
+                    ("box_id_t", box_id_dtype),
                 ),
                 var_values=(
                     ("nlists", nlists),
-                    ("write_counts", write_counts),
+                    ("write_counts", with_write_counts),
                 ))
 
-    def __call__(self, actx, input_starts, input_lists, input_index_style,
-            output_index_style, target_boxes, target_or_target_parent_boxes,
-            nboxes, debug=False, wait_for=None):
-        """
-        :arg input_starts: Starts arrays of input
-        :arg input_lists: Lists arrays of input
-        :arg input_index_style: A :class:`_IndexStyle`
-        :arg output_index_style: A :class:`_IndexStyle`
-        :returns: A pair *results_dict, event*, where *results_dict*
-            contains entries *starts* and *lists*
-        """
-        if wait_for is None:
-            wait_for = []
+    evt = get_list_merger_kernel(True)(*(
+                # input:
+                (output_to_input_box,)
+                + input_starts
+                # output:
+                + (new_counts,)),
+                range=slice(noutput_boxes),
+                queue=actx.queue,
+                )
+    new_counts.add_event(evt)
+
+    import pyopencl.array as cl_array
+    new_starts = cl_array.cumsum(new_counts)
+    del new_counts
+
+    new_lists = actx.empty(int(actx.to_numpy(new_starts[-1])), box_id_dtype)
+    new_lists.fill(999999999)
+
+    evt = get_list_merger_kernel(False)(*(
+                # input:
+                (output_to_input_box,)
+                + input_starts
+                + input_lists
+                + (new_starts,)
+                # output:
+                + (new_lists,)),
+                range=slice(noutput_boxes),
+                queue=actx.queue,
+                )
+    new_starts.add_event(evt)
+    new_lists.add_event(evt)
+
+    # }}}
+
+    return {"starts": new_starts, "lists": new_lists}
 
-        if (
-                output_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES
-                and input_index_style == _IndexStyle.TARGET_BOXES):
-            raise ValueError(
-                    "unsupported: merging a list indexed by target boxes "
-                    "into a list indexed by target or target parent boxes")
-
-        ntarget_boxes = len(target_boxes)
-        ntarget_or_ntarget_parent_boxes = len(target_or_target_parent_boxes)
-
-        noutput_boxes = (ntarget_boxes
-                if output_index_style == _IndexStyle.TARGET_BOXES
-                else ntarget_or_ntarget_parent_boxes)
-
-        if (
-                input_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES
-                and output_index_style == _IndexStyle.TARGET_BOXES):
-            from boxtree.tools import reverse_index_array
-            target_or_target_parent_boxes_from_all_boxes = reverse_index_array(
-                    actx, target_or_target_parent_boxes, target_size=nboxes)
-            target_or_target_parent_boxes_from_target_boxes = (
-                    target_or_target_parent_boxes_from_all_boxes[target_boxes]
-                    )
-
-            output_to_input_box = target_or_target_parent_boxes_from_target_boxes
-        else:
-            output_to_input_box = actx.from_numpy(
-                    np.arange(noutput_boxes, dtype=self.box_id_dtype)
-                    )
-
-        new_counts = actx.empty(noutput_boxes + 1, self.box_id_dtype)
-
-        assert len(input_starts) == len(input_lists)
-        nlists = len(input_starts)
-
-        evt = self.get_list_merger_kernel(nlists, True)(*(
-                    # input:
-                    (output_to_input_box,)
-                    + input_starts
-                    # output:
-                    + (new_counts,)),
-                    range=slice(noutput_boxes),
-                    queue=actx.queue,
-                    wait_for=wait_for)
-
-        import pyopencl.array as cl_array
-        new_starts = cl_array.cumsum(new_counts)
-        del new_counts
-
-        new_lists = actx.empty(int(actx.to_numpy(new_starts[-1])), self.box_id_dtype)
-        new_lists.fill(999999999)
-
-        evt = self.get_list_merger_kernel(nlists, False)(*(
-                    # input:
-                    (output_to_input_box,)
-                    + input_starts
-                    + input_lists
-                    + (new_starts,)
-                    # output:
-                    + (new_lists,)),
-                    range=slice(noutput_boxes),
-                    queue=actx.queue,
-                    wait_for=[evt])
-
-        return dict(starts=new_starts, lists=new_lists), evt
 
 # }}}
 
@@ -1592,7 +1599,9 @@ def ntarget_or_target_parent_boxes(self):
 
     # {{{ "close" list merging -> "unified list 1"
 
-    def merge_close_lists(self, actx, debug=False):
+    def merge_close_lists(self,
+            actx: PyOpenCLArrayContext,
+            debug: bool = False) -> "FMMTraversalInfo":
         """Return a new :class:`FMMTraversalInfo` instance with the contents of
         :attr:`from_sep_close_smaller_starts` and
         :attr:`from_sep_close_bigger_starts` merged into
@@ -1600,10 +1609,8 @@ def merge_close_lists(self, actx, debug=False):
         *None*.
         """
 
-        list_merger = _ListMerger(actx, self.tree.box_id_dtype)
-
-        result, evt = (
-                list_merger(
+        result = (
+                merge_lists(
                     actx,
                     # starts
                     (self.neighbor_source_boxes_starts,
@@ -1621,11 +1628,9 @@ def merge_close_lists(self, actx, debug=False):
                     self.target_boxes,
                     self.target_or_target_parent_boxes,
                     self.tree.nboxes,
+                    self.tree.box_id_dtype,
                     debug))
 
-        import pyopencl as cl
-        cl.wait_for_events([evt])
-
         from dataclasses import replace
         return replace(self,
                 neighbor_source_boxes_starts=actx.freeze(result["starts"]),
@@ -1651,7 +1656,7 @@ def get_box_list(self, what, index):
 
 
 @dataclass(frozen=True)
-class _KernelInfo:
+class TraversalKernelInfo:
     sources_parents_and_targets_builder: ListOfListsBuilder
     level_start_box_nrs_extractor: ElementwiseKernel
     same_level_non_well_sep_boxes_builder: ListOfListsBuilder
@@ -1662,588 +1667,626 @@ class _KernelInfo:
 
 
 class FMMTraversalBuilder:
-    """
-    .. automethod:: __init__
-    """
-
-    def __init__(self, array_context: PyOpenCLArrayContext, *,
-            well_sep_is_n_away=1,
-            from_sep_smaller_crit=None) -> None:
-        """
-        :arg well_sep_is_n_away: Either An integer 1 or greater.
-            (Only 1 and 2 are tested.)
-            The spacing between boxes that is considered "well-separated" for
-            :attr:`boxtree.traversal.FMMTraversalInfo.from_sep_siblings_starts`
-            (List 2).
-        :arg from_sep_smaller_crit: The criterion used to determine separation
-            box dimensions and separation for
-            :attr:`boxtree.traversal.FMMTraversalInfo.from_sep_smaller_by_level`
-            (List 3). May be one of ``"static_linf"`` (use the box square,
-            possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`),
-            ``"precise_linf"`` (use the precise extent of targets in the box,
-            including their radii), or ``"static_l2"`` (use the circumcircle of
-            the box, possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`).
-        """
+    def __init__(self,
+            array_context: PyOpenCLArrayContext, *,
+            well_sep_is_n_away: int = 1,
+            from_sep_smaller_crit: Optional[str] = None) -> None:
         self._setup_actx = array_context
         self.well_sep_is_n_away = well_sep_is_n_away
         self.from_sep_smaller_crit = from_sep_smaller_crit
 
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
+    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
+                wait_for=None, debug=False,
+                 _from_sep_smaller_min_nsources_cumul=None,
+                 source_boxes_mask=None,
+                 source_parent_boxes_mask=None):
+        from warnings import warn
+        warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. "
+            "Use 'build_traversal' instead.",
+            DeprecationWarning, stacklevel=2)
+
+        result = build_traversal(actx, tree,
+            well_sep_is_n_away=self.well_sep_is_n_away,
+            from_sep_smaller_crit=self.from_sep_smaller_crit,
+            source_boxes_mask=source_boxes_mask,
+            source_parent_boxes_mask=source_parent_boxes_mask,
+            _from_sep_smaller_min_nsources_cumul=(
+                _from_sep_smaller_min_nsources_cumul),
+            debug=debug,
+            )
+
+        return result, None
+
+
+# {{{ traversal kernels
+
+@log_process(logger)
+@memoize_on_first_arg
+def get_traversal_kernel_info(
+        actx: PyOpenCLArrayContext, *,
+        dimensions: int,
+        particle_id_dtype: "np.dtype",
+        box_id_dtype: "np.dtype",
+        coord_dtype: "np.dtype",
+        box_level_dtype: "np.dtype",
+        max_levels: int,
+        sources_are_targets: bool,
+        sources_have_extent: bool,
+        targets_have_extent: bool,
+        extent_norm: str,
+        source_boxes_has_mask: bool,
+        source_parent_boxes_has_mask: bool,
+        well_sep_is_n_away: int,
+        from_sep_smaller_crit: str,
+        debug: bool = False) -> TraversalKernelInfo:
+    # {{{ process from_sep_smaller_crit
+
+    if extent_norm == "linf":
+        # no special checks needed
+        pass
+
+    elif extent_norm == "l2":
+        if from_sep_smaller_crit == "static_linf":
+            # Not technically necessary, but static linf will assume box
+            # bounds that are not guaranteed to contain all particle
+            # extents.
+            raise ValueError(
+                    "The static l^inf from-sep-smaller criterion "
+                    "cannot be used with the l^2 extent norm")
 
-    # {{{ kernel builder
+    elif extent_norm is None:
+        assert not (sources_have_extent or targets_have_extent)
 
-    @memoize_method
-    @log_process(logger)
-    def get_kernel_info(self, dimensions, particle_id_dtype, box_id_dtype,
-            coord_dtype, box_level_dtype, max_levels,
-            sources_are_targets, sources_have_extent, targets_have_extent,
-            extent_norm,
-            source_boxes_has_mask,
-            source_parent_boxes_has_mask):
+        if from_sep_smaller_crit is None:
+            # doesn't matter
+            from_sep_smaller_crit = "static_linf"
 
-        # {{{ process from_sep_smaller_crit
+    else:
+        raise ValueError("unexpected value of 'extent_norm': %s"
+                % extent_norm)
 
-        from_sep_smaller_crit = self.from_sep_smaller_crit
+    if from_sep_smaller_crit not in [
+            "static_linf", "precise_linf",
+            "static_l2",
+            ]:
+        raise ValueError("unexpected value of 'from_sep_smaller_crit': %s"
+                % from_sep_smaller_crit)
 
-        if from_sep_smaller_crit is None:
-            from_sep_smaller_crit = "precise_linf"
-
-        if extent_norm == "linf":
-            # no special checks needed
-            pass
-
-        elif extent_norm == "l2":
-            if from_sep_smaller_crit == "static_linf":
-                # Not technically necessary, but static linf will assume box
-                # bounds that are not guaranteed to contain all particle
-                # extents.
-                raise ValueError(
-                        "The static l^inf from-sep-smaller criterion "
-                        "cannot be used with the l^2 extent norm")
-
-        elif extent_norm is None:
-            assert not (sources_have_extent or targets_have_extent)
-
-            if from_sep_smaller_crit is None:
-                # doesn't matter
-                from_sep_smaller_crit = "static_linf"
-
-        else:
-            raise ValueError("unexpected value of 'extent_norm': %s"
-                    % extent_norm)
-
-        if from_sep_smaller_crit not in [
-                "static_linf", "precise_linf",
-                "static_l2",
-                ]:
-            raise ValueError("unexpected value of 'from_sep_smaller_crit': %s"
-                    % from_sep_smaller_crit)
-
-        # }}}
-
-        debug = False
-
-        from pyopencl.tools import dtype_to_ctype
-
-        from boxtree.tree import box_flags_enum
-        from boxtree.tools import AXIS_NAMES
-
-        render_vars = dict(
-                np=np,
-                dimensions=dimensions,
-                dtype_to_ctype=dtype_to_ctype,
-                particle_id_dtype=particle_id_dtype,
-                box_id_dtype=box_id_dtype,
-                box_flags_enum=box_flags_enum,
-                coord_dtype=coord_dtype,
-                get_coord_vec_dtype=get_coord_vec_dtype,
-                cvec_sub=partial(coord_vec_subscript_code, dimensions),
-                max_levels=max_levels,
-                AXIS_NAMES=AXIS_NAMES,
-                debug=debug,
-                sources_are_targets=sources_are_targets,
-                sources_have_extent=sources_have_extent,
-                targets_have_extent=targets_have_extent,
-                well_sep_is_n_away=self.well_sep_is_n_away,
-                from_sep_smaller_crit=from_sep_smaller_crit,
-                source_boxes_has_mask=source_boxes_has_mask,
-                source_parent_boxes_has_mask=source_parent_boxes_has_mask
-                )
-        from pyopencl.algorithm import ListOfListsBuilder
-        from boxtree.tools import VectorArg, ScalarArg
+    # }}}
+
+    from pyopencl.tools import dtype_to_ctype
+
+    from boxtree.tree import box_flags_enum
+    from boxtree.tools import AXIS_NAMES
+
+    render_vars = dict(
+            np=np,
+            dimensions=dimensions,
+            dtype_to_ctype=dtype_to_ctype,
+            particle_id_dtype=particle_id_dtype,
+            box_id_dtype=box_id_dtype,
+            box_flags_enum=box_flags_enum,
+            coord_dtype=coord_dtype,
+            get_coord_vec_dtype=get_coord_vec_dtype,
+            cvec_sub=partial(coord_vec_subscript_code, dimensions),
+            max_levels=max_levels,
+            AXIS_NAMES=AXIS_NAMES,
+            debug=debug,
+            sources_are_targets=sources_are_targets,
+            sources_have_extent=sources_have_extent,
+            targets_have_extent=targets_have_extent,
+            well_sep_is_n_away=well_sep_is_n_away,
+            from_sep_smaller_crit=from_sep_smaller_crit,
+            source_boxes_has_mask=source_boxes_has_mask,
+            source_parent_boxes_has_mask=source_parent_boxes_has_mask
+            )
+    from pyopencl.algorithm import ListOfListsBuilder
+    from boxtree.tools import VectorArg, ScalarArg
+
+    result = {}
+
+    # {{{ source boxes, their parents, target boxes
+
+    src = Template(
+            TRAVERSAL_PREAMBLE_TEMPLATE
+            + SOURCES_PARENTS_AND_TARGETS_TEMPLATE,
+            strict_undefined=True).render(**render_vars)
+
+    arg_decls = [VectorArg(box_flags_enum.dtype, "box_flags")]
+    if source_boxes_has_mask:
+        arg_decls.append(VectorArg(np.int8, "source_boxes_mask"))
+    if source_parent_boxes_has_mask:
+        arg_decls.append(VectorArg(np.int8, "source_parent_boxes_mask"))
+
+    result["sources_parents_and_targets_builder"] = \
+            ListOfListsBuilder(actx.context,
+                    [
+                        ("source_parent_boxes", box_id_dtype),
+                        ("source_boxes", box_id_dtype),
+                        ("target_or_target_parent_boxes", box_id_dtype)
+                        ] + (
+                            [("target_boxes", box_id_dtype)]
+                            if not sources_are_targets
+                            else []),
+                    str(src),
+                    arg_decls=arg_decls,
+                    debug=debug,
+                    name_prefix="sources_parents_and_targets")
 
-        result = {}
+    result["level_start_box_nrs_extractor"] = \
+            LEVEL_START_BOX_NR_EXTRACTOR_TEMPLATE.build(actx.context,
+                type_aliases=(
+                    ("box_id_t", box_id_dtype),
+                    ("box_level_t", box_level_dtype),
+                    ),
+                )
 
-        # {{{ source boxes, their parents, target boxes
+    # }}}
 
+    # {{{ build list N builders
+
+    base_args = [
+            VectorArg(coord_dtype, "box_centers", with_offset=False),
+            ScalarArg(coord_dtype, "root_extent"),
+            VectorArg(np.uint8, "box_levels"),
+            ScalarArg(box_id_dtype, "aligned_nboxes"),
+            VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
+            VectorArg(box_flags_enum.dtype, "box_flags"),
+            ]
+
+    for list_name, template, extra_args, extra_lists, eliminate_empty_list in [
+            ("same_level_non_well_sep_boxes",
+                SAME_LEVEL_NON_WELL_SEP_BOXES_TEMPLATE, [], [], []),
+            ("neighbor_source_boxes", NEIGBHOR_SOURCE_BOXES_TEMPLATE,
+                    [
+                        VectorArg(box_id_dtype, "target_boxes"),
+                        ], [], []),
+            ("from_sep_siblings", FROM_SEP_SIBLINGS_TEMPLATE,
+                    [
+                        VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
+                        VectorArg(box_id_dtype, "box_parent_ids",
+                            with_offset=False),
+                        VectorArg(box_id_dtype,
+                            "same_level_non_well_sep_boxes_starts"),
+                        VectorArg(box_id_dtype,
+                            "same_level_non_well_sep_boxes_lists"),
+                        ], [], []),
+            ("from_sep_smaller", FROM_SEP_SMALLER_TEMPLATE,
+                    [
+                        ScalarArg(coord_dtype, "stick_out_factor"),
+                        VectorArg(box_id_dtype, "target_boxes"),
+                        VectorArg(box_id_dtype,
+                            "same_level_non_well_sep_boxes_starts"),
+                        VectorArg(box_id_dtype,
+                            "same_level_non_well_sep_boxes_lists"),
+                        VectorArg(coord_dtype, "box_target_bounding_box_min",
+                            with_offset=False),
+                        VectorArg(coord_dtype, "box_target_bounding_box_max",
+                            with_offset=False),
+                        VectorArg(particle_id_dtype, "box_source_counts_cumul"),
+                        ScalarArg(particle_id_dtype,
+                            "from_sep_smaller_min_nsources_cumul"),
+                        ScalarArg(box_id_dtype, "from_sep_smaller_source_level"),
+                        ],
+                        ["from_sep_close_smaller"]
+                        if sources_have_extent or targets_have_extent
+                        else [], ["from_sep_smaller"]),
+            ("from_sep_bigger", FROM_SEP_BIGGER_TEMPLATE,
+                    [
+                        ScalarArg(coord_dtype, "stick_out_factor"),
+                        VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
+                        VectorArg(box_id_dtype, "box_parent_ids",
+                            with_offset=False),
+                        VectorArg(box_id_dtype,
+                            "same_level_non_well_sep_boxes_starts"),
+                        VectorArg(box_id_dtype,
+                            "same_level_non_well_sep_boxes_lists"),
+                        ],
+                        ["from_sep_close_bigger"]
+                        if sources_have_extent or targets_have_extent
+                        else [], []),
+            ]:
         src = Template(
                 TRAVERSAL_PREAMBLE_TEMPLATE
-                + SOURCES_PARENTS_AND_TARGETS_TEMPLATE,
+                + HELPER_FUNCTION_TEMPLATE
+                + template,
                 strict_undefined=True).render(**render_vars)
 
-        arg_decls = [VectorArg(box_flags_enum.dtype, "box_flags")]
-        if source_boxes_has_mask:
-            arg_decls.append(VectorArg(np.int8, "source_boxes_mask"))
-        if source_parent_boxes_has_mask:
-            arg_decls.append(VectorArg(np.int8, "source_parent_boxes_mask"))
-
-        result["sources_parents_and_targets_builder"] = \
-                ListOfListsBuilder(self.context,
-                        [
-                            ("source_parent_boxes", box_id_dtype),
-                            ("source_boxes", box_id_dtype),
-                            ("target_or_target_parent_boxes", box_id_dtype)
-                            ] + (
-                                [("target_boxes", box_id_dtype)]
-                                if not sources_are_targets
-                                else []),
-                        str(src),
-                        arg_decls=arg_decls,
-                        debug=debug,
-                        name_prefix="sources_parents_and_targets")
-
-        result["level_start_box_nrs_extractor"] = \
-                LEVEL_START_BOX_NR_EXTRACTOR_TEMPLATE.build(self.context,
-                    type_aliases=(
-                        ("box_id_t", box_id_dtype),
-                        ("box_level_t", box_level_dtype),
-                        ),
-                    )
-
-        # }}}
-
-        # {{{ build list N builders
-
-        base_args = [
-                VectorArg(coord_dtype, "box_centers", with_offset=False),
-                ScalarArg(coord_dtype, "root_extent"),
-                VectorArg(np.uint8, "box_levels"),
-                ScalarArg(box_id_dtype, "aligned_nboxes"),
-                VectorArg(box_id_dtype, "box_child_ids", with_offset=False),
-                VectorArg(box_flags_enum.dtype, "box_flags"),
-                ]
-
-        for list_name, template, extra_args, extra_lists, eliminate_empty_list in [
-                ("same_level_non_well_sep_boxes",
-                    SAME_LEVEL_NON_WELL_SEP_BOXES_TEMPLATE, [], [], []),
-                ("neighbor_source_boxes", NEIGBHOR_SOURCE_BOXES_TEMPLATE,
-                        [
-                            VectorArg(box_id_dtype, "target_boxes"),
-                            ], [], []),
-                ("from_sep_siblings", FROM_SEP_SIBLINGS_TEMPLATE,
-                        [
-                            VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
-                            VectorArg(box_id_dtype, "box_parent_ids",
-                                with_offset=False),
-                            VectorArg(box_id_dtype,
-                                "same_level_non_well_sep_boxes_starts"),
-                            VectorArg(box_id_dtype,
-                                "same_level_non_well_sep_boxes_lists"),
-                            ], [], []),
-                ("from_sep_smaller", FROM_SEP_SMALLER_TEMPLATE,
-                        [
-                            ScalarArg(coord_dtype, "stick_out_factor"),
-                            VectorArg(box_id_dtype, "target_boxes"),
-                            VectorArg(box_id_dtype,
-                                "same_level_non_well_sep_boxes_starts"),
-                            VectorArg(box_id_dtype,
-                                "same_level_non_well_sep_boxes_lists"),
-                            VectorArg(coord_dtype, "box_target_bounding_box_min",
-                                with_offset=False),
-                            VectorArg(coord_dtype, "box_target_bounding_box_max",
-                                with_offset=False),
-                            VectorArg(particle_id_dtype, "box_source_counts_cumul"),
-                            ScalarArg(particle_id_dtype,
-                                "from_sep_smaller_min_nsources_cumul"),
-                            ScalarArg(box_id_dtype, "from_sep_smaller_source_level"),
-                            ],
-                            ["from_sep_close_smaller"]
-                            if sources_have_extent or targets_have_extent
-                            else [], ["from_sep_smaller"]),
-                ("from_sep_bigger", FROM_SEP_BIGGER_TEMPLATE,
-                        [
-                            ScalarArg(coord_dtype, "stick_out_factor"),
-                            VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
-                            VectorArg(box_id_dtype, "box_parent_ids",
-                                with_offset=False),
-                            VectorArg(box_id_dtype,
-                                "same_level_non_well_sep_boxes_starts"),
-                            VectorArg(box_id_dtype,
-                                "same_level_non_well_sep_boxes_lists"),
-                            ],
-                            ["from_sep_close_bigger"]
-                            if sources_have_extent or targets_have_extent
-                            else [], []),
-                ]:
-            src = Template(
-                    TRAVERSAL_PREAMBLE_TEMPLATE
-                    + HELPER_FUNCTION_TEMPLATE
-                    + template,
-                    strict_undefined=True).render(**render_vars)
-
-            result[f"{list_name}_builder"] = ListOfListsBuilder(self.context,
-                    [(list_name, box_id_dtype)]
-                    + [(extra_list_name, box_id_dtype)
-                        for extra_list_name in extra_lists],
-                    str(src),
-                    arg_decls=base_args + extra_args,
-                    debug=debug, name_prefix=list_name,
-                    complex_kernel=True,
-                    eliminate_empty_output_lists=eliminate_empty_list)
+        result[f"{list_name}_builder"] = ListOfListsBuilder(actx.context,
+                [(list_name, box_id_dtype)]
+                + [(extra_list_name, box_id_dtype)
+                    for extra_list_name in extra_lists],
+                str(src),
+                arg_decls=base_args + extra_args,
+                debug=debug, name_prefix=list_name,
+                complex_kernel=True,
+                eliminate_empty_output_lists=eliminate_empty_list)
 
-        # }}}
+    # }}}
 
-        return _KernelInfo(**result)
+    return TraversalKernelInfo(**result)
 
-    # }}}
+# }}}
 
-    # {{{ driver
 
-    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
-                wait_for=None, debug=False,
-                 _from_sep_smaller_min_nsources_cumul=None,
-                 source_boxes_mask=None,
-                 source_parent_boxes_mask=None):
-        """
-        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
-            instances for whose completion this command waits before starting
-            exeuction.
-        :arg source_boxes_mask: Only boxes passing this mask will be considered for
-            `source_boxes`. Used by the distributed implementation.
-        :arg source_parent_boxes_mask: Only boxes passing this mask will be
-            considered for `source_parent_boxes`. Used by the distributed
-            implementation.
-        :return: A :class:`tuple` *(trav, event)*, where *trav* is a new instance of
-            :class:`FMMTraversalInfo` and *event* is a :class:`pyopencl.Event`
-            for dependency management.
-        """
-        if _from_sep_smaller_min_nsources_cumul is None:
-            # default to old no-threshold behavior
-            _from_sep_smaller_min_nsources_cumul = 0
-
-        if not tree._is_pruned:
-            raise ValueError("tree must be pruned for traversal generation")
-
-        if tree.sources_have_extent:
-            # YAGNI
-            raise NotImplementedError(
-                    "trees with source extent are not supported for "
-                    "traversal generation")
+# {{{ driver
 
-        # Generated code shouldn't depend on the *exact* number of tree levels.
-        # So round up to the next multiple of 5.
-        from pytools import div_ceil
-        max_levels = div_ceil(tree.nlevels, 5) * 5
+def build_traversal(
+        actx: PyOpenCLArrayContext, tree: Tree, *,
+        well_sep_is_n_away: int = 1,
+        from_sep_smaller_crit: Optional[str] = None,
+        source_boxes_mask: Optional["np.ndarray"] = None,
+        source_parent_boxes_mask: Optional["np.ndarray"] = None,
+        _from_sep_smaller_min_nsources_cumul=None,
+        debug: bool = False) -> FMMTraversalInfo:
+    """
+    :arg well_sep_is_n_away: An integer 1 or greater.
+        (Only 1 and 2 are tested.)
+        The spacing between boxes that is considered "well-separated" for
+        :attr:`boxtree.traversal.FMMTraversalInfo.from_sep_siblings_starts`
+        (List 2).
+    :arg from_sep_smaller_crit: The criterion used to determine separation
+        box dimensions and separation for
+        :attr:`boxtree.traversal.FMMTraversalInfo.from_sep_smaller_by_level`
+        (List 3). May be one of ``"static_linf"`` (use the box square,
+        possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`),
+        ``"precise_linf"`` (use the precise extent of targets in the box,
+        including their radii), or ``"static_l2"`` (use the circumcircle of
+        the box, possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`).
+    :arg source_boxes_mask: Only boxes passing this mask will be considered for
+        `source_boxes`. Used by the distributed implementation.
+    :arg source_parent_boxes_mask: Only boxes passing this mask will be
+        considered for `source_parent_boxes`. Used by the distributed
+        implementation.
+
+    :return: A new instance of :class:`FMMTraversalInfo`. No
+        :class:`pyopencl.Event` is returned; the deprecated
+        :class:`FMMTraversalBuilder` wrapper returns ``None`` in its place.
+    """
+    if from_sep_smaller_crit is None:
+        from_sep_smaller_crit = "precise_linf"
+
+    if _from_sep_smaller_min_nsources_cumul is None:
+        # default to old no-threshold behavior
+        _from_sep_smaller_min_nsources_cumul = 0
+
+    if not tree._is_pruned:
+        raise ValueError("tree must be pruned for traversal generation")
+
+    if tree.sources_have_extent:
+        # YAGNI
+        raise NotImplementedError(
+                "trees with source extent are not supported for "
+                "traversal generation")
+
+    # Generated code shouldn't depend on the *exact* number of tree levels.
+    # So round up to the next multiple of 5.
+    from pytools import div_ceil
+    max_levels = div_ceil(tree.nlevels, 5) * 5
+
+    knl = get_traversal_kernel_info(
+            actx,
+            dimensions=tree.dimensions,
+            particle_id_dtype=tree.particle_id_dtype,
+            box_id_dtype=tree.box_id_dtype,
+            coord_dtype=tree.coord_dtype,
+            box_level_dtype=tree.box_level_dtype,
+            max_levels=max_levels,
+            sources_are_targets=tree.sources_are_targets,
+            sources_have_extent=tree.sources_have_extent,
+            targets_have_extent=tree.targets_have_extent,
+            extent_norm=tree.extent_norm,
+            source_boxes_has_mask=source_boxes_mask is not None,
+            source_parent_boxes_has_mask=source_parent_boxes_mask is not None,
+            well_sep_is_n_away=well_sep_is_n_away,
+            from_sep_smaller_crit=from_sep_smaller_crit,
+            debug=debug,
+            )
+
+    def debug_with_finish(s):
+        if debug:
+            actx.queue.finish()
+
+        logger.debug(s)
+
+    traversal_plog = ProcessLogger(logger, "build traversal")
+
+    # {{{ source boxes, their parents, and target boxes
 
-        knl_info = self.get_kernel_info(
-                tree.dimensions, tree.particle_id_dtype, tree.box_id_dtype,
-                tree.coord_dtype, tree.box_level_dtype, max_levels,
-                tree.sources_are_targets,
-                tree.sources_have_extent, tree.targets_have_extent,
-                tree.extent_norm,
-                source_boxes_mask is not None,
-                source_parent_boxes_mask is not None)
+    debug_with_finish(
+        "building list of source boxes, their parents, and target boxes")
 
-        def debug_with_finish(s):
-            if debug:
-                actx.queue.finish()
+    extra_args = []
+    if source_boxes_mask is not None:
+        extra_args.append(source_boxes_mask)
+    if source_parent_boxes_mask is not None:
+        extra_args.append(source_parent_boxes_mask)
 
-            logger.debug(s)
+    result, evt = knl.sources_parents_and_targets_builder(
+        actx.queue, tree.nboxes, tree.box_flags, *extra_args,
+        allocator=actx.allocator,
+    )
 
-        traversal_plog = ProcessLogger(logger, "build traversal")
+    wait_for = [evt]
 
-        # {{{ source boxes, their parents, and target boxes
+    source_parent_boxes = result["source_parent_boxes"].lists
+    source_boxes = result["source_boxes"].lists
+    target_or_target_parent_boxes = result["target_or_target_parent_boxes"].lists
 
-        debug_with_finish(
-            "building list of source boxes, their parents, and target boxes")
+    if not tree.sources_are_targets:
+        target_boxes = result["target_boxes"].lists
+    else:
+        target_boxes = source_boxes
 
-        extra_args = []
-        if source_boxes_mask is not None:
-            extra_args.append(source_boxes_mask)
-        if source_parent_boxes_mask is not None:
-            extra_args.append(source_parent_boxes_mask)
+    # }}}
+
+    # {{{ figure out level starts in *_parent_boxes
 
-        result, evt = knl_info.sources_parents_and_targets_builder(
-            actx.queue, tree.nboxes, tree.box_flags, *extra_args, wait_for=wait_for
-        )
+    def extract_level_start_box_nrs(box_list, wait_for):
+        result = actx.empty(
+            tree.nlevels + 1, tree.box_id_dtype).fill(len(box_list))
 
-        wait_for = [evt]
+        evt = knl.level_start_box_nrs_extractor(
+                tree.level_start_box_nrs,
+                tree.box_levels,
+                box_list,
+                result,
+                range=slice(0, len(box_list)),
+                queue=actx.queue, wait_for=wait_for,
+                )
 
-        source_parent_boxes = result["source_parent_boxes"].lists
-        source_boxes = result["source_boxes"].lists
-        target_or_target_parent_boxes = result["target_or_target_parent_boxes"].lists
+        result = actx.to_numpy(result)
 
-        if not tree.sources_are_targets:
-            target_boxes = result["target_boxes"].lists
-        else:
-            target_boxes = source_boxes
+        # Postprocess result for unoccupied levels
+        prev_start = len(box_list)
+        for ilev in range(tree.nlevels-1, -1, -1):
+            result[ilev] = prev_start = \
+                    min(result[ilev], prev_start)
 
-        # }}}
+        return result, evt
 
-        # {{{ figure out level starts in *_parent_boxes
+    debug_with_finish("finding level starts in source boxes array")
+    level_start_source_box_nrs, evt_s = \
+            extract_level_start_box_nrs(
+                    source_boxes, wait_for=wait_for)
 
-        def extract_level_start_box_nrs(box_list, wait_for):
-            result = actx.empty(
-                tree.nlevels + 1, tree.box_id_dtype).fill(len(box_list))
+    debug_with_finish("finding level starts in source parent boxes array")
+    level_start_source_parent_box_nrs, evt_sp = \
+            extract_level_start_box_nrs(
+                    source_parent_boxes, wait_for=wait_for)
 
-            evt = knl_info.level_start_box_nrs_extractor(
-                    tree.level_start_box_nrs,
-                    tree.box_levels,
-                    box_list,
-                    result,
-                    range=slice(0, len(box_list)),
-                    queue=actx.queue, wait_for=wait_for)
+    debug_with_finish("finding level starts in target boxes array")
+    level_start_target_box_nrs, evt_t = \
+            extract_level_start_box_nrs(
+                    target_boxes, wait_for=wait_for)
 
-            result = actx.to_numpy(result)
+    debug_with_finish(
+        "finding level starts in target or target parent boxes array")
+    level_start_target_or_target_parent_box_nrs, evt_tp = \
+            extract_level_start_box_nrs(
+                    target_or_target_parent_boxes, wait_for=wait_for)
 
-            # Postprocess result for unoccupied levels
-            prev_start = len(box_list)
-            for ilev in range(tree.nlevels-1, -1, -1):
-                result[ilev] = prev_start = \
-                        min(result[ilev], prev_start)
+    wait_for = [evt_s, evt_sp, evt_t, evt_tp]
 
-            return result, evt
+    # }}}
 
-        debug_with_finish("finding level starts in source boxes array")
-        level_start_source_box_nrs, evt_s = \
-                extract_level_start_box_nrs(
-                        source_boxes, wait_for=wait_for)
+    # {{{ same-level non-well-separated boxes
 
-        debug_with_finish("finding level starts in source parent boxes array")
-        level_start_source_parent_box_nrs, evt_sp = \
-                extract_level_start_box_nrs(
-                        source_parent_boxes, wait_for=wait_for)
+    # If well_sep_is_n_away is 1, this agrees with the definition of
+    # 'colleagues' from the classical FMM literature.
 
-        debug_with_finish("finding level starts in target boxes array")
-        level_start_target_box_nrs, evt_t = \
-                extract_level_start_box_nrs(
-                        target_boxes, wait_for=wait_for)
+    debug_with_finish("finding same-level near-field boxes")
 
-        debug_with_finish(
-            "finding level starts in target or target parent boxes array")
-        level_start_target_or_target_parent_box_nrs, evt_tp = \
-                extract_level_start_box_nrs(
-                        target_or_target_parent_boxes, wait_for=wait_for)
+    result, evt = knl.same_level_non_well_sep_boxes_builder(
+            actx.queue, tree.nboxes,
+            tree.box_centers.data, tree.root_extent, tree.box_levels,
+            tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
+            wait_for=wait_for, allocator=actx.allocator,
+            )
+    wait_for = [evt]
+    same_level_non_well_sep_boxes = result["same_level_non_well_sep_boxes"]
 
-        wait_for = [evt_s, evt_sp, evt_t, evt_tp]
+    # }}}
 
-        # }}}
+    # {{{ neighbor source boxes ("list 1")
 
-        # {{{ same-level non-well-separated boxes
+    debug_with_finish("finding neighbor source boxes ('list 1')")
 
-        # If well_sep_is_n_away is 1, this agrees with the definition of
-        # 'colleagues' from the classical FMM literature.
+    result, evt = knl.neighbor_source_boxes_builder(
+            actx.queue, len(target_boxes),
+            tree.box_centers.data, tree.root_extent, tree.box_levels,
+            tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
+            target_boxes, wait_for=wait_for, allocator=actx.allocator,
+            )
 
-        debug_with_finish("finding same-level near-field boxes")
+    wait_for = [evt]
+    neighbor_source_boxes = result["neighbor_source_boxes"]
 
-        result, evt = knl_info.same_level_non_well_sep_boxes_builder(
-                actx.queue, tree.nboxes,
-                tree.box_centers.data, tree.root_extent, tree.box_levels,
-                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
-                wait_for=wait_for)
-        wait_for = [evt]
-        same_level_non_well_sep_boxes = result["same_level_non_well_sep_boxes"]
+    # }}}
 
-        # }}}
+    # {{{ well-separated siblings ("list 2")
 
-        # {{{ neighbor source boxes ("list 1")
+    debug_with_finish("finding well-separated siblings ('list 2')")
 
-        debug_with_finish("finding neighbor source boxes ('list 1')")
+    result, evt = knl.from_sep_siblings_builder(
+            actx.queue, len(target_or_target_parent_boxes),
+            tree.box_centers.data, tree.root_extent, tree.box_levels,
+            tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
+            target_or_target_parent_boxes, tree.box_parent_ids.data,
+            same_level_non_well_sep_boxes.starts,
+            same_level_non_well_sep_boxes.lists,
+            wait_for=wait_for, allocator=actx.allocator,
+            )
+    wait_for = [evt]
+    from_sep_siblings = result["from_sep_siblings"]
 
-        result, evt = knl_info.neighbor_source_boxes_builder(
-                actx.queue, len(target_boxes),
-                tree.box_centers.data, tree.root_extent, tree.box_levels,
-                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
-                target_boxes, wait_for=wait_for)
+    # }}}
 
-        wait_for = [evt]
-        neighbor_source_boxes = result["neighbor_source_boxes"]
+    with_extent = tree.sources_have_extent or tree.targets_have_extent
 
-        # }}}
+    # {{{ separated smaller ("list 3")
 
-        # {{{ well-separated siblings ("list 2")
+    debug_with_finish("finding separated smaller ('list 3')")
 
-        debug_with_finish("finding well-separated siblings ('list 2')")
+    from_sep_smaller_base_args = (
+            actx.queue, len(target_boxes),
+            tree.box_centers.data, tree.root_extent, tree.box_levels,
+            tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
+            tree.stick_out_factor, target_boxes,
+            same_level_non_well_sep_boxes.starts,
+            same_level_non_well_sep_boxes.lists,
+            tree.box_target_bounding_box_min.data,
+            tree.box_target_bounding_box_max.data,
+            tree.box_source_counts_cumul,
+            _from_sep_smaller_min_nsources_cumul,
+            )
 
-        result, evt = knl_info.from_sep_siblings_builder(
-                actx.queue, len(target_or_target_parent_boxes),
-                tree.box_centers.data, tree.root_extent, tree.box_levels,
-                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
-                target_or_target_parent_boxes, tree.box_parent_ids.data,
-                same_level_non_well_sep_boxes.starts,
-                same_level_non_well_sep_boxes.lists,
-                wait_for=wait_for)
-        wait_for = [evt]
-        from_sep_siblings = result["from_sep_siblings"]
+    from_sep_smaller_wait_for = []
+    from_sep_smaller_by_level = []
+    target_boxes_sep_smaller_by_source_level = []
 
-        # }}}
+    for ilevel in range(tree.nlevels):
+        debug_with_finish(f"finding separated smaller ('list 3 level {ilevel}')")
 
-        with_extent = tree.sources_have_extent or tree.targets_have_extent
+        result, evt = knl.from_sep_smaller_builder(
+                *(from_sep_smaller_base_args + (ilevel,)),
+                omit_lists=("from_sep_close_smaller",) if with_extent else (),
+                wait_for=wait_for,
+                allocator=actx.allocator,
+                )
 
-        # {{{ separated smaller ("list 3")
+        target_boxes_sep_smaller = target_boxes[
+            result["from_sep_smaller"].nonempty_indices]
 
-        debug_with_finish("finding separated smaller ('list 3')")
+        from_sep_smaller_by_level.append(result["from_sep_smaller"])
+        target_boxes_sep_smaller_by_source_level.append(target_boxes_sep_smaller)
+        from_sep_smaller_wait_for.append(evt)
 
-        from_sep_smaller_base_args = (
-                actx.queue, len(target_boxes),
-                tree.box_centers.data, tree.root_extent, tree.box_levels,
-                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
-                tree.stick_out_factor, target_boxes,
-                same_level_non_well_sep_boxes.starts,
-                same_level_non_well_sep_boxes.lists,
-                tree.box_target_bounding_box_min.data,
-                tree.box_target_bounding_box_max.data,
-                tree.box_source_counts_cumul,
-                _from_sep_smaller_min_nsources_cumul,
+    if with_extent:
+        debug_with_finish("finding separated smaller close ('list 3 close')")
+        result, evt = knl.from_sep_smaller_builder(
+                *(from_sep_smaller_base_args + (-1,)),
+                omit_lists=("from_sep_smaller",),
+                wait_for=wait_for,
+                allocator=actx.allocator,
                 )
+        from_sep_close_smaller_starts = result["from_sep_close_smaller"].starts
+        from_sep_close_smaller_lists = result["from_sep_close_smaller"].lists
 
-        from_sep_smaller_wait_for = []
-        from_sep_smaller_by_level = []
-        target_boxes_sep_smaller_by_source_level = []
-
-        for ilevel in range(tree.nlevels):
-            debug_with_finish(f"finding separated smaller ('list 3 level {ilevel}')")
-
-            result, evt = knl_info.from_sep_smaller_builder(
-                    *(from_sep_smaller_base_args + (ilevel,)),
-                    omit_lists=("from_sep_close_smaller",) if with_extent else (),
-                    wait_for=wait_for)
-
-            target_boxes_sep_smaller = target_boxes[
-                result["from_sep_smaller"].nonempty_indices]
-
-            from_sep_smaller_by_level.append(result["from_sep_smaller"])
-            target_boxes_sep_smaller_by_source_level.append(target_boxes_sep_smaller)
-            from_sep_smaller_wait_for.append(evt)
-
-        if with_extent:
-            debug_with_finish("finding separated smaller close ('list 3 close')")
-            result, evt = knl_info.from_sep_smaller_builder(
-                    *(from_sep_smaller_base_args + (-1,)),
-                    omit_lists=("from_sep_smaller",),
-                    wait_for=wait_for)
-            from_sep_close_smaller_starts = result["from_sep_close_smaller"].starts
-            from_sep_close_smaller_lists = result["from_sep_close_smaller"].lists
-
-            from_sep_smaller_wait_for.append(evt)
-        else:
-            from_sep_close_smaller_starts = None
-            from_sep_close_smaller_lists = None
-
-        # }}}
-
-        wait_for = from_sep_smaller_wait_for
-        del from_sep_smaller_wait_for
-
-        # {{{ separated bigger ("list 4")
-
-        debug_with_finish("finding separated bigger ('list 4')")
-
-        result, evt = knl_info.from_sep_bigger_builder(
-                actx.queue, len(target_or_target_parent_boxes),
-                tree.box_centers.data, tree.root_extent, tree.box_levels,
-                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
-                tree.stick_out_factor, target_or_target_parent_boxes,
-                tree.box_parent_ids.data,
-                same_level_non_well_sep_boxes.starts,
-                same_level_non_well_sep_boxes.lists,
-                wait_for=wait_for)
-
-        wait_for = [evt]
-        from_sep_bigger = result["from_sep_bigger"]
-
-        if with_extent:
-            # These are indexed by target_or_target_parent boxes; we rewrite
-            # them to be indexed by target_boxes.
-            from_sep_close_bigger_starts_raw = result["from_sep_close_bigger"].starts
-            from_sep_close_bigger_lists_raw = result["from_sep_close_bigger"].lists
-
-            list_merger = _ListMerger(actx, tree.box_id_dtype)
-            result, evt = list_merger(
-                    actx,
-                    # starts
-                    (from_sep_close_bigger_starts_raw,),
-                    # lists
-                    (from_sep_close_bigger_lists_raw,),
-                    # input index style
-                    _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES,
-                    # output index style
-                    _IndexStyle.TARGET_BOXES,
-                    # box and tree data
-                    target_boxes,
-                    target_or_target_parent_boxes,
-                    tree.nboxes,
-                    debug,
-                    wait_for=wait_for)
+        from_sep_smaller_wait_for.append(evt)
+    else:
+        from_sep_close_smaller_starts = None
+        from_sep_close_smaller_lists = None
 
-            wait_for = [evt]
+    # }}}
 
-            del from_sep_close_bigger_starts_raw
-            del from_sep_close_bigger_lists_raw
+    wait_for = from_sep_smaller_wait_for
+    del from_sep_smaller_wait_for
 
-            from_sep_close_bigger_starts = result["starts"]
-            from_sep_close_bigger_lists = result["lists"]
-        else:
-            from_sep_close_bigger_starts = None
-            from_sep_close_bigger_lists = None
+    # {{{ separated bigger ("list 4")
 
-        # }}}
+    debug_with_finish("finding separated bigger ('list 4')")
 
-        evt, = wait_for
-        traversal_plog.done(
-                "from_sep_smaller_crit: %s",
-                self.from_sep_smaller_crit)
+    result, evt = knl.from_sep_bigger_builder(
+            actx.queue, len(target_or_target_parent_boxes),
+            tree.box_centers.data, tree.root_extent, tree.box_levels,
+            tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
+            tree.stick_out_factor, target_or_target_parent_boxes,
+            tree.box_parent_ids.data,
+            same_level_non_well_sep_boxes.starts,
+            same_level_non_well_sep_boxes.lists,
+            wait_for=wait_for, allocator=actx.allocator,
+            )
+
+    wait_for = [evt]
+    from_sep_bigger = result["from_sep_bigger"]
+
+    if with_extent:
+        # These are indexed by target_or_target_parent boxes; we rewrite
+        # them to be indexed by target_boxes.
+        from_sep_close_bigger_starts_raw = result["from_sep_close_bigger"].starts
+        from_sep_close_bigger_lists_raw = result["from_sep_close_bigger"].lists
+
+        result = merge_lists(
+                actx,
+                # starts
+                (from_sep_close_bigger_starts_raw,),
+                # lists
+                (from_sep_close_bigger_lists_raw,),
+                # input index style
+                _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES,
+                # output index style
+                _IndexStyle.TARGET_BOXES,
+                # box and tree data
+                target_boxes,
+                target_or_target_parent_boxes,
+                tree.nboxes,
+                tree.box_id_dtype,
+                debug,
+                )
 
-        info = FMMTraversalInfo(
-                tree=tree,
-                well_sep_is_n_away=self.well_sep_is_n_away,
+        del from_sep_close_bigger_starts_raw
+        del from_sep_close_bigger_lists_raw
 
-                source_boxes=source_boxes,
-                target_boxes=target_boxes,
+        from_sep_close_bigger_starts = result["starts"]
+        from_sep_close_bigger_lists = result["lists"]
+    else:
+        from_sep_close_bigger_starts = None
+        from_sep_close_bigger_lists = None
 
-                level_start_source_box_nrs=actx.from_numpy(
-                    level_start_source_box_nrs),
-                level_start_target_box_nrs=actx.from_numpy(
-                    level_start_target_box_nrs),
+    # }}}
 
-                source_parent_boxes=source_parent_boxes,
-                level_start_source_parent_box_nrs=actx.from_numpy(
-                    level_start_source_parent_box_nrs),
+    evt, = wait_for
+    traversal_plog.done("from_sep_smaller_crit: %s", from_sep_smaller_crit)
 
-                target_or_target_parent_boxes=target_or_target_parent_boxes,
-                level_start_target_or_target_parent_box_nrs=actx.from_numpy(
-                    level_start_target_or_target_parent_box_nrs),
+    info = FMMTraversalInfo(
+            tree=tree,
+            well_sep_is_n_away=well_sep_is_n_away,
 
-                same_level_non_well_sep_boxes_starts=(
-                    same_level_non_well_sep_boxes.starts),
-                same_level_non_well_sep_boxes_lists=(
-                    same_level_non_well_sep_boxes.lists),
+            source_boxes=source_boxes,
+            target_boxes=target_boxes,
 
-                neighbor_source_boxes_starts=neighbor_source_boxes.starts,
-                neighbor_source_boxes_lists=neighbor_source_boxes.lists,
+            level_start_source_box_nrs=actx.from_numpy(
+                level_start_source_box_nrs),
+            level_start_target_box_nrs=actx.from_numpy(
+                level_start_target_box_nrs),
 
-                from_sep_siblings_starts=from_sep_siblings.starts,
-                from_sep_siblings_lists=from_sep_siblings.lists,
+            source_parent_boxes=source_parent_boxes,
+            level_start_source_parent_box_nrs=actx.from_numpy(
+                level_start_source_parent_box_nrs),
 
-                from_sep_smaller_by_level=make_obj_array(
-                    from_sep_smaller_by_level),
-                target_boxes_sep_smaller_by_source_level=make_obj_array(
-                    target_boxes_sep_smaller_by_source_level),
+            target_or_target_parent_boxes=target_or_target_parent_boxes,
+            level_start_target_or_target_parent_box_nrs=actx.from_numpy(
+                level_start_target_or_target_parent_box_nrs),
 
-                from_sep_close_smaller_starts=from_sep_close_smaller_starts,
-                from_sep_close_smaller_lists=from_sep_close_smaller_lists,
+            same_level_non_well_sep_boxes_starts=(
+                same_level_non_well_sep_boxes.starts),
+            same_level_non_well_sep_boxes_lists=(
+                same_level_non_well_sep_boxes.lists),
 
-                from_sep_bigger_starts=from_sep_bigger.starts,
-                from_sep_bigger_lists=from_sep_bigger.lists,
+            neighbor_source_boxes_starts=neighbor_source_boxes.starts,
+            neighbor_source_boxes_lists=neighbor_source_boxes.lists,
 
-                from_sep_close_bigger_starts=from_sep_close_bigger_starts,
-                from_sep_close_bigger_lists=from_sep_close_bigger_lists,
-                )
+            from_sep_siblings_starts=from_sep_siblings.starts,
+            from_sep_siblings_lists=from_sep_siblings.lists,
 
-        return actx.freeze(info), evt
+            from_sep_smaller_by_level=make_obj_array(
+                from_sep_smaller_by_level),
+            target_boxes_sep_smaller_by_source_level=make_obj_array(
+                target_boxes_sep_smaller_by_source_level),
 
-    # }}}
+            from_sep_close_smaller_starts=from_sep_close_smaller_starts,
+            from_sep_close_smaller_lists=from_sep_close_smaller_lists,
+
+            from_sep_bigger_starts=from_sep_bigger.starts,
+            from_sep_bigger_lists=from_sep_bigger.lists,
+
+            from_sep_close_bigger_starts=from_sep_close_bigger_starts,
+            from_sep_close_bigger_lists=from_sep_close_bigger_lists,
+            )
+
+    return actx.freeze(info)
+
+# }}}
 
 # vim: fdm=marker
diff --git a/boxtree/tree.py b/boxtree/tree.py
index 519140d8..5ffc5de2 100644
--- a/boxtree/tree.py
+++ b/boxtree/tree.py
@@ -51,10 +51,7 @@
 Tools
 ^^^^^
 
-.. autoclass:: ParticleListFilter
-
 .. autofunction:: filter_target_lists_in_user_order
-
 .. autofunction:: filter_target_lists_in_tree_order
 """
 
@@ -87,7 +84,7 @@
 
 from cgen import Enum
 from arraycontext import Array
-from pytools import memoize_method
+from pytools import memoize_in
 
 from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
 
@@ -126,7 +123,7 @@ class Tree:
     into which they may be sorted.
 
     Instances of this class are not constructed directly. They are returned
-    by :meth:`TreeBuilder.__call__`.
+    by :func:`~boxtree.build_tree`.
 
     .. rubric:: Flags
 
@@ -366,8 +363,8 @@ class Tree:
     particle's extents) inside of the box.  If the box is empty, both *min* and *max*
     will reflect the box center.  The purpose of this information is to reduce the
     cost of some interactions through knowledge that some boxes are partially empty.
-    (See the *from_sep_smaller_crit* argument to the constructor of
-    :class:`boxtree.traversal.FMMTraversalBuilder` for an example.)
+    (See the *from_sep_smaller_crit* argument to
+    :func:`boxtree.traversal.build_traversal` for an example.)
 
     .. note::
 
@@ -440,6 +437,7 @@ class Tree:
     box_target_bounding_box_min: Array
     box_target_bounding_box_max: Array
 
+    root_extent_stretch_factor: float
     _is_pruned: bool
 
     @property
@@ -650,21 +648,30 @@ def link_point_sources(
     tree_order_point_source_counts = actx.empty(
             tree.nsources, tree.particle_id_dtype)
 
-    from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_SOURCE_SCAN_TPL
-    knl = POINT_SOURCE_LINKING_SOURCE_SCAN_TPL.build(
-        actx.queue.context,
-        type_aliases=(
-            ("scan_t", tree.particle_id_dtype),
-            ("index_t", tree.particle_id_dtype),
-            ("particle_id_t", tree.particle_id_dtype),
-            ),
-        )
+    @memoize_in(actx, (
+        link_point_sources, tree.particle_id_dtype,
+        "point_source_linking_source_scan"))
+    def get_point_source_linking_source_scan_kernel():
+        from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_SOURCE_SCAN_TPL
+        return POINT_SOURCE_LINKING_SOURCE_SCAN_TPL.build(
+            actx.queue.context,
+            type_aliases=(
+                ("scan_t", tree.particle_id_dtype),
+                ("index_t", tree.particle_id_dtype),
+                ("particle_id_t", tree.particle_id_dtype),
+                ),
+            )
 
     logger.debug("point source linking: tree order source scan")
 
-    knl(point_source_starts, tree.user_source_ids,
+    knl = get_point_source_linking_source_scan_kernel()
+    knl(
+            point_source_starts, tree.user_source_ids,
             tree_order_point_source_starts, tree_order_point_source_counts,
-            npoint_sources_dev, size=tree.nsources, queue=actx.queue)
+            npoint_sources_dev, size=tree.nsources,
+            queue=actx.queue,
+            allocator=actx.allocator,
+            )
 
     # }}}
 
@@ -680,11 +687,8 @@ def link_point_sources(
     user_point_source_ids = actx.empty(npoint_sources, tree.particle_id_dtype)
     user_point_source_ids.fill(1)
 
-    import pyopencl.array as cl_array
-    cl_array.multi_put(
-            [tree_order_index_user_point_source_starts],
-            dest_indices=tree_order_point_source_starts,
-            out=[user_point_source_ids])
+    user_point_source_ids[tree_order_point_source_starts] = (
+        tree_order_index_user_point_source_starts)
 
     if debug:
         ups_host = actx.to_numpy(user_point_source_ids)
@@ -694,29 +698,30 @@ def link_point_sources(
     source_boundaries = actx.zeros(npoint_sources, np.int8)
 
     # FIXME: Should be a scalar, in principle.
-    ones = actx.empty(tree.nsources, np.int8)
-    ones.fill(1)
-
-    cl_array.multi_put(
-            [ones],
-            dest_indices=tree_order_point_source_starts,
-            out=[source_boundaries])
+    ones = 1 + actx.zeros(1, np.int8)
+    source_boundaries[tree_order_point_source_starts] = ones
 
-    from boxtree.tree_build_kernels import \
-            POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL
+    @memoize_in(actx, (
+        link_point_sources, tree.particle_id_dtype,
+        "point_source_linking_user_point_source_id_scan"))
+    def get_point_source_linking_user_point_source_id_scan_kernel():
+        from boxtree.tree_build_kernels import (
+                POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL)
+        return POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL.build(
+            actx.queue.context,
+            type_aliases=(
+                ("scan_t", tree.particle_id_dtype),
+                ("index_t", tree.particle_id_dtype),
+                ("particle_id_t", tree.particle_id_dtype),
+                ),
+            )
 
     logger.debug("point source linking: point source id scan")
-
-    knl = POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL.build(
-        actx.queue.context,
-        type_aliases=(
-            ("scan_t", tree.particle_id_dtype),
-            ("index_t", tree.particle_id_dtype),
-            ("particle_id_t", tree.particle_id_dtype),
-            ),
-        )
+    knl = get_point_source_linking_user_point_source_id_scan_kernel()
     knl(source_boundaries, user_point_source_ids,
-            size=npoint_sources, queue=actx.queue)
+            size=npoint_sources,
+            queue=actx.queue,
+            allocator=actx.allocator)
 
     if debug:
         ups_host = actx.to_numpy(user_point_source_ids)
@@ -725,6 +730,7 @@ def link_point_sources(
 
     # }}}
 
+    import pyopencl.array as cl_array
     from pytools.obj_array import make_obj_array
     tree_order_point_sources = make_obj_array([
         cl_array.take(point_sources[i], user_point_source_ids, queue=actx.queue)
@@ -733,15 +739,18 @@ def link_point_sources(
 
     # {{{ compute box point source metadata
 
-    from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_BOX_POINT_SOURCES
-
-    knl = POINT_SOURCE_LINKING_BOX_POINT_SOURCES.build(
-        actx.queue.context,
-        type_aliases=(
-            ("particle_id_t", tree.particle_id_dtype),
-            ("box_id_t", tree.box_id_dtype),
-            ),
-        )
+    @memoize_in(actx, (
+        link_point_sources, tree.particle_id_dtype, tree.box_id_dtype,
+        "point_source_linking_box_point_sources"))
+    def get_point_source_linking_box_point_sources_kernel():
+        from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_BOX_POINT_SOURCES
+        return POINT_SOURCE_LINKING_BOX_POINT_SOURCES.build(
+            actx.queue.context,
+            type_aliases=(
+                ("particle_id_t", tree.particle_id_dtype),
+                ("box_id_t", tree.box_id_dtype),
+                ),
+            )
 
     logger.debug("point source linking: box point sources")
 
@@ -750,6 +759,7 @@ def link_point_sources(
     box_point_source_counts_nonchild = actx.empty(
             tree.nboxes, tree.particle_id_dtype)
 
+    knl = get_point_source_linking_box_point_sources_kernel()
     knl(
             box_point_source_starts, box_point_source_counts_nonchild,
             box_point_source_counts_cumul,
@@ -759,7 +769,9 @@ def link_point_sources(
 
             tree_order_point_source_starts,
             tree_order_point_source_counts,
-            range=slice(tree.nboxes), queue=actx.queue)
+            range=slice(tree.nboxes),
+            queue=actx.queue,
+            )
 
     # }}}
 
@@ -793,10 +805,30 @@ def link_point_sources(
 
 # {{{ particle list filter
 
+class ParticleListFilter:
+    """
+    .. automethod:: filter_target_lists_in_tree_order
+    .. automethod:: filter_target_lists_in_user_order
+    """
+
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def filter_target_lists_in_user_order(self, actx, tree, flags):
+        return filter_target_lists_in_user_order(actx, tree, flags)
+
+    def filter_target_lists_in_tree_order(self, actx, tree, flags):
+        return filter_target_lists_in_tree_order(actx, tree, flags)
+
+# }}}
+
+
+# {{{ filter_target_lists_in_user_order
+
 @dataclass_array_container
 @dataclass(frozen=True)
 class FilteredTargetListsInUserOrder:
-    """Use :meth:`ParticleListFilter.filter_target_lists_in_user_order` to create
+    """Use :func:`filter_target_lists_in_user_order` to create
     instances of this class.
 
     This class represents subsets of the list of targets in each box (as given
@@ -835,10 +867,89 @@ class FilteredTargetListsInUserOrder:
     target_lists: Array
 
 
+def filter_target_lists_in_user_order(
+        actx: PyOpenCLArrayContext, tree: Tree, flags: Array,
+        ) -> FilteredTargetListsInUserOrder:
+    """
+    :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
+        :class:`numpy.int8` objects, which indicate by being zero that the
+        corresponding target (in user target order) is not part of the
+        filtered list, or by being nonzero that it is.
+
+    :returns: A :class:`FilteredTargetListsInUserOrder`
+    """
+    user_order_flags = flags
+    del flags
+
+    @memoize_in(actx, (
+        filter_target_lists_in_user_order,
+        tree.particle_id_dtype, user_order_flags.dtype))
+    def get_kernel():
+        from boxtree.tools import VectorArg
+        from pyopencl.tools import dtype_to_ctype
+        from pyopencl.algorithm import ListOfListsBuilder
+        from mako.template import Template
+
+        return ListOfListsBuilder(actx.context,
+            [("filt_tgt_list", tree.particle_id_dtype)], Template("""//CL//
+            typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
+
+            void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
+            {
+                particle_id_t b_t_start = box_target_starts[i];
+                particle_id_t b_t_count = box_target_counts_nonchild[i];
+
+                for (particle_id_t j = b_t_start; j < b_t_start+b_t_count; ++j)
+                {
+                    particle_id_t user_target_id = user_target_ids[j];
+                    if (user_order_flags[user_target_id])
+                    {
+                        APPEND_filt_tgt_list(user_target_id);
+                    }
+                }
+            }
+            """, strict_undefined=True).render(
+                dtype_to_ctype=dtype_to_ctype,
+                particle_id_dtype=tree.particle_id_dtype
+                ), arg_decls=[
+                    VectorArg(user_order_flags.dtype, "user_order_flags"),
+                    VectorArg(tree.particle_id_dtype, "user_target_ids"),
+                    VectorArg(tree.particle_id_dtype, "box_target_starts"),
+                    VectorArg(tree.particle_id_dtype, "box_target_counts_nonchild"),
+                ])
+
+    user_target_ids = actx.empty(tree.ntargets, tree.sorted_target_ids.dtype)
+    user_target_ids[tree.sorted_target_ids] = actx.from_numpy(
+            np.arange(tree.ntargets, dtype=user_target_ids.dtype)
+            )
+
+    knl = get_kernel()
+    result, _ = knl(
+            actx.queue, tree.nboxes,
+            user_order_flags,
+            user_target_ids,
+            tree.box_target_starts,
+            tree.box_target_counts_nonchild,
+            allocator=actx.allocator,
+            )
+
+    target_lists = FilteredTargetListsInUserOrder(
+            nfiltered_targets=result["filt_tgt_list"].count,
+            target_starts=result["filt_tgt_list"].starts,
+            target_lists=result["filt_tgt_list"].lists,
+            )
+
+    return actx.freeze(target_lists)
+
+# }}}
+
+
+# {{{ filter_target_lists_in_tree_order
+
 @dataclass_array_container
 @dataclass(frozen=True)
 class FilteredTargetListsInTreeOrder:
-    """Use :meth:`ParticleListFilter.filter_target_lists_in_tree_order` to create
+    """Use :func:`filter_target_lists_in_tree_order` to create
     instances of this class.
 
     This class represents subsets of the list of targets in each box (as given by
@@ -891,181 +1002,100 @@ class FilteredTargetListsInTreeOrder:
     unfiltered_from_filtered_target_indices: Array
 
 
-class ParticleListFilter:
+def filter_target_lists_in_tree_order(
+        actx: PyOpenCLArrayContext, tree: Tree, flags: Array
+        ) -> FilteredTargetListsInTreeOrder:
     """
-    .. automethod:: filter_target_lists_in_tree_order
-    .. automethod:: filter_target_lists_in_user_order
+    :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
+        :class:`numpy.int8` objects, which indicate by being zero that the
+        corresponding target (in user target order) is not part of the
+        filtered list, or by being nonzero that it is.
+    :returns: A :class:`FilteredTargetListsInTreeOrder`
     """
 
-    def __init__(self, array_context: PyOpenCLArrayContext):
-        self._setup_actx = array_context
-
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
-
-    @memoize_method
-    def get_filter_target_lists_in_user_order_kernel(self, particle_id_dtype,
-            user_order_flags_dtype):
-        from boxtree.tools import VectorArg
-        from pyopencl.tools import dtype_to_ctype
-        from pyopencl.algorithm import ListOfListsBuilder
-        from mako.template import Template
-
-        builder = ListOfListsBuilder(self.context,
-            [("filt_tgt_list", particle_id_dtype)], Template("""//CL//
-            typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;
-
-            void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
-            {
-                particle_id_t b_t_start = box_target_starts[i];
-                particle_id_t b_t_count = box_target_counts_nonchild[i];
-
-                for (particle_id_t j = b_t_start; j < b_t_start+b_t_count; ++j)
-                {
-                    particle_id_t user_target_id = user_target_ids[j];
-                    if (user_order_flags[user_target_id])
-                    {
-                        APPEND_filt_tgt_list(user_target_id);
-                    }
-                }
-            }
-            """, strict_undefined=True).render(
-                dtype_to_ctype=dtype_to_ctype,
-                particle_id_dtype=particle_id_dtype
-                ), arg_decls=[
-                    VectorArg(user_order_flags_dtype, "user_order_flags"),
-                    VectorArg(particle_id_dtype, "user_target_ids"),
-                    VectorArg(particle_id_dtype, "box_target_starts"),
-                    VectorArg(particle_id_dtype, "box_target_counts_nonchild"),
-                ])
-
-        return builder
-
-    def filter_target_lists_in_user_order(self, actx, tree, flags):
-        """
-        :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
-            :class:`numpy.int8` objects, which indicate by being zero that the
-            corresponding target (in user target order) is not part of the
-            filtered list, or by being nonzero that it is.
-
-        :returns: A :class:`FilteredTargetListsInUserOrder`
-        """
-        user_order_flags = flags
-        del flags
-
-        user_target_ids = actx.empty(tree.ntargets, tree.sorted_target_ids.dtype)
-        user_target_ids[tree.sorted_target_ids] = actx.from_numpy(
-                np.arange(tree.ntargets, dtype=user_target_ids.dtype)
-                )
-
-        kernel = self.get_filter_target_lists_in_user_order_kernel(
-                tree.particle_id_dtype, user_order_flags.dtype)
-
-        result, evt = kernel(actx.queue, tree.nboxes,
-                user_order_flags,
-                user_target_ids,
-                tree.box_target_starts,
-                tree.box_target_counts_nonchild)
-
-        target_lists = FilteredTargetListsInUserOrder(
-                nfiltered_targets=result["filt_tgt_list"].count,
-                target_starts=result["filt_tgt_list"].starts,
-                target_lists=result["filt_tgt_list"].lists,
-                )
-
-        return actx.freeze(target_lists)
-
-    @memoize_method
-    def get_filter_target_lists_in_tree_order_kernels(self, particle_id_dtype):
+    @memoize_in(actx, (filter_target_lists_in_tree_order, tree.particle_id_dtype))
+    def get_kernels():
         from boxtree.tree_build_kernels import (
                 TREE_ORDER_TARGET_FILTER_SCAN_TPL,
                 TREE_ORDER_TARGET_FILTER_INDEX_TPL)
 
         scan_knl = TREE_ORDER_TARGET_FILTER_SCAN_TPL.build(
-            self.context,
+            actx.context,
             type_aliases=(
-                ("scan_t", particle_id_dtype),
-                ("particle_id_t", particle_id_dtype),
+                ("scan_t", tree.particle_id_dtype),
+                ("particle_id_t", tree.particle_id_dtype),
                 ),
             )
 
         index_knl = TREE_ORDER_TARGET_FILTER_INDEX_TPL.build(
-            self.context,
+            actx.context,
             type_aliases=(
-                ("particle_id_t", particle_id_dtype),
+                ("particle_id_t", tree.particle_id_dtype),
                 ),
             )
 
         return scan_knl, index_knl
 
-    def filter_target_lists_in_tree_order(self, actx, tree, flags):
-        """
-        :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
-            :class:`numpy.int8` objects, which indicate by being zero that the
-            corresponding target (in user target order) is not part of the
-            filtered list, or by being nonzero that it is.
-        :returns: A :class:`FilteredTargetListsInTreeOrder`
-        """
-
-        tree_order_flags = actx.empty(tree.ntargets, np.int8)
-        tree_order_flags[tree.sorted_target_ids] = flags
+    tree_order_flags = actx.empty(tree.ntargets, np.int8)
+    tree_order_flags[tree.sorted_target_ids] = flags
 
-        filtered_from_unfiltered_target_indices = actx.empty(
-                tree.ntargets, tree.particle_id_dtype)
-        unfiltered_from_filtered_target_indices = actx.empty(
-                tree.ntargets, tree.particle_id_dtype)
+    filtered_from_unfiltered_target_indices = actx.empty(
+            tree.ntargets, tree.particle_id_dtype)
+    unfiltered_from_filtered_target_indices = actx.empty(
+            tree.ntargets, tree.particle_id_dtype)
 
-        nfiltered_targets = actx.empty(1, tree.particle_id_dtype)
+    nfiltered_targets = actx.empty(1, tree.particle_id_dtype)
 
-        scan_knl, index_knl = self.get_filter_target_lists_in_tree_order_kernels(
-                tree.particle_id_dtype)
-
-        scan_knl(tree_order_flags,
-                filtered_from_unfiltered_target_indices,
-                unfiltered_from_filtered_target_indices,
-                nfiltered_targets,
-                queue=actx.queue)
+    scan_knl, index_knl = get_kernels()
+    scan_knl(
+            tree_order_flags,
+            filtered_from_unfiltered_target_indices,
+            unfiltered_from_filtered_target_indices,
+            nfiltered_targets,
+            queue=actx.queue,
+            allocator=actx.allocator,
+            )
 
-        nfiltered_targets = int(actx.to_numpy(nfiltered_targets))
+    nfiltered_targets = int(actx.to_numpy(nfiltered_targets)[0])
 
-        unfiltered_from_filtered_target_indices = \
-                unfiltered_from_filtered_target_indices[:nfiltered_targets]
+    unfiltered_from_filtered_target_indices = \
+            unfiltered_from_filtered_target_indices[:nfiltered_targets]
 
-        from pytools.obj_array import make_obj_array
-        filtered_targets = make_obj_array([
-            actx.thaw(targets_i)[unfiltered_from_filtered_target_indices]
-            for targets_i in tree.targets
-            ])
+    from pytools.obj_array import make_obj_array
+    filtered_targets = make_obj_array([
+        actx.thaw(targets_i)[unfiltered_from_filtered_target_indices]
+        for targets_i in tree.targets
+        ])
 
-        box_target_starts_filtered = actx.np.zeros_like(tree.box_target_starts)
-        box_target_counts_nonchild_filtered = (
-                actx.np.zeros_like(tree.box_target_counts_nonchild))
+    box_target_starts_filtered = actx.np.zeros_like(tree.box_target_starts)
+    box_target_counts_nonchild_filtered = (
+            actx.np.zeros_like(tree.box_target_counts_nonchild))
 
-        index_knl(
-                # input
-                tree.box_target_starts,
-                tree.box_target_counts_nonchild,
-                filtered_from_unfiltered_target_indices,
-                tree.ntargets,
-                nfiltered_targets,
+    index_knl(
+            # input
+            tree.box_target_starts,
+            tree.box_target_counts_nonchild,
+            filtered_from_unfiltered_target_indices,
+            tree.ntargets,
+            nfiltered_targets,
 
-                # output
-                box_target_starts_filtered,
-                box_target_counts_nonchild_filtered,
+            # output
+            box_target_starts_filtered,
+            box_target_counts_nonchild_filtered,
 
-                queue=actx.queue)
+            queue=actx.queue,
+            )
 
-        target_lists = FilteredTargetListsInTreeOrder(
-                nfiltered_targets=nfiltered_targets,
-                box_target_starts=box_target_starts_filtered,
-                box_target_counts_nonchild=box_target_counts_nonchild_filtered,
-                unfiltered_from_filtered_target_indices=(
-                    unfiltered_from_filtered_target_indices),
-                targets=filtered_targets,
-                )
+    target_lists = FilteredTargetListsInTreeOrder(
+            nfiltered_targets=nfiltered_targets,
+            box_target_starts=box_target_starts_filtered,
+            box_target_counts_nonchild=box_target_counts_nonchild_filtered,
+            unfiltered_from_filtered_target_indices=(
+                unfiltered_from_filtered_target_indices),
+            targets=filtered_targets,
+            )
 
-        return actx.freeze(target_lists)
+    return actx.freeze(target_lists)
 
 # }}}
 
diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py
index f252be57..20b076c3 100644
--- a/boxtree/tree_build.py
+++ b/boxtree/tree_build.py
@@ -4,7 +4,7 @@
 Building Trees
 --------------
 
-.. autoclass:: TreeBuilder
+.. autofunction:: build_tree
 """
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -30,10 +30,11 @@
 """
 
 from functools import partial
+from typing import Any, Optional
 
 import numpy as np
 
-from pytools import ProcessLogger, DebugProcessLogger, memoize_method
+from pytools import ProcessLogger, DebugProcessLogger, memoize_on_first_arg
 
 from boxtree.tree import Tree
 from boxtree.array_context import PyOpenCLArrayContext
@@ -47,46 +48,12 @@ class MaxLevelsExceeded(RuntimeError):
 
 
 class TreeBuilder:
-    """
-    .. automethod:: __init__
-    .. automethod:: __call__
-    """
-
     morton_nr_dtype = np.dtype(np.int8)
     box_level_dtype = np.dtype(np.uint8)
     ROOT_EXTENT_STRETCH_FACTOR = 1e-4
 
-    def __init__(self, array_context: PyOpenCLArrayContext) -> None:
-        self._setup_actx = array_context
-
-        from boxtree.bounding_box import BoundingBoxFinder
-        self.bbox_finder = BoundingBoxFinder(array_context)
-
-        # This is used to map box IDs and compress box lists in empty leaf
-        # pruning.
-
-        from boxtree.tools import GappyCopyAndMapKernel, MapValuesKernel
-        self.gappy_copy_and_map = GappyCopyAndMapKernel(array_context)
-        self.map_values_kernel = MapValuesKernel(array_context)
-
-    @property
-    def context(self):
-        return self._setup_actx.queue.context
-
-    @memoize_method
-    def get_kernel_info(self, dimensions, coord_dtype,
-            particle_id_dtype, box_id_dtype,
-            sources_are_targets, srcntgts_extent_norm,
-            kind):
-
-        from boxtree.tree_build_kernels import get_tree_build_kernel_info
-        return get_tree_build_kernel_info(self.context, dimensions, coord_dtype,
-            particle_id_dtype, box_id_dtype,
-            sources_are_targets, srcntgts_extent_norm,
-            self.morton_nr_dtype, self.box_level_dtype,
-            kind=kind)
-
-    # {{{ run control
+    def __init__(self, *args, **kwargs) -> None:
+        pass
 
     def __call__(self, actx: PyOpenCLArrayContext, particles, kind="adaptive",
             max_particles_in_box=None, allocator=None, debug=False,
@@ -95,1721 +62,1662 @@ def __call__(self, actx: PyOpenCLArrayContext, particles, kind="adaptive",
             max_leaf_refine_weight=None, wait_for=None,
             extent_norm=None, bbox=None,
             **kwargs):
-        """
-        :arg particles: an object array of (XYZ) point coordinate arrays.
-        :arg kind: One of the following strings:
-
-            - 'adaptive'
-            - 'adaptive-level-restricted'
-            - 'non-adaptive'
-
-            'adaptive' requests an adaptive tree without level restriction.  See
-            :ref:`tree-kinds` for further explanation.
-
-        :arg targets: an object array of (XYZ) point coordinate arrays or ``None``.
-            If ``None``, *particles* act as targets, too.
-            Must have the same (inner) dtype as *particles*.
-        :arg source_radii: If not *None*, an arra of the same dtype as *particles*.
-
-            If this is given, *targets* must also be given, i.e. sources and
-            targets must be separate. See :ref:`extent`.
-
-        :arg target_radii: Like *source_radii*, but for targets.
-        :arg stick_out_factor: See :attr:`Tree.stick_out_factor` and :ref:`extent`.
-        :arg refine_weights: If not *None*, an array of the
-            type :class:`numpy.int32`. A box will be split if it has a cumulative
-            refine_weight greater than *max_leaf_refine_weight*. If this is given,
-            *max_leaf_refine_weight* must also be given and *max_particles_in_box*
-            must be *None*.
-        :arg max_leaf_refine_weight: If not *None*, specifies the maximum weight
-            of a leaf box.
-        :arg max_particles_in_box: If not *None*, specifies the maximum number
-            of particles in a leaf box. If this is given, both
-            *refine_weights* and *max_leaf_refine_weight* must be *None*.
-        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
-            instances for whose completion this command waits before starting
-            execution.
-        :arg extent_norm: ``"l2"`` or ``"linf"``. Indicates the norm with respect
-            to which particle stick-out is measured. See :attr:`Tree.extent_norm`.
-        :arg bbox: Bounding box of either type:
-            1. A dim-by-2 array, with each row to be [min, max] coordinates
-            in its corresponding axis direction.
-            2. (Internal use only) of the same type as returned by
-            *boxtree.bounding_box.make_bounding_box_dtype*.
-            When given, this bounding box is used for tree
-            building. Otherwise, the bounding box is determined from particles
-            in such a way that it is square and is slightly larger at the top (so
-            that scaled coordinates are always < 1).
-            When supplied, the bounding box must be square and have all the
-            particles in its closure.
-        :arg kwargs: Used internally for debugging.
-
-        :returns: a tuple ``(tree, event)``, where *tree* is an instance of
-            :class:`Tree`, and *event* is a :class:`pyopencl.Event` for dependency
-            management.
-        """
-
-        if allocator is not None:
-            from warnings import warn
-            warn("Passing in 'allocator' is deprecated. The allocator of the "
-                "array context 'actx' is used throughout.",
-                DeprecationWarning, stacklevel=2)
-
-        # {{{ input processing
-
-        if kind not in ["adaptive", "adaptive-level-restricted", "non-adaptive"]:
-            raise ValueError(f"unknown tree kind '{kind}'")
-
-        # we'll modify this below, so copy it
-        if wait_for is None:
-            wait_for = []
-        else:
-            wait_for = list(wait_for)
+        from warnings import warn
+        warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. "
+            "Use 'build_tree' instead.",
+            DeprecationWarning, stacklevel=2)
+
+        result = build_tree(
+            actx, particles,
+            kind=kind, max_particles_in_box=max_particles_in_box,
+            targets=targets, source_radii=source_radii, target_radii=target_radii,
+            stick_out_factor=stick_out_factor,
+            refine_weights=refine_weights,
+            max_leaf_refine_weight=max_leaf_refine_weight,
+            extent_norm=extent_norm, bbox=bbox,
+            morton_nr_dtype=self.morton_nr_dtype,
+            box_level_dtype=self.box_level_dtype,
+            root_extent_stretch_factor=self.ROOT_EXTENT_STRETCH_FACTOR,
+            debug=debug, **kwargs)
+        # 'build_tree' returns only the tree; 'None' stands in for the old event
+        return result, None
+
+
+# {{{ build_tree
+
+@memoize_on_first_arg
+def get_kernel_info(
+        actx: PyOpenCLArrayContext,
+        dimensions: int,
+        coord_dtype: "np.dtype",
+        particle_id_dtype: "np.dtype",
+        box_id_dtype: "np.dtype",
+        sources_are_targets: bool,
+        srcntgts_extent_norm: str,
+        kind: str,
+        morton_nr_dtype: "np.dtype",
+        box_level_dtype: "np.dtype"):
+    from boxtree.tree_build_kernels import get_tree_build_kernel_info
+    return get_tree_build_kernel_info(actx.context, dimensions, coord_dtype,
+        particle_id_dtype, box_id_dtype,
+        sources_are_targets, srcntgts_extent_norm,
+        morton_nr_dtype, box_level_dtype,
+        kind=kind)
+
+
+def build_tree(
+        actx: PyOpenCLArrayContext, particles: np.ndarray, *,
+        kind: str = "adaptive",
+        max_particles_in_box: Optional[int] = None,
+        targets: Optional[np.ndarray] = None,
+        source_radii: Optional[np.ndarray] = None,
+        target_radii: Optional[np.ndarray] = None,
+        stick_out_factor: Optional[float] = None,
+        refine_weights: Optional[np.ndarray] = None,
+        max_leaf_refine_weight: Optional[int] = None,
+        extent_norm: Optional[str] = None,
+        bbox: Optional[np.ndarray] = None,
+        morton_nr_dtype: Optional[np.dtype] = None,
+        box_level_dtype: Optional[np.dtype] = None,
+        root_extent_stretch_factor: float = 1.0e-4,
+        debug: bool = False,
+        **kwargs: Any) -> Tree:
+    """
+    :arg particles: an object array of (XYZ) point coordinate arrays.
+    :arg kind: One of the following strings:
+
+        - 'adaptive'
+        - 'adaptive-level-restricted'
+        - 'non-adaptive'
+
+        'adaptive' requests an adaptive tree without level restriction.  See
+        :ref:`tree-kinds` for further explanation.
+
+    :arg targets: an object array of (XYZ) point coordinate arrays or ``None``.
+        If ``None``, *particles* act as targets, too.
+        Must have the same (inner) dtype as *particles*.
+    :arg source_radii: If not *None*, an array of the same dtype as *particles*.
+
+        If this is given, *targets* must also be given, i.e. sources and
+        targets must be separate. See :ref:`extent`.
+
+    :arg target_radii: Like *source_radii*, but for targets.
+    :arg stick_out_factor: See :attr:`Tree.stick_out_factor` and :ref:`extent`.
+    :arg refine_weights: If not *None*, an array of the
+        type :class:`numpy.int32`. A box will be split if it has a cumulative
+        refine_weight greater than *max_leaf_refine_weight*. If this is given,
+        *max_leaf_refine_weight* must also be given and *max_particles_in_box*
+        must be *None*.
+    :arg max_leaf_refine_weight: If not *None*, specifies the maximum weight
+        of a leaf box.
+    :arg max_particles_in_box: If not *None*, specifies the maximum number
+        of particles in a leaf box. If this is given, both
+        *refine_weights* and *max_leaf_refine_weight* must be *None*.
+    :arg extent_norm: ``"l2"`` or ``"linf"``. Indicates the norm with respect
+        to which particle stick-out is measured. See :attr:`Tree.extent_norm`.
+    :arg bbox: Bounding box of either type: (1) a dim-by-2 array, with each
+        row holding the [min, max] coordinates in its corresponding axis
+        direction, or (2) (internal use only) of the same type as returned
+        by *boxtree.bounding_box.make_bounding_box_dtype*.
+        When given, this bounding box is used for tree building. Otherwise,
+        the bounding box is determined from particles in such a way that it
+        is square and is slightly larger at the top (so that scaled
+        coordinates are always < 1). When supplied, the bounding box must be
+        square and have all the particles in its closure.
+    :arg kwargs: Used internally for debugging.
+
+    :returns: a :class:`Tree`.
+    """
 
-        dimensions = len(particles)
+    # {{{ input processing
 
-        from boxtree.tools import AXIS_NAMES
-        axis_names = AXIS_NAMES[:dimensions]
+    if morton_nr_dtype is None:
+        morton_nr_dtype = np.dtype(np.int8)
 
-        sources_are_targets = targets is None
-        sources_have_extent = source_radii is not None
-        targets_have_extent = target_radii is not None
+    if box_level_dtype is None:
+        box_level_dtype = np.dtype(np.uint8)
 
-        if extent_norm is None:
-            extent_norm = "linf"
+    if kind not in ("adaptive", "adaptive-level-restricted", "non-adaptive"):
+        raise ValueError(f"unknown tree kind '{kind}'")
 
-        if extent_norm not in ["linf", "l2"]:
-            raise ValueError("unexpected value of 'extent_norm': %s"
-                    % extent_norm)
+    dimensions = len(particles)
 
-        srcntgts_extent_norm = extent_norm
-        srcntgts_have_extent = sources_have_extent or targets_have_extent
-        if not srcntgts_have_extent:
-            srcntgts_extent_norm = None
+    from boxtree.tools import AXIS_NAMES
+    axis_names = AXIS_NAMES[:dimensions]
 
-        del extent_norm
+    sources_are_targets = targets is None
+    sources_have_extent = source_radii is not None
+    targets_have_extent = target_radii is not None
 
-        if srcntgts_extent_norm and targets is None:
-            raise ValueError("must specify targets when specifying "
-                    "any kind of radii")
+    if extent_norm is None:
+        extent_norm = "linf"
 
-        from pytools import single_valued
-        particle_id_dtype = np.int32
-        box_id_dtype = np.int32
-        coord_dtype = single_valued(coord.dtype for coord in particles)
+    if extent_norm not in ("linf", "l2"):
+        raise ValueError(f"unexpected value of 'extent_norm': {extent_norm}")
 
-        if targets is None:
-            nsrcntgts = single_valued(len(coord) for coord in particles)
-        else:
-            nsources = single_valued(len(coord) for coord in particles)
-            ntargets = single_valued(len(coord) for coord in targets)
-            nsrcntgts = nsources + ntargets
-
-        if source_radii is not None:
-            if source_radii.shape != (nsources,):
-                raise ValueError("source_radii has an invalid shape")
-
-            if source_radii.dtype != coord_dtype:
-                raise TypeError("dtypes of coordinate arrays and "
-                        "source_radii must agree")
-
-        if target_radii is not None:
-            if target_radii.shape != (ntargets,):
-                raise ValueError("target_radii has an invalid shape")
-
-            if target_radii.dtype != coord_dtype:
-                raise TypeError("dtypes of coordinate arrays and "
-                        "target_radii must agree")
-
-        if sources_have_extent or targets_have_extent:
-            if stick_out_factor is None:
-                raise ValueError("if sources or targets have extent, "
-                        "stick_out_factor must be explicitly specified")
-        else:
-            stick_out_factor = 0
+    srcntgts_extent_norm = extent_norm
+    srcntgts_have_extent = sources_have_extent or targets_have_extent
+    if not srcntgts_have_extent:
+        srcntgts_extent_norm = None
 
-        # }}}
-
-        def zeros(shape, dtype):
-            result = actx.zeros(shape, dtype)
+    del extent_norm
 
-            if result.events:
-                event, = result.events
-            else:
-                from numbers import Number
-                if isinstance(shape, Number):
-                    shape = (shape,)
+    if srcntgts_extent_norm and targets is None:
+        raise ValueError(
+            "must specify targets when specifying any kind of radii")
 
-                from pytools import product
-                assert product(shape) == 0
+    from pytools import single_valued
+    particle_id_dtype = np.int32
+    box_id_dtype = np.int32
+    coord_dtype = single_valued(coord.dtype for coord in particles)
 
-                from pyopencl import enqueue_marker
-                event = enqueue_marker(actx.queue)
+    if targets is None:
+        nsrcntgts = single_valued(len(coord) for coord in particles)
+    else:
+        nsources = single_valued(len(coord) for coord in particles)
+        ntargets = single_valued(len(coord) for coord in targets)
+        nsrcntgts = nsources + ntargets
 
-            return result, event
+    if source_radii is not None:
+        if source_radii.shape != (nsources,):
+            raise ValueError(
+                "'source_radii' has an invalid shape: "
+                f"expected {(nsources,)} but got {source_radii.shape}")
 
-        knl_info = self.get_kernel_info(dimensions, coord_dtype,
-                particle_id_dtype, box_id_dtype,
-                sources_are_targets, srcntgts_extent_norm,
-                kind=kind)
+        if source_radii.dtype != coord_dtype:
+            raise TypeError(
+                "dtypes of coordinate arrays and 'source_radii' must agree: "
+                f"got {coord_dtype} and {source_radii.dtype}")
 
-        logger.debug("tree build: start")
+    if target_radii is not None:
+        if target_radii.shape != (ntargets,):
+            raise ValueError(
+                "'target_radii' has an invalid shape: "
+                f"expected {(ntargets,)} but got {target_radii.shape}")
 
-        # {{{ combine sources and targets into one array, if necessary
+        if target_radii.dtype != coord_dtype:
+            raise TypeError(
+                "dtypes of coordinate arrays and 'target_radii' must agree: "
+                f"got {coord_dtype} and {target_radii.dtype}")
 
-        prep_events = []
+    if sources_have_extent or targets_have_extent:
+        if stick_out_factor is None:
+            raise ValueError(
+                "if sources or targets have extent, "
+                "'stick_out_factor' must be explicitly specified")
+    else:
+        stick_out_factor = 0
 
-        if targets is None:
-            # Targets weren't specified. Sources are also targets. Let's
-            # call them "srcntgts".
+    # }}}
 
-            if isinstance(particles, np.ndarray) and particles.dtype.char == "O":
-                srcntgts = particles
-            else:
-                from pytools.obj_array import make_obj_array
-                srcntgts = make_obj_array([
-                    actx.np.copy(actx.thaw(p)) for p in particles
-                    ])
+    # {{{ kernels
 
-            assert source_radii is None
-            assert target_radii is None
+    knl_info = get_kernel_info(
+        actx,
+        dimensions,
+        coord_dtype, particle_id_dtype, box_id_dtype,
+        sources_are_targets,
+        srcntgts_extent_norm,
+        kind,
+        morton_nr_dtype, box_level_dtype,
+        )
 
-            srcntgt_radii = None
+    # }}}
 
-        else:
-            # Here, we mash sources and targets into one array to give us one
-            # big array of "srcntgts". In this case, a "srcntgt" is either a
-            # source or a target, but not really both, as above. How will we be
-            # able to tell which it was? Easy: We'll compare its 'user' id with
-            # nsources. If it's >=, it's a target, otherwise it's a source.
+    logger.debug("tree build: start")
 
-            target_coord_dtype = single_valued(tgt_i.dtype for tgt_i in targets)
+    # {{{ combine sources and targets into one array, if necessary
 
-            if target_coord_dtype != coord_dtype:
-                raise TypeError("sources and targets must have same coordinate "
-                        "dtype")
+    if targets is None:
+        # Targets weren't specified. Sources are also targets. Let's
+        # call them "srcntgts".
 
-            def combine_srcntgt_arrays(ary1, ary2=None):
-                if ary2 is None:
-                    dtype = ary1.dtype
-                else:
-                    dtype = ary2.dtype
+        if isinstance(particles, np.ndarray) and particles.dtype.char == "O":
+            srcntgts = particles
+        else:
+            from pytools.obj_array import make_obj_array
+            srcntgts = make_obj_array([
+                actx.np.copy(actx.thaw(p)) for p in particles
+                ])
 
-                result = actx.empty(nsrcntgts, dtype)
-                if (ary1 is None) or (ary2 is None):
-                    result.fill(0)
+        assert source_radii is None
+        assert target_radii is None
 
-                if ary1 is not None and ary1.nbytes:
-                    result[:len(ary1)] = ary1
+        srcntgt_radii = None
 
-                if ary2 is not None and ary2.nbytes:
-                    result[nsources:] = ary2
+    else:
+        # Here, we mash sources and targets into one array to give us one
+        # big array of "srcntgts". In this case, a "srcntgt" is either a
+        # source or a target, but not really both, as above. How will we be
+        # able to tell which it was? Easy: We'll compare its 'user' id with
+        # nsources. If it's >=, it's a target, otherwise it's a source.
 
-                return result
+        target_coord_dtype = single_valued(tgt_i.dtype for tgt_i in targets)
 
-            from pytools.obj_array import make_obj_array
-            srcntgts = make_obj_array([
-                combine_srcntgt_arrays(src_i, tgt_i)
-                for src_i, tgt_i in zip(particles, targets)
-                ])
+        if target_coord_dtype != coord_dtype:
+            raise TypeError(
+                "sources and targets must have same coordinate dtype: "
+                f"got {coord_dtype} and {target_coord_dtype}")
 
-            if srcntgts_have_extent:
-                srcntgt_radii = combine_srcntgt_arrays(source_radii, target_radii)
+        def combine_srcntgt_arrays(ary1, ary2=None):
+            if ary2 is None:
+                dtype = ary1.dtype
             else:
-                srcntgt_radii = None
+                dtype = ary2.dtype
 
-        del source_radii
-        del target_radii
+            result = actx.empty(nsrcntgts, dtype)
+            if (ary1 is None) or (ary2 is None):
+                result.fill(0)
 
-        del particles
+            if ary1 is not None and ary1.nbytes:
+                result[:len(ary1)] = ary1
 
-        user_srcntgt_ids = actx.from_numpy(
-            np.arange(nsrcntgts, dtype=particle_id_dtype)
-            )
+            if ary2 is not None and ary2.nbytes:
+                result[nsources:] = ary2
 
-        evt, = user_srcntgt_ids.events
-        wait_for.append(evt)
-        del evt
+            return result
 
-        # }}}
+        from pytools.obj_array import make_obj_array
+        srcntgts = make_obj_array([
+            combine_srcntgt_arrays(src_i, tgt_i)
+            for src_i, tgt_i in zip(particles, targets)
+            ])
 
-        # {{{ process refine_weights
-
-        from boxtree.tree_build_kernels import refine_weight_dtype
-
-        specified_max_particles_in_box = max_particles_in_box is not None
-        specified_refine_weights = refine_weights is not None and \
-            max_leaf_refine_weight is not None
-
-        if specified_max_particles_in_box and specified_refine_weights:
-            raise ValueError("may only specify one of max_particles_in_box and "
-                    "refine_weights/max_leaf_refine_weight")
-        elif not specified_max_particles_in_box and not specified_refine_weights:
-            raise ValueError("must specify either max_particles_in_box or "
-                    "refine_weights/max_leaf_refine_weight")
-        elif specified_max_particles_in_box:
-            refine_weights = actx.empty(nsrcntgts, refine_weight_dtype)
-            refine_weights.fill(1)
-
-            event, = refine_weights.events
-            prep_events.append(event)
-            max_leaf_refine_weight = max_particles_in_box
-        elif specified_refine_weights:
-            if refine_weights.dtype != refine_weight_dtype:
-                raise TypeError("refine_weights must have dtype '%s'"
-                        % refine_weight_dtype)
-
-        if max_leaf_refine_weight <= 0:
-            raise ValueError("max_leaf_refine_weight must be positive")
-
-        max_refine_weights = actx.to_numpy(actx.np.amax(refine_weights))
-        if max_leaf_refine_weight < max_refine_weights:
-            raise ValueError(
-                    "entries of refine_weights cannot exceed max_leaf_refine_weight")
+        if srcntgts_have_extent:
+            srcntgt_radii = combine_srcntgt_arrays(source_radii, target_radii)
+        else:
+            srcntgt_radii = None
 
-        min_refine_weights = actx.to_numpy(actx.np.amin(refine_weights))
-        if min_refine_weights < 0:
-            raise ValueError("all entries of refine_weights must be nonnegative")
+    del source_radii
+    del target_radii
+    del particles
 
-        total_refine_weight = actx.to_numpy(
-            actx.np.sum(refine_weights, dtype=np.dtype(np.int64))
-            )
+    user_srcntgt_ids = actx.from_numpy(
+        np.arange(nsrcntgts, dtype=particle_id_dtype)
+        )
 
-        del max_particles_in_box
-        del specified_max_particles_in_box
-        del specified_refine_weights
+    # }}}
 
-        # }}}
+    # {{{ process refine_weights
+
+    from boxtree.tree_build_kernels import refine_weight_dtype
+
+    specified_max_particles_in_box = max_particles_in_box is not None
+    specified_refine_weights = refine_weights is not None and \
+        max_leaf_refine_weight is not None
+
+    if specified_max_particles_in_box and specified_refine_weights:
+        raise ValueError(
+            "may only specify one of 'max_particles_in_box' and "
+            "'refine_weights' / 'max_leaf_refine_weight'")
+    elif not specified_max_particles_in_box and not specified_refine_weights:
+        raise ValueError(
+            "must specify either 'max_particles_in_box' or "
+            "'refine_weights' / 'max_leaf_refine_weight'")
+    elif specified_max_particles_in_box:
+        refine_weights = actx.empty(nsrcntgts, refine_weight_dtype)
+        refine_weights.fill(1)
+
+        max_leaf_refine_weight = max_particles_in_box
+    elif specified_refine_weights:
+        if refine_weights.dtype != refine_weight_dtype:
+            raise TypeError(
+                f"'refine_weights' must have dtype '{refine_weight_dtype}', "
+                f"but got {refine_weights.dtype}")
+
+    if max_leaf_refine_weight <= 0:
+        raise ValueError("'max_leaf_refine_weight' must be positive")
+
+    max_refine_weights = actx.to_numpy(actx.np.amax(refine_weights)).item()
+    if max_leaf_refine_weight < max_refine_weights:
+        raise ValueError(
+            "entries of 'refine_weights' cannot exceed 'max_leaf_refine_weight'")
+
+    min_refine_weights = actx.to_numpy(actx.np.amin(refine_weights)).item()
+    if min_refine_weights < 0:
+        raise ValueError("all entries of 'refine_weights' must be nonnegative")
+
+    total_refine_weight = actx.to_numpy(
+        actx.np.sum(refine_weights, dtype=np.dtype(np.int64))
+        )
+
+    del max_particles_in_box
+    del specified_max_particles_in_box
+    del specified_refine_weights
 
-        # {{{ find and process bounding box
+    # }}}
 
-        if bbox is None:
-            bbox, _ = self.bbox_finder(
-                actx, srcntgts, srcntgt_radii, wait_for=wait_for)
-            bbox = actx.to_numpy(bbox)
+    # {{{ find and process bounding box
 
-            root_extent = max(
-                bbox["max_"+ax] - bbox["min_"+ax]
-                for ax in axis_names) * (1+TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR)
+    from boxtree.bounding_box import find_bounding_box
+    if bbox is None:
+        bbox = find_bounding_box(actx, srcntgts, srcntgt_radii)
+        bbox = actx.to_numpy(bbox)
 
-            # make bbox square and slightly larger at the top, to ensure scaled
-            # coordinates are always < 1
-            bbox_min = np.empty(dimensions, coord_dtype)
-            for i, ax in enumerate(axis_names):
-                bbox_min[i] = bbox["min_"+ax]
+        root_extent = (
+            (1 + root_extent_stretch_factor)
+            * max([bbox[f"max_{ax}"] - bbox[f"min_{ax}"] for ax in axis_names])
+            )
 
-            bbox_max = bbox_min + root_extent
-            for i, ax in enumerate(axis_names):
-                bbox["max_"+ax] = bbox_max[i]
-        else:
-            # Validate that bbox is a superset of particle-derived bbox
-            bbox_auto, _ = self.bbox_finder(
-                    srcntgts, srcntgt_radii, wait_for=wait_for)
-            bbox_auto = actx.to_numpy(bbox_auto)
-
-            # Convert unstructured numpy array to bbox_type
-            if isinstance(bbox, np.ndarray):
-                if len(bbox) == dimensions:
-                    bbox_bak = bbox.copy()
-                    bbox = np.empty(1, bbox_auto.dtype)
-                    for i, ax in enumerate(axis_names):
-                        bbox["min_"+ax] = bbox_bak[i][0]
-                        bbox["max_"+ax] = bbox_bak[i][1]
-                else:
-                    assert len(bbox) == 1
+        # make bbox square and slightly larger at the top, to ensure scaled
+        # coordinates are always < 1
+        bbox_min = np.empty(dimensions, coord_dtype)
+        for i, ax in enumerate(axis_names):
+            bbox_min[i] = bbox[f"min_{ax}"]
+
+        bbox_max = bbox_min + root_extent
+        for i, ax in enumerate(axis_names):
+            bbox[f"max_{ax}"] = bbox_max[i]
+    else:
+        # Validate that bbox is a superset of particle-derived bbox
+        bbox_auto = find_bounding_box(actx, srcntgts, srcntgt_radii)
+        bbox_auto = actx.to_numpy(bbox_auto)
+
+        # Convert unstructured numpy array to bbox_type
+        if isinstance(bbox, np.ndarray):
+            if len(bbox) == dimensions:
+                bbox_bak = bbox.copy()
+                bbox = np.empty(1, bbox_auto.dtype)
+                for i, ax in enumerate(axis_names):
+                    bbox[f"min_{ax}"] = bbox_bak[i][0]
+                    bbox[f"max_{ax}"] = bbox_bak[i][1]
             else:
-                raise NotImplementedError("Unsupported bounding box type: "
-                        + str(type(bbox)))
+                assert len(bbox) == 1
+        else:
+            raise NotImplementedError(
+                f"Unsupported bounding box type: {type(bbox).__name__}")
 
-            # bbox must cover bbox_auto
-            bbox_min = np.empty(dimensions, coord_dtype)
-            bbox_max = np.empty(dimensions, coord_dtype)
+        # bbox must cover bbox_auto
+        bbox_min = np.empty(dimensions, coord_dtype)
+        bbox_max = np.empty(dimensions, coord_dtype)
 
-            for i, ax in enumerate(axis_names):
-                bbox_min[i] = bbox["min_" + ax]
-                bbox_max[i] = bbox["max_" + ax]
-                assert bbox_min[i] < bbox_max[i]
-                assert bbox_min[i] <= bbox_auto["min_" + ax]
-                assert bbox_max[i] >= bbox_auto["max_" + ax]
+        for i, ax in enumerate(axis_names):
+            bbox_min[i] = bbox[f"min_{ax}"]
+            bbox_max[i] = bbox[f"max_{ax}"]
+            assert bbox_min[i] < bbox_max[i]
+            assert bbox_min[i] <= bbox_auto[f"min_{ax}"]
+            assert bbox_max[i] >= bbox_auto[f"max_{ax}"]
 
-            # bbox must be a square
-            bbox_exts = bbox_max - bbox_min
-            for ext in bbox_exts:
-                assert abs(ext - bbox_exts[0]) < 1e-15
+        # bbox must be a square
+        bbox_exts = bbox_max - bbox_min
+        for ext in bbox_exts:
+            assert abs(ext - bbox_exts[0]) < 1e-15
 
-            root_extent = bbox_exts[0]
+        root_extent = bbox_exts[0]
 
-        # }}}
-
-        # {{{ allocate data
+    # }}}
 
-        logger.debug("allocating memory")
+    # {{{ allocate data
+
+    logger.debug("allocating memory")
+
+    # box-local morton bin counts for each particle at the current level
+    # only valid from scan -> split'n'sort
+    morton_bin_counts = actx.empty(
+        nsrcntgts, dtype=knl_info.morton_bin_count_dtype)
+
+    # (local) morton nrs for each particle at the current level
+    # only valid from scan -> split'n'sort
+    morton_nrs = actx.empty(nsrcntgts, dtype=morton_nr_dtype)
+
+    # 0/1 segment flags
+    # invariant to sorting once set
+    # (particles are only reordered within a box)
+    # valid throughout computation
+    box_start_flags = actx.zeros(nsrcntgts, dtype=np.int8)
+    srcntgt_box_ids = actx.zeros(nsrcntgts, dtype=box_id_dtype)
+
+    # Passing in nboxes_guess from outside is solely for debugging purposes,
+    # to test the reallocation code.
+    nboxes_guess = kwargs.get("nboxes_guess")
+    if nboxes_guess is None:
+        nboxes_guess = 2**dimensions * (
+                (max_leaf_refine_weight + total_refine_weight - 1)
+                // max_leaf_refine_weight)
+
+    assert nboxes_guess > 0
+
+    # /!\ IMPORTANT
+    #
+    # If you're allocating an array here that depends on nboxes_guess, or if
+    # your array contains box numbers, you have to write code for the
+    # following down below as well:
+    #
+    # * You *must* write reallocation code to handle box renumbering and
+    #   reallocation triggered at the top of the level loop.
+    #
+    # * If your array persists after the level loop, you *must* write code
+    #   to handle box renumbering and reallocation triggered by the box
+    #   pruning step.
+
+    split_box_ids = actx.zeros(nboxes_guess, dtype=box_id_dtype)
+
+    # per-box morton bin counts
+    box_morton_bin_counts = actx.zeros(
+        nboxes_guess, dtype=knl_info.morton_bin_count_dtype)
+
+    # particle# at which each box starts
+    box_srcntgt_starts = actx.zeros(nboxes_guess, dtype=particle_id_dtype)
+
+    # pointer to parent box
+    box_parent_ids = actx.zeros(nboxes_guess, dtype=box_id_dtype)
+
+    # pointer to child box, by morton number
+    box_child_ids = tuple([
+        actx.zeros(nboxes_guess, dtype=box_id_dtype) for d in range(2**dimensions)
+        ])
+
+    # box centers, by dimension
+    box_centers = tuple([
+        actx.zeros(nboxes_guess, dtype=coord_dtype) for d in range(dimensions)
+        ])
+
+    # Initialize box_centers[0] to contain the root box's center
+    for d, ax in enumerate(axis_names):
+        center_ax = bbox[f"min_{ax}"] + (bbox[f"max_{ax}"] - bbox[f"min_{ax}"]) / 2
+        box_centers[d][0].fill(center_ax)
+
+    # box -> level map
+    box_levels = actx.zeros(nboxes_guess, dtype=box_level_dtype)
+
+    # number of particles in each box
+    # needs to be globally initialized because empty boxes never get touched
+    box_srcntgt_counts_cumul = actx.zeros(nboxes_guess, dtype=particle_id_dtype)
+
+    # Initialize box 0 to contain all particles
+    box_srcntgt_counts_cumul[0].fill(nsrcntgts, queue=actx.queue)
+
+    # box -> whether the box has a child. FIXME: use smaller integer type
+    box_has_children = actx.zeros(nboxes_guess, dtype=np.dtype(np.int32))
+
+    # box -> whether the box needs a splitting to enforce level restriction.
+    # FIXME: use smaller integer type
+    force_split_box = actx.zeros(
+        nboxes_guess if knl_info.level_restrict else 0,
+        dtype=np.dtype(np.int32))
+
+    # set parent of root box to itself
+    from pyopencl import enqueue_copy
+    evt = enqueue_copy(
+            actx.queue, box_parent_ids.data,
+            np.zeros((), dtype=box_parent_ids.dtype))
+    box_parent_ids.add_event(evt)
+    del evt
+
+    # 2*(num bits in the significand)
+    # https://gitlab.tiker.net/inducer/boxtree/issues/23
+    nlevels_max = 2*(np.finfo(coord_dtype).nmant + 1)
+    assert nlevels_max <= np.iinfo(box_level_dtype).max
+
+    # level -> starting box on level
+    level_start_box_nrs_dev = actx.zeros(nlevels_max, dtype=box_id_dtype)
+
+    # level -> number of used boxes on level
+    level_used_box_counts_dev = actx.zeros(nlevels_max, dtype=box_id_dtype)
 
-        # box-local morton bin counts for each particle at the current level
-        # only valid from scan -> split'n'sort
-        morton_bin_counts = actx.empty(
-            nsrcntgts, dtype=knl_info.morton_bin_count_dtype)
+    # }}}
 
-        # (local) morton nrs for each particle at the current level
-        # only valid from scan -> split'n'sort
-        morton_nrs = actx.empty(nsrcntgts, dtype=self.morton_nr_dtype)
+    def debug_with_finish(s):
+        if debug:
+            actx.queue.finish()
 
-        # 0/1 segment flags
-        # invariant to sorting once set
-        # (particles are only reordered within a box)
-        # valid throughout computation
-        box_start_flags, evt = zeros(nsrcntgts, dtype=np.int8)
-        prep_events.append(evt)
-        srcntgt_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype)
-        prep_events.append(evt)
+        logger.debug(s)
 
-        # Outside nboxes_guess feeding is solely for debugging purposes,
-        # to test the reallocation code.
-        nboxes_guess = kwargs.get("nboxes_guess")
-        if nboxes_guess is None:
-            nboxes_guess = 2**dimensions * (
-                    (max_leaf_refine_weight + total_refine_weight - 1)
-                    // max_leaf_refine_weight)
+    from pytools.obj_array import make_obj_array
+    have_oversize_split_box = actx.zeros((), np.int32)
 
-        assert nboxes_guess > 0
+    # True if and only if the level restrict kernel found a box to split in
+    # order to enforce level restriction.
+    have_upper_level_split_box = actx.zeros((), np.int32)
 
-        # /!\ IMPORTANT
-        #
-        # If you're allocating an array here that depends on nboxes_guess, or if
-        # your array contains box numbers, you have to write code for the
-        # following down below as well:
-        #
-        # * You *must* write reallocation code to handle box renumbering and
-        #   reallocation triggered at the top of the level loop.
-        #
-        # * If your array persists after the level loop, you *must* write code
-        #   to handle box renumbering and reallocation triggered by the box
-        #   pruning step.
-
-        split_box_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype)
-        prep_events.append(evt)
-
-        # per-box morton bin counts
-        box_morton_bin_counts, evt = zeros(nboxes_guess,
-                                      dtype=knl_info.morton_bin_count_dtype)
-        prep_events.append(evt)
-
-        # particle# at which each box starts
-        box_srcntgt_starts, evt = zeros(nboxes_guess, dtype=particle_id_dtype)
-        prep_events.append(evt)
-
-        # pointer to parent box
-        box_parent_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype)
-        prep_events.append(evt)
-
-        # pointer to child box, by morton number
-        box_child_ids, evts = zip(
-            *(zeros(nboxes_guess, dtype=box_id_dtype) for d in range(2**dimensions)))
-        prep_events.extend(evts)
-
-        # box centers, by dimension
-        box_centers, evts = zip(
-            *(zeros(nboxes_guess, dtype=coord_dtype) for d in range(dimensions)))
-        prep_events.extend(evts)
-
-        # Initialize box_centers[0] to contain the root box's center
-        for d, (ax, evt) in enumerate(zip(axis_names, evts)):
-            center_ax = bbox["min_"+ax] + (bbox["max_"+ax] - bbox["min_"+ax]) / 2
-            box_centers[d][0].fill(center_ax, wait_for=[evt])
-
-        # box -> level map
-        box_levels, evt = zeros(nboxes_guess, self.box_level_dtype)
-        prep_events.append(evt)
-
-        # number of particles in each box
-        # needs to be globally initialized because empty boxes never get touched
-        box_srcntgt_counts_cumul, evt = zeros(nboxes_guess, dtype=particle_id_dtype)
-        prep_events.append(evt)
-
-        # Initialize box 0 to contain all particles
-        box_srcntgt_counts_cumul[0].fill(nsrcntgts, queue=actx.queue, wait_for=[evt])
-
-        # box -> whether the box has a child. FIXME: use smaller integer type
-        box_has_children, evt = zeros(nboxes_guess, dtype=np.dtype(np.int32))
-        prep_events.append(evt)
-
-        # box -> whether the box needs a splitting to enforce level restriction.
-        # FIXME: use smaller integer type
-        force_split_box, evt = zeros(nboxes_guess
-                                     if knl_info.level_restrict
-                                     else 0, dtype=np.dtype(np.int32))
-        prep_events.append(evt)
-
-        # set parent of root box to itself
-        from pyopencl import enqueue_copy
-        evt = enqueue_copy(
-                actx.queue, box_parent_ids.data,
-                np.zeros((), dtype=box_parent_ids.dtype))
-        prep_events.append(evt)
-
-        # 2*(num bits in the significand)
-        # https://gitlab.tiker.net/inducer/boxtree/issues/23
-        nlevels_max = 2*(np.finfo(coord_dtype).nmant + 1)
-        assert nlevels_max <= np.iinfo(self.box_level_dtype).max
-
-        # level -> starting box on level
-        level_start_box_nrs_dev, evt = zeros(nlevels_max, dtype=box_id_dtype)
-        prep_events.append(evt)
-
-        # level -> number of used boxes on level
-        level_used_box_counts_dev, evt = zeros(nlevels_max, dtype=box_id_dtype)
-        prep_events.append(evt)
+    from pytools import div_ceil
 
-        # }}}
+    # {{{ level loop
 
-        def debug_with_finish(s):
-            if debug:
-                actx.queue.finish()
+    # Level 0 starts at 0 and always contains box 0 and nothing else.
+    # Level 1 therefore starts at 1.
+    level_start_box_nrs = [0, 1]
+    level_start_box_nrs_dev[0] = 0
+    level_start_box_nrs_dev[1] = 1
 
-            logger.debug(s)
+    # This counts the number of boxes that have been used per level. Note
+    # that this could be fewer than the actual number of boxes allocated to
+    # the level (in the case of building a level restricted tree, more boxes
+    # are pre-allocated for a level than used since we may decide to split
+    # parent level boxes later).
+    level_used_box_counts = [1]
+    level_used_box_counts_dev[0] = 1
 
-        from pytools.obj_array import make_obj_array
-        have_oversize_split_box, evt = zeros((), np.int32)
-        prep_events.append(evt)
+    # level -> number of leaf boxes on level. Initially the root node is a
+    # leaf.
+    level_leaf_counts = np.array([1])
 
-        # True if and only if the level restrict kernel found a box to split in
-        # order to enforce level restriction.
-        have_upper_level_split_box, evt = zeros((), np.int32)
-        prep_events.append(evt)
+    tree_build_proc = ProcessLogger(logger, "tree build")
 
-        wait_for = prep_events
+    if total_refine_weight > max_leaf_refine_weight:
+        level = 1
+    else:
+        level = 0
 
-        from pytools import div_ceil
+    # INVARIANTS -- Upon entry to this loop:
+    #
+    # - level is the level being built.
+    # - the last entry of level_start_box_nrs is the beginning of the level
+    #   to be built
+    # - the last entry of level_used_box_counts is the number of boxes that
+    #   are used (not just allocated) at the previous level
 
-        # {{{ level loop
+    # This while condition prevents entering the loop in case there's just a
+    # single box, by how 'level' is set above. Read this as 'while True' with
+    # an edge case.
 
-        # Level 0 starts at 0 and always contains box 0 and nothing else.
-        # Level 1 therefore starts at 1.
-        level_start_box_nrs = [0, 1]
-        level_start_box_nrs_dev[0] = 0
-        level_start_box_nrs_dev[1] = 1
-        wait_for.extend(level_start_box_nrs_dev.events)
+    level_loop_proc = DebugProcessLogger(logger, "tree build level loop")
 
-        # This counts the number of boxes that have been used per level. Note
-        # that this could be fewer than the actual number of boxes allocated to
-        # the level (in the case of building a level restricted tree, more boxes
-        # are pre-allocated for a level than used since we may decide to split
-        # parent level boxes later).
-        level_used_box_counts = [1]
-        level_used_box_counts_dev[0] = 1
-        wait_for.extend(level_used_box_counts_dev.events)
+    # When doing level restriction, the level loop may need to be entered
+    # one more time after creating all the levels (see fixme note below
+    # regarding this). This flag is set to True when that happens.
+    final_level_restrict_iteration = False
 
-        # level -> number of leaf boxes on level. Initially the root node is a
-        # leaf.
-        level_leaf_counts = np.array([1])
+    from boxtree.tools import map_values, copy_and_map_gappy
+    while level:
+        if debug:
+            # More invariants:
+            assert level == len(level_start_box_nrs) - 1
+            assert level == len(level_used_box_counts)
+            assert level == len(level_leaf_counts)
+
+        if level + 1 >= nlevels_max:  # level is zero-based
+            raise MaxLevelsExceeded("Level count exceeded number of significant "
+                    "bits in coordinate dtype. That means that a large number "
+                    "of particles was indistinguishable up to floating point "
+                    "precision (because they ended up in the same box).")
+
+        common_args = ((morton_bin_counts, morton_nrs,
+                box_start_flags,
+                srcntgt_box_ids, split_box_ids,
+                box_morton_bin_counts,
+                refine_weights,
+                max_leaf_refine_weight,
+                box_srcntgt_starts, box_srcntgt_counts_cumul,
+                box_parent_ids, box_levels,
+                level, bbox,
+                user_srcntgt_ids)
+                + tuple(srcntgts)
+                + ((srcntgt_radii,) if srcntgts_have_extent else ())
+                )
 
-        tree_build_proc = ProcessLogger(logger, "tree build")
+        debug_with_finish("morton count scan")
 
-        if total_refine_weight > max_leaf_refine_weight:
-            level = 1
-        else:
-            level = 0
+        morton_count_args = common_args
+        if srcntgts_have_extent:
+            morton_count_args += (stick_out_factor,)
 
-        # INVARIANTS -- Upon entry to this loop:
-        #
-        # - level is the level being built.
-        # - the last entry of level_start_box_nrs is the beginning of the level
-        #   to be built
-        # - the last entry of level_used_box_counts is the number of boxes that
-        #   are used (not just allocated) at the previous level
+        # writes: box_morton_bin_counts
+        knl_info.morton_count_scan(
+                *morton_count_args, queue=actx.queue, size=nsrcntgts,
+                allocator=actx.allocator,
+                )
 
-        # This while condition prevents entering the loop in case there's just a
-        # single box, by how 'level' is set above. Read this as 'while True' with
-        # an edge case.
+        debug_with_finish("split box id scan")
+
+        # writes: box_has_children, split_box_ids
+        knl_info.split_box_id_scan(
+                srcntgt_box_ids,
+                box_srcntgt_counts_cumul,
+                box_morton_bin_counts,
+                refine_weights,
+                max_leaf_refine_weight,
+                box_levels,
+                level_start_box_nrs_dev,
+                level_used_box_counts_dev,
+                force_split_box,
+                level,
 
-        level_loop_proc = DebugProcessLogger(logger, "tree build level loop")
+                # output:
+                box_has_children,
+                split_box_ids,
+                have_oversize_split_box,
 
-        # When doing level restriction, the level loop may need to be entered
-        # one more time after creating all the levels (see fixme note below
-        # regarding this). This flag is set to True when that happens.
-        final_level_restrict_iteration = False
+                queue=actx.queue,
+                size=level_start_box_nrs[level],
+                allocator=actx.allocator,
+                )
 
-        from pyopencl import wait_for_events
-        while level:
-            if debug:
-                # More invariants:
-                assert level == len(level_start_box_nrs) - 1
-                assert level == len(level_used_box_counts)
-                assert level == len(level_leaf_counts)
-
-            if level + 1 >= nlevels_max:  # level is zero-based
-                raise MaxLevelsExceeded("Level count exceeded number of significant "
-                        "bits in coordinate dtype. That means that a large number "
-                        "of particles was indistinguishable up to floating point "
-                        "precision (because they ended up in the same box).")
-
-            common_args = ((morton_bin_counts, morton_nrs,
-                    box_start_flags,
-                    srcntgt_box_ids, split_box_ids,
-                    box_morton_bin_counts,
-                    refine_weights,
-                    max_leaf_refine_weight,
-                    box_srcntgt_starts, box_srcntgt_counts_cumul,
-                    box_parent_ids, box_levels,
-                    level, bbox,
-                    user_srcntgt_ids)
-                    + tuple(srcntgts)
-                    + ((srcntgt_radii,) if srcntgts_have_extent else ())
-                    )
+        # {{{ compute new level_used_box_counts, level_leaf_counts
+
+        # The last split_box_id on each level tells us how many boxes are
+        # needed at the next level.
+        new_level_used_box_counts = [1]
+        for level_start_box_id in level_start_box_nrs[1:]:
+            last_box_on_prev_level = level_start_box_id - 1
+            new_level_used_box_counts.append(
+                # FIXME: Get this all at once.
+                int(actx.to_numpy(split_box_ids[last_box_on_prev_level]))
+                - level_start_box_id)
+
+        # New leaf count =
+        #   old leaf count
+        #   + nr. new boxes from splitting parent's leaves
+        #   - nr. new boxes from splitting current level's leaves / 2**d
+        level_used_box_counts_diff = (new_level_used_box_counts
+                - np.append(level_used_box_counts, [0]))
+        new_level_leaf_counts = (level_leaf_counts
+                + level_used_box_counts_diff[:-1]
+                - level_used_box_counts_diff[1:] // 2 ** dimensions)
+        new_level_leaf_counts = np.append(
+                new_level_leaf_counts,
+                [level_used_box_counts_diff[-1]])
+        del level_used_box_counts_diff
 
-            debug_with_finish("morton count scan")
-
-            morton_count_args = common_args
-            if srcntgts_have_extent:
-                morton_count_args += (stick_out_factor,)
-
-            # writes: box_morton_bin_counts
-            evt = knl_info.morton_count_scan(
-                    *morton_count_args, queue=actx.queue, size=nsrcntgts,
-                    wait_for=wait_for)
-            wait_for = [evt]
-
-            debug_with_finish("split box id scan")
-
-            # writes: box_has_children, split_box_ids
-            evt = knl_info.split_box_id_scan(
-                    srcntgt_box_ids,
-                    box_srcntgt_counts_cumul,
-                    box_morton_bin_counts,
-                    refine_weights,
-                    max_leaf_refine_weight,
-                    box_levels,
-                    level_start_box_nrs_dev,
-                    level_used_box_counts_dev,
-                    force_split_box,
-                    level,
+        # }}}
 
-                    # output:
-                    box_has_children,
-                    split_box_ids,
-                    have_oversize_split_box,
+        # Assumption: Everything between here and the top of the loop must
+        # be repeatable, so that in an out-of-memory situation, we can just
+        # rerun this bit of the code after reallocating and a minimal reset
+        # procedure.
+
+        # The algorithm for deciding on level sizes is as follows:
+        # 1. Compute the minimal necessary size of each level, including the
+        #    new level being created.
+        # 2. If level restricting, add padding to the new level being created.
+        # 3. Check if there is enough existing space for each level.
+        # 4. If any level does not have sufficient space, reallocate all levels:
+        #    4a. Compute new sizes of upper levels
+        #    4b. If level restricting, add padding to all levels.
+
+        curr_upper_level_lengths = np.diff(level_start_box_nrs)
+        minimal_upper_level_lengths = np.max(
+            [new_level_used_box_counts[:-1], curr_upper_level_lengths], axis=0)
+        minimal_new_level_length = new_level_used_box_counts[-1]
+
+        # Allocate extra space at the end of the current level for higher
+        # level leaves that may be split later.
+        #
+        # If there are no further levels to split (i.e.
+        # have_oversize_split_box = 0), then we do not need to allocate any
+        # extra space, since no new leaves can be created at the bottom
+        # level.
+        if knl_info.level_restrict and actx.to_numpy(have_oversize_split_box):
+            # Currently undocumented.
+            lr_lookbehind_levels = kwargs.get("lr_lookbehind", 1)
+            minimal_new_level_length += sum(
+                2**(lev*dimensions) * new_level_leaf_counts[level - lev]
+                for lev in range(1, 1 + min(level, lr_lookbehind_levels)))
+
+        nboxes_minimal = (
+                sum(minimal_upper_level_lengths) + minimal_new_level_length)
+
+        needs_renumbering = (
+                (curr_upper_level_lengths < minimal_upper_level_lengths).any())
+
+        # {{{ prepare for reallocation/renumbering
+
+        if needs_renumbering:
+            assert knl_info.level_restrict
+
+            # {{{ compute new level_start_box_nrs
+
+            # Represents the amount of padding needed for upper levels.
+            upper_level_padding = np.zeros(level, dtype=int)
+
+            # Recompute the level padding.
+            for ulevel in range(level):
+                upper_level_padding[ulevel] = sum(
+                    2**(lev*dimensions) * new_level_leaf_counts[ulevel - lev]
+                    for lev in range(
+                        1, 1 + min(ulevel, lr_lookbehind_levels)))
+
+            new_upper_level_unused_box_counts = np.max(
+                [upper_level_padding,
+                minimal_upper_level_lengths - new_level_used_box_counts[:-1]],
+                axis=0)
+
+            new_level_start_box_nrs = np.empty(level + 1, dtype=int)
+            new_level_start_box_nrs[0] = 0
+            new_level_start_box_nrs[1:] = np.cumsum(
+                new_level_used_box_counts[:-1]
+                + new_upper_level_unused_box_counts)
 
-                    queue=actx.queue,
-                    size=level_start_box_nrs[level],
-                    wait_for=wait_for)
-            wait_for = [evt]
-
-            # {{{ compute new level_used_box_counts, level_leaf_counts
-
-            # The last split_box_id on each level tells us how many boxes are
-            # needed at the next level.
-            new_level_used_box_counts = [1]
-            for level_start_box_id in level_start_box_nrs[1:]:
-                last_box_on_prev_level = level_start_box_id - 1
-                new_level_used_box_counts.append(
-                    # FIXME: Get this all at once.
-                    int(actx.to_numpy(split_box_ids[last_box_on_prev_level]))
-                    - level_start_box_id)
-
-            # New leaf count =
-            #   old leaf count
-            #   + nr. new boxes from splitting parent's leaves
-            #   - nr. new boxes from splitting current level's leaves / 2**d
-            level_used_box_counts_diff = (new_level_used_box_counts
-                    - np.append(level_used_box_counts, [0]))
-            new_level_leaf_counts = (level_leaf_counts
-                    + level_used_box_counts_diff[:-1]
-                    - level_used_box_counts_diff[1:] // 2 ** dimensions)
-            new_level_leaf_counts = np.append(
-                    new_level_leaf_counts,
-                    [level_used_box_counts_diff[-1]])
-            del level_used_box_counts_diff
+            assert not (level_start_box_nrs == new_level_start_box_nrs).all()
 
             # }}}
 
-            # Assumption: Everything between here and the top of the loop must
-            # be repeatable, so that in an out-of-memory situation, we can just
-            # rerun this bit of the code after reallocating and a minimal reset
-            # procedure.
-
-            # The algorithm for deciding on level sizes is as follows:
-            # 1. Compute the minimal necessary size of each level, including the
-            #    new level being created.
-            # 2. If level restricting, add padding to the new level being created.
-            # 3. Check if there is enough existing space for each level.
-            # 4. If any level does not have sufficient space, reallocate all levels:
-            #    4a. Compute new sizes of upper levels
-            #    4b. If level restricting, add padding to all levels.
-
-            curr_upper_level_lengths = np.diff(level_start_box_nrs)
-            minimal_upper_level_lengths = np.max(
-                [new_level_used_box_counts[:-1], curr_upper_level_lengths], axis=0)
-            minimal_new_level_length = new_level_used_box_counts[-1]
-
-            # Allocate extra space at the end of the current level for higher
-            # level leaves that may be split later.
-            #
-            # If there are no further levels to split (i.e.
-            # have_oversize_split_box = 0), then we do not need to allocate any
-            # extra space, since no new leaves can be created at the bottom
-            # level.
-            if knl_info.level_restrict and actx.to_numpy(have_oversize_split_box):
-                # Currently undocumented.
-                lr_lookbehind_levels = kwargs.get("lr_lookbehind", 1)
-                minimal_new_level_length += sum(
-                    2**(lev*dimensions) * new_level_leaf_counts[level - lev]
-                    for lev in range(1, 1 + min(level, lr_lookbehind_levels)))
-
-            nboxes_minimal = \
-                    sum(minimal_upper_level_lengths) + minimal_new_level_length
-
-            needs_renumbering = \
-                    (curr_upper_level_lengths < minimal_upper_level_lengths).any()
-
-            # {{{ prepare for reallocation/renumbering
-
-            if needs_renumbering:
-                assert knl_info.level_restrict
-
-                # {{{ compute new level_start_box_nrs
-
-                # Represents the amount of padding needed for upper levels.
-                upper_level_padding = np.zeros(level, dtype=int)
-
-                # Recompute the level padding.
-                for ulevel in range(level):
-                    upper_level_padding[ulevel] = sum(
-                        2**(lev*dimensions) * new_level_leaf_counts[ulevel - lev]
-                        for lev in range(
-                            1, 1 + min(ulevel, lr_lookbehind_levels)))
-
-                new_upper_level_unused_box_counts = np.max(
-                    [upper_level_padding,
-                    minimal_upper_level_lengths - new_level_used_box_counts[:-1]],
-                    axis=0)
-
-                new_level_start_box_nrs = np.empty(level + 1, dtype=int)
-                new_level_start_box_nrs[0] = 0
-                new_level_start_box_nrs[1:] = np.cumsum(
-                    new_level_used_box_counts[:-1]
-                    + new_upper_level_unused_box_counts)
-
-                assert not (level_start_box_nrs == new_level_start_box_nrs).all()
-
-                # }}}
-
-                # {{{ set up reallocators
-
-                old_box_count = level_start_box_nrs[-1]
-                # Where should I put this box?
-                dst_box_id = actx.empty(shape=old_box_count, dtype=box_id_dtype)
-
-                for level_start, new_level_start, level_len in zip(
-                        level_start_box_nrs, new_level_start_box_nrs,
-                        curr_upper_level_lengths):
-                    dst_box_id[level_start:level_start+level_len] = actx.from_numpy(
-                        np.arange(new_level_start,
-                                  new_level_start + level_len,
-                                  dtype=box_id_dtype)
-                        )
-
-                wait_for.extend(dst_box_id.events)
+            # {{{ set up reallocators
 
-                realloc_array = partial(self.gappy_copy_and_map,
-                        dst_indices=dst_box_id, range=slice(old_box_count),
-                        debug=debug)
-                realloc_and_renumber_array = partial(self.gappy_copy_and_map,
-                        dst_indices=dst_box_id, map_values=dst_box_id,
-                        range=slice(old_box_count), debug=debug)
-                renumber_array = partial(self.map_values_kernel, dst_box_id)
-
-                # }}}
+            old_box_count = level_start_box_nrs[-1]
+            # Where should I put this box?
+            dst_box_id = actx.empty(shape=old_box_count, dtype=box_id_dtype)
 
-                # Update level_start_box_nrs. This will be the
-                # level_start_box_nrs for the reallocated data.
+            for level_start, new_level_start, level_len in zip(
+                    level_start_box_nrs, new_level_start_box_nrs,
+                    curr_upper_level_lengths):
+                dst_box_id[level_start:level_start+level_len] = actx.from_numpy(
+                    np.arange(new_level_start,
+                                new_level_start + level_len,
+                                dtype=box_id_dtype)
+                    )
 
-                level_start_box_nrs = list(new_level_start_box_nrs)
-                level_start_box_nrs_dev[:level + 1] = \
-                    np.array(new_level_start_box_nrs, dtype=box_id_dtype)
-                level_start_box_nrs_updated = True
-                wait_for.extend(level_start_box_nrs_dev.events)
+            realloc_array = partial(copy_and_map_gappy,
+                    actx,
+                    dst_indices=dst_box_id, range=slice(old_box_count),
+                    debug=debug)
+            realloc_and_renumber_array = partial(copy_and_map_gappy,
+                    actx,
+                    dst_indices=dst_box_id, mapping=dst_box_id,
+                    range=slice(old_box_count), debug=debug)
+            renumber_array = partial(map_values, actx, dst_box_id)
 
-                nboxes_new = level_start_box_nrs[-1] + minimal_new_level_length
+            # }}}
 
-                del new_level_start_box_nrs
-            else:
-                from boxtree.tools import realloc_array
-                realloc_and_renumber_array = realloc_array
-                renumber_array = None
-                level_start_box_nrs_updated = False
-                nboxes_new = nboxes_minimal
+            # Update level_start_box_nrs. This will be the
+            # level_start_box_nrs for the reallocated data.
 
-            del nboxes_minimal
+            level_start_box_nrs = list(new_level_start_box_nrs)
+            level_start_box_nrs_dev[:level + 1] = (
+                np.array(new_level_start_box_nrs, dtype=box_id_dtype))
+            level_start_box_nrs_updated = True
 
-            # }}}
+            nboxes_new = level_start_box_nrs[-1] + minimal_new_level_length
 
-            # {{{ reallocate and/or renumber boxes if necessary
-
-            if level_start_box_nrs_updated or nboxes_new > nboxes_guess:
-                debug_with_finish("starting nboxes_guess increase")
-
-                while nboxes_guess < nboxes_new:
-                    nboxes_guess *= 2
-
-                def my_realloc_nocopy(ary, shape=nboxes_guess):
-                    return actx.zeros(shape=shape, dtype=ary.dtype)
-
-                def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
-                    result = actx.zeros(shape=shape, dtype=ary.dtype)
-                    return result, result.events[0]
-
-                my_realloc = partial(
-                    realloc_array,
-                    actx, nboxes_guess, wait_for=wait_for)
-                my_realloc_zeros = partial(
-                    realloc_array,
-                    actx, nboxes_guess, zero_fill=True, wait_for=wait_for)
-                my_realloc_zeros_and_renumber = partial(
-                    realloc_and_renumber_array,
-                    actx, nboxes_guess, zero_fill=True, wait_for=wait_for)
-
-                resize_events = []
-
-                split_box_ids = my_realloc_nocopy(split_box_ids)
-
-                # *Most*, but not *all* of the values in this array are
-                # rewritten when the morton scan is redone. Specifically,
-                # only the box morton bin counts of boxes on the level
-                # currently being processed are written-but we need to
-                # retain the box morton bin counts from the higher levels.
-                box_morton_bin_counts, evt = my_realloc_zeros(box_morton_bin_counts)
-                resize_events.append(evt)
-
-                # force_split_box is unused unless level restriction is enabled.
-                if knl_info.level_restrict:
-                    force_split_box, evt = my_realloc_zeros(force_split_box)
-                    resize_events.append(evt)
-
-                box_srcntgt_starts, evt = my_realloc_zeros(box_srcntgt_starts)
-                resize_events.append(evt)
-
-                box_srcntgt_counts_cumul, evt = \
-                        my_realloc_zeros(box_srcntgt_counts_cumul)
-                resize_events.append(evt)
-
-                box_has_children, evt = my_realloc_zeros(box_has_children)
-                resize_events.append(evt)
-
-                box_centers, evts = zip(
-                    *(my_realloc(ary) for ary in box_centers))
-                resize_events.extend(evts)
-
-                box_child_ids, evts = zip(
-                    *(my_realloc_zeros_and_renumber(ary)
-                      for ary in box_child_ids))
-                resize_events.extend(evts)
-
-                box_parent_ids, evt = my_realloc_zeros_and_renumber(box_parent_ids)
-                resize_events.append(evt)
-
-                if not level_start_box_nrs_updated:
-                    box_levels, evt = my_realloc(box_levels)
-                    resize_events.append(evt)
-                else:
-                    box_levels, evt = my_realloc_zeros_nocopy(box_levels)
-                    wait_for_events([evt])
-                    for box_level, (level_start, level_end) in enumerate(zip(
-                            level_start_box_nrs, level_start_box_nrs[1:])):
-                        box_levels[level_start:level_end].fill(box_level)
-                    resize_events.extend(box_levels.events)
-
-                if level_start_box_nrs_updated:
-                    srcntgt_box_ids, evt = renumber_array(srcntgt_box_ids)
-                    resize_events.append(evt)
-
-                del my_realloc_zeros
-                del my_realloc_nocopy
-                del my_realloc_zeros_nocopy
-                del renumber_array
-
-                # Can't del on Py2.7 - these are used in generator expressions
-                # above, which are nested scopes
-                my_realloc = None
-                my_realloc_zeros_and_renumber = None
-
-                # retry
-                logger.info("nboxes_guess exceeded: "
-                            "enlarged allocations, restarting level")
+            del new_level_start_box_nrs
+        else:
+            from boxtree.tools import realloc_array as _realloc_array
+            realloc_array = partial(_realloc_array, actx)
+            realloc_and_renumber_array = realloc_array
+            renumber_array = None
+            level_start_box_nrs_updated = False
+            nboxes_new = nboxes_minimal
 
-                continue
+        del nboxes_minimal
 
-            # }}}
+        # }}}
 
-            logger.debug("LEVEL %d -> %d boxes" % (level, nboxes_new))
+        # {{{ reallocate and/or renumber boxes if necessary
 
-            assert (
-                level_start_box_nrs[-1] != nboxes_new
-                or srcntgts_have_extent
-                or final_level_restrict_iteration)
+        if level_start_box_nrs_updated or nboxes_new > nboxes_guess:
+            debug_with_finish("starting nboxes_guess increase")
 
-            if level_start_box_nrs[-1] == nboxes_new:
-                # We haven't created new boxes in this level loop trip.
-                #
-                # If srcntgts have extent, this can happen if boxes were
-                # in-principle overfull, but couldn't subdivide because of
-                # extent restrictions.
-                if srcntgts_have_extent and not final_level_restrict_iteration:
-                    level -= 1
-                    break
-                assert final_level_restrict_iteration
+            while nboxes_guess < nboxes_new:
+                nboxes_guess *= 2
 
-            # {{{ update level_start_box_nrs, level_used_box_counts
+            def my_realloc_nocopy(ary, shape=nboxes_guess):
+                return actx.zeros(shape=shape, dtype=ary.dtype)
 
-            level_start_box_nrs.append(nboxes_new)
-            level_start_box_nrs_dev[level + 1].fill(nboxes_new)
-            wait_for.extend(level_start_box_nrs_dev.events)
+            def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
+                return actx.zeros(shape=shape, dtype=ary.dtype)
 
-            level_used_box_counts = new_level_used_box_counts
-            level_used_box_counts_dev[:level + 1] = \
-                    np.array(level_used_box_counts, dtype=box_id_dtype)
-            wait_for.extend(level_used_box_counts_dev.events)
+            my_realloc = partial(realloc_array, nboxes_guess)
+            my_realloc_zeros = partial(
+                realloc_array, nboxes_guess, zero_fill=True)
+            my_realloc_zeros_and_renumber = partial(
+                realloc_and_renumber_array, nboxes_guess, zero_fill=True)
 
-            level_leaf_counts = new_level_leaf_counts
-            if debug:
-                for level_start, level_nboxes, leaf_count in zip(
-                        level_start_box_nrs,
-                        level_used_box_counts,
-                        level_leaf_counts):
-                    if level_nboxes == 0:
-                        assert leaf_count == 0
-                        continue
-                    nleaves_actual = level_nboxes - int(actx.to_numpy(
-                        actx.np.sum(
-                            box_has_children[level_start:level_start + level_nboxes]
-                            )
-                        ))
-                    assert leaf_count == nleaves_actual
-
-            # Can't del in Py2.7 - see note below
-            new_level_leaf_counts = None
+            split_box_ids = my_realloc_nocopy(split_box_ids)
 
-            # }}}
+            # *Most*, but not *all* of the values in this array are
+            # rewritten when the morton scan is redone. Specifically,
+            # only the box morton bin counts of boxes on the level
+            # currently being processed are written--but we need to
+            # retain the box morton bin counts from the higher levels.
+            box_morton_bin_counts = my_realloc_zeros(box_morton_bin_counts)
 
-            del nboxes_new
-            del new_level_used_box_counts
+            # force_split_box is unused unless level restriction is enabled.
+            if knl_info.level_restrict:
+                force_split_box = my_realloc_zeros(force_split_box)
 
-            # {{{ split boxes
+            box_srcntgt_starts = my_realloc_zeros(box_srcntgt_starts)
+            box_srcntgt_counts_cumul = my_realloc_zeros(box_srcntgt_counts_cumul)
+            box_has_children = my_realloc_zeros(box_has_children)
 
-            box_splitter_args = (
-                common_args
-                + (box_has_children, force_split_box, root_extent)
-                + box_child_ids
-                + box_centers)
+            box_centers = tuple([my_realloc(ary) for ary in box_centers])
+            box_child_ids = tuple([
+                my_realloc_zeros_and_renumber(ary) for ary in box_child_ids
+                ])
+            box_parent_ids = my_realloc_zeros_and_renumber(box_parent_ids)
 
-            evt = knl_info.box_splitter_kernel(*box_splitter_args,
-                    range=slice(level_start_box_nrs[-1]),
-                    wait_for=wait_for)
+            if not level_start_box_nrs_updated:
+                box_levels = my_realloc(box_levels)
+            else:
+                box_levels = my_realloc_zeros_nocopy(box_levels)
+                for box_level, (level_start, level_end) in enumerate(zip(
+                        level_start_box_nrs, level_start_box_nrs[1:])):
+                    box_levels[level_start:level_end].fill(box_level)
 
-            wait_for = [evt]
+            if level_start_box_nrs_updated:
+                srcntgt_box_ids = renumber_array(srcntgt_box_ids)
 
-            debug_with_finish("box splitter")
+            del my_realloc
+            del my_realloc_zeros
+            del my_realloc_nocopy
+            del my_realloc_zeros_nocopy
+            del my_realloc_zeros_and_renumber
+            del renumber_array
 
-            # Mark the levels of boxes added for padding (these were not updated
-            # by the box splitter kernel).
-            last_used_box = level_start_box_nrs[-2] + level_used_box_counts[-1]
-            box_levels[last_used_box:level_start_box_nrs[-1]].fill(level)
+            # retry
+            logger.info("nboxes_guess exceeded: "
+                        "enlarged allocations, restarting level")
 
-            wait_for.extend(box_levels.events)
+            continue
 
-            if debug:
-                box_levels.finish()
-                level_bl_chunk = actx.to_numpy(box_levels)[
-                        level_start_box_nrs[-2]:level_start_box_nrs[-1]]
-                assert np.all(level_bl_chunk == level)
-                del level_bl_chunk
+        # }}}
 
-            if debug:
-                assert np.all(actx.to_numpy(box_srcntgt_starts) < nsrcntgts)
+        logger.debug("LEVEL %d -> %d boxes" % (level, nboxes_new))
 
-            # }}}
+        assert (
+            level_start_box_nrs[-1] != nboxes_new
+            or srcntgts_have_extent
+            or final_level_restrict_iteration)
 
-            # {{{ renumber particles within split boxes
+        if level_start_box_nrs[-1] == nboxes_new:
+            # We haven't created new boxes in this level loop trip.
+            #
+            # If srcntgts have extent, this can happen if boxes were
+            # in-principle overfull, but couldn't subdivide because of
+            # extent restrictions.
+            if srcntgts_have_extent and not final_level_restrict_iteration:
+                level -= 1
+                break
+            assert final_level_restrict_iteration
 
-            new_user_srcntgt_ids = actx.np.zeros_like(user_srcntgt_ids)
-            new_srcntgt_box_ids = actx.np.zeros_like(srcntgt_box_ids)
+        # {{{ update level_start_box_nrs, level_used_box_counts
 
-            particle_renumberer_args = (
-                common_args
-                + (box_has_children, force_split_box,
-                   new_user_srcntgt_ids, new_srcntgt_box_ids))
+        level_start_box_nrs.append(nboxes_new)
+        level_start_box_nrs_dev[level + 1].fill(nboxes_new)
 
-            evt = knl_info.particle_renumberer_kernel(*particle_renumberer_args,
-                    range=slice(nsrcntgts), wait_for=wait_for)
+        level_used_box_counts = new_level_used_box_counts
+        level_used_box_counts_dev[:level + 1] = (
+                np.array(level_used_box_counts, dtype=box_id_dtype))
 
-            wait_for = [evt]
+        level_leaf_counts = new_level_leaf_counts
+        if debug:
+            for level_start, level_nboxes, leaf_count in zip(
+                    level_start_box_nrs,
+                    level_used_box_counts,
+                    level_leaf_counts):
+                if level_nboxes == 0:
+                    assert leaf_count == 0
+                    continue
+                nleaves_actual = level_nboxes - int(actx.to_numpy(
+                    actx.np.sum(
+                        box_has_children[level_start:level_start + level_nboxes]
+                        )
+                    ))
+                assert leaf_count == nleaves_actual
 
-            debug_with_finish("particle renumbering")
+        # Can't del in Py2.7 - see note below
+        new_level_leaf_counts = None
 
-            user_srcntgt_ids = new_user_srcntgt_ids
-            del new_user_srcntgt_ids
-            srcntgt_box_ids = new_srcntgt_box_ids
-            del new_srcntgt_box_ids
+        # }}}
 
-            # }}}
+        del nboxes_new
+        del new_level_used_box_counts
 
-            # {{{ enforce level restriction on upper levels
+        # {{{ split boxes
 
-            if final_level_restrict_iteration:
-                # Roll back level update.
-                #
-                # FIXME: The extra iteration at the end to split boxes should
-                # not be necessary. Instead, all the work for the final box
-                # split should be done in the last iteration of the level
-                # loop. Currently the main issue that forces the extra iteration
-                # to be there is the need to use the box renumbering and
-                # reallocation code. In order to fix this issue, the box
-                # numbering and reallocation code needs to be accessible after
-                # the final level restriction is done.
-                assert int(actx.to_numpy(have_oversize_split_box)) == 0
-                assert level_used_box_counts[-1] == 0
-                del level_used_box_counts[-1]
-                del level_start_box_nrs[-1]
-                level -= 1
-                break
+        box_splitter_args = (
+            common_args
+            + (box_has_children, force_split_box, root_extent)
+            + box_child_ids
+            + box_centers)
 
-            if knl_info.level_restrict:
-                # Avoid generating too many kernels.
-                LEVEL_STEP = 10  # noqa
-                if level % LEVEL_STEP == 1:
-                    level_restrict_kernel = knl_info.level_restrict_kernel_builder(
-                            LEVEL_STEP * div_ceil(level, LEVEL_STEP))
+        knl_info.box_splitter_kernel(*box_splitter_args,
+                range=slice(level_start_box_nrs[-1]),
+                queue=actx.queue,
+                )
 
-                # Upward pass - check if leaf boxes at higher levels need
-                # further splitting.
-                assert len(force_split_box) > 0
-                force_split_box.fill(0)
-                wait_for.extend(force_split_box.events)
+        debug_with_finish("box splitter")
 
-                did_upper_level_split = False
+        # Mark the levels of boxes added for padding (these were not updated
+        # by the box splitter kernel).
+        last_used_box = level_start_box_nrs[-2] + level_used_box_counts[-1]
+        box_levels[last_used_box:level_start_box_nrs[-1]].fill(level)
 
-                if debug:
-                    boxes_split = []
-
-                for upper_level, upper_level_start, upper_level_box_count in zip(
-                        # We just built level. Our parent level doesn't need to
-                        # be rechecked for splitting because the smallest boxes
-                        # in the tree (ours) already have a 2-to-1 ratio with
-                        # that. Start checking at the level above our parent.
-                        range(level - 2, 0, -1),
-                        # At this point, the last entry in level_start_box_nrs
-                        # already refers to (level + 1).
-                        level_start_box_nrs[-4::-1],
-                        level_used_box_counts[-3::-1]):
-
-                    upper_level_slice = slice(
-                        upper_level_start, upper_level_start + upper_level_box_count)
-
-                    have_upper_level_split_box.fill(0)
-                    wait_for.extend(have_upper_level_split_box.events)
-
-                    # writes: force_split_box, have_upper_level_split_box
-                    evt = level_restrict_kernel(
-                        upper_level,
-                        root_extent,
-                        box_has_children,
-                        force_split_box,
-                        have_upper_level_split_box,
-                        *(box_child_ids + box_centers),
-                        slice=upper_level_slice,
-                        wait_for=wait_for)
-
-                    wait_for = [evt]
-
-                    if debug:
-                        force_split_box.finish()
-                        boxes_split.append(int(actx.to_numpy(
-                            actx.np.sum(force_split_box[upper_level_slice])
-                            )))
-
-                    if int(actx.to_numpy(have_upper_level_split_box)) == 0:
-                        break
-
-                    did_upper_level_split = True
+        if debug:
+            level_bl_chunk = actx.to_numpy(box_levels)[
+                    level_start_box_nrs[-2]:level_start_box_nrs[-1]]
+            assert np.all(level_bl_chunk == level)
+            del level_bl_chunk
 
-                if debug:
-                    total_boxes_split = sum(boxes_split)
-                    logger.debug("level restriction: {total_boxes_split} boxes split"
-                                 .format(total_boxes_split=total_boxes_split))
-                    from itertools import count
-                    for level_, nboxes_split in zip(
-                            count(level - 2, step=-1), boxes_split[:-1]):
-                        logger.debug("level {level}: {nboxes_split} boxes split"
-                            .format(level=level_, nboxes_split=nboxes_split))
-                    del boxes_split
-
-                if (int(actx.to_numpy(have_oversize_split_box)) == 0
-                        and did_upper_level_split):
-                    # We are in the situation where there are boxes left to
-                    # split on upper levels, and the level loop is done creating
-                    # lower levels.
-                    #
-                    # We re-run the level loop one more time to finish creating
-                    # the upper level boxes.
-                    final_level_restrict_iteration = True
-                    level += 1
-                    continue
+        if debug:
+            assert np.all(actx.to_numpy(box_srcntgt_starts) < nsrcntgts)
 
-            # }}}
+        # }}}
 
-            if not int(actx.to_numpy(have_oversize_split_box)):
-                logger.debug("no boxes left to split")
-                break
+        # {{{ renumber particles within split boxes
 
-            level += 1
-            have_oversize_split_box.fill(0)
+        new_user_srcntgt_ids = actx.np.zeros_like(user_srcntgt_ids)
+        new_srcntgt_box_ids = actx.np.zeros_like(srcntgt_box_ids)
 
-            # {{{ check that nonchild part of box_morton_bin_counts is consistent
+        particle_renumberer_args = (
+            common_args
+            + (box_has_children, force_split_box,
+                new_user_srcntgt_ids, new_srcntgt_box_ids))
 
-            if debug and 0:
-                h_box_morton_bin_counts = actx.to_numpy(box_morton_bin_counts)
-                h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul)
-                h_box_child_ids = tuple([
-                    actx.to_numpy(bci) for bci in box_child_ids
-                    ])
+        knl_info.particle_renumberer_kernel(
+                *particle_renumberer_args, range=slice(nsrcntgts),
+                queue=actx.queue,
+                )
 
-                has_mismatch = False
-                for ibox in range(level_start_box_nrs[-1]):
-                    is_leaf = all(bci[ibox] == 0 for bci in h_box_child_ids)
-                    if is_leaf:
-                        # nonchild count only found in box_info kernel
-                        continue
+        debug_with_finish("particle renumbering")
 
-                    if h_box_srcntgt_counts_cumul[ibox] == 0:
-                        # empty boxes don't have box_morton_bin_counts written
-                        continue
+        user_srcntgt_ids = new_user_srcntgt_ids
+        del new_user_srcntgt_ids
+        srcntgt_box_ids = new_srcntgt_box_ids
+        del new_srcntgt_box_ids
 
-                    kid_sum = sum(
-                            h_box_srcntgt_counts_cumul[bci[ibox]]
-                            for bci in h_box_child_ids
-                            if bci[ibox] != 0)
+        # }}}
 
-                    if (
-                            h_box_srcntgt_counts_cumul[ibox]
-                            != (h_box_morton_bin_counts[ibox]["nonchild_srcntgts"]
-                                + kid_sum)):
-                        print("MISMATCH", level, ibox)
-                        has_mismatch = True
+        # {{{ enforce level restriction on upper levels
 
-                assert not has_mismatch
-                print("LEVEL %d OK" % level)
+        if final_level_restrict_iteration:
+            # Roll back level update.
+            #
+            # FIXME: The extra iteration at the end to split boxes should
+            # not be necessary. Instead, all the work for the final box
+            # split should be done in the last iteration of the level
+            # loop. Currently the main issue that forces the extra iteration
+            # to be there is the need to use the box renumbering and
+            # reallocation code. In order to fix this issue, the box
+            # numbering and reallocation code needs to be accessible after
+            # the final level restriction is done.
+            assert int(actx.to_numpy(have_oversize_split_box)) == 0
+            assert level_used_box_counts[-1] == 0
+            del level_used_box_counts[-1]
+            del level_start_box_nrs[-1]
+            level -= 1
+            break
+
+        if knl_info.level_restrict:
+            # Avoid generating too many kernels.
+            LEVEL_STEP = 10  # noqa
+            if level % LEVEL_STEP == 1:
+                level_restrict_kernel = knl_info.level_restrict_kernel_builder(
+                        LEVEL_STEP * div_ceil(level, LEVEL_STEP))
+
+            # Upward pass - check if leaf boxes at higher levels need
+            # further splitting.
+            assert len(force_split_box) > 0
+            force_split_box.fill(0)
+
+            did_upper_level_split = False
 
-                # Cannot delete in Py 2.7: referred to from nested scope.
-                h_box_srcntgt_counts_cumul = None
+            if debug:
+                boxes_split = []
+
+            for upper_level, upper_level_start, upper_level_box_count in zip(
+                    # We just built level. Our parent level doesn't need to
+                    # be rechecked for splitting because the smallest boxes
+                    # in the tree (ours) already have a 2-to-1 ratio with
+                    # that. Start checking at the level above our parent.
+                    range(level - 2, 0, -1),
+                    # At this point, the last entry in level_start_box_nrs
+                    # already refers to (level + 1).
+                    level_start_box_nrs[-4::-1],
+                    level_used_box_counts[-3::-1]):
+
+                upper_level_slice = slice(
+                    upper_level_start, upper_level_start + upper_level_box_count)
+
+                have_upper_level_split_box.fill(0)
+
+                # writes: force_split_box, have_upper_level_split_box
+                level_restrict_kernel(
+                    upper_level,
+                    root_extent,
+                    box_has_children,
+                    force_split_box,
+                    have_upper_level_split_box,
+                    *(box_child_ids + box_centers),
+                    slice=upper_level_slice,
+                    queue=actx.queue,
+                    )
 
-                del h_box_morton_bin_counts
-                del h_box_child_ids
+                if debug:
+                    boxes_split.append(int(actx.to_numpy(
+                        actx.np.sum(force_split_box[upper_level_slice])
+                        )))
 
-            # }}}
+                if int(actx.to_numpy(have_upper_level_split_box)) == 0:
+                    break
 
-        nboxes = level_start_box_nrs[-1]
+                did_upper_level_split = True
 
-        npasses = level+1
-        level_loop_proc.done("%d levels, %d boxes", level, nboxes)
-        del npasses
+            if debug:
+                total_boxes_split = sum(boxes_split)
+                logger.debug("level restriction: {total_boxes_split} boxes split"
+                                .format(total_boxes_split=total_boxes_split))
+                from itertools import count
+                for level_, nboxes_split in zip(
+                        count(level - 2, step=-1), boxes_split[:-1]):
+                    logger.debug("level {level}: {nboxes_split} boxes split"
+                        .format(level=level_, nboxes_split=nboxes_split))
+                del boxes_split
+
+            if (int(actx.to_numpy(have_oversize_split_box)) == 0
+                    and did_upper_level_split):
+                # We are in the situation where there are boxes left to
+                # split on upper levels, and the level loop is done creating
+                # lower levels.
+                #
+                # We re-run the level loop one more time to finish creating
+                # the upper level boxes.
+                final_level_restrict_iteration = True
+                level += 1
+                continue
 
         # }}}
 
-        # {{{ extract number of non-child srcntgts from box morton counts
+        if not int(actx.to_numpy(have_oversize_split_box)):
+            logger.debug("no boxes left to split")
+            break
 
-        if srcntgts_have_extent:
-            box_srcntgt_counts_nonchild = actx.empty(nboxes, particle_id_dtype)
-            debug_with_finish("extract non-child srcntgt count")
+        level += 1
+        have_oversize_split_box.fill(0)
 
-            assert len(level_start_box_nrs) >= 2
-            highest_possibly_split_box_nr = level_start_box_nrs[-2]
+        # {{{ check that nonchild part of box_morton_bin_counts is consistent
 
-            evt = knl_info.extract_nonchild_srcntgt_count_kernel(
-                    # input
-                    box_morton_bin_counts,
-                    box_srcntgt_counts_cumul,
-                    highest_possibly_split_box_nr,
+        if debug and 0:
+            h_box_morton_bin_counts = actx.to_numpy(box_morton_bin_counts)
+            h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul)
+            h_box_child_ids = tuple([
+                actx.to_numpy(bci) for bci in box_child_ids
+                ])
 
-                    # output
-                    box_srcntgt_counts_nonchild,
+            has_mismatch = False
+            for ibox in range(level_start_box_nrs[-1]):
+                is_leaf = all(bci[ibox] == 0 for bci in h_box_child_ids)
+                if is_leaf:
+                    # nonchild count only found in box_info kernel
+                    continue
 
-                    range=slice(nboxes), wait_for=wait_for)
-            wait_for = [evt]
+                if h_box_srcntgt_counts_cumul[ibox] == 0:
+                    # empty boxes don't have box_morton_bin_counts written
+                    continue
 
-            del highest_possibly_split_box_nr
+                kid_sum = sum(
+                        h_box_srcntgt_counts_cumul[bci[ibox]]
+                        for bci in h_box_child_ids
+                        if bci[ibox] != 0)
 
-            if debug:
-                h_box_srcntgt_counts_nonchild = (
-                    actx.to_numpy(box_srcntgt_counts_nonchild))
-                h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul)
+                if (
+                        h_box_srcntgt_counts_cumul[ibox]
+                        != (h_box_morton_bin_counts[ibox]["nonchild_srcntgts"]
+                            + kid_sum)):
+                    print("MISMATCH", level, ibox)
+                    has_mismatch = True
 
-                assert np.all(
-                    h_box_srcntgt_counts_nonchild
-                    <= h_box_srcntgt_counts_cumul[:nboxes])
+            assert not has_mismatch
+            print("LEVEL %d OK" % level)
 
-                del h_box_srcntgt_counts_nonchild
+            # Cannot delete in Py 2.7: referred to from nested scope.
+            h_box_srcntgt_counts_cumul = None
 
-                # Cannot delete in Py 2.7: referred to from nested scope.
-                h_box_srcntgt_counts_cumul = None
+            del h_box_morton_bin_counts
+            del h_box_child_ids
 
         # }}}
 
-        del morton_nrs
-        del box_morton_bin_counts
+    nboxes = level_start_box_nrs[-1]
 
-        # {{{ prune empty/unused leaf boxes
+    npasses = level+1
+    level_loop_proc.done("%d levels, %d boxes", level, nboxes)
+    del npasses
 
-        prune_empty_leaves = not kwargs.get("skip_prune")
-
-        if prune_empty_leaves:
-            # What is the original index of this box?
-            src_box_id = actx.empty(nboxes, box_id_dtype)
+    # }}}
 
-            # Where should I put this box?
-            #
-            # Initialize to all zeros, because pruned boxes should be mapped to
-            # zero (e.g. when pruning child_box_ids).
-            dst_box_id, evt = zeros(nboxes, box_id_dtype)
-            wait_for.append(evt)
-
-            debug_with_finish("find prune indices")
-
-            nboxes_post_prune_dev = actx.empty((), dtype=box_id_dtype)
-            evt = knl_info.find_prune_indices_kernel(
-                    box_srcntgt_counts_cumul,
-                    src_box_id, dst_box_id, nboxes_post_prune_dev,
-                    size=nboxes, wait_for=wait_for)
-            wait_for = [evt]
-            nboxes_post_prune = int(actx.to_numpy(nboxes_post_prune_dev))
-            logger.debug("{} boxes after pruning "
-                        "({} empty leaves and/or unused boxes removed)"
-                    .format(nboxes_post_prune, nboxes - nboxes_post_prune))
-            should_prune = True
-        elif knl_info.level_restrict:
-            # Remove unused boxes from the tree.
-            src_box_id = actx.empty(nboxes, box_id_dtype)
-            dst_box_id = actx.empty(nboxes, box_id_dtype)
-
-            new_level_start_box_nrs = np.zeros_like(level_start_box_nrs)
-            new_level_start_box_nrs[0] = 0
-            new_level_start_box_nrs[1:] = np.cumsum(level_used_box_counts)
-            for level_start, new_level_start, level_used_box_count in zip(
-                    level_start_box_nrs, new_level_start_box_nrs,
-                    level_used_box_counts):
+    # {{{ extract number of non-child srcntgts from box morton counts
 
-                def make_slice(start, offset=level_used_box_count):
-                    return slice(start, start + offset)
+    if srcntgts_have_extent:
+        box_srcntgt_counts_nonchild = actx.empty(nboxes, particle_id_dtype)
+        debug_with_finish("extract non-child srcntgt count")
 
-                def make_arange(start, offset=level_used_box_count):
-                    return actx.from_numpy(
-                        np.arange(start, start + offset, dtype=box_id_dtype)
-                        )
+        assert len(level_start_box_nrs) >= 2
+        highest_possibly_split_box_nr = level_start_box_nrs[-2]
 
-                src_box_id[make_slice(new_level_start)] = make_arange(level_start)
-                dst_box_id[make_slice(level_start)] = make_arange(new_level_start)
-            wait_for.extend(src_box_id.events + dst_box_id.events)
+        knl_info.extract_nonchild_srcntgt_count_kernel(
+                # input
+                box_morton_bin_counts,
+                box_srcntgt_counts_cumul,
+                highest_possibly_split_box_nr,
 
-            nboxes_post_prune = new_level_start_box_nrs[-1]
+                # output
+                box_srcntgt_counts_nonchild,
 
-            logger.info("{} boxes after pruning ({} unused boxes removed)"
-                    .format(nboxes_post_prune, nboxes - nboxes_post_prune))
-            should_prune = True
-        else:
-            should_prune = False
+                range=slice(nboxes),
+                queue=actx.queue,
+                )
 
-        if should_prune:
-            prune_events = []
+        del highest_possibly_split_box_nr
 
-            prune_empty = partial(self.gappy_copy_and_map,
-                    actx, nboxes_post_prune,
-                    src_indices=src_box_id,
-                    range=slice(nboxes_post_prune), debug=debug)
+        if debug:
+            h_box_srcntgt_counts_nonchild = (
+                actx.to_numpy(box_srcntgt_counts_nonchild))
+            h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul)
 
-            box_srcntgt_starts, evt = prune_empty(box_srcntgt_starts)
-            prune_events.append(evt)
+            assert np.all(
+                h_box_srcntgt_counts_nonchild
+                <= h_box_srcntgt_counts_cumul[:nboxes])
 
-            box_srcntgt_counts_cumul, evt = prune_empty(box_srcntgt_counts_cumul)
-            prune_events.append(evt)
+            del h_box_srcntgt_counts_nonchild
 
-            if debug and prune_empty_leaves:
-                assert np.all(actx.to_numpy(box_srcntgt_counts_cumul) > 0)
+            # Cannot delete in Py 2.7: referred to from nested scope.
+            h_box_srcntgt_counts_cumul = None
 
-            srcntgt_box_ids, evt = self.map_values_kernel(
-                    dst_box_id, srcntgt_box_ids)
-            prune_events.append(evt)
+    # }}}
 
-            box_parent_ids, evt = prune_empty(box_parent_ids, map_values=dst_box_id)
-            prune_events.append(evt)
+    del morton_nrs
+    del box_morton_bin_counts
 
-            box_levels, evt = prune_empty(box_levels)
-            prune_events.append(evt)
+    # {{{ prune empty/unused leaf boxes
 
-            if srcntgts_have_extent:
-                box_srcntgt_counts_nonchild, evt = prune_empty(
-                        box_srcntgt_counts_nonchild)
-                prune_events.append(evt)
+    prune_empty_leaves = not kwargs.get("skip_prune")
 
-            box_has_children, evt = prune_empty(box_has_children)
-            prune_events.append(evt)
+    if prune_empty_leaves:
+        # What is the original index of this box?
+        src_box_id = actx.empty(nboxes, box_id_dtype)
 
-            box_child_ids, evts = zip(
-                *(prune_empty(ary, map_values=dst_box_id)
-                  for ary in box_child_ids))
-            prune_events.extend(evts)
+        # Where should I put this box?
+        #
+        # Initialize to all zeros, because pruned boxes should be mapped to
+        # zero (e.g. when pruning child_box_ids).
+        dst_box_id = actx.zeros(nboxes, box_id_dtype)
+
+        debug_with_finish("find prune indices")
+
+        nboxes_post_prune_dev = actx.empty((), dtype=box_id_dtype)
+        knl_info.find_prune_indices_kernel(
+                box_srcntgt_counts_cumul,
+                src_box_id, dst_box_id, nboxes_post_prune_dev,
+                size=nboxes,
+                queue=actx.queue,
+                allocator=actx.allocator,
+                )
+        nboxes_post_prune = int(actx.to_numpy(nboxes_post_prune_dev).item())
+
+        logger.debug(
+            "%d boxes after pruning (%d empty leaves and/or unused boxes removed)",
+            nboxes_post_prune, nboxes - nboxes_post_prune)
+        should_prune = True
+
+    elif knl_info.level_restrict:
+        # Remove unused boxes from the tree.
+        src_box_id = actx.empty(nboxes, box_id_dtype)
+        dst_box_id = actx.empty(nboxes, box_id_dtype)
+
+        new_level_start_box_nrs = np.zeros_like(level_start_box_nrs)
+        new_level_start_box_nrs[0] = 0
+        new_level_start_box_nrs[1:] = np.cumsum(level_used_box_counts)
+        for level_start, new_level_start, level_used_box_count in zip(
+                level_start_box_nrs, new_level_start_box_nrs,
+                level_used_box_counts):
+
+            def make_slice(start, offset=level_used_box_count):
+                return slice(start, start + offset)
+
+            def make_arange(start, offset=level_used_box_count):
+                return actx.from_numpy(
+                    np.arange(start, start + offset, dtype=box_id_dtype)
+                    )
 
-            box_centers, evts = zip(
-                *(prune_empty(ary) for ary in box_centers))
-            prune_events.extend(evts)
+            src_box_id[make_slice(new_level_start)] = make_arange(level_start)
+            dst_box_id[make_slice(level_start)] = make_arange(new_level_start)
 
-            # Update box counts and level start box indices.
-            box_levels.finish()
+        nboxes_post_prune = new_level_start_box_nrs[-1]
 
-            evt = knl_info.find_level_box_counts_kernel(
-                box_levels, level_used_box_counts_dev)
-            wait_for_events([evt])
+        logger.info("%d boxes after pruning (%d unused boxes removed)",
+                nboxes_post_prune, nboxes - nboxes_post_prune)
+        should_prune = True
 
-            nlevels = len(level_used_box_counts)
-            level_used_box_counts = (
-                actx.to_numpy(level_used_box_counts_dev[:nlevels]))
+    else:
+        should_prune = False
 
-            level_start_box_nrs = [0]
-            level_start_box_nrs.extend(np.cumsum(level_used_box_counts))
+    if should_prune:
+        prune_empty = partial(copy_and_map_gappy,
+                actx, nboxes_post_prune,
+                src_indices=src_box_id,
+                range=slice(nboxes_post_prune), debug=debug)
 
-            level_start_box_nrs_dev[:nlevels + 1] = np.array(
-                level_start_box_nrs, dtype=box_id_dtype)
-            prune_events.extend(level_start_box_nrs_dev.events)
+        box_srcntgt_starts = prune_empty(box_srcntgt_starts)
+        box_srcntgt_counts_cumul = prune_empty(box_srcntgt_counts_cumul)
 
-            wait_for = prune_events
-        else:
-            logger.info("skipping empty-leaf pruning")
-            nboxes_post_prune = nboxes
+        if debug and prune_empty_leaves:
+            assert np.all(actx.to_numpy(box_srcntgt_counts_cumul) > 0)
 
-        level_start_box_nrs = np.array(level_start_box_nrs, box_id_dtype)
+        srcntgt_box_ids = map_values(actx, dst_box_id, srcntgt_box_ids)
+        box_parent_ids = prune_empty(box_parent_ids, mapping=dst_box_id)
+        box_levels = prune_empty(box_levels)
 
-        # }}}
+        if srcntgts_have_extent:
+            box_srcntgt_counts_nonchild = (
+                prune_empty(box_srcntgt_counts_nonchild))
 
-        del nboxes
+        box_has_children = prune_empty(box_has_children)
 
-        # {{{ compute source/target particle indices and counts in each box
+        box_child_ids = tuple([
+            prune_empty(ary, mapping=dst_box_id)
+            for ary in box_child_ids])
+        box_centers = tuple([prune_empty(ary) for ary in box_centers])
 
-        if targets is None:
-            from boxtree.tools import reverse_index_array
-            user_source_ids = user_srcntgt_ids
-            sorted_target_ids = reverse_index_array(actx, user_srcntgt_ids)
+        # Update box counts and level start box indices.
+        knl_info.find_level_box_counts_kernel(
+            box_levels, level_used_box_counts_dev,
+            queue=actx.queue,
+            allocator=actx.allocator,
+            )
 
-            box_source_starts = box_target_starts = box_srcntgt_starts
-            box_source_counts_cumul = box_target_counts_cumul = \
-                    box_srcntgt_counts_cumul
-            if srcntgts_have_extent:
-                box_source_counts_nonchild = box_target_counts_nonchild = \
-                        box_srcntgt_counts_nonchild
-        else:
-            source_numbers = actx.empty(nsrcntgts, particle_id_dtype)
-
-            debug_with_finish("source counter")
-            evt = knl_info.source_counter(user_srcntgt_ids, nsources,
-                    source_numbers, queue=actx.queue, allocator=actx.allocator,
-                    wait_for=wait_for)
-            wait_for = [evt]
-
-            user_source_ids = actx.empty(nsources, particle_id_dtype)
-            # srcntgt_target_ids is temporary until particle permutation is done
-            srcntgt_target_ids = actx.empty(ntargets, particle_id_dtype)
-            sorted_target_ids = actx.empty(ntargets, particle_id_dtype)
-
-            # need to use zeros because parent boxes won't be initialized
-            box_source_starts, evt = zeros(nboxes_post_prune, particle_id_dtype)
-            wait_for.append(evt)
-            box_source_counts_cumul, evt = zeros(
-                    nboxes_post_prune, particle_id_dtype)
-            wait_for.append(evt)
-            box_target_starts, evt = zeros(
-                    nboxes_post_prune, particle_id_dtype)
-            wait_for.append(evt)
-            box_target_counts_cumul, evt = zeros(
-                    nboxes_post_prune, particle_id_dtype)
-            wait_for.append(evt)
-
-            if srcntgts_have_extent:
-                box_source_counts_nonchild, evt = zeros(
-                        nboxes_post_prune, particle_id_dtype)
-                wait_for.append(evt)
-                box_target_counts_nonchild, evt = zeros(
-                        nboxes_post_prune, particle_id_dtype)
-                wait_for.append(evt)
-
-            debug_with_finish("source and target index finder")
-            evt = knl_info.source_and_target_index_finder(*(
-                # input:
-                (
-                    user_srcntgt_ids, nsources, srcntgt_box_ids,
-                    box_parent_ids,
+        nlevels = len(level_used_box_counts)
+        level_used_box_counts = (
+            actx.to_numpy(level_used_box_counts_dev[:nlevels]))
 
-                    box_srcntgt_starts, box_srcntgt_counts_cumul,
-                    source_numbers,
-                )
-                + ((box_srcntgt_counts_nonchild,)
-                    if srcntgts_have_extent else ())
+        level_start_box_nrs = [0]
+        level_start_box_nrs.extend(np.cumsum(level_used_box_counts))
 
-                # output:
-                + (
-                    user_source_ids, srcntgt_target_ids, sorted_target_ids,
-                    box_source_starts, box_source_counts_cumul,
-                    box_target_starts, box_target_counts_cumul,
-                    )
-                + ((
-                    box_source_counts_nonchild,
-                    box_target_counts_nonchild,
-                    ) if srcntgts_have_extent else ())
-                ),
-                queue=actx.queue, range=slice(nsrcntgts),
-                wait_for=wait_for)
-            wait_for = [evt]
+        level_start_box_nrs_dev[:nlevels + 1] = np.array(
+            level_start_box_nrs, dtype=box_id_dtype)
+    else:
+        logger.info("skipping empty-leaf pruning")
+        nboxes_post_prune = nboxes
 
-            if srcntgts_have_extent:
-                if debug:
-                    assert np.all(actx.to_numpy(
-                        box_srcntgt_counts_nonchild
-                        == (box_source_counts_nonchild + box_target_counts_nonchild)
-                        ))
+    level_start_box_nrs = np.array(level_start_box_nrs, box_id_dtype)
 
-            if debug:
-                usi_host = actx.to_numpy(user_source_ids)
-                assert np.all(usi_host < nsources)
-                assert np.all(0 <= usi_host)
-                del usi_host
+    # }}}
 
-                sti_host = actx.to_numpy(srcntgt_target_ids)
-                assert np.all(sti_host < nsources+ntargets)
-                assert np.all(nsources <= sti_host)
-                del sti_host
+    del nboxes
 
-                assert np.all(actx.to_numpy(
-                    box_source_counts_cumul + box_target_counts_cumul
-                    == box_srcntgt_counts_cumul
-                    ))
+    # {{{ compute source/target particle indices and counts in each box
 
-            del source_numbers
+    if targets is None:
+        from boxtree.tools import reverse_index_array
+        user_source_ids = user_srcntgt_ids
+        sorted_target_ids = reverse_index_array(actx, user_srcntgt_ids)
 
-        del box_srcntgt_starts
+        box_source_starts = box_target_starts = box_srcntgt_starts
+        box_source_counts_cumul = box_target_counts_cumul = \
+                box_srcntgt_counts_cumul
         if srcntgts_have_extent:
-            del box_srcntgt_counts_nonchild
+            box_source_counts_nonchild = box_target_counts_nonchild = \
+                    box_srcntgt_counts_nonchild
+    else:
+        source_numbers = actx.empty(nsrcntgts, particle_id_dtype)
+
+        debug_with_finish("source counter")
+        knl_info.source_counter(
+                user_srcntgt_ids, nsources, source_numbers,
+                queue=actx.queue,
+                allocator=actx.allocator,
+                )
 
-        # }}}
+        user_source_ids = actx.empty(nsources, particle_id_dtype)
+        # srcntgt_target_ids is temporary until particle permutation is done
+        srcntgt_target_ids = actx.empty(ntargets, particle_id_dtype)
+        sorted_target_ids = actx.empty(ntargets, particle_id_dtype)
+
+        # need to use zeros because parent boxes won't be initialized
+        box_source_starts = actx.zeros(nboxes_post_prune, particle_id_dtype)
+        box_source_counts_cumul = actx.zeros(nboxes_post_prune, particle_id_dtype)
+        box_target_starts = actx.zeros(nboxes_post_prune, particle_id_dtype)
+        box_target_counts_cumul = actx.zeros(nboxes_post_prune, particle_id_dtype)
 
-        # {{{ permute and source/target-split (if necessary) particle array
+        if srcntgts_have_extent:
+            box_source_counts_nonchild = (
+                    actx.zeros(nboxes_post_prune, particle_id_dtype))
+            box_target_counts_nonchild = (
+                    actx.zeros(nboxes_post_prune, particle_id_dtype))
+
+        debug_with_finish("source and target index finder")
+        knl_info.source_and_target_index_finder(*(
+            # input:
+            (
+                user_srcntgt_ids, nsources, srcntgt_box_ids,
+                box_parent_ids,
+
+                box_srcntgt_starts, box_srcntgt_counts_cumul,
+                source_numbers,
+            )
+            + ((box_srcntgt_counts_nonchild,)
+                if srcntgts_have_extent else ())
+
+            # output:
+            + (
+                user_source_ids, srcntgt_target_ids, sorted_target_ids,
+                box_source_starts, box_source_counts_cumul,
+                box_target_starts, box_target_counts_cumul,
+                )
+            + ((
+                box_source_counts_nonchild,
+                box_target_counts_nonchild,
+                ) if srcntgts_have_extent else ())
+            ),
+            queue=actx.queue, range=slice(nsrcntgts),
+            )
 
-        if targets is None:
-            sources = targets = actx.np.zeros_like(srcntgts)
+        if srcntgts_have_extent:
+            if debug:
+                assert np.all(actx.to_numpy(
+                    box_srcntgt_counts_nonchild
+                    == (box_source_counts_nonchild + box_target_counts_nonchild)
+                    ))
 
-            debug_with_finish("srcntgt permuter (particles)")
-            evt = knl_info.srcntgt_permuter(
-                    user_srcntgt_ids,
-                    *(tuple(srcntgts) + tuple(sources)),
-                    wait_for=wait_for)
-            wait_for = [evt]
+        if debug:
+            usi_host = actx.to_numpy(user_source_ids)
+            assert np.all(usi_host < nsources)
+            assert np.all(0 <= usi_host)
+            del usi_host
 
-            assert srcntgt_radii is None
+            sti_host = actx.to_numpy(srcntgt_target_ids)
+            assert np.all(sti_host < nsources+ntargets)
+            assert np.all(nsources <= sti_host)
+            del sti_host
 
-        else:
-            sources = make_obj_array([
-                actx.empty(nsources, coord_dtype) for i in range(dimensions)
-                ])
-            debug_with_finish("srcntgt permuter (sources)")
-            evt = knl_info.srcntgt_permuter(
-                    user_source_ids,
-                    *(tuple(srcntgts) + tuple(sources)),
-                    queue=actx.queue, range=slice(nsources),
-                    wait_for=wait_for)
-            wait_for = [evt]
-
-            targets = make_obj_array([
-                actx.empty(ntargets, coord_dtype) for i in range(dimensions)
-                ])
-            debug_with_finish("srcntgt permuter (targets)")
-            evt = knl_info.srcntgt_permuter(
-                    srcntgt_target_ids,
-                    *(tuple(srcntgts) + tuple(targets)),
-                    queue=actx.queue, range=slice(ntargets),
-                    wait_for=wait_for)
-            wait_for = [evt]
+            assert np.all(actx.to_numpy(
+                box_source_counts_cumul + box_target_counts_cumul
+                == box_srcntgt_counts_cumul
+                ))
 
-            if srcntgt_radii is not None:
-                import pyopencl.array as cl_array
-                debug_with_finish("srcntgt permuter (source radii)")
-                source_radii = cl_array.take(
-                        srcntgt_radii, user_source_ids, queue=actx.queue,
-                        wait_for=wait_for)
+        del source_numbers
 
-                debug_with_finish("srcntgt permuter (target radii)")
-                target_radii = cl_array.take(
-                        srcntgt_radii, srcntgt_target_ids, queue=actx.queue,
-                        wait_for=wait_for)
+    del box_srcntgt_starts
+    if srcntgts_have_extent:
+        del box_srcntgt_counts_nonchild
 
-                wait_for = source_radii.events + target_radii.events
+    # }}}
 
-            del srcntgt_target_ids
+    # {{{ permute and source/target-split (if necessary) particle array
 
-        del srcntgt_radii
+    if targets is None:
+        sources = targets = actx.np.zeros_like(srcntgts)
 
-        # }}}
+        debug_with_finish("srcntgt permuter (particles)")
+        knl_info.srcntgt_permuter(
+                user_srcntgt_ids,
+                *(tuple(srcntgts) + tuple(sources)),
+                queue=actx.queue,
+                )
 
-        del srcntgts
+        assert srcntgt_radii is None
+
+    else:
+        sources = make_obj_array([
+            actx.empty(nsources, coord_dtype) for i in range(dimensions)
+            ])
+        debug_with_finish("srcntgt permuter (sources)")
+        knl_info.srcntgt_permuter(
+                user_source_ids,
+                *(tuple(srcntgts) + tuple(sources)),
+                queue=actx.queue, range=slice(nsources),
+                )
 
-        nlevels = len(level_start_box_nrs) - 1
+        targets = make_obj_array([
+            actx.empty(ntargets, coord_dtype) for i in range(dimensions)
+            ])
+        debug_with_finish("srcntgt permuter (targets)")
+        knl_info.srcntgt_permuter(
+                srcntgt_target_ids,
+                *(tuple(srcntgts) + tuple(targets)),
+                queue=actx.queue, range=slice(ntargets),
+                )
 
-        assert nlevels == len(level_used_box_counts)
-        assert level + 1 == nlevels, (level+1, nlevels)
-        if debug:
-            max_level = np.max(actx.to_numpy(box_levels))
-            assert max_level + 1 == nlevels
+        if srcntgt_radii is not None:
+            debug_with_finish("srcntgt permuter (source radii)")
+            source_radii = srcntgt_radii[user_source_ids]
 
-        # {{{ gather box child ids, box centers
+            debug_with_finish("srcntgt permuter (target radii)")
+            target_radii = srcntgt_radii[srcntgt_target_ids]
 
-        # A number of arrays below are nominally 2-dimensional and stored with
-        # the box index as the fastest-moving index. To make sure that accesses
-        # remain aligned, we round up the number of boxes used for indexing.
-        aligned_nboxes = div_ceil(nboxes_post_prune, 32)*32
+        del srcntgt_target_ids
 
-        box_child_ids_new, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype)
-        wait_for.append(evt)
-        box_centers_new = actx.empty((dimensions, aligned_nboxes), coord_dtype)
+    del srcntgt_radii
 
-        for mnr, child_row in enumerate(box_child_ids):
-            box_child_ids_new[mnr, :nboxes_post_prune] = \
-                    child_row[:nboxes_post_prune]
-        wait_for.extend(box_child_ids_new.events)
+    # }}}
 
-        for dim, center_row in enumerate(box_centers):
-            box_centers_new[dim, :nboxes_post_prune] = center_row[:nboxes_post_prune]
-        wait_for.extend(box_centers_new.events)
+    del srcntgts
 
-        wait_for_events(wait_for)
+    nlevels = len(level_start_box_nrs) - 1
 
-        box_centers = box_centers_new
-        box_child_ids = box_child_ids_new
+    assert nlevels == len(level_used_box_counts)
+    assert level + 1 == nlevels, (level+1, nlevels)
+    if debug:
+        max_level = np.max(actx.to_numpy(box_levels))
+        assert max_level + 1 == nlevels
 
-        del box_centers_new
-        del box_child_ids_new
+    # {{{ gather box child ids, box centers
 
-        # }}}
+    # A number of arrays below are nominally 2-dimensional and stored with
+    # the box index as the fastest-moving index. To make sure that accesses
+    # remain aligned, we round up the number of boxes used for indexing.
+    aligned_nboxes = div_ceil(nboxes_post_prune, 32)*32
 
-        # {{{ compute box flags
+    box_child_ids_new = actx.zeros((2**dimensions, aligned_nboxes), box_id_dtype)
+    box_centers_new = actx.empty((dimensions, aligned_nboxes), coord_dtype)
 
-        from boxtree.tree import box_flags_enum
-        box_flags = actx.empty(nboxes_post_prune, box_flags_enum.dtype)
+    for mnr, child_row in enumerate(box_child_ids):
+        box_child_ids_new[mnr, :nboxes_post_prune] = \
+                child_row[:nboxes_post_prune]
 
-        if not srcntgts_have_extent:
-            # If srcntgts_have_extent, then non-child counts have already been
-            # computed, and we have nothing to do here. But if not, then
-            # we must fill these non-child counts. This amounts to copying
-            # the cumulative counts and setting them to zero for non-leaves.
+    for dim, center_row in enumerate(box_centers):
+        box_centers_new[dim, :nboxes_post_prune] = center_row[:nboxes_post_prune]
 
-            # {{{ make sure box_{source,target}_counts_nonchild are not defined
+    box_centers = box_centers_new
+    box_child_ids = box_child_ids_new
 
-            # (before we overwrite them)
+    del box_centers_new
+    del box_child_ids_new
 
-            try:
-                box_source_counts_nonchild
-            except NameError:
-                pass
-            else:
-                raise AssertionError
+    # }}}
 
-            try:
-                box_target_counts_nonchild
-            except NameError:
-                pass
-            else:
-                raise AssertionError
+    # {{{ compute box flags
 
-            # }}}
+    from boxtree.tree import box_flags_enum
+    box_flags = actx.empty(nboxes_post_prune, box_flags_enum.dtype)
 
-            box_source_counts_nonchild, evt = zeros(
-                    nboxes_post_prune, particle_id_dtype)
-            wait_for.append(evt)
+    if not srcntgts_have_extent:
+        # If srcntgts_have_extent, then non-child counts have already been
+        # computed, and we have nothing to do here. But if not, then
+        # we must fill these non-child counts. This amounts to copying
+        # the cumulative counts and setting them to zero for non-leaves.
 
-            if sources_are_targets:
-                box_target_counts_nonchild = box_source_counts_nonchild
-            else:
-                box_target_counts_nonchild, evt = zeros(
-                        nboxes_post_prune, particle_id_dtype)
-                wait_for.append(evt)
-
-        debug_with_finish("compute box info")
-        evt = knl_info.box_info_kernel(
-                *(
-                    # input:
-                    box_parent_ids, box_srcntgt_counts_cumul,
-                    box_source_counts_cumul, box_target_counts_cumul,
-                    box_has_children, box_levels, nlevels,
-
-                    # output if srcntgts_have_extent, input+output otherwise
-                    box_source_counts_nonchild, box_target_counts_nonchild,
-
-                    # output:
-                    box_flags,
-                ),
-                range=slice(nboxes_post_prune),
-                wait_for=wait_for)
+        # {{{ make sure box_{source,target}_counts_nonchild are not defined
 
-        # }}}
+        # (before we overwrite them)
 
-        del box_has_children
-        wait_for = [evt]
+        try:
+            box_source_counts_nonchild
+        except NameError:
+            pass
+        else:
+            raise AssertionError
 
-        # {{{ compute box bounding box
+        try:
+            box_target_counts_nonchild
+        except NameError:
+            pass
+        else:
+            raise AssertionError
 
-        debug_with_finish("finding box extents")
+        # }}}
 
-        box_source_bounding_box_min = actx.empty(
-            (dimensions, aligned_nboxes), dtype=coord_dtype)
-        box_source_bounding_box_max = actx.empty(
-            (dimensions, aligned_nboxes), dtype=coord_dtype)
+        box_source_counts_nonchild = (
+                actx.zeros(nboxes_post_prune, particle_id_dtype))
 
         if sources_are_targets:
-            box_target_bounding_box_min = box_source_bounding_box_min
-            box_target_bounding_box_max = box_source_bounding_box_max
+            box_target_counts_nonchild = box_source_counts_nonchild
         else:
-            box_target_bounding_box_min = actx.empty(
-                    (dimensions, aligned_nboxes), dtype=coord_dtype)
-            box_target_bounding_box_max = actx.empty(
-                    (dimensions, aligned_nboxes), dtype=coord_dtype)
+            box_target_counts_nonchild = (
+                    actx.zeros(nboxes_post_prune, particle_id_dtype))
 
-        bogus_radii_array = actx.empty(1, dtype=coord_dtype)
+    debug_with_finish("compute box info")
+    knl_info.box_info_kernel(
+            *(
+                # input:
+                box_parent_ids, box_srcntgt_counts_cumul,
+                box_source_counts_cumul, box_target_counts_cumul,
+                box_has_children, box_levels, nlevels,
 
-        # nlevels-1 is the highest valid level index
-        for level in range(nlevels-1, -1, -1):
-            start, stop = level_start_box_nrs[level:level+2]
+                # output if srcntgts_have_extent, input+output otherwise
+                box_source_counts_nonchild, box_target_counts_nonchild,
 
-            for (skip, enable_radii, box_bounding_box_min, box_bounding_box_max,
-                    pstarts, pcounts, particle_radii, particles) in [
-                    (
-                        # never skip
-                        False,
-
-                        sources_have_extent,
-                        box_source_bounding_box_min,
-                        box_source_bounding_box_max,
-                        box_source_starts,
-                        box_source_counts_nonchild,
-                        source_radii if sources_have_extent else bogus_radii_array,
-                        sources),
-                    (
-                        # skip the 'target' round if sources and targets
-                        # are the same.
-                        sources_are_targets,
-
-                        targets_have_extent,
-                        box_target_bounding_box_min,
-                        box_target_bounding_box_max,
-                        box_target_starts,
-                        box_target_counts_nonchild,
-                        target_radii if targets_have_extent else bogus_radii_array,
-                        targets),
-                    ]:
-
-                if skip:
-                    continue
+                # output:
+                box_flags,
+            ),
+            range=slice(nboxes_post_prune),
+            queue=actx.queue,
+            )
 
-                args = (
-                        (
-                            aligned_nboxes,
-                            box_child_ids,
-                            box_centers,
-                            pstarts, pcounts,)
-                        + tuple(particles)
-                        + (
-                            particle_radii,
-                            enable_radii,
+    # }}}
 
-                            box_bounding_box_min,
-                            box_bounding_box_max))
+    del box_has_children
 
-                evt = knl_info.box_extents_finder_kernel(
-                        *args,
+    # {{{ compute box bounding box
 
-                        range=slice(start, stop),
-                        queue=actx.queue, wait_for=wait_for)
+    debug_with_finish("finding box extents")
 
-            wait_for = [evt]
+    box_source_bounding_box_min = actx.empty(
+        (dimensions, aligned_nboxes), dtype=coord_dtype)
+    box_source_bounding_box_max = actx.empty(
+        (dimensions, aligned_nboxes), dtype=coord_dtype)
 
-        del bogus_radii_array
+    if sources_are_targets:
+        box_target_bounding_box_min = box_source_bounding_box_min
+        box_target_bounding_box_max = box_source_bounding_box_max
+    else:
+        box_target_bounding_box_min = actx.empty(
+                (dimensions, aligned_nboxes), dtype=coord_dtype)
+        box_target_bounding_box_max = actx.empty(
+                (dimensions, aligned_nboxes), dtype=coord_dtype)
 
-        # }}}
+    bogus_radii_array = actx.empty(1, dtype=coord_dtype)
 
-        # {{{ build output
+    # nlevels-1 is the highest valid level index
+    for level in range(nlevels-1, -1, -1):
+        start, stop = level_start_box_nrs[level:level+2]
 
-        extra_tree_attrs = {}
+        for (skip, enable_radii, box_bounding_box_min, box_bounding_box_max,
+                pstarts, pcounts, particle_radii, particles) in [
+                (
+                    # never skip
+                    False,
 
-        if sources_have_extent:
-            extra_tree_attrs.update(source_radii=source_radii)
-        else:
-            extra_tree_attrs.update(source_radii=None)
+                    sources_have_extent,
+                    box_source_bounding_box_min,
+                    box_source_bounding_box_max,
+                    box_source_starts,
+                    box_source_counts_nonchild,
+                    source_radii if sources_have_extent else bogus_radii_array,
+                    sources),
+                (
+                    # skip the 'target' round if sources and targets
+                    # are the same.
+                    sources_are_targets,
+
+                    targets_have_extent,
+                    box_target_bounding_box_min,
+                    box_target_bounding_box_max,
+                    box_target_starts,
+                    box_target_counts_nonchild,
+                    target_radii if targets_have_extent else bogus_radii_array,
+                    targets),
+                ]:
 
-        if targets_have_extent:
-            extra_tree_attrs.update(target_radii=target_radii)
-        else:
-            extra_tree_attrs.update(target_radii=None)
+            if skip:
+                continue
 
-        tree_build_proc.done(
-                "%d levels, %d boxes, %d particles, box extent norm: %s, "
-                "max_leaf_refine_weight: %d",
-                nlevels, len(box_parent_ids), nsrcntgts, srcntgts_extent_norm,
-                max_leaf_refine_weight)
+            args = (
+                    (
+                        aligned_nboxes,
+                        box_child_ids,
+                        box_centers,
+                        pstarts, pcounts,)
+                    + tuple(particles)
+                    + (
+                        particle_radii,
+                        enable_radii,
+
+                        box_bounding_box_min,
+                        box_bounding_box_max))
+
+            knl_info.box_extents_finder_kernel(
+                    *args, range=slice(start, stop),
+                    queue=actx.queue,
+                    )
 
-        tree = Tree(
-                # If you change this, also change the documentation
-                # of what's in the tree, above.
+    del bogus_radii_array
 
-                sources_are_targets=sources_are_targets,
-                sources_have_extent=sources_have_extent,
-                targets_have_extent=targets_have_extent,
+    # }}}
 
-                particle_id_dtype=knl_info.particle_id_dtype,
-                box_id_dtype=knl_info.box_id_dtype,
-                coord_dtype=coord_dtype,
-                box_level_dtype=self.box_level_dtype,
+    # {{{ build output
 
-                root_extent=root_extent,
-                stick_out_factor=stick_out_factor,
-                extent_norm=srcntgts_extent_norm,
+    extra_tree_attrs = {}
 
-                bounding_box=(bbox_min, bbox_max),
-                level_start_box_nrs=actx.from_numpy(level_start_box_nrs),
+    if sources_have_extent:
+        extra_tree_attrs.update(source_radii=source_radii)
+    else:
+        extra_tree_attrs.update(source_radii=None)
 
-                sources=sources,
-                targets=targets,
+    if targets_have_extent:
+        extra_tree_attrs.update(target_radii=target_radii)
+    else:
+        extra_tree_attrs.update(target_radii=None)
 
-                box_source_starts=box_source_starts,
-                box_source_counts_nonchild=box_source_counts_nonchild,
-                box_source_counts_cumul=box_source_counts_cumul,
-                box_target_starts=box_target_starts,
-                box_target_counts_nonchild=box_target_counts_nonchild,
-                box_target_counts_cumul=box_target_counts_cumul,
+    tree_build_proc.done(
+            "%d levels, %d boxes, %d particles, box extent norm: %s, "
+            "max_leaf_refine_weight: %d",
+            nlevels, len(box_parent_ids), nsrcntgts, srcntgts_extent_norm,
+            max_leaf_refine_weight)
 
-                box_parent_ids=box_parent_ids,
-                box_child_ids=box_child_ids,
-                box_centers=box_centers,
-                box_levels=box_levels,
-                box_flags=box_flags,
+    tree = Tree(
+            # If you change this, also change the documentation
+            # of what's in the tree, above.
 
-                user_source_ids=user_source_ids,
-                sorted_target_ids=sorted_target_ids,
+            sources_are_targets=sources_are_targets,
+            sources_have_extent=sources_have_extent,
+            targets_have_extent=targets_have_extent,
 
-                box_source_bounding_box_min=box_source_bounding_box_min,
-                box_source_bounding_box_max=box_source_bounding_box_max,
-                box_target_bounding_box_min=box_target_bounding_box_min,
-                box_target_bounding_box_max=box_target_bounding_box_max,
+            particle_id_dtype=knl_info.particle_id_dtype,
+            box_id_dtype=knl_info.box_id_dtype,
+            coord_dtype=coord_dtype,
+            box_level_dtype=box_level_dtype,
 
-                _is_pruned=prune_empty_leaves,
+            root_extent=root_extent,
+            stick_out_factor=stick_out_factor,
+            extent_norm=srcntgts_extent_norm,
 
-                **extra_tree_attrs
-                )
+            bounding_box=(bbox_min, bbox_max),
+            level_start_box_nrs=actx.from_numpy(level_start_box_nrs),
 
-        return actx.freeze(tree), evt
+            sources=sources,
+            targets=targets,
 
-        # }}}
+            box_source_starts=box_source_starts,
+            box_source_counts_nonchild=box_source_counts_nonchild,
+            box_source_counts_cumul=box_source_counts_cumul,
+            box_target_starts=box_target_starts,
+            box_target_counts_nonchild=box_target_counts_nonchild,
+            box_target_counts_cumul=box_target_counts_cumul,
+
+            box_parent_ids=box_parent_ids,
+            box_child_ids=box_child_ids,
+            box_centers=box_centers,
+            box_levels=box_levels,
+            box_flags=box_flags,
+
+            user_source_ids=user_source_ids,
+            sorted_target_ids=sorted_target_ids,
+
+            box_source_bounding_box_min=box_source_bounding_box_min,
+            box_source_bounding_box_max=box_source_bounding_box_max,
+            box_target_bounding_box_min=box_target_bounding_box_min,
+            box_target_bounding_box_max=box_target_bounding_box_max,
+
+            root_extent_stretch_factor=root_extent_stretch_factor,
+            _is_pruned=prune_empty_leaves,
+
+            **extra_tree_attrs
+            )
+
+    return actx.freeze(tree)
 
     # }}}
 
+# }}}
+
 # vim: foldmethod=marker:filetype=pyopencl
diff --git a/boxtree/tree_build_kernels.py b/boxtree/tree_build_kernels.py
index 2e5fa295..7f3b5803 100644
--- a/boxtree/tree_build_kernels.py
+++ b/boxtree/tree_build_kernels.py
@@ -122,7 +122,7 @@
 
 
 @dataclass(frozen=True)
-class _KernelInfo:
+class TreeBuildKernelInfo:
     particle_id_dtype: np.dtype
     box_id_dtype: np.dtype
     morton_bin_count_dtype: np.dtype
@@ -1834,7 +1834,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
 
     # }}}
 
-    return _KernelInfo(
+    return TreeBuildKernelInfo(
             particle_id_dtype=particle_id_dtype,
             box_id_dtype=box_id_dtype,
             morton_bin_count_dtype=morton_bin_count_dtype,
diff --git a/doc/Makefile b/doc/Makefile
index c45814ac..d0ac5f2f 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -1,130 +1,20 @@
-# Makefile for Sphinx documentation
+# Minimal makefile for Sphinx documentation
 #
 
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = python $(shell which sphinx-build)
-PAPER         =
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= python $(shell which sphinx-build)
+SOURCEDIR     = .
 BUILDDIR      = _build
 
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest
-
+# Put it first so that "make" without argument is like "make help".
 help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html       to make standalone HTML files"
-	@echo "  dirhtml    to make HTML files named index.html in directories"
-	@echo "  singlehtml to make a single large HTML file"
-	@echo "  pickle     to make pickle files"
-	@echo "  json       to make JSON files"
-	@echo "  htmlhelp   to make HTML files and a HTML help project"
-	@echo "  qthelp     to make HTML files and a qthelp project"
-	@echo "  devhelp    to make HTML files and a Devhelp project"
-	@echo "  epub       to make an epub"
-	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
-	@echo "  text       to make text files"
-	@echo "  man        to make manual pages"
-	@echo "  changes    to make an overview of all changed/added/deprecated items"
-	@echo "  linkcheck  to check all external links for integrity"
-	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
-
-clean:
-	-rm -rf $(BUILDDIR)/*
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
-	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
-	@echo
-	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
-	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
-	@echo
-	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
-	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/boxtree.qhcp"
-	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/boxtree.qhc"
-
-devhelp:
-	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
-	@echo
-	@echo "Build finished."
-	@echo "To view the help file:"
-	@echo "# mkdir -p $$HOME/.local/share/devhelp/boxtree"
-	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/boxtree"
-	@echo "# devhelp"
-
-epub:
-	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
-	@echo
-	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
-
-latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo
-	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
-	@echo "Run \`make' in that directory to run these through (pdf)latex" \
-	      "(use \`make latexpdf' here to do that automatically)."
-
-latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through pdflatex..."
-	make -C $(BUILDDIR)/latex all-pdf
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-text:
-	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
-	@echo
-	@echo "Build finished. The text files are in $(BUILDDIR)/text."
-
-man:
-	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
-	@echo
-	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
-
-changes:
-	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
-	@echo
-	@echo "The overview file is in $(BUILDDIR)/changes."
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
-linkcheck:
-	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
-	@echo
-	@echo "Link check complete; look for any errors in the above output " \
-	      "or in $(BUILDDIR)/linkcheck/output.txt."
+.PHONY: help Makefile
 
-doctest:
-	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
-	@echo "Testing of doctests in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/doctest/output.txt."
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/doc/conf.py b/doc/conf.py
index 07572b6c..8a9553ac 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -18,5 +18,6 @@
         "https://docs.python.org/3/": None,
         "https://numpy.org/doc/stable/": None,
         "https://documen.tician.de/pyopencl/": None,
+        "https://documen.tician.de/arraycontext/": None,
         "https://documen.tician.de/pytential/": None,
         }
diff --git a/doc/tools.rst b/doc/tools.rst
index 6db9bc70..0b5225ee 100644
--- a/doc/tools.rst
+++ b/doc/tools.rst
@@ -4,3 +4,5 @@ Utility Functionality
 .. automodule:: boxtree.timing
 
 .. automodule:: boxtree.constant_one
+
+.. automodule:: boxtree.array_context
diff --git a/examples/cost_model.py b/examples/cost_model.py
index 14a1b8f2..d832fa0c 100644
--- a/examples/cost_model.py
+++ b/examples/cost_model.py
@@ -1,29 +1,16 @@
+import os
+
 import numpy as np
 import pyopencl as cl
-import sys
 
 import logging
-import os
-
-# Configure the root logger
 logging.basicConfig(level=os.environ.get("LOGLEVEL", "WARNING"))
-
 logger = logging.getLogger(__name__)
-
-# Set the logger level of this module to INFO so that logging outputs of this module
-# are shown
 logger.setLevel(logging.INFO)
 
-# `process_elapsed` in `ProcessTimer` is only supported for Python >= 3.3
-SUPPORTS_PROCESS_TIME = (sys.version_info >= (3, 3))
-
 
 def demo_cost_model():
-    if not SUPPORTS_PROCESS_TIME:
-        raise NotImplementedError(
-            "Currently this script uses process time which only works on Python>=3.3"
-        )
-
+    from boxtree.array_context import PyOpenCLArrayContext
     from boxtree.pyfmmlib_integration import (
             Kernel,
             FMMLibTreeIndependentDataForWrangler,
@@ -36,6 +23,7 @@ def demo_cost_model():
 
     ctx = cl.create_some_context()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)
 
     traversals = []
     traversals_dev = []
@@ -49,30 +37,25 @@ def fmm_level_to_order(tree, ilevel):
         # {{{ Generate sources, targets and target_radii
 
         from boxtree.tools import make_normal_particle_array as p_normal
-        sources = p_normal(queue, nsources, dims, dtype, seed=15)
-        targets = p_normal(queue, ntargets, dims, dtype, seed=18)
+        sources = p_normal(actx, nsources, dims, dtype, seed=15)
+        targets = p_normal(actx, ntargets, dims, dtype, seed=18)
 
-        from pyopencl.clrandom import PhiloxGenerator
-        rng = PhiloxGenerator(queue.context, seed=22)
-        target_radii = rng.uniform(
-            queue, ntargets, a=0, b=0.05, dtype=dtype
-        ).get()
+        rng = np.random.default_rng(seed=22)
+        target_radii = rng.uniform(low=0.0, high=0.05, size=ntargets)
 
         # }}}
 
         # {{{ Generate tree and traversal
 
-        from boxtree import TreeBuilder
-        tb = TreeBuilder(ctx)
-        tree, _ = tb(
-            queue, sources, targets=targets, target_radii=target_radii,
+        from boxtree import build_tree
+        tree = build_tree(
+            actx, sources, targets=targets, target_radii=target_radii,
             stick_out_factor=0.15, max_particles_in_box=30, debug=True
         )
 
-        from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2)
-        trav_dev, _ = tg(queue, tree, debug=True)
-        trav = trav_dev.get(queue=queue)
+        from boxtree.traversal import build_traversal
+        trav_dev = build_traversal(actx, tree, well_sep_is_n_away=2, debug=True)
+        trav = actx.to_numpy(trav_dev)
 
         traversals.append(trav)
         traversals_dev.append(trav_dev)
@@ -88,7 +71,7 @@ def fmm_level_to_order(tree, ilevel):
         timing_data = {}
         from boxtree.fmm import drive_fmm
         src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype)
-        drive_fmm(wrangler, (src_weights,), timing_data=timing_data)
+        drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data)
 
         timing_results.append(timing_data)
 
@@ -103,7 +86,7 @@ def fmm_level_to_order(tree, ilevel):
         traversal = traversals_dev[icase]
         model_results.append(
             cost_model.cost_per_stage(
-                queue, traversal, level_orders_list[icase],
+                actx, traversal, level_orders_list[icase],
                 FMMCostModel.get_unit_calibration_params(),
             )
         )
@@ -114,7 +97,7 @@ def fmm_level_to_order(tree, ilevel):
     )
 
     predicted_time = cost_model.cost_per_stage(
-        queue, traversals_dev[-1], level_orders_list[-1], params,
+        actx, traversals_dev[-1], level_orders_list[-1], params,
     )
     queue.finish()
 
diff --git a/examples/demo.py b/examples/demo.py
index 8105c37a..14dcd8e2 100644
--- a/examples/demo.py
+++ b/examples/demo.py
@@ -4,8 +4,11 @@
 import logging
 logging.basicConfig(level="INFO")
 
+from boxtree.array_context import PyOpenCLArrayContext
+
 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)
+actx = PyOpenCLArrayContext(queue, force_device_scalars=True)
 
 dims = 2
 nparticles = 500
@@ -13,24 +16,21 @@
 # -----------------------------------------------------------------------------
 # generate some random particle positions
 # -----------------------------------------------------------------------------
-from pyopencl.clrandom import PhiloxGenerator
-rng = PhiloxGenerator(ctx, seed=15)
-
 from pytools.obj_array import make_obj_array
+rng = np.random.default_rng(seed=15)
+
 particles = make_obj_array([
-    rng.normal(queue, nparticles, dtype=np.float64)
+    actx.from_numpy(rng.normal(size=nparticles))
     for i in range(dims)])
 
 # -----------------------------------------------------------------------------
 # build tree and traversals (lists)
 # -----------------------------------------------------------------------------
-from boxtree import TreeBuilder
-tb = TreeBuilder(ctx)
-tree, _ = tb(queue, particles, max_particles_in_box=5)
+from boxtree import build_tree
+tree = build_tree(actx, particles, max_particles_in_box=5)
 
-from boxtree.traversal import FMMTraversalBuilder
-tg = FMMTraversalBuilder(ctx)
-trav, _ = tg(queue, tree)
+from boxtree.traversal import build_traversal
+trav = build_traversal(actx, tree)
 
 # ENDEXAMPLE
 
@@ -38,12 +38,15 @@
 # plot the tree
 # -----------------------------------------------------------------------------
 
+particles = actx.to_numpy(particles)
+tree = actx.to_numpy(tree)
+
 import matplotlib.pyplot as pt
+from boxtree.visualization import TreePlotter
 
-pt.plot(particles[0].get(), particles[1].get(), "+")
+pt.plot(particles[0], particles[1], "+")
+plotter = TreePlotter(tree)
 
-from boxtree.visualization import TreePlotter
-plotter = TreePlotter(tree.get(queue=queue))
 plotter.draw_tree(fill=False, edgecolor="black")
 #plotter.draw_box_numbers()
 plotter.set_bounding_box()
diff --git a/test/test_cost_model.py b/test/test_cost_model.py
index 80fda05e..2446ca0c 100644
--- a/test/test_cost_model.py
+++ b/test/test_cost_model.py
@@ -33,6 +33,7 @@
 from arraycontext import pytest_generate_tests_for_array_contexts
 from boxtree.array_context import (                                 # noqa: F401
         PytestPyOpenCLArrayContextFactory, _acf)
+
 from boxtree.cost import FMMCostModel, _PythonFMMCostModel
 from boxtree.cost import make_pde_aware_translation_cost_model
 
@@ -58,8 +59,8 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     # {{{ Generate sources, targets and target_radii
 
     from boxtree.tools import make_normal_particle_array as p_normal
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+    sources = p_normal(actx, nsources, dims, dtype, seed=15)
+    targets = p_normal(actx, ntargets, dims, dtype, seed=18)
 
     rng = np.random.default_rng(22)
     target_radii = rng.uniform(0.0, 0.05, (ntargets,)).astype(dtype)
@@ -68,17 +69,15 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
 
     # {{{ Generate tree and traversal
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-    tree, _ = tb(
-        actx.queue, sources, targets=targets, target_radii=target_radii,
+    from boxtree import build_tree
+    tree = build_tree(
+        actx, sources, targets=targets, target_radii=target_radii,
         stick_out_factor=0.15, max_particles_in_box=30, debug=True
     )
 
-    from boxtree.traversal import FMMTraversalBuilder
-    tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
-    trav_dev, _ = tg(actx.queue, tree, debug=True)
-    trav = trav_dev.get(queue=actx.queue)
+    from boxtree.traversal import build_traversal
+    trav_dev = build_traversal(actx, tree, well_sep_is_n_away=2, debug=True)
+    trav = actx.to_numpy(trav_dev)
 
     # }}}
 
@@ -106,12 +105,12 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
             context=constant_one_params
         )
     p2m_cost_dev = actx.from_numpy(p2m_cost)
-
     actx.queue.finish()
+
     start_time = time.time()
 
     cl_form_multipoles = cl_cost_model.process_form_multipoles(
-        actx.queue, trav_dev, p2m_cost_dev
+        actx, trav_dev, p2m_cost_dev
     )
 
     actx.queue.finish()
@@ -121,7 +120,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     python_form_multipoles = python_cost_model.process_form_multipoles(
-        actx.queue, trav, p2m_cost
+        actx, trav, p2m_cost
     )
 
     logger.info("Python time for process_form_multipoles: %gs",
@@ -144,7 +143,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     actx.queue.finish()
     start_time = time.time()
     cl_coarsen_multipoles = cl_cost_model.process_coarsen_multipoles(
-        actx.queue, trav_dev, m2m_cost_dev
+        actx, trav_dev, m2m_cost_dev
     )
 
     actx.queue.finish()
@@ -154,7 +153,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     python_coarsen_multipoles = python_cost_model.process_coarsen_multipoles(
-        actx.queue, trav, m2m_cost
+        actx, trav, m2m_cost
     )
 
     logger.info("Python time for coarsen_multipoles: %gs",
@@ -170,10 +169,10 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     cl_ndirect_sources_per_target_box = \
-        cl_cost_model.get_ndirect_sources_per_target_box(actx.queue, trav_dev)
+        cl_cost_model.get_ndirect_sources_per_target_box(actx, trav_dev)
 
     cl_direct = cl_cost_model.process_direct(
-        actx.queue, trav_dev, cl_ndirect_sources_per_target_box, 5.0
+        actx, trav_dev, cl_ndirect_sources_per_target_box, 5.0
     )
 
     actx.queue.finish()
@@ -183,10 +182,10 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     python_ndirect_sources_per_target_box = \
-        python_cost_model.get_ndirect_sources_per_target_box(actx.queue, trav)
+        python_cost_model.get_ndirect_sources_per_target_box(actx, trav)
 
     python_direct = python_cost_model.process_direct(
-        actx.queue, trav, python_ndirect_sources_per_target_box, 5.0
+        actx, trav, python_ndirect_sources_per_target_box, 5.0
     )
 
     logger.info("Python time for process_direct: %gs",
@@ -200,7 +199,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
 
     start_time = time.time()
 
-    cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(cl_direct)
+    cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(actx, cl_direct)
 
     actx.queue.finish()
     logger.info("OpenCL time for aggregate_over_boxes: %gs",
@@ -208,7 +207,9 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
 
     start_time = time.time()
 
-    python_direct_aggregate = python_cost_model.aggregate_over_boxes(python_direct)
+    python_direct_aggregate = (
+        python_cost_model.aggregate_over_boxes(actx, python_direct)
+    )
 
     logger.info("Python time for aggregate_over_boxes: %gs",
             time.time() - start_time)
@@ -231,14 +232,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     actx.queue.finish()
     start_time = time.time()
 
-    cl_m2l_cost = cl_cost_model.process_list2(actx.queue, trav_dev, m2l_cost_dev)
+    cl_m2l_cost = cl_cost_model.process_list2(actx, trav_dev, m2l_cost_dev)
 
     actx.queue.finish()
     logger.info("OpenCL time for process_list2: %gs",
             time.time() - start_time)
 
     start_time = time.time()
-    python_m2l_cost = python_cost_model.process_list2(actx.queue, trav, m2l_cost)
+    python_m2l_cost = python_cost_model.process_list2(actx, trav, m2l_cost)
     logger.info("Python time for process_list2: %gs",
             time.time() - start_time)
 
@@ -259,14 +260,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     actx.queue.finish()
     start_time = time.time()
 
-    cl_m2p_cost = cl_cost_model.process_list3(actx.queue, trav_dev, m2p_cost_dev)
+    cl_m2p_cost = cl_cost_model.process_list3(actx, trav_dev, m2p_cost_dev)
 
     actx.queue.finish()
     logger.info("OpenCL time for process_list3: %gs",
             time.time() - start_time)
 
     start_time = time.time()
-    python_m2p_cost = python_cost_model.process_list3(actx.queue, trav, m2p_cost)
+    python_m2p_cost = python_cost_model.process_list3(actx, trav, m2p_cost)
     logger.info("Python time for process_list3: %gs",
             time.time() - start_time)
 
@@ -287,14 +288,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     actx.queue.finish()
     start_time = time.time()
 
-    cl_p2l_cost = cl_cost_model.process_list4(actx.queue, trav_dev, p2l_cost_dev)
+    cl_p2l_cost = cl_cost_model.process_list4(actx, trav_dev, p2l_cost_dev)
 
     actx.queue.finish()
     logger.info("OpenCL time for process_list4: %gs",
             time.time() - start_time)
 
     start_time = time.time()
-    python_p2l_cost = python_cost_model.process_list4(actx.queue, trav, p2l_cost)
+    python_p2l_cost = python_cost_model.process_list4(actx, trav, p2l_cost)
     logger.info("Python time for process_list4: %gs",
             time.time() - start_time)
 
@@ -316,7 +317,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     cl_refine_locals_cost = cl_cost_model.process_refine_locals(
-        actx.queue, trav_dev, l2l_cost_dev
+        actx, trav_dev, l2l_cost_dev
     )
 
     actx.queue.finish()
@@ -325,7 +326,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
 
     start_time = time.time()
     python_refine_locals_cost = python_cost_model.process_refine_locals(
-        actx.queue, trav, l2l_cost
+        actx, trav, l2l_cost
     )
     logger.info("Python time for refine_locals: %gs",
             time.time() - start_time)
@@ -348,7 +349,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     cl_l2p_cost = cl_cost_model.process_eval_locals(
-            actx.queue, trav_dev, l2p_cost_dev)
+            actx, trav_dev, l2p_cost_dev)
 
     actx.queue.finish()
     logger.info("OpenCL time for process_eval_locals: %gs",
@@ -356,7 +357,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
 
     start_time = time.time()
     python_l2p_cost = python_cost_model.process_eval_locals(
-            actx.queue, trav, l2p_cost)
+            actx, trav, l2p_cost)
     logger.info("Python time for process_eval_locals: %gs",
             time.time() - start_time)
 
@@ -395,8 +396,8 @@ def fmm_level_to_order(tree, ilevel):
         # {{{ Generate sources, targets and target_radii
 
         from boxtree.tools import make_normal_particle_array as p_normal
-        sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-        targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+        sources = p_normal(actx, nsources, dims, dtype, seed=15)
+        targets = p_normal(actx, ntargets, dims, dtype, seed=18)
 
         rng = np.random.default_rng(22)
         target_radii = rng.uniform(0.0, 0.05, (ntargets,)).astype(dtype)
@@ -405,17 +406,15 @@ def fmm_level_to_order(tree, ilevel):
 
         # {{{ Generate tree and traversal
 
-        from boxtree import TreeBuilder
-        tb = TreeBuilder(actx.context)
-        tree, _ = tb(
-            actx.queue, sources, targets=targets, target_radii=target_radii,
+        from boxtree import build_tree
+        tree = build_tree(
+            actx, sources, targets=targets, target_radii=target_radii,
             stick_out_factor=0.15, max_particles_in_box=30, debug=True
         )
 
-        from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
-        trav_dev, _ = tg(actx.queue, tree, debug=True)
-        trav = trav_dev.get(queue=actx.queue)
+        from boxtree.traversal import build_traversal
+        trav_dev = build_traversal(actx, tree, well_sep_is_n_away=2, debug=True)
+        trav = actx.to_numpy(trav_dev)
 
         traversals.append(trav)
         traversals_dev.append(trav_dev)
@@ -431,7 +430,7 @@ def fmm_level_to_order(tree, ilevel):
         timing_data = {}
         from boxtree.fmm import drive_fmm
         src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype)
-        drive_fmm(wrangler, (src_weights,), timing_data=timing_data)
+        drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data)
 
         timing_results.append(timing_data)
 
@@ -458,7 +457,7 @@ def test_params_equal(test_params1, test_params2):
         level_to_order = level_to_orders[icase]
 
         python_model_results.append(python_cost_model.cost_per_stage(
-            actx.queue, traversal, level_to_order,
+            actx, traversal, level_to_order,
             _PythonFMMCostModel.get_unit_calibration_params(),
         ))
 
@@ -477,7 +476,7 @@ def test_params_equal(test_params1, test_params2):
         level_to_order = level_to_orders[icase]
 
         cl_model_results.append(cl_cost_model.cost_per_stage(
-            actx.queue, traversal, level_to_order,
+            actx, traversal, level_to_order,
             FMMCostModel.get_unit_calibration_params(),
         ))
 
@@ -530,23 +529,21 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler(
     actx = actx_factory()
 
     from boxtree.tools import make_normal_particle_array as p_normal
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=16)
-    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=19)
+    sources = p_normal(actx, nsources, dims, dtype, seed=16)
+    targets = p_normal(actx, ntargets, dims, dtype, seed=19)
 
     rng = np.random.default_rng(20)
     target_radii = rng.uniform(0, 0.04, (ntargets,)).astype(dtype)
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-    tree, _ = tb(
-        actx.queue, sources, targets=targets, target_radii=target_radii,
+    from boxtree import build_tree
+    tree = build_tree(
+        actx, sources, targets=targets, target_radii=target_radii,
         stick_out_factor=0.15, max_particles_in_box=30, debug=True
     )
 
-    from boxtree.traversal import FMMTraversalBuilder
-    tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
-    trav_dev, _ = tg(actx.queue, tree, debug=True)
-    trav = trav_dev.get(queue=actx.queue)
+    from boxtree.traversal import build_traversal
+    trav_dev = build_traversal(actx, tree, well_sep_is_n_away=2, debug=True)
+    trav = actx.to_numpy(trav_dev)
 
     from boxtree.constant_one import (
             ConstantOneTreeIndependentDataForWrangler,
@@ -557,7 +554,7 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler(
     timing_data = {}
     from boxtree.fmm import drive_fmm
     src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype)
-    drive_fmm(wrangler, (src_weights,), timing_data=timing_data)
+    drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data)
 
     cost_model = FMMCostModel(
         translation_cost_model_factory=OpCountingTranslationCostModel
@@ -566,7 +563,7 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler(
     level_to_order = np.array([1 for _ in range(tree.nlevels)])
 
     modeled_time = cost_model.cost_per_stage(
-        actx.queue, trav_dev, level_to_order,
+        actx, trav_dev, level_to_order,
         FMMCostModel.get_unit_calibration_params(),
     )
 
@@ -585,10 +582,10 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler(
         total_cost += timing_data[stage]["ops_elapsed"]
 
     per_box_cost = cost_model.cost_per_box(
-        actx.queue, trav_dev, level_to_order,
+        actx, trav_dev, level_to_order,
         FMMCostModel.get_unit_calibration_params(),
     )
-    total_aggregate_cost = cost_model.aggregate_over_boxes(per_box_cost)
+    total_aggregate_cost = cost_model.aggregate_over_boxes(actx, per_box_cost)
 
     assert total_cost == (
             total_aggregate_cost
diff --git a/test/test_distributed.py b/test/test_distributed.py
index c9543519..b6ca7d32 100644
--- a/test/test_distributed.py
+++ b/test/test_distributed.py
@@ -33,7 +33,7 @@
 
 from boxtree.pyfmmlib_integration import (
     Kernel, FMMLibTreeIndependentDataForWrangler,
-    FMMLibExpansionWrangler)
+    FMMLibExpansionWrangler, FMMLibRotationData)
 from boxtree.constant_one import (
     ConstantOneExpansionWrangler as ConstantOneExpansionWranglerBase,
     ConstantOneTreeIndependentDataForWrangler)
@@ -78,8 +78,11 @@ def fmm_level_to_order(tree, level):
     with patch.dict(os.environ, {"XDG_CACHE_HOME": rank_cache_dir}):
         actx = _acf()
 
-        from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
+        from functools import partial
+        from boxtree.traversal import build_traversal
+        build_traversal = partial(build_traversal,
+            well_sep_is_n_away=2,
+            debug=True)
 
         tree_indep = FMMLibTreeIndependentDataForWrangler(
             dims, Kernel.HELMHOLTZ if helmholtz_k else Kernel.LAPLACE)
@@ -88,32 +91,32 @@ def fmm_level_to_order(tree, level):
         if rank == 0:
             # Generate random particles and source weights
             from boxtree.tools import make_normal_particle_array as p_normal
-            sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-            targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+            sources = p_normal(actx, nsources, dims, dtype, seed=15)
+            targets = p_normal(actx, ntargets, dims, dtype, seed=18)
 
             rng = np.random.default_rng(20)
             sources_weights = rng.uniform(0.0, 1.0, (nsources,))
             target_radii = rng.uniform(0.0, 0.05, (ntargets,))
 
             # Build the tree and interaction lists
-            from boxtree import TreeBuilder
-            tb = TreeBuilder(actx.context)
-            global_tree_dev, _ = tb(
-                actx.queue, sources, targets=targets, target_radii=target_radii,
+            from boxtree import build_tree
+            global_tree_dev = build_tree(
+                actx, sources, targets=targets, target_radii=target_radii,
                 stick_out_factor=0.25, max_particles_in_box=30, debug=True)
 
-            d_trav, _ = tg(actx.queue, global_tree_dev, debug=True)
-            global_traversal_host = d_trav.get(queue=actx.queue)
+            d_trav = build_traversal(actx, global_tree_dev)
+            global_traversal_host = actx.to_numpy(d_trav)
             global_tree_host = global_traversal_host.tree
 
             # Get pyfmmlib expansion wrangler
             wrangler = FMMLibExpansionWrangler(
                     tree_indep, global_traversal_host,
-                    fmm_level_to_order=fmm_level_to_order)
+                    fmm_level_to_order=fmm_level_to_order,
+                    rotation_data=FMMLibRotationData(actx, global_traversal_host))
 
             # Compute FMM with one MPI rank
             from boxtree.fmm import drive_fmm
-            pot_fmm = drive_fmm(wrangler, [sources_weights]) * 2 * np.pi
+            pot_fmm = drive_fmm(actx, wrangler, [sources_weights]) * 2 * np.pi
 
         # Compute FMM using the distributed implementation
 
@@ -122,17 +125,17 @@ def wrangler_factory(local_traversal, global_traversal):
                     DistributedFMMLibExpansionWrangler
 
             return DistributedFMMLibExpansionWrangler(
-                actx.context, comm, tree_indep, local_traversal, global_traversal,
+                actx, comm, tree_indep, local_traversal, global_traversal,
                 fmm_level_to_order=fmm_level_to_order,
                 communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
 
         from boxtree.distributed import DistributedFMMRunner
         distribued_fmm_info = DistributedFMMRunner(
-            actx.queue, global_tree_host, tg, wrangler_factory, comm=comm)
+            actx, global_tree_host, build_traversal, wrangler_factory, comm=comm)
 
         timing_data = {}
         pot_dfmm = distribued_fmm_info.drive_dfmm(
-                    [sources_weights], timing_data=timing_data)
+            actx, [sources_weights], timing_data=timing_data)
         assert timing_data
 
     # Uncomment the following section to print the time taken of each stage
@@ -182,31 +185,42 @@ def test_against_shared(
 # {{{ test_constantone
 
 def _test_constantone(tmp_cache_basedir, dims, nsources, ntargets, dtype):
-    from boxtree.distributed.calculation import DistributedExpansionWrangler
+    from boxtree.distributed.calculation import DistributedExpansionWranglerMixin
 
     class ConstantOneExpansionWrangler(
-            ConstantOneExpansionWranglerBase, DistributedExpansionWrangler):
+            DistributedExpansionWranglerMixin,
+            ConstantOneExpansionWranglerBase):
         def __init__(
-                self, queue, comm, tree_indep, local_traversal, global_traversal):
-            DistributedExpansionWrangler.__init__(
-                self, queue, comm, global_traversal, False,
-                communicate_mpoles_via_allreduce=True)
+                self, array_context, comm,
+                tree_indep, local_traversal, global_traversal):
             ConstantOneExpansionWranglerBase.__init__(
                 self, tree_indep, local_traversal)
+
+            self._setup_actx = array_context
+            self.comm = comm
+            self.global_traversal = global_traversal
+            self.communicate_mpoles_via_allreduce = True
+
             self.level_orders = np.ones(local_traversal.tree.nlevels, dtype=np.int32)
 
         def reorder_sources(self, source_array):
-            if self.comm.Get_rank() == 0:
+            if self.is_mpi_root:
                 return source_array[self.global_traversal.tree.user_source_ids]
             else:
                 return None
 
         def reorder_potentials(self, potentials):
-            if self.comm.Get_rank() == 0:
+            if self.is_mpi_root:
                 return potentials[self.global_traversal.tree.sorted_target_ids]
             else:
                 return None
 
+        def finalize_potentials(self, potentials, template_ary):
+            if self.is_mpi_root:
+                return super().finalize_potentials(potentials, template_ary)
+            else:
+                return None
+
     from mpi4py import MPI
 
     # Get the current rank
@@ -222,39 +236,36 @@ def reorder_potentials(self, potentials):
     with patch.dict(os.environ, {"XDG_CACHE_HOME": rank_cache_dir}):
         actx = _acf()
 
-        from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(actx.context)
+        from boxtree.traversal import build_traversal
 
         if rank == 0:
-
             # Generate random particles
             from boxtree.tools import make_normal_particle_array as p_normal
-            sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-            targets = (p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+            sources = p_normal(actx, nsources, dims, dtype, seed=15)
+            targets = (p_normal(actx, ntargets, dims, dtype, seed=18)
                        + np.array([2, 0, 0])[:dims])
 
             # Constant one source weights
             sources_weights = np.ones((nsources,), dtype=dtype)
 
             # Build the global tree
-            from boxtree import TreeBuilder
-            tb = TreeBuilder(actx.context)
-            tree, _ = tb(
-                    actx.queue, sources, targets=targets, max_particles_in_box=30,
-                    debug=True)
-            tree = tree.get(actx.queue)
+            from boxtree import build_tree
+            tree = build_tree(
+                    actx, sources,
+                    targets=targets, max_particles_in_box=30, debug=True)
+            tree = actx.to_numpy(tree)
 
         tree_indep = ConstantOneTreeIndependentDataForWrangler()
 
         def wrangler_factory(local_traversal, global_traversal):
             return ConstantOneExpansionWrangler(
-                    actx.queue, comm, tree_indep, local_traversal, global_traversal)
+                    actx, comm, tree_indep, local_traversal, global_traversal)
 
         from boxtree.distributed import DistributedFMMRunner
         distributed_fmm_info = DistributedFMMRunner(
-            actx.queue, tree, tg, wrangler_factory, comm=MPI.COMM_WORLD)
+            actx, tree, build_traversal, wrangler_factory, comm=MPI.COMM_WORLD)
 
-        pot_dfmm = distributed_fmm_info.drive_dfmm([sources_weights])
+        pot_dfmm = distributed_fmm_info.drive_dfmm(actx, [sources_weights])
 
     if rank == 0:
         assert (np.all(pot_dfmm == nsources))
diff --git a/test/test_fmm.py b/test/test_fmm.py
index b7446f7e..d472f4d0 100644
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
@@ -29,11 +29,7 @@
 from boxtree.array_context import (                                 # noqa: F401
         PytestPyOpenCLArrayContextFactory, _acf)
 
-from boxtree.tools import (  # noqa: F401
-        make_normal_particle_array as p_normal,
-        make_surface_particle_array as p_surface,
-        make_uniform_particle_array as p_uniform,
-        particle_array_to_host)
+from boxtree.tools import make_normal_particle_array as p_normal
 from boxtree.constant_one import (
         ConstantOneTreeIndependentDataForWrangler,
         ConstantOneExpansionWrangler)
@@ -48,7 +44,8 @@
 
 # {{{ ref fmmlib pot computation
 
-def get_fmmlib_ref_pot(wrangler, weights, sources_host, targets_host,
+def get_fmmlib_ref_pot(
+        actx, wrangler, weights, sources_host, targets_host,
         helmholtz_k, dipole_vec=None):
     dims = sources_host.shape[0]
     eqn_letter = "h" if helmholtz_k else "l"
@@ -84,10 +81,10 @@ def get_fmmlib_ref_pot(wrangler, weights, sources_host, targets_host,
         kwargs["zk"] = helmholtz_k
 
     return wrangler.finalize_potentials(
+            actx,
             fmmlib_routine(
                 sources=sources_host, targets=targets_host,
-                **kwargs)[0],
-            template_ary=weights)
+                **kwargs)[0])
 
 # }}}
 
@@ -177,7 +174,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
     dtype = np.float64
 
     try:
-        sources = source_gen(actx.queue, nsources_req, dims, dtype, seed=15)
+        sources = source_gen(actx, nsources_req, dims, dtype, seed=15)
         nsources = len(sources[0])
 
         if ntargets_req is None:
@@ -185,7 +182,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
             targets = None
             ntargets = ntargets_req
         else:
-            targets = target_gen(actx.queue, ntargets_req, dims, dtype, seed=16)
+            targets = target_gen(actx, ntargets_req, dims, dtype, seed=16)
             ntargets = len(targets[0])
     except ImportError:
         pytest.skip("loopy not available, but needed for particle array "
@@ -206,41 +203,40 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
     else:
         target_radii = None
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    from boxtree import build_tree
+    tree = build_tree(actx, sources, targets=targets,
             max_particles_in_box=30,
             source_radii=source_radii, target_radii=target_radii,
             debug=True, stick_out_factor=0.25, extent_norm=extent_norm)
     if 0:
-        tree = tree.get(queue=actx.queue)
+        tree = actx.to_numpy(tree)
         tree.plot()
         import matplotlib.pyplot as pt
         pt.show()
 
-    from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context,
-            well_sep_is_n_away=well_sep_is_n_away,
-            from_sep_smaller_crit=from_sep_smaller_crit)
-    trav, _ = tbuild(actx.queue, tree, debug=True)
+    from boxtree.traversal import build_traversal
+    trav = build_traversal(actx, tree,
+        well_sep_is_n_away=well_sep_is_n_away,
+        from_sep_smaller_crit=from_sep_smaller_crit,
+        debug=True)
 
     if who_has_extent:
         pre_merge_trav = trav
-        trav = trav.merge_close_lists(actx.queue)
+        trav = trav.merge_close_lists(actx)
 
     #weights = np.random.randn(nsources)
     weights = np.ones(nsources)
     weights_sum = np.sum(weights)
 
-    host_trav = trav.get(queue=actx.queue)
+    host_trav = actx.to_numpy(trav)
     host_tree = host_trav.tree
 
     if who_has_extent:
-        pre_merge_host_trav = pre_merge_trav.get(queue=actx.queue)
+        pre_merge_host_trav = actx.to_numpy(pre_merge_trav)
 
-    from boxtree.tree import ParticleListFilter
-    plfilt = ParticleListFilter(actx.context)
+    from boxtree.tree import (
+        filter_target_lists_in_user_order,
+        filter_target_lists_in_tree_order)
 
     tree_indep = ConstantOneTreeIndependentDataForWrangler()
 
@@ -250,17 +246,15 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
                 )
 
         if filter_kind == "user":
-            filtered_targets = plfilt.filter_target_lists_in_user_order(
-                    actx.queue, tree, flags)
+            filtered_targets = filter_target_lists_in_user_order(actx, tree, flags)
             wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInUserOrder(
                     tree_indep, host_trav,
-                    filtered_targets.get(queue=actx.queue))
+                    actx.to_numpy(filtered_targets))
         elif filter_kind == "tree":
-            filtered_targets = plfilt.filter_target_lists_in_tree_order(
-                    actx.queue, tree, flags)
+            filtered_targets = filter_target_lists_in_tree_order(actx, tree, flags)
             wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInTreeOrder(
                     tree_indep, host_trav,
-                    filtered_targets.get(queue=actx.queue))
+                    actx.to_numpy(filtered_targets))
         else:
             raise ValueError("unsupported value of 'filter_kind'")
     else:
@@ -274,7 +268,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
                 == weights)
 
     from boxtree.fmm import drive_fmm
-    pot = drive_fmm(wrangler, (weights,))
+    pot = drive_fmm(actx, wrangler, (weights,))
 
     if filter_kind:
         pot = pot[actx.to_numpy(flags) > 0]
@@ -292,7 +286,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
         for i in range(nsources):
             unit_vec = np.zeros(nsources, dtype=dtype)
             unit_vec[i] = 1
-            mat[:, i] = drive_fmm(host_trav, wrangler, (unit_vec,))
+            mat[:, i] = drive_fmm(actx, wrangler, (unit_vec,))
             pb.progress()
         pb.finished()
 
@@ -401,25 +395,23 @@ def test_pyfmmlib_fmm(actx_factory, dims, use_dipoles, helmholtz_k):
     ntargets = 1000
     dtype = np.float64
 
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+    sources = p_normal(actx, nsources, dims, dtype, seed=15)
     targets = (
-            p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+            p_normal(actx, ntargets, dims, dtype, seed=18)
             + np.array([2, 0, 0])[:dims])
 
-    sources_host = particle_array_to_host(sources)
-    targets_host = particle_array_to_host(targets)
-
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    sources_host = np.stack(actx.to_numpy(sources))
+    targets_host = np.stack(actx.to_numpy(targets))
 
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    from boxtree import build_tree
+    tree = build_tree(
+            actx, sources, targets=targets,
             max_particles_in_box=30, debug=True)
 
-    from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context)
-    trav, _ = tbuild(actx.queue, tree, debug=True)
+    from boxtree.traversal import build_traversal
+    trav = build_traversal(actx, tree, debug=True)
 
-    trav = trav.get(queue=actx.queue)
+    trav = actx.to_numpy(trav)
 
     rng = np.random.default_rng(20)
     weights = rng.uniform(0.0, 1.0, (nsources,))
@@ -461,7 +453,7 @@ def fmm_level_to_order(tree, lev):
     from boxtree.fmm import drive_fmm
 
     timing_data = {}
-    pot = drive_fmm(wrangler, (weights,), timing_data=timing_data)
+    pot = drive_fmm(actx, wrangler, (weights,), timing_data=timing_data)
     print(timing_data)
     assert timing_data
 
@@ -469,8 +461,8 @@ def fmm_level_to_order(tree, lev):
 
     logger.info("computing direct (reference) result")
 
-    ref_pot = get_fmmlib_ref_pot(wrangler, weights, sources_host.T,
-            targets_host.T, helmholtz_k, dipole_vec)
+    ref_pot = get_fmmlib_ref_pot(actx, wrangler, weights, sources_host,
+            targets_host, helmholtz_k, dipole_vec)
 
     rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf)
     logger.info("relative l2 error vs fmmlib direct: %g", rel_err)
@@ -504,15 +496,17 @@ def fmm_level_to_order(tree, lev):
 
         if use_dipoles:
             knl = DirectionalSourceDerivative(knl)
-            sumpy_extra_kwargs["src_derivative_dir"] = dipole_vec
+            sumpy_extra_kwargs["src_derivative_dir"] = actx.from_numpy(dipole_vec)
 
-        p2p = P2P(actx.context,
-                [knl],
-                exclude_self=False)
+        p2p = P2P(target_kernels=[knl], exclude_self=False)
 
-        evt, (sumpy_ref_pot,) = p2p(
-                actx.queue, targets, sources, (weights,),
-                out_host=True, **sumpy_extra_kwargs)
+        result = p2p(
+                actx,
+                targets,
+                sources,
+                (actx.from_numpy(weights),),
+                **sumpy_extra_kwargs)
+        sumpy_ref_pot = actx.to_numpy(result["result_s0"])
 
         sumpy_rel_err = (
                 la.norm(pot - sumpy_ref_pot, np.inf)
@@ -552,19 +546,17 @@ def test_pyfmmlib_numerical_stability(actx_factory, dims, helmholtz_k, order):
 
     targets = sources * (1 + 1e-3)
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    from boxtree import build_tree
+    tree = build_tree(
+            actx, sources, targets=targets,
             max_particles_in_box=2, debug=True)
 
     assert tree.nlevels >= 15
 
-    from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context)
-    trav, _ = tbuild(actx.queue, tree, debug=True)
+    from boxtree.traversal import build_traversal
+    trav = build_traversal(actx, tree, debug=True)
 
-    trav = trav.get(queue=actx.queue)
+    trav = actx.to_numpy(trav)
     weights = np.ones_like(sources[0])
 
     from boxtree.pyfmmlib_integration import (
@@ -581,17 +573,17 @@ def fmm_level_to_order(tree, lev):
             tree_indep, trav,
             helmholtz_k=helmholtz_k,
             fmm_level_to_order=fmm_level_to_order,
-            rotation_data=FMMLibRotationData(actx.queue, trav))
+            rotation_data=FMMLibRotationData(actx, trav))
 
     from boxtree.fmm import drive_fmm
-    pot = drive_fmm(wrangler, (weights,))
+    pot = drive_fmm(actx, wrangler, (weights,))
     assert not np.isnan(pot).any()
 
     # {{{ ref fmmlib computation
 
     logger.info("computing direct (reference) result")
 
-    ref_pot = get_fmmlib_ref_pot(wrangler, weights, sources, targets,
+    ref_pot = get_fmmlib_ref_pot(actx, wrangler, weights, sources, targets,
             helmholtz_k)
 
     rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf)
@@ -625,8 +617,8 @@ def test_interaction_list_particle_count_thresholding(actx_factory, enable_exten
     from_sep_smaller_min_nsources_cumul = 1 + max_particles_in_box
 
     from boxtree.fmm import drive_fmm
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=15)
+    sources = p_normal(actx, nsources, dims, dtype, seed=15)
+    targets = p_normal(actx, ntargets, dims, dtype, seed=15)
 
     rng = np.random.default_rng(22)
     if enable_extents:
@@ -636,28 +628,30 @@ def test_interaction_list_particle_count_thresholding(actx_factory, enable_exten
     else:
         target_radii = None
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    from boxtree import build_tree
+    tree = build_tree(
+            actx, sources,
+            targets=targets,
             max_particles_in_box=max_particles_in_box,
             target_radii=target_radii,
-            debug=True, stick_out_factor=0.25)
+            stick_out_factor=0.25,
+            debug=True,
+            )
 
-    from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context)
-    trav, _ = tbuild(actx.queue, tree, debug=True,
-            _from_sep_smaller_min_nsources_cumul=from_sep_smaller_min_nsources_cumul)
+    from boxtree.traversal import build_traversal
+    trav = build_traversal(actx, tree,
+        debug=True,
+        _from_sep_smaller_min_nsources_cumul=from_sep_smaller_min_nsources_cumul)
 
     weights = np.ones(nsources)
     weights_sum = np.sum(weights)
 
-    host_trav = trav.get(queue=actx.queue)
+    host_trav = actx.to_numpy(trav)
 
     tree_indep = ConstantOneTreeIndependentDataForWrangler()
     wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav)
 
-    pot = drive_fmm(wrangler, (weights,))
+    pot = drive_fmm(actx, wrangler, (weights,))
 
     assert np.all(pot == weights_sum)
 
@@ -680,8 +674,8 @@ def test_fmm_float32(actx_factory, enable_extents):
     dtype = np.float32
 
     from boxtree.fmm import drive_fmm
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=15)
+    sources = p_normal(actx, nsources, dims, dtype, seed=15)
+    targets = p_normal(actx, ntargets, dims, dtype, seed=15)
 
     rng = np.random.default_rng(12)
     if enable_extents:
@@ -691,27 +685,28 @@ def test_fmm_float32(actx_factory, enable_extents):
     else:
         target_radii = None
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    from boxtree import build_tree
+    tree = build_tree(
+            actx, sources,
+            targets=targets,
             max_particles_in_box=30,
             target_radii=target_radii,
-            debug=True, stick_out_factor=0.25)
+            stick_out_factor=0.25,
+            debug=True,
+            )
 
-    from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context)
-    trav, _ = tbuild(actx.queue, tree, debug=True)
+    from boxtree.traversal import build_traversal
+    trav = build_traversal(actx, tree, debug=True)
 
     weights = np.ones(nsources)
     weights_sum = np.sum(weights)
 
-    host_trav = trav.get(queue=actx.queue)
+    host_trav = actx.to_numpy(trav)
 
     tree_indep = ConstantOneTreeIndependentDataForWrangler()
     wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav)
 
-    pot = drive_fmm(wrangler, (weights,))
+    pot = drive_fmm(actx, wrangler, (weights,))
 
     assert np.all(pot == weights_sum)
 
@@ -732,21 +727,19 @@ def test_fmm_with_optimized_3d_m2l(actx_factory, nsrcntgts, helmholtz_k,
     nsources = ntargets = nsrcntgts // 2
     dtype = np.float64
 
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+    sources = p_normal(actx, nsources, dims, dtype, seed=15)
     targets = (
-            p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+            p_normal(actx, ntargets, dims, dtype, seed=18)
             + np.array([2, 0, 0])[:dims])
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    from boxtree import build_tree
+    tree = build_tree(
+            actx, sources, targets=targets,
             max_particles_in_box=30, debug=True)
 
-    from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context)
-    trav, _ = tbuild(actx.queue, tree, debug=True)
-    trav = trav.get(queue=actx.queue)
+    from boxtree.traversal import build_traversal
+    trav = build_traversal(actx, tree, debug=True)
+    trav = actx.to_numpy(trav)
 
     rng = np.random.default_rng(20)
     weights = rng.uniform(0.0, 1.0, (nsources,))
@@ -778,17 +771,17 @@ def fmm_level_to_order(tree, lev):
             tree_indep, trav,
             helmholtz_k=helmholtz_k,
             fmm_level_to_order=fmm_level_to_order,
-            rotation_data=FMMLibRotationData(actx.queue, trav))
+            rotation_data=FMMLibRotationData(actx, trav))
 
     from boxtree.fmm import drive_fmm
 
     baseline_timing_data = {}
     baseline_pot = drive_fmm(
-            baseline_wrangler, (weights,), timing_data=baseline_timing_data)
+        actx, baseline_wrangler, (weights,), timing_data=baseline_timing_data)
 
     optimized_timing_data = {}
     optimized_pot = drive_fmm(
-            optimized_wrangler, (weights,), timing_data=optimized_timing_data)
+        actx, optimized_wrangler, (weights,), timing_data=optimized_timing_data)
 
     baseline_time = baseline_timing_data["multipole_to_local"]["process_elapsed"]
     if baseline_time is not None:
diff --git a/test/test_tools.py b/test/test_tools.py
index d83491c4..683a6c07 100644
--- a/test/test_tools.py
+++ b/test/test_tools.py
@@ -95,8 +95,7 @@ def test_allreduce_comm_pattern(p):
 def test_masked_matrix_compression(actx_factory, order):
     actx = actx_factory()
 
-    from boxtree.tools import MaskCompressorKernel
-    matcompr = MaskCompressorKernel(actx.context)
+    from boxtree.tools import mask_to_csr
 
     n = 40
     m = 10
@@ -105,7 +104,7 @@ def test_masked_matrix_compression(actx_factory, order):
     arr = (rng.random((n, m)) > 0.5).astype(np.int8).copy(order=order)
     d_arr = actx.from_numpy(arr)
 
-    arr_starts, arr_lists, evt = matcompr(actx.queue, d_arr)
+    arr_starts, arr_lists = mask_to_csr(actx, d_arr)
     arr_starts = actx.to_numpy(arr_starts)
     arr_lists = actx.to_numpy(arr_lists)
 
@@ -121,8 +120,7 @@ def test_masked_matrix_compression(actx_factory, order):
 def test_masked_list_compression(actx_factory):
     actx = actx_factory()
 
-    from boxtree.tools import MaskCompressorKernel
-    listcompr = MaskCompressorKernel(actx.context)
+    from boxtree.tools import mask_to_csr
 
     n = 20
 
@@ -131,7 +129,7 @@ def test_masked_list_compression(actx_factory):
     arr = (np.random.rand(n) > 0.5).astype(np.int8)
     d_arr = actx.from_numpy(arr)
 
-    arr_list, evt = listcompr(actx.queue, d_arr)
+    arr_list = mask_to_csr(actx, d_arr)
     arr_list = actx.to_numpy(arr_list)
 
     assert set(arr_list) == set(arr.nonzero()[0])
@@ -164,6 +162,50 @@ def test_device_record(actx_factory):
     for i in range(3):
         assert np.array_equal(record_host.obj_array[i], record.obj_array[i])
 
+
+def test_device_record_array_context(actx_factory):
+    """Check from_numpy/freeze/thaw/to_numpy on a dataclass array container."""
+    actx = actx_factory()
+
+    from typing import Optional
+    from dataclasses import dataclass
+    from arraycontext import Array
+    from boxtree.array_context import dataclass_array_container
+
+    @dataclass_array_container
+    @dataclass(frozen=True)
+    class MyDeviceDataRecord:
+        array: Array
+        obj_array: np.ndarray
+        opt_array: Optional[Array]
+        value: float
+
+    from pytools.obj_array import make_obj_array
+    rng = np.random.default_rng(42)
+    record = MyDeviceDataRecord(
+        array=rng.random(128),
+        obj_array=make_obj_array([rng.random(128) for _ in range(3)]),
+        opt_array=None,
+        value=3)
+
+    actx_record = actx.from_numpy(record)
+    assert actx_record.array.queue is actx.queue
+
+    frozen_record = actx.freeze(actx_record)
+    assert frozen_record.array.queue is None
+
+    # check the thawed copy itself, not the record it was derived from
+    thawed_record = actx.thaw(frozen_record)
+    assert thawed_record.array.queue is actx.queue
+
+    host_record = actx.to_numpy(thawed_record)
+    assert isinstance(host_record.array, np.ndarray)
+
+    assert record.value == host_record.value
+    assert np.allclose(record.array, host_record.array)
+    for i in range(3):
+        assert np.allclose(record.obj_array[i], host_record.obj_array[i])
+
 # }}}
 
 
@@ -175,7 +217,7 @@ def test_device_record(actx_factory):
 def test_particle_array(actx_factory, array_factory, dim, dtype):
     actx = actx_factory()
 
-    particles = array_factory(actx.queue, 1000, dim, dtype)
+    particles = array_factory(actx, 1000, dim, dtype)
     assert len(particles) == dim
     assert all(len(particles[0]) == len(axis) for axis in particles)
 
diff --git a/test/test_traversal.py b/test/test_traversal.py
index a86001a0..1b3c3a8d 100644
--- a/test/test_traversal.py
+++ b/test/test_traversal.py
@@ -52,22 +52,21 @@ def test_tree_connectivity(actx_factory, dims, sources_are_targets):
     actx = actx_factory()
     dtype = np.float64
 
-    sources = make_normal_particle_array(actx.queue, 1 * 10**5, dims, dtype)
+    sources = make_normal_particle_array(actx, 1 * 10**5, dims, dtype)
     if sources_are_targets:
         targets = None
     else:
-        targets = make_normal_particle_array(actx.queue, 2 * 10**5, dims, dtype)
+        targets = make_normal_particle_array(actx, 2 * 10**5, dims, dtype)
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-    tree, _ = tb(actx.queue, sources, max_particles_in_box=30,
-            targets=targets, debug=True)
+    from boxtree import build_tree
+    tree = build_tree(
+            actx, sources,
+            targets=targets, max_particles_in_box=30, debug=True)
 
-    from boxtree.traversal import FMMTraversalBuilder
-    tg = FMMTraversalBuilder(actx.context)
-    trav, _ = tg(actx.queue, tree, debug=True)
-    tree = tree.get(queue=actx.queue)
-    trav = trav.get(queue=actx.queue)
+    from boxtree.traversal import build_traversal
+    trav = build_traversal(actx, tree, debug=True)
+    tree = actx.to_numpy(tree)
+    trav = actx.to_numpy(trav)
 
     levels = tree.box_levels
     parents = tree.box_parent_ids.T
@@ -282,18 +281,14 @@ def test_plot_traversal(actx_factory, well_sep_is_n_away=1, visualize=False):
             actx.from_numpy(rng.normal(0.0, 1.0, (nparticles,)).astype(dtype))
             for i in range(dims)])
 
-        from boxtree import TreeBuilder
-        tb = TreeBuilder(actx.context)
+        from boxtree import build_tree
+        tree = build_tree(actx, particles, max_particles_in_box=30, debug=True)
 
-        actx.queue.finish()
-        tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+        from boxtree.traversal import build_traversal
+        trav = build_traversal(actx, tree, well_sep_is_n_away=well_sep_is_n_away)
 
-        from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=well_sep_is_n_away)
-        trav, _ = tg(actx.queue, tree)
-
-        tree = tree.get(queue=actx.queue)
-        trav = trav.get(queue=actx.queue)
+        tree = actx.to_numpy(tree)
+        trav = actx.to_numpy(trav)
 
         from boxtree.visualization import TreePlotter
         plotter = TreePlotter(tree)
@@ -336,28 +331,20 @@ def test_from_sep_siblings_translation_and_rotation_classes(
         actx.from_numpy(rng.normal(0.0, 1.0, (nparticles,)).astype(dtype))
         for i in range(dims)])
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    from boxtree import build_tree
+    tree = build_tree(actx, particles, max_particles_in_box=30, debug=True)
 
     # }}}
 
     # {{{ build traversal
 
-    from boxtree.traversal import FMMTraversalBuilder
-    from boxtree.rotation_classes import RotationClassesBuilder
-    from boxtree.translation_classes import TranslationClassesBuilder
-
-    tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=well_sep_is_n_away)
-    trav, _ = tg(actx.queue, tree)
-
-    rb = RotationClassesBuilder(actx.context)
-    result, _ = rb(actx.queue, trav, tree)
+    from boxtree.traversal import build_traversal
+    from boxtree.rotation_classes import build_rotation_classes
+    from boxtree.translation_classes import build_translation_classes
 
-    tb = TranslationClassesBuilder(actx.context)
-    result_tb, _ = tb(actx.queue, trav, tree)
+    trav = build_traversal(actx, tree, well_sep_is_n_away=well_sep_is_n_away)
+    result = build_rotation_classes(actx, trav, tree)
+    result_tb = build_translation_classes(actx, trav, tree)
 
     rot_classes = actx.to_numpy(
             result.from_sep_siblings_rotation_classes)
@@ -369,8 +356,8 @@ def test_from_sep_siblings_translation_and_rotation_classes(
     distance_vectors = actx.to_numpy(
         result_tb.from_sep_siblings_translation_class_to_distance_vector)
 
-    tree = tree.get(queue=actx.queue)
-    trav = trav.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
+    trav = actx.to_numpy(trav)
 
     centers = tree.box_centers.T
 
diff --git a/test/test_tree.py b/test/test_tree.py
index d72ed5c5..35aee864 100644
--- a/test/test_tree.py
+++ b/test/test_tree.py
@@ -28,6 +28,7 @@
 from arraycontext import pytest_generate_tests_for_array_contexts
 from boxtree.array_context import (                                 # noqa: F401
         PytestPyOpenCLArrayContextFactory, _acf)
+
 from boxtree.tools import make_normal_particle_array
 
 import logging
@@ -47,18 +48,16 @@ def test_bounding_box(actx_factory, dtype, dims, nparticles):
     actx = actx_factory()
 
     from boxtree.tools import AXIS_NAMES
-    from boxtree.bounding_box import BoundingBoxFinder
-    bbf = BoundingBoxFinder(actx.context)
-
+    from boxtree.bounding_box import find_bounding_box
     axis_names = AXIS_NAMES[:dims]
     logger.info("%s - %s %s", dtype, dims, nparticles)
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     bbox_min = [np.min(actx.to_numpy(x)) for x in particles]
     bbox_max = [np.max(actx.to_numpy(x)) for x in particles]
 
-    bbox_cl, evt = bbf(particles, radii=None)
+    bbox_cl = find_bounding_box(actx, particles, radii=None)
     bbox_cl = actx.to_numpy(bbox_cl)
 
     bbox_min_cl = np.empty(dims, dtype)
@@ -76,7 +75,7 @@ def test_bounding_box(actx_factory, dtype, dims, nparticles):
 
 # {{{ test basic (no source/target distinction) tree build
 
-def run_build_test(builder, actx, dims, dtype, nparticles, visualize,
+def run_build_test(actx, dims, dtype, nparticles, visualize,
         max_particles_in_box=None, max_leaf_refine_weight=None,
         refine_weights=None, **kwargs):
     dtype = np.dtype(dtype)
@@ -101,21 +100,20 @@ def run_build_test(builder, actx, dims, dtype, nparticles, visualize,
 
     logger.info(75 * "-")
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
         np_particles = actx.to_numpy(particles)
         pt.plot(np_particles[0], np_particles[1], "x")
 
-    actx.queue.finish()
-
-    tree, _ = builder(actx.queue, particles,
+    from boxtree import build_tree
+    tree = build_tree(actx, particles,
                       max_particles_in_box=max_particles_in_box,
                       refine_weights=refine_weights,
                       max_leaf_refine_weight=max_leaf_refine_weight,
                       debug=True, **kwargs)
-    tree = tree.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
 
     sorted_particles = np.array(list(tree.sources))
 
@@ -233,10 +231,7 @@ def particle_tree_test_decorator(f):
 def test_single_box_particle_tree(actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
-    from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
-
-    run_build_test(builder, actx, dims,
+    run_build_test(actx, dims,
             dtype, 4, max_particles_in_box=30, visualize=visualize)
 
 
@@ -244,10 +239,7 @@ def test_single_box_particle_tree(actx_factory, dtype, dims, visualize=False):
 def test_two_level_particle_tree(actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
-    from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
-
-    run_build_test(builder, actx, dims,
+    run_build_test(actx, dims,
             dtype, 50, max_particles_in_box=30, visualize=visualize)
 
 
@@ -255,11 +247,8 @@ def test_two_level_particle_tree(actx_factory, dtype, dims, visualize=False):
 def test_unpruned_particle_tree(actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
-    from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
-
     # test unpruned tree build
-    run_build_test(builder, actx, dims, dtype, 10**5,
+    run_build_test(actx, dims, dtype, 10**5,
             visualize=visualize, max_particles_in_box=30, skip_prune=True)
 
 
@@ -268,10 +257,7 @@ def test_particle_tree_with_reallocations(
         actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
-    from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
-
-    run_build_test(builder, actx, dims, dtype, 10**5,
+    run_build_test(actx, dims, dtype, 10**5,
             max_particles_in_box=30, visualize=visualize, nboxes_guess=5)
 
 
@@ -280,10 +266,7 @@ def test_particle_tree_with_many_empty_leaves(
         actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
-    from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
-
-    run_build_test(builder, actx, dims, dtype, 10**5,
+    run_build_test(actx, dims, dtype, 10**5,
             max_particles_in_box=5, visualize=visualize)
 
 
@@ -291,10 +274,7 @@ def test_particle_tree_with_many_empty_leaves(
 def test_vanilla_particle_tree(actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
-    from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
-
-    run_build_test(builder, actx, dims, dtype, 10**5,
+    run_build_test(actx, dims, dtype, 10**5,
             max_particles_in_box=30, visualize=visualize)
 
 
@@ -303,9 +283,6 @@ def test_explicit_refine_weights_particle_tree(
         actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
-    from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
-
     nparticles = 10**5
 
     rng = np.random.default_rng(10)
@@ -313,7 +290,7 @@ def test_explicit_refine_weights_particle_tree(
             rng.integers(1, 10, (nparticles,), dtype=np.int32)
             )
 
-    run_build_test(builder, actx, dims, dtype, nparticles,
+    run_build_test(actx, dims, dtype, nparticles,
             refine_weights=refine_weights, max_leaf_refine_weight=100,
             visualize=visualize)
 
@@ -322,10 +299,7 @@ def test_explicit_refine_weights_particle_tree(
 def test_non_adaptive_particle_tree(actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
-    from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
-
-    run_build_test(builder, actx, dims, dtype, 10**4,
+    run_build_test(actx, dims, dtype, 10**4,
             max_particles_in_box=30, visualize=visualize, kind="non-adaptive")
 
 # }}}
@@ -342,9 +316,9 @@ def test_source_target_tree(actx_factory, dims, visualize=False):
     ntargets = 3 * 10**5
     dtype = np.float64
 
-    sources = make_normal_particle_array(actx.queue, nsources, dims, dtype,
+    sources = make_normal_particle_array(actx, nsources, dims, dtype,
             seed=12)
-    targets = make_normal_particle_array(actx.queue, ntargets, dims, dtype,
+    targets = make_normal_particle_array(actx, ntargets, dims, dtype,
             seed=19)
 
     if visualize:
@@ -354,13 +328,11 @@ def test_source_target_tree(actx_factory, dims, visualize=False):
         pt.plot(np_targets[0], np_targets[1], "g+")
         pt.show()
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, sources, targets=targets,
-            max_particles_in_box=10, debug=True)
-    tree = tree.get(queue=actx.queue)
+    from boxtree import build_tree
+    tree = build_tree(
+            actx, sources,
+            targets=targets, max_particles_in_box=10, debug=True)
+    tree = actx.to_numpy(tree)
 
     sorted_sources = np.array(list(tree.sources))
     sorted_targets = np.array(list(tree.targets))
@@ -454,9 +426,9 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False):
     dtype = np.float64
     npoint_sources_per_source = 16
 
-    sources = make_normal_particle_array(actx.queue, nsources, dims, dtype,
+    sources = make_normal_particle_array(actx, nsources, dims, dtype,
             seed=12)
-    targets = make_normal_particle_array(actx.queue, ntargets, dims, dtype,
+    targets = make_normal_particle_array(actx, ntargets, dims, dtype,
             seed=19)
 
     refine_weights = actx.zeros(nsources + ntargets, np.int32)
@@ -470,11 +442,8 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False):
             2**rng.uniform(-10, 0, (ntargets,)).astype(dtype)
             )
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    dev_tree, _ = tb(actx.queue, sources, targets=targets,
+    from boxtree import build_tree
+    dev_tree = build_tree(actx, sources, targets=targets,
             source_radii=source_radii,
             target_radii=target_radii,
             extent_norm=extent_norm,
@@ -492,7 +461,7 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False):
 
     logger.info("transfer tree, check orderings")
 
-    tree = dev_tree.get(queue=actx.queue)
+    tree = actx.to_numpy(dev_tree)
 
     if visualize:
         import matplotlib.pyplot as pt
@@ -655,7 +624,7 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False):
             )
 
     from boxtree.tree import link_point_sources
-    dev_tree = link_point_sources(actx.queue, dev_tree,
+    dev_tree = link_point_sources(actx, dev_tree,
             point_source_starts, point_sources,
             debug=True)
 
@@ -675,31 +644,26 @@ def test_leaves_to_balls_query(actx_factory, dims, visualize=False):
     nparticles = 10**5
     dtype = np.float64
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
         np_particles = actx.to_numpy(particles)
         pt.plot(np_particles[0], np_particles[1], "x")
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    from boxtree import build_tree
+    tree = build_tree(actx, particles, max_particles_in_box=30, debug=True)
 
     nballs = 10**4
-    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_centers = make_normal_particle_array(actx, nballs, dims, dtype)
     ball_radii = 0.1 + actx.zeros(nballs, dtype)
 
-    from boxtree.area_query import LeavesToBallsLookupBuilder
-    lblb = LeavesToBallsLookupBuilder(actx.context)
-
-    lbl, _ = lblb(actx.queue, tree, ball_centers, ball_radii)
+    from boxtree.area_query import build_leaves_to_balls_lookup
+    lbl = build_leaves_to_balls_lookup(actx, tree, ball_centers, ball_radii)
 
     # get data to host for test
-    tree = tree.get(queue=actx.queue)
-    lbl = lbl.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
+    lbl = actx.to_numpy(lbl)
     ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]).T
     ball_radii = actx.to_numpy(ball_radii)
 
@@ -731,14 +695,12 @@ def run_area_query_test(actx, tree, ball_centers, ball_radii):
     """
     Performs an area query and checks that the result is as expected.
     """
-    from boxtree.area_query import AreaQueryBuilder
-    aqb = AreaQueryBuilder(actx.context)
-
-    area_query, _ = aqb(actx.queue, tree, ball_centers, ball_radii)
+    from boxtree.area_query import build_area_query
+    area_query = build_area_query(actx, tree, ball_centers, ball_radii)
 
     # Get data to host for test.
-    tree = tree.get(queue=actx.queue)
-    area_query = area_query.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
+    area_query = actx.to_numpy(area_query)
     ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]).T
     ball_radii = actx.to_numpy(ball_radii)
 
@@ -777,21 +739,18 @@ def test_area_query(actx_factory, dims, visualize=False):
     nparticles = 10**5
     dtype = np.float64
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
         np_particles = actx.to_numpy(particles)
         pt.plot(np_particles[0], np_particles[1], "x")
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    from boxtree import build_tree
+    tree = build_tree(actx, particles, max_particles_in_box=30, debug=True)
 
     nballs = 10**4
-    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_centers = make_normal_particle_array(actx, nballs, dims, dtype)
     ball_radii = 0.1 + actx.zeros(nballs, dtype)
 
     run_area_query_test(actx, tree, ball_centers, ball_radii)
@@ -810,18 +769,15 @@ def test_area_query_balls_outside_bbox(actx_factory, dims, visualize=False):
     nparticles = 10**4
     dtype = np.float64
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
         np_particles = actx.to_numpy(particles)
         pt.plot(np_particles[0], np_particles[1], "x")
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    from boxtree import build_tree
+    tree = build_tree(actx, particles, max_particles_in_box=30, debug=True)
 
     nballs = 10**4
     bbox_min = tree.bounding_box[0].min()
@@ -847,25 +803,21 @@ def test_area_query_elwise(actx_factory, dims, visualize=False):
     nparticles = 10**5
     dtype = np.float64
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
         np_particles = actx.to_numpy(particles)
         pt.plot(np_particles[0], np_particles[1], "x")
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    from boxtree import build_tree
+    tree = build_tree(actx, particles, max_particles_in_box=30, debug=True)
 
     nballs = 10**4
-    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_centers = make_normal_particle_array(actx, nballs, dims, dtype)
     ball_radii = 0.1 + actx.zeros(nballs, dtype)
 
-    from boxtree.area_query import (
-        AreaQueryElementwiseTemplate, PeerListFinder)
+    from boxtree.area_query import AreaQueryElementwiseTemplate, build_peer_list
 
     template = AreaQueryElementwiseTemplate(
         extra_args="""
@@ -882,21 +834,19 @@ def test_area_query_elwise(actx_factory, dims, visualize=False):
         """,
         leaf_found_op="")
 
-    peer_lists, evt = PeerListFinder(actx.context)(actx.queue, tree)
-
+    peer_lists = build_peer_list(actx, tree)
     kernel = template.generate(
-        actx.context,
+        actx.queue.context,
         dims,
         tree.coord_dtype,
         tree.box_id_dtype,
         peer_lists.peer_list_starts.dtype,
         tree.nlevels)
 
-    evt = kernel(
+    kernel(
         *template.unwrap_args(
             tree, peer_lists, ball_radii, *ball_centers),
         queue=actx.queue,
-        wait_for=[evt],
         range=slice(len(ball_radii)))
 
 # }}}
@@ -916,19 +866,15 @@ def test_level_restriction(
     dtype = np.float64
 
     from boxtree.tools import make_surface_particle_array
-    particles = make_surface_particle_array(
-            actx.queue, nparticles, dims, dtype, seed=15)
+    particles = make_surface_particle_array(actx, nparticles, dims, dtype, seed=15)
 
     if visualize:
         import matplotlib.pyplot as pt
         np_particles = actx.to_numpy(particles)
         pt.plot(np_particles[0], np_particles[1], "x")
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree_dev, _ = tb(actx.queue, particles,
+    from boxtree import build_tree
+    tree_dev = build_tree(actx, particles,
             kind="adaptive-level-restricted",
             max_particles_in_box=30, debug=True,
             skip_prune=skip_prune, lr_lookbehind=lookbehind,
@@ -942,19 +888,18 @@ def find_neighbors(leaf_box_centers, leaf_box_radii):
         #
         # Note that since this comes from an area query, the self box will be
         # included in the neighbor list.
-        from boxtree.area_query import AreaQueryBuilder
-        aqb = AreaQueryBuilder(actx.context)
+        from boxtree.area_query import build_area_query
 
         ball_radii = actx.from_numpy(np.min(leaf_box_radii) / 2 + leaf_box_radii)
         leaf_box_centers = [actx.from_numpy(axis) for axis in leaf_box_centers]
 
-        area_query, _ = aqb(actx.queue, tree_dev, leaf_box_centers, ball_radii)
-        area_query = area_query.get(queue=actx.queue)
+        area_query = build_area_query(actx, tree_dev, leaf_box_centers, ball_radii)
+        area_query = actx.to_numpy(area_query)
         return (area_query.leaves_near_ball_starts,
                 area_query.leaves_near_ball_lists)
 
     # Get data to host for test.
-    tree = tree_dev.get(queue=actx.queue)
+    tree = actx.to_numpy(tree_dev)
 
     # Find leaf boxes.
     from boxtree import box_flags_enum
@@ -996,38 +941,32 @@ def test_space_invader_query(actx_factory, dims, dtype, visualize=False):
     dtype = np.dtype(dtype)
     nparticles = 10**5
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
         np_particles = actx.to_numpy(particles)
         pt.plot(np_particles[0], np_particles[1], "x")
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    from boxtree import build_tree
+    tree = build_tree(actx, particles, max_particles_in_box=30, debug=True)
 
     nballs = 10**4
-    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_centers = make_normal_particle_array(actx, nballs, dims, dtype)
     ball_radii = 0.1 + actx.zeros(nballs, dtype)
 
     from boxtree.area_query import (
-        LeavesToBallsLookupBuilder, SpaceInvaderQueryBuilder)
+        build_leaves_to_balls_lookup, build_space_invader_query)
 
-    siqb = SpaceInvaderQueryBuilder(actx.context)
     # We can use leaves-to-balls lookup to get the set of overlapping balls for
     # each box, and from there to compute the outer space invader distance.
-    lblb = LeavesToBallsLookupBuilder(actx.context)
-
-    siq, _ = siqb(actx.queue, tree, ball_centers, ball_radii)
-    lbl, _ = lblb(actx.queue, tree, ball_centers, ball_radii)
+    siq = build_space_invader_query(actx, tree, ball_centers, ball_radii)
+    lbl = build_leaves_to_balls_lookup(actx, tree, ball_centers, ball_radii)
 
     # get data to host for test
-    tree = tree.get(queue=actx.queue)
-    siq = siq.get(queue=actx.queue)
-    lbl = lbl.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
+    siq = actx.to_numpy(siq)
+    lbl = actx.to_numpy(lbl)
 
     ball_centers = np.array([actx.to_numpy(x) for x in ball_centers])
     ball_radii = actx.to_numpy(ball_radii)
@@ -1058,16 +997,14 @@ def test_space_invader_query(actx_factory, dims, dtype, visualize=False):
 
 @pytest.mark.opencl
 @pytest.mark.parametrize("dims", [2, 3])
-def test_same_tree_with_zero_weight_particles(actx_factory, dims):
+def test_same_tree_with_zero_weight_particles(actx_factory, dims, visualize=False):
     actx = actx_factory()
 
     ntargets_values = [300, 400, 500]
     stick_out_factors = [0, 0.1, 0.3, 1]
     nsources = 20
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
+    from boxtree import build_tree
     trees = []
 
     rng = np.random.default_rng(10)
@@ -1087,18 +1024,18 @@ def test_same_tree_with_zero_weight_particles(actx_factory, dims):
             refine_weights[:nsources] = 1
             refine_weights[nsources:] = 0
 
-            tree, _ = tb(actx.queue, sources, targets=targets,
+            tree = build_tree(actx, sources, targets=targets,
                     target_radii=target_radii,
                     stick_out_factor=stick_out_factor,
                     max_leaf_refine_weight=10,
                     refine_weights=refine_weights,
                     debug=True)
-            tree = tree.get(queue=actx.queue)
+            tree = actx.to_numpy(tree)
             trees.append(tree)
 
             print("TREE:", tree.nboxes)
 
-    if 0:
+    if visualize:
         import matplotlib.pyplot as plt
         for tree in trees:
             plt.figure()
@@ -1114,13 +1051,12 @@ def test_same_tree_with_zero_weight_particles(actx_factory, dims):
 def test_max_levels_error(actx_factory):
     actx = actx_factory()
 
-    from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
+    from boxtree import build_tree
     sources = [actx.zeros(11, np.float64) for i in range(2)]
+
     from boxtree.tree_build import MaxLevelsExceeded
     with pytest.raises(MaxLevelsExceeded):
-        tree, _ = tb(actx.queue, sources, max_particles_in_box=10, debug=True)
+        build_tree(actx, sources, max_particles_in_box=10, debug=True)
 
 # }}}