diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 85050a84..6f927a59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -89,9 +89,10 @@ jobs: . ./ci-support-v0 export PYTEST_ADDOPTS="-k 'not slowtest'" - - if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "rename-nterms" ]]; then - DOWNSTREAM_PROJECT=https://github.com/gaohao95/pytential.git@rename-nterms + + if [[ "$GITHUB_HEAD_REF" == "towards-array-context" ]]; then + DOWNSTREAM_PROJECT=https://github.com/alexfikl/${DOWNSTREAM_PROJECT}.git@towards-array-context fi test_downstream "$DOWNSTREAM_PROJECT" + # vim: sw=4 diff --git a/.pylintrc-local.yml b/.pylintrc-local.yml index 3d83b68d..dc1459d9 100644 --- a/.pylintrc-local.yml +++ b/.pylintrc-local.yml @@ -1,6 +1,2 @@ - arg: extension-pkg-whitelist val: pyfmmlib - -# Needed for boxtree.tools -- arg: init-hook - val: import sys; sys.setrecursionlimit(2000) diff --git a/boxtree/__init__.py b/boxtree/__init__.py index 85080390..819206c6 100644 --- a/boxtree/__init__.py +++ b/boxtree/__init__.py @@ -21,22 +21,22 @@ """ from boxtree.tree import Tree, TreeWithLinkedPointSources, box_flags_enum -from boxtree.tree_build import TreeBuilder +from boxtree.tree_build import TreeBuilder, build_tree __all__ = [ "Tree", "TreeWithLinkedPointSources", - "TreeBuilder", "box_flags_enum"] + "TreeBuilder", "build_tree", "box_flags_enum"] __doc__ = r""" :mod:`boxtree` can do three main things: * it can sort particles into an adaptively refined quad/octree, - see :class:`boxtree.Tree` and :class:`boxtree.TreeBuilder`. + see :class:`boxtree.Tree` and :class:`boxtree.build_tree`. * it can compute fast-multipole-like interaction lists on this tree structure, - see :mod:`boxtree.traversal`. Note that while this traversal generation - builds on the result of particle sorting, - it is completely distinct in the software sense. + see :mod:`boxtree.traversal`. 
Note that, while this traversal generation + builds on the result of particle sorting, it is completely distinct in the + software sense. * It can compute geometric lookup structures based on a :class:`boxtree.Tree`, see :mod:`boxtree.area_query`. @@ -48,16 +48,16 @@ * one where no distinction is made between sources and targets. In this mode, all participants in the interaction are called 'particles'. - (``targets is None`` in the call to :meth:`boxtree.TreeBuilder.__call__`) + (``targets`` is *None* in the call to :meth:`boxtree.build_tree`) * one where a distinction between sources and targets is made. - (``targets is not None`` in the call to :meth:`boxtree.TreeBuilder.__call__`) + (``targets`` is not *None* in the call to :meth:`boxtree.build_tree`) * one where a distinction between sources and targets is made, and where sources and/or targets are considered to have an extent, given by an - :math:`l^\infty` radius. - (``targets is not None`` and ``source_radii is not None or target_radii is - not None`` in the call to :meth:`boxtree.TreeBuilder.__call__`) + :math:`\ell^p` radius. + (``targets`` is not *None* and ``source_radii`` is not *None* or *target_radii* + is not *None* in the call to :meth:`boxtree.build_tree`) If sources have an extent, it is possible to 'link' each source with a number of point sources. For this case, it is important to internalize this bit of @@ -69,9 +69,8 @@ ------------------------------- :attr:`Tree.source_radii` and :attr:`Tree.target_radii` specify the -radii of of :math:`l^\infty` 'circles' (that is, squares) centered at -:attr:`Tree.sources` and :attr:`Tree.targets` that contain the entire -extent of that source or target. +radii of :math:`\ell^p` 'circles' centered at :attr:`Tree.sources` and +:attr:`Tree.targets` that contain the entire extent of that source or target. 
:mod:`boxtree.traversal` guarantees that, in generating traversals, all interactions to targets within the source extent and from sources within the @@ -89,9 +88,9 @@ * **user target order** * **tree target order** (tree/box-sorted) -:attr:`Tree.user_source_ids` helps translate source arrays into -tree order for processing. :attr:`Tree.sorted_target_ids` -helps translate potentials back into user target order for output. +:attr:`Tree.user_source_ids` helps translate source arrays into tree order for +processing. :attr:`Tree.sorted_target_ids` helps translate potentials back into +user target order for output. If each 'original' source above is linked to a number of point sources, the point sources have their own orderings: @@ -107,24 +106,23 @@ CSR-like interaction list storage --------------------------------- -Many list-like data structures in :mod:`boxtree` consists of -two arrays, one whose name ends in ``_starts``, and another whose -name ends in ``_lists``. For example, -suppose we would like to find the colleagues of box #17 using -:attr:`boxtree.traversal.FMMTraversalInfo.colleagues_starts` +Many list-like data structures in :mod:`boxtree` consist of two arrays, one +whose name ends in ``_starts``, and another whose name ends in ``_lists``. For +example, suppose we would like to find the colleagues of box #17 using +:attr:`boxtree.traversal.FMMTraversalInfo.same_level_non_well_sep_boxes_starts` and -:attr:`boxtree.traversal.FMMTraversalInfo.colleagues_lists`. +:attr:`boxtree.traversal.FMMTraversalInfo.same_level_non_well_sep_boxes_lists`. 
The following snippet of code achieves this:: ibox = 17 - start, end = colleagues_starts[ibox:ibox+2] - ibox_colleagues = colleagues_lists[start:end] + start, end = same_level_non_well_sep_boxes_starts[ibox:ibox+2] + ibox_colleagues = same_level_non_well_sep_boxes_lists[start:end] This indexing scheme has the following properties: * If the underlying indexing array (say the list of all boxes) has *n* entries, - then the ``_starts`` array has *n+1* entries. The very last entry determines + then the ``_starts`` array has *n + 1* entries. The very last entry determines the length of the last list. * The lists in ``_lists`` are stored contiguously. The start of the next list diff --git a/boxtree/area_query.py b/boxtree/area_query.py index c49b4253..83f258d5 100644 --- a/boxtree/area_query.py +++ b/boxtree/area_query.py @@ -29,12 +29,12 @@ from pyopencl.elementwise import ElementwiseTemplate from arraycontext import Array -from pytools import memoize_method, ProcessLogger +from pytools import ProcessLogger, memoize_on_first_arg from mako.template import Template from boxtree.tree import Tree from boxtree.tools import ( - InlineBinarySearch, get_coord_vec_dtype, coord_vec_subscript_code) + inline_binary_search_for_type, get_coord_vec_dtype, coord_vec_subscript_code) from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container import logging @@ -45,116 +45,31 @@ Area queries (Balls -> overlapping leaves) ------------------------------------------ -.. autoclass:: AreaQueryBuilder - .. autoclass:: AreaQueryResult +.. autofunction:: build_area_query Inverse of area query (Leaves -> overlapping balls) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. autoclass:: LeavesToBallsLookupBuilder - .. autoclass:: LeavesToBallsLookup - +.. autofunction:: build_leaves_to_balls_lookup Space invader queries ^^^^^^^^^^^^^^^^^^^^^ -.. autoclass:: SpaceInvaderQueryBuilder - +.. 
autofunction:: build_space_invader_query Peer Lists ^^^^^^^^^^ Area queries are implemented using peer lists. -.. autoclass:: PeerListFinder - .. autoclass:: PeerListLookup - +.. autofunction:: build_peer_list """ -# {{{ output - -@dataclass_array_container -@dataclass(frozen=True) -class PeerListLookup: - """ - .. attribute:: tree - - The :class:`boxtree.Tree` instance used to build this lookup. - - .. attribute:: peer_list_starts - - Indices into :attr:`peer_lists`. - ``peer_lists[peer_list_starts[box_id]:peer_list_starts[box_id]+1]`` - contains the list of peer boxes of box `box_id`. - - .. attribute:: peer_lists - - .. versionadded:: 2016.1 - """ - - tree: Tree - peer_list_starts: Array - peer_lists: Array - - -@dataclass_array_container -@dataclass(frozen=True) -class AreaQueryResult: - """ - .. attribute:: tree - - The :class:`boxtree.Tree` instance used to build this lookup. - - .. attribute:: leaves_near_ball_starts - - Indices into :attr:`leaves_near_ball_lists`. - ``leaves_near_ball_lists[leaves_near_ball_starts[ball_nr]: - leaves_near_ball_starts[ball_nr]+1]`` - results in a list of leaf boxes that intersect `ball_nr`. - - .. attribute:: leaves_near_ball_lists - - .. versionadded:: 2016.1 - """ - - tree: Tree - leaves_near_ball_starts: Array - leaves_near_ball_lists: Array - - -@dataclass_array_container -@dataclass(frozen=True) -class LeavesToBallsLookup: - """ - .. attribute:: tree - - The :class:`boxtree.Tree` instance used to build this lookup. - - .. attribute:: balls_near_box_starts - - Indices into :attr:`balls_near_box_lists`. - ``balls_near_box_lists[balls_near_box_starts[ibox]: - balls_near_box_starts[ibox]+1]`` - results in a list of balls that overlap leaf box *ibox*. - - .. note:: Only leaf boxes have non-empty entries in this table. Nonetheless, - this list is indexed by the global box index. - - .. 
attribute:: balls_near_box_lists - """ - - tree: Tree - balls_near_box_starts: Array - balls_near_box_lists: Array - -# }}} - - # {{{ kernel templates GUIDING_BOX_FINDER_MACRO = r"""//CL:mako// @@ -471,7 +386,7 @@ class LeavesToBallsLookup: dst[i] = bsearch(starts, starts_len, i); """, name="starts_expander", - preamble=str(InlineBinarySearch("idx_t"))) + preamble=inline_binary_search_for_type("idx_t")) # }}} @@ -538,13 +453,13 @@ def generate(self, context, dimensions, coord_dtype, box_id_dtype, peer_list_idx_dtype, max_levels, extra_var_values=(), extra_type_aliases=(), - extra_preamble=""): + extra_preamble="", + root_extent_stretch_factor=1.0e-4): from pyopencl.tools import dtype_to_ctype from boxtree import box_flags_enum from boxtree.tools import AXIS_NAMES from boxtree.traversal import TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES - from boxtree.tree_build import TreeBuilder from pyopencl.cltypes import vec_types render_vars = ( @@ -561,7 +476,7 @@ def generate(self, context, ("box_flags_enum", box_flags_enum), ("peer_list_idx_dtype", peer_list_idx_dtype), ("debug", False), - ("root_extent_stretch_factor", TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR), + ("root_extent_stretch_factor", root_extent_stretch_factor), # FIXME This gets used in pytential with a template that still uses this: ("vec_types", tuple(vec_types.items())), @@ -638,153 +553,186 @@ def generate(self, context, # {{{ area query build class AreaQueryBuilder: - r"""Given a set of :math:`l^\infty` "balls", this class helps build a - look-up table from ball to leaf boxes that intersect with the ball. + def __init__(self, *args, **kwargs): + pass + + def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, + ball_centers, ball_radii, peer_lists=None, + wait_for=None): + from warnings import warn + warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. 
" + "Use 'build_area_query' instead.", + DeprecationWarning, stacklevel=2) + + result = build_area_query( + actx, tree, ball_centers, ball_radii, peer_lists) + + return result, None + + +@dataclass_array_container +@dataclass(frozen=True) +class AreaQueryResult: + """ + .. attribute:: tree + + The :class:`boxtree.Tree` instance used to build this lookup. + + .. attribute:: leaves_near_ball_starts + + Indices into :attr:`leaves_near_ball_lists`. + ``leaves_near_ball_lists[leaves_near_ball_starts[ball_nr]: + leaves_near_ball_starts[ball_nr]+1]`` + results in a list of leaf boxes that intersect `ball_nr`. + + .. attribute:: leaves_near_ball_lists .. versionadded:: 2016.1 + """ - .. automethod:: __init__ - .. automethod:: __call__ + tree: Tree + leaves_near_ball_starts: Array + leaves_near_ball_lists: Array + + +@memoize_on_first_arg +def get_area_query_kernel( + actx: PyOpenCLArrayContext, + dimensions: int, + coord_dtype: "np.dtype", + box_id_dtype: "np.dtype", + ball_id_dtype: "np.dtype", + peer_list_idx_dtype: "np.dtype", + max_levels: int, + root_extent_stretch_factor: float): + from pyopencl.tools import dtype_to_ctype + + from boxtree import box_flags_enum + from boxtree.tools import AXIS_NAMES + from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE + + logger.debug("start building area query kernel") + + template = Template( + TRAVERSAL_PREAMBLE_TEMPLATE + + AREA_QUERY_TEMPLATE, + strict_undefined=True) + + render_vars = dict( + np=np, + dimensions=dimensions, + dtype_to_ctype=dtype_to_ctype, + box_id_dtype=box_id_dtype, + particle_id_dtype=None, + coord_dtype=coord_dtype, + get_coord_vec_dtype=get_coord_vec_dtype, + cvec_sub=partial(coord_vec_subscript_code, dimensions), + max_levels=max_levels, + AXIS_NAMES=AXIS_NAMES, + box_flags_enum=box_flags_enum, + peer_list_idx_dtype=peer_list_idx_dtype, + ball_id_dtype=ball_id_dtype, + debug=False, + root_extent_stretch_factor=root_extent_stretch_factor) + + from boxtree.tools import VectorArg, ScalarArg + 
arg_decls = [ + VectorArg(coord_dtype, "box_centers", with_offset=False), + ScalarArg(coord_dtype, "root_extent"), + VectorArg(np.uint8, "box_levels"), + ScalarArg(box_id_dtype, "aligned_nboxes"), + VectorArg(box_id_dtype, "box_child_ids", with_offset=False), + VectorArg(box_flags_enum.dtype, "box_flags"), + VectorArg(peer_list_idx_dtype, "peer_list_starts"), + VectorArg(box_id_dtype, "peer_lists"), + VectorArg(coord_dtype, "ball_radii"), + ] + [ + ScalarArg(coord_dtype, "bbox_min_"+ax) + for ax in AXIS_NAMES[:dimensions] + ] + [ + VectorArg(coord_dtype, "ball_"+ax) + for ax in AXIS_NAMES[:dimensions]] + + from pyopencl.algorithm import ListOfListsBuilder + area_query_knl = ListOfListsBuilder( + actx.context, + [("leaves", box_id_dtype)], + str(template.render(**render_vars)), + arg_decls=arg_decls, + name_prefix="area_query", + count_sharing={}, + complex_kernel=True) + + logger.debug("done building area query kernel") + return area_query_knl + + +def build_area_query( + actx: PyOpenCLArrayContext, tree: Tree, + ball_centers, ball_radii, peer_lists=None) -> AreaQueryResult: + r"""Given a set of :math:`l^\infty` "balls", this class helps build a + look-up table from ball to leaf boxes that intersect with the ball. + + :arg ball_centers: an object array of coordinates. Their *dtype* must + match *tree*'s :attr:`boxtree.Tree.coord_dtype`. + :arg ball_radii: an array of positive numbers. Its *dtype* must match + *tree*'s :attr:`boxtree.Tree.coord_dtype`. + :arg peer_lists: may either be *None* or an instance of + :class:`PeerListLookup` associated with `tree`. 
""" - def __init__(self, array_context: PyOpenCLArrayContext): - self._setup_actx = array_context - self.peer_list_finder = PeerListFinder(array_context) - @property - def context(self): - return self._setup_actx.queue.context + # {{{ input check - # {{{ Kernel generation + from pytools import single_valued + if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype: + raise TypeError("ball_centers dtype must match tree.coord_dtype") - @memoize_method - def get_area_query_kernel(self, dimensions, coord_dtype, box_id_dtype, - ball_id_dtype, peer_list_idx_dtype, max_levels): - from pyopencl.tools import dtype_to_ctype + if ball_radii.dtype != tree.coord_dtype: + raise TypeError("ball_radii dtype must match tree.coord_dtype") - from boxtree import box_flags_enum - from boxtree.tools import AXIS_NAMES - from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE - from boxtree.tree_build import TreeBuilder - - logger.debug("start building area query kernel") - - template = Template( - TRAVERSAL_PREAMBLE_TEMPLATE - + AREA_QUERY_TEMPLATE, - strict_undefined=True) - - render_vars = dict( - np=np, - dimensions=dimensions, - dtype_to_ctype=dtype_to_ctype, - box_id_dtype=box_id_dtype, - particle_id_dtype=None, - coord_dtype=coord_dtype, - get_coord_vec_dtype=get_coord_vec_dtype, - cvec_sub=partial(coord_vec_subscript_code, dimensions), - max_levels=max_levels, - AXIS_NAMES=AXIS_NAMES, - box_flags_enum=box_flags_enum, - peer_list_idx_dtype=peer_list_idx_dtype, - ball_id_dtype=ball_id_dtype, - debug=False, - root_extent_stretch_factor=TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR) - - from boxtree.tools import VectorArg, ScalarArg - arg_decls = [ - VectorArg(coord_dtype, "box_centers", with_offset=False), - ScalarArg(coord_dtype, "root_extent"), - VectorArg(np.uint8, "box_levels"), - ScalarArg(box_id_dtype, "aligned_nboxes"), - VectorArg(box_id_dtype, "box_child_ids", with_offset=False), - VectorArg(box_flags_enum.dtype, "box_flags"), - VectorArg(peer_list_idx_dtype, 
"peer_list_starts"), - VectorArg(box_id_dtype, "peer_lists"), - VectorArg(coord_dtype, "ball_radii"), - ] + [ - ScalarArg(coord_dtype, "bbox_min_"+ax) - for ax in AXIS_NAMES[:dimensions] - ] + [ - VectorArg(coord_dtype, "ball_"+ax) - for ax in AXIS_NAMES[:dimensions]] - - from pyopencl.algorithm import ListOfListsBuilder - area_query_kernel = ListOfListsBuilder( - self.context, - [("leaves", box_id_dtype)], - str(template.render(**render_vars)), - arg_decls=arg_decls, - name_prefix="area_query", - count_sharing={}, - complex_kernel=True) - - logger.debug("done building area query kernel") - return area_query_kernel + from pytools import div_ceil + # Avoid generating too many kernels. + max_levels = div_ceil(tree.nlevels, 10) * 10 + + if peer_lists is None: + peer_lists = build_peer_list(actx, tree) + + if len(peer_lists.peer_list_starts) != tree.nboxes + 1: + raise ValueError("size of peer lists must match with number of boxes") + + ball_id_dtype = tree.particle_id_dtype + peer_list_idx_dtype = peer_lists.peer_list_starts.dtype # }}} - def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, - ball_centers, ball_radii, peer_lists=None, - wait_for=None): - """ - :arg ball_centers: an object array of coordinates. Their *dtype* must - match *tree*'s :attr:`boxtree.Tree.coord_dtype`. - :arg ball_radii: an array of positive numbers. Its *dtype* must match - *tree*'s :attr:`boxtree.Tree.coord_dtype`. - :arg peer_lists: may either be *None* or an instance of - :class:`PeerListLookup` associated with `tree`. - :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` - instances for whose completion this command waits before starting - exeuction. - :returns: a tuple *(aq, event)*, where *aq* is an instance of - :class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event` - for dependency management. 
- """ - - from pytools import single_valued - if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype: - raise TypeError("ball_centers dtype must match tree.coord_dtype") - if ball_radii.dtype != tree.coord_dtype: - raise TypeError("ball_radii dtype must match tree.coord_dtype") - - ball_id_dtype = tree.particle_id_dtype # ? - - from pytools import div_ceil - # Avoid generating too many kernels. - max_levels = div_ceil(tree.nlevels, 10) * 10 - - if peer_lists is None: - peer_lists, evt = self.peer_list_finder(actx, tree, wait_for=wait_for) - wait_for = [evt] - - if len(peer_lists.peer_list_starts) != tree.nboxes + 1: - raise ValueError("size of peer lists must match with number of boxes") - - area_query_kernel = self.get_area_query_kernel(tree.dimensions, - tree.coord_dtype, tree.box_id_dtype, ball_id_dtype, - peer_lists.peer_list_starts.dtype, max_levels) - - aq_plog = ProcessLogger(logger, "area query") - - result, evt = area_query_kernel( + # {{{ area query + + area_query_knl = get_area_query_kernel( + actx, + tree.dimensions, tree.coord_dtype, tree.box_id_dtype, + ball_id_dtype, peer_list_idx_dtype, max_levels, + tree.root_extent_stretch_factor) + + with ProcessLogger(logger, "area query"): + result, _ = area_query_knl( actx.queue, len(ball_radii), tree.box_centers.data, tree.root_extent, tree.box_levels, tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, peer_lists.peer_list_starts, peer_lists.peer_lists, ball_radii, - *(tuple(tree.bounding_box[0]) - + tuple(bc for bc in ball_centers)), - wait_for=wait_for) + *(tuple(tree.bounding_box[0]) + tuple(bc for bc in ball_centers)), + allocator=actx.allocator, + ) - aq_plog.done() + # }}} - result = AreaQueryResult( - tree=tree, - leaves_near_ball_starts=result["leaves"].starts, - leaves_near_ball_lists=result["leaves"].lists) + result = AreaQueryResult( + tree=tree, + leaves_near_ball_starts=result["leaves"].starts, + leaves_near_ball_lists=result["leaves"].lists) - return 
actx.freeze(result), evt + return actx.freeze(result) # }}} @@ -792,66 +740,92 @@ def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, # {{{ area query transpose (leaves-to-balls) lookup build class LeavesToBallsLookupBuilder: - r"""Given a set of :math:`l^\infty` "balls", this class helps build a - look-up table from leaf boxes to balls that overlap with each leaf box. + def __init__(self, *args, **kwargs): + pass + + def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, + ball_centers, ball_radii, peer_lists=None, + wait_for=None): + from warnings import warn + warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. " + "Use 'build_leaves_to_balls_lookup' instead.", + DeprecationWarning, stacklevel=2) + + result = build_leaves_to_balls_lookup( + actx, tree, ball_centers, ball_radii, peer_lists) + + return result, None + + +@dataclass_array_container +@dataclass(frozen=True) +class LeavesToBallsLookup: + """ + .. attribute:: tree + + The :class:`boxtree.Tree` instance used to build this lookup. + + .. attribute:: balls_near_box_starts + + Indices into :attr:`balls_near_box_lists`. + ``balls_near_box_lists[balls_near_box_starts[ibox]: + balls_near_box_starts[ibox]+1]`` + results in a list of balls that overlap leaf box *ibox*. - .. automethod:: __init__ - .. automethod:: __call__ + .. note:: Only leaf boxes have non-empty entries in this table. Nonetheless, + this list is indexed by the global box index. + .. 
attribute:: balls_near_box_lists """ - def __init__(self, array_context: PyOpenCLArrayContext): - from pyopencl.algorithm import KeyValueSorter - self._setup_actx = array_context - self.key_value_sorter = KeyValueSorter(self.context) - self.area_query_builder = AreaQueryBuilder(array_context) + tree: Tree + balls_near_box_starts: Array + balls_near_box_lists: Array + + +def build_leaves_to_balls_lookup( + actx: PyOpenCLArrayContext, tree: Tree, + ball_centers, ball_radii, peer_lists=None) -> LeavesToBallsLookup: + r"""Given a set of :math:`l^\infty` "balls", this builds a + look-up table from leaf boxes to balls that overlap with each leaf box. + + :arg ball_centers: an object array of coordinates. Their *dtype* must + match *tree*'s :attr:`boxtree.Tree.coord_dtype`. + :arg ball_radii: an array of positive numbers. Its *dtype* must match + *tree*'s :attr:`boxtree.Tree.coord_dtype`. + :arg peer_lists: may either be *None* or an instance of + :class:`PeerListLookup` associated with `tree`. 
+ """ + + # {{{ check inputs + + from pytools import single_valued + if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype: + raise TypeError("ball_centers dtype must match tree.coord_dtype") - @property - def context(self): - return self._setup_actx.queue.context + if ball_radii.dtype != tree.coord_dtype: + raise TypeError("ball_radii dtype must match tree.coord_dtype") - @memoize_method - def get_starts_expander_kernel(self, idx_dtype): - """ - Expands a "starts" array into a length starts[-1] array of increasing - indices: + # }}} + + # {{{ build lookup - Eg: [0 2 5 6] => [0 0 1 1 1 2] + from pytools import memoize_in - """ + @memoize_in(actx, (build_leaves_to_balls_lookup, tree.box_id_dtype)) + def get_starts_expander_kernel(): return STARTS_EXPANDER_TEMPLATE.build( - self.context, - type_aliases=(("idx_t", idx_dtype),)) + actx.context, + type_aliases=(("idx_t", tree.box_id_dtype),)) - def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, - ball_centers, ball_radii, peer_lists=None, - wait_for=None): - """ - :arg ball_centers: an object array of coordinates. Their *dtype* must - match *tree*'s :attr:`boxtree.Tree.coord_dtype`. - :arg ball_radii: an array of positive numbers. Its *dtype* must match - *tree*'s :attr:`boxtree.Tree.coord_dtype`. - :arg peer_lists: may either be *None* or an instance of - :class:`PeerListLookup` associated with `tree`. - :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` - instances for whose completion this command waits before starting - execution. - :returns: a tuple *(lbl, event)*, where *lbl* is an instance of - :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event` - for dependency management. 
- """ - - from pytools import single_valued - if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype: - raise TypeError("ball_centers dtype must match tree.coord_dtype") - if ball_radii.dtype != tree.coord_dtype: - raise TypeError("ball_radii dtype must match tree.coord_dtype") - - ltb_plog = ProcessLogger(logger, "leaves-to-balls lookup: run area query") - - area_query, evt = self.area_query_builder( - actx, tree, ball_centers, ball_radii, peer_lists, wait_for) - wait_for = [evt] + @memoize_in(actx, (build_leaves_to_balls_lookup, "key_value_sorter")) + def get_key_value_sorter_kernel(): + from pyopencl.algorithm import KeyValueSorter + return KeyValueSorter(actx.context) + + with ProcessLogger(logger, "leaves-to-balls lookup: run area query"): + area_query = build_area_query( + actx, tree, ball_centers, ball_radii, peer_lists) logger.debug("leaves-to-balls lookup: expand starts") @@ -866,34 +840,38 @@ def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, # # 2. Key-value sort the (ball number, box number) pairs by box number. 
- starts_expander_knl = self.get_starts_expander_kernel(tree.box_id_dtype) + starts_expander_knl = get_starts_expander_kernel() expanded_starts = actx.empty( len(area_query.leaves_near_ball_lists), tree.box_id_dtype) evt = starts_expander_knl( expanded_starts, area_query.leaves_near_ball_starts, - nballs_p_1) - wait_for = [evt] + nballs_p_1, + queue=actx.queue, + ) + expanded_starts.add_event(evt) logger.debug("leaves-to-balls lookup: key-value sort") - balls_near_box_starts, balls_near_box_lists, evt \ - = self.key_value_sorter( - actx.queue, - # keys - area_query.leaves_near_ball_lists, - # values - expanded_starts, - nkeys, starts_dtype=tree.box_id_dtype, - wait_for=wait_for) - ltb_plog.done() + sorter_knl = get_key_value_sorter_kernel() + balls_near_box_starts, balls_near_box_lists, _ = sorter_knl( + actx.queue, + # keys + area_query.leaves_near_ball_lists, + # values + expanded_starts, + nkeys, starts_dtype=tree.box_id_dtype, + allocator=actx.allocator, + ) + + # }}} - lookup = LeavesToBallsLookup( - tree=tree, - balls_near_box_starts=balls_near_box_starts, - balls_near_box_lists=balls_near_box_lists) + lookup = LeavesToBallsLookup( + tree=tree, + balls_near_box_starts=balls_near_box_starts, + balls_near_box_lists=balls_near_box_lists) - return actx.freeze(lookup), evt + return actx.freeze(lookup) # }}} @@ -901,6 +879,45 @@ def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, # {{{ space invader query build class SpaceInvaderQueryBuilder: + def __init__(self, *args, **kwargs): + pass + + def __call__(self, + actx: PyOpenCLArrayContext, tree: Tree, + ball_centers, ball_radii, peer_lists=None, wait_for=None): + from warnings import warn + warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. 
" + "Use 'build_space_invader_query' instead.", + DeprecationWarning, stacklevel=2) + + result = build_space_invader_query( + actx, tree, ball_centers, ball_radii, peer_lists) + + return result, None + + +@memoize_on_first_arg +def get_space_invader_query_kernel( + actx: PyOpenCLArrayContext, + dimensions: int, + coord_dtype: "np.dtype", + box_id_dtype: "np.dtype", + peer_list_starts_dtype: "np.dtype", + max_levels: int, + root_extent_stretch_factor: float): + return SPACE_INVADER_QUERY_TEMPLATE.generate( + actx.context, + dimensions, + coord_dtype, + box_id_dtype, + peer_list_starts_dtype, + max_levels, + root_extent_stretch_factor=root_extent_stretch_factor) + + +def build_space_invader_query( + actx: PyOpenCLArrayContext, tree: Tree, + ball_centers, ball_radii, peer_lists=None) -> Array: r""" Given a set of :math:`l^\infty` "balls", this class helps build a look-up table which maps leaf boxes to the *outer space invader distance*. @@ -916,247 +933,229 @@ class SpaceInvaderQueryBuilder: \max \left( \{ d_{\infty}(\text{center}(b), \text{center}(b^*)) : b^* \text{ is a ball}, b^* \cap b \neq \varnothing \} - \cup \{ 0 \} \right) + \cup \{ 0 \} \right). + + :arg ball_centers: an object array of coordinates. Their *dtype* must + match *tree*'s :attr:`boxtree.Tree.coord_dtype`. + :arg ball_radii: an array of positive numbers. Its *dtype* must match + *tree*'s :attr:`boxtree.Tree.coord_dtype`. + :arg peer_lists: may either be *None* or an instance of + :class:`PeerListLookup` associated with *tree*. + + :returns: an array with *dtype* same as the *tree*'s + :attr:`boxtree.Tree.coord_dtype` and its shape is *(tree.nboxes,)* + (see :attr:`boxtree.Tree.nboxes`). The entries of the array are + indexed by the global box index and are as follows: + + * if *i* is not the index of a leaf box, *sqi[i] = 0*. + * if *i* is the index of a leaf box, *sqi[i]* is the + outer space invader distance for *i*. + """ + # {{{ check inputs - .. automethod:: __init__ - .. 
automethod:: __call__ + from pytools import single_valued - """ - def __init__(self, array_context: PyOpenCLArrayContext) -> None: - self._setup_actx = array_context - self.peer_list_finder = PeerListFinder(array_context) - - @property - def context(self): - return self._setup_actx.queue.context - - # {{{ Kernel generation - - @memoize_method - def get_space_invader_query_kernel(self, dimensions, coord_dtype, - box_id_dtype, peer_list_idx_dtype, max_levels): - return SPACE_INVADER_QUERY_TEMPLATE.generate( - self.context, - dimensions, - coord_dtype, - box_id_dtype, - peer_list_idx_dtype, - max_levels) + if single_valued([bc.dtype for bc in ball_centers]) != tree.coord_dtype: + raise TypeError("ball_centers dtype must match tree.coord_dtype") + + if ball_radii.dtype != tree.coord_dtype: + raise TypeError("ball_radii dtype must match tree.coord_dtype") + + from pytools import div_ceil + # Avoid generating too many kernels. + max_levels = div_ceil(tree.nlevels, 10) * 10 + + if peer_lists is None: + peer_lists = build_peer_list(actx, tree) + + if len(peer_lists.peer_list_starts) != tree.nboxes + 1: + raise ValueError("size of peer lists must match with number of boxes") # }}} - def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, - ball_centers, ball_radii, peer_lists=None, - wait_for=None): - """ - :arg ball_centers: an object array of coordinates. Their *dtype* must - match *tree*'s :attr:`boxtree.Tree.coord_dtype`. - :arg ball_radii: an array of positive numbers. Its *dtype* must match - *tree*'s :attr:`boxtree.Tree.coord_dtype`. - :arg peer_lists: may either be *None* or an instance of - :class:`PeerListLookup` associated with *tree*. - :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` - instances for whose completion this command waits before starting - execution. - :returns: a tuple *(sqi, event)*, where *sqi* is an array and *event* - is a :class:`pyopencl.Event` for dependency management. 
The *dtype* - of *sqi* is *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape - is *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`). - The entries of *sqi* are indexed by the global box index and are - as follows: - - * if *i* is not the index of a leaf box, *sqi[i] = 0*. - * if *i* is the index of a leaf box, *sqi[i]* is the - outer space invader distance for *i*. - """ - - from pytools import single_valued - if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype: - raise TypeError("ball_centers dtype must match tree.coord_dtype") - if ball_radii.dtype != tree.coord_dtype: - raise TypeError("ball_radii dtype must match tree.coord_dtype") - - from pytools import div_ceil - # Avoid generating too many kernels. - max_levels = div_ceil(tree.nlevels, 10) * 10 - - if peer_lists is None: - peer_lists, evt = self.peer_list_finder(actx, tree, wait_for=wait_for) - wait_for = [evt] - - if len(peer_lists.peer_list_starts) != tree.nboxes + 1: - raise ValueError("size of peer lists must match with number of boxes") - - space_invader_query_kernel = self.get_space_invader_query_kernel( - tree.dimensions, tree.coord_dtype, tree.box_id_dtype, - peer_lists.peer_list_starts.dtype, max_levels) - - si_plog = ProcessLogger(logger, "space invader query") + # {{{ build query + + space_invader_query_knl = get_space_invader_query_kernel( + actx, + tree.dimensions, tree.coord_dtype, tree.box_id_dtype, + peer_lists.peer_list_starts.dtype, + max_levels, tree.root_extent_stretch_factor, + ) + with ProcessLogger(logger, "space invader query"): outer_space_invader_dists = actx.zeros(tree.nboxes, np.float32) - if not wait_for: - wait_for = [] - wait_for = (wait_for - + outer_space_invader_dists.events - + ball_radii.events - + [evt for bc in ball_centers for evt in bc.events]) - - evt = space_invader_query_kernel( + evt = space_invader_query_knl( *SPACE_INVADER_QUERY_TEMPLATE.unwrap_args( tree, peer_lists, ball_radii, outer_space_invader_dists, *tuple(bc for bc in 
ball_centers)), - wait_for=wait_for, queue=actx.queue, - range=slice(len(ball_radii))) + range=slice(len(ball_radii)), + ) + outer_space_invader_dists.add_event(evt) if tree.coord_dtype != np.dtype(np.float32): # The kernel output is always an array of float32 due to limited # support for atomic operations with float64 in OpenCL. # Here the output is cast to match the coord dtype. - outer_space_invader_dists.finish() - outer_space_invader_dists = outer_space_invader_dists.astype( - tree.coord_dtype) - evt, = outer_space_invader_dists.events + outer_space_invader_dists = ( + outer_space_invader_dists.astype(tree.coord_dtype)) - si_plog.done() + # }}} - return outer_space_invader_dists, evt + return actx.freeze(outer_space_invader_dists) # }}} # {{{ peer list build - class PeerListFinder: - """This class builds a look-up table from box numbers to peer boxes. The - full definition [1]_ of a peer box is as follows: + def __init__(self, *args, **kwargs): + pass + + def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, wait_for=None): + from warnings import warn + warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. " + "Use 'build_peer_list' instead.", + DeprecationWarning, stacklevel=2) + + return build_peer_list(actx, tree) + - Given a box :math:`b_j` in a quad-tree, :math:`b_k` is a peer box of - :math:`b_j` if it is +@dataclass_array_container +@dataclass(frozen=True) +class PeerListLookup: + """ + .. attribute:: tree - 1. adjacent to :math:`b_j`, + The :class:`boxtree.Tree` instance used to build this lookup. - 2. of at least the same size as :math:`b_j` (i.e. at the same or a - higher level than), and + .. attribute:: peer_list_starts - 3. no child of :math:`b_k` satisfies the above two criteria. + Indices into :attr:`peer_lists`. + ``peer_lists[peer_list_starts[box_id]:peer_list_starts[box_id+1]]`` + contains the list of peer boxes of box `box_id`. - .. [1] Rachh, Manas, Andreas Klöckner, and Michael O'Neil.
"Fast - algorithms for Quadrature by Expansion I: Globally valid expansions." + .. attribute:: peer_lists .. versionadded:: 2016.1 - - .. automethod:: __init__ - .. automethod:: __call__ """ - def __init__(self, array_context: PyOpenCLArrayContext): - self._setup_actx = array_context + tree: Tree + peer_list_starts: Array + peer_lists: Array - @property - def context(self): - return self._setup_actx.queue.context - # {{{ Kernel generation +@memoize_on_first_arg +def get_peer_list_finder_kernel( + actx: PyOpenCLArrayContext, + dimensions: int, + coord_dtype: "np.dtype", + box_id_dtype: "np.dtype", + max_levels: int): + from pyopencl.tools import dtype_to_ctype + + from boxtree import box_flags_enum + from boxtree.tools import AXIS_NAMES + from boxtree.traversal import ( + TRAVERSAL_PREAMBLE_TEMPLATE, HELPER_FUNCTION_TEMPLATE) + + logger.debug("start building peer list finder kernel") + + template = Template( + TRAVERSAL_PREAMBLE_TEMPLATE + + HELPER_FUNCTION_TEMPLATE + + PEER_LIST_FINDER_TEMPLATE, + strict_undefined=True) + + render_vars = dict( + np=np, + dimensions=dimensions, + dtype_to_ctype=dtype_to_ctype, + box_id_dtype=box_id_dtype, + particle_id_dtype=None, + coord_dtype=coord_dtype, + get_coord_vec_dtype=get_coord_vec_dtype, + cvec_sub=partial(coord_vec_subscript_code, dimensions), + max_levels=max_levels, + AXIS_NAMES=AXIS_NAMES, + box_flags_enum=box_flags_enum, + debug=False, + # For calls to the helper is_adjacent_or_overlapping() + targets_have_extent=False, + sources_have_extent=False) + + from boxtree.tools import VectorArg, ScalarArg + arg_decls = [ + VectorArg(coord_dtype, "box_centers", with_offset=False), + ScalarArg(coord_dtype, "root_extent"), + VectorArg(np.uint8, "box_levels"), + ScalarArg(box_id_dtype, "aligned_nboxes"), + VectorArg(box_id_dtype, "box_child_ids", with_offset=False), + VectorArg(box_flags_enum.dtype, "box_flags"), + ] + + from pyopencl.algorithm import ListOfListsBuilder + peer_list_finder_knl = ListOfListsBuilder( + 
actx.context, + [("peers", box_id_dtype)], + str(template.render(**render_vars)), + arg_decls=arg_decls, + name_prefix="find_peer_lists", + count_sharing={}, + complex_kernel=True) + + logger.debug("done building peer list finder kernel") + return peer_list_finder_knl + + +def build_peer_list(actx: PyOpenCLArrayContext, tree: Tree) -> PeerListLookup: + """Builds a look-up table from box numbers to peer boxes. The full definition + [1]_ of a peer box is as follows: + + Given a box :math:`b_j` in a quad-tree, :math:`b_k` is a peer box of + :math:`b_j` if it is + + 1. adjacent to :math:`b_j`, + + 2. of at least the same size as :math:`b_j` (i.e. at the same or a + higher level than), and + + 3. no child of :math:`b_k` satisfies the above two criteria. - @memoize_method - def get_peer_list_finder_kernel(self, dimensions, coord_dtype, - box_id_dtype, max_levels): - from pyopencl.tools import dtype_to_ctype + .. [1] Rachh, Manas, Andreas Klöckner, and Michael O'Neil. "Fast + algorithms for Quadrature by Expansion I: Globally valid expansions." 
+ """ - from boxtree import box_flags_enum - from boxtree.tools import AXIS_NAMES - from boxtree.traversal import ( - TRAVERSAL_PREAMBLE_TEMPLATE, HELPER_FUNCTION_TEMPLATE) - - logger.debug("start building peer list finder kernel") - - template = Template( - TRAVERSAL_PREAMBLE_TEMPLATE - + HELPER_FUNCTION_TEMPLATE - + PEER_LIST_FINDER_TEMPLATE, - strict_undefined=True) - - render_vars = dict( - np=np, - dimensions=dimensions, - dtype_to_ctype=dtype_to_ctype, - box_id_dtype=box_id_dtype, - particle_id_dtype=None, - coord_dtype=coord_dtype, - get_coord_vec_dtype=get_coord_vec_dtype, - cvec_sub=partial(coord_vec_subscript_code, dimensions), - max_levels=max_levels, - AXIS_NAMES=AXIS_NAMES, - box_flags_enum=box_flags_enum, - debug=False, - # For calls to the helper is_adjacent_or_overlapping() - targets_have_extent=False, - sources_have_extent=False) - - from boxtree.tools import VectorArg, ScalarArg - arg_decls = [ - VectorArg(coord_dtype, "box_centers", with_offset=False), - ScalarArg(coord_dtype, "root_extent"), - VectorArg(np.uint8, "box_levels"), - ScalarArg(box_id_dtype, "aligned_nboxes"), - VectorArg(box_id_dtype, "box_child_ids", with_offset=False), - VectorArg(box_flags_enum.dtype, "box_flags"), - ] - - from pyopencl.algorithm import ListOfListsBuilder - peer_list_finder_kernel = ListOfListsBuilder( - self.context, - [("peers", box_id_dtype)], - str(template.render(**render_vars)), - arg_decls=arg_decls, - name_prefix="find_peer_lists", - count_sharing={}, - complex_kernel=True) - - logger.debug("done building peer list finder kernel") - return peer_list_finder_kernel + from pytools import div_ceil - # }}} + # Round up level count--this gets included in the kernel as + # a stack bound. Rounding avoids too many kernel versions. 
+ max_levels = div_ceil(tree.nlevels, 10) * 10 - def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, wait_for=None): - """ - :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` - instances for whose completion this command waits before starting - execution. - :returns: a tuple *(pl, event)*, where *pl* is an instance of - :class:`PeerListLookup`, and *event* is a :class:`pyopencl.Event` - for dependency management. - """ - from pytools import div_ceil - - # Round up level count--this gets included in the kernel as - # a stack bound. Rounding avoids too many kernel versions. - max_levels = div_ceil(tree.nlevels, 10) * 10 - - peer_list_finder_kernel = self.get_peer_list_finder_kernel( - tree.dimensions, tree.coord_dtype, tree.box_id_dtype, max_levels) - - pl_plog = ProcessLogger(logger, "find peer lists") - - result, evt = peer_list_finder_kernel( + peer_list_finder_knl = get_peer_list_finder_kernel( + actx, + tree.dimensions, tree.coord_dtype, tree.box_id_dtype, + max_levels, + ) + + with ProcessLogger(logger, "find peer lists"): + result, evt = peer_list_finder_knl( actx.queue, tree.nboxes, tree.box_centers.data, tree.root_extent, tree.box_levels, tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, - wait_for=wait_for) - - pl_plog.done() + allocator=actx.allocator, + ) - lookup = PeerListLookup( - tree=tree, - peer_list_starts=result["peers"].starts, - peer_lists=result["peers"].lists) + lookup = PeerListLookup( + tree=tree, + peer_list_starts=result["peers"].starts, + peer_lists=result["peers"].lists) - return actx.freeze(lookup), evt + return actx.freeze(lookup) # }}} diff --git a/boxtree/array_context.py b/boxtree/array_context.py index ae566774..118ec27b 100644 --- a/boxtree/array_context.py +++ b/boxtree/array_context.py @@ -20,7 +20,17 @@ THE SOFTWARE. 
""" -from arraycontext import PyOpenCLArrayContext as PyOpenCLArrayContextBase +from typing import Any, List, Optional, Union + +import numpy as np + +from pyopencl.algorithm import BuiltList +from pytools.tag import ToTagSetConvertible + +from arraycontext import ( # noqa: F401 + PyOpenCLArrayContext as PyOpenCLArrayContextBase, + with_array_context, serialize_container, deserialize_container, + rec_map_array_container) from arraycontext.pytest import ( _PytestPyOpenCLArrayContextFactoryWithClass, register_pytest_array_context_factory) @@ -30,13 +40,38 @@ """ -def _acf(): - import pyopencl as cl - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) +# {{{ make_loopy_program - return PyOpenCLArrayContext(queue, force_device_scalars=True) +def make_loopy_program( + domains, statements, + kernel_data: Optional[List[Any]] = None, *, + name: str = "sumpy_loopy_kernel", + assumptions: Optional[Union[List[str], str]] = None, + tags: ToTagSetConvertible = None): + """Return a :class:`loopy.LoopKernel` suitable for use with + :meth:`arraycontext.ArrayContext.call_loopy`. + """ + if kernel_data is None: + kernel_data = [...] + + import loopy as lp + from arraycontext.loopy import _DEFAULT_LOOPY_OPTIONS + + return lp.make_kernel( + domains, + statements, + kernel_data=kernel_data, + options=_DEFAULT_LOOPY_OPTIONS, + default_offset=lp.auto, + name=name, + lang_version=lp.MOST_RECENT_LANGUAGE_VERSION, + assumptions=assumptions, + tags=tags) +# }}} + + +# {{{ array context class PyOpenCLArrayContext(PyOpenCLArrayContextBase): def transform_loopy_program(self, t_unit): @@ -49,7 +84,137 @@ def transform_loopy_program(self, t_unit): "Did you use arraycontext.make_loopy_program " "to create this kernel?") - return super().transform_loopy_program(t_unit) + return t_unit + + # NOTE: _rec_map_container is copied from arraycontext wholesale and should + # be kept in sync as much as possible! 
+ + def _rec_map_container(self, func, array, allowed_types=None, *, + default_scalar=None, strict=False): + import arraycontext.impl.pyopencl.taggable_cl_array as tga + + if allowed_types is None: + allowed_types = (tga.TaggableCLArray,) + + def _wrapper(ary): + # NOTE: this is copied verbatim from arraycontext and this is the + # only change to allow optional fields inside containers + if ary is None: + return ary + + if isinstance(ary, allowed_types): + return func(ary) + elif not strict and isinstance(ary, self.array_types): + from warnings import warn + warn(f"Invoking {type(self).__name__}.{func.__name__[1:]} with " + f"{type(ary).__name__} will be unsupported in 2023. Use " + "'to_tagged_cl_array' to convert instances to TaggableCLArray.", + DeprecationWarning, stacklevel=2) + return func(tga.to_tagged_cl_array(ary)) + elif np.isscalar(ary): + if default_scalar is None: + return ary + else: + return np.array(ary).dtype.type(default_scalar) + else: + raise TypeError( + f"{type(self).__name__}.{func.__name__[1:]} invoked with " + f"an unsupported array type: got '{type(ary).__name__}', " + f"but expected one of {allowed_types}") + + return rec_map_array_container(_wrapper, array) + +# }}} + + +# {{{ dataclass array container + +def dataclass_array_container(cls: type) -> type: + """A decorator based on :func:`arraycontext.dataclass_array_container` + that allows :class:`typing.Optional` containers. 
+ """ + + from dataclasses import Field, fields, is_dataclass + from typing import Union, get_args + try: + # NOTE: only available in python >= 3.8 + from typing import get_origin + except ImportError: + from typing_extensions import get_origin + + from arraycontext.container.dataclass import ( + is_array_type, inject_dataclass_serialization) + + assert is_dataclass(cls) + + def is_array_field(f: Field) -> bool: + if __debug__: + if not f.init: + raise ValueError( + f"Fields with 'init=False' not allowed: '{f.name}'") + + if isinstance(f.type, str): + raise TypeError( + f"String annotation on field '{f.name}' not supported") + + origin = get_origin(f.type) + if origin is Union: + # NOTE: `Optional` is caught in here as an alias for `Union[None, type]` + return all( + is_array_type(arg) or isinstance(arg, type(None)) + for arg in get_args(f.type)) + + from typing import _GenericAlias, _SpecialForm # type: ignore[attr-defined] + if isinstance(f.type, (_GenericAlias, _SpecialForm)): + return False + + return is_array_type(f.type) + + from pytools import partition + array_fields, non_array_fields = partition(is_array_field, fields(cls)) + + if not array_fields: + raise ValueError(f"'{cls}' must have fields with array container type " + "in order to use the 'dataclass_array_container' decorator") + + return inject_dataclass_serialization(cls, array_fields, non_array_fields) + +# }}} + + +# {{{ serialization + +# NOTE: BuiltList is serialized explicitly here to avoid monkeypatching the +# version in pyopencl (dataclass_array_container modifies the class) + +@serialize_container.register(BuiltList) +def _serialize_built_list(obj: BuiltList): + return tuple([ + ("starts", obj.starts), + ("lists", obj.lists), + ("nonempty_indices", obj.nonempty_indices), + ("compressed_indices", obj.compressed_indices), + ]) + + +@deserialize_container.register(BuiltList) +def _deserialize_built_list(template: BuiltList, iterable): + return type(template)( + count=template.count, +
num_nonempty_lists=template.num_nonempty_lists, + **dict(iterable)) + +# }}} + + +# {{{ pytest + +def _acf(): + import pyopencl as cl + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + return PyOpenCLArrayContext(queue, force_device_scalars=True) class PytestPyOpenCLArrayContextFactory( @@ -59,3 +224,5 @@ class PytestPyOpenCLArrayContextFactory( register_pytest_array_context_factory("boxtree.pyopencl", PytestPyOpenCLArrayContextFactory) + +# }}} diff --git a/boxtree/bounding_box.py b/boxtree/bounding_box.py index 236cdf0e..9f44de0f 100644 --- a/boxtree/bounding_box.py +++ b/boxtree/bounding_box.py @@ -23,23 +23,25 @@ import numpy as np from pyopencl.reduction import ReductionTemplate -from pytools import memoize, memoize_method +from pytools import memoize, memoize_on_first_arg from boxtree.tools import get_type_moniker from boxtree.array_context import PyOpenCLArrayContext +# {{{ kernel template + @memoize def make_bounding_box_dtype(device, dimensions, coord_dtype): from boxtree.tools import AXIS_NAMES fields = [] - for i in range(dimensions): - fields.append(("min_%s" % AXIS_NAMES[i], coord_dtype)) - fields.append(("max_%s" % AXIS_NAMES[i], coord_dtype)) + for ax in AXIS_NAMES: + fields.append((f"min_{ax}", coord_dtype)) + fields.append((f"max_{ax}", coord_dtype)) dtype = np.dtype(fields) - name = "boxtree_bbox_%dd_%s_t" % (dimensions, get_type_moniker(coord_dtype)) + name = "boxtree_bbox_{}d_{}_t".format(dimensions, get_type_moniker(coord_dtype)) from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct dtype, c_decl = match_dtype_to_c_struct(device, name, dtype) @@ -118,60 +120,62 @@ def make_bounding_box_dtype(device, dimensions, coord_dtype): """, name_prefix="bounding_box") +# }}} + -class BoundingBoxFinder: - def __init__(self, array_context: PyOpenCLArrayContext): - self._setup_actx = array_context - - for dev in self.context.devices: - if (dev.vendor == "Intel(R) Corporation" - and dev.version == "OpenCL 1.2 (Build 
56860)"): - raise RuntimeError("bounding box finder does not work " - "properly with this CL runtime.") - - @property - def context(self): - return self._setup_actx.queue.context - - @memoize_method - def get_kernel(self, dimensions, coord_dtype, have_radii): - # FIXME: Why does this just use `devices[0]`? - bbox_dtype, bbox_cdecl = make_bounding_box_dtype( - self.context.devices[0], dimensions, coord_dtype) - - from boxtree.tools import AXIS_NAMES - return BBOX_REDUCTION_TPL.build( - self.context, - type_aliases=( - ("reduction_t", bbox_dtype), - ("bbox_t", bbox_dtype), - ("coord_t", coord_dtype), - ), - var_values=( - ("axis_names", AXIS_NAMES[:dimensions]), - ("dimensions", dimensions), - ("coord_dtype", coord_dtype), - ("have_radii", have_radii), - ("np", np), - ) - ) +# {{{ find_bounding_box - def __call__(self, actx, particles, radii, wait_for=None): - dimensions = len(particles) +@memoize_on_first_arg +def get_bounding_box_kernel( + actx: PyOpenCLArrayContext, + dimensions: int, + coord_dtype: "np.dtype", + have_radii: bool): + bbox_dtype, bbox_cdecl = make_bounding_box_dtype( + actx.queue.device, dimensions, coord_dtype) - from pytools import single_valued - coord_dtype = single_valued(coord.dtype for coord in particles) + from boxtree.tools import AXIS_NAMES + return BBOX_REDUCTION_TPL.build( + actx.context, + type_aliases=( + ("reduction_t", bbox_dtype), + ("bbox_t", bbox_dtype), + ("coord_t", coord_dtype), + ), + var_values=( + ("axis_names", AXIS_NAMES[:dimensions]), + ("dimensions", dimensions), + ("coord_dtype", coord_dtype), + ("have_radii", have_radii), + ("np", np), + ) + ) - if radii is None: - radii_tuple = () - else: - radii_tuple = (radii,) - knl = self.get_kernel(dimensions, coord_dtype, have_radii=radii is not None) - return knl( - *(tuple(particles) + radii_tuple), - queue=actx.queue, - wait_for=wait_for, return_event=True) +def find_bounding_box(actx: PyOpenCLArrayContext, particles, radii): + dev = actx.queue.device + if (dev.vendor == 
"Intel(R) Corporation" + and dev.version == "OpenCL 1.2 (Build 56860)"): + raise RuntimeError( + f"'find_bounding_box' does not work properly with " + f"this CL runtime: {dev}") + + from pytools import single_valued + dimensions = len(particles) + coord_dtype = single_valued(coord.dtype for coord in particles) + have_radii = radii is not None + + if radii is None: + radii_tuple = () + else: + radii_tuple = (radii,) + + knl = get_bounding_box_kernel(actx, dimensions, coord_dtype, have_radii) + return knl( + *(tuple(particles) + radii_tuple), + queue=actx.queue, + allocator=actx.allocator, + ) # }}} diff --git a/boxtree/constant_one.py b/boxtree/constant_one.py index 4e61f736..958989a1 100644 --- a/boxtree/constant_one.py +++ b/boxtree/constant_one.py @@ -26,6 +26,8 @@ """ import numpy as np + +from boxtree.array_context import PyOpenCLArrayContext from boxtree.fmm import TreeIndependentDataForWrangler, ExpansionWranglerInterface from boxtree.timing import DummyTimingFuture @@ -83,7 +85,9 @@ def local_expansions_view(self, local_exps, level): def timing_future(ops): return DummyTimingFuture.from_op_count(ops) - def form_multipoles(self, level_start_source_box_nrs, source_boxes, + def form_multipoles(self, actx: PyOpenCLArrayContext, + level_start_source_box_nrs, + source_boxes, src_weight_vecs): src_weights, = src_weight_vecs mpoles = self.multipole_expansion_zeros() @@ -96,8 +100,10 @@ def form_multipoles(self, level_start_source_box_nrs, source_boxes, return mpoles, self.timing_future(ops) - def coarsen_multipoles(self, level_start_source_parent_box_nrs, - source_parent_boxes, mpoles): + def coarsen_multipoles(self, actx: PyOpenCLArrayContext, + level_start_source_parent_box_nrs, + source_parent_boxes, + mpoles): tree = self.tree ops = 0 @@ -119,7 +125,8 @@ def coarsen_multipoles(self, level_start_source_parent_box_nrs, return mpoles, self.timing_future(ops) - def eval_direct(self, target_boxes, neighbor_sources_starts, + def eval_direct(self, actx: 
PyOpenCLArrayContext, + target_boxes, neighbor_sources_starts, neighbor_sources_lists, src_weight_vecs): src_weights, = src_weight_vecs pot = self.output_zeros() @@ -144,6 +151,7 @@ def eval_direct(self, target_boxes, neighbor_sources_starts, return pot, self.timing_future(ops) def multipole_to_local(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, starts, lists, mpole_exps): @@ -164,7 +172,9 @@ def multipole_to_local(self, return local_exps, self.timing_future(ops) def eval_multipoles(self, - target_boxes_by_source_level, from_sep_smaller_nonsiblings_by_level, + actx: PyOpenCLArrayContext, + target_boxes_by_source_level, + from_sep_smaller_nonsiblings_by_level, mpole_exps): pot = self.output_zeros() ops = 0 @@ -186,8 +196,10 @@ def eval_multipoles(self, return pot, self.timing_future(ops) def form_locals(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, - target_or_target_parent_boxes, starts, lists, src_weight_vecs): + target_or_target_parent_boxes, + starts, lists, src_weight_vecs): src_weights, = src_weight_vecs local_exps = self.local_expansion_zeros() ops = 0 @@ -209,7 +221,9 @@ def form_locals(self, return local_exps, self.timing_future(ops) - def refine_locals(self, level_start_target_or_target_parent_box_nrs, + def refine_locals(self, + actx: PyOpenCLArrayContext, + level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, local_exps): ops = 0 @@ -222,7 +236,10 @@ def refine_locals(self, level_start_target_or_target_parent_box_nrs, return local_exps, self.timing_future(ops) - def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): + def eval_locals(self, + actx: PyOpenCLArrayContext, + level_start_target_box_nrs, + target_boxes, local_exps): pot = self.output_zeros() ops = 0 @@ -233,7 +250,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): return pot, self.timing_future(ops) - def 
finalize_potentials(self, potentials, template_ary): + def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials): return potentials # }}} diff --git a/boxtree/cost.py b/boxtree/cost.py index e6d43e63..1e026b52 100644 --- a/boxtree/cost.py +++ b/boxtree/cost.py @@ -69,7 +69,7 @@ from mako.template import Template from pymbolic import var, evaluate -from pytools import memoize_method +from pytools import memoize_in from boxtree.array_context import PyOpenCLArrayContext @@ -242,7 +242,6 @@ def process_form_multipoles(self, actx: PyOpenCLArrayContext, :return: an array of shape (nsource_boxes,), with each entry represents the cost of the box. """ - pass @abstractmethod def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext, @@ -259,7 +258,6 @@ def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext, immediate clear how per-box cost of upward propagation will be useful for distributed load balancing. """ - pass @abstractmethod def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext, @@ -271,7 +269,6 @@ def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext, :return: an array of shape (ntarget_boxes,), with each entry representing the number of direct evaluation sources for that target box. """ - pass @abstractmethod def process_direct(self, actx: PyOpenCLArrayContext, @@ -292,7 +289,6 @@ def process_direct(self, actx: PyOpenCLArrayContext, :return: an array of shape (ntarget_boxes,), with each entry represents the cost of the box. """ - pass @abstractmethod def process_list2(self, actx: PyOpenCLArrayContext, traversal, m2l_cost): @@ -304,7 +300,6 @@ def process_list2(self, actx: PyOpenCLArrayContext, traversal, m2l_cost): each entry representing the cost of multipole-to-local translations to this box. 
""" - pass @abstractmethod def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost, @@ -322,7 +317,6 @@ def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost, cost of evaluating all targets inside this box from multipole expansions of list-3 boxes. """ - pass @abstractmethod def process_list4(self, actx: PyOpenCLArrayContext, traversal, p2l_cost): @@ -335,7 +329,6 @@ def process_list4(self, actx: PyOpenCLArrayContext, traversal, p2l_cost): each entry representing the cost of point-to-local translations to this box. """ - pass @abstractmethod def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost, @@ -352,7 +345,6 @@ def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost, :return: an array of shape (ntarget_boxes,), the cost of evaluating the potentials of all targets inside this box from its local expansion. """ - pass @abstractmethod def process_refine_locals(self, actx: PyOpenCLArrayContext, traversal, l2l_cost): @@ -368,7 +360,6 @@ def process_refine_locals(self, actx: PyOpenCLArrayContext, traversal, l2l_cost) immediate clear how per-box cost of downward propagation will be useful for distributed load balancing. """ - pass @abstractmethod def aggregate_over_boxes(self, actx: PyOpenCLArrayContext, per_box_result): @@ -377,7 +368,6 @@ def aggregate_over_boxes(self, actx: PyOpenCLArrayContext, per_box_result): :arg per_box_result: an array to be sumed. :return: a :class:`float`, the result of the sum. """ - pass @staticmethod def cost_factors_to_dev(cost_factors, actx: Optional[PyOpenCLArrayContext]): @@ -449,7 +439,6 @@ def zero_cost_per_box(self, actx: PyOpenCLArrayContext, nboxes): :param nboxes: the number of boxes :return: an array of shape (*nboxes*,), representing the zero per-box cost. 
""" - pass def cost_per_box(self, actx: PyOpenCLArrayContext, traversal, level_to_order, calibration_params, @@ -730,41 +719,46 @@ class FMMCostModel(AbstractFMMCostModel): # {{{ form multipoles - @memoize_method - def process_form_multipoles_knl(self, actx: PyOpenCLArrayContext, + def process_form_multipoles_kernel(self, actx: PyOpenCLArrayContext, box_id_dtype, particle_id_dtype, box_level_dtype): - return ElementwiseKernel( - actx.context, - Template(r""" - double *np2m, - ${box_id_t} *source_boxes, - ${particle_id_t} *box_source_counts_nonchild, - ${box_level_t} *box_levels, - double *p2m_cost - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - particle_id_t=dtype_to_ctype(particle_id_dtype), - box_level_t=dtype_to_ctype(box_level_dtype) - ), - Template(r""" - ${box_id_t} box_idx = source_boxes[i]; - ${particle_id_t} nsources = box_source_counts_nonchild[box_idx]; - ${box_level_t} ilevel = box_levels[box_idx]; - np2m[i] = nsources * p2m_cost[ilevel]; - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - particle_id_t=dtype_to_ctype(particle_id_dtype), - box_level_t=dtype_to_ctype(box_level_dtype) - ), - name="process_form_multipoles" - ) + @memoize_in(actx, ( + FMMCostModel.process_form_multipoles_kernel, + box_id_dtype, particle_id_dtype, box_level_dtype)) + def get_kernel(): + return ElementwiseKernel( + actx.context, + Template(r""" + double *np2m, + ${box_id_t} *source_boxes, + ${particle_id_t} *box_source_counts_nonchild, + ${box_level_t} *box_levels, + double *p2m_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + Template(r""" + ${box_id_t} box_idx = source_boxes[i]; + ${particle_id_t} nsources = box_source_counts_nonchild[box_idx]; + ${box_level_t} ilevel = box_levels[box_idx]; + np2m[i] = nsources * p2m_cost[ilevel]; + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + 
box_level_t=dtype_to_ctype(box_level_dtype) + ), + name="process_form_multipoles" + ) + + return get_kernel() def process_form_multipoles(self, actx, traversal, p2m_cost): tree = traversal.tree np2m = actx.zeros(len(traversal.source_boxes), dtype=np.float64) - process_form_multipoles_knl = self.process_form_multipoles_knl( + process_form_multipoles_knl = self.process_form_multipoles_kernel( actx, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype ) @@ -783,57 +777,62 @@ def process_form_multipoles(self, actx, traversal, p2m_cost): # {{{ propagate multipoles upward - @memoize_method - def process_coarsen_multipoles_knl(self, actx: PyOpenCLArrayContext, - ndimensions, box_id_dtype, - box_level_dtype, nlevels): - return ElementwiseKernel( - actx.context, - Template(r""" - ${box_id_t} *source_parent_boxes, - ${box_level_t} *box_levels, - double *m2m_cost, - double *nm2m, - % for i in range(2**ndimensions): - % if i == 2**ndimensions - 1: - ${box_id_t} *box_child_ids_${i} - % else: - ${box_id_t} *box_child_ids_${i}, - % endif - % endfor - """).render( - ndimensions=ndimensions, - box_id_t=dtype_to_ctype(box_id_dtype), - box_level_t=dtype_to_ctype(box_level_dtype) - ), - Template(r""" - ${box_id_t} box_idx = source_parent_boxes[i]; - ${box_level_t} target_level = box_levels[box_idx]; - if(target_level <= 1) { - nm2m[i] = 0.0; - } else { - int nchild = 0; + def process_coarsen_multipoles_kernel(self, actx: PyOpenCLArrayContext, + ndimensions, box_id_dtype, + box_level_dtype, nlevels): + @memoize_in(actx, ( + FMMCostModel.process_coarsen_multipoles_kernel, + ndimensions, box_id_dtype, box_level_dtype, nlevels)) + def get_kernel(): + return ElementwiseKernel( + actx.context, + Template(r""" + ${box_id_t} *source_parent_boxes, + ${box_level_t} *box_levels, + double *m2m_cost, + double *nm2m, % for i in range(2**ndimensions): - if(box_child_ids_${i}[box_idx]) - nchild += 1; + % if i == 2**ndimensions - 1: + ${box_id_t} *box_child_ids_${i} + % else: + ${box_id_t} 
*box_child_ids_${i}, + % endif % endfor - nm2m[i] = nchild * m2m_cost[target_level]; - } - """).render( - ndimensions=ndimensions, - box_id_t=dtype_to_ctype(box_id_dtype), - box_level_t=dtype_to_ctype(box_level_dtype), - nlevels=nlevels - ), - name="process_coarsen_multipoles" - ) + """).render( + ndimensions=ndimensions, + box_id_t=dtype_to_ctype(box_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + Template(r""" + ${box_id_t} box_idx = source_parent_boxes[i]; + ${box_level_t} target_level = box_levels[box_idx]; + if(target_level <= 1) { + nm2m[i] = 0.0; + } else { + int nchild = 0; + % for i in range(2**ndimensions): + if(box_child_ids_${i}[box_idx]) + nchild += 1; + % endfor + nm2m[i] = nchild * m2m_cost[target_level]; + } + """).render( + ndimensions=ndimensions, + box_id_t=dtype_to_ctype(box_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype), + nlevels=nlevels + ), + name="process_coarsen_multipoles" + ) + + return get_kernel() def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext, traversal, m2m_cost): tree = traversal.tree nm2m = actx.zeros(len(traversal.source_parent_boxes), dtype=np.float64) - process_coarsen_multipoles_knl = self.process_coarsen_multipoles_knl( + process_coarsen_multipoles_knl = self.process_coarsen_multipoles_kernel( actx, tree.dimensions, tree.box_id_dtype, tree.box_level_dtype, tree.nlevels ) @@ -844,7 +843,7 @@ def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext, m2m_cost, nm2m, *tree.box_child_ids, - queue=actx.queue + queue=actx.queue, ) return self.aggregate_over_boxes(actx, nm2m) @@ -853,42 +852,47 @@ def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext, # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) - @memoize_method - def _get_ndirect_sources_knl(self, actx: PyOpenCLArrayContext, + def _get_ndirect_sources_kernel(self, actx: PyOpenCLArrayContext, particle_id_dtype, box_id_dtype): - return ElementwiseKernel( - actx.context, - Template(""" - 
${particle_id_t} *ndirect_sources_by_itgt_box, - ${box_id_t} *source_boxes_starts, - ${box_id_t} *source_boxes_lists, - ${particle_id_t} *box_source_counts_nonchild - """).render( - particle_id_t=dtype_to_ctype(particle_id_dtype), - box_id_t=dtype_to_ctype(box_id_dtype) - ), - Template(r""" - ${particle_id_t} nsources = 0; - ${box_id_t} source_boxes_start_idx = source_boxes_starts[i]; - ${box_id_t} source_boxes_end_idx = source_boxes_starts[i + 1]; - - for(${box_id_t} cur_source_boxes_idx = source_boxes_start_idx; - cur_source_boxes_idx < source_boxes_end_idx; - cur_source_boxes_idx++) - { - ${box_id_t} cur_source_box = source_boxes_lists[ - cur_source_boxes_idx - ]; - nsources += box_source_counts_nonchild[cur_source_box]; - } - - ndirect_sources_by_itgt_box[i] += nsources; - """).render( - particle_id_t=dtype_to_ctype(particle_id_dtype), - box_id_t=dtype_to_ctype(box_id_dtype) - ), - name="get_ndirect_sources" - ) + @memoize_in(actx, ( + FMMCostModel._get_ndirect_sources_kernel, + particle_id_dtype, box_id_dtype)) + def get_kernel(): + return ElementwiseKernel( + actx.context, + Template(""" + ${particle_id_t} *ndirect_sources_by_itgt_box, + ${box_id_t} *source_boxes_starts, + ${box_id_t} *source_boxes_lists, + ${particle_id_t} *box_source_counts_nonchild + """).render( + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_id_t=dtype_to_ctype(box_id_dtype) + ), + Template(r""" + ${particle_id_t} nsources = 0; + ${box_id_t} source_boxes_start_idx = source_boxes_starts[i]; + ${box_id_t} source_boxes_end_idx = source_boxes_starts[i + 1]; + + for(${box_id_t} cur_source_boxes_idx = source_boxes_start_idx; + cur_source_boxes_idx < source_boxes_end_idx; + cur_source_boxes_idx++) + { + ${box_id_t} cur_source_box = source_boxes_lists[ + cur_source_boxes_idx + ]; + nsources += box_source_counts_nonchild[cur_source_box]; + } + + ndirect_sources_by_itgt_box[i] += nsources; + """).render( + particle_id_t=dtype_to_ctype(particle_id_dtype), + 
box_id_t=dtype_to_ctype(box_id_dtype) + ), + name="get_ndirect_sources" + ) + + return get_kernel() def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext, traversal): @@ -897,7 +901,7 @@ def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext, particle_id_dtype = tree.particle_id_dtype box_id_dtype = tree.box_id_dtype - get_ndirect_sources_knl = self._get_ndirect_sources_knl( + get_ndirect_sources_knl = self._get_ndirect_sources_kernel( actx, particle_id_dtype, box_id_dtype ) @@ -910,7 +914,8 @@ def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext, ndirect_sources_by_itgt_box, traversal.neighbor_source_boxes_starts, traversal.neighbor_source_boxes_lists, - tree.box_source_counts_nonchild + tree.box_source_counts_nonchild, + queue=actx.queue, ) # List 3 close @@ -920,7 +925,8 @@ def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext, ndirect_sources_by_itgt_box, traversal.from_sep_close_smaller_starts, traversal.from_sep_close_smaller_lists, - tree.box_source_counts_nonchild + tree.box_source_counts_nonchild, + queue=actx.queue, ) # List 4 close @@ -930,7 +936,8 @@ def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext, ndirect_sources_by_itgt_box, traversal.from_sep_close_bigger_starts, traversal.from_sep_close_bigger_lists, - tree.box_source_counts_nonchild + tree.box_source_counts_nonchild, + queue=actx.queue, ) return ndirect_sources_by_itgt_box @@ -950,33 +957,37 @@ def process_direct(self, actx: PyOpenCLArrayContext, # {{{ translate separated siblings' ("list 2") mpoles to local - @memoize_method - def process_list2_knl(self, actx: PyOpenCLArrayContext, - box_id_dtype, box_level_dtype): - return ElementwiseKernel( - actx.context, - Template(r""" - double *nm2l, - ${box_id_t} *target_or_target_parent_boxes, - ${box_id_t} *from_sep_siblings_starts, - ${box_level_t} *box_levels, - double *m2l_cost - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - 
box_level_t=dtype_to_ctype(box_level_dtype) - ), - Template(r""" - ${box_id_t} start = from_sep_siblings_starts[i]; - ${box_id_t} end = from_sep_siblings_starts[i+1]; - ${box_level_t} ilevel = box_levels[target_or_target_parent_boxes[i]]; - - nm2l[i] = (end - start) * m2l_cost[ilevel]; - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - box_level_t=dtype_to_ctype(box_level_dtype) - ), - name="process_list2" - ) + def process_list2_kernel(self, actx: PyOpenCLArrayContext, + box_id_dtype, box_level_dtype): + @memoize_in(actx, ( + FMMCostModel.process_list2_kernel, box_id_dtype, box_level_dtype)) + def get_kernel(): + return ElementwiseKernel( + actx.context, + Template(r""" + double *nm2l, + ${box_id_t} *target_or_target_parent_boxes, + ${box_id_t} *from_sep_siblings_starts, + ${box_level_t} *box_levels, + double *m2l_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + Template(r""" + ${box_id_t} start = from_sep_siblings_starts[i]; + ${box_id_t} end = from_sep_siblings_starts[i+1]; + ${box_level_t} ilevel = box_levels[target_or_target_parent_boxes[i]]; + + nm2l[i] = (end - start) * m2l_cost[ilevel]; + """).render( # noqa: E501 + box_id_t=dtype_to_ctype(box_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + name="process_list2" + ) + + return get_kernel() def process_list2(self, actx, traversal, m2l_cost): tree = traversal.tree @@ -986,7 +997,7 @@ def process_list2(self, actx, traversal, m2l_cost): ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes) nm2l = actx.zeros((ntarget_or_target_parent_boxes,), dtype=np.float64) - process_list2_knl = self.process_list2_knl( + process_list2_knl = self.process_list2_kernel( actx, box_id_dtype, box_level_dtype ) process_list2_knl( @@ -1004,35 +1015,40 @@ def process_list2(self, actx, traversal, m2l_cost): # {{{ evaluate sep. 
smaller mpoles ("list 3") at particles - @memoize_method - def process_list3_knl(self, actx: PyOpenCLArrayContext, - box_id_dtype, particle_id_dtype): - return ElementwiseKernel( - actx.context, - Template(r""" - ${box_id_t} *target_boxes_sep_smaller, - ${box_id_t} *sep_smaller_start, - ${particle_id_t} *box_target_counts_nonchild, - double m2p_cost_current_level, - double *nm2p - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - particle_id_t=dtype_to_ctype(particle_id_dtype) - ), - Template(r""" - ${box_id_t} target_box = target_boxes_sep_smaller[i]; - ${box_id_t} start = sep_smaller_start[i]; - ${box_id_t} end = sep_smaller_start[i+1]; - ${particle_id_t} ntargets = box_target_counts_nonchild[target_box]; - nm2p[target_box] += ( - ntargets * (end - start) * m2p_cost_current_level - ); - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - particle_id_t=dtype_to_ctype(particle_id_dtype) - ), - name="process_list3" - ) + def process_list3_kernel(self, actx: PyOpenCLArrayContext, + box_id_dtype, particle_id_dtype): + @memoize_in(actx, ( + FMMCostModel.process_list3_kernel, + box_id_dtype, particle_id_dtype)) + def get_kernel(): + return ElementwiseKernel( + actx.context, + Template(r""" + ${box_id_t} *target_boxes_sep_smaller, + ${box_id_t} *sep_smaller_start, + ${particle_id_t} *box_target_counts_nonchild, + double m2p_cost_current_level, + double *nm2p + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype) + ), + Template(r""" + ${box_id_t} target_box = target_boxes_sep_smaller[i]; + ${box_id_t} start = sep_smaller_start[i]; + ${box_id_t} end = sep_smaller_start[i+1]; + ${particle_id_t} ntargets = box_target_counts_nonchild[target_box]; + nm2p[target_box] += ( + ntargets * (end - start) * m2p_cost_current_level + ); + """).render( # noqa: E501 + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype) + ), + name="process_list3" + ) + + return get_kernel() def 
process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost, box_target_counts_nonchild=None): @@ -1042,7 +1058,7 @@ def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost, if box_target_counts_nonchild is None: box_target_counts_nonchild = tree.box_target_counts_nonchild - process_list3_knl = self.process_list3_knl( + process_list3_knl = self.process_list3_kernel( actx, tree.box_id_dtype, tree.particle_id_dtype ) @@ -1054,7 +1070,7 @@ def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost, box_target_counts_nonchild, actx.to_numpy(m2p_cost[ilevel]).reshape(-1)[0], nm2p, - queue=actx.queue + queue=actx.queue, ) return nm2p @@ -1063,46 +1079,51 @@ def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost, # {{{ form locals for separated bigger source boxes ("list 4") - @memoize_method - def process_list4_knl(self, actx: PyOpenCLArrayContext, - box_id_dtype, particle_id_dtype, box_level_dtype): - return ElementwiseKernel( - actx.context, - Template(r""" - double *nm2p, - ${box_id_t} *from_sep_bigger_starts, - ${box_id_t} *from_sep_bigger_lists, - ${particle_id_t} *box_source_counts_nonchild, - ${box_level_t} *box_levels, - double *p2l_cost - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - particle_id_t=dtype_to_ctype(particle_id_dtype), - box_level_t=dtype_to_ctype(box_level_dtype) - ), - Template(r""" - ${box_id_t} start = from_sep_bigger_starts[i]; - ${box_id_t} end = from_sep_bigger_starts[i+1]; - for(${box_id_t} idx=start; idx < end; idx++) { - ${box_id_t} src_ibox = from_sep_bigger_lists[idx]; - ${particle_id_t} nsources = box_source_counts_nonchild[src_ibox]; - ${box_level_t} ilevel = box_levels[src_ibox]; - nm2p[i] += nsources * p2l_cost[ilevel]; - } - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - particle_id_t=dtype_to_ctype(particle_id_dtype), - box_level_t=dtype_to_ctype(box_level_dtype) - ), - name="process_list4" - ) + def process_list4_kernel(self, actx: PyOpenCLArrayContext, + 
box_id_dtype, particle_id_dtype, box_level_dtype): + @memoize_in(actx, ( + FMMCostModel.process_list4_kernel, + box_id_dtype, particle_id_dtype, box_level_dtype)) + def get_kernel(): + return ElementwiseKernel( + actx.context, + Template(r""" + double *nm2p, + ${box_id_t} *from_sep_bigger_starts, + ${box_id_t} *from_sep_bigger_lists, + ${particle_id_t} *box_source_counts_nonchild, + ${box_level_t} *box_levels, + double *p2l_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + Template(r""" + ${box_id_t} start = from_sep_bigger_starts[i]; + ${box_id_t} end = from_sep_bigger_starts[i+1]; + for(${box_id_t} idx=start; idx < end; idx++) { + ${box_id_t} src_ibox = from_sep_bigger_lists[idx]; + ${particle_id_t} nsources = box_source_counts_nonchild[src_ibox]; + ${box_level_t} ilevel = box_levels[src_ibox]; + nm2p[i] += nsources * p2l_cost[ilevel]; + } + """).render( # noqa: E501 + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + name="process_list4" + ) + + return get_kernel() def process_list4(self, actx, traversal, p2l_cost): tree = traversal.tree target_or_target_parent_boxes = traversal.target_or_target_parent_boxes nm2p = actx.zeros(len(target_or_target_parent_boxes), dtype=np.float64) - process_list4_knl = self.process_list4_knl( + process_list4_knl = self.process_list4_kernel( actx, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype ) @@ -1114,7 +1135,7 @@ def process_list4(self, actx, traversal, p2l_cost): tree.box_source_counts_nonchild, tree.box_levels, p2l_cost, - queue=actx.queue + queue=actx.queue, ) return nm2p @@ -1123,34 +1144,40 @@ def process_list4(self, actx, traversal, p2l_cost): # {{{ evaluate local expansions at targets - @memoize_method - def process_eval_locals_knl(self, actx: PyOpenCLArrayContext, - box_id_dtype, particle_id_dtype, 
box_level_dtype): - return ElementwiseKernel( - actx.context, - Template(r""" - double *neval_locals, - ${box_id_t} *target_boxes, - ${particle_id_t} *box_target_counts_nonchild, - ${box_level_t} *box_levels, - double *l2p_cost - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - particle_id_t=dtype_to_ctype(particle_id_dtype), - box_level_t=dtype_to_ctype(box_level_dtype) - ), - Template(r""" - ${box_id_t} box_idx = target_boxes[i]; - ${particle_id_t} ntargets = box_target_counts_nonchild[box_idx]; - ${box_level_t} ilevel = box_levels[box_idx]; - neval_locals[i] = ntargets * l2p_cost[ilevel]; - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - particle_id_t=dtype_to_ctype(particle_id_dtype), - box_level_t=dtype_to_ctype(box_level_dtype) - ), - name="process_eval_locals" - ) + def process_eval_locals_kernel(self, actx: PyOpenCLArrayContext, + box_id_dtype, particle_id_dtype, box_level_dtype): + + @memoize_in(actx, ( + FMMCostModel.process_eval_locals_kernel, + box_id_dtype, particle_id_dtype, box_level_dtype)) + def get_kernel(): + return ElementwiseKernel( + actx.context, + Template(r""" + double *neval_locals, + ${box_id_t} *target_boxes, + ${particle_id_t} *box_target_counts_nonchild, + ${box_level_t} *box_levels, + double *l2p_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + Template(r""" + ${box_id_t} box_idx = target_boxes[i]; + ${particle_id_t} ntargets = box_target_counts_nonchild[box_idx]; + ${box_level_t} ilevel = box_levels[box_idx]; + neval_locals[i] = ntargets * l2p_cost[ilevel]; + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_level_t=dtype_to_ctype(box_level_dtype) + ), + name="process_eval_locals" + ) + + return get_kernel() def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost, box_target_counts_nonchild=None): @@ -1161,7 +1188,7 @@ def 
process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost, if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild - process_eval_locals_knl = self.process_eval_locals_knl( + process_eval_locals_knl = self.process_eval_locals_kernel( actx, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype ) @@ -1170,7 +1197,8 @@ def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost, traversal.target_boxes, box_target_counts_nonchild, tree.box_levels, - l2p_cost + l2p_cost, + queue=actx.queue, ) return neval_locals @@ -1179,32 +1207,36 @@ def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost, # {{{ propagate locals downward - @memoize_method - def process_refine_locals_knl(self, actx: PyOpenCLArrayContext, box_id_dtype): - from pyopencl.reduction import ReductionKernel - return ReductionKernel( - actx.context, - np.float64, - neutral="0.0", - reduce_expr="a+b", - map_expr=r""" - (level_start_target_or_target_parent_box_nrs[i + 1] - - level_start_target_or_target_parent_box_nrs[i]) - * l2l_cost[i - 1] - """, - arguments=Template(r""" - ${box_id_t} *level_start_target_or_target_parent_box_nrs, - double *l2l_cost - """).render( - box_id_t=dtype_to_ctype(box_id_dtype) - ), - name="process_refine_locals" - ) + def process_refine_locals_kernel(self, actx: PyOpenCLArrayContext, box_id_dtype): + @memoize_in(actx, ( + FMMCostModel.process_refine_locals_kernel, box_id_dtype)) + def get_kernel(): + from pyopencl.reduction import ReductionKernel + return ReductionKernel( + actx.context, + np.float64, + neutral="0.0", + reduce_expr="a+b", + map_expr=r""" + (level_start_target_or_target_parent_box_nrs[i + 1] + - level_start_target_or_target_parent_box_nrs[i]) + * l2l_cost[i - 1] + """, + arguments=Template(r""" + ${box_id_t} *level_start_target_or_target_parent_box_nrs, + double *l2l_cost + """).render( + box_id_t=dtype_to_ctype(box_id_dtype) + ), + 
name="process_refine_locals" + ) + + return get_kernel() def process_refine_locals(self, actx: PyOpenCLArrayContext, traversal, l2l_cost): tree = traversal.tree - process_refine_locals_knl = self.process_refine_locals_knl( + process_refine_locals_knl = self.process_refine_locals_kernel( actx, tree.box_id_dtype ) @@ -1215,7 +1247,9 @@ def process_refine_locals(self, actx: PyOpenCLArrayContext, cost = process_refine_locals_knl( level_start_target_or_target_parent_box_nrs, l2l_cost, - range=slice(1, tree.nlevels) + range=slice(1, tree.nlevels), + queue=actx.queue, + allocator=actx.allocator, ) return actx.to_numpy(cost).reshape(-1)[0] diff --git a/boxtree/distributed/__init__.py b/boxtree/distributed/__init__.py index a1f3606b..bc747d66 100644 --- a/boxtree/distributed/__init__.py +++ b/boxtree/distributed/__init__.py @@ -88,7 +88,7 @@ Distributed Wrangler -------------------- -.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWrangler +.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWranglerMixin .. _distributed-fmm-evaluation: @@ -97,23 +97,27 @@ The distributed version of the FMM evaluation shares the same interface as the shared-memory version. To evaluate FMM in a distributed manner, use a subclass -of :class:`boxtree.distributed.calculation.DistributedExpansionWrangler` in -:func:`boxtree.fmm.drive_fmm`. +of :class:`boxtree.distributed.calculation.DistributedExpansionWranglerMixin` +in :func:`boxtree.fmm.drive_fmm`. 
""" -from mpi4py import MPI -import numpy as np -import pyopencl as cl -import pyopencl.array -from enum import IntEnum +import enum import warnings + +import numpy as np +from mpi4py import MPI + from boxtree.cost import FMMCostModel +from boxtree.array_context import PyOpenCLArrayContext __all__ = ["DistributedFMMRunner"] -class MPITags(IntEnum): +# {{{ MPI + +@enum.unique +class MPITags(enum.IntEnum): DIST_WEIGHT = 1 GATHER_POTENTIALS = 2 REDUCE_POTENTIALS = 3 @@ -121,27 +125,36 @@ class MPITags(IntEnum): def dtype_to_mpi(dtype): - """ This function translates a numpy datatype into the corresponding type used in + """This function translates a numpy datatype into the corresponding type used in mpi4py. """ + if hasattr(MPI, "_typedict"): - mpi_type = MPI._typedict[np.dtype(dtype).char] + typedict = MPI._typedict elif hasattr(MPI, "__TypeDict__"): - mpi_type = MPI.__TypeDict__[np.dtype(dtype).char] + typedict = MPI.__TypeDict__ else: - raise RuntimeError("There is no dictionary to translate from Numpy dtype to " - "MPI type") + raise RuntimeError( + "There is no dictionary to translate from np.dtype to an MPI datatype") + + mpi_type = typedict.get(np.dtype(dtype).char, None) + if mpi_type is None: + raise ValueError(f"Could not convert '{dtype}' to an MPI datatype") + return mpi_type +# }}} + + +# {{{ DistributedFMMRunner def construct_distributed_wrangler( - queue, global_tree, traversal_builder, wrangler_factory, + actx: PyOpenCLArrayContext, global_tree, traversal_builder, wrangler_factory, calibration_params, comm): """Helper function for constructing the distributed wrangler on each rank. Note: This function needs to be called collectively on all ranks. """ - mpi_rank = comm.Get_rank() # `tree_in_device_memory` is True if the global tree is in the device memory @@ -152,7 +165,7 @@ def construct_distributed_wrangler( # worker ranks. 
tree_in_device_memory = None if mpi_rank == 0: - tree_in_device_memory = isinstance(global_tree.targets[0], cl.array.Array) + tree_in_device_memory = isinstance(global_tree.targets[0], actx.array_types) tree_in_device_memory = comm.bcast(tree_in_device_memory, root=0) # {{{ Broadcast the global tree @@ -160,7 +173,7 @@ def construct_distributed_wrangler( global_tree_host = None if mpi_rank == 0: if tree_in_device_memory: - global_tree_host = global_tree.get(queue) + global_tree_host = actx.to_numpy(global_tree) else: global_tree_host = global_tree @@ -170,11 +183,11 @@ def construct_distributed_wrangler( if mpi_rank == 0 and tree_in_device_memory: global_tree_dev = global_tree else: - global_tree_dev = global_tree_host.to_device(queue) - global_tree_dev = global_tree_dev.with_queue(queue) + global_tree_dev = actx.from_numpy(global_tree_host) + global_tree_dev = actx.thaw(global_tree_dev) - global_trav_dev, _ = traversal_builder(queue, global_tree_dev) - global_trav_host = global_trav_dev.get(queue) + global_trav_dev = traversal_builder(actx, global_tree_dev) + global_trav_host = actx.to_numpy(global_trav_dev) if tree_in_device_memory: global_trav = global_trav_dev @@ -196,16 +209,16 @@ def construct_distributed_wrangler( # accurate one warnings.warn("Calibration parameters for the cost model are not " "supplied. 
The default one will be used.") - calibration_params = \ - FMMCostModel.get_unit_calibration_params() + calibration_params = FMMCostModel.get_unit_calibration_params() # We need to construct a wrangler in order to access `level_orders` global_wrangler = wrangler_factory(global_trav, global_trav) cost_per_box = cost_model.cost_per_box( - queue, global_trav_dev, global_wrangler.level_orders, + actx, global_trav_dev, global_wrangler.level_orders, calibration_params - ).get() + ) + cost_per_box = actx.to_numpy(cost_per_box) from boxtree.distributed.partition import partition_work responsible_boxes_list = partition_work(cost_per_box, global_trav_host, comm) @@ -216,7 +229,7 @@ def construct_distributed_wrangler( from boxtree.distributed.local_tree import generate_local_tree local_tree, src_idx, tgt_idx = generate_local_tree( - queue, global_trav_host, responsible_boxes_list, comm) + actx, global_trav_dev, actx.from_numpy(responsible_boxes_list), comm) # }}} @@ -230,12 +243,12 @@ def construct_distributed_wrangler( # {{{ Compute traversal object on each rank from boxtree.distributed.local_traversal import generate_local_travs - local_trav_dev = generate_local_travs(queue, local_tree, traversal_builder) + local_trav_dev = generate_local_travs(actx, local_tree, traversal_builder) if not tree_in_device_memory: - local_trav = local_trav_dev.get(queue=queue) + local_trav = actx.to_numpy(local_trav_dev) else: - local_trav = local_trav_dev.with_queue(None) + local_trav = actx.freeze(local_trav_dev) # }}} @@ -250,7 +263,7 @@ class DistributedFMMRunner: .. automethod:: __init__ .. 
automethod:: drive_dfmm """ - def __init__(self, queue, global_tree, + def __init__(self, array_context: PyOpenCLArrayContext, global_tree, traversal_builder, wrangler_factory, calibration_params=None, comm=MPI.COMM_WORLD): @@ -273,15 +286,22 @@ def __init__(self, queue, global_tree, """ self.wrangler, self.src_idx_all_ranks, self.tgt_idx_all_ranks = \ construct_distributed_wrangler( - queue, global_tree, traversal_builder, wrangler_factory, + array_context, global_tree, traversal_builder, wrangler_factory, calibration_params, comm) - def drive_dfmm(self, source_weights, timing_data=None): - """Calculate potentials at target points. - """ + def drive_dfmm(self, + actx: PyOpenCLArrayContext, + source_weights, + timing_data=None): + """Calculate potentials at target points.""" from boxtree.fmm import drive_fmm return drive_fmm( + actx, self.wrangler, source_weights, timing_data=timing_data, global_src_idx_all_ranks=self.src_idx_all_ranks, global_tgt_idx_all_ranks=self.tgt_idx_all_ranks) + +# }}} + +# vim: fdm=marker diff --git a/boxtree/distributed/calculation.py b/boxtree/distributed/calculation.py index 0fcf3aec..bfd41c65 100644 --- a/boxtree/distributed/calculation.py +++ b/boxtree/distributed/calculation.py @@ -24,51 +24,97 @@ """ import numpy as np -import pyopencl as cl -from boxtree.distributed import MPITags from mpi4py import MPI -from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler -from boxtree.fmm import ExpansionWranglerInterface -from pytools import memoize_method + from pyopencl.tools import dtype_to_ctype from pyopencl.elementwise import ElementwiseKernel + +from pytools import memoize_method, memoize_on_first_arg from mako.template import Template +from boxtree.distributed import MPITags +from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler +from boxtree.array_context import PyOpenCLArrayContext + import logging logger = logging.getLogger(__name__) # {{{ Distributed FMM wrangler -class 
DistributedExpansionWrangler(ExpansionWranglerInterface): - """Distributed expansion wrangler base class. +@memoize_on_first_arg +def get_find_boxes_used_by_subrange_kernel( + actx: PyOpenCLArrayContext, + box_id_dtype: "np.dtype"): + return ElementwiseKernel( + actx.context, + Template(r""" + ${box_id_t} *contributing_boxes_list, + int subrange_start, + int subrange_end, + ${box_id_t} *box_to_user_rank_starts, + int *box_to_user_rank_lists, + char *box_in_subrange + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + ), + Template(r""" + ${box_id_t} ibox = contributing_boxes_list[i]; + ${box_id_t} iuser_start = box_to_user_rank_starts[ibox]; + ${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1]; + for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) { + int useri = box_to_user_rank_lists[iuser]; + if(subrange_start <= useri && useri < subrange_end) { + box_in_subrange[i] = 1; + } + } + """).render( # noqa: E501 + box_id_t=dtype_to_ctype(box_id_dtype) + ), + "find_boxes_used_by_subrange" + ) + - This is an abstract class and should not be directly instantiated. Instead, it is - expected that all distributed wranglers should be subclasses of this class. +class DistributedExpansionWranglerMixin: + """Distributed expansion wrangler helper class. + + This class is meant to aid in adding distributed capabilities to wranglers. + All distributed wranglers should inherit from this class + + .. attribute:: comm + .. attribute:: global_traversal + .. attribute:: communicate_mpoles_via_allreduce - .. automethod:: __init__ .. automethod:: distribute_source_weights .. automethod:: gather_potential_results ..
automethod:: communicate_mpoles """ - def __init__(self, context, comm, global_traversal, - traversal_in_device_memory, - communicate_mpoles_via_allreduce=False): - self.context = context - self.comm = comm - self.global_traversal = global_traversal - self.traversal_in_device_memory = traversal_in_device_memory - self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce - def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): - mpi_rank = self.comm.Get_rank() - mpi_size = self.comm.Get_size() + @property + def context(self): + return self._setup_actx.context + + @property + @memoize_method + def mpi_rank(self): + return self.comm.Get_rank() + + @property + @memoize_method + def mpi_size(self): + return self.comm.Get_size() - if mpi_rank == 0: + @property + def is_mpi_root(self): + return self.mpi_rank == 0 + + def distribute_source_weights(self, + actx: PyOpenCLArrayContext, src_weight_vecs, src_idx_all_ranks): + if self.is_mpi_root: distribute_weight_req = [] - local_src_weight_vecs = np.empty((mpi_size,), dtype=object) + local_src_weight_vecs = np.empty((self.mpi_size,), dtype=object) - for irank in range(mpi_size): + for irank in range(self.mpi_size): local_src_weight_vecs[irank] = [ source_weights[src_idx_all_ranks[irank]] for source_weights in src_weight_vecs] @@ -86,23 +132,20 @@ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): return local_src_weight_vecs - def gather_potential_results(self, potentials, tgt_idx_all_ranks): - mpi_rank = self.comm.Get_rank() - mpi_size = self.comm.Get_size() - + def gather_potential_results(self, + actx: PyOpenCLArrayContext, potentials, tgt_idx_all_ranks): from boxtree.distributed import dtype_to_mpi potentials_mpi_type = dtype_to_mpi(potentials.dtype) - gathered_potentials = None - if mpi_rank == 0: + if self.is_mpi_root: # The root rank received calculated potentials from all worker ranks - potentials_all_ranks = np.empty((mpi_size,), dtype=object) + 
potentials_all_ranks = np.empty((self.mpi_size,), dtype=object) potentials_all_ranks[0] = potentials recv_reqs = [] - for irank in range(1, mpi_size): + for irank in range(1, self.mpi_size): potentials_all_ranks[irank] = np.empty( tgt_idx_all_ranks[irank].shape, dtype=potentials.dtype) @@ -117,7 +160,7 @@ def gather_potential_results(self, potentials, tgt_idx_all_ranks): gathered_potentials = np.empty( self.global_traversal.tree.ntargets, dtype=potentials.dtype) - for irank in range(mpi_size): + for irank in range(self.mpi_size): gathered_potentials[tgt_idx_all_ranks[irank]] = ( potentials_all_ranks[irank]) else: @@ -131,8 +174,13 @@ def _slice_mpoles(self, mpoles, slice_indices): if len(slice_indices) == 0: return np.empty((0,), dtype=mpoles.dtype) + level_start_box_nrs = self.traversal.tree.level_start_box_nrs + if not isinstance(level_start_box_nrs, np.ndarray): + level_start_box_nrs = self._setup_actx.to_numpy( + level_start_box_nrs) + level_start_slice_indices = np.searchsorted( - slice_indices, self.traversal.tree.level_start_box_nrs) + slice_indices, level_start_box_nrs) mpoles_list = [] for ilevel in range(self.traversal.tree.nlevels): @@ -152,8 +200,13 @@ def _update_mpoles(self, mpoles, mpole_updates, slice_indices): if len(slice_indices) == 0: return + level_start_box_nrs = self.traversal.tree.level_start_box_nrs + if not isinstance(level_start_box_nrs, np.ndarray): + level_start_box_nrs = self._setup_actx.to_numpy( + level_start_box_nrs) + level_start_slice_indices = np.searchsorted( - slice_indices, self.traversal.tree.level_start_box_nrs) + slice_indices, level_start_box_nrs) mpole_updates_start = 0 for ilevel in range(self.traversal.tree.nlevels): @@ -174,60 +227,26 @@ def _update_mpoles(self, mpoles, mpole_updates, slice_indices): mpole_updates_start = mpole_updates_end - @memoize_method - def find_boxes_used_by_subrange_kernel(self, box_id_dtype): - return ElementwiseKernel( - self.context, - Template(r""" - ${box_id_t} *contributing_boxes_list, - 
int subrange_start, - int subrange_end, - ${box_id_t} *box_to_user_rank_starts, - int *box_to_user_rank_lists, - char *box_in_subrange - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - ), - Template(r""" - ${box_id_t} ibox = contributing_boxes_list[i]; - ${box_id_t} iuser_start = box_to_user_rank_starts[ibox]; - ${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1]; - for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) { - int useri = box_to_user_rank_lists[iuser]; - if(subrange_start <= useri && useri < subrange_end) { - box_in_subrange[i] = 1; - } - } - """).render( - box_id_t=dtype_to_ctype(box_id_dtype) - ), - "find_boxes_used_by_subrange" - ) - def find_boxes_used_by_subrange( - self, subrange, box_to_user_rank_starts, box_to_user_rank_lists, + self, actx: PyOpenCLArrayContext, + subrange, box_to_user_rank_starts, box_to_user_rank_lists, contributing_boxes_list): """Test whether the multipole expansions of the contributing boxes are used by at least one box in a range. :arg subrange: the range is represented by ``(subrange[0], subrange[1])``. - :arg box_to_user_rank_starts: a :class:`pyopencl.array.Array` object - indicating the start and end index in *box_to_user_rank_lists* for each + :arg box_to_user_rank_starts: an array object indicating the start and + end index in *box_to_user_rank_lists* for each box in + *contributing_boxes_list*. + :arg box_to_user_rank_lists: an array object storing the users of each box in *contributing_boxes_list*. - :arg box_to_user_rank_lists: a :class:`pyopencl.array.Array` object storing - the users of each box in *contributing_boxes_list*. - :returns: a :class:`pyopencl.array.Array` object with the same shape as - *contributing_boxes_list*, where the i-th entry is 1 if - ``contributing_boxes_list[i]`` is used by at least on box in the - subrange specified. 
+ :returns: an array object with the same shape as *contributing_boxes_list*, + where the i-th entry is 1 if ``contributing_boxes_list[i]`` is used + by at least on box in the subrange specified. """ - box_in_subrange = cl.array.zeros( - contributing_boxes_list.queue, - contributing_boxes_list.shape[0], - dtype=np.int8 - ) - knl = self.find_boxes_used_by_subrange_kernel( - self.traversal.tree.box_id_dtype) + box_in_subrange = actx.zeros(contributing_boxes_list.shape[0], dtype=np.int8) + knl = get_find_boxes_used_by_subrange_kernel( + actx, self.traversal.tree.box_id_dtype) knl( contributing_boxes_list, @@ -240,7 +259,8 @@ def find_boxes_used_by_subrange( return box_in_subrange - def communicate_mpoles(self, mpole_exps, return_stats=False): + def communicate_mpoles(self, + actx: PyOpenCLArrayContext, mpole_exps, return_stats=False): """Based on Algorithm 3: Reduce and Scatter in Lashuk et al. [1]_. The main idea is to mimic an allreduce as done on a hypercube network, but to @@ -249,12 +269,12 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): .. [1] Lashuk, Ilya, Aparna Chandramowlishwaran, Harper Langston, Tuan-Anh Nguyen, Rahul Sampath, Aashay Shringarpure, Richard Vuduc, - Lexing Ying, Denis Zorin, and George Biros. “A massively parallel - adaptive fast multipole method on heterogeneous architectures." - Communications of the ACM 55, no. 5 (2012): 101-109. + Lexing Ying, Denis Zorin, and George Biros. "A massively parallel + adaptive fast multipole method on heterogeneous architectures", + Communications of the ACM 55, no. 5 (2012): 101-109, + `DOI `__. """ - mpi_rank = self.comm.Get_rank() - mpi_size = self.comm.Get_size() + actx = self._setup_actx tree = self.traversal.tree if self.communicate_mpoles_via_allreduce: @@ -280,16 +300,15 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): # Initially, this set consists of the boxes satisfying condition (a), which # are precisely the boxes owned by this process and their ancestors. 
if self.traversal_in_device_memory: - with cl.CommandQueue(self.context) as queue: - contributing_boxes = tree.ancestor_mask.get(queue=queue) - responsible_boxes_list = tree.responsible_boxes_list.get(queue=queue) + contributing_boxes = actx.to_numpy(tree.ancestor_mask) + responsible_boxes_list = actx.to_numpy(tree.responsible_boxes_list) else: - contributing_boxes = tree.ancestor_mask.copy() + contributing_boxes = np.copy(tree.ancestor_mask) responsible_boxes_list = tree.responsible_boxes_list contributing_boxes[responsible_boxes_list] = 1 from boxtree.tools import AllReduceCommPattern - comm_pattern = AllReduceCommPattern(mpi_rank, mpi_size) + comm_pattern = AllReduceCommPattern(self.mpi_rank, self.mpi_size) # Temporary buffers for receiving data mpole_exps_buf = np.empty(mpole_exps.shape, dtype=mpole_exps.dtype) @@ -299,15 +318,13 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): stats["bytes_recvd_by_stage"] = [] if self.traversal_in_device_memory: - box_to_user_rank_starts_dev = \ - tree.box_to_user_rank_starts.with_queue(None) - box_to_user_rank_lists_dev = tree.box_to_user_rank_lists.with_queue(None) + box_to_user_rank_starts_dev = actx.freeze(tree.box_to_user_rank_starts) + box_to_user_rank_lists_dev = actx.freeze(tree.box_to_user_rank_lists) else: - with cl.CommandQueue(self.context) as queue: - box_to_user_rank_starts_dev = cl.array.to_device( - queue, tree.box_to_user_rank_starts).with_queue(None) - box_to_user_rank_lists_dev = cl.array.to_device( - queue, tree.box_to_user_rank_lists).with_queue(None) + box_to_user_rank_starts_dev = actx.freeze( + actx.from_numpy(tree.box_to_user_rank_starts)) + box_to_user_rank_lists_dev = actx.freeze( + actx.from_numpy(tree.box_to_user_rank_lists)) while not comm_pattern.done(): send_requests = [] @@ -321,18 +338,15 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): tree.box_id_dtype ) - with cl.CommandQueue(self.context) as queue: - contributing_boxes_list_dev = cl.array.to_device( - 
queue, contributing_boxes_list) - - box_in_subrange = self.find_boxes_used_by_subrange( - message_subrange, - box_to_user_rank_starts_dev, box_to_user_rank_lists_dev, - contributing_boxes_list_dev - ) - - box_in_subrange_host = box_in_subrange.get().astype(bool) + contributing_boxes_list_dev = actx.from_numpy( + contributing_boxes_list) + box_in_subrange = self.find_boxes_used_by_subrange( + actx, message_subrange, + box_to_user_rank_starts_dev, box_to_user_rank_lists_dev, + contributing_boxes_list_dev + ) + box_in_subrange_host = actx.to_numpy(box_in_subrange).astype(bool) relevant_boxes_list = contributing_boxes_list[ box_in_subrange_host ].astype(tree.box_id_dtype) @@ -381,7 +395,7 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): # Update data structures. self._update_mpoles( - mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes]) + mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes]) contributing_boxes[boxes_list_buf[:nboxes]] = 1 @@ -393,38 +407,41 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): if return_stats: return stats - def finalize_potentials(self, potentials, template_ary): - if self.comm.Get_rank() == 0: - return super().finalize_potentials(potentials, template_ary) - else: - return None - class DistributedFMMLibExpansionWrangler( - DistributedExpansionWrangler, FMMLibExpansionWrangler): + DistributedExpansionWranglerMixin, + FMMLibExpansionWrangler): def __init__( - self, context, comm, tree_indep, local_traversal, global_traversal, + self, array_context, comm, tree_indep, local_traversal, global_traversal, fmm_level_to_order=None, communicate_mpoles_via_allreduce=False, **kwargs): - DistributedExpansionWrangler.__init__( - self, context, comm, global_traversal, False, - communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce) FMMLibExpansionWrangler.__init__( self, tree_indep, local_traversal, fmm_level_to_order=fmm_level_to_order, **kwargs) - #TODO: use log_process like FMMLibExpansionWrangler? 
+ self._setup_actx = array_context + self.comm = comm + self.traversal_in_device_memory = False + self.global_traversal = global_traversal + self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce + def reorder_sources(self, source_array): - if self.comm.Get_rank() == 0: + if self.is_mpi_root: return source_array[..., self.global_traversal.tree.user_source_ids] else: return None def reorder_potentials(self, potentials): - if self.comm.Get_rank() == 0: + if self.is_mpi_root: return potentials[self.global_traversal.tree.sorted_target_ids] else: return None + def finalize_potentials(self, potentials, template_ary): + if self.is_mpi_root: + return super().finalize_potentials(potentials, template_ary) + else: + return None + # }}} diff --git a/boxtree/distributed/local_traversal.py b/boxtree/distributed/local_traversal.py index 60eb3689..4752ee37 100644 --- a/boxtree/distributed/local_traversal.py +++ b/boxtree/distributed/local_traversal.py @@ -28,34 +28,30 @@ def generate_local_travs( - queue, local_tree, traversal_builder, merge_close_lists=False): + actx, local_tree, traversal_builder, merge_close_lists=False): """Generate local traversal from local tree. - :arg queue: a :class:`pyopencl.CommandQueue` object. - :arg local_tree: the local tree of class - `boxtree.tools.ImmutableHostDeviceArray` on which the local traversal - object will be constructed. - :arg traversal_builder: a function, taken a :class:`pyopencl.CommandQueue` and - a tree, returns the traversal object based on the tree. + :arg local_tree: the local tree on which the local traversal object will + be constructed. + :arg traversal_builder: a function, taken a :class:`arraycontext.ArrayContext` + and a tree, returns the traversal object based on the tree. 
:return: generated local traversal object in device memory """ start_time = time.time() - local_tree.with_queue(queue) - # We need `source_boxes_mask` and `source_parent_boxes_mask` here to restrict the # multipole formation and upward propagation within the rank's responsible boxes # region. Had there not been such restrictions, some sources might be distributed # to more than 1 rank and counted multiple times. - local_trav, _ = traversal_builder( - queue, local_tree.to_device(queue), - source_boxes_mask=local_tree.responsible_boxes_mask.device, - source_parent_boxes_mask=local_tree.ancestor_mask.device + local_trav = traversal_builder( + actx, local_tree, + source_boxes_mask=local_tree.responsible_boxes_mask, + source_parent_boxes_mask=local_tree.ancestor_mask ) if merge_close_lists and local_tree.targets_have_extent: - local_trav = local_trav.merge_close_lists(queue) + local_trav = local_trav.merge_close_lists(actx) logger.info("Generate local traversal in {} sec.".format( str(time.time() - start_time)) diff --git a/boxtree/distributed/local_tree.py b/boxtree/distributed/local_tree.py index 32e878ba..5ce6cfd5 100644 --- a/boxtree/distributed/local_tree.py +++ b/boxtree/distributed/local_tree.py @@ -21,15 +21,21 @@ THE SOFTWARE. 
""" -from boxtree import Tree -from mako.template import Template -from pyopencl.tools import dtype_to_ctype -from pytools import memoize_method -import numpy as np -import pyopencl as cl +import time from dataclasses import dataclass from typing import Optional -import time + +import numpy as np + +from pyopencl.tools import dtype_to_ctype +from pyopencl.elementwise import ElementwiseKernel + +from arraycontext import Array, ArrayOrContainer +from pytools import memoize_on_first_arg +from mako.template import Template + +from boxtree import Tree +from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container import logging logger = logging.getLogger(__name__) @@ -40,154 +46,156 @@ # We should refactor this to make use of this commonality. # https://documen.tician.de/boxtree/tree.html#filtering-the-lists-of-targets - -class LocalTreeGeneratorCodeContainer: - """Objects of this type serve as a place to keep the code needed for - :func:`generate_local_tree`. - """ - def __init__(self, cl_context, dimensions, particle_id_dtype, coord_dtype): - self.cl_context = cl_context - self.dimensions = dimensions - self.particle_id_dtype = particle_id_dtype - self.coord_dtype = coord_dtype - - @memoize_method - def particle_mask_kernel(self): - return cl.elementwise.ElementwiseKernel( - self.cl_context, - arguments=Template(""" - __global char *responsible_boxes, - __global ${particle_id_t} *box_particle_starts, - __global ${particle_id_t} *box_particle_counts_nonchild, - __global ${particle_id_t} *particle_mask - """, strict_undefined=True).render( - particle_id_t=dtype_to_ctype(self.particle_id_dtype) - ), - operation=Template(""" - if(responsible_boxes[i]) { - for(${particle_id_t} pid = box_particle_starts[i]; - pid < box_particle_starts[i] - + box_particle_counts_nonchild[i]; - ++pid) { - particle_mask[pid] = 1; - } +# {{{ kernels + +FETCH_LOCAL_PARTICLES_ARGUMENTS_TPL = Template(""" + __global const ${mask_t} *particle_mask, + __global const ${mask_t} 
*particle_scan + % for dim in range(ndims): + , __global const ${coord_t} *particles_${dim} + % endfor + % for dim in range(ndims): + , __global ${coord_t} *local_particles_${dim} + % endfor + % if particles_have_extent: + , __global const ${coord_t} *particle_radii + , __global ${coord_t} *local_particle_radii + % endif +""", strict_undefined=True) + +FETCH_LOCAL_PARTICLES_PRG_TPL = Template(""" + if(particle_mask[i]) { + ${particle_id_t} des = particle_scan[i]; + % for dim in range(ndims): + local_particles_${dim}[des] = particles_${dim}[i]; + % endfor + % if particles_have_extent: + local_particle_radii[des] = particle_radii[i]; + % endif + } +""", strict_undefined=True) + + +@memoize_on_first_arg +def get_particle_mask_kernel( + actx: PyOpenCLArrayContext, + particle_id_dtype: "np.dtype"): + return ElementwiseKernel( + actx.context, + arguments=Template(""" + __global char *responsible_boxes, + __global ${particle_id_t} *box_particle_starts, + __global ${particle_id_t} *box_particle_counts_nonchild, + __global ${particle_id_t} *particle_mask + """, strict_undefined=True).render( + particle_id_t=dtype_to_ctype(particle_id_dtype) + ), + operation=Template(""" + if(responsible_boxes[i]) { + for(${particle_id_t} pid = box_particle_starts[i]; + pid < box_particle_starts[i] + + box_particle_counts_nonchild[i]; + ++pid) { + particle_mask[pid] = 1; } - """).render(particle_id_t=dtype_to_ctype(self.particle_id_dtype)) + } + """).render(particle_id_t=dtype_to_ctype(particle_id_dtype)) ) - @memoize_method - def mask_scan_kernel(self): - from pyopencl.scan import GenericScanKernel - return GenericScanKernel( - self.cl_context, self.particle_id_dtype, - arguments=Template(""" - __global ${mask_t} *ary, - __global ${mask_t} *scan - """, strict_undefined=True).render( - mask_t=dtype_to_ctype(self.particle_id_dtype) - ), - input_expr="ary[i]", - scan_expr="a+b", neutral="0", - output_statement="scan[i + 1] = item;" + +@memoize_on_first_arg +def get_mask_scan_kernel( + actx: 
PyOpenCLArrayContext, + particle_id_dtype: "np.dtype"): + from pyopencl.scan import GenericScanKernel + return GenericScanKernel( + actx.context, particle_id_dtype, + arguments=Template(""" + __global ${mask_t} *ary, + __global ${mask_t} *scan + """, strict_undefined=True).render( + mask_t=dtype_to_ctype(particle_id_dtype) + ), + input_expr="ary[i]", + scan_expr="a+b", neutral="0", + output_statement="scan[i + 1] = item;" ) - fetch_local_paticles_arguments = Template(""" - __global const ${mask_t} *particle_mask, - __global const ${mask_t} *particle_scan - % for dim in range(ndims): - , __global const ${coord_t} *particles_${dim} - % endfor - % for dim in range(ndims): - , __global ${coord_t} *local_particles_${dim} - % endfor - % if particles_have_extent: - , __global const ${coord_t} *particle_radii - , __global ${coord_t} *local_particle_radii - % endif - """, strict_undefined=True) - - fetch_local_particles_prg = Template(""" - if(particle_mask[i]) { - ${particle_id_t} des = particle_scan[i]; - % for dim in range(ndims): - local_particles_${dim}[des] = particles_${dim}[i]; - % endfor - % if particles_have_extent: - local_particle_radii[des] = particle_radii[i]; - % endif - } - """, strict_undefined=True) - - @memoize_method - def fetch_local_particles_kernel(self, particles_have_extent): - return cl.elementwise.ElementwiseKernel( - self.cl_context, - self.fetch_local_paticles_arguments.render( - mask_t=dtype_to_ctype(self.particle_id_dtype), - coord_t=dtype_to_ctype(self.coord_dtype), - ndims=self.dimensions, - particles_have_extent=particles_have_extent - ), - self.fetch_local_particles_prg.render( - particle_id_t=dtype_to_ctype(self.particle_id_dtype), - ndims=self.dimensions, - particles_have_extent=particles_have_extent - ) + +@memoize_on_first_arg +def get_fetch_local_particles_kernel( + actx: PyOpenCLArrayContext, + dimensions: int, + particle_id_dtype: "np.dtype", + coord_dtype: "np.dtype", + particles_have_extent: bool): + return ElementwiseKernel( + 
actx.context, + FETCH_LOCAL_PARTICLES_ARGUMENTS_TPL.render( + mask_t=dtype_to_ctype(particle_id_dtype), + coord_t=dtype_to_ctype(coord_dtype), + ndims=dimensions, + particles_have_extent=particles_have_extent + ), + FETCH_LOCAL_PARTICLES_PRG_TPL.render( + particle_id_t=dtype_to_ctype(particle_id_dtype), + ndims=dimensions, + particles_have_extent=particles_have_extent ) + ) - @memoize_method - def mask_compressor_kernel(self): - from boxtree.tools import MaskCompressorKernel - return MaskCompressorKernel(self.cl_context) - - @memoize_method - def modify_target_flags_kernel(self): - from boxtree import box_flags_enum - box_flag_t = dtype_to_ctype(box_flags_enum.dtype) - - return cl.elementwise.ElementwiseKernel( - self.cl_context, - Template(""" - __global ${particle_id_t} *box_target_counts_nonchild, - __global ${particle_id_t} *box_target_counts_cumul, - __global ${box_flag_t} *box_flags - """).render( - particle_id_t=dtype_to_ctype(self.particle_id_dtype), - box_flag_t=box_flag_t + +@memoize_on_first_arg +def get_modify_target_flags_kernel( + actx: PyOpenCLArrayContext, + particle_id_dtype: "np.dtype"): + from boxtree import box_flags_enum + box_flag_t = dtype_to_ctype(box_flags_enum.dtype) + + return ElementwiseKernel( + actx.context, + Template(""" + __global ${particle_id_t} *box_target_counts_nonchild, + __global ${particle_id_t} *box_target_counts_cumul, + __global ${box_flag_t} *box_flags + """).render( + particle_id_t=dtype_to_ctype(particle_id_dtype), + box_flag_t=box_flag_t + ), + Template(r""" + // reset HAS_OWN_TARGETS and HAS_CHILD_TARGETS bits in the flag of + // each box + box_flags[i] &= (~${HAS_OWN_TARGETS}); + box_flags[i] &= (~${HAS_CHILD_TARGETS}); + + // rebuild HAS_OWN_TARGETS and HAS_CHILD_TARGETS bits + if(box_target_counts_nonchild[i]) box_flags[i] |= ${HAS_OWN_TARGETS}; + if(box_target_counts_nonchild[i] < box_target_counts_cumul[i]) + box_flags[i] |= ${HAS_CHILD_TARGETS}; + """).render( + HAS_OWN_TARGETS=( + "(" + box_flag_t + ") " + 
str(box_flags_enum.HAS_OWN_TARGETS) ), - Template(r""" - // reset HAS_OWN_TARGETS and HAS_CHILD_TARGETS bits in the flag of - // each box - box_flags[i] &= (~${HAS_OWN_TARGETS}); - box_flags[i] &= (~${HAS_CHILD_TARGETS}); - - // rebuild HAS_OWN_TARGETS and HAS_CHILD_TARGETS bits - if(box_target_counts_nonchild[i]) box_flags[i] |= ${HAS_OWN_TARGETS}; - if(box_target_counts_nonchild[i] < box_target_counts_cumul[i]) - box_flags[i] |= ${HAS_CHILD_TARGETS}; - """).render( - HAS_OWN_TARGETS=( - "(" + box_flag_t + ") " + str(box_flags_enum.HAS_OWN_TARGETS) - ), - HAS_CHILD_TARGETS=( - "(" + box_flag_t + ") " + str(box_flags_enum.HAS_CHILD_TARGETS) - ) + HAS_CHILD_TARGETS=( + "(" + box_flag_t + ") " + str(box_flags_enum.HAS_CHILD_TARGETS) ) ) + ) -@dataclass +@dataclass(frozen=True) class LocalParticlesAndLists: - particles: np.ndarray - particle_radii: Optional[cl.array.Array] - box_particle_starts: cl.array.Array - box_particle_counts_nonchild: cl.array.Array - box_particle_counts_cumul: cl.array.Array + particles: ArrayOrContainer + particle_radii: Optional[Array] + box_particle_starts: Array + box_particle_counts_nonchild: Array + box_particle_counts_cumul: Array particle_idx: np.ndarray def construct_local_particles_and_lists( - queue, code, dimensions, num_boxes, num_global_particles, + actx: PyOpenCLArrayContext, + dimensions, num_boxes, num_global_particles, particle_id_dtype, coord_dtype, particles_have_extent, box_mask, global_particles, global_particle_radii, @@ -198,59 +206,74 @@ def construct_local_particles_and_lists( """ # {{{ calculate the particle mask - particle_mask = cl.array.zeros( - queue, num_global_particles, dtype=particle_id_dtype) - - code.particle_mask_kernel()( - box_mask, box_particle_starts, box_particle_counts_nonchild, particle_mask) + particle_mask = actx.zeros(num_global_particles, dtype=particle_id_dtype) + knl = get_particle_mask_kernel(actx, particle_id_dtype) + knl(box_mask, + box_particle_starts, + box_particle_counts_nonchild, + 
particle_mask, + queue=actx.queue, + ) # }}} # {{{ calculate the scan of the particle mask - global_to_local_particle_index = cl.array.empty( - queue, num_global_particles + 1, dtype=particle_id_dtype) + global_to_local_particle_index = actx.empty( + num_global_particles + 1, dtype=particle_id_dtype) global_to_local_particle_index[0] = 0 - code.mask_scan_kernel()(particle_mask, global_to_local_particle_index) + knl = get_mask_scan_kernel(actx, particle_id_dtype) + knl(particle_mask, global_to_local_particle_index, + queue=actx.queue, + allocator=actx.allocator, + ) # }}} # {{{ fetch the local particles - num_local_particles = global_to_local_particle_index[-1].get(queue).item() - - local_particles = [ - cl.array.empty(queue, num_local_particles, dtype=coord_dtype) - for _ in range(dimensions)] + from pytools.obj_array import make_obj_array + num_local_particles = actx.to_numpy(global_to_local_particle_index[-1]).item() + local_particles = make_obj_array([ + actx.zeros(num_local_particles, coord_dtype) + for _ in range(dimensions) + ]) from pytools.obj_array import make_obj_array local_particles = make_obj_array(local_particles) - local_particle_radii = None - if particles_have_extent: - local_particle_radii = cl.array.empty( - queue, num_local_particles, dtype=coord_dtype) + knl = get_fetch_local_particles_kernel( + actx, dimensions, particle_id_dtype, coord_dtype, + particles_have_extent=particles_have_extent, + ) - code.fetch_local_particles_kernel(True)( + if particles_have_extent: + local_particle_radii = actx.empty(num_local_particles, dtype=coord_dtype) + knl( particle_mask, global_to_local_particle_index, *global_particles.tolist(), *local_particles, global_particle_radii, - local_particle_radii) + local_particle_radii, + queue=actx.queue, + ) else: - code.fetch_local_particles_kernel(False)( + local_particle_radii = None + knl( particle_mask, global_to_local_particle_index, *global_particles.tolist(), - *local_particles) + *local_particles, + 
queue=actx.queue, + ) # {{{ construct the list of list indices local_box_particle_starts = global_to_local_particle_index[box_particle_starts] - box_counts_all_zeros = cl.array.zeros(queue, num_boxes, dtype=particle_id_dtype) + box_counts_all_zeros = actx.zeros(num_boxes, dtype=particle_id_dtype) - local_box_particle_counts_nonchild = cl.array.if_positive( + local_box_particle_counts_nonchild = actx.np.where( box_mask, box_particle_counts_nonchild, box_counts_all_zeros) box_particle_ends_cumul = box_particle_starts + box_particle_counts_cumul @@ -261,18 +284,20 @@ def construct_local_particles_and_lists( # }}} - particle_mask = particle_mask.get(queue=queue).astype(bool) + particle_mask = actx.to_numpy(particle_mask).astype(bool) particle_idx = np.arange(num_global_particles)[particle_mask] return LocalParticlesAndLists( - local_particles, - local_particle_radii, - local_box_particle_starts, - local_box_particle_counts_nonchild, - local_box_particle_counts_cumul, - particle_idx) + particles=local_particles, + particle_radii=local_particle_radii, + box_particle_starts=local_box_particle_starts, + box_particle_counts_nonchild=local_box_particle_counts_nonchild, + box_particle_counts_cumul=local_box_particle_counts_cumul, + particle_idx=particle_idx) +@dataclass_array_container +@dataclass(frozen=True) class LocalTree(Tree): """ Inherits from :class:`boxtree.Tree`. @@ -291,13 +316,22 @@ class LocalTree(Tree): propagated from an ancestor) List 2. """ + box_to_user_rank_starts: Array + box_to_user_rank_lists: Array + + responsible_boxes_list: Array + responsible_boxes_mask: Array + ancestor_mask: Array -def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): + +def generate_local_tree( + actx: PyOpenCLArrayContext, + global_traversal, responsible_boxes_list, comm, + root_extent_stretch_factor: float = 1.0e-4) -> LocalTree: """Generate the local tree for the current rank. This is an MPI-collective routine on *comm*. 
- :arg queue: a :class:`pyopencl.CommandQueue` object. :arg global_traversal: Global :class:`boxtree.traversal.FMMTraversalInfo` object on host memory. :arg responsible_boxes_list: a :class:`numpy.ndarray` object containing the @@ -310,10 +344,7 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): global tree. ``src_idx`` and ``tgt_idx`` are needed for distributing source weights from root rank and assembling calculated potentials on the root rank. """ - global_tree = global_traversal.tree - code = LocalTreeGeneratorCodeContainer( - queue.context, global_tree.dimensions, - global_tree.particle_id_dtype, global_tree.coord_dtype) + global_tree = actx.thaw(global_traversal.tree) mpi_rank = comm.Get_rank() mpi_size = comm.Get_size() @@ -321,33 +352,31 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): start_time = time.time() from boxtree.distributed.partition import get_box_masks - box_masks = get_box_masks(queue, global_traversal, responsible_boxes_list) - - global_tree_dev = global_tree.to_device(queue).with_queue(queue) + box_masks = get_box_masks(actx, global_traversal, responsible_boxes_list) local_sources_and_lists = construct_local_particles_and_lists( - queue, code, global_tree.dimensions, global_tree.nboxes, + actx, global_tree.dimensions, global_tree.nboxes, global_tree.nsources, global_tree.particle_id_dtype, global_tree.coord_dtype, global_tree.sources_have_extent, box_masks.point_src_boxes, - global_tree_dev.sources, - global_tree_dev.sources_radii if global_tree.sources_have_extent else None, - global_tree_dev.box_source_starts, - global_tree_dev.box_source_counts_nonchild, - global_tree_dev.box_source_counts_cumul) + global_tree.sources, + global_tree.sources_radii if global_tree.sources_have_extent else None, + global_tree.box_source_starts, + global_tree.box_source_counts_nonchild, + global_tree.box_source_counts_cumul) local_targets_and_lists = construct_local_particles_and_lists( - queue, 
code, global_tree.dimensions, global_tree.nboxes, + actx, global_tree.dimensions, global_tree.nboxes, global_tree.ntargets, global_tree.particle_id_dtype, global_tree.coord_dtype, global_tree.targets_have_extent, box_masks.responsible_boxes, - global_tree_dev.targets, - global_tree_dev.target_radii if global_tree.targets_have_extent else None, - global_tree_dev.box_target_starts, - global_tree_dev.box_target_counts_nonchild, - global_tree_dev.box_target_counts_cumul) + global_tree.targets, + global_tree.target_radii if global_tree.targets_have_extent else None, + global_tree.box_target_starts, + global_tree.box_target_counts_nonchild, + global_tree.box_target_counts_cumul) # {{{ compute the users of multipole expansions of each box on the root rank @@ -357,24 +386,24 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): (mpi_size, global_tree.nboxes), dtype=box_masks.multipole_src_boxes.dtype) comm.Gather( - box_masks.multipole_src_boxes.get(), multipole_src_boxes_all_ranks, root=0) + actx.to_numpy(box_masks.multipole_src_boxes), + multipole_src_boxes_all_ranks, root=0) box_to_user_rank_starts = None box_to_user_rank_lists = None if mpi_rank == 0: - multipole_src_boxes_all_ranks = cl.array.to_device( - queue, multipole_src_boxes_all_ranks) + multipole_src_boxes_all_ranks = actx.from_numpy( + multipole_src_boxes_all_ranks) - (box_to_user_rank_starts, box_to_user_rank_lists, evt) = \ - code.mask_compressor_kernel()( - queue, multipole_src_boxes_all_ranks.transpose(), - list_dtype=np.int32) + from boxtree.tools import mask_to_csr + (box_to_user_rank_starts, box_to_user_rank_lists) = ( + mask_to_csr( + actx, multipole_src_boxes_all_ranks.transpose(), + list_dtype=np.int32)) - cl.wait_for_events([evt]) - - box_to_user_rank_starts = box_to_user_rank_starts.get() - box_to_user_rank_lists = box_to_user_rank_lists.get() + box_to_user_rank_starts = actx.to_numpy(box_to_user_rank_starts) + box_to_user_rank_lists = 
actx.to_numpy(box_to_user_rank_lists) logger.debug("computing box_to_user: done") @@ -391,22 +420,17 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): # expansions formed by souces in other ranks. Modifying the source box flags # could result in incomplete interaction lists. - local_box_flags = global_tree_dev.box_flags.copy(queue=queue) - code.modify_target_flags_kernel()( + local_box_flags = actx.np.copy(global_tree.box_flags) + knl = get_modify_target_flags_kernel(actx, global_tree.particle_id_dtype) + knl( local_targets_and_lists.box_particle_counts_nonchild, local_targets_and_lists.box_particle_counts_cumul, - local_box_flags) + local_box_flags, + queue=actx.queue, + ) # }}} - from pytools.obj_array import make_obj_array - local_sources = make_obj_array([ - local_sources_idim.get(queue=queue) - for local_sources_idim in local_sources_and_lists.particles]) - local_targets = make_obj_array([ - local_target_idim.get(queue=queue) - for local_target_idim in local_targets_and_lists.particles]) - local_tree = LocalTree( sources_are_targets=global_tree.sources_are_targets, sources_have_extent=global_tree.sources_have_extent, @@ -423,33 +447,34 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): bounding_box=global_tree.bounding_box, level_start_box_nrs=global_tree.level_start_box_nrs, - level_start_box_nrs_dev=global_tree.level_start_box_nrs_dev, - sources=local_sources, - targets=local_targets, - source_radii=(local_sources_and_lists.particle_radii.get(queue=queue) + sources=local_sources_and_lists.particles, + targets=local_targets_and_lists.particles, + source_radii=( + local_sources_and_lists.particle_radii if global_tree.sources_have_extent else None), - target_radii=(local_targets_and_lists.particle_radii.get(queue=queue) + target_radii=( + local_targets_and_lists.particle_radii if global_tree.targets_have_extent else None), box_source_starts=( - 
local_sources_and_lists.box_particle_starts.get(queue=queue)), + local_sources_and_lists.box_particle_starts), box_source_counts_nonchild=( - local_sources_and_lists.box_particle_counts_nonchild.get(queue=queue)), + local_sources_and_lists.box_particle_counts_nonchild), box_source_counts_cumul=( - local_sources_and_lists.box_particle_counts_cumul.get(queue=queue)), + local_sources_and_lists.box_particle_counts_cumul), box_target_starts=( - local_targets_and_lists.box_particle_starts.get(queue=queue)), + local_targets_and_lists.box_particle_starts), box_target_counts_nonchild=( - local_targets_and_lists.box_particle_counts_nonchild.get(queue=queue)), + local_targets_and_lists.box_particle_counts_nonchild), box_target_counts_cumul=( - local_targets_and_lists.box_particle_counts_cumul.get(queue=queue)), + local_targets_and_lists.box_particle_counts_cumul), box_parent_ids=global_tree.box_parent_ids, box_child_ids=global_tree.box_child_ids, box_centers=global_tree.box_centers, box_levels=global_tree.box_levels, - box_flags=local_box_flags.get(queue=queue), + box_flags=local_box_flags, user_source_ids=None, sorted_target_ids=None, @@ -459,23 +484,21 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): box_target_bounding_box_min=global_tree.box_target_bounding_box_min, box_target_bounding_box_max=global_tree.box_target_bounding_box_max, + root_extent_stretch_factor=root_extent_stretch_factor, _is_pruned=global_tree._is_pruned, responsible_boxes_list=responsible_boxes_list, - responsible_boxes_mask=box_masks.responsible_boxes.get(), - ancestor_mask=box_masks.ancestor_boxes.get(), - box_to_user_rank_starts=box_to_user_rank_starts, - box_to_user_rank_lists=box_to_user_rank_lists + responsible_boxes_mask=box_masks.responsible_boxes, + ancestor_mask=box_masks.ancestor_boxes, + box_to_user_rank_starts=actx.from_numpy(box_to_user_rank_starts), + box_to_user_rank_lists=actx.from_numpy(box_to_user_rank_lists), ) - local_tree = 
local_tree.to_host_device_array(queue) - local_tree.with_queue(None) - logger.info("Generate local tree on rank {} in {} sec.".format( mpi_rank, str(time.time() - start_time) )) return ( - local_tree, + actx.freeze(local_tree), local_sources_and_lists.particle_idx, local_targets_and_lists.particle_idx) diff --git a/boxtree/distributed/partition.py b/boxtree/distributed/partition.py index a3b2b799..569dbe10 100644 --- a/boxtree/distributed/partition.py +++ b/boxtree/distributed/partition.py @@ -21,14 +21,71 @@ THE SOFTWARE. """ +from dataclasses import dataclass + import numpy as np -import pyopencl as cl + from pyopencl.tools import dtype_to_ctype +from pyopencl.elementwise import ElementwiseKernel + +from arraycontext import Array +from pytools import memoize_on_first_arg from mako.template import Template -from pytools import memoize_method -from dataclasses import dataclass + +from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container + + +# {{{ kernels + +@memoize_on_first_arg +def get_add_interaction_list_boxes_kernel( + actx: PyOpenCLArrayContext, + box_id_dtype: "np.dtype"): + """Given a ``responsible_boxes_mask`` and an interaction list, mark source + boxes for target boxes in ``responsible_boxes_mask`` in a new separate mask. 
+ """ + return ElementwiseKernel( + actx.context, + Template(""" + __global ${box_id_t} *box_list, + __global char *responsible_boxes_mask, + __global ${box_id_t} *interaction_boxes_starts, + __global ${box_id_t} *interaction_boxes_lists, + __global char *src_boxes_mask + """, strict_undefined=True).render( + box_id_t=dtype_to_ctype(box_id_dtype) + ), + Template(r""" + typedef ${box_id_t} box_id_t; + box_id_t current_box = box_list[i]; + if(responsible_boxes_mask[current_box]) { + for(box_id_t box_idx = interaction_boxes_starts[i]; + box_idx < interaction_boxes_starts[i + 1]; + ++box_idx) + src_boxes_mask[interaction_boxes_lists[box_idx]] = 1; + } + """, strict_undefined=True).render( + box_id_t=dtype_to_ctype(box_id_dtype) + ), + ) + + +@memoize_on_first_arg +def get_add_parent_boxes_kernel( + actx: PyOpenCLArrayContext, + box_id_dtype: "np.dtype"): + return ElementwiseKernel( + actx.context, + "__global char *current, __global char *parent, " + "__global %s *box_parent_ids" % dtype_to_ctype(box_id_dtype), + "if(i != 0 && current[i]) parent[box_parent_ids[i]] = 1" + ) + +# }}} +# {{{ get_box_masks + def get_box_ids_dfs_order(tree): """Helper function for getting box ids of a tree in depth-first order. @@ -115,68 +172,23 @@ def partition_work(cost_per_box, traversal, comm): responsible_boxes_current_rank[0]:responsible_boxes_current_rank[1]] -class GetBoxMasksCodeContainer: - def __init__(self, cl_context, box_id_dtype): - self.cl_context = cl_context - self.box_id_dtype = box_id_dtype - - @memoize_method - def add_interaction_list_boxes_kernel(self): - """Given a ``responsible_boxes_mask`` and an interaction list, mark source - boxes for target boxes in ``responsible_boxes_mask`` in a new separate mask. 
- """ - return cl.elementwise.ElementwiseKernel( - self.cl_context, - Template(""" - __global ${box_id_t} *box_list, - __global char *responsible_boxes_mask, - __global ${box_id_t} *interaction_boxes_starts, - __global ${box_id_t} *interaction_boxes_lists, - __global char *src_boxes_mask - """, strict_undefined=True).render( - box_id_t=dtype_to_ctype(self.box_id_dtype) - ), - Template(r""" - typedef ${box_id_t} box_id_t; - box_id_t current_box = box_list[i]; - if(responsible_boxes_mask[current_box]) { - for(box_id_t box_idx = interaction_boxes_starts[i]; - box_idx < interaction_boxes_starts[i + 1]; - ++box_idx) - src_boxes_mask[interaction_boxes_lists[box_idx]] = 1; - } - """, strict_undefined=True).render( - box_id_t=dtype_to_ctype(self.box_id_dtype) - ), - ) - - @memoize_method - def add_parent_boxes_kernel(self): - return cl.elementwise.ElementwiseKernel( - self.cl_context, - "__global char *current, __global char *parent, " - "__global %s *box_parent_ids" % dtype_to_ctype(self.box_id_dtype), - "if(i != 0 && current[i]) parent[box_parent_ids[i]] = 1" - ) - - -def get_ancestor_boxes_mask(queue, code, traversal, responsible_boxes_mask): +def get_ancestor_boxes_mask(actx, traversal, responsible_boxes_mask): """Query the ancestors of responsible boxes. - :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape - ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box. - :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose - i-th entry is 1 if ``i`` is an ancestor of the responsible boxes specified by + :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose + i-th entry is 1 if ``i`` is a responsible box. + :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` + is an ancestor of the responsible boxes specified by *responsible_boxes_mask*. 
""" - ancestor_boxes = cl.array.zeros(queue, (traversal.tree.nboxes,), dtype=np.int8) + knl = get_add_parent_boxes_kernel(actx, traversal.tree.box_id_dtype) + + ancestor_boxes = actx.zeros((traversal.tree.nboxes,), dtype=np.int8) ancestor_boxes_last = responsible_boxes_mask.copy() while ancestor_boxes_last.any(): - ancestor_boxes_new = cl.array.zeros( - queue, (traversal.tree.nboxes,), dtype=np.int8) - code.add_parent_boxes_kernel()( - ancestor_boxes_last, ancestor_boxes_new, traversal.tree.box_parent_ids) + ancestor_boxes_new = actx.zeros((traversal.tree.nboxes,), dtype=np.int8) + knl(ancestor_boxes_last, ancestor_boxes_new, traversal.tree.box_parent_ids) ancestor_boxes_new = ancestor_boxes_new & (~ancestor_boxes) ancestor_boxes = ancestor_boxes | ancestor_boxes_new ancestor_boxes_last = ancestor_boxes_new @@ -185,115 +197,111 @@ def get_ancestor_boxes_mask(queue, code, traversal, responsible_boxes_mask): def get_point_src_boxes_mask( - queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask): + actx, traversal, responsible_boxes_mask, ancestor_boxes_mask): """Query the boxes whose sources are needed in order to evaluate potentials of boxes represented by *responsible_boxes_mask*. - :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape - ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box. - :param ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape - ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box - or an ancestor of the responsible boxes. - :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose - i-th entry is 1 if souces of box ``i`` are needed for evaluating the - potentials of targets in boxes represented by *responsible_boxes_mask*. + :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose + i-th entry is 1 if ``i`` is a responsible box. 
+ :param ancestor_boxes_mask: an array of shape ``(tree.nboxes,)`` whose + i-th entry is 1 if ``i`` is either a responsible box or an ancestor + of the responsible boxes. + :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if + sources of box ``i`` are needed for evaluating the potentials of targets + in boxes represented by *responsible_boxes_mask*. """ - + knl = get_add_interaction_list_boxes_kernel(actx, traversal.tree.box_id_dtype) src_boxes_mask = responsible_boxes_mask.copy() # Add list 1 of responsible boxes - code.add_interaction_list_boxes_kernel()( + knl( traversal.target_boxes, responsible_boxes_mask, traversal.neighbor_source_boxes_starts, traversal.neighbor_source_boxes_lists, src_boxes_mask, - queue=queue) + queue=actx.queue) # Add list 4 of responsible boxes or ancestor boxes - code.add_interaction_list_boxes_kernel()( + knl( traversal.target_or_target_parent_boxes, responsible_boxes_mask | ancestor_boxes_mask, traversal.from_sep_bigger_starts, traversal.from_sep_bigger_lists, src_boxes_mask, - queue=queue) + queue=actx.queue) if traversal.tree.targets_have_extent: # Add list 3 close of responsible boxes if traversal.from_sep_close_smaller_starts is not None: - code.add_interaction_list_boxes_kernel()( + knl( traversal.target_boxes, responsible_boxes_mask, traversal.from_sep_close_smaller_starts, traversal.from_sep_close_smaller_lists, src_boxes_mask, - queue=queue + queue=actx.queue ) # Add list 4 close of responsible boxes if traversal.from_sep_close_bigger_starts is not None: - code.add_interaction_list_boxes_kernel()( + knl( traversal.target_boxes, responsible_boxes_mask | ancestor_boxes_mask, traversal.from_sep_close_bigger_starts, traversal.from_sep_close_bigger_lists, src_boxes_mask, - queue=queue + queue=actx.queue ) return src_boxes_mask def get_multipole_src_boxes_mask( - queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask): + actx, traversal, responsible_boxes_mask, ancestor_boxes_mask): """Query the boxes
whose multipoles are used in order to evaluate potentials of targets in boxes represented by *responsible_boxes_mask*. - :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape - ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box. - :arg ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape - ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box - or an ancestor of the responsible boxes. - :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` - whose i-th entry is 1 if multipoles of box ``i`` are needed for evaluating - the potentials of targets in boxes represented by *responsible_boxes_mask*. + :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose + i-th entry is 1 if ``i`` is a responsible box. + :arg ancestor_boxes_mask: an array of shape ``(tree.nboxes,)`` whose + i-th entry is 1 if ``i`` is either a responsible box or an ancestor of + the responsible boxes. + :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if + multipoles of box ``i`` are needed for evaluating the potentials of + targets in boxes represented by *responsible_boxes_mask*. """ - - multipole_boxes_mask = cl.array.zeros( - queue, (traversal.tree.nboxes,), dtype=np.int8 - ) + knl = get_add_interaction_list_boxes_kernel(actx, traversal.tree.box_id_dtype) + multipole_boxes_mask = actx.zeros((traversal.tree.nboxes,), dtype=np.int8) # A mpole is used by process p if it is in the List 2 of either a box # owned by p or one of its ancestors. - code.add_interaction_list_boxes_kernel()( + knl( traversal.target_or_target_parent_boxes, responsible_boxes_mask | ancestor_boxes_mask, traversal.from_sep_siblings_starts, traversal.from_sep_siblings_lists, multipole_boxes_mask, - queue=queue + queue=actx.queue ) - multipole_boxes_mask.finish() # A mpole is used by process p if it is in the List 3 of a box owned by p. 
for ilevel in range(traversal.tree.nlevels): - code.add_interaction_list_boxes_kernel()( + knl( traversal.target_boxes_sep_smaller_by_source_level[ilevel], responsible_boxes_mask, traversal.from_sep_smaller_by_level[ilevel].starts, traversal.from_sep_smaller_by_level[ilevel].lists, multipole_boxes_mask, - queue=queue + queue=actx.queue ) - multipole_boxes_mask.finish() - return multipole_boxes_mask -@dataclass +@dataclass_array_container +@dataclass(frozen=True) class BoxMasks: """ - Box masks needed for the distributed calculation. Each of these masks is a - PyOpenCL array with length ``tree.nboxes``, whose `i`-th entry is 1 if box `i` is + Box masks needed for the distributed calculation. Each of these masks is an + array with length ``tree.nboxes``, whose `i`-th entry is 1 if box `i` is set. .. attribute:: responsible_boxes @@ -313,13 +321,13 @@ class BoxMasks: Current process needs multipole expressions in these boxes. """ - responsible_boxes: cl.array.Array - ancestor_boxes: cl.array.Array - point_src_boxes: cl.array.Array - multipole_src_boxes: cl.array.Array + responsible_boxes: Array + ancestor_boxes: Array + point_src_boxes: Array + multipole_src_boxes: Array -def get_box_masks(queue, traversal, responsible_boxes_list): +def get_box_masks(actx, traversal, responsible_boxes_list): """Given the responsible boxes for a rank, this helper function calculates the relevant masks. @@ -327,27 +335,23 @@ def get_box_masks(queue, traversal, responsible_boxes_list): :returns: A :class:`BoxMasks` object of the relevant masks. """ - code = GetBoxMasksCodeContainer(queue.context, traversal.tree.box_id_dtype) - - # FIXME: It is wasteful to copy the whole traversal object into device memory - # here because - # 1) Not all fields are needed. - # 2) For sumpy wrangler, a device traversal object is already available. 
- traversal = traversal.to_device(queue) - - responsible_boxes_mask = np.zeros((traversal.tree.nboxes,), dtype=np.int8) - responsible_boxes_mask[responsible_boxes_list] = 1 - responsible_boxes_mask = cl.array.to_device(queue, responsible_boxes_mask) + responsible_boxes_mask = actx.zeros((traversal.tree.nboxes,), dtype=np.int8) + responsible_boxes_mask[responsible_boxes_list] = ( + 1 + actx.zeros(responsible_boxes_list.shape, np.int8)) ancestor_boxes_mask = get_ancestor_boxes_mask( - queue, code, traversal, responsible_boxes_mask) + actx, traversal, responsible_boxes_mask) point_src_boxes_mask = get_point_src_boxes_mask( - queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask) + actx, traversal, responsible_boxes_mask, ancestor_boxes_mask) multipole_src_boxes_mask = get_multipole_src_boxes_mask( - queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask) + actx, traversal, responsible_boxes_mask, ancestor_boxes_mask) return BoxMasks( - responsible_boxes_mask, ancestor_boxes_mask, point_src_boxes_mask, + responsible_boxes_mask, + ancestor_boxes_mask, + point_src_boxes_mask, multipole_src_boxes_mask) + +# }}} diff --git a/boxtree/fmm.py b/boxtree/fmm.py index 7644349d..83ef14cb 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -33,6 +33,7 @@ from boxtree.tree import Tree from boxtree.traversal import FMMTraversalInfo +from boxtree.array_context import PyOpenCLArrayContext import logging logger = logging.getLogger(__name__) @@ -155,6 +156,7 @@ def local_expansions_view(self, local_exps, level): @abstractmethod def form_multipoles(self, + actx: PyOpenCLArrayContext, level_start_source_box_nrs, source_boxes, src_weight_vecs): """Return an expansions array @@ -167,6 +169,7 @@ def form_multipoles(self, @abstractmethod def coarsen_multipoles(self, + actx: PyOpenCLArrayContext, level_start_source_parent_box_nrs, source_parent_boxes, mpoles): """For each box in *source_parent_boxes*, @@ -179,6 +182,7 @@ def coarsen_multipoles(self, @abstractmethod 
def eval_direct(self, + actx: PyOpenCLArrayContext, target_boxes, neighbor_sources_starts, neighbor_sources_lists, src_weight_vecs): """For each box in *target_boxes*, evaluate the influence of the @@ -191,6 +195,7 @@ def eval_direct(self, @abstractmethod def multipole_to_local(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, starts, lists, mpole_exps): @@ -205,6 +210,7 @@ def multipole_to_local(self, @abstractmethod def eval_multipoles(self, + actx: PyOpenCLArrayContext, target_boxes_by_source_level, from_sep_smaller_by_level, mpole_exps): """For a level *i*, each box in *target_boxes_by_source_level[i]*, evaluate the multipole expansion in *mpole_exps* in the nearby boxes given in @@ -218,6 +224,7 @@ def eval_multipoles(self, @abstractmethod def form_locals(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, starts, lists, src_weight_vecs): """For each box in *target_or_target_parent_boxes*, form local @@ -232,6 +239,7 @@ def form_locals(self, @abstractmethod def refine_locals(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, local_exps): """For each box in *child_boxes*, @@ -243,6 +251,7 @@ def refine_locals(self, @abstractmethod def eval_locals(self, + actx: PyOpenCLArrayContext, level_start_target_box_nrs, target_boxes, local_exps): """For each box in *target_boxes*, evaluate the local expansion in *local_exps* and return a new potential array. @@ -254,7 +263,7 @@ def eval_locals(self, # }}} @abstractmethod - def finalize_potentials(self, potentials, template_ary): + def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials): """ Postprocess the reordered potentials. This is where global scaling factors could be applied. This is distinct from :meth:`reorder_potentials` @@ -268,7 +277,9 @@ def finalize_potentials(self, potentials, template_ary): type. 
""" - def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): + def distribute_source_weights(self, + actx: PyOpenCLArrayContext, + src_weight_vecs, src_idx_all_ranks): """Used by the distributed implementation for transferring needed source weights from root rank to each worker rank in the communicator. @@ -288,7 +299,9 @@ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): """ return src_weight_vecs - def gather_potential_results(self, potentials, tgt_idx_all_ranks): + def gather_potential_results(self, + actx: PyOpenCLArrayContext, + potentials, tgt_idx_all_ranks): """Used by the distributed implementation for gathering calculated potentials from all worker ranks in the communicator to the root rank. @@ -305,7 +318,9 @@ def gather_potential_results(self, potentials, tgt_idx_all_ranks): """ return potentials - def communicate_mpoles(self, mpole_exps, return_stats=False): + def communicate_mpoles(self, + actx: PyOpenCLArrayContext, + mpole_exps, return_stats=False): """Used by the distributed implementation for forming the complete multipole expansions from the partial multipole expansions. @@ -319,14 +334,16 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): :returns: Statistics of the communication if *return_stats* is True. *None* otherwise. """ - pass # }}} -def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, +def drive_fmm(actx: PyOpenCLArrayContext, + wrangler: ExpansionWranglerInterface, + src_weight_vecs, *, timing_data=None, - global_src_idx_all_ranks=None, global_tgt_idx_all_ranks=None): + global_src_idx_all_ranks=None, + global_tgt_idx_all_ranks=None): """Top-level driver routine for a fast multipole calculation. In part, this is intended as a template for custom FMMs, in the sense that @@ -340,7 +357,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, :arg expansion_wrangler: An object exhibiting the :class:`ExpansionWranglerInterface`. 
For distributed implementation, this wrangler should be a subclass of - :class:`boxtree.distributed.calculation.DistributedExpansionWrangler`. + :class:`boxtree.distributed.calculation.DistributedExpansionWranglerMixin`. :arg src_weight_vecs: A sequence of source 'density/weights/charges'. Passed unmodified to *expansion_wrangler*. For distributed implementation, this argument is only significant on the root rank, but @@ -373,15 +390,17 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, from boxtree.timing import TimingRecorder recorder = TimingRecorder() - src_weight_vecs = [wrangler.reorder_sources(weight) for - weight in src_weight_vecs] + src_weight_vecs = [ + wrangler.reorder_sources(weight) for weight in src_weight_vecs] src_weight_vecs = wrangler.distribute_source_weights( - src_weight_vecs, global_src_idx_all_ranks) + actx, + src_weight_vecs, global_src_idx_all_ranks) # {{{ "Step 2.1:" Construct local multipoles mpole_exps, timing_future = wrangler.form_multipoles( + actx, traversal.level_start_source_box_nrs, traversal.source_boxes, src_weight_vecs) @@ -393,6 +412,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # {{{ "Step 2.2:" Propagate multipoles upward mpole_exps, timing_future = wrangler.coarsen_multipoles( + actx, traversal.level_start_source_parent_box_nrs, traversal.source_parent_boxes, mpole_exps) @@ -403,11 +423,12 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # }}} - wrangler.communicate_mpoles(mpole_exps) + wrangler.communicate_mpoles(actx, mpole_exps) # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1") potentials, timing_future = wrangler.eval_direct( + actx, traversal.target_boxes, traversal.neighbor_source_boxes_starts, traversal.neighbor_source_boxes_lists, @@ -422,6 +443,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local local_exps, timing_future = 
wrangler.multipole_to_local( + actx, traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, traversal.from_sep_siblings_starts, @@ -440,6 +462,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # contribution *out* of the downward-propagating local expansions) mpole_result, timing_future = wrangler.eval_multipoles( + actx, traversal.target_boxes_sep_smaller_by_source_level, traversal.from_sep_smaller_by_level, mpole_exps) @@ -455,6 +478,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, "('list 3 close')") direct_result, timing_future = wrangler.eval_direct( + actx, traversal.target_boxes, traversal.from_sep_close_smaller_starts, traversal.from_sep_close_smaller_lists, @@ -469,6 +493,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4") local_result, timing_future = wrangler.form_locals( + actx, traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, traversal.from_sep_bigger_starts, @@ -481,6 +506,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, if traversal.from_sep_close_bigger_starts is not None: direct_result, timing_future = wrangler.eval_direct( + actx, traversal.target_boxes, traversal.from_sep_close_bigger_starts, traversal.from_sep_close_bigger_lists, @@ -495,6 +521,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # {{{ "Stage 7:" propagate local_exps downward local_exps, timing_future = wrangler.refine_locals( + actx, traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, local_exps) @@ -506,6 +533,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # {{{ "Stage 8:" evaluate locals local_result, timing_future = wrangler.eval_locals( + actx, traversal.level_start_target_box_nrs, traversal.target_boxes, local_exps) @@ -517,11 
+545,11 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # }}} potentials = wrangler.gather_potential_results( - potentials, global_tgt_idx_all_ranks) + actx, + potentials, global_tgt_idx_all_ranks) result = wrangler.reorder_potentials(potentials) - - result = wrangler.finalize_potentials(result, template_ary=src_weight_vecs[0]) + result = wrangler.finalize_potentials(actx, result) fmm_proc.done() diff --git a/boxtree/pyfmmlib_integration.py b/boxtree/pyfmmlib_integration.py index 4cb62171..4132cdcf 100644 --- a/boxtree/pyfmmlib_integration.py +++ b/boxtree/pyfmmlib_integration.py @@ -36,6 +36,7 @@ """ import enum +from abc import ABC, abstractmethod import numpy as np @@ -51,7 +52,7 @@ # {{{ rotation data interface -class FMMLibRotationDataInterface: +class FMMLibRotationDataInterface(ABC): """Abstract interface for additional, optional data for precomputation of rotation matrices passed to the expansion wrangler. @@ -61,16 +62,16 @@ class FMMLibRotationDataInterface: """ + @abstractmethod def m2l_rotation_lists(self): """Return a :mod:`numpy` array mapping entries of List 2 to rotation classes. """ - raise NotImplementedError + @abstractmethod def m2l_rotation_angles(self): """Return a :mod:`numpy` array mapping List 2 rotation classes to rotation angles. 
""" - raise NotImplementedError class FMMLibRotationData(FMMLibRotationDataInterface): @@ -84,17 +85,12 @@ def __init__(self, array_context: PyOpenCLArrayContext, trav): self.trav = trav self.tree = trav.tree - @property - @memoize_method - def rotation_classes_builder(self): - from boxtree.rotation_classes import RotationClassesBuilder - return RotationClassesBuilder(self._setup_actx) - @memoize_method def build_rotation_classes_lists(self): - trav = self._setup_actx.from_numpy(self.trav) - tree = self._setup_actx.from_numpy(self.tree) - return self.rotation_classes_builder(self._setup_actx, trav, tree)[0] + from boxtree.rotation_classes import build_rotation_classes + actx = self._setup_actx + return build_rotation_classes( + actx, actx.from_numpy(self.trav), actx.from_numpy(self.tree)) @memoize_method def m2l_rotation_lists(self): @@ -684,7 +680,9 @@ def reorder_potentials(self, potentials): @log_process(logger) @return_timing_data - def form_multipoles(self, level_start_source_box_nrs, source_boxes, + def form_multipoles(self, actx: PyOpenCLArrayContext, + level_start_source_box_nrs, + source_boxes, src_weight_vecs): src_weights, = src_weight_vecs formmp = self.tree_indep.get_routine( @@ -727,8 +725,10 @@ def form_multipoles(self, level_start_source_box_nrs, source_boxes, @log_process(logger) @return_timing_data - def coarsen_multipoles(self, level_start_source_parent_box_nrs, - source_parent_boxes, mpoles): + def coarsen_multipoles(self, actx: PyOpenCLArrayContext, + level_start_source_parent_box_nrs, + source_parent_boxes, + mpoles): tree = self.tree mpmp = self.tree_indep.get_translation_routine(self, "%ddmpmp") @@ -783,8 +783,11 @@ def coarsen_multipoles(self, level_start_source_parent_box_nrs, @log_process(logger) @return_timing_data - def eval_direct(self, target_boxes, neighbor_sources_starts, - neighbor_sources_lists, src_weight_vecs): + def eval_direct(self, actx: PyOpenCLArrayContext, + target_boxes, + neighbor_sources_starts, + 
neighbor_sources_lists, + src_weight_vecs): src_weights, = src_weight_vecs output = self.output_zeros() @@ -827,7 +830,7 @@ def eval_direct(self, target_boxes, neighbor_sources_starts, @log_process(logger) @return_timing_data - def multipole_to_local(self, + def multipole_to_local(self, actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, starts, lists, mpole_exps): @@ -942,8 +945,9 @@ def multipole_to_local(self, @log_process(logger) @return_timing_data - def eval_multipoles(self, - target_boxes_by_source_level, sep_smaller_nonsiblings_by_level, + def eval_multipoles(self, actx: PyOpenCLArrayContext, + target_boxes_by_source_level, + sep_smaller_nonsiblings_by_level, mpole_exps): output = self.output_zeros() @@ -985,9 +989,10 @@ def eval_multipoles(self, @log_process(logger) @return_timing_data - def form_locals(self, + def form_locals(self, actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, - target_or_target_parent_boxes, starts, lists, src_weight_vecs): + target_or_target_parent_boxes, + starts, lists, src_weight_vecs): src_weights, = src_weight_vecs local_exps = self.local_expansion_zeros() @@ -1065,8 +1070,10 @@ def form_locals(self, @log_process(logger) @return_timing_data - def refine_locals(self, level_start_target_or_target_parent_box_nrs, - target_or_target_parent_boxes, local_exps): + def refine_locals(self, actx: PyOpenCLArrayContext, + level_start_target_or_target_parent_box_nrs, + target_or_target_parent_boxes, + local_exps): locloc = self.tree_indep.get_translation_routine(self, "%ddlocloc") @@ -1112,7 +1119,10 @@ def refine_locals(self, level_start_target_or_target_parent_box_nrs, @log_process(logger) @return_timing_data - def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): + def eval_locals(self, actx: PyOpenCLArrayContext, + level_start_target_box_nrs, + target_boxes, + local_exps): output = self.output_zeros() taeval = 
self.tree_indep.get_expn_eval_routine("ta") @@ -1147,7 +1157,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): return output @log_process(logger) - def finalize_potentials(self, potential, template_ary): + def finalize_potentials(self, actx: PyOpenCLArrayContext, potential): if self.tree_indep.eqn_letter == "l" and self.dim == 2: scale_factor = -1/(2*np.pi) elif self.tree_indep.eqn_letter == "h" and self.dim == 2: diff --git a/boxtree/rotation_classes.py b/boxtree/rotation_classes.py index 7093bcff..6de3da6d 100644 --- a/boxtree/rotation_classes.py +++ b/boxtree/rotation_classes.py @@ -3,11 +3,7 @@ ------------------------------- .. autoclass:: RotationClassesInfo - -Build rotation classes ----------------------- - -.. autoclass:: RotationClassesBuilder +.. autofunction:: build_rotation_classes """ __copyright__ = "Copyright (C) 2019 Matt Wala" @@ -37,18 +33,44 @@ import numpy as np from arraycontext import Array +from pytools import log_process -from boxtree.translation_classes import TranslationClassesBuilder +from boxtree.tree import Tree +from boxtree.traversal import FMMTraversalInfo from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container import logging logger = logging.getLogger(__name__) -from pytools import log_process + +def vec_gcd(vec) -> int: + """Return the GCD of a list of integers.""" + import math + + # TODO: math.gcd supports a list of integers from >= 3.9 + result = abs(vec[0]) + for elem in vec[1:]: + result = math.gcd(result, abs(elem)) + + return result # {{{ rotation classes builder +class RotationClassesBuilder: + def __init__(self, *args, **kwargs): + pass + + def __call__(self, actx, trav, tree, wait_for=None): + from warnings import warn + warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. 
" + "Use 'build_rotation_classes' instead.", + DeprecationWarning, stacklevel=2) + + result = build_rotation_classes(actx, trav, tree) + return result, None + + @dataclass_array_container @dataclass(frozen=True) class RotationClassesInfo: @@ -84,112 +106,95 @@ def nfrom_sep_siblings_rotation_classes(self): return len(self.from_sep_siblings_rotation_class_to_angle) -class RotationClassesBuilder: - """Build rotation classes for List 2 translations. - - .. automethod:: __init__ - .. automethod:: __call__ - """ - - def __init__(self, array_context: PyOpenCLArrayContext): - self._setup_actx = array_context - self.tcb = TranslationClassesBuilder(array_context) - - @staticmethod - def vec_gcd(vec) -> int: - """Return the GCD of a list of integers.""" - import math - - # TODO: math.gcd supports a list of integers from >= 3.9 - result = abs(vec[0]) - for elem in vec[1:]: - result = math.gcd(result, abs(elem)) - - return result - - def compute_rotation_classes(self, - well_sep_is_n_away: int, dimensions: int, used_translation_classes): - """Convert translation classes to a list of rotation classes and angles.""" - angle_to_rot_class = {} - angles = [] - - ntranslation_classes_per_level = ( - self.tcb.ntranslation_classes_per_level(well_sep_is_n_away, - dimensions)) - - translation_class_to_rot_class = ( - np.empty(ntranslation_classes_per_level, dtype=np.int32)) - - translation_class_to_rot_class[:] = -1 - - for cls in used_translation_classes: - vec = self.tcb.translation_class_to_normalized_vector( - well_sep_is_n_away, dimensions, cls) - - # Normalize the translation vector (by dividing by its GCD). - # - # We need this before computing the cosine of the rotation angle, - # because generally in in floating point arithmetic, if k is a - # positive scalar and v is a vector, we can't assume - # - # kv[-1] / sqrt(|kv|^2) == v[-1] / sqrt(|v|^2). 
- # - # Normalizing ensures vectors that are positive integer multiples of - # each other get classified into the same equivalence class of - # rotations. - vec //= self.vec_gcd(vec) - - # Compute the rotation angle for the vector. - norm = np.linalg.norm(vec) - assert norm != 0 - angle = np.arccos(vec[-1] / norm) - - # Find the rotation class. - if angle in angle_to_rot_class: - rot_class = angle_to_rot_class[angle] - else: - rot_class = len(angles) - angle_to_rot_class[angle] = rot_class - angles.append(angle) - - translation_class_to_rot_class[cls] = rot_class - - return translation_class_to_rot_class, angles - - @log_process(logger, "build m2l rotation classes") - def __call__(self, actx, trav, tree, wait_for=None): - """Returns a pair *info*, *evt* where info is a :class:`RotationClassesInfo`. - """ - evt, translation_class_is_used, translation_classes_lists = \ - self.tcb.compute_translation_classes(actx, trav, tree, wait_for, False) - - d = tree.dimensions - n = trav.well_sep_is_n_away - - # convert translation classes to rotation classes - - used_translation_classes = ( - np.flatnonzero(actx.to_numpy(translation_class_is_used))) - - translation_class_to_rotation_class, rotation_angles = ( - self.compute_rotation_classes(n, d, used_translation_classes)) - - # There should be no more than 2^(d-1) * (2n+1)^d distinct rotation - # classes, since that is an upper bound on the number of distinct - # positions for list 2 boxes. 
- assert len(rotation_angles) <= 2**(d-1) * (2*n+1)**d - - rotation_classes_lists = actx.from_numpy( - translation_class_to_rotation_class - )[translation_classes_lists] - rotation_angles = actx.from_numpy(np.array(rotation_angles)) - - info = RotationClassesInfo( - from_sep_siblings_rotation_classes=rotation_classes_lists, - from_sep_siblings_rotation_class_to_angle=rotation_angles, - ) - - return actx.freeze(info), evt +def translation_classes_to_rotation_classes_and_angles( + used_translation_classes, well_sep_is_n_away: int, dimensions: int): + """Convert translation classes to a list of rotation classes and angles.""" + angle_to_rot_class = {} + angles = [] + + from boxtree.translation_classes import ntranslation_classes_per_level + ntranslation_classes_per_level = ( + ntranslation_classes_per_level(well_sep_is_n_away, dimensions)) + + translation_class_to_rot_class = ( + np.empty(ntranslation_classes_per_level, dtype=np.int32)) + + translation_class_to_rot_class[:] = -1 + + from boxtree.translation_classes import translation_class_to_normalized_vector + for cls in used_translation_classes: + vec = translation_class_to_normalized_vector( + well_sep_is_n_away, dimensions, cls) + + # Normalize the translation vector (by dividing by its GCD). + # + # We need this before computing the cosine of the rotation angle, + # because generally in floating point arithmetic, if k is a + # positive scalar and v is a vector, we can't assume + # + # kv[-1] / sqrt(|kv|^2) == v[-1] / sqrt(|v|^2). + # + # Normalizing ensures vectors that are positive integer multiples of + # each other get classified into the same equivalence class of + # rotations. + vec //= vec_gcd(vec) + + # Compute the rotation angle for the vector. + norm = np.linalg.norm(vec) + assert norm != 0 + angle = np.arccos(vec[-1] / norm) + + # Find the rotation class. 
+ if angle in angle_to_rot_class: + rot_class = angle_to_rot_class[angle] + else: + rot_class = len(angles) + angle_to_rot_class[angle] = rot_class + angles.append(angle) + + translation_class_to_rot_class[cls] = rot_class + + return translation_class_to_rot_class, angles + + +@log_process(logger, "build m2l rotation classes") +def build_rotation_classes( + actx: PyOpenCLArrayContext, + trav: FMMTraversalInfo, tree: Tree) -> RotationClassesInfo: + """Build rotation classes for List 2 translations.""" + from boxtree.translation_classes import compute_used_translation_classes + translation_class_is_used, translation_classes_lists = ( + compute_used_translation_classes(actx, trav, tree, + is_translation_per_level=False)) + + d = tree.dimensions + n = trav.well_sep_is_n_away + + # convert translation classes to rotation classes + + used_translation_classes = ( + np.flatnonzero(actx.to_numpy(translation_class_is_used))) + + translation_class_to_rotation_class, rotation_angles = ( + translation_classes_to_rotation_classes_and_angles( + used_translation_classes, n, d)) + + # There should be no more than 2^(d-1) * (2n+1)^d distinct rotation + # classes, since that is an upper bound on the number of distinct + # positions for list 2 boxes. 
+ assert len(rotation_angles) <= 2**(d-1) * (2*n+1)**d + + rotation_classes_lists = actx.from_numpy( + translation_class_to_rotation_class + )[translation_classes_lists] + rotation_angles = actx.from_numpy(np.array(rotation_angles)) + + info = RotationClassesInfo( + from_sep_siblings_rotation_classes=rotation_classes_lists, + from_sep_siblings_rotation_class_to_angle=rotation_angles, + ) + + return actx.freeze(info) # }}} diff --git a/boxtree/tools.py b/boxtree/tools.py index 951dea1a..da8ad00b 100644 --- a/boxtree/tools.py +++ b/boxtree/tools.py @@ -28,17 +28,15 @@ import pyopencl as cl import pyopencl.array +from pyopencl.elementwise import ElementwiseTemplate, ElementwiseKernel from pyopencl.tools import dtype_to_c_struct, ScalarArg, VectorArg as _VectorArg from mako.template import Template -from pytools import Record, memoize_method +from pytools import Record, memoize_in from pytools.obj_array import make_obj_array from boxtree.array_context import PyOpenCLArrayContext -import loopy as lp -from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 - # Use offsets in VectorArg by default. 
VectorArg = partial(_VectorArg, with_offset=True) @@ -51,22 +49,19 @@ def padded_bin(i, nbits): return bin(i)[2:].rjust(nbits, "0") -# NOTE: Order of positional args should match GappyCopyAndMapKernel.__call__() -def realloc_array(actx, new_shape, ary, zero_fill=False, wait_for=None): - if wait_for is None: - wait_for = [] - +# NOTE: Order of positional args should match copy_and_map_gappy +def realloc_array(actx: PyOpenCLArrayContext, new_shape, ary, zero_fill=False): if zero_fill: - array_maker = actx.zeros + new_ary = actx.zeros(shape=new_shape, dtype=ary.dtype) else: - array_maker = actx.empty + new_ary = actx.empty(shape=new_shape, dtype=ary.dtype) - new_ary = array_maker(shape=new_shape, dtype=ary.dtype) evt = cl.enqueue_copy(actx.queue, new_ary.data, ary.data, byte_count=ary.nbytes, - wait_for=wait_for + new_ary.events) + wait_for=new_ary.events) + new_ary.add_event(evt) - return new_ary, evt + return new_ary def reverse_index_array(actx, indices, target_size=None, result_fill_value=None): @@ -107,155 +102,155 @@ def make_normal_particle_array(actx, nparticles, dims, dtype, seed=15): def make_surface_particle_array(actx, nparticles, dims, dtype, seed=15): - if dims == 2: - def get_2d_knl(dtype): - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - """ - for i - <> phi = 2*M_PI/n * i - x[i] = 0.5* (3*cos(phi) + 2*sin(3*phi)) - y[i] = 0.5* (1*sin(phi) + 1.5*sin(2*phi)) - end - """, - [ - lp.GlobalArg("x,y", dtype, shape=lp.auto), - lp.ValueArg("n", np.int32), - ], - name="make_surface_dist") - - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - - return knl + import loopy as lp + from boxtree.array_context import make_loopy_program + + @memoize_in(actx, (make_surface_particle_array, "2d", dtype)) + def get_2d_kernel(): + knl = make_loopy_program( + "{[i]: 0 <= i < n}", + """ + for i + <> phi = 2*M_PI / n * i + x0[i] = 0.5 * (3 * cos(phi) + 2.0 * sin(3 * phi)) + x1[i] = 0.5 * (1 * sin(phi) + 1.5 * sin(2 * phi)) + end + """, + kernel_data=[ + lp.GlobalArg("x0,x1", dtype, 
shape=lp.auto), + lp.ValueArg("n", np.int32), + ], + name="make_surface_array_2d", + assumptions="n>0") + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + return knl + + @memoize_in(actx, (make_surface_particle_array, "3d", dtype)) + def get_3d_kernel(): + knl = make_loopy_program( + "{[i, j]: 0 <= i, j < n}", + """ + for i, j + <> phi = 2 * M_PI / n * i + <> theta = 2 * M_PI / n * j + x0[i, j] = 5 * cos(phi) * (3 + cos(theta)) + x1[i, j] = 5 * sin(phi) * (3 + cos(theta)) + x2[i, j] = 5 * sin(theta) + end + """, + kernel_data=[ + lp.GlobalArg("x0,x1,x2", dtype, shape=lp.auto), + lp.ValueArg("n", np.int32), + ], + name="make_surface_array_3d", + assumptions="n>0") + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") + knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") + + return knl - evt, result = get_2d_knl(dtype)(actx.queue, n=nparticles) - - result = [x.ravel() for x in result] - - return make_obj_array(result) + if dims == 2: + n = nparticles + knl = get_2d_kernel() elif dims == 3: n = int(nparticles**0.5) - - def get_3d_knl(dtype): - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n}", - """ - for i,j - <> phi = 2*M_PI/n * i - <> theta = 2*M_PI/n * j - x[i,j] = 5*cos(phi) * (3 + cos(theta)) - y[i,j] = 5*sin(phi) * (3 + cos(theta)) - z[i,j] = 5*sin(theta) - end - """, - [ - lp.GlobalArg("x,y,z,", dtype, shape=lp.auto), - lp.ValueArg("n", np.int32), - ]) - - knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - - return knl - - evt, result = get_3d_knl(dtype)(actx.queue, n=n) - - result = [x.ravel() for x in result] - - return make_obj_array(result) + knl = get_3d_kernel() else: - raise NotImplementedError + raise ValueError(f"unsupported dimensions: {dims}") + + assert n > 0 + result = actx.call_loopy(knl, n=n) + return make_obj_array([result[f"x{i}"].ravel() for i in range(dims)]) def make_uniform_particle_array(actx, nparticles, dims, dtype, seed=15): + import loopy as lp 
+ from boxtree.array_context import make_loopy_program + + @memoize_in(actx, (make_uniform_particle_array, "2d", dtype)) + def get_2d_kernel(): + knl = make_loopy_program( + "{[i, j]: 0 <= i, j < n}", + """ + for i, j + <> xx = 4 * i / (n - 1) + <> yy = 4 * j / (n - 1) + angle = 0.3 + <> s = sin(angle) + <> c = cos(angle) + x0[i, j] = c * xx + s * yy - 2 + x1[i, j] = -s * xx + c * yy - 2 + end + """, + kernel_data=[ + lp.GlobalArg("x0,x1", dtype, shape=lp.auto), + lp.ValueArg("n", np.int32), + ], + name="make_uniform_array_2d", + assumptions="n>0") + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") + knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") + + return knl + + @memoize_in(actx, (make_uniform_particle_array, "3d", dtype)) + def get_3d_kernel(): + knl = make_loopy_program( + "{[i, j, k]: 0 <= i, j, k < n}", + """ + for i, j, k + <> xx = i / (n - 1) + <> yy = j / (n - 1) + <> zz = k / (n - 1) + + phi = 0.3 + <> s1 = sin(phi) + <> c1 = cos(phi) + + <> xxx = c1 * xx + s1 * yy + <> yyy = -s1 * xx + c1 * yy + <> zzz = zz + + theta = 0.7 + <> s2 = sin(theta) + <> c2 = cos(theta) + + x0[i, j, k] = 4 * (c2 * xxx + s2 * zzz) - 2 + x1[i, j, k] = 4 * yyy - 2 + x2[i, j, k] = 4 * (-s2 * xxx + c2 * zzz) - 2 + end + """, + kernel_data=[ + lp.GlobalArg("x0,x1,x2", dtype, shape=lp.auto), + lp.ValueArg("n", np.int32), + ], + name="make_uniform_array_3d", + assumptions="n>0") + + knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") + knl = lp.split_iname(knl, "k", 16, outer_tag="g.0", inner_tag="l.0") + + return knl + if dims == 2: n = int(nparticles**0.5) - - def get_2d_knl(dtype): - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n}", - """ - for i,j - <> xx = 4*i/(n-1) - <> yy = 4*j/(n-1) - angle = 0.3 - <> s = sin(angle) - <> c = cos(angle) - x[i,j] = c*xx + s*yy - 2 - y[i,j] = -s*xx + c*yy - 2 - end - """, - [ - lp.GlobalArg("x,y", dtype, shape=lp.auto), - lp.ValueArg("n", np.int32), - ], assumptions="n>0") - - knl = lp.split_iname(knl, "i", 16, 
outer_tag="g.1", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - - return knl - - evt, result = get_2d_knl(dtype)(actx.queue, n=n) - - result = [x.ravel() for x in result] - - return make_obj_array(result) + knl = get_2d_kernel() elif dims == 3: n = int(nparticles**(1/3)) - - def get_3d_knl(dtype): - knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j,k<n}", - """ - for i,j,k - <> xx = i/(n-1) - <> yy = j/(n-1) - <> zz = k/(n-1) - - phi = 0.3 - <> s1 = sin(phi) - <> c1 = cos(phi) - - <> xxx = c1*xx + s1*yy - <> yyy = -s1*xx + c1*yy - <> zzz = zz - - theta = 0.7 - <> s2 = sin(theta) - <> c2 = cos(theta) - - x[i,j,k] = 4 * (c2*xxx + s2*zzz) - 2 - y[i,j,k] = 4 * yyy - 2 - z[i,j,k] = 4 * (-s2*xxx + c2*zzz) - 2 - end - """, - [ - lp.GlobalArg("x,y,z", dtype, shape=lp.auto), - lp.ValueArg("n", np.int32), - ], assumptions="n>0") - - knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.split_iname(knl, "k", 16, outer_tag="g.0", inner_tag="l.0") - - return knl - - evt, result = get_3d_knl(dtype)(actx.queue, n=n) - - result = [x.ravel() for x in result] - - return make_obj_array(result) + knl = get_3d_kernel() else: - raise NotImplementedError - + raise ValueError(f"unsupported dimensions: {dims}") -def make_rotated_uniform_particle_array(actx, nparticles, dims, dtype, seed=15): - raise NotImplementedError + assert n > 0 -# }}} + result = actx.call_loopy(knl, n=n) + return make_obj_array([result[f"x{i}"].ravel() for i in range(dims)]) -def particle_array_to_host(actx, particles): - return np.array([actx.to_numpy(x) for x in particles], order="F").T +# }}} # {{{ host/device data storage @@ -310,9 +305,8 @@ def transform_val(val): def get(self, queue, **kwargs): """ :returns: a copy of *self* in which all data lives on the host, i.e. - all :class:`pyopencl.array.Array` and `ImmutableHostDeviceArray` - objects are replaced by corresponding :class:`numpy.ndarray` - instances on the host. 
+ all :class:`pyopencl.array.Array` objects are replaced by + corresponding :class:`numpy.ndarray` instances on the host. """ from warnings import warn warn(f"{type(self).__name__}.get is deprecated and will be removed " @@ -320,9 +314,6 @@ def get(self, queue, **kwargs): DeprecationWarning, stacklevel=2) def try_get(attr): - if isinstance(attr, ImmutableHostDeviceArray): - return attr.host - try: return attr.get(queue=queue, **kwargs) except AttributeError: @@ -367,8 +358,6 @@ def to_device(self, queue, exclude_fields=frozenset()): def _to_device(attr): if isinstance(attr, np.ndarray): return cl.array.to_device(queue, attr).with_queue(None) - elif isinstance(attr, ImmutableHostDeviceArray): - return attr.device elif isinstance(attr, DeviceDataRecord): return attr.to_device(queue) else: @@ -376,38 +365,13 @@ def _to_device(attr): return self._transform_arrays(_to_device, exclude_fields=exclude_fields) - def to_host_device_array(self, queue, exclude_fields=frozenset()): - """ - :arg exclude_fields: a :class:`frozenset` containing fields excluded - from transformed to `ImmutableHostDeviceArray`. - - :returns: a copy of *self* where all device and host arrays are - transformed to `ImmutableHostDeviceArray` objects. - """ - from warnings import warn - warn(f"{type(self).__name__}.to_host_device_array is deprecated and will " - "be removed in 2023. 
Switch from ImmutableHostDeviceArray.", - DeprecationWarning, stacklevel=2) - - def _to_host_device_array(attr): - if isinstance(attr, (np.ndarray, cl.array.Array)): - return ImmutableHostDeviceArray(queue, attr) - elif isinstance(attr, DeviceDataRecord): - return attr.to_host_device_array(queue) - else: - return attr - - return self._transform_arrays( - _to_host_device_array, exclude_fields=exclude_fields - ) - # }}} # {{{ type mangling def get_type_moniker(dtype): - return "%s%d" % (dtype.kind, dtype.itemsize) + return f"{dtype.kind}{dtype.itemsize}" # }}} @@ -438,22 +402,33 @@ def get_type_moniker(dtype): """, strict_undefined=True) -class GappyCopyAndMapKernel: - def __init__(self, array_context: PyOpenCLArrayContext): - self._setup_actx = array_context - - @property - def context(self): - return self._setup_actx.queue.context - - @memoize_method - def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype, - have_src_indices, have_dst_indices, map_values): +# NOTE: Order of positional args should match realloc_array() +def copy_and_map_gappy( + actx: PyOpenCLArrayContext, new_shape, ary, + src_indices=None, dst_indices=None, mapping=None, range=None, + zero_fill: bool = False, + debug: bool = False): + """Compresses box info arrays after empty leaf pruning and, optionally, + maps old box IDs to new box IDs (if the array being operated on contains + box IDs). 
+ """ + have_src_indices = src_indices is not None + have_dst_indices = dst_indices is not None + have_mapping = mapping is not None + + src_index_dtype = src_indices.dtype if have_src_indices else None + dst_index_dtype = dst_indices.dtype if have_dst_indices else None + + @memoize_in(actx, ( + copy_and_map_gappy, ary.dtype, + src_index_dtype, dst_index_dtype, + have_src_indices, have_dst_indices, have_mapping)) + def get_kernel(): from boxtree.tools import VectorArg args = [ - VectorArg(dtype, "input_ary"), - VectorArg(dtype, "output_ary"), + VectorArg(ary.dtype, "input_ary"), + VectorArg(ary.dtype, "output_ary"), ] if have_src_indices: @@ -462,85 +437,62 @@ def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype, if have_dst_indices: args.append(VectorArg(dst_index_dtype, "to_indices")) - if map_values: - args.append(VectorArg(dtype, "value_map")) + if have_mapping: + args.append(VectorArg(ary.dtype, "value_map")) from pyopencl.tools import dtype_to_ctype src = GAPPY_COPY_TPL.render( - dtype=dtype, + dtype=ary.dtype, dtype_to_ctype=dtype_to_ctype, from_dtype=src_index_dtype, to_dtype=dst_index_dtype, from_indices=have_src_indices, to_indices=have_dst_indices, - map_values=map_values) + map_values=have_mapping) - from pyopencl.elementwise import ElementwiseKernel - return ElementwiseKernel(self.context, + return ElementwiseKernel(actx.context, args, str(src), - preamble=dtype_to_c_struct(self.context.devices[0], dtype), + preamble=dtype_to_c_struct(actx.queue.device, ary.dtype), name="gappy_copy_and_map") - # NOTE: Order of positional args should match realloc_array() - def __call__(self, actx, new_shape, ary, src_indices=None, - dst_indices=None, map_values=None, zero_fill=False, - wait_for=None, range=None, debug=False): - """Compresses box info arrays after empty leaf pruning and, optionally, - maps old box IDs to new box IDs (if the array being operated on contains - box IDs). 
- """ - - have_src_indices = src_indices is not None - have_dst_indices = dst_indices is not None - have_map_values = map_values is not None - - if not (have_src_indices or have_dst_indices): - raise ValueError("must specify at least one of src or dest indices") - - if range is None: - if have_src_indices and have_dst_indices: - raise ValueError( - "must supply range when passing both src and dest indices") - elif have_src_indices: - range = slice(src_indices.shape[0]) - if debug: - assert int(actx.to_numpy(actx.np.amax(src_indices))) < len(ary) - elif have_dst_indices: - range = slice(dst_indices.shape[0]) - if debug: - assert int(actx.to_numpy(actx.np.amax(dst_indices))) < new_shape - - if zero_fill: - array_maker = actx.zeros - else: - array_maker = actx.empty - - result = array_maker(new_shape, ary.dtype) + if not (have_src_indices or have_dst_indices): + raise ValueError("must specify at least one of src or dst indices") + + if range is None: + if have_src_indices and have_dst_indices: + raise ValueError( + "must supply range when passing both src and dst indices") + elif have_src_indices: + range = slice(src_indices.shape[0]) + if debug: + assert int(actx.to_numpy(actx.np.amax(src_indices))) < len(ary) + elif have_dst_indices: + range = slice(dst_indices.shape[0]) + if debug: + assert int(actx.to_numpy(actx.np.amax(dst_indices))) < new_shape - kernel = self._get_kernel(ary.dtype, - src_indices.dtype if have_src_indices else None, - dst_indices.dtype if have_dst_indices else None, - have_src_indices, - have_dst_indices, - have_map_values) + if zero_fill: + result = actx.zeros(shape=new_shape, dtype=ary.dtype) + else: + result = actx.empty(shape=new_shape, dtype=ary.dtype) - args = (ary, result) - args += (src_indices,) if have_src_indices else () - args += (dst_indices,) if have_dst_indices else () - args += (map_values,) if have_map_values else () + args = (ary, result) + args += (src_indices,) if have_src_indices else () + args += (dst_indices,) if 
have_dst_indices else () + args += (mapping,) if have_mapping else () - evt = kernel(*args, queue=actx.queue, range=range, wait_for=wait_for) + # FIXME: avoid in-place modifications + kernel = get_kernel() + evt = kernel(*args, queue=actx.queue, range=range) + result.add_event(evt) - return result, evt + return result # }}} # {{{ map values through table -from pyopencl.elementwise import ElementwiseTemplate - - MAP_VALUES_TPL = ElementwiseTemplate( arguments="""//CL// dst_value_t *dst, @@ -553,43 +505,31 @@ def __call__(self, actx, new_shape, ary, src_indices=None, name="map_values") -class MapValuesKernel: - def __init__(self, array_context: PyOpenCLArrayContext): - self._setup_actx = array_context - - @property - def context(self): - return self._setup_actx.queue.context +def map_values(actx: PyOpenCLArrayContext, mapping, src, dst=None): + """Map the values of *src* through *mapping* as ``mapping[src[i]]``.""" + if dst is None: + dst = src - @memoize_method - def _get_kernel(self, dst_dtype, src_dtype): + @memoize_in(actx, (map_values, dst.dtype, src.dtype)) + def get_kernel(): type_aliases = ( - ("src_value_t", src_dtype), - ("dst_value_t", dst_dtype) + ("src_value_t", src.dtype), + ("dst_value_t", dst.dtype) ) - return MAP_VALUES_TPL.build(self.context, type_aliases) - - def __call__(self, map_values, src, dst=None): - """ - Map the entries of the array `src` through the table `map_values`. 
- """ - if dst is None: - dst = src + return MAP_VALUES_TPL.build(actx.context, type_aliases) - kernel = self._get_kernel(dst.dtype, src.dtype) - evt = kernel(dst, src, map_values) + # FIXME: avoid in-place modifications :( + evt = get_kernel()(dst, src, mapping) + dst.add_event(evt) - return dst, evt + return dst # }}} # {{{ binary search -from mako.template import Template - - BINARY_SEARCH_TEMPLATE = Template(""" /* * Returns the largest value of i such that arr[i] <= val, or (size_t) -1 if val @@ -629,14 +569,8 @@ def __call__(self, map_values, src, dst=None): """) -class InlineBinarySearch: - - def __init__(self, elem_type_name): - self.render_vars = {"elem_t": elem_type_name} - - @memoize_method - def __str__(self): - return BINARY_SEARCH_TEMPLATE.render(**self.render_vars) +def inline_binary_search_for_type(elem_type_name: str) -> str: + return BINARY_SEARCH_TEMPLATE.render(**{"elem_t": elem_type_name}) # }}} @@ -669,82 +603,79 @@ def __str__(self): """ -class MaskCompressorKernel: - """ - .. automethod:: __call__ - """ - def __init__(self, array_context: PyOpenCLArrayContext): - self._setup_actx = array_context +def mask_to_csr(actx: PyOpenCLArrayContext, mask, list_dtype=None): + """Convert a mask to a list in :ref:`csr` format. - @property - def context(self): - return self._setup_actx.context + :arg mask: Either a 1D or 2D array. + * If *mask* is 1D, it should represent a masked list, where + *mask[i]* is true if and only if *i* is in the list. + * If *mask* is 2D, it should represent a list of masked lists, + so that *mask[i,j]* is true if and only if *j* is in list *i*. - @memoize_method - def get_list_compressor_kernel(self, mask_dtype, list_dtype): - from pyopencl.algorithm import ListOfListsBuilder + :arg list_dtype: The dtype for the output list(s). Defaults to the mask + dtype. + :returns: The return value depends on the type of the input. + * If *mask* is 1D, returns a *(list,)*. 
+ * If *mask* is 2D, returns a tuple *(starts, lists)*, as a :ref:`csr` list. + """ + from pyopencl.algorithm import ListOfListsBuilder + + if list_dtype is None: + list_dtype = mask.dtype + + @memoize_in(actx, (mask_to_csr, "list_compressor", mask.dtype, list_dtype)) + def get_list_compressor_kernel(): return ListOfListsBuilder( - self.context, + actx.context, [("output", list_dtype)], MASK_LIST_COMPRESSOR_BODY, [ - _VectorArg(mask_dtype, "mask"), + _VectorArg(mask.dtype, "mask"), ], name_prefix="compress_list") - @memoize_method - def get_matrix_compressor_kernel(self, mask_dtype, list_dtype): - from pyopencl.algorithm import ListOfListsBuilder - + @memoize_in(actx, (mask_to_csr, "matrix_compressor", mask.dtype, list_dtype)) + def get_matrix_compressor_kernel(): return ListOfListsBuilder( - self.context, + actx.context, [("output", list_dtype)], MASK_MATRIX_COMPRESSOR_BODY, [ ScalarArg(np.int32, "ncols"), ScalarArg(np.int32, "outer_stride"), ScalarArg(np.int32, "inner_stride"), - _VectorArg(mask_dtype, "mask"), + _VectorArg(mask.dtype, "mask"), ], name_prefix="compress_matrix") - def __call__(self, actx, mask, list_dtype=None): - """Convert a mask to a list in :ref:`csr` format. - - :arg mask: Either a 1D or 2D array. - * If *mask* is 1D, it should represent a masked list, where - *mask[i]* is true if and only if *i* is in the list. - * If *mask* is 2D, it should represent a list of masked lists, - so that *mask[i,j]* is true if and only if *j* is in list *i*. - - :arg list_dtype: The dtype for the output list(s). Defaults to the mask - dtype. + if len(mask.shape) == 1: + knl = get_list_compressor_kernel() + result, evt = knl( + actx.queue, mask.shape[0], mask.data, + allocator=actx.allocator, + ) + result["output"].lists.add_event(evt) + + return result["output"].lists + elif len(mask.shape) == 2: + # FIXME: This is efficient for small column sizes but may not be + # for larger ones since the work is partitioned by row. 
+ knl = get_matrix_compressor_kernel() + size = mask.dtype.itemsize + assert size > 0 + + result, evt = knl( + actx.queue, mask.shape[0], mask.shape[1], + mask.strides[0] // size, mask.strides[1] // size, mask.data, + allocator=actx.allocator, + ) + result["output"].starts.add_event(evt) + result["output"].lists.add_event(evt) - :returns: The return value depends on the type of the input. - * If mask* is 1D, returns a tuple *(list, evt)*. - * If *mask* is 2D, returns a tuple *(starts, lists, event)*, as a - :ref:`csr` list. - """ - if list_dtype is None: - list_dtype = mask.dtype - - if len(mask.shape) == 1: - knl = self.get_list_compressor_kernel(mask.dtype, list_dtype) - result, evt = knl(actx.queue, mask.shape[0], mask.data) - return (result["output"].lists, evt) - elif len(mask.shape) == 2: - # FIXME: This is efficient for small column sizes but may not be - # for larger ones since the work is partitioned by row. - knl = self.get_matrix_compressor_kernel(mask.dtype, list_dtype) - size = mask.dtype.itemsize - assert size > 0 - result, evt = knl(actx.queue, mask.shape[0], mask.shape[1], - mask.strides[0] // size, mask.strides[1] // size, - mask.data) - return (result["output"].starts, result["output"].lists, evt) - else: - raise ValueError("unsupported dimensionality") + return result["output"].starts, result["output"].lists + else: + raise ValueError("unsupported dimensionality") # }}} @@ -893,58 +824,6 @@ def run_mpi(script: str, num_processes: int, env: Dict[str, Any]) -> None: # }}} -# {{{ HostDeviceArray - -class ImmutableHostDeviceArray: - """Interface for arrays on both host and device. - - .. note:: This interface assumes the array is immutable. The behavior of - modifying the content of either the host array or the device array is undefined. - - @TODO: Once available, replace this implementation with PyOpenCL's in-house - implementation. 
- """ - def __init__(self, queue, array): - self.queue = queue - self.shape = array.shape - self.host_array = None - self.device_array = None - - if isinstance(array, np.ndarray): - self.host_array = array - elif isinstance(array, cl.array.Array): - self.device_array = array - - def with_queue(self, queue): - self.queue = queue - - @property - def svm_capable(self): - svm_capabilities = \ - self.queue.device.get_info(cl.device_info.SVM_CAPABILITIES) - if svm_capabilities & cl.device_svm_capabilities.FINE_GRAIN_BUFFER != 0: - return True - else: - return False - - @property - def host(self): - if self.host_array is None: - self.host_array = self.device_array.get(self.queue) - return self.host_array - - @property - def device(self): - if self.device_array is None: - # @TODO: Use SVM - self.device_array = cl.array.to_device(self.queue, self.host_array) - - self.device_array.with_queue(self.queue) - return self.device_array - -# }}} - - # {{{ coord_vec tools def get_coord_vec_dtype( diff --git a/boxtree/translation_classes.py b/boxtree/translation_classes.py index eaaf32f4..e810ed44 100644 --- a/boxtree/translation_classes.py +++ b/boxtree/translation_classes.py @@ -3,11 +3,7 @@ ---------------------------------- .. autoclass:: TranslationClassesInfo - -Build translation classes -------------------------- - -.. autoclass:: TranslationClassesBuilder +.. 
autofunction:: build_translation_classes """ __copyright__ = "Copyright (C) 2019 Matt Wala" @@ -36,24 +32,23 @@ from dataclasses import dataclass import numpy as np -from pyopencl.elementwise import ElementwiseTemplate, ElementwiseKernel +from pyopencl.elementwise import ElementwiseTemplate from arraycontext import Array -from pytools import memoize_method +from pytools import memoize_on_first_arg, log_process from mako.template import Template -from boxtree.tools import ( - InlineBinarySearch, get_coord_vec_dtype, coord_vec_subscript_code) +from boxtree.tree import Tree from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS, FMMTraversalInfo +from boxtree.tools import ( + inline_binary_search_for_type, get_coord_vec_dtype, coord_vec_subscript_code) from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container import logging logger = logging.getLogger(__name__) -from pytools import log_process - -# {{{ translation classes builder +# {{{ kernel templates TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE = Template(r"""//CL:mako// #define LEVEL_TO_RAD(level) \ @@ -114,7 +109,7 @@ %endfor return result; } - """ + str(InlineBinarySearch("box_id_t")), + """ + inline_binary_search_for_type("box_id_t"), strict_undefined=True) @@ -178,10 +173,27 @@ atomic_or(&translation_class_is_used[translation_class], 1); """) +# }}} -@dataclass(frozen=True) -class _KernelInfo: - translation_class_finder: ElementwiseKernel + +# {{{ translation classes builder + +class TranslationClassesBuilder: + def __init__(self, *args, **kargs): + pass + + def __call__(self, actx: PyOpenCLArrayContext, + trav, tree, wait_for=None, is_translation_per_level=True): + from warnings import warn + warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. 
" + "Use 'build_translation_classes' instead.", + DeprecationWarning, stacklevel=2) + + result = build_translation_classes( + actx, trav, tree, + is_translation_per_level=is_translation_per_level) + + return result, None @dataclass_array_container @@ -233,199 +245,188 @@ def nfrom_sep_siblings_translation_classes(self): return len(self.from_sep_siblings_translation_class_to_distance_vector) -class TranslationClassesBuilder: - """Build translation classes for List 2 translations. - - .. automethod:: __init__ - .. automethod:: __call__ - """ - - def __init__(self, array_context: PyOpenCLArrayContext) -> None: - self._setup_actx = array_context - - @property - def context(self): - return self._setup_actx.queue.context - - @memoize_method - def get_kernel_info(self, - dimensions: int, - well_sep_is_n_away: int, - box_id_dtype: np.dtype, - box_level_dtype: np.dtype, - coord_dtype: np.dtype, - translation_class_per_level) -> None: - coord_vec_dtype = get_coord_vec_dtype(coord_dtype, dimensions) - int_coord_vec_dtype = get_coord_vec_dtype(np.dtype(np.int32), dimensions) - - num_translation_classes = \ - self.ntranslation_classes_per_level(well_sep_is_n_away, dimensions) - - # Make sure translation classes can fit inside a 32 bit integer. 
- if not num_translation_classes <= 1 + np.iinfo(np.int32).max: - raise ValueError("would overflow") - - preamble = TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE.render( - dimensions=dimensions, - cvec_sub=partial(coord_vec_subscript_code, dimensions)) - - translation_class_finder = ( - TRANSLATION_CLASS_FINDER_TEMPLATE.build( - self.context, - type_aliases=( - ("int_coord_vec_t", int_coord_vec_dtype), - ("coord_vec_t", coord_vec_dtype), - ("coord_t", coord_dtype), - ("box_id_t", box_id_dtype), - ("box_level_t", box_level_dtype), - ), - var_values=( - ("dimensions", dimensions), - ("ntranslation_classes_per_level", num_translation_classes), - ("translation_class_per_level", translation_class_per_level), - ("cvec_sub", partial( - coord_vec_subscript_code, dimensions)), - ), - more_preamble=preamble)) - - return _KernelInfo(translation_class_finder=translation_class_finder) - - @staticmethod - def ntranslation_classes_per_level( - well_sep_is_n_away: int, dimensions: int) -> int: - return (4 * well_sep_is_n_away + 3) ** dimensions - - def translation_class_to_normalized_vector( - self, well_sep_is_n_away: int, dimensions: int, cls: type - ) -> np.ndarray: - # This computes the vector for the translation class, using the inverse - # of the formula found in get_translation_class() defined in - # TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE. - assert 0 <= cls < self.ntranslation_classes_per_level(well_sep_is_n_away, - dimensions) - result = np.zeros(dimensions, dtype=np.int32) - shift = 2 * well_sep_is_n_away + 1 - base = 4 * well_sep_is_n_away + 3 - for i in range(dimensions): - result[i] = cls % base - shift - cls //= base - - return result - - def compute_translation_classes(self, - actx: PyOpenCLArrayContext, trav, tree, wait_for, - is_translation_per_level): - """ - :returns: a :class:`tuple` containing *evt*, *translation_class_is_used* - and *translation_classes_lists*. 
- """ - - # {{{ compute translation classes for list 2 - - well_sep_is_n_away = trav.well_sep_is_n_away - dimensions = tree.dimensions - coord_dtype = tree.coord_dtype - - knl_info = self.get_kernel_info( - dimensions, well_sep_is_n_away, tree.box_id_dtype, - tree.box_level_dtype, coord_dtype, is_translation_per_level) - - ntranslation_classes = ( - self.ntranslation_classes_per_level(well_sep_is_n_away, dimensions)) - - if is_translation_per_level: - ntranslation_classes = ntranslation_classes * tree.nlevels - - translation_classes_lists = actx.empty( - len(trav.from_sep_siblings_lists), dtype=np.int32) - translation_class_is_used = actx.zeros(ntranslation_classes, dtype=np.int32) - - error_flag = actx.zeros(1, dtype=np.int32) - evt = knl_info.translation_class_finder( - trav.from_sep_siblings_lists, - trav.from_sep_siblings_starts, - trav.target_or_target_parent_boxes, - trav.ntarget_or_target_parent_boxes, - tree.box_centers, - tree.aligned_nboxes, - tree.root_extent, - tree.box_levels, - well_sep_is_n_away, - translation_classes_lists, - translation_class_is_used, - error_flag, - queue=actx.queue, - wait_for=wait_for) - - if actx.to_numpy(error_flag)[0]: - raise ValueError("could not compute translation classes") - - return (evt, translation_class_is_used, translation_classes_lists) - - # }}} - - @log_process(logger, "build m2l translation classes") - def __call__(self, actx: PyOpenCLArrayContext, - trav, tree, wait_for=None, is_translation_per_level=True): - """Returns a pair *info*, *evt* where info is a - :class:`TranslationClassesInfo`. 
- """ - evt, translation_class_is_used, translation_classes_lists = \ - self.compute_translation_classes(actx, trav, tree, wait_for, - is_translation_per_level) - - well_sep_is_n_away = trav.well_sep_is_n_away - dimensions = tree.dimensions - - used_translation_classes_map = np.empty(len(translation_class_is_used), - dtype=np.int32) - used_translation_classes_map.fill(-1) - - distances = np.empty((dimensions, len(translation_class_is_used)), - dtype=tree.coord_dtype) - num_translation_classes = \ - self.ntranslation_classes_per_level(well_sep_is_n_away, dimensions) - - nlevels = tree.nlevels - count = 0 - prev_level = -1 - from_sep_siblings_translation_classes_level_starts = \ - np.empty(nlevels+1, dtype=np.int32) - for i, used in enumerate(actx.to_numpy(translation_class_is_used)): - cls_without_level = i % num_translation_classes - level = i // num_translation_classes - if (prev_level != level): - from_sep_siblings_translation_classes_level_starts[level] = count - prev_level = level - - if not used: - continue - - used_translation_classes_map[i] = count - unit_vector = self.translation_class_to_normalized_vector( - well_sep_is_n_away, dimensions, cls_without_level) - distances[:, count] = unit_vector * tree.root_extent / (1 << level) - count = count + 1 - - from_sep_siblings_translation_classes_level_starts[nlevels] = count - - translation_classes_lists = actx.from_numpy( - used_translation_classes_map - )[translation_classes_lists] - - distances = actx.from_numpy(distances) - from_sep_siblings_translation_classes_level_starts = actx.from_numpy( - from_sep_siblings_translation_classes_level_starts) - - info = TranslationClassesInfo( - traversal=trav, - from_sep_siblings_translation_classes=translation_classes_lists, - from_sep_siblings_translation_class_to_distance_vector=distances, - from_sep_siblings_translation_classes_level_starts=( - from_sep_siblings_translation_classes_level_starts), - ) - - return actx.freeze(info), evt +def 
ntranslation_classes_per_level(well_sep_is_n_away: int, dimensions: int) -> int: + return (4 * well_sep_is_n_away + 3) ** dimensions + + +def translation_class_to_normalized_vector( + well_sep_is_n_away: int, dimensions: int, nclasses: int + ) -> np.ndarray: + # This computes the vector for the translation class, using the inverse + # of the formula found in get_translation_class() defined in + # TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE. + assert 0 <= nclasses < ntranslation_classes_per_level( + well_sep_is_n_away, dimensions) + + result = np.zeros(dimensions, dtype=np.int32) + shift = 2 * well_sep_is_n_away + 1 + base = 4 * well_sep_is_n_away + 3 + for i in range(dimensions): + result[i] = nclasses % base - shift + nclasses //= base + + return result + + +@memoize_on_first_arg +def get_translation_class_finder_kernel( + actx: PyOpenCLArrayContext, + dimensions: int, + well_sep_is_n_away: int, + box_id_dtype: "np.dtype", + box_level_dtype: "np.dtype", + coord_dtype: "np.dtype", + is_translation_per_level: bool + ): + coord_vec_dtype = get_coord_vec_dtype(coord_dtype, dimensions) + int_coord_vec_dtype = get_coord_vec_dtype(np.dtype(np.int32), dimensions) + + num_translation_classes = ( + ntranslation_classes_per_level(well_sep_is_n_away, dimensions)) + + # Make sure translation classes can fit inside a 32 bit integer. 
+ if not num_translation_classes <= 1 + np.iinfo(np.int32).max: + raise ValueError("would overflow") + + preamble = TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE.render( + dimensions=dimensions, + cvec_sub=partial(coord_vec_subscript_code, dimensions)) + + return TRANSLATION_CLASS_FINDER_TEMPLATE.build( + actx.context, + type_aliases=( + ("int_coord_vec_t", int_coord_vec_dtype), + ("coord_vec_t", coord_vec_dtype), + ("coord_t", coord_dtype), + ("box_id_t", box_id_dtype), + ("box_level_t", box_level_dtype), + ), + var_values=( + ("dimensions", dimensions), + ("ntranslation_classes_per_level", num_translation_classes), + ("translation_class_per_level", is_translation_per_level), + ("cvec_sub", partial( + coord_vec_subscript_code, dimensions)), + ), + more_preamble=preamble) + + +def compute_used_translation_classes( + actx: PyOpenCLArrayContext, trav: FMMTraversalInfo, tree: Tree, *, + is_translation_per_level: bool): + # {{{ compute translation classes for list 2 + + well_sep_is_n_away = trav.well_sep_is_n_away + dimensions = tree.dimensions + coord_dtype = tree.coord_dtype + + ntranslation_classes = ( + ntranslation_classes_per_level(well_sep_is_n_away, dimensions)) + + if is_translation_per_level: + ntranslation_classes = ntranslation_classes * tree.nlevels + + translation_classes_lists = actx.empty( + len(trav.from_sep_siblings_lists), dtype=np.int32) + translation_class_is_used = actx.zeros(ntranslation_classes, dtype=np.int32) + error_flag = actx.zeros(1, dtype=np.int32) + + translation_class_finder_knl = get_translation_class_finder_kernel( + actx, + dimensions, well_sep_is_n_away, + tree.box_id_dtype, tree.box_level_dtype, coord_dtype, + is_translation_per_level, + ) + + evt = translation_class_finder_knl( + trav.from_sep_siblings_lists, + trav.from_sep_siblings_starts, + trav.target_or_target_parent_boxes, + trav.ntarget_or_target_parent_boxes, + tree.box_centers, + tree.aligned_nboxes, + tree.root_extent, + tree.box_levels, + well_sep_is_n_away, + 
translation_classes_lists, + translation_class_is_used, + error_flag, + queue=actx.queue, + ) + translation_classes_lists.add_event(evt) + translation_class_is_used.add_event(evt) + + if actx.to_numpy(error_flag): + raise ValueError("could not compute translation classes") + + return translation_class_is_used, translation_classes_lists + + # }}} + + +@log_process(logger, "build m2l translation classes") +def build_translation_classes(actx: PyOpenCLArrayContext, + trav: FMMTraversalInfo, tree: Tree, *, + is_translation_per_level: bool = True) -> TranslationClassesInfo: + """Build translation classes for List 2 translations.""" + translation_class_is_used, translation_classes_lists = ( + compute_used_translation_classes(actx, trav, tree, + is_translation_per_level=is_translation_per_level)) + + well_sep_is_n_away = trav.well_sep_is_n_away + dimensions = tree.dimensions + + used_translation_classes_map = np.empty( + len(translation_class_is_used), dtype=np.int32) + used_translation_classes_map.fill(-1) + + distances = np.empty( + (dimensions, len(translation_class_is_used)), dtype=tree.coord_dtype) + num_translation_classes = ( + ntranslation_classes_per_level(well_sep_is_n_away, dimensions)) + + nlevels = tree.nlevels + count = 0 + prev_level = -1 + from_sep_siblings_translation_classes_level_starts = ( + np.empty(nlevels + 1, dtype=np.int32)) + + for i, used in enumerate(actx.to_numpy(translation_class_is_used)): + cls_without_level = i % num_translation_classes + level = i // num_translation_classes + if (prev_level != level): + from_sep_siblings_translation_classes_level_starts[level] = count + prev_level = level + + if not used: + continue + + used_translation_classes_map[i] = count + unit_vector = translation_class_to_normalized_vector( + well_sep_is_n_away, dimensions, cls_without_level) + + distances[:, count] = unit_vector * tree.root_extent / (1 << level) + count = count + 1 + + from_sep_siblings_translation_classes_level_starts[nlevels] = count + + 
translation_classes_lists = actx.from_numpy( + used_translation_classes_map + )[translation_classes_lists] + + distances = actx.from_numpy(distances) + from_sep_siblings_translation_classes_level_starts = actx.from_numpy( + from_sep_siblings_translation_classes_level_starts) + + info = TranslationClassesInfo( + traversal=trav, + from_sep_siblings_translation_classes=translation_classes_lists, + from_sep_siblings_translation_class_to_distance_vector=distances, + from_sep_siblings_translation_classes_level_starts=( + from_sep_siblings_translation_classes_level_starts), + ) + + return actx.freeze(info) # }}} diff --git a/boxtree/traversal.py b/boxtree/traversal.py index 7e777150..c6ece4d2 100644 --- a/boxtree/traversal.py +++ b/boxtree/traversal.py @@ -3,13 +3,7 @@ ------------------------ .. autoclass:: FMMTraversalInfo - -Build Entrypoint ----------------- - -.. autoclass:: FMMTraversalBuilder - - .. automethod:: __call__ +.. autofunction:: build_traversal """ __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -37,13 +31,14 @@ import enum from functools import partial from dataclasses import dataclass +from typing import Optional import numpy as np from pyopencl.algorithm import ListOfListsBuilder from pyopencl.elementwise import ElementwiseTemplate, ElementwiseKernel from arraycontext import Array -from pytools import ProcessLogger, log_process, memoize_method +from pytools import ProcessLogger, log_process, memoize_on_first_arg from pytools.obj_array import make_obj_array from mako.template import Template @@ -1180,114 +1175,126 @@ class _IndexStyle(enum.IntEnum): class _ListMerger: - """Utility class for combining box lists optionally changing indexing style.""" - def __init__(self, array_context: PyOpenCLArrayContext, box_id_dtype): self._setup_actx = array_context self.box_id_dtype = box_id_dtype - @property - def context(self): - return self._setup_actx.queue.context + def __call__(self, actx, input_starts, input_lists, input_index_style, + 
output_index_style, target_boxes, target_or_target_parent_boxes, + nboxes, debug=False, wait_for=None): + from warnings import warn + warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. " + "Use 'merge_lists' instead.", + DeprecationWarning, stacklevel=2) + + return merge_lists( + actx, input_starts, input_lists, + input_index_style, output_index_style, + target_boxes, target_or_target_parent_boxes, nboxes, self.box_id_dtype, + debug=debug) + + +def merge_lists( + actx: PyOpenCLArrayContext, input_starts, input_lists, input_index_style, + output_index_style, target_boxes, target_or_target_parent_boxes, + nboxes, box_id_dtype, debug: bool = False): + """Utility class for combining box lists optionally changing indexing style. + + :arg input_starts: Starts arrays of input + :arg input_lists: Lists arrays of input + :arg input_index_style: A :class:`_IndexStyle` + :arg output_index_style: A :class:`_IndexStyle` + """ + # {{{ + + if ( + output_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES + and input_index_style == _IndexStyle.TARGET_BOXES): + raise ValueError( + "unsupported: merging a list indexed by target boxes " + "into a list indexed by target or target parent boxes") + + ntarget_boxes = len(target_boxes) + ntarget_or_ntarget_parent_boxes = len(target_or_target_parent_boxes) + + noutput_boxes = (ntarget_boxes + if output_index_style == _IndexStyle.TARGET_BOXES + else ntarget_or_ntarget_parent_boxes) + + if ( + input_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES + and output_index_style == _IndexStyle.TARGET_BOXES): + from boxtree.tools import reverse_index_array + target_or_target_parent_boxes_from_all_boxes = reverse_index_array( + actx, target_or_target_parent_boxes, target_size=nboxes) + target_or_target_parent_boxes_from_target_boxes = ( + target_or_target_parent_boxes_from_all_boxes[target_boxes]) + + output_to_input_box = target_or_target_parent_boxes_from_target_boxes + else: + output_to_input_box = 
actx.from_numpy( + np.arange(noutput_boxes, dtype=box_id_dtype)) + + new_counts = actx.empty(noutput_boxes + 1, box_id_dtype) + assert len(input_starts) == len(input_lists) + + nlists = len(input_starts) + assert nlists >= 1 - @memoize_method - def get_list_merger_kernel(self, nlists, write_counts): - """ - :arg nlists: Number of input lists - :arg write_counts: A :class:`bool`, indicating whether to generate a - kernel that produces box counts or box lists - """ - assert nlists >= 1 + # }}} + + # {{{ merge lists + from pytools import memoize_in + + @memoize_in(actx, (merge_lists, box_id_dtype, nlists)) + def get_list_merger_kernel(with_write_counts): return LIST_MERGER_TEMPLATE.build( - self.context, + actx.context, type_aliases=( - ("box_id_t", self.box_id_dtype), + ("box_id_t", box_id_dtype), ), var_values=( ("nlists", nlists), - ("write_counts", write_counts), + ("write_counts", with_write_counts), )) - def __call__(self, actx, input_starts, input_lists, input_index_style, - output_index_style, target_boxes, target_or_target_parent_boxes, - nboxes, debug=False, wait_for=None): - """ - :arg input_starts: Starts arrays of input - :arg input_lists: Lists arrays of input - :arg input_index_style: A :class:`_IndexStyle` - :arg output_index_style: A :class:`_IndexStyle` - :returns: A pair *results_dict, event*, where *results_dict* - contains entries *starts* and *lists* - """ - if wait_for is None: - wait_for = [] + evt = get_list_merger_kernel(True)(*( + # input: + (output_to_input_box,) + + input_starts + # output: + + (new_counts,)), + range=slice(noutput_boxes), + queue=actx.queue, + ) + new_counts.add_event(evt) + + import pyopencl.array as cl_array + new_starts = cl_array.cumsum(new_counts) + del new_counts + + new_lists = actx.empty(int(actx.to_numpy(new_starts[-1])), box_id_dtype) + new_lists.fill(999999999) + + evt = get_list_merger_kernel(False)(*( + # input: + (output_to_input_box,) + + input_starts + + input_lists + + (new_starts,) + # output: + + 
(new_lists,)), + range=slice(noutput_boxes), + queue=actx.queue, + ) + new_starts.add_event(evt) + new_lists.add_event(evt) + + # }}} + + return {"starts": new_starts, "lists": new_lists} - if ( - output_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES - and input_index_style == _IndexStyle.TARGET_BOXES): - raise ValueError( - "unsupported: merging a list indexed by target boxes " - "into a list indexed by target or target parent boxes") - - ntarget_boxes = len(target_boxes) - ntarget_or_ntarget_parent_boxes = len(target_or_target_parent_boxes) - - noutput_boxes = (ntarget_boxes - if output_index_style == _IndexStyle.TARGET_BOXES - else ntarget_or_ntarget_parent_boxes) - - if ( - input_index_style == _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES - and output_index_style == _IndexStyle.TARGET_BOXES): - from boxtree.tools import reverse_index_array - target_or_target_parent_boxes_from_all_boxes = reverse_index_array( - actx, target_or_target_parent_boxes, target_size=nboxes) - target_or_target_parent_boxes_from_target_boxes = ( - target_or_target_parent_boxes_from_all_boxes[target_boxes] - ) - - output_to_input_box = target_or_target_parent_boxes_from_target_boxes - else: - output_to_input_box = actx.from_numpy( - np.arange(noutput_boxes, dtype=self.box_id_dtype) - ) - - new_counts = actx.empty(noutput_boxes + 1, self.box_id_dtype) - - assert len(input_starts) == len(input_lists) - nlists = len(input_starts) - - evt = self.get_list_merger_kernel(nlists, True)(*( - # input: - (output_to_input_box,) - + input_starts - # output: - + (new_counts,)), - range=slice(noutput_boxes), - queue=actx.queue, - wait_for=wait_for) - - import pyopencl.array as cl_array - new_starts = cl_array.cumsum(new_counts) - del new_counts - - new_lists = actx.empty(int(actx.to_numpy(new_starts[-1])), self.box_id_dtype) - new_lists.fill(999999999) - - evt = self.get_list_merger_kernel(nlists, False)(*( - # input: - (output_to_input_box,) - + input_starts - + input_lists - + (new_starts,) - # 
output: - + (new_lists,)), - range=slice(noutput_boxes), - queue=actx.queue, - wait_for=[evt]) - - return dict(starts=new_starts, lists=new_lists), evt # }}} @@ -1592,7 +1599,9 @@ def ntarget_or_target_parent_boxes(self): # {{{ "close" list merging -> "unified list 1" - def merge_close_lists(self, actx, debug=False): + def merge_close_lists(self, + actx: PyOpenCLArrayContext, + debug: bool = False) -> "FMMTraversalInfo": """Return a new :class:`FMMTraversalInfo` instance with the contents of :attr:`from_sep_close_smaller_starts` and :attr:`from_sep_close_bigger_starts` merged into @@ -1600,10 +1609,8 @@ def merge_close_lists(self, actx, debug=False): *None*. """ - list_merger = _ListMerger(actx, self.tree.box_id_dtype) - - result, evt = ( - list_merger( + result = ( + merge_lists( actx, # starts (self.neighbor_source_boxes_starts, @@ -1621,11 +1628,9 @@ def merge_close_lists(self, actx, debug=False): self.target_boxes, self.target_or_target_parent_boxes, self.tree.nboxes, + self.tree.box_id_dtype, debug)) - import pyopencl as cl - cl.wait_for_events([evt]) - from dataclasses import replace return replace(self, neighbor_source_boxes_starts=actx.freeze(result["starts"]), @@ -1651,7 +1656,7 @@ def get_box_list(self, what, index): @dataclass(frozen=True) -class _KernelInfo: +class TraversalKernelInfo: sources_parents_and_targets_builder: ListOfListsBuilder level_start_box_nrs_extractor: ElementwiseKernel same_level_non_well_sep_boxes_builder: ListOfListsBuilder @@ -1662,588 +1667,626 @@ class _KernelInfo: class FMMTraversalBuilder: - """ - .. automethod:: __init__ - """ - - def __init__(self, array_context: PyOpenCLArrayContext, *, - well_sep_is_n_away=1, - from_sep_smaller_crit=None) -> None: - """ - :arg well_sep_is_n_away: Either An integer 1 or greater. - (Only 1 and 2 are tested.) - The spacing between boxes that is considered "well-separated" for - :attr:`boxtree.traversal.FMMTraversalInfo.from_sep_siblings_starts` - (List 2). 
- :arg from_sep_smaller_crit: The criterion used to determine separation - box dimensions and separation for - :attr:`boxtree.traversal.FMMTraversalInfo.from_sep_smaller_by_level` - (List 3). May be one of ``"static_linf"`` (use the box square, - possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`), - ``"precise_linf"`` (use the precise extent of targets in the box, - including their radii), or ``"static_l2"`` (use the circumcircle of - the box, possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`). - """ + def __init__(self, + array_context: PyOpenCLArrayContext, *, + well_sep_is_n_away: int = 1, + from_sep_smaller_crit: Optional[str] = None) -> None: self._setup_actx = array_context self.well_sep_is_n_away = well_sep_is_n_away self.from_sep_smaller_crit = from_sep_smaller_crit - @property - def context(self): - return self._setup_actx.queue.context + def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, + wait_for=None, debug=False, + _from_sep_smaller_min_nsources_cumul=None, + source_boxes_mask=None, + source_parent_boxes_mask=None): + from warnings import warn + warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. 
" + "Use 'build_traversal' instead.", + DeprecationWarning, stacklevel=2) + + result = build_traversal(actx, tree, + well_sep_is_n_away=self.well_sep_is_n_away, + from_sep_smaller_crit=self.from_sep_smaller_crit, + source_boxes_mask=source_boxes_mask, + source_parent_boxes_mask=source_parent_boxes_mask, + _from_sep_smaller_min_nsources_cumul=( + _from_sep_smaller_min_nsources_cumul), + debug=debug, + ) + + return result, None + + +# {{{ traversal kernels + +@log_process(logger) +@memoize_on_first_arg +def get_traversal_kernel_info( + actx: PyOpenCLArrayContext, *, + dimensions: int, + particle_id_dtype: "np.dtype", + box_id_dtype: "np.dtype", + coord_dtype: "np.dtype", + box_level_dtype: "np.dtype", + max_levels: int, + sources_are_targets: bool, + sources_have_extent: bool, + targets_have_extent: bool, + extent_norm: str, + source_boxes_has_mask: bool, + source_parent_boxes_has_mask: bool, + well_sep_is_n_away: int, + from_sep_smaller_crit: str, + debug: bool = False) -> TraversalKernelInfo: + # {{{ process from_sep_smaller_crit + + if extent_norm == "linf": + # no special checks needed + pass + + elif extent_norm == "l2": + if from_sep_smaller_crit == "static_linf": + # Not technically necessary, but static linf will assume box + # bounds that are not guaranteed to contain all particle + # extents. 
+ raise ValueError( + "The static l^inf from-sep-smaller criterion " + "cannot be used with the l^2 extent norm") - # {{{ kernel builder + elif extent_norm is None: + assert not (sources_have_extent or targets_have_extent) - @memoize_method - @log_process(logger) - def get_kernel_info(self, dimensions, particle_id_dtype, box_id_dtype, - coord_dtype, box_level_dtype, max_levels, - sources_are_targets, sources_have_extent, targets_have_extent, - extent_norm, - source_boxes_has_mask, - source_parent_boxes_has_mask): + if from_sep_smaller_crit is None: + # doesn't matter + from_sep_smaller_crit = "static_linf" - # {{{ process from_sep_smaller_crit + else: + raise ValueError("unexpected value of 'extent_norm': %s" + % extent_norm) - from_sep_smaller_crit = self.from_sep_smaller_crit + if from_sep_smaller_crit not in [ + "static_linf", "precise_linf", + "static_l2", + ]: + raise ValueError("unexpected value of 'from_sep_smaller_crit': %s" + % from_sep_smaller_crit) - if from_sep_smaller_crit is None: - from_sep_smaller_crit = "precise_linf" - - if extent_norm == "linf": - # no special checks needed - pass - - elif extent_norm == "l2": - if from_sep_smaller_crit == "static_linf": - # Not technically necessary, but static linf will assume box - # bounds that are not guaranteed to contain all particle - # extents. 
- raise ValueError( - "The static l^inf from-sep-smaller criterion " - "cannot be used with the l^2 extent norm") - - elif extent_norm is None: - assert not (sources_have_extent or targets_have_extent) - - if from_sep_smaller_crit is None: - # doesn't matter - from_sep_smaller_crit = "static_linf" - - else: - raise ValueError("unexpected value of 'extent_norm': %s" - % extent_norm) - - if from_sep_smaller_crit not in [ - "static_linf", "precise_linf", - "static_l2", - ]: - raise ValueError("unexpected value of 'from_sep_smaller_crit': %s" - % from_sep_smaller_crit) - - # }}} - - debug = False - - from pyopencl.tools import dtype_to_ctype - - from boxtree.tree import box_flags_enum - from boxtree.tools import AXIS_NAMES - - render_vars = dict( - np=np, - dimensions=dimensions, - dtype_to_ctype=dtype_to_ctype, - particle_id_dtype=particle_id_dtype, - box_id_dtype=box_id_dtype, - box_flags_enum=box_flags_enum, - coord_dtype=coord_dtype, - get_coord_vec_dtype=get_coord_vec_dtype, - cvec_sub=partial(coord_vec_subscript_code, dimensions), - max_levels=max_levels, - AXIS_NAMES=AXIS_NAMES, - debug=debug, - sources_are_targets=sources_are_targets, - sources_have_extent=sources_have_extent, - targets_have_extent=targets_have_extent, - well_sep_is_n_away=self.well_sep_is_n_away, - from_sep_smaller_crit=from_sep_smaller_crit, - source_boxes_has_mask=source_boxes_has_mask, - source_parent_boxes_has_mask=source_parent_boxes_has_mask - ) - from pyopencl.algorithm import ListOfListsBuilder - from boxtree.tools import VectorArg, ScalarArg + # }}} + + from pyopencl.tools import dtype_to_ctype + + from boxtree.tree import box_flags_enum + from boxtree.tools import AXIS_NAMES + + render_vars = dict( + np=np, + dimensions=dimensions, + dtype_to_ctype=dtype_to_ctype, + particle_id_dtype=particle_id_dtype, + box_id_dtype=box_id_dtype, + box_flags_enum=box_flags_enum, + coord_dtype=coord_dtype, + get_coord_vec_dtype=get_coord_vec_dtype, + cvec_sub=partial(coord_vec_subscript_code, 
dimensions), + max_levels=max_levels, + AXIS_NAMES=AXIS_NAMES, + debug=debug, + sources_are_targets=sources_are_targets, + sources_have_extent=sources_have_extent, + targets_have_extent=targets_have_extent, + well_sep_is_n_away=well_sep_is_n_away, + from_sep_smaller_crit=from_sep_smaller_crit, + source_boxes_has_mask=source_boxes_has_mask, + source_parent_boxes_has_mask=source_parent_boxes_has_mask + ) + from pyopencl.algorithm import ListOfListsBuilder + from boxtree.tools import VectorArg, ScalarArg + + result = {} + + # {{{ source boxes, their parents, target boxes + + src = Template( + TRAVERSAL_PREAMBLE_TEMPLATE + + SOURCES_PARENTS_AND_TARGETS_TEMPLATE, + strict_undefined=True).render(**render_vars) + + arg_decls = [VectorArg(box_flags_enum.dtype, "box_flags")] + if source_boxes_has_mask: + arg_decls.append(VectorArg(np.int8, "source_boxes_mask")) + if source_parent_boxes_has_mask: + arg_decls.append(VectorArg(np.int8, "source_parent_boxes_mask")) + + result["sources_parents_and_targets_builder"] = \ + ListOfListsBuilder(actx.context, + [ + ("source_parent_boxes", box_id_dtype), + ("source_boxes", box_id_dtype), + ("target_or_target_parent_boxes", box_id_dtype) + ] + ( + [("target_boxes", box_id_dtype)] + if not sources_are_targets + else []), + str(src), + arg_decls=arg_decls, + debug=debug, + name_prefix="sources_parents_and_targets") - result = {} + result["level_start_box_nrs_extractor"] = \ + LEVEL_START_BOX_NR_EXTRACTOR_TEMPLATE.build(actx.context, + type_aliases=( + ("box_id_t", box_id_dtype), + ("box_level_t", box_level_dtype), + ), + ) - # {{{ source boxes, their parents, target boxes + # }}} + # {{{ build list N builders + + base_args = [ + VectorArg(coord_dtype, "box_centers", with_offset=False), + ScalarArg(coord_dtype, "root_extent"), + VectorArg(np.uint8, "box_levels"), + ScalarArg(box_id_dtype, "aligned_nboxes"), + VectorArg(box_id_dtype, "box_child_ids", with_offset=False), + VectorArg(box_flags_enum.dtype, "box_flags"), + ] + + for list_name, 
template, extra_args, extra_lists, eliminate_empty_list in [ + ("same_level_non_well_sep_boxes", + SAME_LEVEL_NON_WELL_SEP_BOXES_TEMPLATE, [], [], []), + ("neighbor_source_boxes", NEIGBHOR_SOURCE_BOXES_TEMPLATE, + [ + VectorArg(box_id_dtype, "target_boxes"), + ], [], []), + ("from_sep_siblings", FROM_SEP_SIBLINGS_TEMPLATE, + [ + VectorArg(box_id_dtype, "target_or_target_parent_boxes"), + VectorArg(box_id_dtype, "box_parent_ids", + with_offset=False), + VectorArg(box_id_dtype, + "same_level_non_well_sep_boxes_starts"), + VectorArg(box_id_dtype, + "same_level_non_well_sep_boxes_lists"), + ], [], []), + ("from_sep_smaller", FROM_SEP_SMALLER_TEMPLATE, + [ + ScalarArg(coord_dtype, "stick_out_factor"), + VectorArg(box_id_dtype, "target_boxes"), + VectorArg(box_id_dtype, + "same_level_non_well_sep_boxes_starts"), + VectorArg(box_id_dtype, + "same_level_non_well_sep_boxes_lists"), + VectorArg(coord_dtype, "box_target_bounding_box_min", + with_offset=False), + VectorArg(coord_dtype, "box_target_bounding_box_max", + with_offset=False), + VectorArg(particle_id_dtype, "box_source_counts_cumul"), + ScalarArg(particle_id_dtype, + "from_sep_smaller_min_nsources_cumul"), + ScalarArg(box_id_dtype, "from_sep_smaller_source_level"), + ], + ["from_sep_close_smaller"] + if sources_have_extent or targets_have_extent + else [], ["from_sep_smaller"]), + ("from_sep_bigger", FROM_SEP_BIGGER_TEMPLATE, + [ + ScalarArg(coord_dtype, "stick_out_factor"), + VectorArg(box_id_dtype, "target_or_target_parent_boxes"), + VectorArg(box_id_dtype, "box_parent_ids", + with_offset=False), + VectorArg(box_id_dtype, + "same_level_non_well_sep_boxes_starts"), + VectorArg(box_id_dtype, + "same_level_non_well_sep_boxes_lists"), + ], + ["from_sep_close_bigger"] + if sources_have_extent or targets_have_extent + else [], []), + ]: src = Template( TRAVERSAL_PREAMBLE_TEMPLATE - + SOURCES_PARENTS_AND_TARGETS_TEMPLATE, + + HELPER_FUNCTION_TEMPLATE + + template, strict_undefined=True).render(**render_vars) - arg_decls 
= [VectorArg(box_flags_enum.dtype, "box_flags")] - if source_boxes_has_mask: - arg_decls.append(VectorArg(np.int8, "source_boxes_mask")) - if source_parent_boxes_has_mask: - arg_decls.append(VectorArg(np.int8, "source_parent_boxes_mask")) - - result["sources_parents_and_targets_builder"] = \ - ListOfListsBuilder(self.context, - [ - ("source_parent_boxes", box_id_dtype), - ("source_boxes", box_id_dtype), - ("target_or_target_parent_boxes", box_id_dtype) - ] + ( - [("target_boxes", box_id_dtype)] - if not sources_are_targets - else []), - str(src), - arg_decls=arg_decls, - debug=debug, - name_prefix="sources_parents_and_targets") - - result["level_start_box_nrs_extractor"] = \ - LEVEL_START_BOX_NR_EXTRACTOR_TEMPLATE.build(self.context, - type_aliases=( - ("box_id_t", box_id_dtype), - ("box_level_t", box_level_dtype), - ), - ) - - # }}} - - # {{{ build list N builders - - base_args = [ - VectorArg(coord_dtype, "box_centers", with_offset=False), - ScalarArg(coord_dtype, "root_extent"), - VectorArg(np.uint8, "box_levels"), - ScalarArg(box_id_dtype, "aligned_nboxes"), - VectorArg(box_id_dtype, "box_child_ids", with_offset=False), - VectorArg(box_flags_enum.dtype, "box_flags"), - ] - - for list_name, template, extra_args, extra_lists, eliminate_empty_list in [ - ("same_level_non_well_sep_boxes", - SAME_LEVEL_NON_WELL_SEP_BOXES_TEMPLATE, [], [], []), - ("neighbor_source_boxes", NEIGBHOR_SOURCE_BOXES_TEMPLATE, - [ - VectorArg(box_id_dtype, "target_boxes"), - ], [], []), - ("from_sep_siblings", FROM_SEP_SIBLINGS_TEMPLATE, - [ - VectorArg(box_id_dtype, "target_or_target_parent_boxes"), - VectorArg(box_id_dtype, "box_parent_ids", - with_offset=False), - VectorArg(box_id_dtype, - "same_level_non_well_sep_boxes_starts"), - VectorArg(box_id_dtype, - "same_level_non_well_sep_boxes_lists"), - ], [], []), - ("from_sep_smaller", FROM_SEP_SMALLER_TEMPLATE, - [ - ScalarArg(coord_dtype, "stick_out_factor"), - VectorArg(box_id_dtype, "target_boxes"), - VectorArg(box_id_dtype, - 
"same_level_non_well_sep_boxes_starts"), - VectorArg(box_id_dtype, - "same_level_non_well_sep_boxes_lists"), - VectorArg(coord_dtype, "box_target_bounding_box_min", - with_offset=False), - VectorArg(coord_dtype, "box_target_bounding_box_max", - with_offset=False), - VectorArg(particle_id_dtype, "box_source_counts_cumul"), - ScalarArg(particle_id_dtype, - "from_sep_smaller_min_nsources_cumul"), - ScalarArg(box_id_dtype, "from_sep_smaller_source_level"), - ], - ["from_sep_close_smaller"] - if sources_have_extent or targets_have_extent - else [], ["from_sep_smaller"]), - ("from_sep_bigger", FROM_SEP_BIGGER_TEMPLATE, - [ - ScalarArg(coord_dtype, "stick_out_factor"), - VectorArg(box_id_dtype, "target_or_target_parent_boxes"), - VectorArg(box_id_dtype, "box_parent_ids", - with_offset=False), - VectorArg(box_id_dtype, - "same_level_non_well_sep_boxes_starts"), - VectorArg(box_id_dtype, - "same_level_non_well_sep_boxes_lists"), - ], - ["from_sep_close_bigger"] - if sources_have_extent or targets_have_extent - else [], []), - ]: - src = Template( - TRAVERSAL_PREAMBLE_TEMPLATE - + HELPER_FUNCTION_TEMPLATE - + template, - strict_undefined=True).render(**render_vars) - - result[f"{list_name}_builder"] = ListOfListsBuilder(self.context, - [(list_name, box_id_dtype)] - + [(extra_list_name, box_id_dtype) - for extra_list_name in extra_lists], - str(src), - arg_decls=base_args + extra_args, - debug=debug, name_prefix=list_name, - complex_kernel=True, - eliminate_empty_output_lists=eliminate_empty_list) + result[f"{list_name}_builder"] = ListOfListsBuilder(actx.context, + [(list_name, box_id_dtype)] + + [(extra_list_name, box_id_dtype) + for extra_list_name in extra_lists], + str(src), + arg_decls=base_args + extra_args, + debug=debug, name_prefix=list_name, + complex_kernel=True, + eliminate_empty_output_lists=eliminate_empty_list) - # }}} + # }}} - return _KernelInfo(**result) + return TraversalKernelInfo(**result) - # }}} +# }}} - # {{{ driver - def __call__(self, actx: 
PyOpenCLArrayContext, tree: Tree, - wait_for=None, debug=False, - _from_sep_smaller_min_nsources_cumul=None, - source_boxes_mask=None, - source_parent_boxes_mask=None): - """ - :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` - instances for whose completion this command waits before starting - exeuction. - :arg source_boxes_mask: Only boxes passing this mask will be considered for - `source_boxes`. Used by the distributed implementation. - :arg source_parent_boxes_mask: Only boxes passing this mask will be - considered for `source_parent_boxes`. Used by the distributed - implementation. - :return: A :class:`tuple` *(trav, event)*, where *trav* is a new instance of - :class:`FMMTraversalInfo` and *event* is a :class:`pyopencl.Event` - for dependency management. - """ - if _from_sep_smaller_min_nsources_cumul is None: - # default to old no-threshold behavior - _from_sep_smaller_min_nsources_cumul = 0 - - if not tree._is_pruned: - raise ValueError("tree must be pruned for traversal generation") - - if tree.sources_have_extent: - # YAGNI - raise NotImplementedError( - "trees with source extent are not supported for " - "traversal generation") +# {{{ driver - # Generated code shouldn't depend on the *exact* number of tree levels. - # So round up to the next multiple of 5. - from pytools import div_ceil - max_levels = div_ceil(tree.nlevels, 5) * 5 +def build_traversal( + actx: PyOpenCLArrayContext, tree: Tree, *, + well_sep_is_n_away: int = 1, + from_sep_smaller_crit: Optional[str] = None, + source_boxes_mask: Optional["np.ndarray"] = None, + source_parent_boxes_mask: Optional["np.ndarray"] = None, + _from_sep_smaller_min_nsources_cumul=None, + debug: bool = False) -> FMMTraversalInfo: + """ + :arg well_sep_is_n_away: An integer 1 or greater. + (Only 1 and 2 are tested.) + The spacing between boxes that is considered "well-separated" for + :attr:`boxtree.traversal.FMMTraversalInfo.from_sep_siblings_starts` + (List 2).
+ :arg from_sep_smaller_crit: The criterion used to determine separation + box dimensions and separation for + :attr:`boxtree.traversal.FMMTraversalInfo.from_sep_smaller_by_level` + (List 3). May be one of ``"static_linf"`` (use the box square, + possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`), + ``"precise_linf"`` (use the precise extent of targets in the box, + including their radii), or ``"static_l2"`` (use the circumcircle of + the box, possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`). + :arg source_boxes_mask: Only boxes passing this mask will be considered for + `source_boxes`. Used by the distributed implementation. + :arg source_parent_boxes_mask: Only boxes passing this mask will be + considered for `source_parent_boxes`. Used by the distributed + implementation. + + :return: A new instance of :class:`FMMTraversalInfo`, frozen via + ``actx.freeze``. No :class:`pyopencl.Event` is returned; the + work is enqueued on ``actx.queue``. + """ + if from_sep_smaller_crit is None: + from_sep_smaller_crit = "precise_linf" + + if _from_sep_smaller_min_nsources_cumul is None: + # default to old no-threshold behavior + _from_sep_smaller_min_nsources_cumul = 0 + + if not tree._is_pruned: + raise ValueError("tree must be pruned for traversal generation") + + if tree.sources_have_extent: + # YAGNI + raise NotImplementedError( + "trees with source extent are not supported for " + "traversal generation") + + # Generated code shouldn't depend on the *exact* number of tree levels. + # So round up to the next multiple of 5.
+ from pytools import div_ceil + max_levels = div_ceil(tree.nlevels, 5) * 5 + + knl = get_traversal_kernel_info( + actx, + dimensions=tree.dimensions, + particle_id_dtype=tree.particle_id_dtype, + box_id_dtype=tree.box_id_dtype, + coord_dtype=tree.coord_dtype, + box_level_dtype=tree.box_level_dtype, + max_levels=max_levels, + sources_are_targets=tree.sources_are_targets, + sources_have_extent=tree.sources_have_extent, + targets_have_extent=tree.targets_have_extent, + extent_norm=tree.extent_norm, + source_boxes_has_mask=source_boxes_mask is not None, + source_parent_boxes_has_mask=source_parent_boxes_mask is not None, + well_sep_is_n_away=well_sep_is_n_away, + from_sep_smaller_crit=from_sep_smaller_crit, + debug=debug, + ) + + def debug_with_finish(s): + if debug: + actx.queue.finish() + + logger.debug(s) + + traversal_plog = ProcessLogger(logger, "build traversal") + + # {{{ source boxes, their parents, and target boxes - knl_info = self.get_kernel_info( - tree.dimensions, tree.particle_id_dtype, tree.box_id_dtype, - tree.coord_dtype, tree.box_level_dtype, max_levels, - tree.sources_are_targets, - tree.sources_have_extent, tree.targets_have_extent, - tree.extent_norm, - source_boxes_mask is not None, - source_parent_boxes_mask is not None) + debug_with_finish( + "building list of source boxes, their parents, and target boxes") - def debug_with_finish(s): - if debug: - actx.queue.finish() + extra_args = [] + if source_boxes_mask is not None: + extra_args.append(source_boxes_mask) + if source_parent_boxes_mask is not None: + extra_args.append(source_parent_boxes_mask) - logger.debug(s) + result, evt = knl.sources_parents_and_targets_builder( + actx.queue, tree.nboxes, tree.box_flags, *extra_args, + allocator=actx.allocator, + ) - traversal_plog = ProcessLogger(logger, "build traversal") + wait_for = [evt] - # {{{ source boxes, their parents, and target boxes + source_parent_boxes = result["source_parent_boxes"].lists + source_boxes = result["source_boxes"].lists + 
target_or_target_parent_boxes = result["target_or_target_parent_boxes"].lists - debug_with_finish( - "building list of source boxes, their parents, and target boxes") + if not tree.sources_are_targets: + target_boxes = result["target_boxes"].lists + else: + target_boxes = source_boxes - extra_args = [] - if source_boxes_mask is not None: - extra_args.append(source_boxes_mask) - if source_parent_boxes_mask is not None: - extra_args.append(source_parent_boxes_mask) + # }}} + + # {{{ figure out level starts in *_parent_boxes - result, evt = knl_info.sources_parents_and_targets_builder( - actx.queue, tree.nboxes, tree.box_flags, *extra_args, wait_for=wait_for - ) + def extract_level_start_box_nrs(box_list, wait_for): + result = actx.empty( + tree.nlevels + 1, tree.box_id_dtype).fill(len(box_list)) - wait_for = [evt] + evt = knl.level_start_box_nrs_extractor( + tree.level_start_box_nrs, + tree.box_levels, + box_list, + result, + range=slice(0, len(box_list)), + queue=actx.queue, wait_for=wait_for, + ) - source_parent_boxes = result["source_parent_boxes"].lists - source_boxes = result["source_boxes"].lists - target_or_target_parent_boxes = result["target_or_target_parent_boxes"].lists + result = actx.to_numpy(result) - if not tree.sources_are_targets: - target_boxes = result["target_boxes"].lists - else: - target_boxes = source_boxes + # Postprocess result for unoccupied levels + prev_start = len(box_list) + for ilev in range(tree.nlevels-1, -1, -1): + result[ilev] = prev_start = \ + min(result[ilev], prev_start) - # }}} + return result, evt - # {{{ figure out level starts in *_parent_boxes + debug_with_finish("finding level starts in source boxes array") + level_start_source_box_nrs, evt_s = \ + extract_level_start_box_nrs( + source_boxes, wait_for=wait_for) - def extract_level_start_box_nrs(box_list, wait_for): - result = actx.empty( - tree.nlevels + 1, tree.box_id_dtype).fill(len(box_list)) + debug_with_finish("finding level starts in source parent boxes array") + 
level_start_source_parent_box_nrs, evt_sp = \ + extract_level_start_box_nrs( + source_parent_boxes, wait_for=wait_for) - evt = knl_info.level_start_box_nrs_extractor( - tree.level_start_box_nrs, - tree.box_levels, - box_list, - result, - range=slice(0, len(box_list)), - queue=actx.queue, wait_for=wait_for) + debug_with_finish("finding level starts in target boxes array") + level_start_target_box_nrs, evt_t = \ + extract_level_start_box_nrs( + target_boxes, wait_for=wait_for) - result = actx.to_numpy(result) + debug_with_finish( + "finding level starts in target or target parent boxes array") + level_start_target_or_target_parent_box_nrs, evt_tp = \ + extract_level_start_box_nrs( + target_or_target_parent_boxes, wait_for=wait_for) - # Postprocess result for unoccupied levels - prev_start = len(box_list) - for ilev in range(tree.nlevels-1, -1, -1): - result[ilev] = prev_start = \ - min(result[ilev], prev_start) + wait_for = [evt_s, evt_sp, evt_t, evt_tp] - return result, evt + # }}} - debug_with_finish("finding level starts in source boxes array") - level_start_source_box_nrs, evt_s = \ - extract_level_start_box_nrs( - source_boxes, wait_for=wait_for) + # {{{ same-level non-well-separated boxes - debug_with_finish("finding level starts in source parent boxes array") - level_start_source_parent_box_nrs, evt_sp = \ - extract_level_start_box_nrs( - source_parent_boxes, wait_for=wait_for) + # If well_sep_is_n_away is 1, this agrees with the definition of + # 'colleagues' from the classical FMM literature. 
- debug_with_finish("finding level starts in target boxes array") - level_start_target_box_nrs, evt_t = \ - extract_level_start_box_nrs( - target_boxes, wait_for=wait_for) + debug_with_finish("finding same-level near-field boxes") - debug_with_finish( - "finding level starts in target or target parent boxes array") - level_start_target_or_target_parent_box_nrs, evt_tp = \ - extract_level_start_box_nrs( - target_or_target_parent_boxes, wait_for=wait_for) + result, evt = knl.same_level_non_well_sep_boxes_builder( + actx.queue, tree.nboxes, + tree.box_centers.data, tree.root_extent, tree.box_levels, + tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, + wait_for=wait_for, allocator=actx.allocator, + ) + wait_for = [evt] + same_level_non_well_sep_boxes = result["same_level_non_well_sep_boxes"] - wait_for = [evt_s, evt_sp, evt_t, evt_tp] + # }}} - # }}} + # {{{ neighbor source boxes ("list 1") - # {{{ same-level non-well-separated boxes + debug_with_finish("finding neighbor source boxes ('list 1')") - # If well_sep_is_n_away is 1, this agrees with the definition of - # 'colleagues' from the classical FMM literature. 
+ result, evt = knl.neighbor_source_boxes_builder( + actx.queue, len(target_boxes), + tree.box_centers.data, tree.root_extent, tree.box_levels, + tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, + target_boxes, wait_for=wait_for, allocator=actx.allocator, + ) - debug_with_finish("finding same-level near-field boxes") + wait_for = [evt] + neighbor_source_boxes = result["neighbor_source_boxes"] - result, evt = knl_info.same_level_non_well_sep_boxes_builder( - actx.queue, tree.nboxes, - tree.box_centers.data, tree.root_extent, tree.box_levels, - tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, - wait_for=wait_for) - wait_for = [evt] - same_level_non_well_sep_boxes = result["same_level_non_well_sep_boxes"] + # }}} - # }}} + # {{{ well-separated siblings ("list 2") - # {{{ neighbor source boxes ("list 1") + debug_with_finish("finding well-separated siblings ('list 2')") - debug_with_finish("finding neighbor source boxes ('list 1')") + result, evt = knl.from_sep_siblings_builder( + actx.queue, len(target_or_target_parent_boxes), + tree.box_centers.data, tree.root_extent, tree.box_levels, + tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, + target_or_target_parent_boxes, tree.box_parent_ids.data, + same_level_non_well_sep_boxes.starts, + same_level_non_well_sep_boxes.lists, + wait_for=wait_for, allocator=actx.allocator, + ) + wait_for = [evt] + from_sep_siblings = result["from_sep_siblings"] - result, evt = knl_info.neighbor_source_boxes_builder( - actx.queue, len(target_boxes), - tree.box_centers.data, tree.root_extent, tree.box_levels, - tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, - target_boxes, wait_for=wait_for) + # }}} - wait_for = [evt] - neighbor_source_boxes = result["neighbor_source_boxes"] + with_extent = tree.sources_have_extent or tree.targets_have_extent - # }}} + # {{{ separated smaller ("list 3") - # {{{ well-separated siblings ("list 2") + debug_with_finish("finding separated smaller ('list 3')") - 
debug_with_finish("finding well-separated siblings ('list 2')") + from_sep_smaller_base_args = ( + actx.queue, len(target_boxes), + tree.box_centers.data, tree.root_extent, tree.box_levels, + tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, + tree.stick_out_factor, target_boxes, + same_level_non_well_sep_boxes.starts, + same_level_non_well_sep_boxes.lists, + tree.box_target_bounding_box_min.data, + tree.box_target_bounding_box_max.data, + tree.box_source_counts_cumul, + _from_sep_smaller_min_nsources_cumul, + ) - result, evt = knl_info.from_sep_siblings_builder( - actx.queue, len(target_or_target_parent_boxes), - tree.box_centers.data, tree.root_extent, tree.box_levels, - tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, - target_or_target_parent_boxes, tree.box_parent_ids.data, - same_level_non_well_sep_boxes.starts, - same_level_non_well_sep_boxes.lists, - wait_for=wait_for) - wait_for = [evt] - from_sep_siblings = result["from_sep_siblings"] + from_sep_smaller_wait_for = [] + from_sep_smaller_by_level = [] + target_boxes_sep_smaller_by_source_level = [] - # }}} + for ilevel in range(tree.nlevels): + debug_with_finish(f"finding separated smaller ('list 3 level {ilevel}')") - with_extent = tree.sources_have_extent or tree.targets_have_extent + result, evt = knl.from_sep_smaller_builder( + *(from_sep_smaller_base_args + (ilevel,)), + omit_lists=("from_sep_close_smaller",) if with_extent else (), + wait_for=wait_for, + allocator=actx.allocator, + ) - # {{{ separated smaller ("list 3") + target_boxes_sep_smaller = target_boxes[ + result["from_sep_smaller"].nonempty_indices] - debug_with_finish("finding separated smaller ('list 3')") + from_sep_smaller_by_level.append(result["from_sep_smaller"]) + target_boxes_sep_smaller_by_source_level.append(target_boxes_sep_smaller) + from_sep_smaller_wait_for.append(evt) - from_sep_smaller_base_args = ( - actx.queue, len(target_boxes), - tree.box_centers.data, tree.root_extent, tree.box_levels, - 
tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, - tree.stick_out_factor, target_boxes, - same_level_non_well_sep_boxes.starts, - same_level_non_well_sep_boxes.lists, - tree.box_target_bounding_box_min.data, - tree.box_target_bounding_box_max.data, - tree.box_source_counts_cumul, - _from_sep_smaller_min_nsources_cumul, + if with_extent: + debug_with_finish("finding separated smaller close ('list 3 close')") + result, evt = knl.from_sep_smaller_builder( + *(from_sep_smaller_base_args + (-1,)), + omit_lists=("from_sep_smaller",), + wait_for=wait_for, + allocator=actx.allocator, ) + from_sep_close_smaller_starts = result["from_sep_close_smaller"].starts + from_sep_close_smaller_lists = result["from_sep_close_smaller"].lists - from_sep_smaller_wait_for = [] - from_sep_smaller_by_level = [] - target_boxes_sep_smaller_by_source_level = [] - - for ilevel in range(tree.nlevels): - debug_with_finish(f"finding separated smaller ('list 3 level {ilevel}')") - - result, evt = knl_info.from_sep_smaller_builder( - *(from_sep_smaller_base_args + (ilevel,)), - omit_lists=("from_sep_close_smaller",) if with_extent else (), - wait_for=wait_for) - - target_boxes_sep_smaller = target_boxes[ - result["from_sep_smaller"].nonempty_indices] - - from_sep_smaller_by_level.append(result["from_sep_smaller"]) - target_boxes_sep_smaller_by_source_level.append(target_boxes_sep_smaller) - from_sep_smaller_wait_for.append(evt) - - if with_extent: - debug_with_finish("finding separated smaller close ('list 3 close')") - result, evt = knl_info.from_sep_smaller_builder( - *(from_sep_smaller_base_args + (-1,)), - omit_lists=("from_sep_smaller",), - wait_for=wait_for) - from_sep_close_smaller_starts = result["from_sep_close_smaller"].starts - from_sep_close_smaller_lists = result["from_sep_close_smaller"].lists - - from_sep_smaller_wait_for.append(evt) - else: - from_sep_close_smaller_starts = None - from_sep_close_smaller_lists = None - - # }}} - - wait_for = from_sep_smaller_wait_for - 
del from_sep_smaller_wait_for - - # {{{ separated bigger ("list 4") - - debug_with_finish("finding separated bigger ('list 4')") - - result, evt = knl_info.from_sep_bigger_builder( - actx.queue, len(target_or_target_parent_boxes), - tree.box_centers.data, tree.root_extent, tree.box_levels, - tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, - tree.stick_out_factor, target_or_target_parent_boxes, - tree.box_parent_ids.data, - same_level_non_well_sep_boxes.starts, - same_level_non_well_sep_boxes.lists, - wait_for=wait_for) - - wait_for = [evt] - from_sep_bigger = result["from_sep_bigger"] - - if with_extent: - # These are indexed by target_or_target_parent boxes; we rewrite - # them to be indexed by target_boxes. - from_sep_close_bigger_starts_raw = result["from_sep_close_bigger"].starts - from_sep_close_bigger_lists_raw = result["from_sep_close_bigger"].lists - - list_merger = _ListMerger(actx, tree.box_id_dtype) - result, evt = list_merger( - actx, - # starts - (from_sep_close_bigger_starts_raw,), - # lists - (from_sep_close_bigger_lists_raw,), - # input index style - _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES, - # output index style - _IndexStyle.TARGET_BOXES, - # box and tree data - target_boxes, - target_or_target_parent_boxes, - tree.nboxes, - debug, - wait_for=wait_for) + from_sep_smaller_wait_for.append(evt) + else: + from_sep_close_smaller_starts = None + from_sep_close_smaller_lists = None - wait_for = [evt] + # }}} - del from_sep_close_bigger_starts_raw - del from_sep_close_bigger_lists_raw + wait_for = from_sep_smaller_wait_for + del from_sep_smaller_wait_for - from_sep_close_bigger_starts = result["starts"] - from_sep_close_bigger_lists = result["lists"] - else: - from_sep_close_bigger_starts = None - from_sep_close_bigger_lists = None + # {{{ separated bigger ("list 4") - # }}} + debug_with_finish("finding separated bigger ('list 4')") - evt, = wait_for - traversal_plog.done( - "from_sep_smaller_crit: %s", - self.from_sep_smaller_crit) + 
result, evt = knl.from_sep_bigger_builder( + actx.queue, len(target_or_target_parent_boxes), + tree.box_centers.data, tree.root_extent, tree.box_levels, + tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, + tree.stick_out_factor, target_or_target_parent_boxes, + tree.box_parent_ids.data, + same_level_non_well_sep_boxes.starts, + same_level_non_well_sep_boxes.lists, + wait_for=wait_for, allocator=actx.allocator, + ) + + wait_for = [evt] + from_sep_bigger = result["from_sep_bigger"] + + if with_extent: + # These are indexed by target_or_target_parent boxes; we rewrite + # them to be indexed by target_boxes. + from_sep_close_bigger_starts_raw = result["from_sep_close_bigger"].starts + from_sep_close_bigger_lists_raw = result["from_sep_close_bigger"].lists + + result = merge_lists( + actx, + # starts + (from_sep_close_bigger_starts_raw,), + # lists + (from_sep_close_bigger_lists_raw,), + # input index style + _IndexStyle.TARGET_OR_TARGET_PARENT_BOXES, + # output index style + _IndexStyle.TARGET_BOXES, + # box and tree data + target_boxes, + target_or_target_parent_boxes, + tree.nboxes, + tree.box_id_dtype, + debug, + ) - info = FMMTraversalInfo( - tree=tree, - well_sep_is_n_away=self.well_sep_is_n_away, + del from_sep_close_bigger_starts_raw + del from_sep_close_bigger_lists_raw - source_boxes=source_boxes, - target_boxes=target_boxes, + from_sep_close_bigger_starts = result["starts"] + from_sep_close_bigger_lists = result["lists"] + else: + from_sep_close_bigger_starts = None + from_sep_close_bigger_lists = None - level_start_source_box_nrs=actx.from_numpy( - level_start_source_box_nrs), - level_start_target_box_nrs=actx.from_numpy( - level_start_target_box_nrs), + # }}} - source_parent_boxes=source_parent_boxes, - level_start_source_parent_box_nrs=actx.from_numpy( - level_start_source_parent_box_nrs), + evt, = wait_for + traversal_plog.done("from_sep_smaller_crit: %s", from_sep_smaller_crit) - target_or_target_parent_boxes=target_or_target_parent_boxes, - 
level_start_target_or_target_parent_box_nrs=actx.from_numpy( - level_start_target_or_target_parent_box_nrs), + info = FMMTraversalInfo( + tree=tree, + well_sep_is_n_away=well_sep_is_n_away, - same_level_non_well_sep_boxes_starts=( - same_level_non_well_sep_boxes.starts), - same_level_non_well_sep_boxes_lists=( - same_level_non_well_sep_boxes.lists), + source_boxes=source_boxes, + target_boxes=target_boxes, - neighbor_source_boxes_starts=neighbor_source_boxes.starts, - neighbor_source_boxes_lists=neighbor_source_boxes.lists, + level_start_source_box_nrs=actx.from_numpy( + level_start_source_box_nrs), + level_start_target_box_nrs=actx.from_numpy( + level_start_target_box_nrs), - from_sep_siblings_starts=from_sep_siblings.starts, - from_sep_siblings_lists=from_sep_siblings.lists, + source_parent_boxes=source_parent_boxes, + level_start_source_parent_box_nrs=actx.from_numpy( + level_start_source_parent_box_nrs), - from_sep_smaller_by_level=make_obj_array( - from_sep_smaller_by_level), - target_boxes_sep_smaller_by_source_level=make_obj_array( - target_boxes_sep_smaller_by_source_level), + target_or_target_parent_boxes=target_or_target_parent_boxes, + level_start_target_or_target_parent_box_nrs=actx.from_numpy( + level_start_target_or_target_parent_box_nrs), - from_sep_close_smaller_starts=from_sep_close_smaller_starts, - from_sep_close_smaller_lists=from_sep_close_smaller_lists, + same_level_non_well_sep_boxes_starts=( + same_level_non_well_sep_boxes.starts), + same_level_non_well_sep_boxes_lists=( + same_level_non_well_sep_boxes.lists), - from_sep_bigger_starts=from_sep_bigger.starts, - from_sep_bigger_lists=from_sep_bigger.lists, + neighbor_source_boxes_starts=neighbor_source_boxes.starts, + neighbor_source_boxes_lists=neighbor_source_boxes.lists, - from_sep_close_bigger_starts=from_sep_close_bigger_starts, - from_sep_close_bigger_lists=from_sep_close_bigger_lists, - ) + from_sep_siblings_starts=from_sep_siblings.starts, + 
from_sep_siblings_lists=from_sep_siblings.lists, - return actx.freeze(info), evt + from_sep_smaller_by_level=make_obj_array( + from_sep_smaller_by_level), + target_boxes_sep_smaller_by_source_level=make_obj_array( + target_boxes_sep_smaller_by_source_level), - # }}} + from_sep_close_smaller_starts=from_sep_close_smaller_starts, + from_sep_close_smaller_lists=from_sep_close_smaller_lists, + + from_sep_bigger_starts=from_sep_bigger.starts, + from_sep_bigger_lists=from_sep_bigger.lists, + + from_sep_close_bigger_starts=from_sep_close_bigger_starts, + from_sep_close_bigger_lists=from_sep_close_bigger_lists, + ) + + return actx.freeze(info) + +# }}} # vim: fdm=marker diff --git a/boxtree/tree.py b/boxtree/tree.py index 519140d8..5ffc5de2 100644 --- a/boxtree/tree.py +++ b/boxtree/tree.py @@ -51,10 +51,7 @@ Tools ^^^^^ -.. autoclass:: ParticleListFilter - .. autofunction:: filter_target_lists_in_user_order - .. autofunction:: filter_target_lists_in_tree_order """ @@ -87,7 +84,7 @@ from cgen import Enum from arraycontext import Array -from pytools import memoize_method +from pytools import memoize_in from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container @@ -126,7 +123,7 @@ class Tree: into which they may be sorted. Instances of this class are not constructed directly. They are returned - by :meth:`TreeBuilder.__call__`. + by :meth:`~boxtree.build_tree`. .. rubric:: Flags @@ -366,8 +363,8 @@ class Tree: particle's extents) inside of the box. If the box is empty, both *min* and *max* will reflect the box center. The purpose of this information is to reduce the cost of some interactions through knowledge that some boxes are partially empty. - (See the *from_sep_smaller_crit* argument to the constructor of - :class:`boxtree.traversal.FMMTraversalBuilder` for an example.) + (See the *from_sep_smaller_crit* argument to + :func:`boxtree.traversal.build_traversal` for an example.) .. 
note:: @@ -440,6 +437,7 @@ class Tree: box_target_bounding_box_min: Array box_target_bounding_box_max: Array + root_extent_stretch_factor: float _is_pruned: bool @property @@ -650,21 +648,30 @@ def link_point_sources( tree_order_point_source_counts = actx.empty( tree.nsources, tree.particle_id_dtype) - from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_SOURCE_SCAN_TPL - knl = POINT_SOURCE_LINKING_SOURCE_SCAN_TPL.build( - actx.queue.context, - type_aliases=( - ("scan_t", tree.particle_id_dtype), - ("index_t", tree.particle_id_dtype), - ("particle_id_t", tree.particle_id_dtype), - ), - ) + @memoize_in(actx, ( + link_point_sources, tree.particle_id_dtype, + "point_source_linking_source_scan")) + def get_point_source_linking_source_scan_kernel(): + from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_SOURCE_SCAN_TPL + return POINT_SOURCE_LINKING_SOURCE_SCAN_TPL.build( + actx.queue.context, + type_aliases=( + ("scan_t", tree.particle_id_dtype), + ("index_t", tree.particle_id_dtype), + ("particle_id_t", tree.particle_id_dtype), + ), + ) logger.debug("point source linking: tree order source scan") - knl(point_source_starts, tree.user_source_ids, + knl = get_point_source_linking_source_scan_kernel() + knl( + point_source_starts, tree.user_source_ids, tree_order_point_source_starts, tree_order_point_source_counts, - npoint_sources_dev, size=tree.nsources, queue=actx.queue) + npoint_sources_dev, size=tree.nsources, + queue=actx.queue, + allocator=actx.allocator, + ) # }}} @@ -680,11 +687,8 @@ def link_point_sources( user_point_source_ids = actx.empty(npoint_sources, tree.particle_id_dtype) user_point_source_ids.fill(1) - import pyopencl.array as cl_array - cl_array.multi_put( - [tree_order_index_user_point_source_starts], - dest_indices=tree_order_point_source_starts, - out=[user_point_source_ids]) + user_point_source_ids[tree_order_point_source_starts] = ( + tree_order_index_user_point_source_starts) if debug: ups_host = actx.to_numpy(user_point_source_ids) 
@@ -694,29 +698,30 @@ def link_point_sources( source_boundaries = actx.zeros(npoint_sources, np.int8) # FIXME: Should be a scalar, in principle. - ones = actx.empty(tree.nsources, np.int8) - ones.fill(1) - - cl_array.multi_put( - [ones], - dest_indices=tree_order_point_source_starts, - out=[source_boundaries]) + ones = 1 + actx.zeros(1, np.int8) + source_boundaries[tree_order_point_source_starts] = ones - from boxtree.tree_build_kernels import \ - POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL + @memoize_in(actx, ( + link_point_sources, tree.particle_id_dtype, + "point_source_linking_user_point_source_id_scan")) + def get_point_source_linking_user_point_source_id_scan_kernel(): + from boxtree.tree_build_kernels import ( + POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL) + return POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL.build( + actx.queue.context, + type_aliases=( + ("scan_t", tree.particle_id_dtype), + ("index_t", tree.particle_id_dtype), + ("particle_id_t", tree.particle_id_dtype), + ), + ) logger.debug("point source linking: point source id scan") - - knl = POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL.build( - actx.queue.context, - type_aliases=( - ("scan_t", tree.particle_id_dtype), - ("index_t", tree.particle_id_dtype), - ("particle_id_t", tree.particle_id_dtype), - ), - ) + knl = get_point_source_linking_user_point_source_id_scan_kernel() knl(source_boundaries, user_point_source_ids, - size=npoint_sources, queue=actx.queue) + size=npoint_sources, + queue=actx.queue, + allocator=actx.allocator) if debug: ups_host = actx.to_numpy(user_point_source_ids) @@ -725,6 +730,7 @@ def link_point_sources( # }}} + import pyopencl.array as cl_array from pytools.obj_array import make_obj_array tree_order_point_sources = make_obj_array([ cl_array.take(point_sources[i], user_point_source_ids, queue=actx.queue) @@ -733,15 +739,18 @@ def link_point_sources( # {{{ compute box point source metadata - from boxtree.tree_build_kernels import 
POINT_SOURCE_LINKING_BOX_POINT_SOURCES - - knl = POINT_SOURCE_LINKING_BOX_POINT_SOURCES.build( - actx.queue.context, - type_aliases=( - ("particle_id_t", tree.particle_id_dtype), - ("box_id_t", tree.box_id_dtype), - ), - ) + @memoize_in(actx, ( + link_point_sources, tree.particle_id_dtype, tree.box_id_dtype, + "point_source_linking_box_point_sources")) + def get_point_source_linking_box_point_sources_kernel(): + from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_BOX_POINT_SOURCES + return POINT_SOURCE_LINKING_BOX_POINT_SOURCES.build( + actx.queue.context, + type_aliases=( + ("particle_id_t", tree.particle_id_dtype), + ("box_id_t", tree.box_id_dtype), + ), + ) logger.debug("point source linking: box point sources") @@ -750,6 +759,7 @@ def link_point_sources( box_point_source_counts_nonchild = actx.empty( tree.nboxes, tree.particle_id_dtype) + knl = get_point_source_linking_box_point_sources_kernel() knl( box_point_source_starts, box_point_source_counts_nonchild, box_point_source_counts_cumul, @@ -759,7 +769,9 @@ def link_point_sources( tree_order_point_source_starts, tree_order_point_source_counts, - range=slice(tree.nboxes), queue=actx.queue) + range=slice(tree.nboxes), + queue=actx.queue, + ) # }}} @@ -793,10 +805,30 @@ def link_point_sources( # {{{ particle list filter +class ParticleListFilter: + """ + .. automethod:: filter_target_lists_in_tree_order + .. 
automethod:: filter_target_lists_in_user_order + """ + + def __init__(self, *args, **kwargs): + pass + + def filter_target_lists_in_user_order(self, actx, tree, flags): + return filter_target_lists_in_user_order(actx, tree, flags) + + def filter_target_lists_in_tree_order(self, actx, tree, flags): + return filter_target_lists_in_tree_order(actx, tree, flags) + +# }}} + + +# {{{ filter_target_lists_in_user_order + @dataclass_array_container @dataclass(frozen=True) class FilteredTargetListsInUserOrder: - """Use :meth:`ParticleListFilter.filter_target_lists_in_user_order` to create + """Use :func:`filter_target_lists_in_user_order` to create instances of this class. This class represents subsets of the list of targets in each box (as given @@ -835,10 +867,89 @@ class FilteredTargetListsInUserOrder: target_lists: Array +def filter_target_lists_in_user_order( + actx: PyOpenCLArrayContext, tree: Tree, flags: Array, + ) -> FilteredTargetListsInUserOrder: + """ + :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of + :class:`numpy.int8` objects, which indicate by being zero that the + corresponding target (in user target order) is not part of the + filtered list, or by being nonzero that it is. 
+ + :returns: A :class:`FilteredTargetListsInUserOrder` + """ + user_order_flags = flags + del flags + + @memoize_in(actx, ( + filter_target_lists_in_user_order, + tree.particle_id_dtype, user_order_flags.dtype)) + def get_kernel(): + from boxtree.tools import VectorArg + from pyopencl.tools import dtype_to_ctype + from pyopencl.algorithm import ListOfListsBuilder + from mako.template import Template + + return ListOfListsBuilder(actx.context, + [("filt_tgt_list", tree.particle_id_dtype)], Template("""//CL// + typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t; + + void generate(LIST_ARG_DECL USER_ARG_DECL index_type i) + { + particle_id_t b_t_start = box_target_starts[i]; + particle_id_t b_t_count = box_target_counts_nonchild[i]; + + for (particle_id_t j = b_t_start; j < b_t_start+b_t_count; ++j) + { + particle_id_t user_target_id = user_target_ids[j]; + if (user_order_flags[user_target_id]) + { + APPEND_filt_tgt_list(user_target_id); + } + } + } + """, strict_undefined=True).render( + dtype_to_ctype=dtype_to_ctype, + particle_id_dtype=tree.particle_id_dtype + ), arg_decls=[ + VectorArg(user_order_flags.dtype, "user_order_flags"), + VectorArg(tree.particle_id_dtype, "user_target_ids"), + VectorArg(tree.particle_id_dtype, "box_target_starts"), + VectorArg(tree.particle_id_dtype, "box_target_counts_nonchild"), + ]) + + user_target_ids = actx.empty(tree.ntargets, tree.sorted_target_ids.dtype) + user_target_ids[tree.sorted_target_ids] = actx.from_numpy( + np.arange(tree.ntargets, dtype=user_target_ids.dtype) + ) + + knl = get_kernel() + result, _ = knl( + actx.queue, tree.nboxes, + user_order_flags, + user_target_ids, + tree.box_target_starts, + tree.box_target_counts_nonchild, + allocator=actx.allocator, + ) + + target_lists = FilteredTargetListsInUserOrder( + nfiltered_targets=result["filt_tgt_list"].count, + target_starts=result["filt_tgt_list"].starts, + target_lists=result["filt_tgt_list"].lists, + ) + + return actx.freeze(target_lists) + +# }}} + + +# 
{{{ filter_target_lists_in_tree_order + @dataclass_array_container @dataclass(frozen=True) class FilteredTargetListsInTreeOrder: - """Use :meth:`ParticleListFilter.filter_target_lists_in_tree_order` to create + """Use :func:`filter_target_lists_in_tree_order` to create instances of this class. This class represents subsets of the list of targets in each box (as given by @@ -891,181 +1002,100 @@ class FilteredTargetListsInTreeOrder: unfiltered_from_filtered_target_indices: Array -class ParticleListFilter: +def filter_target_lists_in_tree_order( + actx: PyOpenCLArrayContext, tree: Tree, flags: Array + ) -> FilteredTargetListsInTreeOrder: """ - .. automethod:: filter_target_lists_in_tree_order - .. automethod:: filter_target_lists_in_user_order + :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of + :class:`numpy.int8` objects, which indicate by being zero that the + corresponding target (in user target order) is not part of the + filtered list, or by being nonzero that it is. + :returns: A :class:`FilteredTargetListsInTreeOrder` """ - def __init__(self, array_context: PyOpenCLArrayContext): - self._setup_actx = array_context - - @property - def context(self): - return self._setup_actx.queue.context - - @memoize_method - def get_filter_target_lists_in_user_order_kernel(self, particle_id_dtype, - user_order_flags_dtype): - from boxtree.tools import VectorArg - from pyopencl.tools import dtype_to_ctype - from pyopencl.algorithm import ListOfListsBuilder - from mako.template import Template - - builder = ListOfListsBuilder(self.context, - [("filt_tgt_list", particle_id_dtype)], Template("""//CL// - typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t; - - void generate(LIST_ARG_DECL USER_ARG_DECL index_type i) - { - particle_id_t b_t_start = box_target_starts[i]; - particle_id_t b_t_count = box_target_counts_nonchild[i]; - - for (particle_id_t j = b_t_start; j < b_t_start+b_t_count; ++j) - { - particle_id_t user_target_id = user_target_ids[j]; - if 
(user_order_flags[user_target_id]) - { - APPEND_filt_tgt_list(user_target_id); - } - } - } - """, strict_undefined=True).render( - dtype_to_ctype=dtype_to_ctype, - particle_id_dtype=particle_id_dtype - ), arg_decls=[ - VectorArg(user_order_flags_dtype, "user_order_flags"), - VectorArg(particle_id_dtype, "user_target_ids"), - VectorArg(particle_id_dtype, "box_target_starts"), - VectorArg(particle_id_dtype, "box_target_counts_nonchild"), - ]) - - return builder - - def filter_target_lists_in_user_order(self, actx, tree, flags): - """ - :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of - :class:`numpy.int8` objects, which indicate by being zero that the - corresponding target (in user target order) is not part of the - filtered list, or by being nonzero that it is. - - :returns: A :class:`FilteredTargetListsInUserOrder` - """ - user_order_flags = flags - del flags - - user_target_ids = actx.empty(tree.ntargets, tree.sorted_target_ids.dtype) - user_target_ids[tree.sorted_target_ids] = actx.from_numpy( - np.arange(tree.ntargets, dtype=user_target_ids.dtype) - ) - - kernel = self.get_filter_target_lists_in_user_order_kernel( - tree.particle_id_dtype, user_order_flags.dtype) - - result, evt = kernel(actx.queue, tree.nboxes, - user_order_flags, - user_target_ids, - tree.box_target_starts, - tree.box_target_counts_nonchild) - - target_lists = FilteredTargetListsInUserOrder( - nfiltered_targets=result["filt_tgt_list"].count, - target_starts=result["filt_tgt_list"].starts, - target_lists=result["filt_tgt_list"].lists, - ) - - return actx.freeze(target_lists) - - @memoize_method - def get_filter_target_lists_in_tree_order_kernels(self, particle_id_dtype): + @memoize_in(actx, (filter_target_lists_in_tree_order, tree.particle_id_dtype)) + def get_kernels(): from boxtree.tree_build_kernels import ( TREE_ORDER_TARGET_FILTER_SCAN_TPL, TREE_ORDER_TARGET_FILTER_INDEX_TPL) scan_knl = TREE_ORDER_TARGET_FILTER_SCAN_TPL.build( - self.context, + actx.context, type_aliases=( 
- ("scan_t", particle_id_dtype), - ("particle_id_t", particle_id_dtype), + ("scan_t", tree.particle_id_dtype), + ("particle_id_t", tree.particle_id_dtype), ), ) index_knl = TREE_ORDER_TARGET_FILTER_INDEX_TPL.build( - self.context, + actx.context, type_aliases=( - ("particle_id_t", particle_id_dtype), + ("particle_id_t", tree.particle_id_dtype), ), ) return scan_knl, index_knl - def filter_target_lists_in_tree_order(self, actx, tree, flags): - """ - :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of - :class:`numpy.int8` objects, which indicate by being zero that the - corresponding target (in user target order) is not part of the - filtered list, or by being nonzero that it is. - :returns: A :class:`FilteredTargetListsInTreeOrder` - """ - - tree_order_flags = actx.empty(tree.ntargets, np.int8) - tree_order_flags[tree.sorted_target_ids] = flags + tree_order_flags = actx.empty(tree.ntargets, np.int8) + tree_order_flags[tree.sorted_target_ids] = flags - filtered_from_unfiltered_target_indices = actx.empty( - tree.ntargets, tree.particle_id_dtype) - unfiltered_from_filtered_target_indices = actx.empty( - tree.ntargets, tree.particle_id_dtype) + filtered_from_unfiltered_target_indices = actx.empty( + tree.ntargets, tree.particle_id_dtype) + unfiltered_from_filtered_target_indices = actx.empty( + tree.ntargets, tree.particle_id_dtype) - nfiltered_targets = actx.empty(1, tree.particle_id_dtype) + nfiltered_targets = actx.empty(1, tree.particle_id_dtype) - scan_knl, index_knl = self.get_filter_target_lists_in_tree_order_kernels( - tree.particle_id_dtype) - - scan_knl(tree_order_flags, - filtered_from_unfiltered_target_indices, - unfiltered_from_filtered_target_indices, - nfiltered_targets, - queue=actx.queue) + scan_knl, index_knl = get_kernels() + scan_knl( + tree_order_flags, + filtered_from_unfiltered_target_indices, + unfiltered_from_filtered_target_indices, + nfiltered_targets, + queue=actx.queue, + allocator=actx.allocator, + ) - nfiltered_targets = 
int(actx.to_numpy(nfiltered_targets)) + nfiltered_targets = int(actx.to_numpy(nfiltered_targets)) - unfiltered_from_filtered_target_indices = \ - unfiltered_from_filtered_target_indices[:nfiltered_targets] + unfiltered_from_filtered_target_indices = \ + unfiltered_from_filtered_target_indices[:nfiltered_targets] - from pytools.obj_array import make_obj_array - filtered_targets = make_obj_array([ - actx.thaw(targets_i)[unfiltered_from_filtered_target_indices] - for targets_i in tree.targets - ]) + from pytools.obj_array import make_obj_array + filtered_targets = make_obj_array([ + actx.thaw(targets_i)[unfiltered_from_filtered_target_indices] + for targets_i in tree.targets + ]) - box_target_starts_filtered = actx.np.zeros_like(tree.box_target_starts) - box_target_counts_nonchild_filtered = ( - actx.np.zeros_like(tree.box_target_counts_nonchild)) + box_target_starts_filtered = actx.np.zeros_like(tree.box_target_starts) + box_target_counts_nonchild_filtered = ( + actx.np.zeros_like(tree.box_target_counts_nonchild)) - index_knl( - # input - tree.box_target_starts, - tree.box_target_counts_nonchild, - filtered_from_unfiltered_target_indices, - tree.ntargets, - nfiltered_targets, + index_knl( + # input + tree.box_target_starts, + tree.box_target_counts_nonchild, + filtered_from_unfiltered_target_indices, + tree.ntargets, + nfiltered_targets, - # output - box_target_starts_filtered, - box_target_counts_nonchild_filtered, + # output + box_target_starts_filtered, + box_target_counts_nonchild_filtered, - queue=actx.queue) + queue=actx.queue, + ) - target_lists = FilteredTargetListsInTreeOrder( - nfiltered_targets=nfiltered_targets, - box_target_starts=box_target_starts_filtered, - box_target_counts_nonchild=box_target_counts_nonchild_filtered, - unfiltered_from_filtered_target_indices=( - unfiltered_from_filtered_target_indices), - targets=filtered_targets, - ) + target_lists = FilteredTargetListsInTreeOrder( + nfiltered_targets=nfiltered_targets, + 
box_target_starts=box_target_starts_filtered, + box_target_counts_nonchild=box_target_counts_nonchild_filtered, + unfiltered_from_filtered_target_indices=( + unfiltered_from_filtered_target_indices), + targets=filtered_targets, + ) - return actx.freeze(target_lists) + return actx.freeze(target_lists) # }}} diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index f252be57..20b076c3 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -4,7 +4,7 @@ Building Trees -------------- -.. autoclass:: TreeBuilder +.. autofunction:: build_tree """ __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -30,10 +30,11 @@ """ from functools import partial +from typing import Any, Optional import numpy as np -from pytools import ProcessLogger, DebugProcessLogger, memoize_method +from pytools import ProcessLogger, DebugProcessLogger, memoize_on_first_arg from boxtree.tree import Tree from boxtree.array_context import PyOpenCLArrayContext @@ -47,46 +48,12 @@ class MaxLevelsExceeded(RuntimeError): class TreeBuilder: - """ - .. automethod:: __init__ - .. automethod:: __call__ - """ - morton_nr_dtype = np.dtype(np.int8) box_level_dtype = np.dtype(np.uint8) ROOT_EXTENT_STRETCH_FACTOR = 1e-4 - def __init__(self, array_context: PyOpenCLArrayContext) -> None: - self._setup_actx = array_context - - from boxtree.bounding_box import BoundingBoxFinder - self.bbox_finder = BoundingBoxFinder(array_context) - - # This is used to map box IDs and compress box lists in empty leaf - # pruning. 
- - from boxtree.tools import GappyCopyAndMapKernel, MapValuesKernel - self.gappy_copy_and_map = GappyCopyAndMapKernel(array_context) - self.map_values_kernel = MapValuesKernel(array_context) - - @property - def context(self): - return self._setup_actx.queue.context - - @memoize_method - def get_kernel_info(self, dimensions, coord_dtype, - particle_id_dtype, box_id_dtype, - sources_are_targets, srcntgts_extent_norm, - kind): - - from boxtree.tree_build_kernels import get_tree_build_kernel_info - return get_tree_build_kernel_info(self.context, dimensions, coord_dtype, - particle_id_dtype, box_id_dtype, - sources_are_targets, srcntgts_extent_norm, - self.morton_nr_dtype, self.box_level_dtype, - kind=kind) - - # {{{ run control + def __init__(self, *args, **kwargs) -> None: + pass def __call__(self, actx: PyOpenCLArrayContext, particles, kind="adaptive", max_particles_in_box=None, allocator=None, debug=False, @@ -95,1721 +62,1662 @@ def __call__(self, actx: PyOpenCLArrayContext, particles, kind="adaptive", max_leaf_refine_weight=None, wait_for=None, extent_norm=None, bbox=None, **kwargs): - """ - :arg particles: an object array of (XYZ) point coordinate arrays. - :arg kind: One of the following strings: - - - 'adaptive' - - 'adaptive-level-restricted' - - 'non-adaptive' - - 'adaptive' requests an adaptive tree without level restriction. See - :ref:`tree-kinds` for further explanation. - - :arg targets: an object array of (XYZ) point coordinate arrays or ``None``. - If ``None``, *particles* act as targets, too. - Must have the same (inner) dtype as *particles*. - :arg source_radii: If not *None*, an arra of the same dtype as *particles*. - - If this is given, *targets* must also be given, i.e. sources and - targets must be separate. See :ref:`extent`. - - :arg target_radii: Like *source_radii*, but for targets. - :arg stick_out_factor: See :attr:`Tree.stick_out_factor` and :ref:`extent`. - :arg refine_weights: If not *None*, an array of the - type :class:`numpy.int32`. 
A box will be split if it has a cumulative - refine_weight greater than *max_leaf_refine_weight*. If this is given, - *max_leaf_refine_weight* must also be given and *max_particles_in_box* - must be *None*. - :arg max_leaf_refine_weight: If not *None*, specifies the maximum weight - of a leaf box. - :arg max_particles_in_box: If not *None*, specifies the maximum number - of particles in a leaf box. If this is given, both - *refine_weights* and *max_leaf_refine_weight* must be *None*. - :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` - instances for whose completion this command waits before starting - execution. - :arg extent_norm: ``"l2"`` or ``"linf"``. Indicates the norm with respect - to which particle stick-out is measured. See :attr:`Tree.extent_norm`. - :arg bbox: Bounding box of either type: - 1. A dim-by-2 array, with each row to be [min, max] coordinates - in its corresponding axis direction. - 2. (Internal use only) of the same type as returned by - *boxtree.bounding_box.make_bounding_box_dtype*. - When given, this bounding box is used for tree - building. Otherwise, the bounding box is determined from particles - in such a way that it is square and is slightly larger at the top (so - that scaled coordinates are always < 1). - When supplied, the bounding box must be square and have all the - particles in its closure. - :arg kwargs: Used internally for debugging. - - :returns: a tuple ``(tree, event)``, where *tree* is an instance of - :class:`Tree`, and *event* is a :class:`pyopencl.Event` for dependency - management. - """ - - if allocator is not None: - from warnings import warn - warn("Passing in 'allocator' is deprecated. 
The allocator of the " - "array context 'actx' is used throughout.", - DeprecationWarning, stacklevel=2) - - # {{{ input processing - - if kind not in ["adaptive", "adaptive-level-restricted", "non-adaptive"]: - raise ValueError(f"unknown tree kind '{kind}'") - - # we'll modify this below, so copy it - if wait_for is None: - wait_for = [] - else: - wait_for = list(wait_for) + from warnings import warn + warn(f"'{type(self).__name__}' is deprecated and will be removed in 2023. " + "Use 'build_tree' instead.", + DeprecationWarning, stacklevel=2) + + result = build_tree( + actx, particles, + kind=kind, max_particles_in_box=max_particles_in_box, + targets=targets, source_radii=source_radii, target_radii=target_radii, + stick_out_factor=stick_out_factor, + refine_weights=refine_weights, + max_leaf_refine_weight=max_leaf_refine_weight, + extent_norm=extent_norm, bbox=bbox, + morton_nr_dtype=self.morton_nr_dtype, + box_level_dtype=self.box_level_dtype, + root_extent_stretch_factor=TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR, + debug=debug, **kwargs) + + return result, None + + +# {{{ build_tree + +@memoize_on_first_arg +def get_kernel_info( + actx: PyOpenCLArrayContext, + dimensions: int, + coord_dtype: "np.dtype", + particle_id_dtype: "np.dtype", + box_id_dtype: "np.dtype", + sources_are_targets: bool, + srcntgts_extent_norm: str, + kind: str, + morton_nr_dtype: "np.dtype", + box_level_dtype: "np.dtype"): + from boxtree.tree_build_kernels import get_tree_build_kernel_info + return get_tree_build_kernel_info(actx.context, dimensions, coord_dtype, + particle_id_dtype, box_id_dtype, + sources_are_targets, srcntgts_extent_norm, + morton_nr_dtype, box_level_dtype, + kind=kind) + + +def build_tree( + actx: PyOpenCLArrayContext, particles: np.ndarray, *, + kind: str = "adaptive", + max_particles_in_box: Optional[int] = None, + targets: Optional[np.ndarray] = None, + source_radii: Optional[np.ndarray] = None, + target_radii: Optional[np.ndarray] = None, + stick_out_factor: 
Optional[float] = None, + refine_weights: Optional[np.ndarray] = None, + max_leaf_refine_weight: Optional[int] = None, + extent_norm: Optional[str] = None, + bbox: Optional[np.ndarray] = None, + morton_nr_dtype: Optional[np.dtype] = None, + box_level_dtype: Optional[np.dtype] = None, + root_extent_stretch_factor: float = 1.0e-4, + debug: bool = False, + **kwargs: Any) -> Tree: + """ + :arg particles: an object array of (XYZ) point coordinate arrays. + :arg kind: One of the following strings: + + - 'adaptive' + - 'adaptive-level-restricted' + - 'non-adaptive' + + 'adaptive' requests an adaptive tree without level restriction. See + :ref:`tree-kinds` for further explanation. + + :arg targets: an object array of (XYZ) point coordinate arrays or ``None``. + If ``None``, *particles* act as targets, too. + Must have the same (inner) dtype as *particles*. + :arg source_radii: If not *None*, an array of the same dtype as *particles*. + + If this is given, *targets* must also be given, i.e. sources and + targets must be separate. See :ref:`extent`. + + :arg target_radii: Like *source_radii*, but for targets. + :arg stick_out_factor: See :attr:`Tree.stick_out_factor` and :ref:`extent`. + :arg refine_weights: If not *None*, an array of the + type :class:`numpy.int32`. A box will be split if it has a cumulative + refine_weight greater than *max_leaf_refine_weight*. If this is given, + *max_leaf_refine_weight* must also be given and *max_particles_in_box* + must be *None*. + :arg max_leaf_refine_weight: If not *None*, specifies the maximum weight + of a leaf box. + :arg max_particles_in_box: If not *None*, specifies the maximum number + of particles in a leaf box. If this is given, both + *refine_weights* and *max_leaf_refine_weight* must be *None*. + :arg extent_norm: ``"l2"`` or ``"linf"``. Indicates the norm with respect + to which particle stick-out is measured. See :attr:`Tree.extent_norm`. + :arg bbox: Bounding box of either type: + 1. 
A dim-by-2 array, with each row to be [min, max] coordinates + in its corresponding axis direction. + 2. (Internal use only) of the same type as returned by + *boxtree.bounding_box.make_bounding_box_dtype*. + When given, this bounding box is used for tree + building. Otherwise, the bounding box is determined from particles + in such a way that it is square and is slightly larger at the top (so + that scaled coordinates are always < 1). + When supplied, the bounding box must be square and have all the + particles in its closure. + :arg kwargs: Used internally for debugging. + """ - dimensions = len(particles) + # {{{ input processing - from boxtree.tools import AXIS_NAMES - axis_names = AXIS_NAMES[:dimensions] + if morton_nr_dtype is None: + morton_nr_dtype = np.dtype(np.int8) - sources_are_targets = targets is None - sources_have_extent = source_radii is not None - targets_have_extent = target_radii is not None + if box_level_dtype is None: + box_level_dtype = np.dtype(np.uint8) - if extent_norm is None: - extent_norm = "linf" + if kind not in ("adaptive", "adaptive-level-restricted", "non-adaptive"): + raise ValueError(f"unknown tree kind '{kind}'") - if extent_norm not in ["linf", "l2"]: - raise ValueError("unexpected value of 'extent_norm': %s" - % extent_norm) + dimensions = len(particles) - srcntgts_extent_norm = extent_norm - srcntgts_have_extent = sources_have_extent or targets_have_extent - if not srcntgts_have_extent: - srcntgts_extent_norm = None + from boxtree.tools import AXIS_NAMES + axis_names = AXIS_NAMES[:dimensions] - del extent_norm + sources_are_targets = targets is None + sources_have_extent = source_radii is not None + targets_have_extent = target_radii is not None - if srcntgts_extent_norm and targets is None: - raise ValueError("must specify targets when specifying " - "any kind of radii") + if extent_norm is None: + extent_norm = "linf" - from pytools import single_valued - particle_id_dtype = np.int32 - box_id_dtype = np.int32 - coord_dtype 
= single_valued(coord.dtype for coord in particles) + if extent_norm not in ("linf", "l2"): + raise ValueError(f"unexpected value of 'extent_norm': {extent_norm}") - if targets is None: - nsrcntgts = single_valued(len(coord) for coord in particles) - else: - nsources = single_valued(len(coord) for coord in particles) - ntargets = single_valued(len(coord) for coord in targets) - nsrcntgts = nsources + ntargets - - if source_radii is not None: - if source_radii.shape != (nsources,): - raise ValueError("source_radii has an invalid shape") - - if source_radii.dtype != coord_dtype: - raise TypeError("dtypes of coordinate arrays and " - "source_radii must agree") - - if target_radii is not None: - if target_radii.shape != (ntargets,): - raise ValueError("target_radii has an invalid shape") - - if target_radii.dtype != coord_dtype: - raise TypeError("dtypes of coordinate arrays and " - "target_radii must agree") - - if sources_have_extent or targets_have_extent: - if stick_out_factor is None: - raise ValueError("if sources or targets have extent, " - "stick_out_factor must be explicitly specified") - else: - stick_out_factor = 0 + srcntgts_extent_norm = extent_norm + srcntgts_have_extent = sources_have_extent or targets_have_extent + if not srcntgts_have_extent: + srcntgts_extent_norm = None - # }}} - - def zeros(shape, dtype): - result = actx.zeros(shape, dtype) + del extent_norm - if result.events: - event, = result.events - else: - from numbers import Number - if isinstance(shape, Number): - shape = (shape,) + if srcntgts_extent_norm and targets is None: + raise ValueError( + "must specify targets when specifying any kind of radii") - from pytools import product - assert product(shape) == 0 + from pytools import single_valued + particle_id_dtype = np.int32 + box_id_dtype = np.int32 + coord_dtype = single_valued(coord.dtype for coord in particles) - from pyopencl import enqueue_marker - event = enqueue_marker(actx.queue) + if targets is None: + nsrcntgts = 
single_valued(len(coord) for coord in particles) + else: + nsources = single_valued(len(coord) for coord in particles) + ntargets = single_valued(len(coord) for coord in targets) + nsrcntgts = nsources + ntargets - return result, event + if source_radii is not None: + if source_radii.shape != (nsources,): + raise ValueError( + "'source_radii' has an invalid shape: " + f"expected {(nsources,)} but got {source_radii.shape}") - knl_info = self.get_kernel_info(dimensions, coord_dtype, - particle_id_dtype, box_id_dtype, - sources_are_targets, srcntgts_extent_norm, - kind=kind) + if source_radii.dtype != coord_dtype: + raise TypeError( + "dtypes of coordinate arrays and 'source_radii' must agree: " + f"got {coord_dtype} and {source_radii.dtype}") - logger.debug("tree build: start") + if target_radii is not None: + if target_radii.shape != (ntargets,): + raise ValueError( + "'target_radii' has an invalid shape: " + f"expected {(ntargets,)} but got {target_radii.shape}") - # {{{ combine sources and targets into one array, if necessary + if target_radii.dtype != coord_dtype: + raise TypeError( + "dtypes of coordinate arrays and 'target_radii' must agree: " + f"got {coord_dtype} and {target_radii.dtype}") - prep_events = [] + if sources_have_extent or targets_have_extent: + if stick_out_factor is None: + raise ValueError( + "if sources or targets have extent, " + "'stick_out_factor' must be explicitly specified") + else: + stick_out_factor = 0 - if targets is None: - # Targets weren't specified. Sources are also targets. Let's - # call them "srcntgts". 
+ # }}} - if isinstance(particles, np.ndarray) and particles.dtype.char == "O": - srcntgts = particles - else: - from pytools.obj_array import make_obj_array - srcntgts = make_obj_array([ - actx.np.copy(actx.thaw(p)) for p in particles - ]) + # {{{ kernels - assert source_radii is None - assert target_radii is None + knl_info = get_kernel_info( + actx, + dimensions, + coord_dtype, particle_id_dtype, box_id_dtype, + sources_are_targets, + srcntgts_extent_norm, + kind, + morton_nr_dtype, box_level_dtype, + ) - srcntgt_radii = None + # }}} - else: - # Here, we mash sources and targets into one array to give us one - # big array of "srcntgts". In this case, a "srcntgt" is either a - # source or a target, but not really both, as above. How will we be - # able to tell which it was? Easy: We'll compare its 'user' id with - # nsources. If it's >=, it's a target, otherwise it's a source. + logger.debug("tree build: start") - target_coord_dtype = single_valued(tgt_i.dtype for tgt_i in targets) + # {{{ combine sources and targets into one array, if necessary - if target_coord_dtype != coord_dtype: - raise TypeError("sources and targets must have same coordinate " - "dtype") + if targets is None: + # Targets weren't specified. Sources are also targets. Let's + # call them "srcntgts". 
- def combine_srcntgt_arrays(ary1, ary2=None): - if ary2 is None: - dtype = ary1.dtype - else: - dtype = ary2.dtype + if isinstance(particles, np.ndarray) and particles.dtype.char == "O": + srcntgts = particles + else: + from pytools.obj_array import make_obj_array + srcntgts = make_obj_array([ + actx.np.copy(actx.thaw(p)) for p in particles + ]) - result = actx.empty(nsrcntgts, dtype) - if (ary1 is None) or (ary2 is None): - result.fill(0) + assert source_radii is None + assert target_radii is None - if ary1 is not None and ary1.nbytes: - result[:len(ary1)] = ary1 + srcntgt_radii = None - if ary2 is not None and ary2.nbytes: - result[nsources:] = ary2 + else: + # Here, we mash sources and targets into one array to give us one + # big array of "srcntgts". In this case, a "srcntgt" is either a + # source or a target, but not really both, as above. How will we be + # able to tell which it was? Easy: We'll compare its 'user' id with + # nsources. If it's >=, it's a target, otherwise it's a source. 
- return result + target_coord_dtype = single_valued(tgt_i.dtype for tgt_i in targets) - from pytools.obj_array import make_obj_array - srcntgts = make_obj_array([ - combine_srcntgt_arrays(src_i, tgt_i) - for src_i, tgt_i in zip(particles, targets) - ]) + if target_coord_dtype != coord_dtype: + raise TypeError( + "sources and targets must have same coordinate dtype: " + f"got {coord_dtype} and {target_coord_dtype}") - if srcntgts_have_extent: - srcntgt_radii = combine_srcntgt_arrays(source_radii, target_radii) + def combine_srcntgt_arrays(ary1, ary2=None): + if ary2 is None: + dtype = ary1.dtype else: - srcntgt_radii = None + dtype = ary2.dtype - del source_radii - del target_radii + result = actx.empty(nsrcntgts, dtype) + if (ary1 is None) or (ary2 is None): + result.fill(0) - del particles + if ary1 is not None and ary1.nbytes: + result[:len(ary1)] = ary1 - user_srcntgt_ids = actx.from_numpy( - np.arange(nsrcntgts, dtype=particle_id_dtype) - ) + if ary2 is not None and ary2.nbytes: + result[nsources:] = ary2 - evt, = user_srcntgt_ids.events - wait_for.append(evt) - del evt + return result - # }}} + from pytools.obj_array import make_obj_array + srcntgts = make_obj_array([ + combine_srcntgt_arrays(src_i, tgt_i) + for src_i, tgt_i in zip(particles, targets) + ]) - # {{{ process refine_weights - - from boxtree.tree_build_kernels import refine_weight_dtype - - specified_max_particles_in_box = max_particles_in_box is not None - specified_refine_weights = refine_weights is not None and \ - max_leaf_refine_weight is not None - - if specified_max_particles_in_box and specified_refine_weights: - raise ValueError("may only specify one of max_particles_in_box and " - "refine_weights/max_leaf_refine_weight") - elif not specified_max_particles_in_box and not specified_refine_weights: - raise ValueError("must specify either max_particles_in_box or " - "refine_weights/max_leaf_refine_weight") - elif specified_max_particles_in_box: - refine_weights = actx.empty(nsrcntgts, 
refine_weight_dtype) - refine_weights.fill(1) - - event, = refine_weights.events - prep_events.append(event) - max_leaf_refine_weight = max_particles_in_box - elif specified_refine_weights: - if refine_weights.dtype != refine_weight_dtype: - raise TypeError("refine_weights must have dtype '%s'" - % refine_weight_dtype) - - if max_leaf_refine_weight <= 0: - raise ValueError("max_leaf_refine_weight must be positive") - - max_refine_weights = actx.to_numpy(actx.np.amax(refine_weights)) - if max_leaf_refine_weight < max_refine_weights: - raise ValueError( - "entries of refine_weights cannot exceed max_leaf_refine_weight") + if srcntgts_have_extent: + srcntgt_radii = combine_srcntgt_arrays(source_radii, target_radii) + else: + srcntgt_radii = None - min_refine_weights = actx.to_numpy(actx.np.amin(refine_weights)) - if min_refine_weights < 0: - raise ValueError("all entries of refine_weights must be nonnegative") + del source_radii + del target_radii + del particles - total_refine_weight = actx.to_numpy( - actx.np.sum(refine_weights, dtype=np.dtype(np.int64)) - ) + user_srcntgt_ids = actx.from_numpy( + np.arange(nsrcntgts, dtype=particle_id_dtype) + ) - del max_particles_in_box - del specified_max_particles_in_box - del specified_refine_weights + # }}} - # }}} + # {{{ process refine_weights + + from boxtree.tree_build_kernels import refine_weight_dtype + + specified_max_particles_in_box = max_particles_in_box is not None + specified_refine_weights = refine_weights is not None and \ + max_leaf_refine_weight is not None + + if specified_max_particles_in_box and specified_refine_weights: + raise ValueError( + "may only specify one of 'max_particles_in_box' and " + "'refine_weights' / 'max_leaf_refine_weight'") + elif not specified_max_particles_in_box and not specified_refine_weights: + raise ValueError( + "must specify either max_'particles_in_box' or " + "'refine_weights' / 'max_leaf_refine_weight'") + elif specified_max_particles_in_box: + refine_weights = 
actx.empty(nsrcntgts, refine_weight_dtype) + refine_weights.fill(1) + + max_leaf_refine_weight = max_particles_in_box + elif specified_refine_weights: + if refine_weights.dtype != refine_weight_dtype: + raise TypeError( + f"'refine_weights' must have dtype '{refine_weight_dtype}', " + f"bit got {refine_weights.dtype}") + + if max_leaf_refine_weight <= 0: + raise ValueError("'max_leaf_refine_weight' must be positive") + + max_refine_weights = actx.to_numpy(actx.np.amax(refine_weights)).item() + if max_leaf_refine_weight < max_refine_weights: + raise ValueError( + "entries of 'refine_weights' cannot exceed 'max_leaf_refine_weight'") + + min_refine_weights = actx.to_numpy(actx.np.amin(refine_weights)).item() + if min_refine_weights < 0: + raise ValueError("all entries of 'refine_weights' must be nonnegative") + + total_refine_weight = actx.to_numpy( + actx.np.sum(refine_weights, dtype=np.dtype(np.int64)) + ) + + del max_particles_in_box + del specified_max_particles_in_box + del specified_refine_weights - # {{{ find and process bounding box + # }}} - if bbox is None: - bbox, _ = self.bbox_finder( - actx, srcntgts, srcntgt_radii, wait_for=wait_for) - bbox = actx.to_numpy(bbox) + # {{{ find and process bounding box - root_extent = max( - bbox["max_"+ax] - bbox["min_"+ax] - for ax in axis_names) * (1+TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR) + from boxtree.bounding_box import find_bounding_box + if bbox is None: + bbox = find_bounding_box(actx, srcntgts, srcntgt_radii) + bbox = actx.to_numpy(bbox) - # make bbox square and slightly larger at the top, to ensure scaled - # coordinates are always < 1 - bbox_min = np.empty(dimensions, coord_dtype) - for i, ax in enumerate(axis_names): - bbox_min[i] = bbox["min_"+ax] + root_extent = ( + (1 + root_extent_stretch_factor) + * max([bbox[f"max_{ax}"] - bbox[f"min_{ax}"] for ax in axis_names]) + ) - bbox_max = bbox_min + root_extent - for i, ax in enumerate(axis_names): - bbox["max_"+ax] = bbox_max[i] - else: - # Validate that bbox is 
a superset of particle-derived bbox - bbox_auto, _ = self.bbox_finder( - srcntgts, srcntgt_radii, wait_for=wait_for) - bbox_auto = actx.to_numpy(bbox_auto) - - # Convert unstructured numpy array to bbox_type - if isinstance(bbox, np.ndarray): - if len(bbox) == dimensions: - bbox_bak = bbox.copy() - bbox = np.empty(1, bbox_auto.dtype) - for i, ax in enumerate(axis_names): - bbox["min_"+ax] = bbox_bak[i][0] - bbox["max_"+ax] = bbox_bak[i][1] - else: - assert len(bbox) == 1 + # make bbox square and slightly larger at the top, to ensure scaled + # coordinates are always < 1 + bbox_min = np.empty(dimensions, coord_dtype) + for i, ax in enumerate(axis_names): + bbox_min[i] = bbox[f"min_{ax}"] + + bbox_max = bbox_min + root_extent + for i, ax in enumerate(axis_names): + bbox[f"max_{ax}"] = bbox_max[i] + else: + # Validate that bbox is a superset of particle-derived bbox + bbox_auto = find_bounding_box(actx, srcntgts, srcntgt_radii) + bbox_auto = actx.to_numpy(bbox_auto) + + # Convert unstructured numpy array to bbox_type + if isinstance(bbox, np.ndarray): + if len(bbox) == dimensions: + bbox_bak = bbox.copy() + bbox = np.empty(1, bbox_auto.dtype) + for i, ax in enumerate(axis_names): + bbox[f"min_{ax}"] = bbox_bak[i][0] + bbox[f"max_{ax}"] = bbox_bak[i][1] else: - raise NotImplementedError("Unsupported bounding box type: " - + str(type(bbox))) + assert len(bbox) == 1 + else: + raise NotImplementedError( + f"Unsupported bounding box type: {type(bbox).__name__}") - # bbox must cover bbox_auto - bbox_min = np.empty(dimensions, coord_dtype) - bbox_max = np.empty(dimensions, coord_dtype) + # bbox must cover bbox_auto + bbox_min = np.empty(dimensions, coord_dtype) + bbox_max = np.empty(dimensions, coord_dtype) - for i, ax in enumerate(axis_names): - bbox_min[i] = bbox["min_" + ax] - bbox_max[i] = bbox["max_" + ax] - assert bbox_min[i] < bbox_max[i] - assert bbox_min[i] <= bbox_auto["min_" + ax] - assert bbox_max[i] >= bbox_auto["max_" + ax] + for i, ax in enumerate(axis_names): 
+ bbox_min[i] = bbox[f"min_{ax}"] + bbox_max[i] = bbox[f"max_{ax}"] + assert bbox_min[i] < bbox_max[i] + assert bbox_min[i] <= bbox_auto[f"min_{ax}"] + assert bbox_max[i] >= bbox_auto[f"max_{ax}"] - # bbox must be a square - bbox_exts = bbox_max - bbox_min - for ext in bbox_exts: - assert abs(ext - bbox_exts[0]) < 1e-15 + # bbox must be a square + bbox_exts = bbox_max - bbox_min + for ext in bbox_exts: + assert abs(ext - bbox_exts[0]) < 1e-15 - root_extent = bbox_exts[0] + root_extent = bbox_exts[0] - # }}} - - # {{{ allocate data + # }}} - logger.debug("allocating memory") + # {{{ allocate data + + logger.debug("allocating memory") + + # box-local morton bin counts for each particle at the current level + # only valid from scan -> split'n'sort + morton_bin_counts = actx.empty( + nsrcntgts, dtype=knl_info.morton_bin_count_dtype) + + # (local) morton nrs for each particle at the current level + # only valid from scan -> split'n'sort + morton_nrs = actx.empty(nsrcntgts, dtype=morton_nr_dtype) + + # 0/1 segment flags + # invariant to sorting once set + # (particles are only reordered within a box) + # valid throughout computation + box_start_flags = actx.zeros(nsrcntgts, dtype=np.int8) + srcntgt_box_ids = actx.zeros(nsrcntgts, dtype=box_id_dtype) + + # Outside nboxes_guess feeding is solely for debugging purposes, + # to test the reallocation code. + nboxes_guess = kwargs.get("nboxes_guess") + if nboxes_guess is None: + nboxes_guess = 2**dimensions * ( + (max_leaf_refine_weight + total_refine_weight - 1) + // max_leaf_refine_weight) + + assert nboxes_guess > 0 + + # /!\ IMPORTANT + # + # If you're allocating an array here that depends on nboxes_guess, or if + # your array contains box numbers, you have to write code for the + # following down below as well: + # + # * You *must* write reallocation code to handle box renumbering and + # reallocation triggered at the top of the level loop. 
+ # + # * If your array persists after the level loop, you *must* write code + # to handle box renumbering and reallocation triggered by the box + # pruning step. + + split_box_ids = actx.zeros(nboxes_guess, dtype=box_id_dtype) + + # per-box morton bin counts + box_morton_bin_counts = actx.zeros( + nboxes_guess, dtype=knl_info.morton_bin_count_dtype) + + # particle# at which each box starts + box_srcntgt_starts = actx.zeros(nboxes_guess, dtype=particle_id_dtype) + + # pointer to parent box + box_parent_ids = actx.zeros(nboxes_guess, dtype=box_id_dtype) + + # pointer to child box, by morton number + box_child_ids = tuple([ + actx.zeros(nboxes_guess, dtype=box_id_dtype) for d in range(2**dimensions) + ]) + + # box centers, by dimension + box_centers = tuple([ + actx.zeros(nboxes_guess, dtype=coord_dtype) for d in range(dimensions) + ]) + + # Initialize box_centers[0] to contain the root box's center + for d, ax in enumerate(axis_names): + center_ax = bbox[f"min_{ax}"] + (bbox[f"max_{ax}"] - bbox[f"min_{ax}"]) / 2 + box_centers[d][0].fill(center_ax) + + # box -> level map + box_levels = actx.zeros(nboxes_guess, dtype=box_level_dtype) + + # number of particles in each box + # needs to be globally initialized because empty boxes never get touched + box_srcntgt_counts_cumul = actx.zeros(nboxes_guess, dtype=particle_id_dtype) + + # Initialize box 0 to contain all particles + box_srcntgt_counts_cumul[0].fill(nsrcntgts, queue=actx.queue) + + # box -> whether the box has a child. FIXME: use smaller integer type + box_has_children = actx.zeros(nboxes_guess, dtype=np.dtype(np.int32)) + + # box -> whether the box needs a splitting to enforce level restriction. 
+ # FIXME: use smaller integer type + force_split_box = actx.zeros( + nboxes_guess if knl_info.level_restrict else 0, + dtype=np.dtype(np.int32)) + + # set parent of root box to itself + from pyopencl import enqueue_copy + evt = enqueue_copy( + actx.queue, box_parent_ids.data, + np.zeros((), dtype=box_parent_ids.dtype)) + box_parent_ids.add_event(evt) + del evt + + # 2*(num bits in the significand) + # https://gitlab.tiker.net/inducer/boxtree/issues/23 + nlevels_max = 2*(np.finfo(coord_dtype).nmant + 1) + assert nlevels_max <= np.iinfo(box_level_dtype).max + + # level -> starting box on level + level_start_box_nrs_dev = actx.zeros(nlevels_max, dtype=box_id_dtype) + + # level -> number of used boxes on level + level_used_box_counts_dev = actx.zeros(nlevels_max, dtype=box_id_dtype) - # box-local morton bin counts for each particle at the current level - # only valid from scan -> split'n'sort - morton_bin_counts = actx.empty( - nsrcntgts, dtype=knl_info.morton_bin_count_dtype) + # }}} - # (local) morton nrs for each particle at the current level - # only valid from scan -> split'n'sort - morton_nrs = actx.empty(nsrcntgts, dtype=self.morton_nr_dtype) + def debug_with_finish(s): + if debug: + actx.queue.finish() - # 0/1 segment flags - # invariant to sorting once set - # (particles are only reordered within a box) - # valid throughout computation - box_start_flags, evt = zeros(nsrcntgts, dtype=np.int8) - prep_events.append(evt) - srcntgt_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype) - prep_events.append(evt) + logger.debug(s) - # Outside nboxes_guess feeding is solely for debugging purposes, - # to test the reallocation code. 
- nboxes_guess = kwargs.get("nboxes_guess") - if nboxes_guess is None: - nboxes_guess = 2**dimensions * ( - (max_leaf_refine_weight + total_refine_weight - 1) - // max_leaf_refine_weight) + from pytools.obj_array import make_obj_array + have_oversize_split_box = actx.zeros((), np.int32) - assert nboxes_guess > 0 + # True if and only if the level restrict kernel found a box to split in + # order to enforce level restriction. + have_upper_level_split_box = actx.zeros((), np.int32) - # /!\ IMPORTANT - # - # If you're allocating an array here that depends on nboxes_guess, or if - # your array contains box numbers, you have to write code for the - # following down below as well: - # - # * You *must* write reallocation code to handle box renumbering and - # reallocation triggered at the top of the level loop. - # - # * If your array persists after the level loop, you *must* write code - # to handle box renumbering and reallocation triggered by the box - # pruning step. - - split_box_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype) - prep_events.append(evt) - - # per-box morton bin counts - box_morton_bin_counts, evt = zeros(nboxes_guess, - dtype=knl_info.morton_bin_count_dtype) - prep_events.append(evt) - - # particle# at which each box starts - box_srcntgt_starts, evt = zeros(nboxes_guess, dtype=particle_id_dtype) - prep_events.append(evt) - - # pointer to parent box - box_parent_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype) - prep_events.append(evt) - - # pointer to child box, by morton number - box_child_ids, evts = zip( - *(zeros(nboxes_guess, dtype=box_id_dtype) for d in range(2**dimensions))) - prep_events.extend(evts) - - # box centers, by dimension - box_centers, evts = zip( - *(zeros(nboxes_guess, dtype=coord_dtype) for d in range(dimensions))) - prep_events.extend(evts) - - # Initialize box_centers[0] to contain the root box's center - for d, (ax, evt) in enumerate(zip(axis_names, evts)): - center_ax = bbox["min_"+ax] + (bbox["max_"+ax] - 
bbox["min_"+ax]) / 2 - box_centers[d][0].fill(center_ax, wait_for=[evt]) - - # box -> level map - box_levels, evt = zeros(nboxes_guess, self.box_level_dtype) - prep_events.append(evt) - - # number of particles in each box - # needs to be globally initialized because empty boxes never get touched - box_srcntgt_counts_cumul, evt = zeros(nboxes_guess, dtype=particle_id_dtype) - prep_events.append(evt) - - # Initialize box 0 to contain all particles - box_srcntgt_counts_cumul[0].fill(nsrcntgts, queue=actx.queue, wait_for=[evt]) - - # box -> whether the box has a child. FIXME: use smaller integer type - box_has_children, evt = zeros(nboxes_guess, dtype=np.dtype(np.int32)) - prep_events.append(evt) - - # box -> whether the box needs a splitting to enforce level restriction. - # FIXME: use smaller integer type - force_split_box, evt = zeros(nboxes_guess - if knl_info.level_restrict - else 0, dtype=np.dtype(np.int32)) - prep_events.append(evt) - - # set parent of root box to itself - from pyopencl import enqueue_copy - evt = enqueue_copy( - actx.queue, box_parent_ids.data, - np.zeros((), dtype=box_parent_ids.dtype)) - prep_events.append(evt) - - # 2*(num bits in the significand) - # https://gitlab.tiker.net/inducer/boxtree/issues/23 - nlevels_max = 2*(np.finfo(coord_dtype).nmant + 1) - assert nlevels_max <= np.iinfo(self.box_level_dtype).max - - # level -> starting box on level - level_start_box_nrs_dev, evt = zeros(nlevels_max, dtype=box_id_dtype) - prep_events.append(evt) - - # level -> number of used boxes on level - level_used_box_counts_dev, evt = zeros(nlevels_max, dtype=box_id_dtype) - prep_events.append(evt) + from pytools import div_ceil - # }}} + # {{{ level loop - def debug_with_finish(s): - if debug: - actx.queue.finish() + # Level 0 starts at 0 and always contains box 0 and nothing else. + # Level 1 therefore starts at 1. 
+ level_start_box_nrs = [0, 1] + level_start_box_nrs_dev[0] = 0 + level_start_box_nrs_dev[1] = 1 - logger.debug(s) + # This counts the number of boxes that have been used per level. Note + # that this could be fewer than the actual number of boxes allocated to + # the level (in the case of building a level restricted tree, more boxes + # are pre-allocated for a level than used since we may decide to split + # parent level boxes later). + level_used_box_counts = [1] + level_used_box_counts_dev[0] = 1 - from pytools.obj_array import make_obj_array - have_oversize_split_box, evt = zeros((), np.int32) - prep_events.append(evt) + # level -> number of leaf boxes on level. Initially the root node is a + # leaf. + level_leaf_counts = np.array([1]) - # True if and only if the level restrict kernel found a box to split in - # order to enforce level restriction. - have_upper_level_split_box, evt = zeros((), np.int32) - prep_events.append(evt) + tree_build_proc = ProcessLogger(logger, "tree build") - wait_for = prep_events + if total_refine_weight > max_leaf_refine_weight: + level = 1 + else: + level = 0 - from pytools import div_ceil + # INVARIANTS -- Upon entry to this loop: + # + # - level is the level being built. + # - the last entry of level_start_box_nrs is the beginning of the level + # to be built + # - the last entry of level_used_box_counts is the number of boxes that + # are used (not just allocated) at the previous level - # {{{ level loop + # This while condition prevents entering the loop in case there's just a + # single box, by how 'level' is set above. Read this as 'while True' with + # an edge case. - # Level 0 starts at 0 and always contains box 0 and nothing else. - # Level 1 therefore starts at 1. 
- level_start_box_nrs = [0, 1] - level_start_box_nrs_dev[0] = 0 - level_start_box_nrs_dev[1] = 1 - wait_for.extend(level_start_box_nrs_dev.events) + level_loop_proc = DebugProcessLogger(logger, "tree build level loop") - # This counts the number of boxes that have been used per level. Note - # that this could be fewer than the actual number of boxes allocated to - # the level (in the case of building a level restricted tree, more boxes - # are pre-allocated for a level than used since we may decide to split - # parent level boxes later). - level_used_box_counts = [1] - level_used_box_counts_dev[0] = 1 - wait_for.extend(level_used_box_counts_dev.events) + # When doing level restriction, the level loop may need to be entered + # one more time after creating all the levels (see fixme note below + # regarding this). This flag is set to True when that happens. + final_level_restrict_iteration = False - # level -> number of leaf boxes on level. Initially the root node is a - # leaf. - level_leaf_counts = np.array([1]) + from boxtree.tools import map_values, copy_and_map_gappy + while level: + if debug: + # More invariants: + assert level == len(level_start_box_nrs) - 1 + assert level == len(level_used_box_counts) + assert level == len(level_leaf_counts) + + if level + 1 >= nlevels_max: # level is zero-based + raise MaxLevelsExceeded("Level count exceeded number of significant " + "bits in coordinate dtype. 
That means that a large number " + "of particles was indistinguishable up to floating point " + "precision (because they ended up in the same box).") + + common_args = ((morton_bin_counts, morton_nrs, + box_start_flags, + srcntgt_box_ids, split_box_ids, + box_morton_bin_counts, + refine_weights, + max_leaf_refine_weight, + box_srcntgt_starts, box_srcntgt_counts_cumul, + box_parent_ids, box_levels, + level, bbox, + user_srcntgt_ids) + + tuple(srcntgts) + + ((srcntgt_radii,) if srcntgts_have_extent else ()) + ) - tree_build_proc = ProcessLogger(logger, "tree build") + debug_with_finish("morton count scan") - if total_refine_weight > max_leaf_refine_weight: - level = 1 - else: - level = 0 + morton_count_args = common_args + if srcntgts_have_extent: + morton_count_args += (stick_out_factor,) - # INVARIANTS -- Upon entry to this loop: - # - # - level is the level being built. - # - the last entry of level_start_box_nrs is the beginning of the level - # to be built - # - the last entry of level_used_box_counts is the number of boxes that - # are used (not just allocated) at the previous level + # writes: box_morton_bin_counts + knl_info.morton_count_scan( + *morton_count_args, queue=actx.queue, size=nsrcntgts, + allocator=actx.allocator, + ) - # This while condition prevents entering the loop in case there's just a - # single box, by how 'level' is set above. Read this as 'while True' with - # an edge case. 
+ debug_with_finish("split box id scan") + + # writes: box_has_children, split_box_ids + knl_info.split_box_id_scan( + srcntgt_box_ids, + box_srcntgt_counts_cumul, + box_morton_bin_counts, + refine_weights, + max_leaf_refine_weight, + box_levels, + level_start_box_nrs_dev, + level_used_box_counts_dev, + force_split_box, + level, - level_loop_proc = DebugProcessLogger(logger, "tree build level loop") + # output: + box_has_children, + split_box_ids, + have_oversize_split_box, - # When doing level restriction, the level loop may need to be entered - # one more time after creating all the levels (see fixme note below - # regarding this). This flag is set to True when that happens. - final_level_restrict_iteration = False + queue=actx.queue, + size=level_start_box_nrs[level], + allocator=actx.allocator, + ) - from pyopencl import wait_for_events - while level: - if debug: - # More invariants: - assert level == len(level_start_box_nrs) - 1 - assert level == len(level_used_box_counts) - assert level == len(level_leaf_counts) - - if level + 1 >= nlevels_max: # level is zero-based - raise MaxLevelsExceeded("Level count exceeded number of significant " - "bits in coordinate dtype. That means that a large number " - "of particles was indistinguishable up to floating point " - "precision (because they ended up in the same box).") - - common_args = ((morton_bin_counts, morton_nrs, - box_start_flags, - srcntgt_box_ids, split_box_ids, - box_morton_bin_counts, - refine_weights, - max_leaf_refine_weight, - box_srcntgt_starts, box_srcntgt_counts_cumul, - box_parent_ids, box_levels, - level, bbox, - user_srcntgt_ids) - + tuple(srcntgts) - + ((srcntgt_radii,) if srcntgts_have_extent else ()) - ) + # {{{ compute new level_used_box_counts, level_leaf_counts + + # The last split_box_id on each level tells us how many boxes are + # needed at the next level. 
+ new_level_used_box_counts = [1] + for level_start_box_id in level_start_box_nrs[1:]: + last_box_on_prev_level = level_start_box_id - 1 + new_level_used_box_counts.append( + # FIXME: Get this all at once. + int(actx.to_numpy(split_box_ids[last_box_on_prev_level])) + - level_start_box_id) + + # New leaf count = + # old leaf count + # + nr. new boxes from splitting parent's leaves + # - nr. new boxes from splitting current level's leaves / 2**d + level_used_box_counts_diff = (new_level_used_box_counts + - np.append(level_used_box_counts, [0])) + new_level_leaf_counts = (level_leaf_counts + + level_used_box_counts_diff[:-1] + - level_used_box_counts_diff[1:] // 2 ** dimensions) + new_level_leaf_counts = np.append( + new_level_leaf_counts, + [level_used_box_counts_diff[-1]]) + del level_used_box_counts_diff - debug_with_finish("morton count scan") - - morton_count_args = common_args - if srcntgts_have_extent: - morton_count_args += (stick_out_factor,) - - # writes: box_morton_bin_counts - evt = knl_info.morton_count_scan( - *morton_count_args, queue=actx.queue, size=nsrcntgts, - wait_for=wait_for) - wait_for = [evt] - - debug_with_finish("split box id scan") - - # writes: box_has_children, split_box_ids - evt = knl_info.split_box_id_scan( - srcntgt_box_ids, - box_srcntgt_counts_cumul, - box_morton_bin_counts, - refine_weights, - max_leaf_refine_weight, - box_levels, - level_start_box_nrs_dev, - level_used_box_counts_dev, - force_split_box, - level, + # }}} - # output: - box_has_children, - split_box_ids, - have_oversize_split_box, + # Assumption: Everything between here and the top of the loop must + # be repeatable, so that in an out-of-memory situation, we can just + # rerun this bit of the code after reallocating and a minimal reset + # procedure. + + # The algorithm for deciding on level sizes is as follows: + # 1. Compute the minimal necessary size of each level, including the + # new level being created. + # 2. 
If level restricting, add padding to the new level being created. + # 3. Check if there is enough existing space for each level. + # 4. If any level does not have sufficient space, reallocate all levels: + # 4a. Compute new sizes of upper levels + # 4b. If level restricting, add padding to all levels. + + curr_upper_level_lengths = np.diff(level_start_box_nrs) + minimal_upper_level_lengths = np.max( + [new_level_used_box_counts[:-1], curr_upper_level_lengths], axis=0) + minimal_new_level_length = new_level_used_box_counts[-1] + + # Allocate extra space at the end of the current level for higher + # level leaves that may be split later. + # + # If there are no further levels to split (i.e. + # have_oversize_split_box = 0), then we do not need to allocate any + # extra space, since no new leaves can be created at the bottom + # level. + if knl_info.level_restrict and actx.to_numpy(have_oversize_split_box): + # Currently undocumented. + lr_lookbehind_levels = kwargs.get("lr_lookbehind", 1) + minimal_new_level_length += sum( + 2**(lev*dimensions) * new_level_leaf_counts[level - lev] + for lev in range(1, 1 + min(level, lr_lookbehind_levels))) + + nboxes_minimal = ( + sum(minimal_upper_level_lengths) + minimal_new_level_length) + + needs_renumbering = ( + (curr_upper_level_lengths < minimal_upper_level_lengths).any()) + + # {{{ prepare for reallocation/renumbering + + if needs_renumbering: + assert knl_info.level_restrict + + # {{{ compute new level_start_box_nrs + + # Represents the amount of padding needed for upper levels. + upper_level_padding = np.zeros(level, dtype=int) + + # Recompute the level padding. 
+ for ulevel in range(level): + upper_level_padding[ulevel] = sum( + 2**(lev*dimensions) * new_level_leaf_counts[ulevel - lev] + for lev in range( + 1, 1 + min(ulevel, lr_lookbehind_levels))) + + new_upper_level_unused_box_counts = np.max( + [upper_level_padding, + minimal_upper_level_lengths - new_level_used_box_counts[:-1]], + axis=0) + + new_level_start_box_nrs = np.empty(level + 1, dtype=int) + new_level_start_box_nrs[0] = 0 + new_level_start_box_nrs[1:] = np.cumsum( + new_level_used_box_counts[:-1] + + new_upper_level_unused_box_counts) - queue=actx.queue, - size=level_start_box_nrs[level], - wait_for=wait_for) - wait_for = [evt] - - # {{{ compute new level_used_box_counts, level_leaf_counts - - # The last split_box_id on each level tells us how many boxes are - # needed at the next level. - new_level_used_box_counts = [1] - for level_start_box_id in level_start_box_nrs[1:]: - last_box_on_prev_level = level_start_box_id - 1 - new_level_used_box_counts.append( - # FIXME: Get this all at once. - int(actx.to_numpy(split_box_ids[last_box_on_prev_level])) - - level_start_box_id) - - # New leaf count = - # old leaf count - # + nr. new boxes from splitting parent's leaves - # - nr. new boxes from splitting current level's leaves / 2**d - level_used_box_counts_diff = (new_level_used_box_counts - - np.append(level_used_box_counts, [0])) - new_level_leaf_counts = (level_leaf_counts - + level_used_box_counts_diff[:-1] - - level_used_box_counts_diff[1:] // 2 ** dimensions) - new_level_leaf_counts = np.append( - new_level_leaf_counts, - [level_used_box_counts_diff[-1]]) - del level_used_box_counts_diff + assert not (level_start_box_nrs == new_level_start_box_nrs).all() # }}} - # Assumption: Everything between here and the top of the loop must - # be repeatable, so that in an out-of-memory situation, we can just - # rerun this bit of the code after reallocating and a minimal reset - # procedure. - - # The algorithm for deciding on level sizes is as follows: - # 1. 
Compute the minimal necessary size of each level, including the - # new level being created. - # 2. If level restricting, add padding to the new level being created. - # 3. Check if there is enough existing space for each level. - # 4. If any level does not have sufficient space, reallocate all levels: - # 4a. Compute new sizes of upper levels - # 4b. If level restricting, add padding to all levels. - - curr_upper_level_lengths = np.diff(level_start_box_nrs) - minimal_upper_level_lengths = np.max( - [new_level_used_box_counts[:-1], curr_upper_level_lengths], axis=0) - minimal_new_level_length = new_level_used_box_counts[-1] - - # Allocate extra space at the end of the current level for higher - # level leaves that may be split later. - # - # If there are no further levels to split (i.e. - # have_oversize_split_box = 0), then we do not need to allocate any - # extra space, since no new leaves can be created at the bottom - # level. - if knl_info.level_restrict and actx.to_numpy(have_oversize_split_box): - # Currently undocumented. - lr_lookbehind_levels = kwargs.get("lr_lookbehind", 1) - minimal_new_level_length += sum( - 2**(lev*dimensions) * new_level_leaf_counts[level - lev] - for lev in range(1, 1 + min(level, lr_lookbehind_levels))) - - nboxes_minimal = \ - sum(minimal_upper_level_lengths) + minimal_new_level_length - - needs_renumbering = \ - (curr_upper_level_lengths < minimal_upper_level_lengths).any() - - # {{{ prepare for reallocation/renumbering - - if needs_renumbering: - assert knl_info.level_restrict - - # {{{ compute new level_start_box_nrs - - # Represents the amount of padding needed for upper levels. - upper_level_padding = np.zeros(level, dtype=int) - - # Recompute the level padding. 
- for ulevel in range(level): - upper_level_padding[ulevel] = sum( - 2**(lev*dimensions) * new_level_leaf_counts[ulevel - lev] - for lev in range( - 1, 1 + min(ulevel, lr_lookbehind_levels))) - - new_upper_level_unused_box_counts = np.max( - [upper_level_padding, - minimal_upper_level_lengths - new_level_used_box_counts[:-1]], - axis=0) - - new_level_start_box_nrs = np.empty(level + 1, dtype=int) - new_level_start_box_nrs[0] = 0 - new_level_start_box_nrs[1:] = np.cumsum( - new_level_used_box_counts[:-1] - + new_upper_level_unused_box_counts) - - assert not (level_start_box_nrs == new_level_start_box_nrs).all() - - # }}} - - # {{{ set up reallocators - - old_box_count = level_start_box_nrs[-1] - # Where should I put this box? - dst_box_id = actx.empty(shape=old_box_count, dtype=box_id_dtype) - - for level_start, new_level_start, level_len in zip( - level_start_box_nrs, new_level_start_box_nrs, - curr_upper_level_lengths): - dst_box_id[level_start:level_start+level_len] = actx.from_numpy( - np.arange(new_level_start, - new_level_start + level_len, - dtype=box_id_dtype) - ) - - wait_for.extend(dst_box_id.events) + # {{{ set up reallocators - realloc_array = partial(self.gappy_copy_and_map, - dst_indices=dst_box_id, range=slice(old_box_count), - debug=debug) - realloc_and_renumber_array = partial(self.gappy_copy_and_map, - dst_indices=dst_box_id, map_values=dst_box_id, - range=slice(old_box_count), debug=debug) - renumber_array = partial(self.map_values_kernel, dst_box_id) - - # }}} + old_box_count = level_start_box_nrs[-1] + # Where should I put this box? + dst_box_id = actx.empty(shape=old_box_count, dtype=box_id_dtype) - # Update level_start_box_nrs. This will be the - # level_start_box_nrs for the reallocated data. 
+ for level_start, new_level_start, level_len in zip( + level_start_box_nrs, new_level_start_box_nrs, + curr_upper_level_lengths): + dst_box_id[level_start:level_start+level_len] = actx.from_numpy( + np.arange(new_level_start, + new_level_start + level_len, + dtype=box_id_dtype) + ) - level_start_box_nrs = list(new_level_start_box_nrs) - level_start_box_nrs_dev[:level + 1] = \ - np.array(new_level_start_box_nrs, dtype=box_id_dtype) - level_start_box_nrs_updated = True - wait_for.extend(level_start_box_nrs_dev.events) + realloc_array = partial(copy_and_map_gappy, + actx, + dst_indices=dst_box_id, range=slice(old_box_count), + debug=debug) + realloc_and_renumber_array = partial(copy_and_map_gappy, + actx, + dst_indices=dst_box_id, mapping=dst_box_id, + range=slice(old_box_count), debug=debug) + renumber_array = partial(map_values, actx, dst_box_id) - nboxes_new = level_start_box_nrs[-1] + minimal_new_level_length + # }}} - del new_level_start_box_nrs - else: - from boxtree.tools import realloc_array - realloc_and_renumber_array = realloc_array - renumber_array = None - level_start_box_nrs_updated = False - nboxes_new = nboxes_minimal + # Update level_start_box_nrs. This will be the + # level_start_box_nrs for the reallocated data. 
- del nboxes_minimal + level_start_box_nrs = list(new_level_start_box_nrs) + level_start_box_nrs_dev[:level + 1] = ( + np.array(new_level_start_box_nrs, dtype=box_id_dtype)) + level_start_box_nrs_updated = True - # }}} + nboxes_new = level_start_box_nrs[-1] + minimal_new_level_length - # {{{ reallocate and/or renumber boxes if necessary - - if level_start_box_nrs_updated or nboxes_new > nboxes_guess: - debug_with_finish("starting nboxes_guess increase") - - while nboxes_guess < nboxes_new: - nboxes_guess *= 2 - - def my_realloc_nocopy(ary, shape=nboxes_guess): - return actx.zeros(shape=shape, dtype=ary.dtype) - - def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): - result = actx.zeros(shape=shape, dtype=ary.dtype) - return result, result.events[0] - - my_realloc = partial( - realloc_array, - actx, nboxes_guess, wait_for=wait_for) - my_realloc_zeros = partial( - realloc_array, - actx, nboxes_guess, zero_fill=True, wait_for=wait_for) - my_realloc_zeros_and_renumber = partial( - realloc_and_renumber_array, - actx, nboxes_guess, zero_fill=True, wait_for=wait_for) - - resize_events = [] - - split_box_ids = my_realloc_nocopy(split_box_ids) - - # *Most*, but not *all* of the values in this array are - # rewritten when the morton scan is redone. Specifically, - # only the box morton bin counts of boxes on the level - # currently being processed are written-but we need to - # retain the box morton bin counts from the higher levels. - box_morton_bin_counts, evt = my_realloc_zeros(box_morton_bin_counts) - resize_events.append(evt) - - # force_split_box is unused unless level restriction is enabled. 
- if knl_info.level_restrict: - force_split_box, evt = my_realloc_zeros(force_split_box) - resize_events.append(evt) - - box_srcntgt_starts, evt = my_realloc_zeros(box_srcntgt_starts) - resize_events.append(evt) - - box_srcntgt_counts_cumul, evt = \ - my_realloc_zeros(box_srcntgt_counts_cumul) - resize_events.append(evt) - - box_has_children, evt = my_realloc_zeros(box_has_children) - resize_events.append(evt) - - box_centers, evts = zip( - *(my_realloc(ary) for ary in box_centers)) - resize_events.extend(evts) - - box_child_ids, evts = zip( - *(my_realloc_zeros_and_renumber(ary) - for ary in box_child_ids)) - resize_events.extend(evts) - - box_parent_ids, evt = my_realloc_zeros_and_renumber(box_parent_ids) - resize_events.append(evt) - - if not level_start_box_nrs_updated: - box_levels, evt = my_realloc(box_levels) - resize_events.append(evt) - else: - box_levels, evt = my_realloc_zeros_nocopy(box_levels) - wait_for_events([evt]) - for box_level, (level_start, level_end) in enumerate(zip( - level_start_box_nrs, level_start_box_nrs[1:])): - box_levels[level_start:level_end].fill(box_level) - resize_events.extend(box_levels.events) - - if level_start_box_nrs_updated: - srcntgt_box_ids, evt = renumber_array(srcntgt_box_ids) - resize_events.append(evt) - - del my_realloc_zeros - del my_realloc_nocopy - del my_realloc_zeros_nocopy - del renumber_array - - # Can't del on Py2.7 - these are used in generator expressions - # above, which are nested scopes - my_realloc = None - my_realloc_zeros_and_renumber = None - - # retry - logger.info("nboxes_guess exceeded: " - "enlarged allocations, restarting level") + del new_level_start_box_nrs + else: + from boxtree.tools import realloc_array as _realloc_array + realloc_array = partial(_realloc_array, actx) + realloc_and_renumber_array = realloc_array + renumber_array = None + level_start_box_nrs_updated = False + nboxes_new = nboxes_minimal - continue + del nboxes_minimal - # }}} + # }}} - logger.debug("LEVEL %d -> %d boxes" % 
(level, nboxes_new)) + # {{{ reallocate and/or renumber boxes if necessary - assert ( - level_start_box_nrs[-1] != nboxes_new - or srcntgts_have_extent - or final_level_restrict_iteration) + if level_start_box_nrs_updated or nboxes_new > nboxes_guess: + debug_with_finish("starting nboxes_guess increase") - if level_start_box_nrs[-1] == nboxes_new: - # We haven't created new boxes in this level loop trip. - # - # If srcntgts have extent, this can happen if boxes were - # in-principle overfull, but couldn't subdivide because of - # extent restrictions. - if srcntgts_have_extent and not final_level_restrict_iteration: - level -= 1 - break - assert final_level_restrict_iteration + while nboxes_guess < nboxes_new: + nboxes_guess *= 2 - # {{{ update level_start_box_nrs, level_used_box_counts + def my_realloc_nocopy(ary, shape=nboxes_guess): + return actx.zeros(shape=shape, dtype=ary.dtype) - level_start_box_nrs.append(nboxes_new) - level_start_box_nrs_dev[level + 1].fill(nboxes_new) - wait_for.extend(level_start_box_nrs_dev.events) + def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): + return actx.zeros(shape=shape, dtype=ary.dtype) - level_used_box_counts = new_level_used_box_counts - level_used_box_counts_dev[:level + 1] = \ - np.array(level_used_box_counts, dtype=box_id_dtype) - wait_for.extend(level_used_box_counts_dev.events) + my_realloc = partial(realloc_array, nboxes_guess) + my_realloc_zeros = partial( + realloc_array, nboxes_guess, zero_fill=True) + my_realloc_zeros_and_renumber = partial( + realloc_and_renumber_array, nboxes_guess, zero_fill=True) - level_leaf_counts = new_level_leaf_counts - if debug: - for level_start, level_nboxes, leaf_count in zip( - level_start_box_nrs, - level_used_box_counts, - level_leaf_counts): - if level_nboxes == 0: - assert leaf_count == 0 - continue - nleaves_actual = level_nboxes - int(actx.to_numpy( - actx.np.sum( - box_has_children[level_start:level_start + level_nboxes] - ) - )) - assert leaf_count == nleaves_actual - - # 
Can't del in Py2.7 - see note below - new_level_leaf_counts = None + split_box_ids = my_realloc_nocopy(split_box_ids) - # }}} + # *Most*, but not *all* of the values in this array are + # rewritten when the morton scan is redone. Specifically, + # only the box morton bin counts of boxes on the level + # currently being processed are written-but we need to + # retain the box morton bin counts from the higher levels. + box_morton_bin_counts = my_realloc_zeros(box_morton_bin_counts) - del nboxes_new - del new_level_used_box_counts + # force_split_box is unused unless level restriction is enabled. + if knl_info.level_restrict: + force_split_box = my_realloc_zeros(force_split_box) - # {{{ split boxes + box_srcntgt_starts = my_realloc_zeros(box_srcntgt_starts) + box_srcntgt_counts_cumul = my_realloc_zeros(box_srcntgt_counts_cumul) + box_has_children = my_realloc_zeros(box_has_children) - box_splitter_args = ( - common_args - + (box_has_children, force_split_box, root_extent) - + box_child_ids - + box_centers) + box_centers = tuple([my_realloc(ary) for ary in box_centers]) + box_child_ids = tuple([ + my_realloc_zeros_and_renumber(ary) for ary in box_child_ids + ]) + box_parent_ids = my_realloc_zeros_and_renumber(box_parent_ids) - evt = knl_info.box_splitter_kernel(*box_splitter_args, - range=slice(level_start_box_nrs[-1]), - wait_for=wait_for) + if not level_start_box_nrs_updated: + box_levels = my_realloc(box_levels) + else: + box_levels = my_realloc_zeros_nocopy(box_levels) + for box_level, (level_start, level_end) in enumerate(zip( + level_start_box_nrs, level_start_box_nrs[1:])): + box_levels[level_start:level_end].fill(box_level) - wait_for = [evt] + if level_start_box_nrs_updated: + srcntgt_box_ids = renumber_array(srcntgt_box_ids) - debug_with_finish("box splitter") + del my_realloc + del my_realloc_zeros + del my_realloc_nocopy + del my_realloc_zeros_nocopy + del my_realloc_zeros_and_renumber + del renumber_array - # Mark the levels of boxes added for padding 
(these were not updated - # by the box splitter kernel). - last_used_box = level_start_box_nrs[-2] + level_used_box_counts[-1] - box_levels[last_used_box:level_start_box_nrs[-1]].fill(level) + # retry + logger.info("nboxes_guess exceeded: " + "enlarged allocations, restarting level") - wait_for.extend(box_levels.events) + continue - if debug: - box_levels.finish() - level_bl_chunk = actx.to_numpy(box_levels)[ - level_start_box_nrs[-2]:level_start_box_nrs[-1]] - assert np.all(level_bl_chunk == level) - del level_bl_chunk + # }}} - if debug: - assert np.all(actx.to_numpy(box_srcntgt_starts) < nsrcntgts) + logger.debug("LEVEL %d -> %d boxes" % (level, nboxes_new)) - # }}} + assert ( + level_start_box_nrs[-1] != nboxes_new + or srcntgts_have_extent + or final_level_restrict_iteration) - # {{{ renumber particles within split boxes + if level_start_box_nrs[-1] == nboxes_new: + # We haven't created new boxes in this level loop trip. + # + # If srcntgts have extent, this can happen if boxes were + # in-principle overfull, but couldn't subdivide because of + # extent restrictions. 
+ if srcntgts_have_extent and not final_level_restrict_iteration: + level -= 1 + break + assert final_level_restrict_iteration - new_user_srcntgt_ids = actx.np.zeros_like(user_srcntgt_ids) - new_srcntgt_box_ids = actx.np.zeros_like(srcntgt_box_ids) + # {{{ update level_start_box_nrs, level_used_box_counts - particle_renumberer_args = ( - common_args - + (box_has_children, force_split_box, - new_user_srcntgt_ids, new_srcntgt_box_ids)) + level_start_box_nrs.append(nboxes_new) + level_start_box_nrs_dev[level + 1].fill(nboxes_new) - evt = knl_info.particle_renumberer_kernel(*particle_renumberer_args, - range=slice(nsrcntgts), wait_for=wait_for) + level_used_box_counts = new_level_used_box_counts + level_used_box_counts_dev[:level + 1] = ( + np.array(level_used_box_counts, dtype=box_id_dtype)) - wait_for = [evt] + level_leaf_counts = new_level_leaf_counts + if debug: + for level_start, level_nboxes, leaf_count in zip( + level_start_box_nrs, + level_used_box_counts, + level_leaf_counts): + if level_nboxes == 0: + assert leaf_count == 0 + continue + nleaves_actual = level_nboxes - int(actx.to_numpy( + actx.np.sum( + box_has_children[level_start:level_start + level_nboxes] + ) + )) + assert leaf_count == nleaves_actual - debug_with_finish("particle renumbering") + # Can't del in Py2.7 - see note below + new_level_leaf_counts = None - user_srcntgt_ids = new_user_srcntgt_ids - del new_user_srcntgt_ids - srcntgt_box_ids = new_srcntgt_box_ids - del new_srcntgt_box_ids + # }}} - # }}} + del nboxes_new + del new_level_used_box_counts - # {{{ enforce level restriction on upper levels + # {{{ split boxes - if final_level_restrict_iteration: - # Roll back level update. - # - # FIXME: The extra iteration at the end to split boxes should - # not be necessary. Instead, all the work for the final box - # split should be done in the last iteration of the level - # loop. 
Currently the main issue that forces the extra iteration - # to be there is the need to use the box renumbering and - # reallocation code. In order to fix this issue, the box - # numbering and reallocation code needs to be accessible after - # the final level restriction is done. - assert int(actx.to_numpy(have_oversize_split_box)) == 0 - assert level_used_box_counts[-1] == 0 - del level_used_box_counts[-1] - del level_start_box_nrs[-1] - level -= 1 - break + box_splitter_args = ( + common_args + + (box_has_children, force_split_box, root_extent) + + box_child_ids + + box_centers) - if knl_info.level_restrict: - # Avoid generating too many kernels. - LEVEL_STEP = 10 # noqa - if level % LEVEL_STEP == 1: - level_restrict_kernel = knl_info.level_restrict_kernel_builder( - LEVEL_STEP * div_ceil(level, LEVEL_STEP)) + knl_info.box_splitter_kernel(*box_splitter_args, + range=slice(level_start_box_nrs[-1]), + queue=actx.queue, + ) - # Upward pass - check if leaf boxes at higher levels need - # further splitting. - assert len(force_split_box) > 0 - force_split_box.fill(0) - wait_for.extend(force_split_box.events) + debug_with_finish("box splitter") - did_upper_level_split = False + # Mark the levels of boxes added for padding (these were not updated + # by the box splitter kernel). + last_used_box = level_start_box_nrs[-2] + level_used_box_counts[-1] + box_levels[last_used_box:level_start_box_nrs[-1]].fill(level) - if debug: - boxes_split = [] - - for upper_level, upper_level_start, upper_level_box_count in zip( - # We just built level. Our parent level doesn't need to - # be rechecked for splitting because the smallest boxes - # in the tree (ours) already have a 2-to-1 ratio with - # that. Start checking at the level above our parent. - range(level - 2, 0, -1), - # At this point, the last entry in level_start_box_nrs - # already refers to (level + 1). 
- level_start_box_nrs[-4::-1], - level_used_box_counts[-3::-1]): - - upper_level_slice = slice( - upper_level_start, upper_level_start + upper_level_box_count) - - have_upper_level_split_box.fill(0) - wait_for.extend(have_upper_level_split_box.events) - - # writes: force_split_box, have_upper_level_split_box - evt = level_restrict_kernel( - upper_level, - root_extent, - box_has_children, - force_split_box, - have_upper_level_split_box, - *(box_child_ids + box_centers), - slice=upper_level_slice, - wait_for=wait_for) - - wait_for = [evt] - - if debug: - force_split_box.finish() - boxes_split.append(int(actx.to_numpy( - actx.np.sum(force_split_box[upper_level_slice]) - ))) - - if int(actx.to_numpy(have_upper_level_split_box)) == 0: - break - - did_upper_level_split = True + if debug: + level_bl_chunk = actx.to_numpy(box_levels)[ + level_start_box_nrs[-2]:level_start_box_nrs[-1]] + assert np.all(level_bl_chunk == level) + del level_bl_chunk - if debug: - total_boxes_split = sum(boxes_split) - logger.debug("level restriction: {total_boxes_split} boxes split" - .format(total_boxes_split=total_boxes_split)) - from itertools import count - for level_, nboxes_split in zip( - count(level - 2, step=-1), boxes_split[:-1]): - logger.debug("level {level}: {nboxes_split} boxes split" - .format(level=level_, nboxes_split=nboxes_split)) - del boxes_split - - if (int(actx.to_numpy(have_oversize_split_box)) == 0 - and did_upper_level_split): - # We are in the situation where there are boxes left to - # split on upper levels, and the level loop is done creating - # lower levels. - # - # We re-run the level loop one more time to finish creating - # the upper level boxes. 
- final_level_restrict_iteration = True - level += 1 - continue + if debug: + assert np.all(actx.to_numpy(box_srcntgt_starts) < nsrcntgts) - # }}} + # }}} - if not int(actx.to_numpy(have_oversize_split_box)): - logger.debug("no boxes left to split") - break + # {{{ renumber particles within split boxes - level += 1 - have_oversize_split_box.fill(0) + new_user_srcntgt_ids = actx.np.zeros_like(user_srcntgt_ids) + new_srcntgt_box_ids = actx.np.zeros_like(srcntgt_box_ids) - # {{{ check that nonchild part of box_morton_bin_counts is consistent + particle_renumberer_args = ( + common_args + + (box_has_children, force_split_box, + new_user_srcntgt_ids, new_srcntgt_box_ids)) - if debug and 0: - h_box_morton_bin_counts = actx.to_numpy(box_morton_bin_counts) - h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul) - h_box_child_ids = tuple([ - actx.to_numpy(bci) for bci in box_child_ids - ]) + knl_info.particle_renumberer_kernel( + *particle_renumberer_args, range=slice(nsrcntgts), + queue=actx.queue, + ) - has_mismatch = False - for ibox in range(level_start_box_nrs[-1]): - is_leaf = all(bci[ibox] == 0 for bci in h_box_child_ids) - if is_leaf: - # nonchild count only found in box_info kernel - continue + debug_with_finish("particle renumbering") - if h_box_srcntgt_counts_cumul[ibox] == 0: - # empty boxes don't have box_morton_bin_counts written - continue + user_srcntgt_ids = new_user_srcntgt_ids + del new_user_srcntgt_ids + srcntgt_box_ids = new_srcntgt_box_ids + del new_srcntgt_box_ids - kid_sum = sum( - h_box_srcntgt_counts_cumul[bci[ibox]] - for bci in h_box_child_ids - if bci[ibox] != 0) + # }}} - if ( - h_box_srcntgt_counts_cumul[ibox] - != (h_box_morton_bin_counts[ibox]["nonchild_srcntgts"] - + kid_sum)): - print("MISMATCH", level, ibox) - has_mismatch = True + # {{{ enforce level restriction on upper levels - assert not has_mismatch - print("LEVEL %d OK" % level) + if final_level_restrict_iteration: + # Roll back level update. 
+ # + # FIXME: The extra iteration at the end to split boxes should + # not be necessary. Instead, all the work for the final box + # split should be done in the last iteration of the level + # loop. Currently the main issue that forces the extra iteration + # to be there is the need to use the box renumbering and + # reallocation code. In order to fix this issue, the box + # numbering and reallocation code needs to be accessible after + # the final level restriction is done. + assert int(actx.to_numpy(have_oversize_split_box)) == 0 + assert level_used_box_counts[-1] == 0 + del level_used_box_counts[-1] + del level_start_box_nrs[-1] + level -= 1 + break + + if knl_info.level_restrict: + # Avoid generating too many kernels. + LEVEL_STEP = 10 # noqa + if level % LEVEL_STEP == 1: + level_restrict_kernel = knl_info.level_restrict_kernel_builder( + LEVEL_STEP * div_ceil(level, LEVEL_STEP)) + + # Upward pass - check if leaf boxes at higher levels need + # further splitting. + assert len(force_split_box) > 0 + force_split_box.fill(0) + + did_upper_level_split = False - # Cannot delete in Py 2.7: referred to from nested scope. - h_box_srcntgt_counts_cumul = None + if debug: + boxes_split = [] + + for upper_level, upper_level_start, upper_level_box_count in zip( + # We just built level. Our parent level doesn't need to + # be rechecked for splitting because the smallest boxes + # in the tree (ours) already have a 2-to-1 ratio with + # that. Start checking at the level above our parent. + range(level - 2, 0, -1), + # At this point, the last entry in level_start_box_nrs + # already refers to (level + 1). 
+ level_start_box_nrs[-4::-1], + level_used_box_counts[-3::-1]): + + upper_level_slice = slice( + upper_level_start, upper_level_start + upper_level_box_count) + + have_upper_level_split_box.fill(0) + + # writes: force_split_box, have_upper_level_split_box + level_restrict_kernel( + upper_level, + root_extent, + box_has_children, + force_split_box, + have_upper_level_split_box, + *(box_child_ids + box_centers), + slice=upper_level_slice, + queue=actx.queue, + ) - del h_box_morton_bin_counts - del h_box_child_ids + if debug: + boxes_split.append(int(actx.to_numpy( + actx.np.sum(force_split_box[upper_level_slice]) + ))) - # }}} + if int(actx.to_numpy(have_upper_level_split_box)) == 0: + break - nboxes = level_start_box_nrs[-1] + did_upper_level_split = True - npasses = level+1 - level_loop_proc.done("%d levels, %d boxes", level, nboxes) - del npasses + if debug: + total_boxes_split = sum(boxes_split) + logger.debug("level restriction: {total_boxes_split} boxes split" + .format(total_boxes_split=total_boxes_split)) + from itertools import count + for level_, nboxes_split in zip( + count(level - 2, step=-1), boxes_split[:-1]): + logger.debug("level {level}: {nboxes_split} boxes split" + .format(level=level_, nboxes_split=nboxes_split)) + del boxes_split + + if (int(actx.to_numpy(have_oversize_split_box)) == 0 + and did_upper_level_split): + # We are in the situation where there are boxes left to + # split on upper levels, and the level loop is done creating + # lower levels. + # + # We re-run the level loop one more time to finish creating + # the upper level boxes. 
+ final_level_restrict_iteration = True + level += 1 + continue # }}} - # {{{ extract number of non-child srcntgts from box morton counts + if not int(actx.to_numpy(have_oversize_split_box)): + logger.debug("no boxes left to split") + break - if srcntgts_have_extent: - box_srcntgt_counts_nonchild = actx.empty(nboxes, particle_id_dtype) - debug_with_finish("extract non-child srcntgt count") + level += 1 + have_oversize_split_box.fill(0) - assert len(level_start_box_nrs) >= 2 - highest_possibly_split_box_nr = level_start_box_nrs[-2] + # {{{ check that nonchild part of box_morton_bin_counts is consistent - evt = knl_info.extract_nonchild_srcntgt_count_kernel( - # input - box_morton_bin_counts, - box_srcntgt_counts_cumul, - highest_possibly_split_box_nr, + if debug and 0: + h_box_morton_bin_counts = actx.to_numpy(box_morton_bin_counts) + h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul) + h_box_child_ids = tuple([ + actx.to_numpy(bci) for bci in box_child_ids + ]) - # output - box_srcntgt_counts_nonchild, + has_mismatch = False + for ibox in range(level_start_box_nrs[-1]): + is_leaf = all(bci[ibox] == 0 for bci in h_box_child_ids) + if is_leaf: + # nonchild count only found in box_info kernel + continue - range=slice(nboxes), wait_for=wait_for) - wait_for = [evt] + if h_box_srcntgt_counts_cumul[ibox] == 0: + # empty boxes don't have box_morton_bin_counts written + continue - del highest_possibly_split_box_nr + kid_sum = sum( + h_box_srcntgt_counts_cumul[bci[ibox]] + for bci in h_box_child_ids + if bci[ibox] != 0) - if debug: - h_box_srcntgt_counts_nonchild = ( - actx.to_numpy(box_srcntgt_counts_nonchild)) - h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul) + if ( + h_box_srcntgt_counts_cumul[ibox] + != (h_box_morton_bin_counts[ibox]["nonchild_srcntgts"] + + kid_sum)): + print("MISMATCH", level, ibox) + has_mismatch = True - assert np.all( - h_box_srcntgt_counts_nonchild - <= h_box_srcntgt_counts_cumul[:nboxes]) + assert not 
has_mismatch + print("LEVEL %d OK" % level) - del h_box_srcntgt_counts_nonchild + # Cannot delete in Py 2.7: referred to from nested scope. + h_box_srcntgt_counts_cumul = None - # Cannot delete in Py 2.7: referred to from nested scope. - h_box_srcntgt_counts_cumul = None + del h_box_morton_bin_counts + del h_box_child_ids # }}} - del morton_nrs - del box_morton_bin_counts + nboxes = level_start_box_nrs[-1] - # {{{ prune empty/unused leaf boxes + npasses = level+1 + level_loop_proc.done("%d levels, %d boxes", level, nboxes) + del npasses - prune_empty_leaves = not kwargs.get("skip_prune") - - if prune_empty_leaves: - # What is the original index of this box? - src_box_id = actx.empty(nboxes, box_id_dtype) + # }}} - # Where should I put this box? - # - # Initialize to all zeros, because pruned boxes should be mapped to - # zero (e.g. when pruning child_box_ids). - dst_box_id, evt = zeros(nboxes, box_id_dtype) - wait_for.append(evt) - - debug_with_finish("find prune indices") - - nboxes_post_prune_dev = actx.empty((), dtype=box_id_dtype) - evt = knl_info.find_prune_indices_kernel( - box_srcntgt_counts_cumul, - src_box_id, dst_box_id, nboxes_post_prune_dev, - size=nboxes, wait_for=wait_for) - wait_for = [evt] - nboxes_post_prune = int(actx.to_numpy(nboxes_post_prune_dev)) - logger.debug("{} boxes after pruning " - "({} empty leaves and/or unused boxes removed)" - .format(nboxes_post_prune, nboxes - nboxes_post_prune)) - should_prune = True - elif knl_info.level_restrict: - # Remove unused boxes from the tree. 
- src_box_id = actx.empty(nboxes, box_id_dtype) - dst_box_id = actx.empty(nboxes, box_id_dtype) - - new_level_start_box_nrs = np.zeros_like(level_start_box_nrs) - new_level_start_box_nrs[0] = 0 - new_level_start_box_nrs[1:] = np.cumsum(level_used_box_counts) - for level_start, new_level_start, level_used_box_count in zip( - level_start_box_nrs, new_level_start_box_nrs, - level_used_box_counts): + # {{{ extract number of non-child srcntgts from box morton counts - def make_slice(start, offset=level_used_box_count): - return slice(start, start + offset) + if srcntgts_have_extent: + box_srcntgt_counts_nonchild = actx.empty(nboxes, particle_id_dtype) + debug_with_finish("extract non-child srcntgt count") - def make_arange(start, offset=level_used_box_count): - return actx.from_numpy( - np.arange(start, start + offset, dtype=box_id_dtype) - ) + assert len(level_start_box_nrs) >= 2 + highest_possibly_split_box_nr = level_start_box_nrs[-2] - src_box_id[make_slice(new_level_start)] = make_arange(level_start) - dst_box_id[make_slice(level_start)] = make_arange(new_level_start) - wait_for.extend(src_box_id.events + dst_box_id.events) + knl_info.extract_nonchild_srcntgt_count_kernel( + # input + box_morton_bin_counts, + box_srcntgt_counts_cumul, + highest_possibly_split_box_nr, - nboxes_post_prune = new_level_start_box_nrs[-1] + # output + box_srcntgt_counts_nonchild, - logger.info("{} boxes after pruning ({} unused boxes removed)" - .format(nboxes_post_prune, nboxes - nboxes_post_prune)) - should_prune = True - else: - should_prune = False + range=slice(nboxes), + queue=actx.queue, + ) - if should_prune: - prune_events = [] + del highest_possibly_split_box_nr - prune_empty = partial(self.gappy_copy_and_map, - actx, nboxes_post_prune, - src_indices=src_box_id, - range=slice(nboxes_post_prune), debug=debug) + if debug: + h_box_srcntgt_counts_nonchild = ( + actx.to_numpy(box_srcntgt_counts_nonchild)) + h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul) - 
box_srcntgt_starts, evt = prune_empty(box_srcntgt_starts) - prune_events.append(evt) + assert np.all( + h_box_srcntgt_counts_nonchild + <= h_box_srcntgt_counts_cumul[:nboxes]) - box_srcntgt_counts_cumul, evt = prune_empty(box_srcntgt_counts_cumul) - prune_events.append(evt) + del h_box_srcntgt_counts_nonchild - if debug and prune_empty_leaves: - assert np.all(actx.to_numpy(box_srcntgt_counts_cumul) > 0) + # Cannot delete in Py 2.7: referred to from nested scope. + h_box_srcntgt_counts_cumul = None - srcntgt_box_ids, evt = self.map_values_kernel( - dst_box_id, srcntgt_box_ids) - prune_events.append(evt) + # }}} - box_parent_ids, evt = prune_empty(box_parent_ids, map_values=dst_box_id) - prune_events.append(evt) + del morton_nrs + del box_morton_bin_counts - box_levels, evt = prune_empty(box_levels) - prune_events.append(evt) + # {{{ prune empty/unused leaf boxes - if srcntgts_have_extent: - box_srcntgt_counts_nonchild, evt = prune_empty( - box_srcntgt_counts_nonchild) - prune_events.append(evt) + prune_empty_leaves = not kwargs.get("skip_prune") - box_has_children, evt = prune_empty(box_has_children) - prune_events.append(evt) + if prune_empty_leaves: + # What is the original index of this box? + src_box_id = actx.empty(nboxes, box_id_dtype) - box_child_ids, evts = zip( - *(prune_empty(ary, map_values=dst_box_id) - for ary in box_child_ids)) - prune_events.extend(evts) + # Where should I put this box? + # + # Initialize to all zeros, because pruned boxes should be mapped to + # zero (e.g. when pruning child_box_ids). 
+ dst_box_id = actx.zeros(nboxes, box_id_dtype) + + debug_with_finish("find prune indices") + + nboxes_post_prune_dev = actx.empty((), dtype=box_id_dtype) + knl_info.find_prune_indices_kernel( + box_srcntgt_counts_cumul, + src_box_id, dst_box_id, nboxes_post_prune_dev, + size=nboxes, + queue=actx.queue, + allocator=actx.allocator, + ) + nboxes_post_prune = int(actx.to_numpy(nboxes_post_prune_dev).item()) + + logger.debug( + "%d boxes after pruning (%d empty leaves and/or unused boxes removed)", + nboxes_post_prune, nboxes - nboxes_post_prune) + should_prune = True + + elif knl_info.level_restrict: + # Remove unused boxes from the tree. + src_box_id = actx.empty(nboxes, box_id_dtype) + dst_box_id = actx.empty(nboxes, box_id_dtype) + + new_level_start_box_nrs = np.zeros_like(level_start_box_nrs) + new_level_start_box_nrs[0] = 0 + new_level_start_box_nrs[1:] = np.cumsum(level_used_box_counts) + for level_start, new_level_start, level_used_box_count in zip( + level_start_box_nrs, new_level_start_box_nrs, + level_used_box_counts): + + def make_slice(start, offset=level_used_box_count): + return slice(start, start + offset) + + def make_arange(start, offset=level_used_box_count): + return actx.from_numpy( + np.arange(start, start + offset, dtype=box_id_dtype) + ) - box_centers, evts = zip( - *(prune_empty(ary) for ary in box_centers)) - prune_events.extend(evts) + src_box_id[make_slice(new_level_start)] = make_arange(level_start) + dst_box_id[make_slice(level_start)] = make_arange(new_level_start) - # Update box counts and level start box indices. 
- box_levels.finish() + nboxes_post_prune = new_level_start_box_nrs[-1] - evt = knl_info.find_level_box_counts_kernel( - box_levels, level_used_box_counts_dev) - wait_for_events([evt]) + logger.info("%d boxes after pruning (%d unused boxes removed)", + nboxes_post_prune, nboxes - nboxes_post_prune) + should_prune = True - nlevels = len(level_used_box_counts) - level_used_box_counts = ( - actx.to_numpy(level_used_box_counts_dev[:nlevels])) + else: + should_prune = False - level_start_box_nrs = [0] - level_start_box_nrs.extend(np.cumsum(level_used_box_counts)) + if should_prune: + prune_empty = partial(copy_and_map_gappy, + actx, nboxes_post_prune, + src_indices=src_box_id, + range=slice(nboxes_post_prune), debug=debug) - level_start_box_nrs_dev[:nlevels + 1] = np.array( - level_start_box_nrs, dtype=box_id_dtype) - prune_events.extend(level_start_box_nrs_dev.events) + box_srcntgt_starts = prune_empty(box_srcntgt_starts) + box_srcntgt_counts_cumul = prune_empty(box_srcntgt_counts_cumul) - wait_for = prune_events - else: - logger.info("skipping empty-leaf pruning") - nboxes_post_prune = nboxes + if debug and prune_empty_leaves: + assert np.all(actx.to_numpy(box_srcntgt_counts_cumul) > 0) - level_start_box_nrs = np.array(level_start_box_nrs, box_id_dtype) + srcntgt_box_ids = map_values(actx, dst_box_id, srcntgt_box_ids) + box_parent_ids = prune_empty(box_parent_ids, mapping=dst_box_id) + box_levels = prune_empty(box_levels) - # }}} + if srcntgts_have_extent: + box_srcntgt_counts_nonchild = ( + prune_empty(box_srcntgt_counts_nonchild)) - del nboxes + box_has_children = prune_empty(box_has_children) - # {{{ compute source/target particle indices and counts in each box + box_child_ids = tuple([ + prune_empty(ary, mapping=dst_box_id) + for ary in box_child_ids]) + box_centers = tuple([prune_empty(ary) for ary in box_centers]) - if targets is None: - from boxtree.tools import reverse_index_array - user_source_ids = user_srcntgt_ids - sorted_target_ids = 
reverse_index_array(actx, user_srcntgt_ids) + # Update box counts and level start box indices. + knl_info.find_level_box_counts_kernel( + box_levels, level_used_box_counts_dev, + queue=actx.queue, + allocator=actx.allocator, + ) - box_source_starts = box_target_starts = box_srcntgt_starts - box_source_counts_cumul = box_target_counts_cumul = \ - box_srcntgt_counts_cumul - if srcntgts_have_extent: - box_source_counts_nonchild = box_target_counts_nonchild = \ - box_srcntgt_counts_nonchild - else: - source_numbers = actx.empty(nsrcntgts, particle_id_dtype) - - debug_with_finish("source counter") - evt = knl_info.source_counter(user_srcntgt_ids, nsources, - source_numbers, queue=actx.queue, allocator=actx.allocator, - wait_for=wait_for) - wait_for = [evt] - - user_source_ids = actx.empty(nsources, particle_id_dtype) - # srcntgt_target_ids is temporary until particle permutation is done - srcntgt_target_ids = actx.empty(ntargets, particle_id_dtype) - sorted_target_ids = actx.empty(ntargets, particle_id_dtype) - - # need to use zeros because parent boxes won't be initialized - box_source_starts, evt = zeros(nboxes_post_prune, particle_id_dtype) - wait_for.append(evt) - box_source_counts_cumul, evt = zeros( - nboxes_post_prune, particle_id_dtype) - wait_for.append(evt) - box_target_starts, evt = zeros( - nboxes_post_prune, particle_id_dtype) - wait_for.append(evt) - box_target_counts_cumul, evt = zeros( - nboxes_post_prune, particle_id_dtype) - wait_for.append(evt) - - if srcntgts_have_extent: - box_source_counts_nonchild, evt = zeros( - nboxes_post_prune, particle_id_dtype) - wait_for.append(evt) - box_target_counts_nonchild, evt = zeros( - nboxes_post_prune, particle_id_dtype) - wait_for.append(evt) - - debug_with_finish("source and target index finder") - evt = knl_info.source_and_target_index_finder(*( - # input: - ( - user_srcntgt_ids, nsources, srcntgt_box_ids, - box_parent_ids, + nlevels = len(level_used_box_counts) + level_used_box_counts = ( + 
actx.to_numpy(level_used_box_counts_dev[:nlevels])) - box_srcntgt_starts, box_srcntgt_counts_cumul, - source_numbers, - ) - + ((box_srcntgt_counts_nonchild,) - if srcntgts_have_extent else ()) + level_start_box_nrs = [0] + level_start_box_nrs.extend(np.cumsum(level_used_box_counts)) - # output: - + ( - user_source_ids, srcntgt_target_ids, sorted_target_ids, - box_source_starts, box_source_counts_cumul, - box_target_starts, box_target_counts_cumul, - ) - + (( - box_source_counts_nonchild, - box_target_counts_nonchild, - ) if srcntgts_have_extent else ()) - ), - queue=actx.queue, range=slice(nsrcntgts), - wait_for=wait_for) - wait_for = [evt] + level_start_box_nrs_dev[:nlevels + 1] = np.array( + level_start_box_nrs, dtype=box_id_dtype) + else: + logger.info("skipping empty-leaf pruning") + nboxes_post_prune = nboxes - if srcntgts_have_extent: - if debug: - assert np.all(actx.to_numpy( - box_srcntgt_counts_nonchild - == (box_source_counts_nonchild + box_target_counts_nonchild) - )) + level_start_box_nrs = np.array(level_start_box_nrs, box_id_dtype) - if debug: - usi_host = actx.to_numpy(user_source_ids) - assert np.all(usi_host < nsources) - assert np.all(0 <= usi_host) - del usi_host + # }}} - sti_host = actx.to_numpy(srcntgt_target_ids) - assert np.all(sti_host < nsources+ntargets) - assert np.all(nsources <= sti_host) - del sti_host + del nboxes - assert np.all(actx.to_numpy( - box_source_counts_cumul + box_target_counts_cumul - == box_srcntgt_counts_cumul - )) + # {{{ compute source/target particle indices and counts in each box - del source_numbers + if targets is None: + from boxtree.tools import reverse_index_array + user_source_ids = user_srcntgt_ids + sorted_target_ids = reverse_index_array(actx, user_srcntgt_ids) - del box_srcntgt_starts + box_source_starts = box_target_starts = box_srcntgt_starts + box_source_counts_cumul = box_target_counts_cumul = \ + box_srcntgt_counts_cumul if srcntgts_have_extent: - del box_srcntgt_counts_nonchild + 
box_source_counts_nonchild = box_target_counts_nonchild = \ + box_srcntgt_counts_nonchild + else: + source_numbers = actx.empty(nsrcntgts, particle_id_dtype) + + debug_with_finish("source counter") + knl_info.source_counter( + user_srcntgt_ids, nsources, source_numbers, + queue=actx.queue, + allocator=actx.allocator, + ) - # }}} + user_source_ids = actx.empty(nsources, particle_id_dtype) + # srcntgt_target_ids is temporary until particle permutation is done + srcntgt_target_ids = actx.empty(ntargets, particle_id_dtype) + sorted_target_ids = actx.empty(ntargets, particle_id_dtype) + + # need to use zeros because parent boxes won't be initialized + box_source_starts = actx.zeros(nboxes_post_prune, particle_id_dtype) + box_source_counts_cumul = actx.zeros(nboxes_post_prune, particle_id_dtype) + box_target_starts = actx.zeros(nboxes_post_prune, particle_id_dtype) + box_target_counts_cumul = actx.zeros(nboxes_post_prune, particle_id_dtype) - # {{{ permute and source/target-split (if necessary) particle array + if srcntgts_have_extent: + box_source_counts_nonchild = ( + actx.zeros(nboxes_post_prune, particle_id_dtype)) + box_target_counts_nonchild = ( + actx.zeros(nboxes_post_prune, particle_id_dtype)) + + debug_with_finish("source and target index finder") + knl_info.source_and_target_index_finder(*( + # input: + ( + user_srcntgt_ids, nsources, srcntgt_box_ids, + box_parent_ids, + + box_srcntgt_starts, box_srcntgt_counts_cumul, + source_numbers, + ) + + ((box_srcntgt_counts_nonchild,) + if srcntgts_have_extent else ()) + + # output: + + ( + user_source_ids, srcntgt_target_ids, sorted_target_ids, + box_source_starts, box_source_counts_cumul, + box_target_starts, box_target_counts_cumul, + ) + + (( + box_source_counts_nonchild, + box_target_counts_nonchild, + ) if srcntgts_have_extent else ()) + ), + queue=actx.queue, range=slice(nsrcntgts), + ) - if targets is None: - sources = targets = actx.np.zeros_like(srcntgts) + if srcntgts_have_extent: + if debug: + assert 
np.all(actx.to_numpy( + box_srcntgt_counts_nonchild + == (box_source_counts_nonchild + box_target_counts_nonchild) + )) - debug_with_finish("srcntgt permuter (particles)") - evt = knl_info.srcntgt_permuter( - user_srcntgt_ids, - *(tuple(srcntgts) + tuple(sources)), - wait_for=wait_for) - wait_for = [evt] + if debug: + usi_host = actx.to_numpy(user_source_ids) + assert np.all(usi_host < nsources) + assert np.all(0 <= usi_host) + del usi_host - assert srcntgt_radii is None + sti_host = actx.to_numpy(srcntgt_target_ids) + assert np.all(sti_host < nsources+ntargets) + assert np.all(nsources <= sti_host) + del sti_host - else: - sources = make_obj_array([ - actx.empty(nsources, coord_dtype) for i in range(dimensions) - ]) - debug_with_finish("srcntgt permuter (sources)") - evt = knl_info.srcntgt_permuter( - user_source_ids, - *(tuple(srcntgts) + tuple(sources)), - queue=actx.queue, range=slice(nsources), - wait_for=wait_for) - wait_for = [evt] - - targets = make_obj_array([ - actx.empty(ntargets, coord_dtype) for i in range(dimensions) - ]) - debug_with_finish("srcntgt permuter (targets)") - evt = knl_info.srcntgt_permuter( - srcntgt_target_ids, - *(tuple(srcntgts) + tuple(targets)), - queue=actx.queue, range=slice(ntargets), - wait_for=wait_for) - wait_for = [evt] + assert np.all(actx.to_numpy( + box_source_counts_cumul + box_target_counts_cumul + == box_srcntgt_counts_cumul + )) - if srcntgt_radii is not None: - import pyopencl.array as cl_array - debug_with_finish("srcntgt permuter (source radii)") - source_radii = cl_array.take( - srcntgt_radii, user_source_ids, queue=actx.queue, - wait_for=wait_for) + del source_numbers - debug_with_finish("srcntgt permuter (target radii)") - target_radii = cl_array.take( - srcntgt_radii, srcntgt_target_ids, queue=actx.queue, - wait_for=wait_for) + del box_srcntgt_starts + if srcntgts_have_extent: + del box_srcntgt_counts_nonchild - wait_for = source_radii.events + target_radii.events + # }}} - del srcntgt_target_ids + # {{{ 
permute and source/target-split (if necessary) particle array - del srcntgt_radii + if targets is None: + sources = targets = actx.np.zeros_like(srcntgts) - # }}} + debug_with_finish("srcntgt permuter (particles)") + knl_info.srcntgt_permuter( + user_srcntgt_ids, + *(tuple(srcntgts) + tuple(sources)), + queue=actx.queue, + ) - del srcntgts + assert srcntgt_radii is None + + else: + sources = make_obj_array([ + actx.empty(nsources, coord_dtype) for i in range(dimensions) + ]) + debug_with_finish("srcntgt permuter (sources)") + knl_info.srcntgt_permuter( + user_source_ids, + *(tuple(srcntgts) + tuple(sources)), + queue=actx.queue, range=slice(nsources), + ) - nlevels = len(level_start_box_nrs) - 1 + targets = make_obj_array([ + actx.empty(ntargets, coord_dtype) for i in range(dimensions) + ]) + debug_with_finish("srcntgt permuter (targets)") + knl_info.srcntgt_permuter( + srcntgt_target_ids, + *(tuple(srcntgts) + tuple(targets)), + queue=actx.queue, range=slice(ntargets), + ) - assert nlevels == len(level_used_box_counts) - assert level + 1 == nlevels, (level+1, nlevels) - if debug: - max_level = np.max(actx.to_numpy(box_levels)) - assert max_level + 1 == nlevels + if srcntgt_radii is not None: + debug_with_finish("srcntgt permuter (source radii)") + source_radii = srcntgt_radii[user_source_ids] - # {{{ gather box child ids, box centers + debug_with_finish("srcntgt permuter (target radii)") + target_radii = srcntgt_radii[srcntgt_target_ids] - # A number of arrays below are nominally 2-dimensional and stored with - # the box index as the fastest-moving index. To make sure that accesses - # remain aligned, we round up the number of boxes used for indexing. 
- aligned_nboxes = div_ceil(nboxes_post_prune, 32)*32 + del srcntgt_target_ids - box_child_ids_new, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype) - wait_for.append(evt) - box_centers_new = actx.empty((dimensions, aligned_nboxes), coord_dtype) + del srcntgt_radii - for mnr, child_row in enumerate(box_child_ids): - box_child_ids_new[mnr, :nboxes_post_prune] = \ - child_row[:nboxes_post_prune] - wait_for.extend(box_child_ids_new.events) + # }}} - for dim, center_row in enumerate(box_centers): - box_centers_new[dim, :nboxes_post_prune] = center_row[:nboxes_post_prune] - wait_for.extend(box_centers_new.events) + del srcntgts - wait_for_events(wait_for) + nlevels = len(level_start_box_nrs) - 1 - box_centers = box_centers_new - box_child_ids = box_child_ids_new + assert nlevels == len(level_used_box_counts) + assert level + 1 == nlevels, (level+1, nlevels) + if debug: + max_level = np.max(actx.to_numpy(box_levels)) + assert max_level + 1 == nlevels - del box_centers_new - del box_child_ids_new + # {{{ gather box child ids, box centers - # }}} + # A number of arrays below are nominally 2-dimensional and stored with + # the box index as the fastest-moving index. To make sure that accesses + # remain aligned, we round up the number of boxes used for indexing. + aligned_nboxes = div_ceil(nboxes_post_prune, 32)*32 - # {{{ compute box flags + box_child_ids_new = actx.zeros((2**dimensions, aligned_nboxes), box_id_dtype) + box_centers_new = actx.empty((dimensions, aligned_nboxes), coord_dtype) - from boxtree.tree import box_flags_enum - box_flags = actx.empty(nboxes_post_prune, box_flags_enum.dtype) + for mnr, child_row in enumerate(box_child_ids): + box_child_ids_new[mnr, :nboxes_post_prune] = \ + child_row[:nboxes_post_prune] - if not srcntgts_have_extent: - # If srcntgts_have_extent, then non-child counts have already been - # computed, and we have nothing to do here. But if not, then - # we must fill these non-child counts. 
This amounts to copying - # the cumulative counts and setting them to zero for non-leaves. + for dim, center_row in enumerate(box_centers): + box_centers_new[dim, :nboxes_post_prune] = center_row[:nboxes_post_prune] - # {{{ make sure box_{source,target}_counts_nonchild are not defined + box_centers = box_centers_new + box_child_ids = box_child_ids_new - # (before we overwrite them) + del box_centers_new + del box_child_ids_new - try: - box_source_counts_nonchild - except NameError: - pass - else: - raise AssertionError + # }}} - try: - box_target_counts_nonchild - except NameError: - pass - else: - raise AssertionError + # {{{ compute box flags - # }}} + from boxtree.tree import box_flags_enum + box_flags = actx.empty(nboxes_post_prune, box_flags_enum.dtype) - box_source_counts_nonchild, evt = zeros( - nboxes_post_prune, particle_id_dtype) - wait_for.append(evt) + if not srcntgts_have_extent: + # If srcntgts_have_extent, then non-child counts have already been + # computed, and we have nothing to do here. But if not, then + # we must fill these non-child counts. This amounts to copying + # the cumulative counts and setting them to zero for non-leaves. 
- if sources_are_targets: - box_target_counts_nonchild = box_source_counts_nonchild - else: - box_target_counts_nonchild, evt = zeros( - nboxes_post_prune, particle_id_dtype) - wait_for.append(evt) - - debug_with_finish("compute box info") - evt = knl_info.box_info_kernel( - *( - # input: - box_parent_ids, box_srcntgt_counts_cumul, - box_source_counts_cumul, box_target_counts_cumul, - box_has_children, box_levels, nlevels, - - # output if srcntgts_have_extent, input+output otherwise - box_source_counts_nonchild, box_target_counts_nonchild, - - # output: - box_flags, - ), - range=slice(nboxes_post_prune), - wait_for=wait_for) + # {{{ make sure box_{source,target}_counts_nonchild are not defined - # }}} + # (before we overwrite them) - del box_has_children - wait_for = [evt] + try: + box_source_counts_nonchild + except NameError: + pass + else: + raise AssertionError - # {{{ compute box bounding box + try: + box_target_counts_nonchild + except NameError: + pass + else: + raise AssertionError - debug_with_finish("finding box extents") + # }}} - box_source_bounding_box_min = actx.empty( - (dimensions, aligned_nboxes), dtype=coord_dtype) - box_source_bounding_box_max = actx.empty( - (dimensions, aligned_nboxes), dtype=coord_dtype) + box_source_counts_nonchild = ( + actx.zeros(nboxes_post_prune, particle_id_dtype)) if sources_are_targets: - box_target_bounding_box_min = box_source_bounding_box_min - box_target_bounding_box_max = box_source_bounding_box_max + box_target_counts_nonchild = box_source_counts_nonchild else: - box_target_bounding_box_min = actx.empty( - (dimensions, aligned_nboxes), dtype=coord_dtype) - box_target_bounding_box_max = actx.empty( - (dimensions, aligned_nboxes), dtype=coord_dtype) + box_target_counts_nonchild = ( + actx.zeros(nboxes_post_prune, particle_id_dtype)) - bogus_radii_array = actx.empty(1, dtype=coord_dtype) + debug_with_finish("compute box info") + knl_info.box_info_kernel( + *( + # input: + box_parent_ids, box_srcntgt_counts_cumul, + 
box_source_counts_cumul, box_target_counts_cumul, + box_has_children, box_levels, nlevels, - # nlevels-1 is the highest valid level index - for level in range(nlevels-1, -1, -1): - start, stop = level_start_box_nrs[level:level+2] + # output if srcntgts_have_extent, input+output otherwise + box_source_counts_nonchild, box_target_counts_nonchild, - for (skip, enable_radii, box_bounding_box_min, box_bounding_box_max, - pstarts, pcounts, particle_radii, particles) in [ - ( - # never skip - False, - - sources_have_extent, - box_source_bounding_box_min, - box_source_bounding_box_max, - box_source_starts, - box_source_counts_nonchild, - source_radii if sources_have_extent else bogus_radii_array, - sources), - ( - # skip the 'target' round if sources and targets - # are the same. - sources_are_targets, - - targets_have_extent, - box_target_bounding_box_min, - box_target_bounding_box_max, - box_target_starts, - box_target_counts_nonchild, - target_radii if targets_have_extent else bogus_radii_array, - targets), - ]: - - if skip: - continue + # output: + box_flags, + ), + range=slice(nboxes_post_prune), + queue=actx.queue, + ) - args = ( - ( - aligned_nboxes, - box_child_ids, - box_centers, - pstarts, pcounts,) - + tuple(particles) - + ( - particle_radii, - enable_radii, + # }}} - box_bounding_box_min, - box_bounding_box_max)) + del box_has_children - evt = knl_info.box_extents_finder_kernel( - *args, + # {{{ compute box bounding box - range=slice(start, stop), - queue=actx.queue, wait_for=wait_for) + debug_with_finish("finding box extents") - wait_for = [evt] + box_source_bounding_box_min = actx.empty( + (dimensions, aligned_nboxes), dtype=coord_dtype) + box_source_bounding_box_max = actx.empty( + (dimensions, aligned_nboxes), dtype=coord_dtype) - del bogus_radii_array + if sources_are_targets: + box_target_bounding_box_min = box_source_bounding_box_min + box_target_bounding_box_max = box_source_bounding_box_max + else: + box_target_bounding_box_min = actx.empty( + 
(dimensions, aligned_nboxes), dtype=coord_dtype) + box_target_bounding_box_max = actx.empty( + (dimensions, aligned_nboxes), dtype=coord_dtype) - # }}} + bogus_radii_array = actx.empty(1, dtype=coord_dtype) - # {{{ build output + # nlevels-1 is the highest valid level index + for level in range(nlevels-1, -1, -1): + start, stop = level_start_box_nrs[level:level+2] - extra_tree_attrs = {} + for (skip, enable_radii, box_bounding_box_min, box_bounding_box_max, + pstarts, pcounts, particle_radii, particles) in [ + ( + # never skip + False, - if sources_have_extent: - extra_tree_attrs.update(source_radii=source_radii) - else: - extra_tree_attrs.update(source_radii=None) + sources_have_extent, + box_source_bounding_box_min, + box_source_bounding_box_max, + box_source_starts, + box_source_counts_nonchild, + source_radii if sources_have_extent else bogus_radii_array, + sources), + ( + # skip the 'target' round if sources and targets + # are the same. + sources_are_targets, + + targets_have_extent, + box_target_bounding_box_min, + box_target_bounding_box_max, + box_target_starts, + box_target_counts_nonchild, + target_radii if targets_have_extent else bogus_radii_array, + targets), + ]: - if targets_have_extent: - extra_tree_attrs.update(target_radii=target_radii) - else: - extra_tree_attrs.update(target_radii=None) + if skip: + continue - tree_build_proc.done( - "%d levels, %d boxes, %d particles, box extent norm: %s, " - "max_leaf_refine_weight: %d", - nlevels, len(box_parent_ids), nsrcntgts, srcntgts_extent_norm, - max_leaf_refine_weight) + args = ( + ( + aligned_nboxes, + box_child_ids, + box_centers, + pstarts, pcounts,) + + tuple(particles) + + ( + particle_radii, + enable_radii, + + box_bounding_box_min, + box_bounding_box_max)) + + knl_info.box_extents_finder_kernel( + *args, range=slice(start, stop), + queue=actx.queue, + ) - tree = Tree( - # If you change this, also change the documentation - # of what's in the tree, above. 
+ del bogus_radii_array - sources_are_targets=sources_are_targets, - sources_have_extent=sources_have_extent, - targets_have_extent=targets_have_extent, + # }}} - particle_id_dtype=knl_info.particle_id_dtype, - box_id_dtype=knl_info.box_id_dtype, - coord_dtype=coord_dtype, - box_level_dtype=self.box_level_dtype, + # {{{ build output - root_extent=root_extent, - stick_out_factor=stick_out_factor, - extent_norm=srcntgts_extent_norm, + extra_tree_attrs = {} - bounding_box=(bbox_min, bbox_max), - level_start_box_nrs=actx.from_numpy(level_start_box_nrs), + if sources_have_extent: + extra_tree_attrs.update(source_radii=source_radii) + else: + extra_tree_attrs.update(source_radii=None) - sources=sources, - targets=targets, + if targets_have_extent: + extra_tree_attrs.update(target_radii=target_radii) + else: + extra_tree_attrs.update(target_radii=None) - box_source_starts=box_source_starts, - box_source_counts_nonchild=box_source_counts_nonchild, - box_source_counts_cumul=box_source_counts_cumul, - box_target_starts=box_target_starts, - box_target_counts_nonchild=box_target_counts_nonchild, - box_target_counts_cumul=box_target_counts_cumul, + tree_build_proc.done( + "%d levels, %d boxes, %d particles, box extent norm: %s, " + "max_leaf_refine_weight: %d", + nlevels, len(box_parent_ids), nsrcntgts, srcntgts_extent_norm, + max_leaf_refine_weight) - box_parent_ids=box_parent_ids, - box_child_ids=box_child_ids, - box_centers=box_centers, - box_levels=box_levels, - box_flags=box_flags, + tree = Tree( + # If you change this, also change the documentation + # of what's in the tree, above. 
- user_source_ids=user_source_ids, - sorted_target_ids=sorted_target_ids, + sources_are_targets=sources_are_targets, + sources_have_extent=sources_have_extent, + targets_have_extent=targets_have_extent, - box_source_bounding_box_min=box_source_bounding_box_min, - box_source_bounding_box_max=box_source_bounding_box_max, - box_target_bounding_box_min=box_target_bounding_box_min, - box_target_bounding_box_max=box_target_bounding_box_max, + particle_id_dtype=knl_info.particle_id_dtype, + box_id_dtype=knl_info.box_id_dtype, + coord_dtype=coord_dtype, + box_level_dtype=box_level_dtype, - _is_pruned=prune_empty_leaves, + root_extent=root_extent, + stick_out_factor=stick_out_factor, + extent_norm=srcntgts_extent_norm, - **extra_tree_attrs - ) + bounding_box=(bbox_min, bbox_max), + level_start_box_nrs=actx.from_numpy(level_start_box_nrs), - return actx.freeze(tree), evt + sources=sources, + targets=targets, - # }}} + box_source_starts=box_source_starts, + box_source_counts_nonchild=box_source_counts_nonchild, + box_source_counts_cumul=box_source_counts_cumul, + box_target_starts=box_target_starts, + box_target_counts_nonchild=box_target_counts_nonchild, + box_target_counts_cumul=box_target_counts_cumul, + + box_parent_ids=box_parent_ids, + box_child_ids=box_child_ids, + box_centers=box_centers, + box_levels=box_levels, + box_flags=box_flags, + + user_source_ids=user_source_ids, + sorted_target_ids=sorted_target_ids, + + box_source_bounding_box_min=box_source_bounding_box_min, + box_source_bounding_box_max=box_source_bounding_box_max, + box_target_bounding_box_min=box_target_bounding_box_min, + box_target_bounding_box_max=box_target_bounding_box_max, + + root_extent_stretch_factor=root_extent_stretch_factor, + _is_pruned=prune_empty_leaves, + + **extra_tree_attrs + ) + + return actx.freeze(tree) # }}} +# }}} + # vim: foldmethod=marker:filetype=pyopencl diff --git a/boxtree/tree_build_kernels.py b/boxtree/tree_build_kernels.py index 2e5fa295..7f3b5803 100644 --- 
a/boxtree/tree_build_kernels.py +++ b/boxtree/tree_build_kernels.py @@ -122,7 +122,7 @@ @dataclass(frozen=True) -class _KernelInfo: +class TreeBuildKernelInfo: particle_id_dtype: np.dtype box_id_dtype: np.dtype morton_bin_count_dtype: np.dtype @@ -1834,7 +1834,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, # }}} - return _KernelInfo( + return TreeBuildKernelInfo( particle_id_dtype=particle_id_dtype, box_id_dtype=box_id_dtype, morton_bin_count_dtype=morton_bin_count_dtype, diff --git a/doc/Makefile b/doc/Makefile index c45814ac..d0ac5f2f 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,130 +1,20 @@ -# Makefile for Sphinx documentation +# Minimal makefile for Sphinx documentation # -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = python $(shell which sphinx-build) -PAPER = +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= python $(shell which sphinx-build) +SOURCEDIR = . BUILDDIR = _build -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . - -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest - +# Put it first so that "make" without argument is like "make help". 
help: - @echo "Please use \`make <target>' where <target> is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - -rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp."
- -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/boxtree.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/boxtree.qhc" - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/boxtree" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/boxtree" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - make -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." 
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." +.PHONY: help Makefile -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/conf.py b/doc/conf.py index 07572b6c..8a9553ac 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,5 +18,6 @@ "https://docs.python.org/3/": None, "https://numpy.org/doc/stable/": None, "https://documen.tician.de/pyopencl/": None, + "https://documen.tician.de/arraycontext/": None, "https://documen.tician.de/pytential/": None, } diff --git a/doc/tools.rst b/doc/tools.rst index 6db9bc70..0b5225ee 100644 --- a/doc/tools.rst +++ b/doc/tools.rst @@ -4,3 +4,5 @@ Utility Functionality .. automodule:: boxtree.timing .. automodule:: boxtree.constant_one + +.. 
automodule:: boxtree.array_context diff --git a/examples/cost_model.py b/examples/cost_model.py index 14a1b8f2..d832fa0c 100644 --- a/examples/cost_model.py +++ b/examples/cost_model.py @@ -1,29 +1,16 @@ +import os + import numpy as np import pyopencl as cl -import sys import logging -import os - -# Configure the root logger logging.basicConfig(level=os.environ.get("LOGLEVEL", "WARNING")) - logger = logging.getLogger(__name__) - -# Set the logger level of this module to INFO so that logging outputs of this module -# are shown logger.setLevel(logging.INFO) -# `process_elapsed` in `ProcessTimer` is only supported for Python >= 3.3 -SUPPORTS_PROCESS_TIME = (sys.version_info >= (3, 3)) - def demo_cost_model(): - if not SUPPORTS_PROCESS_TIME: - raise NotImplementedError( - "Currently this script uses process time which only works on Python>=3.3" - ) - + from boxtree.array_context import PyOpenCLArrayContext from boxtree.pyfmmlib_integration import ( Kernel, FMMLibTreeIndependentDataForWrangler, @@ -36,6 +23,7 @@ def demo_cost_model(): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) + actx = PyOpenCLArrayContext(queue, force_device_scalars=True) traversals = [] traversals_dev = [] @@ -49,30 +37,25 @@ def fmm_level_to_order(tree, ilevel): # {{{ Generate sources, targets and target_radii from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(queue, nsources, dims, dtype, seed=15) - targets = p_normal(queue, ntargets, dims, dtype, seed=18) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=18) - from pyopencl.clrandom import PhiloxGenerator - rng = PhiloxGenerator(queue.context, seed=22) - target_radii = rng.uniform( - queue, ntargets, a=0, b=0.05, dtype=dtype - ).get() + rng = np.random.default_rng(seed=22) + target_radii = rng.uniform(low=0.0, high=0.05, size=ntargets) # }}} # {{{ Generate tree and traversal - from boxtree import TreeBuilder - tb = TreeBuilder(ctx) - 
tree, _ = tb( - queue, sources, targets=targets, target_radii=target_radii, + from boxtree import build_tree + tree = build_tree( + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.15, max_particles_in_box=30, debug=True ) - from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2) - trav_dev, _ = tg(queue, tree, debug=True) - trav = trav_dev.get(queue=queue) + from boxtree.traversal import build_traversal + trav_dev = build_traversal(actx, tree, well_sep_is_n_away=2, debug=True) + trav = actx.to_numpy(trav_dev) traversals.append(trav) traversals_dev.append(trav_dev) @@ -88,7 +71,7 @@ def fmm_level_to_order(tree, ilevel): timing_data = {} from boxtree.fmm import drive_fmm src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype) - drive_fmm(wrangler, (src_weights,), timing_data=timing_data) + drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data) timing_results.append(timing_data) @@ -103,7 +86,7 @@ def fmm_level_to_order(tree, ilevel): traversal = traversals_dev[icase] model_results.append( cost_model.cost_per_stage( - queue, traversal, level_orders_list[icase], + actx, traversal, level_orders_list[icase], FMMCostModel.get_unit_calibration_params(), ) ) @@ -114,7 +97,7 @@ def fmm_level_to_order(tree, ilevel): ) predicted_time = cost_model.cost_per_stage( - queue, traversals_dev[-1], level_orders_list[-1], params, + actx, traversals_dev[-1], level_orders_list[-1], params, ) queue.finish() diff --git a/examples/demo.py b/examples/demo.py index 8105c37a..14dcd8e2 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -4,8 +4,11 @@ import logging logging.basicConfig(level="INFO") +from boxtree.array_context import PyOpenCLArrayContext + ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) +actx = PyOpenCLArrayContext(queue, force_device_scalars=True) dims = 2 nparticles = 500 @@ -13,24 +16,21 @@ # 
----------------------------------------------------------------------------- # generate some random particle positions # ----------------------------------------------------------------------------- -from pyopencl.clrandom import PhiloxGenerator -rng = PhiloxGenerator(ctx, seed=15) - from pytools.obj_array import make_obj_array +rng = np.random.default_rng(seed=15) + particles = make_obj_array([ - rng.normal(queue, nparticles, dtype=np.float64) + actx.from_numpy(rng.normal(size=nparticles)) for i in range(dims)]) # ----------------------------------------------------------------------------- # build tree and traversals (lists) # ----------------------------------------------------------------------------- -from boxtree import TreeBuilder -tb = TreeBuilder(ctx) -tree, _ = tb(queue, particles, max_particles_in_box=5) +from boxtree import build_tree +tree = build_tree(actx, particles, max_particles_in_box=5) -from boxtree.traversal import FMMTraversalBuilder -tg = FMMTraversalBuilder(ctx) -trav, _ = tg(queue, tree) +from boxtree.traversal import build_traversal +trav = build_traversal(actx, tree) # ENDEXAMPLE @@ -38,12 +38,15 @@ # plot the tree # ----------------------------------------------------------------------------- +particles = actx.to_numpy(particles) +tree = actx.to_numpy(tree) + import matplotlib.pyplot as pt +from boxtree.visualization import TreePlotter -pt.plot(particles[0].get(), particles[1].get(), "+") +pt.plot(particles[0], particles[1], "+") +plotter = TreePlotter(tree) -from boxtree.visualization import TreePlotter -plotter = TreePlotter(tree.get(queue=queue)) plotter.draw_tree(fill=False, edgecolor="black") #plotter.draw_box_numbers() plotter.set_bounding_box() diff --git a/test/test_cost_model.py b/test/test_cost_model.py index 80fda05e..2446ca0c 100644 --- a/test/test_cost_model.py +++ b/test/test_cost_model.py @@ -33,6 +33,7 @@ from arraycontext import pytest_generate_tests_for_array_contexts from boxtree.array_context import ( # noqa: F401 
PytestPyOpenCLArrayContextFactory, _acf) + from boxtree.cost import FMMCostModel, _PythonFMMCostModel from boxtree.cost import make_pde_aware_translation_cost_model @@ -58,8 +59,8 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt # {{{ Generate sources, targets and target_radii from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=18) rng = np.random.default_rng(22) target_radii = rng.uniform(0.0, 0.05, (ntargets,)).astype(dtype) @@ -68,17 +69,15 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt # {{{ Generate tree and traversal - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - tree, _ = tb( - actx.queue, sources, targets=targets, target_radii=target_radii, + from boxtree import build_tree + tree = build_tree( + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.15, max_particles_in_box=30, debug=True ) - from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2) - trav_dev, _ = tg(actx.queue, tree, debug=True) - trav = trav_dev.get(queue=actx.queue) + from boxtree.traversal import build_traversal + trav_dev = build_traversal(actx, tree, well_sep_is_n_away=2, debug=True) + trav = actx.to_numpy(trav_dev) # }}} @@ -106,12 +105,12 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt context=constant_one_params ) p2m_cost_dev = actx.from_numpy(p2m_cost) - actx.queue.finish() + start_time = time.time() cl_form_multipoles = cl_cost_model.process_form_multipoles( - actx.queue, trav_dev, p2m_cost_dev + actx, trav_dev, p2m_cost_dev ) actx.queue.finish() @@ -121,7 +120,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, 
nsources, ntargets, dims, dt start_time = time.time() python_form_multipoles = python_cost_model.process_form_multipoles( - actx.queue, trav, p2m_cost + actx, trav, p2m_cost ) logger.info("Python time for process_form_multipoles: %gs", @@ -144,7 +143,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt actx.queue.finish() start_time = time.time() cl_coarsen_multipoles = cl_cost_model.process_coarsen_multipoles( - actx.queue, trav_dev, m2m_cost_dev + actx, trav_dev, m2m_cost_dev ) actx.queue.finish() @@ -154,7 +153,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() python_coarsen_multipoles = python_cost_model.process_coarsen_multipoles( - actx.queue, trav, m2m_cost + actx, trav, m2m_cost ) logger.info("Python time for coarsen_multipoles: %gs", @@ -170,10 +169,10 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() cl_ndirect_sources_per_target_box = \ - cl_cost_model.get_ndirect_sources_per_target_box(actx.queue, trav_dev) + cl_cost_model.get_ndirect_sources_per_target_box(actx, trav_dev) cl_direct = cl_cost_model.process_direct( - actx.queue, trav_dev, cl_ndirect_sources_per_target_box, 5.0 + actx, trav_dev, cl_ndirect_sources_per_target_box, 5.0 ) actx.queue.finish() @@ -183,10 +182,10 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() python_ndirect_sources_per_target_box = \ - python_cost_model.get_ndirect_sources_per_target_box(actx.queue, trav) + python_cost_model.get_ndirect_sources_per_target_box(actx, trav) python_direct = python_cost_model.process_direct( - actx.queue, trav, python_ndirect_sources_per_target_box, 5.0 + actx, trav, python_ndirect_sources_per_target_box, 5.0 ) logger.info("Python time for process_direct: %gs", @@ -200,7 +199,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() - 
cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(cl_direct) + cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(actx, cl_direct) actx.queue.finish() logger.info("OpenCL time for aggregate_over_boxes: %gs", @@ -208,7 +207,9 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() - python_direct_aggregate = python_cost_model.aggregate_over_boxes(python_direct) + python_direct_aggregate = ( + python_cost_model.aggregate_over_boxes(actx, python_direct) + ) logger.info("Python time for aggregate_over_boxes: %gs", time.time() - start_time) @@ -231,14 +232,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt actx.queue.finish() start_time = time.time() - cl_m2l_cost = cl_cost_model.process_list2(actx.queue, trav_dev, m2l_cost_dev) + cl_m2l_cost = cl_cost_model.process_list2(actx, trav_dev, m2l_cost_dev) actx.queue.finish() logger.info("OpenCL time for process_list2: %gs", time.time() - start_time) start_time = time.time() - python_m2l_cost = python_cost_model.process_list2(actx.queue, trav, m2l_cost) + python_m2l_cost = python_cost_model.process_list2(actx, trav, m2l_cost) logger.info("Python time for process_list2: %gs", time.time() - start_time) @@ -259,14 +260,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt actx.queue.finish() start_time = time.time() - cl_m2p_cost = cl_cost_model.process_list3(actx.queue, trav_dev, m2p_cost_dev) + cl_m2p_cost = cl_cost_model.process_list3(actx, trav_dev, m2p_cost_dev) actx.queue.finish() logger.info("OpenCL time for process_list3: %gs", time.time() - start_time) start_time = time.time() - python_m2p_cost = python_cost_model.process_list3(actx.queue, trav, m2p_cost) + python_m2p_cost = python_cost_model.process_list3(actx, trav, m2p_cost) logger.info("Python time for process_list3: %gs", time.time() - start_time) @@ -287,14 +288,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, 
ntargets, dims, dt actx.queue.finish() start_time = time.time() - cl_p2l_cost = cl_cost_model.process_list4(actx.queue, trav_dev, p2l_cost_dev) + cl_p2l_cost = cl_cost_model.process_list4(actx, trav_dev, p2l_cost_dev) actx.queue.finish() logger.info("OpenCL time for process_list4: %gs", time.time() - start_time) start_time = time.time() - python_p2l_cost = python_cost_model.process_list4(actx.queue, trav, p2l_cost) + python_p2l_cost = python_cost_model.process_list4(actx, trav, p2l_cost) logger.info("Python time for process_list4: %gs", time.time() - start_time) @@ -316,7 +317,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() cl_refine_locals_cost = cl_cost_model.process_refine_locals( - actx.queue, trav_dev, l2l_cost_dev + actx, trav_dev, l2l_cost_dev ) actx.queue.finish() @@ -325,7 +326,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() python_refine_locals_cost = python_cost_model.process_refine_locals( - actx.queue, trav, l2l_cost + actx, trav, l2l_cost ) logger.info("Python time for refine_locals: %gs", time.time() - start_time) @@ -348,7 +349,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() cl_l2p_cost = cl_cost_model.process_eval_locals( - actx.queue, trav_dev, l2p_cost_dev) + actx, trav_dev, l2p_cost_dev) actx.queue.finish() logger.info("OpenCL time for process_eval_locals: %gs", @@ -356,7 +357,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() python_l2p_cost = python_cost_model.process_eval_locals( - actx.queue, trav, l2p_cost) + actx, trav, l2p_cost) logger.info("Python time for process_eval_locals: %gs", time.time() - start_time) @@ -395,8 +396,8 @@ def fmm_level_to_order(tree, ilevel): # {{{ Generate sources, targets and target_radii from boxtree.tools import make_normal_particle_array as p_normal - sources = 
p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=18) rng = np.random.default_rng(22) target_radii = rng.uniform(0.0, 0.05, (ntargets,)).astype(dtype) @@ -405,17 +406,15 @@ def fmm_level_to_order(tree, ilevel): # {{{ Generate tree and traversal - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - tree, _ = tb( - actx.queue, sources, targets=targets, target_radii=target_radii, + from boxtree import build_tree + tree = build_tree( + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.15, max_particles_in_box=30, debug=True ) - from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2) - trav_dev, _ = tg(actx.queue, tree, debug=True) - trav = trav_dev.get(queue=actx.queue) + from boxtree.traversal import build_traversal + trav_dev = build_traversal(actx, tree, well_sep_is_n_away=2, debug=True) + trav = actx.to_numpy(trav_dev) traversals.append(trav) traversals_dev.append(trav_dev) @@ -431,7 +430,7 @@ def fmm_level_to_order(tree, ilevel): timing_data = {} from boxtree.fmm import drive_fmm src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype) - drive_fmm(wrangler, (src_weights,), timing_data=timing_data) + drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data) timing_results.append(timing_data) @@ -458,7 +457,7 @@ def test_params_equal(test_params1, test_params2): level_to_order = level_to_orders[icase] python_model_results.append(python_cost_model.cost_per_stage( - actx.queue, traversal, level_to_order, + actx, traversal, level_to_order, _PythonFMMCostModel.get_unit_calibration_params(), )) @@ -477,7 +476,7 @@ def test_params_equal(test_params1, test_params2): level_to_order = level_to_orders[icase] cl_model_results.append(cl_cost_model.cost_per_stage( - actx.queue, 
traversal, level_to_order, + actx, traversal, level_to_order, FMMCostModel.get_unit_calibration_params(), )) @@ -530,23 +529,21 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( actx = actx_factory() from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, dims, dtype, seed=16) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=19) + sources = p_normal(actx, nsources, dims, dtype, seed=16) + targets = p_normal(actx, ntargets, dims, dtype, seed=19) rng = np.random.default_rng(20) target_radii = rng.uniform(0, 0.04, (ntargets,)).astype(dtype) - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - tree, _ = tb( - actx.queue, sources, targets=targets, target_radii=target_radii, + from boxtree import build_tree + tree = build_tree( + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.15, max_particles_in_box=30, debug=True ) - from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2) - trav_dev, _ = tg(actx.queue, tree, debug=True) - trav = trav_dev.get(queue=actx.queue) + from boxtree.traversal import build_traversal + trav_dev = build_traversal(actx, tree, well_sep_is_n_away=2, debug=True) + trav = actx.to_numpy(trav_dev) from boxtree.constant_one import ( ConstantOneTreeIndependentDataForWrangler, @@ -557,7 +554,7 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( timing_data = {} from boxtree.fmm import drive_fmm src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype) - drive_fmm(wrangler, (src_weights,), timing_data=timing_data) + drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data) cost_model = FMMCostModel( translation_cost_model_factory=OpCountingTranslationCostModel @@ -566,7 +563,7 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( level_to_order = np.array([1 for _ in range(tree.nlevels)]) modeled_time = cost_model.cost_per_stage( - 
actx.queue, trav_dev, level_to_order, + actx, trav_dev, level_to_order, FMMCostModel.get_unit_calibration_params(), ) @@ -585,10 +582,10 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( total_cost += timing_data[stage]["ops_elapsed"] per_box_cost = cost_model.cost_per_box( - actx.queue, trav_dev, level_to_order, + actx, trav_dev, level_to_order, FMMCostModel.get_unit_calibration_params(), ) - total_aggregate_cost = cost_model.aggregate_over_boxes(per_box_cost) + total_aggregate_cost = cost_model.aggregate_over_boxes(actx, per_box_cost) assert total_cost == ( total_aggregate_cost diff --git a/test/test_distributed.py b/test/test_distributed.py index c9543519..b6ca7d32 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -33,7 +33,7 @@ from boxtree.pyfmmlib_integration import ( Kernel, FMMLibTreeIndependentDataForWrangler, - FMMLibExpansionWrangler) + FMMLibExpansionWrangler, FMMLibRotationData) from boxtree.constant_one import ( ConstantOneExpansionWrangler as ConstantOneExpansionWranglerBase, ConstantOneTreeIndependentDataForWrangler) @@ -78,8 +78,11 @@ def fmm_level_to_order(tree, level): with patch.dict(os.environ, {"XDG_CACHE_HOME": rank_cache_dir}): actx = _acf() - from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2) + from functools import partial + from boxtree.traversal import build_traversal + build_traversal = partial(build_traversal, + well_sep_is_n_away=2, + debug=True) tree_indep = FMMLibTreeIndependentDataForWrangler( dims, Kernel.HELMHOLTZ if helmholtz_k else Kernel.LAPLACE) @@ -88,32 +91,32 @@ def fmm_level_to_order(tree, level): if rank == 0: # Generate random particles and source weights from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = 
p_normal(actx, ntargets, dims, dtype, seed=18) rng = np.random.default_rng(20) sources_weights = rng.uniform(0.0, 1.0, (nsources,)) target_radii = rng.uniform(0.0, 0.05, (ntargets,)) # Build the tree and interaction lists - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - global_tree_dev, _ = tb( - actx.queue, sources, targets=targets, target_radii=target_radii, + from boxtree import build_tree + global_tree_dev = build_tree( + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.25, max_particles_in_box=30, debug=True) - d_trav, _ = tg(actx.queue, global_tree_dev, debug=True) - global_traversal_host = d_trav.get(queue=actx.queue) + d_trav = build_traversal(actx, global_tree_dev) + global_traversal_host = actx.to_numpy(d_trav) global_tree_host = global_traversal_host.tree # Get pyfmmlib expansion wrangler wrangler = FMMLibExpansionWrangler( tree_indep, global_traversal_host, - fmm_level_to_order=fmm_level_to_order) + fmm_level_to_order=fmm_level_to_order, + rotation_data=FMMLibRotationData(actx, global_traversal_host)) # Compute FMM with one MPI rank from boxtree.fmm import drive_fmm - pot_fmm = drive_fmm(wrangler, [sources_weights]) * 2 * np.pi + pot_fmm = drive_fmm(actx, wrangler, [sources_weights]) * 2 * np.pi # Compute FMM using the distributed implementation @@ -122,17 +125,17 @@ def wrangler_factory(local_traversal, global_traversal): DistributedFMMLibExpansionWrangler return DistributedFMMLibExpansionWrangler( - actx.context, comm, tree_indep, local_traversal, global_traversal, + actx, comm, tree_indep, local_traversal, global_traversal, fmm_level_to_order=fmm_level_to_order, communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce) from boxtree.distributed import DistributedFMMRunner distribued_fmm_info = DistributedFMMRunner( - actx.queue, global_tree_host, tg, wrangler_factory, comm=comm) + actx, global_tree_host, build_traversal, wrangler_factory, comm=comm) timing_data = {} pot_dfmm = 
distribued_fmm_info.drive_dfmm( - [sources_weights], timing_data=timing_data) + actx, [sources_weights], timing_data=timing_data) assert timing_data # Uncomment the following section to print the time taken of each stage @@ -182,31 +185,42 @@ def test_against_shared( # {{{ test_constantone def _test_constantone(tmp_cache_basedir, dims, nsources, ntargets, dtype): - from boxtree.distributed.calculation import DistributedExpansionWrangler + from boxtree.distributed.calculation import DistributedExpansionWranglerMixin class ConstantOneExpansionWrangler( - ConstantOneExpansionWranglerBase, DistributedExpansionWrangler): + DistributedExpansionWranglerMixin, + ConstantOneExpansionWranglerBase): def __init__( - self, queue, comm, tree_indep, local_traversal, global_traversal): - DistributedExpansionWrangler.__init__( - self, queue, comm, global_traversal, False, - communicate_mpoles_via_allreduce=True) + self, array_context, comm, + tree_indep, local_traversal, global_traversal): ConstantOneExpansionWranglerBase.__init__( self, tree_indep, local_traversal) + + self._setup_actx = array_context + self.comm = comm + self.global_traversal = global_traversal + self.communicate_mpoles_via_allreduce = True + self.level_orders = np.ones(local_traversal.tree.nlevels, dtype=np.int32) def reorder_sources(self, source_array): - if self.comm.Get_rank() == 0: + if self.is_mpi_root: return source_array[self.global_traversal.tree.user_source_ids] else: return None def reorder_potentials(self, potentials): - if self.comm.Get_rank() == 0: + if self.is_mpi_root: return potentials[self.global_traversal.tree.sorted_target_ids] else: return None + def finalize_potentials(self, potentials, template_ary): + if self.is_mpi_root: + return super().finalize_potentials(potentials, template_ary) + else: + return None + from mpi4py import MPI # Get the current rank @@ -222,39 +236,36 @@ def reorder_potentials(self, potentials): with patch.dict(os.environ, {"XDG_CACHE_HOME": rank_cache_dir}): actx = 
_acf() - from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context) + from boxtree.traversal import build_traversal if rank == 0: - # Generate random particles from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = (p_normal(actx.queue, ntargets, dims, dtype, seed=18) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = (p_normal(actx, ntargets, dims, dtype, seed=18) + np.array([2, 0, 0])[:dims]) # Constant one source weights sources_weights = np.ones((nsources,), dtype=dtype) # Build the global tree - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - tree, _ = tb( - actx.queue, sources, targets=targets, max_particles_in_box=30, - debug=True) - tree = tree.get(actx.queue) + from boxtree import build_tree + tree = build_tree( + actx, sources, + targets=targets, max_particles_in_box=30, debug=True) + tree = actx.to_numpy(tree) tree_indep = ConstantOneTreeIndependentDataForWrangler() def wrangler_factory(local_traversal, global_traversal): return ConstantOneExpansionWrangler( - actx.queue, comm, tree_indep, local_traversal, global_traversal) + actx, comm, tree_indep, local_traversal, global_traversal) from boxtree.distributed import DistributedFMMRunner distributed_fmm_info = DistributedFMMRunner( - actx.queue, tree, tg, wrangler_factory, comm=MPI.COMM_WORLD) + actx, tree, build_traversal, wrangler_factory, comm=MPI.COMM_WORLD) - pot_dfmm = distributed_fmm_info.drive_dfmm([sources_weights]) + pot_dfmm = distributed_fmm_info.drive_dfmm(actx, [sources_weights]) if rank == 0: assert (np.all(pot_dfmm == nsources)) diff --git a/test/test_fmm.py b/test/test_fmm.py index b7446f7e..d472f4d0 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -29,11 +29,7 @@ from boxtree.array_context import ( # noqa: F401 PytestPyOpenCLArrayContextFactory, _acf) -from boxtree.tools import ( # noqa: F401 - make_normal_particle_array as 
p_normal, - make_surface_particle_array as p_surface, - make_uniform_particle_array as p_uniform, - particle_array_to_host) +from boxtree.tools import make_normal_particle_array as p_normal from boxtree.constant_one import ( ConstantOneTreeIndependentDataForWrangler, ConstantOneExpansionWrangler) @@ -48,7 +44,8 @@ # {{{ ref fmmlib pot computation -def get_fmmlib_ref_pot(wrangler, weights, sources_host, targets_host, +def get_fmmlib_ref_pot( + actx, wrangler, weights, sources_host, targets_host, helmholtz_k, dipole_vec=None): dims = sources_host.shape[0] eqn_letter = "h" if helmholtz_k else "l" @@ -84,10 +81,10 @@ def get_fmmlib_ref_pot(wrangler, weights, sources_host, targets_host, kwargs["zk"] = helmholtz_k return wrangler.finalize_potentials( + actx, fmmlib_routine( sources=sources_host, targets=targets_host, - **kwargs)[0], - template_ary=weights) + **kwargs)[0]) # }}} @@ -177,7 +174,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, dtype = np.float64 try: - sources = source_gen(actx.queue, nsources_req, dims, dtype, seed=15) + sources = source_gen(actx, nsources_req, dims, dtype, seed=15) nsources = len(sources[0]) if ntargets_req is None: @@ -185,7 +182,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, targets = None ntargets = ntargets_req else: - targets = target_gen(actx.queue, ntargets_req, dims, dtype, seed=16) + targets = target_gen(actx, ntargets_req, dims, dtype, seed=16) ntargets = len(targets[0]) except ImportError: pytest.skip("loopy not available, but needed for particle array " @@ -206,41 +203,40 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, else: target_radii = None - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, targets=targets, + from boxtree import build_tree + tree = build_tree(actx, sources, targets=targets, max_particles_in_box=30, source_radii=source_radii, target_radii=target_radii, debug=True, 
stick_out_factor=0.25, extent_norm=extent_norm) if 0: - tree = tree.get(queue=actx.queue) + tree = actx.to_numpy(tree) tree.plot() import matplotlib.pyplot as pt pt.show() - from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context, - well_sep_is_n_away=well_sep_is_n_away, - from_sep_smaller_crit=from_sep_smaller_crit) - trav, _ = tbuild(actx.queue, tree, debug=True) + from boxtree.traversal import build_traversal + trav = build_traversal(actx, tree, + well_sep_is_n_away=well_sep_is_n_away, + from_sep_smaller_crit=from_sep_smaller_crit, + debug=True) if who_has_extent: pre_merge_trav = trav - trav = trav.merge_close_lists(actx.queue) + trav = trav.merge_close_lists(actx) #weights = np.random.randn(nsources) weights = np.ones(nsources) weights_sum = np.sum(weights) - host_trav = trav.get(queue=actx.queue) + host_trav = actx.to_numpy(trav) host_tree = host_trav.tree if who_has_extent: - pre_merge_host_trav = pre_merge_trav.get(queue=actx.queue) + pre_merge_host_trav = actx.to_numpy(pre_merge_trav) - from boxtree.tree import ParticleListFilter - plfilt = ParticleListFilter(actx.context) + from boxtree.tree import ( + filter_target_lists_in_user_order, + filter_target_lists_in_tree_order) tree_indep = ConstantOneTreeIndependentDataForWrangler() @@ -250,17 +246,15 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, ) if filter_kind == "user": - filtered_targets = plfilt.filter_target_lists_in_user_order( - actx.queue, tree, flags) + filtered_targets = filter_target_lists_in_user_order(actx, tree, flags) wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInUserOrder( tree_indep, host_trav, - filtered_targets.get(queue=actx.queue)) + actx.to_numpy(filtered_targets)) elif filter_kind == "tree": - filtered_targets = plfilt.filter_target_lists_in_tree_order( - actx.queue, tree, flags) + filtered_targets = filter_target_lists_in_tree_order(actx, tree, flags) wrangler = 
ConstantOneExpansionWranglerWithFilteredTargetsInTreeOrder( tree_indep, host_trav, - filtered_targets.get(queue=actx.queue)) + actx.to_numpy(filtered_targets)) else: raise ValueError("unsupported value of 'filter_kind'") else: @@ -274,7 +268,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, == weights) from boxtree.fmm import drive_fmm - pot = drive_fmm(wrangler, (weights,)) + pot = drive_fmm(actx, wrangler, (weights,)) if filter_kind: pot = pot[actx.to_numpy(flags) > 0] @@ -292,7 +286,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, for i in range(nsources): unit_vec = np.zeros(nsources, dtype=dtype) unit_vec[i] = 1 - mat[:, i] = drive_fmm(host_trav, wrangler, (unit_vec,)) + mat[:, i] = drive_fmm(actx, wrangler, (unit_vec,)) pb.progress() pb.finished() @@ -401,25 +395,23 @@ def test_pyfmmlib_fmm(actx_factory, dims, use_dipoles, helmholtz_k): ntargets = 1000 dtype = np.float64 - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) + sources = p_normal(actx, nsources, dims, dtype, seed=15) targets = ( - p_normal(actx.queue, ntargets, dims, dtype, seed=18) + p_normal(actx, ntargets, dims, dtype, seed=18) + np.array([2, 0, 0])[:dims]) - sources_host = particle_array_to_host(sources) - targets_host = particle_array_to_host(targets) - - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + sources_host = np.stack(actx.to_numpy(sources)) + targets_host = np.stack(actx.to_numpy(targets)) - tree, _ = tb(actx.queue, sources, targets=targets, + from boxtree import build_tree + tree = build_tree( + actx, sources, targets=targets, max_particles_in_box=30, debug=True) - from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + from boxtree.traversal import build_traversal + trav = build_traversal(actx, tree, debug=True) - trav = trav.get(queue=actx.queue) + trav = actx.to_numpy(trav) rng = 
np.random.default_rng(20) weights = rng.uniform(0.0, 1.0, (nsources,)) @@ -461,7 +453,7 @@ def fmm_level_to_order(tree, lev): from boxtree.fmm import drive_fmm timing_data = {} - pot = drive_fmm(wrangler, (weights,), timing_data=timing_data) + pot = drive_fmm(actx, wrangler, (weights,), timing_data=timing_data) print(timing_data) assert timing_data @@ -469,8 +461,8 @@ def fmm_level_to_order(tree, lev): logger.info("computing direct (reference) result") - ref_pot = get_fmmlib_ref_pot(wrangler, weights, sources_host.T, - targets_host.T, helmholtz_k, dipole_vec) + ref_pot = get_fmmlib_ref_pot(actx, wrangler, weights, sources_host, + targets_host, helmholtz_k, dipole_vec) rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf) logger.info("relative l2 error vs fmmlib direct: %g", rel_err) @@ -504,15 +496,17 @@ def fmm_level_to_order(tree, lev): if use_dipoles: knl = DirectionalSourceDerivative(knl) - sumpy_extra_kwargs["src_derivative_dir"] = dipole_vec + sumpy_extra_kwargs["src_derivative_dir"] = actx.from_numpy(dipole_vec) - p2p = P2P(actx.context, - [knl], - exclude_self=False) + p2p = P2P(target_kernels=[knl], exclude_self=False) - evt, (sumpy_ref_pot,) = p2p( - actx.queue, targets, sources, (weights,), - out_host=True, **sumpy_extra_kwargs) + result = p2p( + actx, + targets, + sources, + (actx.from_numpy(weights),), + **sumpy_extra_kwargs) + sumpy_ref_pot = actx.to_numpy(result["result_s0"]) sumpy_rel_err = ( la.norm(pot - sumpy_ref_pot, np.inf) @@ -552,19 +546,17 @@ def test_pyfmmlib_numerical_stability(actx_factory, dims, helmholtz_k, order): targets = sources * (1 + 1e-3) - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, targets=targets, + from boxtree import build_tree + tree = build_tree( + actx, sources, targets=targets, max_particles_in_box=2, debug=True) assert tree.nlevels >= 15 - from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = 
tbuild(actx.queue, tree, debug=True) + from boxtree.traversal import build_traversal + trav = build_traversal(actx, tree, debug=True) - trav = trav.get(queue=actx.queue) + trav = actx.to_numpy(trav) weights = np.ones_like(sources[0]) from boxtree.pyfmmlib_integration import ( @@ -581,17 +573,17 @@ def fmm_level_to_order(tree, lev): tree_indep, trav, helmholtz_k=helmholtz_k, fmm_level_to_order=fmm_level_to_order, - rotation_data=FMMLibRotationData(actx.queue, trav)) + rotation_data=FMMLibRotationData(actx, trav)) from boxtree.fmm import drive_fmm - pot = drive_fmm(wrangler, (weights,)) + pot = drive_fmm(actx, wrangler, (weights,)) assert not np.isnan(pot).any() # {{{ ref fmmlib computation logger.info("computing direct (reference) result") - ref_pot = get_fmmlib_ref_pot(wrangler, weights, sources, targets, + ref_pot = get_fmmlib_ref_pot(actx, wrangler, weights, sources, targets, helmholtz_k) rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf) @@ -625,8 +617,8 @@ def test_interaction_list_particle_count_thresholding(actx_factory, enable_exten from_sep_smaller_min_nsources_cumul = 1 + max_particles_in_box from boxtree.fmm import drive_fmm - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=15) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=15) rng = np.random.default_rng(22) if enable_extents: @@ -636,28 +628,30 @@ def test_interaction_list_particle_count_thresholding(actx_factory, enable_exten else: target_radii = None - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, targets=targets, + from boxtree import build_tree + tree = build_tree( + actx, sources, + targets=targets, max_particles_in_box=max_particles_in_box, target_radii=target_radii, - debug=True, stick_out_factor=0.25) + stick_out_factor=0.25, + debug=True, + ) - from boxtree.traversal import 
FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True, - _from_sep_smaller_min_nsources_cumul=from_sep_smaller_min_nsources_cumul) + from boxtree.traversal import build_traversal + trav = build_traversal(actx, tree, + debug=True, + _from_sep_smaller_min_nsources_cumul=from_sep_smaller_min_nsources_cumul) weights = np.ones(nsources) weights_sum = np.sum(weights) - host_trav = trav.get(queue=actx.queue) + host_trav = actx.to_numpy(trav) tree_indep = ConstantOneTreeIndependentDataForWrangler() wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav) - pot = drive_fmm(wrangler, (weights,)) + pot = drive_fmm(actx, wrangler, (weights,)) assert np.all(pot == weights_sum) @@ -680,8 +674,8 @@ def test_fmm_float32(actx_factory, enable_extents): dtype = np.float32 from boxtree.fmm import drive_fmm - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=15) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=15) rng = np.random.default_rng(12) if enable_extents: @@ -691,27 +685,28 @@ def test_fmm_float32(actx_factory, enable_extents): else: target_radii = None - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, targets=targets, + from boxtree import build_tree + tree = build_tree( + actx, sources, + targets=targets, max_particles_in_box=30, target_radii=target_radii, - debug=True, stick_out_factor=0.25) + stick_out_factor=0.25, + debug=True, + ) - from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + from boxtree.traversal import build_traversal + trav = build_traversal(actx, tree, debug=True) weights = np.ones(nsources) weights_sum = np.sum(weights) - host_trav = trav.get(queue=actx.queue) + host_trav = actx.to_numpy(trav) tree_indep = 
ConstantOneTreeIndependentDataForWrangler() wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav) - pot = drive_fmm(wrangler, (weights,)) + pot = drive_fmm(actx, wrangler, (weights,)) assert np.all(pot == weights_sum) @@ -732,21 +727,19 @@ def test_fmm_with_optimized_3d_m2l(actx_factory, nsrcntgts, helmholtz_k, nsources = ntargets = nsrcntgts // 2 dtype = np.float64 - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) + sources = p_normal(actx, nsources, dims, dtype, seed=15) targets = ( - p_normal(actx.queue, ntargets, dims, dtype, seed=18) + p_normal(actx, ntargets, dims, dtype, seed=18) + np.array([2, 0, 0])[:dims]) - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, targets=targets, + from boxtree import build_tree + tree = build_tree( + actx, sources, targets=targets, max_particles_in_box=30, debug=True) - from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) - trav = trav.get(queue=actx.queue) + from boxtree.traversal import build_traversal + trav = build_traversal(actx, tree, debug=True) + trav = actx.to_numpy(trav) rng = np.random.default_rng(20) weights = rng.uniform(0.0, 1.0, (nsources,)) @@ -778,17 +771,17 @@ def fmm_level_to_order(tree, lev): tree_indep, trav, helmholtz_k=helmholtz_k, fmm_level_to_order=fmm_level_to_order, - rotation_data=FMMLibRotationData(actx.queue, trav)) + rotation_data=FMMLibRotationData(actx, trav)) from boxtree.fmm import drive_fmm baseline_timing_data = {} baseline_pot = drive_fmm( - baseline_wrangler, (weights,), timing_data=baseline_timing_data) + actx, baseline_wrangler, (weights,), timing_data=baseline_timing_data) optimized_timing_data = {} optimized_pot = drive_fmm( - optimized_wrangler, (weights,), timing_data=optimized_timing_data) + actx, optimized_wrangler, (weights,), timing_data=optimized_timing_data) baseline_time = 
baseline_timing_data["multipole_to_local"]["process_elapsed"] if baseline_time is not None: diff --git a/test/test_tools.py b/test/test_tools.py index d83491c4..683a6c07 100644 --- a/test/test_tools.py +++ b/test/test_tools.py @@ -95,8 +95,7 @@ def test_allreduce_comm_pattern(p): def test_masked_matrix_compression(actx_factory, order): actx = actx_factory() - from boxtree.tools import MaskCompressorKernel - matcompr = MaskCompressorKernel(actx.context) + from boxtree.tools import mask_to_csr n = 40 m = 10 @@ -105,7 +104,7 @@ def test_masked_matrix_compression(actx_factory, order): arr = (rng.random((n, m)) > 0.5).astype(np.int8).copy(order=order) d_arr = actx.from_numpy(arr) - arr_starts, arr_lists, evt = matcompr(actx.queue, d_arr) + arr_starts, arr_lists = mask_to_csr(actx, d_arr) arr_starts = actx.to_numpy(arr_starts) arr_lists = actx.to_numpy(arr_lists) @@ -121,8 +120,7 @@ def test_masked_matrix_compression(actx_factory, order): def test_masked_list_compression(actx_factory): actx = actx_factory() - from boxtree.tools import MaskCompressorKernel - listcompr = MaskCompressorKernel(actx.context) + from boxtree.tools import mask_to_csr n = 20 @@ -131,7 +129,7 @@ def test_masked_list_compression(actx_factory): arr = (np.random.rand(n) > 0.5).astype(np.int8) d_arr = actx.from_numpy(arr) - arr_list, evt = listcompr(actx.queue, d_arr) + arr_list = mask_to_csr(actx, d_arr) arr_list = actx.to_numpy(arr_list) assert set(arr_list) == set(arr.nonzero()[0]) @@ -164,6 +162,50 @@ def test_device_record(actx_factory): for i in range(3): assert np.array_equal(record_host.obj_array[i], record.obj_array[i]) + +def test_device_record_array_context(actx_factory): + actx = actx_factory() + + from typing import Optional + from dataclasses import dataclass + from arraycontext import Array + + from boxtree.array_context import dataclass_array_container + + @dataclass_array_container + @dataclass(frozen=True) + class MyDeviceDataRecord: + array: Array + obj_array: np.ndarray + 
opt_array: Optional[Array] + value: float + + from pytools.obj_array import make_obj_array + rng = np.random.default_rng() + record = MyDeviceDataRecord( + array=rng.random(128), + obj_array=make_obj_array([rng.random(128) for _ in range(3)]), + opt_array=None, + value=3) + + actx_record = actx.from_numpy(record) + assert actx_record.array.queue is actx.queue + + frozen_record = actx.freeze(actx_record) + assert frozen_record.array.queue is None + + thawed_record = actx.thaw(frozen_record) + assert actx_record.array.queue is actx.queue + + host_record = actx.to_numpy(thawed_record) + assert isinstance(host_record.array, np.ndarray) + + assert record.value == host_record.value + assert np.allclose(record.array, host_record.array) + assert np.all([ + np.allclose(record.obj_array[i], host_record.obj_array[i]) for i in range(3) + ]) + # }}} @@ -175,7 +217,7 @@ def test_device_record(actx_factory): def test_particle_array(actx_factory, array_factory, dim, dtype): actx = actx_factory() - particles = array_factory(actx.queue, 1000, dim, dtype) + particles = array_factory(actx, 1000, dim, dtype) assert len(particles) == dim assert all(len(particles[0]) == len(axis) for axis in particles) diff --git a/test/test_traversal.py b/test/test_traversal.py index a86001a0..1b3c3a8d 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -52,22 +52,21 @@ def test_tree_connectivity(actx_factory, dims, sources_are_targets): actx = actx_factory() dtype = np.float64 - sources = make_normal_particle_array(actx.queue, 1 * 10**5, dims, dtype) + sources = make_normal_particle_array(actx, 1 * 10**5, dims, dtype) if sources_are_targets: targets = None else: - targets = make_normal_particle_array(actx.queue, 2 * 10**5, dims, dtype) + targets = make_normal_particle_array(actx, 2 * 10**5, dims, dtype) - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - tree, _ = tb(actx.queue, sources, max_particles_in_box=30, - targets=targets, debug=True) + from boxtree import 
build_tree + tree = build_tree( + actx, sources, + targets=targets, max_particles_in_box=30, debug=True) - from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context) - trav, _ = tg(actx.queue, tree, debug=True) - tree = tree.get(queue=actx.queue) - trav = trav.get(queue=actx.queue) + from boxtree.traversal import build_traversal + trav = build_traversal(actx, tree, debug=True) + tree = actx.to_numpy(tree) + trav = actx.to_numpy(trav) levels = tree.box_levels parents = tree.box_parent_ids.T @@ -282,18 +281,14 @@ def test_plot_traversal(actx_factory, well_sep_is_n_away=1, visualize=False): actx.from_numpy(rng.normal(0.0, 1.0, (nparticles,)).astype(dtype)) for i in range(dims)]) - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + from boxtree import build_tree + tree = build_tree(actx, particles, max_particles_in_box=30, debug=True) - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + from boxtree.traversal import build_traversal + trav = build_traversal(actx, tree, well_sep_is_n_away=well_sep_is_n_away) - from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=well_sep_is_n_away) - trav, _ = tg(actx.queue, tree) - - tree = tree.get(queue=actx.queue) - trav = trav.get(queue=actx.queue) + tree = actx.to_numpy(tree) + trav = actx.to_numpy(trav) from boxtree.visualization import TreePlotter plotter = TreePlotter(tree) @@ -336,28 +331,20 @@ def test_from_sep_siblings_translation_and_rotation_classes( actx.from_numpy(rng.normal(0.0, 1.0, (nparticles,)).astype(dtype)) for i in range(dims)]) - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + from boxtree import build_tree + tree = build_tree(actx, particles, max_particles_in_box=30, debug=True) # }}} # {{{ build traversal - from boxtree.traversal import 
FMMTraversalBuilder - from boxtree.rotation_classes import RotationClassesBuilder - from boxtree.translation_classes import TranslationClassesBuilder - - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=well_sep_is_n_away) - trav, _ = tg(actx.queue, tree) - - rb = RotationClassesBuilder(actx.context) - result, _ = rb(actx.queue, trav, tree) + from boxtree.traversal import build_traversal + from boxtree.rotation_classes import build_rotation_classes + from boxtree.translation_classes import build_translation_classes - tb = TranslationClassesBuilder(actx.context) - result_tb, _ = tb(actx.queue, trav, tree) + trav = build_traversal(actx, tree, well_sep_is_n_away=well_sep_is_n_away) + result = build_rotation_classes(actx, trav, tree) + result_tb = build_translation_classes(actx, trav, tree) rot_classes = actx.to_numpy( result.from_sep_siblings_rotation_classes) @@ -369,8 +356,8 @@ def test_from_sep_siblings_translation_and_rotation_classes( distance_vectors = actx.to_numpy( result_tb.from_sep_siblings_translation_class_to_distance_vector) - tree = tree.get(queue=actx.queue) - trav = trav.get(queue=actx.queue) + tree = actx.to_numpy(tree) + trav = actx.to_numpy(trav) centers = tree.box_centers.T diff --git a/test/test_tree.py b/test/test_tree.py index d72ed5c5..35aee864 100644 --- a/test/test_tree.py +++ b/test/test_tree.py @@ -28,6 +28,7 @@ from arraycontext import pytest_generate_tests_for_array_contexts from boxtree.array_context import ( # noqa: F401 PytestPyOpenCLArrayContextFactory, _acf) + from boxtree.tools import make_normal_particle_array import logging @@ -47,18 +48,16 @@ def test_bounding_box(actx_factory, dtype, dims, nparticles): actx = actx_factory() from boxtree.tools import AXIS_NAMES - from boxtree.bounding_box import BoundingBoxFinder - bbf = BoundingBoxFinder(actx.context) - + from boxtree.bounding_box import find_bounding_box axis_names = AXIS_NAMES[:dims] logger.info("%s - %s %s", dtype, dims, nparticles) - particles = 
make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) bbox_min = [np.min(actx.to_numpy(x)) for x in particles] bbox_max = [np.max(actx.to_numpy(x)) for x in particles] - bbox_cl, evt = bbf(particles, radii=None) + bbox_cl = find_bounding_box(actx, particles, radii=None) bbox_cl = actx.to_numpy(bbox_cl) bbox_min_cl = np.empty(dims, dtype) @@ -76,7 +75,7 @@ def test_bounding_box(actx_factory, dtype, dims, nparticles): # {{{ test basic (no source/target distinction) tree build -def run_build_test(builder, actx, dims, dtype, nparticles, visualize, +def run_build_test(actx, dims, dtype, nparticles, visualize, max_particles_in_box=None, max_leaf_refine_weight=None, refine_weights=None, **kwargs): dtype = np.dtype(dtype) @@ -101,21 +100,20 @@ def run_build_test(builder, actx, dims, dtype, nparticles, visualize, logger.info(75 * "-") - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt np_particles = actx.to_numpy(particles) pt.plot(np_particles[0], np_particles[1], "x") - actx.queue.finish() - - tree, _ = builder(actx.queue, particles, + from boxtree import build_tree + tree = build_tree(actx, particles, max_particles_in_box=max_particles_in_box, refine_weights=refine_weights, max_leaf_refine_weight=max_leaf_refine_weight, debug=True, **kwargs) - tree = tree.get(queue=actx.queue) + tree = actx.to_numpy(tree) sorted_particles = np.array(list(tree.sources)) @@ -233,10 +231,7 @@ def particle_tree_test_decorator(f): def test_single_box_particle_tree(actx_factory, dtype, dims, visualize=False): actx = actx_factory() - from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) - - run_build_test(builder, actx, dims, + run_build_test(actx, dims, dtype, 4, max_particles_in_box=30, visualize=visualize) @@ -244,10 +239,7 @@ def 
test_single_box_particle_tree(actx_factory, dtype, dims, visualize=False): def test_two_level_particle_tree(actx_factory, dtype, dims, visualize=False): actx = actx_factory() - from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) - - run_build_test(builder, actx, dims, + run_build_test(actx, dims, dtype, 50, max_particles_in_box=30, visualize=visualize) @@ -255,11 +247,8 @@ def test_two_level_particle_tree(actx_factory, dtype, dims, visualize=False): def test_unpruned_particle_tree(actx_factory, dtype, dims, visualize=False): actx = actx_factory() - from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) - # test unpruned tree build - run_build_test(builder, actx, dims, dtype, 10**5, + run_build_test(actx, dims, dtype, 10**5, visualize=visualize, max_particles_in_box=30, skip_prune=True) @@ -268,10 +257,7 @@ def test_particle_tree_with_reallocations( actx_factory, dtype, dims, visualize=False): actx = actx_factory() - from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) - - run_build_test(builder, actx, dims, dtype, 10**5, + run_build_test(actx, dims, dtype, 10**5, max_particles_in_box=30, visualize=visualize, nboxes_guess=5) @@ -280,10 +266,7 @@ def test_particle_tree_with_many_empty_leaves( actx_factory, dtype, dims, visualize=False): actx = actx_factory() - from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) - - run_build_test(builder, actx, dims, dtype, 10**5, + run_build_test(actx, dims, dtype, 10**5, max_particles_in_box=5, visualize=visualize) @@ -291,10 +274,7 @@ def test_particle_tree_with_many_empty_leaves( def test_vanilla_particle_tree(actx_factory, dtype, dims, visualize=False): actx = actx_factory() - from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) - - run_build_test(builder, actx, dims, dtype, 10**5, + run_build_test(actx, dims, dtype, 10**5, max_particles_in_box=30, visualize=visualize) @@ -303,9 +283,6 @@ def test_explicit_refine_weights_particle_tree( 
actx_factory, dtype, dims, visualize=False): actx = actx_factory() - from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) - nparticles = 10**5 rng = np.random.default_rng(10) @@ -313,7 +290,7 @@ def test_explicit_refine_weights_particle_tree( rng.integers(1, 10, (nparticles,), dtype=np.int32) ) - run_build_test(builder, actx, dims, dtype, nparticles, + run_build_test(actx, dims, dtype, nparticles, refine_weights=refine_weights, max_leaf_refine_weight=100, visualize=visualize) @@ -322,10 +299,7 @@ def test_explicit_refine_weights_particle_tree( def test_non_adaptive_particle_tree(actx_factory, dtype, dims, visualize=False): actx = actx_factory() - from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) - - run_build_test(builder, actx, dims, dtype, 10**4, + run_build_test(actx, dims, dtype, 10**4, max_particles_in_box=30, visualize=visualize, kind="non-adaptive") # }}} @@ -342,9 +316,9 @@ def test_source_target_tree(actx_factory, dims, visualize=False): ntargets = 3 * 10**5 dtype = np.float64 - sources = make_normal_particle_array(actx.queue, nsources, dims, dtype, + sources = make_normal_particle_array(actx, nsources, dims, dtype, seed=12) - targets = make_normal_particle_array(actx.queue, ntargets, dims, dtype, + targets = make_normal_particle_array(actx, ntargets, dims, dtype, seed=19) if visualize: @@ -354,13 +328,11 @@ def test_source_target_tree(actx_factory, dims, visualize=False): pt.plot(np_targets[0], np_targets[1], "g+") pt.show() - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, sources, targets=targets, - max_particles_in_box=10, debug=True) - tree = tree.get(queue=actx.queue) + from boxtree import build_tree + tree = build_tree( + actx, sources, + targets=targets, max_particles_in_box=10, debug=True) + tree = actx.to_numpy(tree) sorted_sources = np.array(list(tree.sources)) sorted_targets = np.array(list(tree.targets)) @@ -454,9 +426,9 @@ def 
test_extent_tree(actx_factory, dims, extent_norm, visualize=False): dtype = np.float64 npoint_sources_per_source = 16 - sources = make_normal_particle_array(actx.queue, nsources, dims, dtype, + sources = make_normal_particle_array(actx, nsources, dims, dtype, seed=12) - targets = make_normal_particle_array(actx.queue, ntargets, dims, dtype, + targets = make_normal_particle_array(actx, ntargets, dims, dtype, seed=19) refine_weights = actx.zeros(nsources + ntargets, np.int32) @@ -470,11 +442,8 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False): 2**rng.uniform(-10, 0, (ntargets,)).astype(dtype) ) - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - dev_tree, _ = tb(actx.queue, sources, targets=targets, + from boxtree import build_tree + dev_tree = build_tree(actx, sources, targets=targets, source_radii=source_radii, target_radii=target_radii, extent_norm=extent_norm, @@ -492,7 +461,7 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False): logger.info("transfer tree, check orderings") - tree = dev_tree.get(queue=actx.queue) + tree = actx.to_numpy(dev_tree) if visualize: import matplotlib.pyplot as pt @@ -655,7 +624,7 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False): ) from boxtree.tree import link_point_sources - dev_tree = link_point_sources(actx.queue, dev_tree, + dev_tree = link_point_sources(actx, dev_tree, point_source_starts, point_sources, debug=True) @@ -675,31 +644,26 @@ def test_leaves_to_balls_query(actx_factory, dims, visualize=False): nparticles = 10**5 dtype = np.float64 - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt np_particles = actx.to_numpy(particles) pt.plot(np_particles[0], np_particles[1], "x") - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = 
tb(actx.queue, particles, max_particles_in_box=30, debug=True) + from boxtree import build_tree + tree = build_tree(actx, particles, max_particles_in_box=30, debug=True) nballs = 10**4 - ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype) + ball_centers = make_normal_particle_array(actx, nballs, dims, dtype) ball_radii = 0.1 + actx.zeros(nballs, dtype) - from boxtree.area_query import LeavesToBallsLookupBuilder - lblb = LeavesToBallsLookupBuilder(actx.context) - - lbl, _ = lblb(actx.queue, tree, ball_centers, ball_radii) + from boxtree.area_query import build_leaves_to_balls_lookup + lbl = build_leaves_to_balls_lookup(actx, tree, ball_centers, ball_radii) # get data to host for test - tree = tree.get(queue=actx.queue) - lbl = lbl.get(queue=actx.queue) + tree = actx.to_numpy(tree) + lbl = actx.to_numpy(lbl) ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]).T ball_radii = actx.to_numpy(ball_radii) @@ -731,14 +695,12 @@ def run_area_query_test(actx, tree, ball_centers, ball_radii): """ Performs an area query and checks that the result is as expected. """ - from boxtree.area_query import AreaQueryBuilder - aqb = AreaQueryBuilder(actx.context) - - area_query, _ = aqb(actx.queue, tree, ball_centers, ball_radii) + from boxtree.area_query import build_area_query + area_query = build_area_query(actx, tree, ball_centers, ball_radii) # Get data to host for test. 
- tree = tree.get(queue=actx.queue) - area_query = area_query.get(queue=actx.queue) + tree = actx.to_numpy(tree) + area_query = actx.to_numpy(area_query) ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]).T ball_radii = actx.to_numpy(ball_radii) @@ -777,21 +739,18 @@ def test_area_query(actx_factory, dims, visualize=False): nparticles = 10**5 dtype = np.float64 - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt np_particles = actx.to_numpy(particles) pt.plot(np_particles[0], np_particles[1], "x") - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + from boxtree import build_tree + tree = build_tree(actx, particles, max_particles_in_box=30, debug=True) nballs = 10**4 - ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype) + ball_centers = make_normal_particle_array(actx, nballs, dims, dtype) ball_radii = 0.1 + actx.zeros(nballs, dtype) run_area_query_test(actx, tree, ball_centers, ball_radii) @@ -810,18 +769,15 @@ def test_area_query_balls_outside_bbox(actx_factory, dims, visualize=False): nparticles = 10**4 dtype = np.float64 - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt np_particles = actx.to_numpy(particles) pt.plot(np_particles[0], np_particles[1], "x") - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + from boxtree import build_tree + tree = build_tree(actx, particles, max_particles_in_box=30, debug=True) nballs = 10**4 bbox_min = tree.bounding_box[0].min() @@ -847,25 +803,21 @@ def 
test_area_query_elwise(actx_factory, dims, visualize=False): nparticles = 10**5 dtype = np.float64 - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt np_particles = actx.to_numpy(particles) pt.plot(np_particles[0], np_particles[1], "x") - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + from boxtree import build_tree + tree = build_tree(actx, particles, max_particles_in_box=30, debug=True) nballs = 10**4 - ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype) + ball_centers = make_normal_particle_array(actx, nballs, dims, dtype) ball_radii = 0.1 + actx.zeros(nballs, dtype) - from boxtree.area_query import ( - AreaQueryElementwiseTemplate, PeerListFinder) + from boxtree.area_query import AreaQueryElementwiseTemplate, build_peer_list template = AreaQueryElementwiseTemplate( extra_args=""" @@ -882,21 +834,19 @@ def test_area_query_elwise(actx_factory, dims, visualize=False): """, leaf_found_op="") - peer_lists, evt = PeerListFinder(actx.context)(actx.queue, tree) - + peer_lists = build_peer_list(actx, tree) kernel = template.generate( - actx.context, + actx.queue.context, dims, tree.coord_dtype, tree.box_id_dtype, peer_lists.peer_list_starts.dtype, tree.nlevels) - evt = kernel( + kernel( *template.unwrap_args( tree, peer_lists, ball_radii, *ball_centers), queue=actx.queue, - wait_for=[evt], range=slice(len(ball_radii))) # }}} @@ -916,19 +866,15 @@ def test_level_restriction( dtype = np.float64 from boxtree.tools import make_surface_particle_array - particles = make_surface_particle_array( - actx.queue, nparticles, dims, dtype, seed=15) + particles = make_surface_particle_array(actx, nparticles, dims, dtype, seed=15) if visualize: import matplotlib.pyplot as pt np_particles = 
actx.to_numpy(particles) pt.plot(np_particles[0], np_particles[1], "x") - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree_dev, _ = tb(actx.queue, particles, + from boxtree import build_tree + tree_dev = build_tree(actx, particles, kind="adaptive-level-restricted", max_particles_in_box=30, debug=True, skip_prune=skip_prune, lr_lookbehind=lookbehind, @@ -942,19 +888,18 @@ def find_neighbors(leaf_box_centers, leaf_box_radii): # # Note that since this comes from an area query, the self box will be # included in the neighbor list. - from boxtree.area_query import AreaQueryBuilder - aqb = AreaQueryBuilder(actx.context) + from boxtree.area_query import build_area_query ball_radii = actx.from_numpy(np.min(leaf_box_radii) / 2 + leaf_box_radii) leaf_box_centers = [actx.from_numpy(axis) for axis in leaf_box_centers] - area_query, _ = aqb(actx.queue, tree_dev, leaf_box_centers, ball_radii) - area_query = area_query.get(queue=actx.queue) + area_query = build_area_query(actx, tree_dev, leaf_box_centers, ball_radii) + area_query = actx.to_numpy(area_query) return (area_query.leaves_near_ball_starts, area_query.leaves_near_ball_lists) # Get data to host for test. - tree = tree_dev.get(queue=actx.queue) + tree = actx.to_numpy(tree_dev) # Find leaf boxes. 
from boxtree import box_flags_enum @@ -996,38 +941,32 @@ def test_space_invader_query(actx_factory, dims, dtype, visualize=False): dtype = np.dtype(dtype) nparticles = 10**5 - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt np_particles = actx.to_numpy(particles) pt.plot(np_particles[0], np_particles[1], "x") - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + from boxtree import build_tree + tree = build_tree(actx, particles, max_particles_in_box=30, debug=True) nballs = 10**4 - ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype) + ball_centers = make_normal_particle_array(actx, nballs, dims, dtype) ball_radii = 0.1 + actx.zeros(nballs, dtype) from boxtree.area_query import ( - LeavesToBallsLookupBuilder, SpaceInvaderQueryBuilder) + build_leaves_to_balls_lookup, build_space_invader_query) - siqb = SpaceInvaderQueryBuilder(actx.context) # We can use leaves-to-balls lookup to get the set of overlapping balls for # each box, and from there to compute the outer space invader distance. 
- lblb = LeavesToBallsLookupBuilder(actx.context) - - siq, _ = siqb(actx.queue, tree, ball_centers, ball_radii) - lbl, _ = lblb(actx.queue, tree, ball_centers, ball_radii) + siq = build_space_invader_query(actx, tree, ball_centers, ball_radii) + lbl = build_leaves_to_balls_lookup(actx, tree, ball_centers, ball_radii) # get data to host for test - tree = tree.get(queue=actx.queue) - siq = siq.get(queue=actx.queue) - lbl = lbl.get(queue=actx.queue) + tree = actx.to_numpy(tree) + siq = actx.to_numpy(siq) + lbl = actx.to_numpy(lbl) ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]) ball_radii = actx.to_numpy(ball_radii) @@ -1058,16 +997,14 @@ def test_space_invader_query(actx_factory, dims, dtype, visualize=False): @pytest.mark.opencl @pytest.mark.parametrize("dims", [2, 3]) -def test_same_tree_with_zero_weight_particles(actx_factory, dims): +def test_same_tree_with_zero_weight_particles(actx_factory, dims, visualize=False): actx = actx_factory() ntargets_values = [300, 400, 500] stick_out_factors = [0, 0.1, 0.3, 1] nsources = 20 - from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - + from boxtree import build_tree trees = [] rng = np.random.default_rng(10) @@ -1087,18 +1024,18 @@ def test_same_tree_with_zero_weight_particles(actx_factory, dims): refine_weights[:nsources] = 1 refine_weights[nsources:] = 0 - tree, _ = tb(actx.queue, sources, targets=targets, + tree = build_tree(actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=stick_out_factor, max_leaf_refine_weight=10, refine_weights=refine_weights, debug=True) - tree = tree.get(queue=actx.queue) + tree = actx.to_numpy(tree) trees.append(tree) print("TREE:", tree.nboxes) - if 0: + if visualize: import matplotlib.pyplot as plt for tree in trees: plt.figure() @@ -1114,13 +1051,12 @@ def test_same_tree_with_zero_weight_particles(actx_factory, dims): def test_max_levels_error(actx_factory): actx = actx_factory() - from boxtree import TreeBuilder - tb = 
TreeBuilder(actx.context) - + from boxtree import build_tree sources = [actx.zeros(11, np.float64) for i in range(2)] + from boxtree.tree_build import MaxLevelsExceeded with pytest.raises(MaxLevelsExceeded): - tree, _ = tb(actx.queue, sources, max_particles_in_box=10, debug=True) + build_tree(actx, sources, max_particles_in_box=10, debug=True) # }}}