Production instset #824

Closed · wants to merge 37 commits

Commits (37, changes shown from all commits):
- c3e4712 CHERRY-PICK: Loop Fusion (kaushikcfd, Mar 10, 2022)
- ce04c77 CHERRY-PICK: Makes the compute insn's deps. precise (kaushikcfd, Nov 14, 2021)
- 73f6f6d CHERRY-PICK: Loechner Reindexing (kaushikcfd, Aug 16, 2022)
- ea4eec9 CHERRY PICK: PyCUDA Target (kaushikcfd, Oct 20, 2022)
- 72604c0 CHERRY-PICK: Add transformations specific to sum-reduction (kaushikcfd, Dec 19, 2022)
- eb63d9c CHERRY-PICK: avoid introducing dependency record when there are no ra… (kaushikcfd, Jan 28, 2023)
- 07f9ab1 CHERRY-PICK: memoize expand_subst (kaushikcfd, Feb 7, 2023)
- 05fc08c CHERRY-PICK: set usable inames for conditional as a subset of inames … (kaushikcfd, Feb 20, 2023)
- af08ba8 CHERRY-PICK: introduce lp.decouple_domain (kaushikcfd, Feb 25, 2023)
- 68405f1 Merge upstream changes (MTCam, Jun 27, 2023)
- e6bd5bb Merge branch 'main' into production-pilot (MTCam, Jun 30, 2023)
- 645975d Merge branch 'main' into production (MTCam, Jun 30, 2023)
- 1ef0684 Merge branch 'production' into production-pilot (MTCam, Jul 1, 2023)
- 74f928d Merge branch 'main' into production-pilot (MTCam, Jul 1, 2023)
- 31581ef Merge branch 'main' into updt (MTCam, Jul 17, 2023)
- 4ef8a69 Merge branch 'updt' into production-pilot (MTCam, Jul 17, 2023)
- c72d610 Update to inducer@main (MTCam, Jul 25, 2023)
- f78b33c Merge branch 'main' into production-pilot (MTCam, Jul 27, 2023)
- 0a22f97 Merge branch 'main' into production-pilot (MTCam, Jul 28, 2023)
- b95ba1d Merge branch 'main' into production-pilot (MTCam, Jul 31, 2023)
- 7d34c19 Merge branch 'main' into production-pilot (MTCam, Aug 1, 2023)
- a40364e Merge branch 'main' into production-pilot (MTCam, Aug 14, 2023)
- ba21eae Merge branch 'main' into production-pilot (MTCam, Aug 21, 2023)
- 2a834cf Merge branch 'main' into production-pilot (MTCam, Aug 28, 2023)
- 7ed631f Merge branch 'main' into production-pilot (MTCam, Sep 12, 2023)
- 020625c Merge branch 'main' into production-pilot (MTCam, Sep 25, 2023)
- 396c93e add special case to work around length-1 loop index removal in meshmo… (majosm, Oct 4, 2023)
- d6895db Merge branch 'main' into production-pilot (MTCam, Oct 26, 2023)
- 75f5437 Merge remote-tracking branch 'matt/iname-removal-special-case' into p… (MTCam, Oct 26, 2023)
- 03cffeb Merge branch 'main' into production-pilot (MTCam, Nov 1, 2023)
- 46a82b1 Merge remote-tracking branch 'origin/main' into production-pilot (majosm, Nov 2, 2023)
- 20b43ee Merge branch 'main' into production-pilot (MTCam, Nov 8, 2023)
- 8d7c2e0 Merge branch 'main' into production-pilot (MTCam, Dec 2, 2023)
- 6c92c57 Merge branch 'main' into production-pilot (MTCam, Jan 22, 2024)
- d47339e Merge branch 'main' into production-pilot (MTCam, Jan 29, 2024)
- d873e95 Merge branch 'main' into production-pilot (MTCam, Feb 1, 2024)
- b3d68a0 change isinstance(..., frozenset) to isinstace(..., Set) (matthiasdiener, Feb 12, 2024)

14 changes: 14 additions & 0 deletions .github/workflows/ci.yml
@@ -104,6 +104,20 @@ jobs:
( test_py_project )
( test_py_project )

pytest_with_barvinok:
name: Conda Pytest with Barvinok
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: "Main Script"
run: |
CONDA_ENVIRONMENT=.test-conda-env-py3.yml
echo "- barvinok" >> "$CONDA_ENVIRONMENT"
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
build_py_project_in_conda_env
test_py_project

examples:
name: Conda Examples
runs-on: ubuntu-latest
1 change: 1 addition & 0 deletions .gitlab-ci.yml
@@ -22,6 +22,7 @@ Pytest Nvidia Titan V:
- source /opt/enable-intel-cl.sh
- curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"

tags:
- python3
- nvidia-titan-v
7 changes: 7 additions & 0 deletions doc/misc.rst
@@ -456,6 +456,13 @@ Here's a Bibtex entry for your convenience::
doi = "{10.1145/2627373.2627387}",
}

References
==========

.. [Seghir_2006] Seghir and Loechner, Proceedings of the 2006 International
Conference on Compilers, Architecture and Synthesis for Embedded Systems,
`(DOI) <https://doi.org/10.1145/1176760.1176771>`__

Getting help
============

15 changes: 15 additions & 0 deletions doc/ref_transform.rst
@@ -54,6 +54,8 @@ Influencing data access

.. autofunction:: allocate_temporaries_for_base_storage

.. automodule:: loopy.transform.reindex

Padding Data
------------

@@ -80,6 +82,13 @@ Manipulating Instructions

.. autofunction:: add_barrier

Manipulating Reductions
-----------------------

.. autofunction:: hoist_invariant_multiplicative_terms_in_sum_reduction

.. autofunction:: extract_multiplicative_terms_in_sum_reduction_as_subst

Registering Library Routines
----------------------------

@@ -143,4 +152,10 @@ TODO: Matching instruction tags

.. automodule:: loopy.match


Fusing Loops
------------

.. automodule:: loopy.transform.loop_fusion

.. vim: tw=75:spell
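
To make the newly documented loop-fusion entry points easier to picture, here is a minimal, non-authoritative Python sketch. The kernel, the iname names, and the exact call signatures are assumptions made for illustration; they are not taken from this PR.

import loopy as lp

# Two independent loops over the same range: the textbook fusion candidate.
t_unit = lp.make_kernel(
    "{[i, j]: 0 <= i, j < n}",
    """
    out1[i] = 2*a[i]
    out2[j] = 3*a[j]
    """)
knl = t_unit.default_entrypoint

# Hypothetical invocation (signatures assumed): compute Kennedy-style
# unweighted fusion batches for the candidate inames, then rename each
# batch of fusable inames to a single new iname.
batches = lp.get_kennedy_unweighted_fusion_candidates(knl, frozenset({"i", "j"}))
knl = lp.rename_inames_in_batch(knl, batches)
print(knl)
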
27 changes: 12 additions & 15 deletions doc/tutorial.rst
@@ -612,7 +612,7 @@ commonly called 'loop tiling':
... assumptions="n mod 16 = 0 and n >= 1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.split_iname(knl, "j", 16)
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner")
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
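
For context, the tiling sequence above can be reproduced standalone as follows; this is a sketch assuming a transpose-style kernel like the tutorial's (the array names and dtype are made up), with the corrected four-iname loop priority.

import numpy as np
import loopy as lp

knl = lp.make_kernel(
    "{[i, j]: 0 <= i, j < n}",
    "out[j, i] = a[i, j]",
    assumptions="n mod 16 = 0 and n >= 1")
knl = lp.split_iname(knl, "i", 16)
knl = lp.split_iname(knl, "j", 16)
knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner")
knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
print(lp.generate_code_v2(knl).device_code())
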
@@ -1031,8 +1031,8 @@ transformation exists in :func:`loopy.add_prefetch`:
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
acc_k = 0.0f;
a_fetch = a[16 * gid(0) + lid(0)];
acc_k = 0.0f;
for (int k = 0; k <= 15; ++k)
acc_k = acc_k + a_fetch;
out[16 * gid(0) + lid(0)] = acc_k;
@@ -1055,10 +1055,9 @@ earlier:
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
acc_k = 0.0f;
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
a_fetch[lid(0)] = a[16 * gid(0) + lid(0)];

if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
{
for (int k = 0; k <= 15; ++k)
@@ -1909,18 +1908,16 @@ Now to make things more interesting, we'll create a kernel with barriers:
{
__local int c[50 * 10 * 99];
<BLANKLINE>
{
int const k_outer = 0;
<BLANKLINE>
for (int i = 0; i <= 49; ++i)
for (int j = 0; j <= 9; ++j)
for (int i = 0; i <= 49; ++i)
{
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
}
}
{
int const k_outer = 0;
<BLANKLINE>
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
}
}

In this kernel, when a work-item performs the second instruction it uses data
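
As a side note on the barrier placement shown in this hunk, here is a toy kernel (my own sketch, not the tutorial's kernel; the names, sizes, and the local-memory choice are assumptions) of the pattern for which loopy emits CLK_LOCAL_MEM_FENCE barriers: a local temporary written by one instruction and read at a shifted index by another inside a work-item-parallel loop.

import numpy as np
import loopy as lp

knl = lp.make_kernel(
    "{[i]: 0 <= i < 16}",
    """
    c[i] = 2*a[i]              {id=write_c}
    out[i] = c[(i + 1) % 16]   {id=read_c, dep=write_c}
    """,
    target=lp.OpenCLTarget())
knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
knl = lp.tag_inames(knl, {"i": "l.0"})
knl = lp.set_temporary_address_space(knl, "c", "local")
# The cross-lane read of the local temporary forces a barrier between
# write_c and read_c in the generated device code.
print(lp.generate_code_v2(knl).device_code())
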
29 changes: 28 additions & 1 deletion loopy/__init__.py
@@ -108,6 +108,8 @@
from loopy.transform.fusion import fuse_kernels
from loopy.transform.concatenate import concatenate_arrays

from loopy.transform.reindex import reindex_temporary_using_seghir_loechner_scheme

from loopy.transform.arithmetic import (
fold_constants,
collect_common_factors_on_increment)
@@ -123,6 +125,10 @@
from loopy.transform.parameter import assume, fix_parameters
from loopy.transform.save import save_and_reload_temporaries
from loopy.transform.add_barrier import add_barrier
from loopy.transform.reduction import (
hoist_invariant_multiplicative_terms_in_sum_reduction,
extract_multiplicative_terms_in_sum_reduction_as_subst)
from loopy.transform.domain import decouple_domain
from loopy.transform.callable import (register_callable,
merge, inline_callable_kernel, rename_callable)
from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call
@@ -159,11 +165,14 @@
from loopy.target.cuda import CudaTarget
from loopy.target.opencl import OpenCLTarget
from loopy.target.pyopencl import PyOpenCLTarget
from loopy.target.pycuda import PyCudaTarget, PyCudaWithPackedArgsTarget
from loopy.target.ispc import ISPCTarget

from loopy.tools import (Optional, t_unit_to_python, memoize_on_disk,
clear_in_mem_caches)

from loopy.transform.loop_fusion import (get_kennedy_unweighted_fusion_candidates,
rename_inames_in_batch)
from loopy.target.execution import ExecutorBase


@@ -241,6 +250,8 @@

"fold_constants", "collect_common_factors_on_increment",

"reindex_temporary_using_seghir_loechner_scheme",

"split_array_axis", "split_array_dim", "split_arg_axis",
"find_padding_multiple", "add_padding",

@@ -255,13 +266,20 @@

"add_barrier",

"hoist_invariant_multiplicative_terms_in_sum_reduction",
"extract_multiplicative_terms_in_sum_reduction_as_subst",
"decouple_domain",

"register_callable",
"merge",

"inline_callable_kernel", "rename_callable",

"pack_and_unpack_args_for_call",

"rename_inames_in_batch",
"get_kennedy_unweighted_fusion_candidates",

# }}}

"get_dot_dependency_graph",
@@ -309,7 +327,7 @@
"CWithGNULibcTarget", "ExecutableCWithGNULibcTarget",
"CudaTarget", "OpenCLTarget",
"PyOpenCLTarget", "ISPCTarget",
"ASTBuilderBase",
"PyCudaTarget", "PyCudaWithPackedArgsTarget", "ASTBuilderBase",

"Optional", "memoize_on_disk", "clear_in_mem_caches",

@@ -328,6 +346,15 @@
# }}}
]


try:
import loopy.relations as relations
except ImportError:
# catching ImportErrors to avoid making minikanren a hard-dep
pass
else:
__all__ += ["relations"]

# }}}


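
A quick smoke test (a sketch; it assumes this branch is installed) that the names newly re-exported from the top-level loopy namespace in this file are actually reachable:

import loopy as lp

new_exports = [
    "reindex_temporary_using_seghir_loechner_scheme",
    "hoist_invariant_multiplicative_terms_in_sum_reduction",
    "extract_multiplicative_terms_in_sum_reduction_as_subst",
    "decouple_domain",
    "get_kennedy_unweighted_fusion_candidates",
    "rename_inames_in_batch",
    "PyCudaTarget",
    "PyCudaWithPackedArgsTarget",
]
for name in new_exports:
    assert hasattr(lp, name), f"missing export: {name}"
print("all new exports present")
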
15 changes: 15 additions & 0 deletions loopy/codegen/loop.py
@@ -23,10 +23,12 @@

from loopy.diagnostic import warn, LoopyError
from loopy.codegen.result import merge_codegen_results
from loopy.kernel import LoopKernel
import islpy as isl
from islpy import dim_type
from loopy.codegen.control import build_loop_nest
from pymbolic.mapper.stringifier import PREC_NONE
from typing import FrozenSet


# {{{ conditional-reducing slab decomposition
@@ -343,6 +345,16 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,

# {{{ sequential loop

def _get_intersecting_inames(kernel: LoopKernel, iname: str) -> FrozenSet[str]:
from functools import reduce
return reduce(frozenset.union,
((kernel.id_to_insn[insn].within_inames
| kernel.id_to_insn[insn].reduction_inames()
| kernel.id_to_insn[insn].sub_array_ref_inames())
for insn in kernel.iname_to_insns()[iname]),
frozenset())


def generate_sequential_loop_dim_code(codegen_state, sched_index, hints):
kernel = codegen_state.kernel

@@ -357,6 +369,9 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index, hints):
usable_inames = get_usable_inames_for_conditional(kernel, sched_index,
codegen_state.codegen_cachemanager)

# get rid of disjoint loop nests, see <www.github.com/inducer/loopy/issues/724>
usable_inames = usable_inames & _get_intersecting_inames(kernel, loop_iname)

domain = kernel.get_inames_domain(loop_iname)

result = []
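
For readers skimming the helper added above, a plain-Python toy (the instruction names and iname sets are made up) of the reduce/frozenset.union pattern that _get_intersecting_inames uses to collect every iname that shares an instruction with the loop iname:

from functools import reduce

# Hypothetical bookkeeping: the inames each instruction sits inside, and
# the instructions that lie within the loop iname "i".
insn_to_inames = {
    "insn_a": frozenset({"i", "j"}),
    "insn_b": frozenset({"i", "k"}),
    "insn_c": frozenset({"m"}),   # never shares an instruction with "i"
}
iname_to_insns = {"i": ["insn_a", "insn_b"]}

intersecting = reduce(
    frozenset.union,
    (insn_to_inames[insn] for insn in iname_to_insns["i"]),
    frozenset())
print(sorted(intersecting))  # ['i', 'j', 'k']
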
15 changes: 8 additions & 7 deletions loopy/kernel/instruction.py
@@ -31,6 +31,7 @@

from loopy.diagnostic import LoopyError
from loopy.tools import Optional
from collections.abc import Set as abc_Set


# {{{ instruction tags
@@ -186,7 +187,7 @@ class InstructionBase(ImmutableRecord, Taggable):
A :class:`frozenset` of subclasses of :class:`pytools.tag.Tag` used to
provide metadata on this object. Legacy string tags are converted to
:class:`LegacyStringInstructionTag` or, if they used to carry
a functional meaning, the tag carrying that same fucntional meaning
a functional meaning, the tag carrying that same functional meaning
(e.g. :class:`UseStreamingStoreTag`).

.. automethod:: __init__
@@ -267,7 +268,7 @@ def __init__(self, id, depends_on, depends_on_is_final,
if depends_on_is_final is None:
depends_on_is_final = False

if depends_on_is_final and not isinstance(depends_on, frozenset):
if depends_on_is_final and not isinstance(depends_on, abc_Set):
raise LoopyError("Setting depends_on_is_final to True requires "
"actually specifying depends_on")

@@ -277,7 +278,7 @@ def __init__(self, id, depends_on, depends_on_is_final,
if priority is None:
priority = 0

if not isinstance(tags, frozenset):
if not isinstance(tags, abc_Set):
# was previously allowed to be tuple
tags = frozenset(tags)

@@ -292,10 +293,10 @@ def __init__(self, id, depends_on, depends_on_is_final,
# assert all(is_interned(iname) for iname in within_inames)
# assert all(is_interned(pred) for pred in predicates)

assert isinstance(within_inames, frozenset)
assert isinstance(depends_on, frozenset) or depends_on is None
assert isinstance(groups, frozenset)
assert isinstance(conflicts_with_groups, frozenset)
assert isinstance(within_inames, abc_Set)
assert isinstance(depends_on, abc_Set) or depends_on is None
assert isinstance(groups, abc_Set)
assert isinstance(conflicts_with_groups, abc_Set)

ImmutableRecord.__init__(self,
id=id,
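
To make the intent of the frozenset-to-Set relaxation concrete, a small plain-Python illustration (my own, not part of the PR): the ABC check still accepts frozenset but also admits other set-like objects, such as plain sets and dict key views.

from collections.abc import Set as abc_Set

candidates = [frozenset({"insn_a"}), {"insn_a"}, {"insn_a": 1}.keys()]
for obj in candidates:
    print(f"{type(obj).__name__:>9}: "
          f"frozenset check={isinstance(obj, frozenset)}, "
          f"Set ABC check={isinstance(obj, abc_Set)}")
# frozenset passes both checks; set and dict_keys pass only the ABC check.
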
60 changes: 60 additions & 0 deletions loopy/kernel/tools.py
@@ -2114,6 +2114,66 @@ def get_outer_params(domains):

# }}}

# {{{ get access map from an instruction


class _IndexCollector(CombineMapper):
def __init__(self, var):
self.var = var
super().__init__()

def combine(self, values):
import operator
return reduce(operator.or_, values, frozenset())

def map_subscript(self, expr):
if expr.aggregate.name == self.var:
return (super().map_subscript(expr) | frozenset([expr.index_tuple]))
else:
return super().map_subscript(expr)

def map_algebraic_leaf(self, expr):
return frozenset()

map_constant = map_algebraic_leaf


def _project_out_inames_from_maps(amaps, inames_to_project_out):
new_amaps = []
for amap in amaps:
for iname in inames_to_project_out:
dt, pos = amap.get_var_dict()[iname]
amap = amap.project_out(dt, pos, 1)

new_amaps.append(amap)

return new_amaps


def _union_amaps(amaps):
import islpy as isl
return reduce(isl.Map.union, amaps[1:], amaps[0])


def get_insn_access_map(kernel, insn_id, var):
from loopy.transform.subst import expand_subst
from loopy.symbolic import get_access_map

insn = kernel.id_to_insn[insn_id]

kernel = expand_subst(kernel)
indices = list(_IndexCollector(var)((insn.expression,
insn.assignees,
tuple(insn.predicates))))

amaps = [get_access_map(kernel.get_inames_domain(insn.within_inames),
idx, kernel.assumptions)
for idx in indices]

return _union_amaps(amaps)

# }}}


def get_hw_axis_base_for_codegen(kernel: LoopKernel, iname: str) -> isl.Aff:
"""
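
Finally, a hedged sketch of how the new get_insn_access_map helper might be exercised. The kernel, the instruction id "write_out", and the use of default_entrypoint are illustrative assumptions, not code from this PR.

import loopy as lp
from loopy.kernel.tools import get_insn_access_map

t_unit = lp.make_kernel(
    "{[i]: 0 <= i < n}",
    "out[i] = 2*a[i + 1]  {id=write_out}")

# Access map of instruction "write_out" onto array "a": it should relate
# the loop domain {[i]: 0 <= i < n} to the accessed indices i + 1.
amap = get_insn_access_map(t_unit.default_entrypoint, "write_out", "a")
print(amap)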