From ad721258878d8a50ad0ac764b3f05265bbdc1914 Mon Sep 17 00:00:00 2001 From: Max West Date: Tue, 5 Dec 2023 16:09:34 -0800 Subject: [PATCH] initial commit --- src/hipscat_import/catalog/__init__.py | 1 + .../cross_match/macauff_arguments.py | 24 +++ .../cross_match/run_macauff_import.py | 172 +++++++++++++++++- 3 files changed, 196 insertions(+), 1 deletion(-) diff --git a/src/hipscat_import/catalog/__init__.py b/src/hipscat_import/catalog/__init__.py index 1c2e403f..9ff3f268 100644 --- a/src/hipscat_import/catalog/__init__.py +++ b/src/hipscat_import/catalog/__init__.py @@ -3,3 +3,4 @@ from .arguments import ImportArguments from .map_reduce import map_to_pixels, reduce_pixel_shards, split_pixels from .run_import import run +from .resume_plan import ResumePlan diff --git a/src/hipscat_import/cross_match/macauff_arguments.py b/src/hipscat_import/cross_match/macauff_arguments.py index 74283f18..abb10016 100644 --- a/src/hipscat_import/cross_match/macauff_arguments.py +++ b/src/hipscat_import/cross_match/macauff_arguments.py @@ -6,8 +6,10 @@ from hipscat.io import FilePointer from hipscat.io.validation import is_valid_catalog +from hipscat_import.catalog.file_readers import InputReader, get_file_reader from hipscat_import.runtime_arguments import RuntimeArguments, find_input_paths +from hipscat_import.catalog import ResumePlan # pylint: disable=too-many-instance-attributes # pylint: disable=unsupported-binary-operation @@ -48,6 +50,16 @@ class MacauffArguments(RuntimeArguments): match_probability_columns: List[str] = field(default_factory=list) column_names: List[str] = field(default_factory=list) + resume_plan : ResumePlan | None = None + + file_reader: InputReader | None = None + """instance of input reader that specifies arguments necessary for reading + from your input files""" + + constant_healpix_order: int = 10 + highest_healpix_order: int = 10 + mapping_healpix_order: int = 10 + def __post_init__(self): self._check_arguments() @@ -91,6 +103,18 @@ def 
from tqdm import tqdm

import healpy as hp
import numpy as np
from hipscat import pixel_math
from hipscat.catalog import Catalog
from hipscat.pixel_tree import PixelAlignment, align_trees
from hipscat.pixel_tree.pixel_alignment_types import PixelAlignmentType
from hipscat.pixel_tree.pixel_tree_builder import PixelTreeBuilder

import hipscat_import.catalog.map_reduce as mr
from hipscat_import.catalog import ResumePlan
from hipscat_import.cross_match.macauff_arguments import MacauffArguments
from hipscat_import.pipeline_resume_plan import PipelineResumePlan

# pylint: disable=unused-argument


def _map_pixels(args, client):
    """Generate a raw histogram of object counts in each healpix pixel.

    Submits one ``map_to_pixels`` task per input file to the dask client;
    the stage is resumable and is skipped entirely if ``args.resume_plan``
    reports mapping already done.
    """
    if args.resume_plan.is_mapping_done():
        return

    # scatter the reader once so every task shares the same future
    reader_future = client.scatter(args.file_reader)
    futures = []
    for key, file_path in args.resume_plan.map_files:
        futures.append(
            client.submit(
                mr.map_to_pixels,
                key=key,
                input_file=file_path,
                resume_path=args.resume_plan.tmp_path,
                file_reader=reader_future,
                mapping_key=key,
                highest_order=args.mapping_healpix_order,
                ra_column=args.left_ra_column,
                dec_column=args.left_dec_column,
                use_hipscat_index=False,
            )
        )
    args.resume_plan.wait_for_mapping(futures)


def _split_pixels(args, alignment_future, client):
    """Split each input file into shard files, one per destination healpix pixel.

    (Fixed: the original docstring was a copy-paste of the mapping stage's
    "generate a raw histogram" description.)
    """
    if args.resume_plan.is_splitting_done():
        return

    reader_future = client.scatter(args.file_reader)
    futures = []
    for key, file_path in args.resume_plan.split_keys:
        futures.append(
            client.submit(
                mr.split_pixels,
                key=key,
                input_file=file_path,
                file_reader=reader_future,
                highest_order=args.mapping_healpix_order,
                ra_column=args.left_ra_column,
                dec_column=args.left_dec_column,
                splitting_key=key,
                cache_shard_path=args.tmp_path,
                resume_path=args.resume_plan.tmp_path,
                alignment=alignment_future,
                use_hipscat_index=False,
            )
        )

    args.resume_plan.wait_for_splitting(futures)


def _reduce_pixels(args, destination_pixel_map, client):
    """Loop over destination pixels and merge their shard files into parquet files."""
    if args.resume_plan.is_reducing_done():
        return

    futures = []
    for (
        destination_pixel,
        source_pixels,
        destination_pixel_key,
    ) in args.resume_plan.get_reduce_items(destination_pixel_map):
        futures.append(
            client.submit(
                mr.reduce_pixel_shards,
                key=destination_pixel_key,
                cache_shard_path=args.tmp_path,
                resume_path=args.resume_plan.tmp_path,
                reducing_key=destination_pixel_key,
                destination_pixel_order=destination_pixel.order,
                destination_pixel_number=destination_pixel.pixel,
                # source_pixels is the (row_sum, origin_pixels) tuple built in
                # run(), so [0] is the destination pixel's total object count
                destination_pixel_size=source_pixels[0],
                output_path=args.catalog_path,
                ra_column=args.left_ra_column,
                dec_column=args.left_dec_column,
                id_column=args.left_id_column,
                add_hipscat_index=False,
                use_schema_file=False,
                use_hipscat_index=False,
            )
        )

    args.resume_plan.wait_for_reducing(futures)


def run(args, client):
    """run macauff cross-match import pipeline

    Map/split/reduce the cross-match data, co-partitioned on the left
    catalog's healpix pixels.

    Args:
        args (MacauffArguments): pipeline arguments (validated below)
        client: dask distributed client used to submit stage tasks
    Raises:
        TypeError: if args is missing or not a MacauffArguments
    """
    # NOTE(review): the patch's hunk context omits original lines 8-9 of this
    # file (unchanged); only the isinstance guard is visible — confirm upstream.
    if not isinstance(args, MacauffArguments):
        raise TypeError("args must be type MacauffArguments")

    # MAP: co-partition using the left catalog's existing partitioning.
    # TODO: this might be better to read in from `MacauffArguments`.
    left_catalog = Catalog.read_from_hipscat(args.left_catalog_dir)
    left_pixel_tree = left_catalog.pixel_tree
    left_pixels = left_catalog.partition_info.get_healpix_pixels()

    # histogram the cross-match rows at a constant healpix order
    _map_pixels(args, client)

    with tqdm(
        total=2, desc=PipelineResumePlan.get_formatted_stage_name("Binning"), disable=False
    ) as step_progress:
        raw_histogram = args.resume_plan.read_histogram(args.constant_healpix_order)
        step_progress.update(1)

        # build a pixel tree from every non-empty histogram bin
        nonzero_pix = list(np.nonzero(raw_histogram)[0])
        nonzero_pixels = [
            pixel_math.HealpixPixel(args.constant_healpix_order, pix) for pix in nonzero_pix
        ]
        cross_match_pixel_tree = PixelTreeBuilder.from_healpix(healpix_pixels=nonzero_pixels)
        step_progress.update(1)

    alignment = align_trees(
        left=left_pixel_tree,
        right=cross_match_pixel_tree,
        alignment_type=PixelAlignmentType.INNER,
    )

    # Build the per-source-pixel alignment array AND the destination pixel map
    # in a single pass (the original filtered the mapping DataFrame twice per
    # left pixel, once for each structure).
    # TODO: make this a utility function in hipscat
    final_alignment = np.full(hp.order2npix(args.mapping_healpix_order), None)
    destination_pixel_map = {}
    mapping = alignment.pixel_mapping
    for pixel in left_pixels:
        overlap = mapping.loc[
            (mapping[alignment.PRIMARY_ORDER_COLUMN_NAME] == pixel.order)
            & (mapping[alignment.PRIMARY_PIXEL_COLUMN_NAME] == pixel.pixel)
        ]
        origin_pixels = np.array(overlap[alignment.JOIN_PIXEL_COLUMN_NAME].values).astype(int)
        row_sum = np.sum(raw_histogram[origin_pixels])
        if row_sum > 0:
            # Assign tuples element-wise: fancy-index assignment of a list of
            # N 3-tuples coerces the RHS to a (N, 3) array and fails to
            # broadcast into N object slots.
            for origin_pixel in origin_pixels:
                final_alignment[origin_pixel] = (pixel.order, pixel.pixel, row_sum)
            destination_pixel_map[pixel] = (row_sum, origin_pixels)

    alignment_future = client.scatter(final_alignment, broadcast=True)
    _split_pixels(args, alignment_future, client)
    _reduce_pixels(args, destination_pixel_map, client)