diff --git a/src/hipscat_import/catalog/arguments.py b/src/hipscat_import/catalog/arguments.py index 81711bcd..4cab57a6 100644 --- a/src/hipscat_import/catalog/arguments.py +++ b/src/hipscat_import/catalog/arguments.py @@ -57,16 +57,21 @@ class ImportArguments(RuntimeArguments): a positive number, this will be the order of all final pixels and we will not combine pixels according to the threshold""" lowest_healpix_order: int = 0 - """the lowest possible healpix order that we will use for the final - catalog partitioning. setting this higher than 0 will prevent creating + """when determining bins for the final partitioning, the lowest possible healpix order + for resulting pixels. setting this higher than 0 will prevent creating partitions with a large area on the sky.""" - highest_healpix_order: int = 7 + highest_healpix_order: int = 10 """healpix order to use when mapping. this will not necessarily be the order used in the final catalog, as we may combine pixels that don't meed the threshold""" pixel_threshold: int = 1_000_000 - """maximum number of rows for a single resulting pixel. - we may combine hierarchically until we near the ``pixel_threshold``""" + """when determining bins for the final partitioning, the maximum number + of rows for a single resulting pixel. we may combine hierarchically until + we near the ``pixel_threshold``""" + drop_empty_siblings: bool = False + """when determining bins for the final partitioning, whether to keep a result pixel + at a higher order (smaller area) when its 3 sibling pixels are empty. setting this to + False will result in the same number of result pixels, but they may differ in Norder""" mapping_healpix_order: int = -1 """healpix order to use when mapping. 
will be ``highest_healpix_order`` unless a positive value is provided for diff --git a/src/hipscat_import/catalog/resume_plan.py b/src/hipscat_import/catalog/resume_plan.py index 467d0177..0b16398d 100644 --- a/src/hipscat_import/catalog/resume_plan.py +++ b/src/hipscat_import/catalog/resume_plan.py @@ -204,6 +204,7 @@ def get_alignment_file( highest_healpix_order, lowest_healpix_order, pixel_threshold, + drop_empty_siblings, expected_total_rows, ) -> str: """Get a pointer to the existing alignment file for the pipeline, or @@ -218,6 +219,7 @@ def get_alignment_file( lowest_healpix_order (int): the lowest healpix order (e.g. 1-5). specifying a lowest order constrains the partitioning to prevent spatially large pixels. threshold (int): the maximum number of objects allowed in a single pixel + drop_empty_siblings (bool): if 3 of 4 pixels are empty, keep only the non-empty pixel expected_total_rows (int): number of expected rows found in the dataset. Returns: @@ -239,6 +241,7 @@ def get_alignment_file( highest_order=highest_healpix_order, lowest_order=lowest_healpix_order, threshold=pixel_threshold, + drop_empty_siblings=drop_empty_siblings, ) with open(file_name, "wb") as pickle_file: pickle.dump(alignment, pickle_file) diff --git a/src/hipscat_import/catalog/run_import.py b/src/hipscat_import/catalog/run_import.py index dd6aa72d..bca978d4 100644 --- a/src/hipscat_import/catalog/run_import.py +++ b/src/hipscat_import/catalog/run_import.py @@ -60,6 +60,7 @@ def run(args, client): args.highest_healpix_order, args.lowest_healpix_order, args.pixel_threshold, + args.drop_empty_siblings, total_rows, ) diff --git a/tests/hipscat_import/catalog/test_map_reduce.py b/tests/hipscat_import/catalog/test_map_reduce.py index 36158010..0cc67580 100644 --- a/tests/hipscat_import/catalog/test_map_reduce.py +++ b/tests/hipscat_import/catalog/test_map_reduce.py @@ -286,7 +286,7 @@ def test_split_pixels_headers(formats_headers_csv, assert_parquet_file_ids, tmp_ plan = 
ResumePlan(tmp_path=tmp_path, progress_bar=False, input_paths=["foo1"]) raw_histogram = np.full(12, 0) raw_histogram[11] = 131 - alignment_file = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, 131) + alignment_file = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, False, 131) mr.split_pixels( input_file=formats_headers_csv, pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), diff --git a/tests/hipscat_import/catalog/test_resume_plan.py b/tests/hipscat_import/catalog/test_resume_plan.py index d7015928..0fbf3e00 100644 --- a/tests/hipscat_import/catalog/test_resume_plan.py +++ b/tests/hipscat_import/catalog/test_resume_plan.py @@ -115,14 +115,14 @@ def test_get_alignment_file(tmp_path): plan = ResumePlan(tmp_path=tmp_path, progress_bar=False, input_paths=["foo1"]) raw_histogram = np.full(12, 0) raw_histogram[11] = 131 - alignment_file = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, 131) + alignment_file = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, True, 131) - alignment_file2 = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, 131) + alignment_file2 = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, True, 131) assert alignment_file == alignment_file2 with pytest.raises(ValueError, match="does not match expectation"): - plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, 130) + plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, True, 130) def never_fails(): diff --git a/tests/hipscat_import/catalog/test_run_round_trip.py b/tests/hipscat_import/catalog/test_run_round_trip.py index d80b3619..3680d34e 100644 --- a/tests/hipscat_import/catalog/test_run_round_trip.py +++ b/tests/hipscat_import/catalog/test_run_round_trip.py @@ -270,6 +270,7 @@ def test_import_keep_intermediate_files( tmp_dir=temp, dask_tmp=temp, progress_bar=False, + highest_healpix_order=2, delete_intermediate_parquet_files=False, delete_resume_log_files=False, )