Skip to content

Commit

Permalink
Provide drop_empty_siblings argument (#357)
Browse files Browse the repository at this point in the history
  • Loading branch information
delucchi-cmu authored Jul 31, 2024
1 parent 8a34d62 commit ee02e32
Show file tree
Hide file tree
Showing 6 changed files with 19 additions and 9 deletions.
15 changes: 10 additions & 5 deletions src/hipscat_import/catalog/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,21 @@ class ImportArguments(RuntimeArguments):
a positive number, this will be the order of all final pixels and we
will not combine pixels according to the threshold"""
lowest_healpix_order: int = 0
"""the lowest possible healpix order that we will use for the final
catalog partitioning. setting this higher than 0 will prevent creating
"""when determining bins for the final partitioning, the lowest possible healpix order
for resulting pixels. setting this higher than 0 will prevent creating
partitions with a large area on the sky."""
highest_healpix_order: int = 7
highest_healpix_order: int = 10
"""healpix order to use when mapping. this will
not necessarily be the order used in the final catalog, as we may combine
pixels that don't meet the threshold"""
pixel_threshold: int = 1_000_000
"""maximum number of rows for a single resulting pixel.
we may combine hierarchically until we near the ``pixel_threshold``"""
"""when determining bins for the final partitioning, the maximum number
of rows for a single resulting pixel. we may combine hierarchically until
we near the ``pixel_threshold``"""
drop_empty_siblings: bool = False
"""when determining bins for the final partitioning, should we keep result pixels
at a higher order (smaller area) if the 3 sibling pixels are empty. setting this to
False will result in the same number of result pixels, but they may differ in Norder"""
mapping_healpix_order: int = -1
"""healpix order to use when mapping. will be
``highest_healpix_order`` unless a positive value is provided for
Expand Down
3 changes: 3 additions & 0 deletions src/hipscat_import/catalog/resume_plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ def get_alignment_file(
highest_healpix_order,
lowest_healpix_order,
pixel_threshold,
drop_empty_siblings,
expected_total_rows,
) -> str:
"""Get a pointer to the existing alignment file for the pipeline, or
Expand All @@ -218,6 +219,7 @@ def get_alignment_file(
lowest_healpix_order (int): the lowest healpix order (e.g. 1-5). specifying a lowest order
constrains the partitioning to prevent spatially large pixels.
pixel_threshold (int): the maximum number of objects allowed in a single pixel
drop_empty_siblings (bool): if 3 of 4 pixels are empty, keep only the non-empty pixel
expected_total_rows (int): number of expected rows found in the dataset.
Returns:
Expand All @@ -239,6 +241,7 @@ def get_alignment_file(
highest_order=highest_healpix_order,
lowest_order=lowest_healpix_order,
threshold=pixel_threshold,
drop_empty_siblings=drop_empty_siblings,
)
with open(file_name, "wb") as pickle_file:
pickle.dump(alignment, pickle_file)
Expand Down
1 change: 1 addition & 0 deletions src/hipscat_import/catalog/run_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def run(args, client):
args.highest_healpix_order,
args.lowest_healpix_order,
args.pixel_threshold,
args.drop_empty_siblings,
total_rows,
)

Expand Down
2 changes: 1 addition & 1 deletion tests/hipscat_import/catalog/test_map_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def test_split_pixels_headers(formats_headers_csv, assert_parquet_file_ids, tmp_
plan = ResumePlan(tmp_path=tmp_path, progress_bar=False, input_paths=["foo1"])
raw_histogram = np.full(12, 0)
raw_histogram[11] = 131
alignment_file = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, 131)
alignment_file = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, False, 131)
mr.split_pixels(
input_file=formats_headers_csv,
pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")),
Expand Down
6 changes: 3 additions & 3 deletions tests/hipscat_import/catalog/test_resume_plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,14 +115,14 @@ def test_get_alignment_file(tmp_path):
plan = ResumePlan(tmp_path=tmp_path, progress_bar=False, input_paths=["foo1"])
raw_histogram = np.full(12, 0)
raw_histogram[11] = 131
alignment_file = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, 131)
alignment_file = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, True, 131)

alignment_file2 = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, 131)
alignment_file2 = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, True, 131)

assert alignment_file == alignment_file2

with pytest.raises(ValueError, match="does not match expectation"):
plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, 130)
plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000, True, 130)


def never_fails():
Expand Down
1 change: 1 addition & 0 deletions tests/hipscat_import/catalog/test_run_round_trip.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ def test_import_keep_intermediate_files(
tmp_dir=temp,
dask_tmp=temp,
progress_bar=False,
highest_healpix_order=2,
delete_intermediate_parquet_files=False,
delete_resume_log_files=False,
)
Expand Down

0 comments on commit ee02e32

Please sign in to comment.