From 3b2c3ae66e65c1bb06bed5ff77e20f09504214b2 Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Thu, 6 Jun 2024 14:29:45 -0400 Subject: [PATCH 01/10] Pickle instead of scatter. (#328) * Pickle instead of scatter. * Code review updates * Improve code coverage. --- src/hipscat_import/catalog/map_reduce.py | 24 ++- src/hipscat_import/catalog/resume_plan.py | 84 ++++++-- src/hipscat_import/catalog/run_import.py | 204 +++++++----------- .../hipscat_import/catalog/test_map_reduce.py | 72 ++++--- .../catalog/test_resume_plan.py | 19 +- .../catalog/test_run_round_trip.py | 21 +- 6 files changed, 240 insertions(+), 184 deletions(-) diff --git a/src/hipscat_import/catalog/map_reduce.py b/src/hipscat_import/catalog/map_reduce.py index f94de055..7326335a 100644 --- a/src/hipscat_import/catalog/map_reduce.py +++ b/src/hipscat_import/catalog/map_reduce.py @@ -1,5 +1,6 @@ """Import a set of non-hipscat files using dask for parallelization""" +import pickle from typing import Any, Dict, Union import healpy as hp @@ -11,7 +12,6 @@ from hipscat.pixel_math.healpix_pixel import HealpixPixel from hipscat.pixel_math.hipscat_id import HIPSCAT_ID_COLUMN, hipscat_id_to_healpix -from hipscat_import.catalog.file_readers import InputReader from hipscat_import.catalog.resume_plan import ResumePlan from hipscat_import.catalog.sparse_histogram import SparseHistogram from hipscat_import.pipeline_resume_plan import get_pixel_cache_directory, print_task_failure @@ -35,7 +35,7 @@ def _has_named_index(dataframe): def _iterate_input_file( input_file: FilePointer, - file_reader: InputReader, + pickled_reader_file: str, highest_order, ra_column, dec_column, @@ -43,6 +43,8 @@ def _iterate_input_file( read_columns=None, ): """Helper function to handle input file reading and healpix pixel calculation""" + with open(pickled_reader_file, "rb") as pickle_file: + file_reader = pickle.load(pickle_file) if not file_reader: raise NotImplementedError("No file reader implemented") @@ -66,7 +68,7 @@ def _iterate_input_file( def map_to_pixels( input_file: FilePointer, - file_reader: InputReader, + pickled_reader_file: str, resume_path: FilePointer, mapping_key, highest_order, @@ -103,7 +105,13 @@ def map_to_pixels( read_columns = [ra_column, dec_column] for _, _, mapped_pixels in _iterate_input_file( - input_file, file_reader, highest_order, ra_column, dec_column, use_hipscat_index, read_columns + input_file, + pickled_reader_file, + highest_order, + ra_column, + dec_column, + use_hipscat_index, + read_columns, ): mapped_pixel, count_at_pixel = np.unique(mapped_pixels, return_counts=True) @@ -120,14 +128,14 @@ def map_to_pixels( def split_pixels( input_file: FilePointer, - file_reader: InputReader, + pickled_reader_file: str, splitting_key, highest_order, ra_column, dec_column, cache_shard_path: FilePointer, resume_path: FilePointer, - alignment=None, + alignment_file=None, use_hipscat_index=False, ): """Map a file of input objects to their healpix pixels and split into shards. 
@@ -149,8 +157,10 @@ def split_pixels( FileNotFoundError: if the file does not exist, or is a directory """ try: + with open(alignment_file, "rb") as pickle_file: + alignment = pickle.load(pickle_file) for chunk_number, data, mapped_pixels in _iterate_input_file( - input_file, file_reader, highest_order, ra_column, dec_column, use_hipscat_index + input_file, pickled_reader_file, highest_order, ra_column, dec_column, use_hipscat_index ): aligned_pixels = alignment[mapped_pixels] unique_pixels, unique_inverse = np.unique(aligned_pixels, return_inverse=True) diff --git a/src/hipscat_import/catalog/resume_plan.py b/src/hipscat_import/catalog/resume_plan.py index 604d8a91..30750d12 100644 --- a/src/hipscat_import/catalog/resume_plan.py +++ b/src/hipscat_import/catalog/resume_plan.py @@ -2,10 +2,13 @@ from __future__ import annotations +import pickle from dataclasses import dataclass, field from typing import List, Optional, Tuple import healpy as hp +import numpy as np +from hipscat import pixel_math from hipscat.io import FilePointer, file_io from hipscat.pixel_math.healpix_pixel import HealpixPixel from tqdm.auto import tqdm @@ -24,8 +27,8 @@ class ResumePlan(PipelineResumePlan): """list of files (and job keys) that have yet to be mapped""" split_keys: List[Tuple[str, str]] = field(default_factory=list) """set of files (and job keys) that have yet to be split""" - destination_pixel_map: Optional[List[Tuple[HealpixPixel, List[HealpixPixel], str]]] = None - """Fully resolved map of destination pixels to constituent smaller pixels""" + destination_pixel_map: Optional[List[Tuple[int, int, int]]] = None + """Destination pixels and their expected final count""" MAPPING_STAGE = "mapping" SPLITTING_STAGE = "splitting" @@ -33,6 +36,7 @@ class ResumePlan(PipelineResumePlan): HISTOGRAM_BINARY_FILE = "mapping_histogram.npz" HISTOGRAMS_DIR = "histograms" + ALIGNMENT_FILE = "alignment.pickle" def __post_init__(self): """Initialize the plan.""" @@ -196,6 +200,61 @@ def is_mapping_done(self) -> bool: """Are there files left to map?""" return self.done_file_exists(self.MAPPING_STAGE) + def get_alignment_file( + self, + raw_histogram, + constant_healpix_order, + highest_healpix_order, + lowest_healpix_order, + pixel_threshold, + ) -> str: + """Get a pointer to the existing alignment file for the pipeline, or + generate a new alignment using provided arguments. + + Args: + raw_histogram (:obj:`np.array`): one-dimensional numpy array of long integers where the + value at each index corresponds to the number of objects found at the healpix pixel. + constant_healpix_order (int): if positive, use this as the order for + all non-empty partitions. else, use remaining arguments. + highest_healpix_order (int): the highest healpix order (e.g. 5-10) + lowest_healpix_order (int): the lowest healpix order (e.g. 1-5). specifying a lowest order + constrains the partitioning to prevent spatially large pixels. + threshold (int): the maximum number of objects allowed in a single pixel + + Returns: + path to cached alignment file. 
+ """ + file_name = file_io.append_paths_to_pointer(self.tmp_path, self.ALIGNMENT_FILE) + if not file_io.does_file_or_directory_exist(file_name): + if constant_healpix_order >= 0: + alignment = np.full(len(raw_histogram), None) + for pixel_num, pixel_sum in enumerate(raw_histogram): + alignment[pixel_num] = ( + constant_healpix_order, + pixel_num, + pixel_sum, + ) + else: + alignment = pixel_math.generate_alignment( + raw_histogram, + highest_order=highest_healpix_order, + lowest_order=lowest_healpix_order, + threshold=pixel_threshold, + ) + with open(file_name, "wb") as pickle_file: + pickle.dump(alignment, pickle_file) + + if self.destination_pixel_map is None: + with open(file_name, "rb") as pickle_file: + alignment = pickle.load(pickle_file) + non_none_elements = alignment[alignment != np.array(None)] + self.destination_pixel_map = np.unique(non_none_elements) + self.destination_pixel_map = [ + (order, pix, count) for (order, pix, count) in self.destination_pixel_map if int(count) > 0 + ] + + return file_name + def wait_for_splitting(self, futures): """Wait for splitting futures to complete.""" self.wait_for_futures(futures, self.SPLITTING_STAGE) @@ -208,30 +267,31 @@ def is_splitting_done(self) -> bool: """Are there files left to split?""" return self.done_file_exists(self.SPLITTING_STAGE) - def get_reduce_items(self, destination_pixel_map=None): + def get_reduce_items(self): """Fetch a triple for each partition to reduce. Triple contains: - destination pixel (healpix pixel with both order and pixel) - - source pixels (list of pixels at mapping order) + - number of rows expected for this pixel - reduce key (string of destination order+pixel) - """ reduced_keys = set(self.read_done_keys(self.REDUCING_STAGE)) - if destination_pixel_map is None: - destination_pixel_map = self.destination_pixel_map - elif self.destination_pixel_map is None: - self.destination_pixel_map = destination_pixel_map if self.destination_pixel_map is None: raise RuntimeError("destination pixel map not provided for progress tracking.") reduce_items = [ - (hp_pixel, source_pixels, f"{hp_pixel.order}_{hp_pixel.pixel}") - for hp_pixel, source_pixels in destination_pixel_map.items() - if f"{hp_pixel.order}_{hp_pixel.pixel}" not in reduced_keys + (HealpixPixel(hp_order, hp_pixel), row_count, f"{hp_order}_{hp_pixel}") + for hp_order, hp_pixel, row_count in self.destination_pixel_map + if f"{hp_order}_{hp_pixel}" not in reduced_keys ] return reduce_items + def get_destination_pixels(self): + """Create HealpixPixel list of all destination pixels.""" + if self.destination_pixel_map is None: + raise RuntimeError("destination pixel map not known.") + return [HealpixPixel(hp_order, hp_pixel) for hp_order, hp_pixel, _ in self.destination_pixel_map] + def is_reducing_done(self) -> bool: """Are there partitions left to reduce?""" return self.done_file_exists(self.REDUCING_STAGE) diff --git a/src/hipscat_import/catalog/run_import.py b/src/hipscat_import/catalog/run_import.py index 1b97b547..7081670a 100644 --- a/src/hipscat_import/catalog/run_import.py +++ b/src/hipscat_import/catalog/run_import.py @@ -4,9 +4,10 @@ The actual logic of the map reduce is in the `map_reduce.py` file. 
""" +import os +import pickle + import hipscat.io.write_metadata as io -import numpy as np -from hipscat import pixel_math from hipscat.catalog import PartitionInfo from hipscat.io import paths from hipscat.io.parquet_metadata import write_parquet_metadata @@ -17,140 +18,101 @@ from hipscat_import.pipeline_resume_plan import PipelineResumePlan -def _map_pixels(args, client): - """Generate a raw histogram of object counts in each healpix pixel""" - - if args.resume_plan.is_mapping_done(): - return - - reader_future = client.scatter(args.file_reader, hash=False) - futures = [] - for key, file_path in args.resume_plan.map_files: - futures.append( - client.submit( - mr.map_to_pixels, - input_file=file_path, - resume_path=args.resume_plan.tmp_path, - file_reader=reader_future, - mapping_key=key, - highest_order=args.mapping_healpix_order, - ra_column=args.ra_column, - dec_column=args.dec_column, - use_hipscat_index=args.use_hipscat_index, - ) - ) - args.resume_plan.wait_for_mapping(futures) - - -def _split_pixels(args, alignment_future, client): - """Generate a raw histogram of object counts in each healpix pixel""" - - if args.resume_plan.is_splitting_done(): - return - - reader_future = client.scatter(args.file_reader, hash=False) - futures = [] - for key, file_path in args.resume_plan.split_keys: - futures.append( - client.submit( - mr.split_pixels, - input_file=file_path, - file_reader=reader_future, - highest_order=args.mapping_healpix_order, - ra_column=args.ra_column, - dec_column=args.dec_column, - splitting_key=key, - cache_shard_path=args.tmp_path, - resume_path=args.resume_plan.tmp_path, - alignment=alignment_future, - use_hipscat_index=args.use_hipscat_index, - ) - ) - - args.resume_plan.wait_for_splitting(futures) - - -def _reduce_pixels(args, destination_pixel_map, client): - """Loop over destination pixels and merge into parquet files""" - - if args.resume_plan.is_reducing_done(): - return - - futures = [] - for ( - destination_pixel, - source_pixels, - destination_pixel_key, - ) in args.resume_plan.get_reduce_items(destination_pixel_map): - futures.append( - client.submit( - mr.reduce_pixel_shards, - cache_shard_path=args.tmp_path, - resume_path=args.resume_plan.tmp_path, - reducing_key=destination_pixel_key, - destination_pixel_order=destination_pixel.order, - destination_pixel_number=destination_pixel.pixel, - destination_pixel_size=source_pixels[0], - output_path=args.catalog_path, - ra_column=args.ra_column, - dec_column=args.dec_column, - sort_columns=args.sort_columns, - add_hipscat_index=args.add_hipscat_index, - use_schema_file=args.use_schema_file, - use_hipscat_index=args.use_hipscat_index, - delete_input_files=args.delete_intermediate_parquet_files, - storage_options=args.output_storage_options, - ) - ) - - args.resume_plan.wait_for_reducing(futures) - - def run(args, client): """Run catalog creation pipeline.""" if not args: raise ValueError("args is required and should be type ImportArguments") if not isinstance(args, ImportArguments): raise ValueError("args must be type ImportArguments") - _map_pixels(args, client) + + pickled_reader_file = os.path.join(args.resume_plan.tmp_path, "reader.pickle") + with open(pickled_reader_file, "wb") as pickle_file: + pickle.dump(args.file_reader, pickle_file) + + if not args.resume_plan.is_mapping_done(): + futures = [] + for key, file_path in args.resume_plan.map_files: + futures.append( + client.submit( + mr.map_to_pixels, + input_file=file_path, + resume_path=args.resume_plan.tmp_path, + 
pickled_reader_file=pickled_reader_file, + mapping_key=key, + highest_order=args.mapping_healpix_order, + ra_column=args.ra_column, + dec_column=args.dec_column, + use_hipscat_index=args.use_hipscat_index, + ) + ) + args.resume_plan.wait_for_mapping(futures) with tqdm( total=2, desc=PipelineResumePlan.get_formatted_stage_name("Binning"), disable=not args.progress_bar ) as step_progress: raw_histogram = args.resume_plan.read_histogram(args.mapping_healpix_order) step_progress.update(1) - if args.constant_healpix_order >= 0: - alignment = np.full(len(raw_histogram), None) - for pixel_num, pixel_sum in enumerate(raw_histogram): - alignment[pixel_num] = ( - args.constant_healpix_order, - pixel_num, - pixel_sum, - ) + alignment_file = args.resume_plan.get_alignment_file( + raw_histogram, + args.constant_healpix_order, + args.highest_healpix_order, + args.lowest_healpix_order, + args.pixel_threshold, + ) - destination_pixel_map = pixel_math.generate_constant_pixel_map( - histogram=raw_histogram, - constant_healpix_order=args.constant_healpix_order, - ) - else: - alignment = pixel_math.generate_alignment( - raw_histogram, - highest_order=args.highest_healpix_order, - lowest_order=args.lowest_healpix_order, - threshold=args.pixel_threshold, - ) - destination_pixel_map = pixel_math.compute_pixel_map( - raw_histogram, - highest_order=args.highest_healpix_order, - lowest_order=args.lowest_healpix_order, - threshold=args.pixel_threshold, - ) step_progress.update(1) if not args.debug_stats_only: - alignment_future = client.scatter(alignment) - _split_pixels(args, alignment_future, client) - _reduce_pixels(args, destination_pixel_map, client) + if not args.resume_plan.is_splitting_done(): + futures = [] + for key, file_path in args.resume_plan.split_keys: + futures.append( + client.submit( + mr.split_pixels, + input_file=file_path, + pickled_reader_file=pickled_reader_file, + highest_order=args.mapping_healpix_order, + ra_column=args.ra_column, + dec_column=args.dec_column, + splitting_key=key, + cache_shard_path=args.tmp_path, + resume_path=args.resume_plan.tmp_path, + alignment_file=alignment_file, + use_hipscat_index=args.use_hipscat_index, + ) + ) + + args.resume_plan.wait_for_splitting(futures) + + if not args.resume_plan.is_reducing_done(): + futures = [] + for ( + destination_pixel, + source_pixel_count, + destination_pixel_key, + ) in args.resume_plan.get_reduce_items(): + futures.append( + client.submit( + mr.reduce_pixel_shards, + cache_shard_path=args.tmp_path, + resume_path=args.resume_plan.tmp_path, + reducing_key=destination_pixel_key, + destination_pixel_order=destination_pixel.order, + destination_pixel_number=destination_pixel.pixel, + destination_pixel_size=source_pixel_count, + output_path=args.catalog_path, + ra_column=args.ra_column, + dec_column=args.dec_column, + sort_columns=args.sort_columns, + add_hipscat_index=args.add_hipscat_index, + use_schema_file=args.use_schema_file, + use_hipscat_index=args.use_hipscat_index, + delete_input_files=args.delete_intermediate_parquet_files, + storage_options=args.output_storage_options, + ) + ) + + args.resume_plan.wait_for_reducing(futures) # All done - write out the metadata with tqdm( @@ -171,7 +133,7 @@ def run(args, client): storage_options=args.output_storage_options, ) step_progress.update(1) - partition_info = PartitionInfo.from_healpix(destination_pixel_map.keys()) + partition_info = PartitionInfo.from_healpix(args.resume_plan.get_destination_pixels()) partition_info_file = paths.get_partition_info_pointer(args.catalog_path) 
partition_info.write_to_file(partition_info_file, storage_options=args.output_storage_options) if not args.debug_stats_only: diff --git a/tests/hipscat_import/catalog/test_map_reduce.py b/tests/hipscat_import/catalog/test_map_reduce.py index adc1758b..95a27347 100644 --- a/tests/hipscat_import/catalog/test_map_reduce.py +++ b/tests/hipscat_import/catalog/test_map_reduce.py @@ -1,6 +1,7 @@ """Tests of map reduce operations""" import os +import pickle from io import StringIO import healpy as hp @@ -13,15 +14,24 @@ import hipscat_import.catalog.map_reduce as mr from hipscat_import.catalog.file_readers import get_file_reader +from hipscat_import.catalog.resume_plan import ResumePlan from hipscat_import.catalog.sparse_histogram import SparseHistogram +def pickle_file_reader(tmp_path, file_reader) -> str: + """Utility method to pickle a file reader, and return path to pickle.""" + pickled_reader_file = os.path.join(tmp_path, "reader.pickle") + with open(pickled_reader_file, "wb") as pickle_file: + pickle.dump(file_reader, pickle_file) + return pickled_reader_file + + def test_read_empty_filename(): """Empty file name""" with pytest.raises(FileNotFoundError): mr.map_to_pixels( input_file="", - file_reader=get_file_reader("parquet"), + pickled_reader_file="", highest_order=10, ra_column="ra", dec_column="dec", @@ -30,12 +40,12 @@ def test_read_empty_filename(): ) -def test_read_wrong_fileformat(small_sky_file0): +def test_read_wrong_fileformat(small_sky_file0, tmp_path): """CSV file attempting to be read as parquet""" with pytest.raises(pa.lib.ArrowInvalid): mr.map_to_pixels( input_file=small_sky_file0, - file_reader=get_file_reader("parquet"), + pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("parquet")), highest_order=0, ra_column="ra_mean", dec_column="dec_mean", @@ -44,12 +54,12 @@ def test_read_wrong_fileformat(small_sky_file0): ) -def test_read_directory(test_data_dir): +def test_read_directory(test_data_dir, tmp_path): """Provide directory, not file""" with pytest.raises(FileNotFoundError): mr.map_to_pixels( input_file=test_data_dir, - file_reader=get_file_reader("parquet"), + pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("parquet")), highest_order=0, ra_column="ra", dec_column="dec", @@ -58,12 +68,12 @@ def test_read_directory(test_data_dir): ) -def test_read_bad_fileformat(blank_data_file, capsys): +def test_read_bad_fileformat(blank_data_file, capsys, tmp_path): """Unsupported file format""" with pytest.raises(NotImplementedError): mr.map_to_pixels( input_file=blank_data_file, - file_reader=None, + pickled_reader_file=pickle_file_reader(tmp_path, None), highest_order=0, ra_column="ra", dec_column="dec", @@ -86,7 +96,7 @@ def test_read_single_fits(tmp_path, formats_fits): os.makedirs(os.path.join(tmp_path, "histograms")) mr.map_to_pixels( input_file=formats_fits, - file_reader=get_file_reader("fits"), + pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("fits")), highest_order=0, ra_column="ra", dec_column="dec", @@ -101,12 +111,12 @@ def test_read_single_fits(tmp_path, formats_fits): npt.assert_array_equal(result, expected) -def test_map_headers_wrong(formats_headers_csv): +def test_map_headers_wrong(formats_headers_csv, tmp_path): """Test loading the a file with non-default headers (without specifying right headers)""" with pytest.raises(ValueError, match="columns expected but not found"): mr.map_to_pixels( input_file=formats_headers_csv, - file_reader=get_file_reader("csv"), + pickled_reader_file=pickle_file_reader(tmp_path, 
get_file_reader("csv")), highest_order=0, ra_column="ra", dec_column="dec", @@ -120,7 +130,7 @@ def test_map_headers(tmp_path, formats_headers_csv): os.makedirs(os.path.join(tmp_path, "histograms")) mr.map_to_pixels( input_file=formats_headers_csv, - file_reader=get_file_reader("csv"), + pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), highest_order=0, ra_column="ra_mean", dec_column="dec_mean", @@ -143,7 +153,7 @@ def test_map_with_hipscat_index(tmp_path, formats_dir, small_sky_single_file): input_file = os.path.join(formats_dir, "hipscat_index.csv") mr.map_to_pixels( input_file=input_file, - file_reader=get_file_reader("csv"), + pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), highest_order=0, ra_column="NOPE", dec_column="NOPE", @@ -161,7 +171,7 @@ def test_map_with_hipscat_index(tmp_path, formats_dir, small_sky_single_file): with pytest.raises(ValueError, match="columns expected but not found"): mr.map_to_pixels( input_file=small_sky_single_file, - file_reader=get_file_reader("csv"), + pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), highest_order=0, ra_column="NOPE", dec_column="NOPE", @@ -177,9 +187,12 @@ def test_map_with_schema(tmp_path, mixed_schema_csv_dir, mixed_schema_csv_parque input_file = os.path.join(mixed_schema_csv_dir, "input_01.csv") mr.map_to_pixels( input_file=input_file, - file_reader=get_file_reader( - "csv", - schema_file=mixed_schema_csv_parquet, + pickled_reader_file=pickle_file_reader( + tmp_path, + get_file_reader( + "csv", + schema_file=mixed_schema_csv_parquet, + ), ), highest_order=0, ra_column="ra", @@ -203,7 +216,7 @@ def test_map_small_sky_order0(tmp_path, small_sky_single_file): os.makedirs(os.path.join(tmp_path, "histograms")) mr.map_to_pixels( input_file=small_sky_single_file, - file_reader=get_file_reader("csv"), + pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), highest_order=0, ra_column="ra", dec_column="dec", @@ -229,7 +242,7 @@ def test_map_small_sky_part_order1(tmp_path, small_sky_file0): os.makedirs(os.path.join(tmp_path, "histograms")) mr.map_to_pixels( input_file=small_sky_file0, - file_reader=get_file_reader("csv"), + pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), highest_order=1, ra_column="ra", dec_column="dec", @@ -249,41 +262,42 @@ def test_map_small_sky_part_order1(tmp_path, small_sky_file0): def test_split_pixels_bad_format(blank_data_file, tmp_path, capsys): - """Test loading the a file with non-default headers""" + """Test error behavior, e.g. 
when alignment file is missing.""" alignment = np.full(12, None) alignment[11] = (0, 11, 131) - with pytest.raises(NotImplementedError): + with pytest.raises(FileNotFoundError): mr.split_pixels( input_file=blank_data_file, - file_reader=None, + pickled_reader_file="", highest_order=0, ra_column="ra_mean", dec_column="dec_mean", splitting_key="0", cache_shard_path=tmp_path, resume_path=tmp_path, - alignment=alignment, + alignment_file="", ) captured = capsys.readouterr() - assert "No file reader implemented" in captured.out + assert "No such file or directory" in captured.out os.makedirs(os.path.join(tmp_path, "splitting")) def test_split_pixels_headers(formats_headers_csv, assert_parquet_file_ids, tmp_path): - """Test loading the a file with non-default headers""" - os.makedirs(os.path.join(tmp_path, "splitting")) - alignment = np.full(12, None) - alignment[11] = (0, 11, 131) + """Test loading a file with non-default headers""" + plan = ResumePlan(tmp_path=tmp_path, progress_bar=False, input_paths=["foo1"]) + raw_histogram = np.full(12, 0) + raw_histogram[11] = 131 + alignment_file = plan.get_alignment_file(raw_histogram, -1, 0, 0, 1_000) mr.split_pixels( input_file=formats_headers_csv, - file_reader=get_file_reader("csv"), + pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), highest_order=0, ra_column="ra_mean", dec_column="dec_mean", splitting_key="0", cache_shard_path=tmp_path, resume_path=tmp_path, - alignment=alignment, + alignment_file=alignment_file, ) file_name = os.path.join(tmp_path, "order_0", "dir_0", "pixel_11", "shard_0_0.parquet") diff --git a/tests/hipscat_import/catalog/test_resume_plan.py b/tests/hipscat_import/catalog/test_resume_plan.py index 4cf7751b..a63214a8 100644 --- a/tests/hipscat_import/catalog/test_resume_plan.py +++ b/tests/hipscat_import/catalog/test_resume_plan.py @@ -2,7 +2,6 @@ import numpy.testing as npt import pytest -from hipscat.pixel_math.healpix_pixel import HealpixPixel from hipscat_import.catalog.resume_plan import ResumePlan from hipscat_import.catalog.sparse_histogram import SparseHistogram @@ -180,17 +179,24 @@ def test_some_split_task_failures(tmp_path, dask_client): def test_get_reduce_items(tmp_path): """Test generation of remaining reduce items""" - destination_pixel_map = {HealpixPixel(0, 11): (131, [44, 45, 46])} + destination_pixel_map = [(0, 11, 131)] plan = ResumePlan(tmp_path=tmp_path, progress_bar=False) with pytest.raises(RuntimeError, match="destination pixel map"): remaining_reduce_items = plan.get_reduce_items() - remaining_reduce_items = plan.get_reduce_items(destination_pixel_map=destination_pixel_map) + with pytest.raises(RuntimeError, match="destination pixel map"): + remaining_reduce_items = plan.get_destination_pixels() + + plan.destination_pixel_map = destination_pixel_map + remaining_reduce_items = plan.get_reduce_items() assert len(remaining_reduce_items) == 1 + all_pixels = plan.get_destination_pixels() + assert len(all_pixels) == 1 + ResumePlan.reducing_key_done(tmp_path=tmp_path, reducing_key="0_11") - remaining_reduce_items = plan.get_reduce_items(destination_pixel_map=destination_pixel_map) + remaining_reduce_items = plan.get_reduce_items() assert len(remaining_reduce_items) == 0 @@ -199,8 +205,9 @@ def test_some_reduce_task_failures(tmp_path, dask_client): """Test that we only consider reduce stage successful if all done files are written""" plan = ResumePlan(tmp_path=tmp_path, progress_bar=False) - destination_pixel_map = {HealpixPixel(0, 11): (131, [44, 45, 46])} - remaining_reduce_items = 
plan.get_reduce_items(destination_pixel_map=destination_pixel_map) + destination_pixel_map = [(0, 11, 131)] + plan.destination_pixel_map = destination_pixel_map + remaining_reduce_items = plan.get_reduce_items() assert len(remaining_reduce_items) == 1 ## Method doesn't FAIL, but it doesn't write out the done file either. diff --git a/tests/hipscat_import/catalog/test_run_round_trip.py b/tests/hipscat_import/catalog/test_run_round_trip.py index 11e8aaf7..0d3a1e1c 100644 --- a/tests/hipscat_import/catalog/test_run_round_trip.py +++ b/tests/hipscat_import/catalog/test_run_round_trip.py @@ -284,11 +284,13 @@ def test_import_keep_intermediate_files( ## Check that stage-level done files are still around. base_intermediate_dir = os.path.join(temp, "small_sky_object_catalog", "intermediate") expected_contents = [ + "alignment.pickle", "histograms", # directory containing sub-histograms "input_paths.txt", # original input paths for subsequent comparison "mapping_done", # stage-level done file "mapping_histogram.npz", # concatenated histogram file "order_0", # all intermediate parquet files + "reader.pickle", # pickled InputReader "reducing", # directory containing task-level done files "reducing_done", # stage-level done file "splitting", # directory containing task-level done files @@ -362,6 +364,16 @@ def test_import_lowest_healpix_order( assert np.logical_and(ids >= 700, ids < 832).all() +class StarrReader(CsvReader): + """Shallow subclass""" + + def read(self, input_file, read_columns=None): + files = glob.glob(f"{input_file}/*.starr") + files.sort() + for file in files: + return super().read(file, read_columns) + + @pytest.mark.dask def test_import_starr_file( dask_client, @@ -374,15 +386,6 @@ def test_import_starr_file( as a valid InputReader implementation is provided. 
""" - class StarrReader(CsvReader): - """Shallow subclass""" - - def read(self, input_file, read_columns=None): - files = glob.glob(f"{input_file}/*.starr") - files.sort() - for file in files: - return super().read(file, read_columns) - args = ImportArguments( output_artifact_name="starr", input_file_list=[formats_dir], From 952854ebb58ba128f3048a66dbf3c6ce91dfd532 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Thu, 6 Jun 2024 14:37:04 -0400 Subject: [PATCH 02/10] Update margin cache info output --- src/hipscat_import/margin_cache/margin_cache_arguments.py | 3 +++ .../hipscat_import/margin_cache/test_arguments_margin_cache.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/hipscat_import/margin_cache/margin_cache_arguments.py b/src/hipscat_import/margin_cache/margin_cache_arguments.py index 8c22d4ae..11362cde 100644 --- a/src/hipscat_import/margin_cache/margin_cache_arguments.py +++ b/src/hipscat_import/margin_cache/margin_cache_arguments.py @@ -83,6 +83,9 @@ def to_catalog_info(self, total_rows) -> MarginCacheCatalogInfo: "catalog_name": self.output_artifact_name, "total_rows": total_rows, "catalog_type": "margin", + "epoch": self.catalog.catalog_info.epoch, + "ra_column": self.catalog.catalog_info.ra_column, + "dec_column": self.catalog.catalog_info.dec_column, "primary_catalog": self.input_catalog_path, "margin_threshold": self.margin_threshold, } diff --git a/tests/hipscat_import/margin_cache/test_arguments_margin_cache.py b/tests/hipscat_import/margin_cache/test_arguments_margin_cache.py index 546fad04..dfa8491d 100644 --- a/tests/hipscat_import/margin_cache/test_arguments_margin_cache.py +++ b/tests/hipscat_import/margin_cache/test_arguments_margin_cache.py @@ -125,6 +125,9 @@ def test_to_catalog_info(small_sky_source_catalog, tmp_path): catalog_info = args.to_catalog_info(total_rows=10) assert catalog_info.catalog_name == args.output_artifact_name assert catalog_info.total_rows == 10 + assert catalog_info.epoch == "J2000" + assert catalog_info.ra_column == "source_ra" + assert catalog_info.dec_column == "source_dec" def test_provenance_info(small_sky_source_catalog, tmp_path): From 87ecd85ad4b9f9304b57fb9ac8be49e08bec55a9 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Thu, 6 Jun 2024 14:57:04 -0400 Subject: [PATCH 03/10] Install hipscat from main on the CI --- requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..124b2043 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/astronomy-commons/hipscat.git@main \ No newline at end of file From 04c1a8fc2f3bb301caf9a02a70a7aeaec4ccbe1f Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Tue, 11 Jun 2024 09:30:23 -0400 Subject: [PATCH 04/10] Use pathlib for pytest fixtures. (#333) * Use pathlib for pytest fixtures. 
* Use Path.mkdirs --- .../catalog/test_argument_validation.py | 2 +- .../catalog/test_file_readers.py | 16 +++--- .../hipscat_import/catalog/test_map_reduce.py | 51 +++++++++--------- .../hipscat_import/catalog/test_run_import.py | 50 ++++++------------ .../catalog/test_run_round_trip.py | 30 +++++------ .../catalog/test_sparse_histogram.py | 4 +- tests/hipscat_import/conftest.py | 52 ++++++++----------- .../index/test_index_map_reduce.py | 16 +++--- .../test_margin_cache_map_reduce.py | 16 +++--- .../soap/test_soap_map_reduce.py | 35 +++++++------ .../test_pipeline_resume_plan.py | 3 +- .../hipscat_import/test_runtime_arguments.py | 14 +++-- .../test_verification_arguments.py | 2 +- 13 files changed, 130 insertions(+), 161 deletions(-) diff --git a/tests/hipscat_import/catalog/test_argument_validation.py b/tests/hipscat_import/catalog/test_argument_validation.py index 65625eec..ee81d07f 100644 --- a/tests/hipscat_import/catalog/test_argument_validation.py +++ b/tests/hipscat_import/catalog/test_argument_validation.py @@ -78,7 +78,7 @@ def test_good_paths(blank_data_dir, blank_data_file, tmp_path): ) assert args.input_path == blank_data_dir assert len(args.input_paths) == 1 - assert blank_data_file in args.input_paths[0] + assert str(blank_data_file) in args.input_paths[0] def test_multiple_files_in_path(small_sky_parts_dir, tmp_path): diff --git a/tests/hipscat_import/catalog/test_file_readers.py b/tests/hipscat_import/catalog/test_file_readers.py index 677b4a7d..fbd056cc 100644 --- a/tests/hipscat_import/catalog/test_file_readers.py +++ b/tests/hipscat_import/catalog/test_file_readers.py @@ -1,7 +1,5 @@ """Test dataframe-generating file readers""" -import os - import hipscat.io.write_metadata as io import numpy as np import pandas as pd @@ -91,7 +89,7 @@ def test_csv_reader_parquet_metadata(small_sky_single_file, tmp_path): pa.field("dec_error", pa.float64()), ] ) - schema_file = os.path.join(tmp_path, "metadata.parquet") + schema_file = tmp_path / "metadata.parquet" pq.write_metadata( small_sky_schema, schema_file, @@ -187,7 +185,7 @@ def test_csv_reader_pipe_delimited(formats_pipe_csv, tmp_path): pa.field("numeric", pa.int64()), ] ) - schema_file = os.path.join(tmp_path, "metadata.parquet") + schema_file = tmp_path / "metadata.parquet" pq.write_metadata(parquet_schema_types, schema_file) frame = next( @@ -224,7 +222,7 @@ def test_csv_reader_provenance_info(tmp_path, basic_catalog_info): ) provenance_info = reader.provenance_info() catalog_base_dir = tmp_path / "test_catalog" - os.makedirs(catalog_base_dir) + catalog_base_dir.mkdir(parents=True) io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info) with open(catalog_base_dir / "provenance_info.json", "r", encoding="utf-8") as file: @@ -258,8 +256,8 @@ def test_parquet_reader_provenance_info(tmp_path, basic_catalog_info): """Test that we get some provenance info and it is parseable into JSON.""" reader = ParquetReader(chunksize=1) provenance_info = reader.provenance_info() - catalog_base_dir = os.path.join(tmp_path, "test_catalog") - os.makedirs(catalog_base_dir) + catalog_base_dir = tmp_path / "test_catalog" + catalog_base_dir.mkdir(parents=True) io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info) @@ -309,6 +307,6 @@ def test_fits_reader_provenance_info(tmp_path, basic_catalog_info): """Test that we get some provenance info and it is parseable into JSON.""" reader = FitsReader() provenance_info = reader.provenance_info() - catalog_base_dir = os.path.join(tmp_path, "test_catalog") - 
os.makedirs(catalog_base_dir) + catalog_base_dir = tmp_path / "test_catalog" + catalog_base_dir.mkdir(parents=True) io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info) diff --git a/tests/hipscat_import/catalog/test_map_reduce.py b/tests/hipscat_import/catalog/test_map_reduce.py index 95a27347..34c72147 100644 --- a/tests/hipscat_import/catalog/test_map_reduce.py +++ b/tests/hipscat_import/catalog/test_map_reduce.py @@ -20,7 +20,7 @@ def pickle_file_reader(tmp_path, file_reader) -> str: """Utility method to pickle a file reader, and return path to pickle.""" - pickled_reader_file = os.path.join(tmp_path, "reader.pickle") + pickled_reader_file = tmp_path / "reader.pickle" with open(pickled_reader_file, "wb") as pickle_file: pickle.dump(file_reader, pickle_file) return pickled_reader_file @@ -86,14 +86,14 @@ def test_read_bad_fileformat(blank_data_file, capsys, tmp_path): def read_partial_histogram(tmp_path, mapping_key): """Helper to read in the former result of a map operation.""" - histogram_file = os.path.join(tmp_path, "histograms", f"{mapping_key}.npz") + histogram_file = tmp_path / "histograms" / f"{mapping_key}.npz" hist = SparseHistogram.from_file(histogram_file) return hist.to_array() def test_read_single_fits(tmp_path, formats_fits): """Success case - fits file that exists being read as fits""" - os.makedirs(os.path.join(tmp_path, "histograms")) + (tmp_path / "histograms").mkdir(parents=True) mr.map_to_pixels( input_file=formats_fits, pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("fits")), @@ -127,7 +127,7 @@ def test_map_headers_wrong(formats_headers_csv, tmp_path): def test_map_headers(tmp_path, formats_headers_csv): """Test loading the a file with non-default headers""" - os.makedirs(os.path.join(tmp_path, "histograms")) + (tmp_path / "histograms").mkdir(parents=True) mr.map_to_pixels( input_file=formats_headers_csv, pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), @@ -149,8 +149,8 @@ def test_map_headers(tmp_path, formats_headers_csv): def test_map_with_hipscat_index(tmp_path, formats_dir, small_sky_single_file): - os.makedirs(os.path.join(tmp_path, "histograms")) - input_file = os.path.join(formats_dir, "hipscat_index.csv") + (tmp_path / "histograms").mkdir(parents=True) + input_file = formats_dir / "hipscat_index.csv" mr.map_to_pixels( input_file=input_file, pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), @@ -183,8 +183,8 @@ def test_map_with_hipscat_index(tmp_path, formats_dir, small_sky_single_file): def test_map_with_schema(tmp_path, mixed_schema_csv_dir, mixed_schema_csv_parquet): """Test loading the a file when using a parquet schema file for dtypes""" - os.makedirs(os.path.join(tmp_path, "histograms")) - input_file = os.path.join(mixed_schema_csv_dir, "input_01.csv") + (tmp_path / "histograms").mkdir(parents=True) + input_file = mixed_schema_csv_dir / "input_01.csv" mr.map_to_pixels( input_file=input_file, pickled_reader_file=pickle_file_reader( @@ -213,7 +213,7 @@ def test_map_with_schema(tmp_path, mixed_schema_csv_dir, mixed_schema_csv_parque def test_map_small_sky_order0(tmp_path, small_sky_single_file): """Test loading the small sky catalog and partitioning each object into the same large bucket""" - os.makedirs(os.path.join(tmp_path, "histograms")) + (tmp_path / "histograms").mkdir(parents=True) mr.map_to_pixels( input_file=small_sky_single_file, pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), @@ -239,7 +239,7 @@ def 
test_map_small_sky_part_order1(tmp_path, small_sky_file0): Test loading a small portion of the small sky catalog and partitioning objects into four smaller buckets """ - os.makedirs(os.path.join(tmp_path, "histograms")) + (tmp_path / "histograms").mkdir(parents=True) mr.map_to_pixels( input_file=small_sky_file0, pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), @@ -279,7 +279,6 @@ def test_split_pixels_bad_format(blank_data_file, tmp_path, capsys): ) captured = capsys.readouterr() assert "No such file or directory" in captured.out - os.makedirs(os.path.join(tmp_path, "splitting")) def test_split_pixels_headers(formats_headers_csv, assert_parquet_file_ids, tmp_path): @@ -300,17 +299,17 @@ def test_split_pixels_headers(formats_headers_csv, assert_parquet_file_ids, tmp_ alignment_file=alignment_file, ) - file_name = os.path.join(tmp_path, "order_0", "dir_0", "pixel_11", "shard_0_0.parquet") + file_name = tmp_path / "order_0" / "dir_0" / "pixel_11" / "shard_0_0.parquet" expected_ids = [*range(700, 708)] assert_parquet_file_ids(file_name, "object_id", expected_ids) - file_name = os.path.join(tmp_path, "order_0", "dir_0", "pixel_1", "shard_0_0.parquet") + file_name = tmp_path / "order_0" / "dir_0" / "pixel_1" / "shard_0_0.parquet" assert not os.path.exists(file_name) def test_reduce_order0(parquet_shards_dir, assert_parquet_file_ids, tmp_path): """Test reducing into one large pixel""" - os.makedirs(os.path.join(tmp_path, "reducing")) + (tmp_path / "reducing").mkdir(parents=True) mr.reduce_pixel_shards( cache_shard_path=parquet_shards_dir, resume_path=tmp_path, @@ -326,7 +325,7 @@ def test_reduce_order0(parquet_shards_dir, assert_parquet_file_ids, tmp_path): delete_input_files=False, ) - output_file = os.path.join(tmp_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = tmp_path / "Norder=0" / "Dir=0" / "Npix=11.parquet" expected_ids = [*range(700, 831)] assert_parquet_file_ids(output_file, "id", expected_ids) @@ -334,7 +333,7 @@ def test_reduce_order0(parquet_shards_dir, assert_parquet_file_ids, tmp_path): def test_reduce_hipscat_index(parquet_shards_dir, assert_parquet_file_ids, tmp_path): """Test reducing with or without a _hipscat_index field""" - os.makedirs(os.path.join(tmp_path, "reducing")) + (tmp_path / "reducing").mkdir(parents=True) mr.reduce_pixel_shards( cache_shard_path=parquet_shards_dir, resume_path=tmp_path, @@ -349,7 +348,7 @@ def test_reduce_hipscat_index(parquet_shards_dir, assert_parquet_file_ids, tmp_p delete_input_files=False, ) - output_file = os.path.join(tmp_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = tmp_path / "Norder=0" / "Dir=0" / "Npix=11.parquet" expected_ids = [*range(700, 831)] assert_parquet_file_ids(output_file, "id", expected_ids) @@ -414,10 +413,10 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): First, we take some time to set up these silly data points, then we test out reducing them into a single parquet file using a mix of reduction options. 
""" - os.makedirs(os.path.join(tmp_path, "reducing")) - shard_dir = os.path.join(tmp_path, "reduce_shards", "order_0", "dir_0", "pixel_11") - os.makedirs(shard_dir) - output_file = os.path.join(tmp_path, "Norder=0", "Dir=0", "Npix=11.parquet") + (tmp_path / "reducing").mkdir(parents=True) + shard_dir = tmp_path / "reduce_shards" / "order_0" / "dir_0" / "pixel_11" + shard_dir.mkdir(parents=True) + output_file = tmp_path / "Norder=0" / "Dir=0" / "Npix=11.parquet" file1_string = """source_id,object_id,time,ra,dec 1200,700,3000,282.5,-58.5 @@ -427,7 +426,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): 1404,702,3200,310.5,-27.5 1505,703,4000,286.5,-69.5""" file1_data = pd.read_csv(StringIO(file1_string)) - file1_data.to_parquet(os.path.join(shard_dir, "file_1_shard_1.parquet")) + file1_data.to_parquet(shard_dir / "file_1_shard_1.parquet") file2_string = """source_id,object_id,time,ra,dec 1206,700,2000,282.5,-58.5 @@ -435,7 +434,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): 1308,701,2100,299.5,-48.5 1309,701,2000,299.5,-48.5""" file2_data = pd.read_csv(StringIO(file2_string)) - file2_data.to_parquet(os.path.join(shard_dir, "file_2_shard_1.parquet")) + file2_data.to_parquet(shard_dir / "file_2_shard_1.parquet") combined_data = pd.concat([file1_data, file2_data]) combined_data["norder19_healpix"] = hp.ang2pix( @@ -452,7 +451,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): ## This will sort WITHIN an order 19 healpix pixel. In that ordering, the objects are ## (703, 700, 701, 702) mr.reduce_pixel_shards( - cache_shard_path=os.path.join(tmp_path, "reduce_shards"), + cache_shard_path=tmp_path / "reduce_shards", resume_path=tmp_path, reducing_key="0_11", destination_pixel_order=0, @@ -489,7 +488,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): ######################## Sort option 2: by object id and time ## sort order is effectively (norder19 healpix, object id, time) mr.reduce_pixel_shards( - cache_shard_path=os.path.join(tmp_path, "reduce_shards"), + cache_shard_path=tmp_path / "reduce_shards", resume_path=tmp_path, reducing_key="0_11", destination_pixel_order=0, @@ -526,7 +525,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): ## spatial properties for sorting, only numeric. ## sort order is effectively (object id, time) mr.reduce_pixel_shards( - cache_shard_path=os.path.join(tmp_path, "reduce_shards"), + cache_shard_path=tmp_path / "reduce_shards", resume_path=tmp_path, reducing_key="0_11", destination_pixel_order=0, diff --git a/tests/hipscat_import/catalog/test_run_import.py b/tests/hipscat_import/catalog/test_run_import.py index 587650c8..4821a0af 100644 --- a/tests/hipscat_import/catalog/test_run_import.py +++ b/tests/hipscat_import/catalog/test_run_import.py @@ -2,6 +2,7 @@ import os import shutil +from pathlib import Path import numpy as np import pandas as pd @@ -41,13 +42,10 @@ def test_resume_dask_runner( """Test execution in the presence of some resume files.""" ## First, copy over our intermediate files. ## This prevents overwriting source-controlled resume files. - intermediate_dir = os.path.join(tmp_path, "resume_catalog", "intermediate") - shutil.copytree( - os.path.join(resume_dir, "intermediate"), - intermediate_dir, - ) + intermediate_dir = tmp_path / "resume_catalog" / "intermediate" + shutil.copytree(resume_dir / "intermediate", intermediate_dir) ## Now set up our resume files to match previous work. 
- resume_tmp = os.path.join(tmp_path, "tmp", "resume_catalog") + resume_tmp = tmp_path / "tmp" / "resume_catalog" plan = ResumePlan(tmp_path=resume_tmp, progress_bar=False) histogram = SparseHistogram.make_from_counts([11], [131], 0) empty = SparseHistogram.make_empty(0) @@ -63,10 +61,7 @@ def test_resume_dask_runner( ResumePlan.touch_key_done_file(resume_tmp, ResumePlan.REDUCING_STAGE, "0_11") - shutil.copytree( - os.path.join(resume_dir, "Norder=0"), - os.path.join(tmp_path, "resume_catalog", "Norder=0"), - ) + shutil.copytree(resume_dir / "Norder=0", tmp_path / "resume_catalog" / "Norder=0") args = ImportArguments( output_artifact_name="resume_catalog", @@ -75,7 +70,7 @@ def test_resume_dask_runner( output_path=tmp_path, dask_tmp=tmp_path, tmp_dir=tmp_path, - resume_tmp=os.path.join(tmp_path, "tmp"), + resume_tmp=tmp_path / "tmp", highest_healpix_order=0, pixel_threshold=1000, progress_bar=False, @@ -93,17 +88,14 @@ def test_resume_dask_runner( assert len(catalog.get_healpix_pixels()) == 1 # Check that the catalog parquet file exists and contains correct object IDs - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = Path(args.catalog_path) / "Norder=0" / "Dir=0" / "Npix=11.parquet" expected_ids = [*range(700, 831)] assert_parquet_file_ids(output_file, "id", expected_ids) ## Re-running the pipeline with fully done intermediate files ## should result in no changes to output files. - shutil.copytree( - os.path.join(resume_dir, "intermediate"), - resume_tmp, - ) + shutil.copytree(resume_dir / "intermediate", resume_tmp) plan = args.resume_plan plan.touch_stage_done_file(ResumePlan.MAPPING_STAGE) plan.touch_stage_done_file(ResumePlan.SPLITTING_STAGE) @@ -145,25 +137,17 @@ def test_resume_dask_runner_diff_pixel_order( with the current HEALPix order.""" ## First, copy over our intermediate files. ## This prevents overwriting source-controlled resume files. - intermediate_dir = os.path.join(tmp_path, "resume_catalog", "intermediate") - shutil.copytree( - os.path.join(resume_dir, "intermediate"), - intermediate_dir, - ) + intermediate_dir = tmp_path / "resume_catalog" / "intermediate" + shutil.copytree(resume_dir / "intermediate", intermediate_dir) ## Now set up our resume files to match previous work. 
- resume_tmp = os.path.join(tmp_path, "tmp", "resume_catalog") + resume_tmp = tmp_path / "tmp" / "resume_catalog" ResumePlan(tmp_path=resume_tmp, progress_bar=False) - SparseHistogram.make_from_counts([11], [131], 0).to_file( - os.path.join(resume_tmp, "mapping_histogram.npz") - ) + SparseHistogram.make_from_counts([11], [131], 0).to_file(resume_tmp / "mapping_histogram.npz") for file_index in range(0, 5): ResumePlan.touch_key_done_file(resume_tmp, ResumePlan.SPLITTING_STAGE, f"split_{file_index}") - shutil.copytree( - os.path.join(resume_dir, "Norder=0"), - os.path.join(tmp_path, "resume_catalog", "Norder=0"), - ) + shutil.copytree(resume_dir / "Norder=0", tmp_path / "resume_catalog" / "Norder=0") with pytest.raises(ValueError, match="incompatible with the highest healpix order"): args = ImportArguments( @@ -173,7 +157,7 @@ def test_resume_dask_runner_diff_pixel_order( output_path=tmp_path, dask_tmp=tmp_path, tmp_dir=tmp_path, - resume_tmp=os.path.join(tmp_path, "tmp"), + resume_tmp=tmp_path / "tmp", constant_healpix_order=1, pixel_threshold=1000, progress_bar=False, @@ -188,7 +172,7 @@ def test_resume_dask_runner_diff_pixel_order( output_path=tmp_path, dask_tmp=tmp_path, tmp_dir=tmp_path, - resume_tmp=os.path.join(tmp_path, "tmp"), + resume_tmp=tmp_path / "tmp", constant_healpix_order=1, pixel_threshold=1000, progress_bar=False, @@ -220,7 +204,7 @@ def test_resume_dask_runner_histograms_diff_size( tmp_path, ): """Tests that the pipeline errors if the partial histograms have different sizes.""" - resume_tmp = os.path.join(tmp_path, "tmp", "resume_catalog") + resume_tmp = tmp_path / "tmp" / "resume_catalog" ResumePlan(tmp_path=resume_tmp, progress_bar=False) # We'll create mock partial histograms of size 0 and 2 @@ -246,7 +230,7 @@ def test_resume_dask_runner_histograms_diff_size( output_path=tmp_path, dask_tmp=tmp_path, tmp_dir=tmp_path, - resume_tmp=os.path.join(tmp_path, "tmp"), + resume_tmp=tmp_path / "tmp", constant_healpix_order=1, pixel_threshold=1000, progress_bar=False, diff --git a/tests/hipscat_import/catalog/test_run_round_trip.py b/tests/hipscat_import/catalog/test_run_round_trip.py index 0d3a1e1c..d9862067 100644 --- a/tests/hipscat_import/catalog/test_run_round_trip.py +++ b/tests/hipscat_import/catalog/test_run_round_trip.py @@ -79,8 +79,8 @@ def test_import_mixed_schema_csv( Path(mixed_schema_csv_dir) / "input_01.csv", Path(mixed_schema_csv_dir) / "input_02.csv", ], - output_path=Path(tmp_path), - dask_tmp=Path(tmp_path), + output_path=tmp_path, + dask_tmp=tmp_path, highest_healpix_order=1, file_reader=get_file_reader( "csv", @@ -260,8 +260,8 @@ def test_import_keep_intermediate_files( """Test that ALL intermediate files are still around on-disk after successful import, when setting the appropriate flags. """ - temp = os.path.join(tmp_path, "intermediate_files") - os.makedirs(temp) + temp = tmp_path / "intermediate_files" + temp.mkdir(parents=True) args = ImportArguments( output_artifact_name="small_sky_object_catalog", input_path=small_sky_parts_dir, @@ -282,7 +282,7 @@ def test_import_keep_intermediate_files( assert catalog.catalog_path == args.catalog_path ## Check that stage-level done files are still around. 
- base_intermediate_dir = os.path.join(temp, "small_sky_object_catalog", "intermediate") + base_intermediate_dir = temp / "small_sky_object_catalog" / "intermediate" expected_contents = [ "alignment.pickle", "histograms", # directory containing sub-histograms @@ -298,21 +298,21 @@ def test_import_keep_intermediate_files( ] assert_directory_contains(base_intermediate_dir, expected_contents) - checking_dir = os.path.join(base_intermediate_dir, "histograms") + checking_dir = base_intermediate_dir / "histograms" assert_directory_contains( checking_dir, ["map_0.npz", "map_1.npz", "map_2.npz", "map_3.npz", "map_4.npz", "map_5.npz"] ) - checking_dir = os.path.join(base_intermediate_dir, "splitting") + checking_dir = base_intermediate_dir / "splitting" assert_directory_contains( checking_dir, ["split_0_done", "split_1_done", "split_2_done", "split_3_done", "split_4_done", "split_5_done"], ) - checking_dir = os.path.join(base_intermediate_dir, "reducing") + checking_dir = base_intermediate_dir / "reducing" assert_directory_contains(checking_dir, ["0_11_done"]) # Check that all of the intermediate parquet shards are still around. - checking_dir = os.path.join(base_intermediate_dir, "order_0", "dir_0", "pixel_11") + checking_dir = base_intermediate_dir / "order_0" / "dir_0" / "pixel_11" assert_directory_contains( checking_dir, [ @@ -424,7 +424,7 @@ def test_import_hipscat_index( ## First, let's just check the assumptions we have about our input file: ## - should have _hipscat_index as the indexed column ## - should NOT have any columns like "ra" or "dec" - input_file = os.path.join(formats_dir, "hipscat_index.parquet") + input_file = formats_dir / "hipscat_index.parquet" expected_ids = [*range(700, 831)] assert_parquet_file_ids(input_file, "id", expected_ids) @@ -475,7 +475,7 @@ def test_import_hipscat_index_no_pandas( tmp_path, ): """Test basic execution, using a previously-computed _hipscat_index column for spatial partitioning.""" - input_file = os.path.join(formats_dir, "hipscat_index.csv") + input_file = formats_dir / "hipscat_index.csv" args = ImportArguments( output_artifact_name="using_hipscat_index", input_file_list=[input_file], @@ -517,8 +517,8 @@ def test_import_gaia_minimum( tmp_path, ): """Test end-to-end import, using a representative chunk of gaia data.""" - input_file = os.path.join(formats_dir, "gaia_minimum.csv") - schema_file = os.path.join(formats_dir, "gaia_minimum_schema.parquet") + input_file = formats_dir / "gaia_minimum.csv" + schema_file = formats_dir / "gaia_minimum_schema.parquet" args = ImportArguments( output_artifact_name="gaia_minimum", @@ -569,7 +569,7 @@ def test_gaia_ecsv( tmp_path, assert_parquet_file_ids, ): - input_file = os.path.join(formats_dir, "gaia_epoch.ecsv") + input_file = formats_dir / "gaia_epoch.ecsv" args = ImportArguments( output_artifact_name="gaia_e_astropy", @@ -661,7 +661,7 @@ def test_gaia_ecsv( # In-memory schema uses list naming convention, but pyarrow converts to # the parquet-compliant list convention when writing to disk. # Round trip the schema to get a schema with compliant nested naming convention. 
- schema_path = os.path.join(tmp_path, "temp_schema.parquet") + schema_path = tmp_path / "temp_schema.parquet" pq.write_table(expected_parquet_schema.empty_table(), where=schema_path) expected_parquet_schema = pq.read_metadata(schema_path).schema.to_arrow_schema() diff --git a/tests/hipscat_import/catalog/test_sparse_histogram.py b/tests/hipscat_import/catalog/test_sparse_histogram.py index 52e22164..57ce78f2 100644 --- a/tests/hipscat_import/catalog/test_sparse_histogram.py +++ b/tests/hipscat_import/catalog/test_sparse_histogram.py @@ -1,7 +1,5 @@ """Test sparse histogram behavior.""" -import os - import numpy as np import numpy.testing as npt import pytest @@ -12,7 +10,7 @@ def test_read_write_round_trip(tmp_path): """Test that we can read what we write into a histogram file.""" - file_name = os.path.join(tmp_path, "round_trip.npz") + file_name = tmp_path / "round_trip.npz" histogram = SparseHistogram.make_from_counts([11], [131], 0) histogram.to_file(file_name) diff --git a/tests/hipscat_import/conftest.py b/tests/hipscat_import/conftest.py index 303144bc..7ef2fc20 100644 --- a/tests/hipscat_import/conftest.py +++ b/tests/hipscat_import/conftest.py @@ -2,6 +2,7 @@ import os import re +from pathlib import Path import healpy as hp import numpy as np @@ -52,119 +53,112 @@ def test_long_running(): @pytest.fixture def test_data_dir(): - return os.path.join(TEST_DIR, "data") + return Path(TEST_DIR) / "data" @pytest.fixture def small_sky_dir(test_data_dir): - return os.path.join(test_data_dir, "small_sky") + return test_data_dir / "small_sky" @pytest.fixture def small_sky_single_file(test_data_dir): - return os.path.join(test_data_dir, "small_sky", "catalog.csv") + return test_data_dir / "small_sky" / "catalog.csv" @pytest.fixture def small_sky_object_catalog(test_data_dir): - return os.path.join(test_data_dir, "small_sky_object_catalog") + return test_data_dir / "small_sky_object_catalog" @pytest.fixture def small_sky_source_dir(test_data_dir): - return os.path.join(test_data_dir, "small_sky_source") + return test_data_dir / "small_sky_source" @pytest.fixture def small_sky_source_catalog(test_data_dir): - return os.path.join(test_data_dir, "small_sky_source_catalog") + return test_data_dir / "small_sky_source_catalog" @pytest.fixture def blank_data_dir(test_data_dir): - return os.path.join(test_data_dir, "blank") + return test_data_dir / "blank" @pytest.fixture def blank_data_file(test_data_dir): - return os.path.join(test_data_dir, "blank", "blank.csv") + return test_data_dir / "blank" / "blank.csv" @pytest.fixture def empty_data_dir(test_data_dir): - return os.path.join(test_data_dir, "empty") + return test_data_dir / "empty" @pytest.fixture def formats_dir(test_data_dir): - return os.path.join(test_data_dir, "test_formats") + return test_data_dir / "test_formats" @pytest.fixture def formats_headers_csv(test_data_dir): - return os.path.join(test_data_dir, "test_formats", "headers.csv") + return test_data_dir / "test_formats" / "headers.csv" @pytest.fixture def formats_pipe_csv(test_data_dir): - return os.path.join(test_data_dir, "test_formats", "pipe_delimited.csv") + return test_data_dir / "test_formats" / "pipe_delimited.csv" @pytest.fixture def formats_fits(test_data_dir): - return os.path.join(test_data_dir, "test_formats", "small_sky.fits") + return test_data_dir / "test_formats" / "small_sky.fits" @pytest.fixture def formats_pandasindex(test_data_dir): - return os.path.join(test_data_dir, "test_formats", "pandasindex.parquet") + return test_data_dir / "test_formats" / 
"pandasindex.parquet" @pytest.fixture def small_sky_parts_dir(test_data_dir): - return os.path.join(test_data_dir, "small_sky_parts") + return test_data_dir / "small_sky_parts" @pytest.fixture def small_sky_file0(test_data_dir): - return os.path.join(test_data_dir, "small_sky_parts", "catalog_00_of_05.csv") + return test_data_dir / "small_sky_parts" / "catalog_00_of_05.csv" @pytest.fixture def parquet_shards_dir(test_data_dir): - return os.path.join(test_data_dir, "parquet_shards") + return test_data_dir / "parquet_shards" @pytest.fixture def soap_intermediate_dir(test_data_dir): - return os.path.join(test_data_dir, "soap_intermediate") + return test_data_dir / "soap_intermediate" @pytest.fixture def parquet_shards_shard_44_0(test_data_dir): - return os.path.join( - test_data_dir, - "parquet_shards", - "order_1", - "dir_0", - "pixel_44", - "shard_3_0.parquet", - ) + return test_data_dir / "parquet_shards" / "order_1" / "dir_0" / "pixel_44" / "shard_3_0.parquet" @pytest.fixture def mixed_schema_csv_dir(test_data_dir): - return os.path.join(test_data_dir, "mixed_schema") + return test_data_dir / "mixed_schema" @pytest.fixture def mixed_schema_csv_parquet(test_data_dir): - return os.path.join(test_data_dir, "mixed_schema", "schema.parquet") + return test_data_dir / "mixed_schema" / "schema.parquet" @pytest.fixture def resume_dir(test_data_dir): - return os.path.join(test_data_dir, "resume") + return test_data_dir / "resume" @pytest.fixture diff --git a/tests/hipscat_import/index/test_index_map_reduce.py b/tests/hipscat_import/index/test_index_map_reduce.py index 0dbba81d..025d0f02 100644 --- a/tests/hipscat_import/index/test_index_map_reduce.py +++ b/tests/hipscat_import/index/test_index_map_reduce.py @@ -1,7 +1,5 @@ """Tests of map reduce operations""" -import os - import numpy as np import numpy.testing as npt import pandas as pd @@ -28,7 +26,7 @@ def test_create_index( ) mr.create_index(args, dask_client) - output_file = os.path.join(tmp_path, "small_sky_object_index", "index", "part.0.parquet") + output_file = tmp_path / "small_sky_object_index" / "index" / "part.0.parquet" expected_ids = [*range(700, 831)] assert_parquet_file_index(output_file, expected_ids) @@ -55,7 +53,7 @@ def test_create_index_no_hipscat_index(small_sky_object_catalog, tmp_path, dask_ ) mr.create_index(args, dask_client) - output_file = os.path.join(tmp_path, "small_sky_object_index", "index", "part.0.parquet") + output_file = tmp_path / "small_sky_object_index" / "index" / "part.0.parquet" data_frame = pd.read_parquet(output_file, engine="pyarrow") npt.assert_array_equal(data_frame.columns, ["Norder", "Dir", "Npix"]) @@ -76,7 +74,7 @@ def test_create_index_no_order_pixel(small_sky_object_catalog, tmp_path, dask_cl ) mr.create_index(args, dask_client) - output_file = os.path.join(tmp_path, "small_sky_object_index", "index", "part.0.parquet") + output_file = tmp_path / "small_sky_object_index" / "index" / "part.0.parquet" data_frame = pd.read_parquet(output_file, engine="pyarrow") npt.assert_array_equal(data_frame.columns, ["_hipscat_index"]) @@ -95,7 +93,7 @@ def test_create_index_source(small_sky_source_catalog, assert_parquet_file_index ) mr.create_index(args, dask_client) - output_file = os.path.join(tmp_path, "small_sky_source_index", "index", "part.0.parquet") + output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet" expected_ids = [*range(70_000, 87_161)] assert_parquet_file_index(output_file, expected_ids) @@ -134,7 +132,7 @@ def test_create_index_with_divisions( ) mr.create_index(args, 
dask_client) - output_file = os.path.join(tmp_path, "small_sky_source_index", "index", "part.0.parquet") + output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet" expected_ids = [*range(70_000, 87_161)] assert_parquet_file_index(output_file, expected_ids) @@ -167,7 +165,7 @@ def test_create_index_source_by_object( ) mr.create_index(args, dask_client) - output_file = os.path.join(tmp_path, "small_sky_source_index", "index", "part.0.parquet") + output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet" expected_ids = np.repeat([*range(700, 831)], 131) assert_parquet_file_index(output_file, expected_ids) @@ -199,7 +197,7 @@ def test_create_index_extra_columns( ) mr.create_index(args, dask_client) - output_file = os.path.join(tmp_path, "small_sky_source_index", "index", "part.0.parquet") + output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet" expected_ids = np.repeat([*range(700, 831)], 131) assert_parquet_file_index(output_file, expected_ids) diff --git a/tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py b/tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py index c6332c21..72b93dbc 100644 --- a/tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py +++ b/tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py @@ -46,7 +46,7 @@ def test_to_pixel_shard_equator(tmp_path, basic_data_shard_df): dec_column="weird_dec", ) - path = os.path.join(tmp_path, "order_1", "dir_0", "pixel_21", "Norder=1", "Dir=0", "Npix=0.parquet") + path = tmp_path / "order_1" / "dir_0" / "pixel_21" / "Norder=1" / "Dir=0" / "Npix=0.parquet" assert os.path.exists(path) @@ -63,7 +63,7 @@ def test_to_pixel_shard_polar(tmp_path, polar_data_shard_df): dec_column="weird_dec", ) - path = os.path.join(tmp_path, "order_2", "dir_0", "pixel_15", "Norder=2", "Dir=0", "Npix=0.parquet") + path = tmp_path / "order_2" / "dir_0" / "pixel_15" / "Norder=2" / "Dir=0" / "Npix=0.parquet" assert os.path.exists(path) @@ -92,12 +92,12 @@ def test_map_pixel_shards_error(tmp_path, capsys): def test_reduce_margin_shards(tmp_path): - intermediate_dir = os.path.join(tmp_path, "intermediate") + intermediate_dir = tmp_path / "intermediate" partition_dir = get_pixel_cache_directory(intermediate_dir, HealpixPixel(1, 21)) shard_dir = paths.pixel_directory(partition_dir, 1, 21) os.makedirs(shard_dir) - os.makedirs(os.path.join(intermediate_dir, "reducing")) + os.makedirs(intermediate_dir / "reducing") first_shard_path = paths.pixel_catalog_file(partition_dir, 1, 0) second_shard_path = paths.pixel_catalog_file(partition_dir, 1, 1) @@ -128,7 +128,7 @@ def test_reduce_margin_shards(tmp_path): ) # Create a schema parquet file. - schema_path = os.path.join(tmp_path, "metadata.parquet") + schema_path = tmp_path / "metadata.parquet" schema_df = test_df.drop(columns=["margin_Norder", "margin_Dir", "margin_Npix"]) schema_df.to_parquet(schema_path) @@ -176,14 +176,14 @@ def test_reduce_margin_shards(tmp_path): def test_reduce_margin_shards_error(tmp_path, basic_data_shard_df, capsys): """Test error behavior on reduce stage. e.g. 
by not creating the original catalog metadata.""" - intermediate_dir = os.path.join(tmp_path, "intermediate") + intermediate_dir = tmp_path / "intermediate" partition_dir = get_pixel_cache_directory(intermediate_dir, HealpixPixel(1, 21)) shard_dir = paths.pixel_directory(partition_dir, 1, 21) os.makedirs(shard_dir) - os.makedirs(os.path.join(intermediate_dir, "reducing")) + os.makedirs(intermediate_dir / "reducing") # Don't write anything at the metadata path! - schema_path = os.path.join(tmp_path, "metadata.parquet") + schema_path = tmp_path / "metadata.parquet" basic_data_shard_df.to_parquet(paths.pixel_catalog_file(partition_dir, 1, 0)) basic_data_shard_df.to_parquet(paths.pixel_catalog_file(partition_dir, 1, 1)) diff --git a/tests/hipscat_import/soap/test_soap_map_reduce.py b/tests/hipscat_import/soap/test_soap_map_reduce.py index b95745e7..ab88f176 100644 --- a/tests/hipscat_import/soap/test_soap_map_reduce.py +++ b/tests/hipscat_import/soap/test_soap_map_reduce.py @@ -2,6 +2,7 @@ import os import shutil +from pathlib import Path import numpy.testing as npt import pandas as pd @@ -19,9 +20,7 @@ def test_count_joins(small_sky_soap_args, tmp_path, small_sky_soap_maps): count_joins(small_sky_soap_args, source, objects) result = pd.read_csv( - os.path.join( - tmp_path, "small_sky_association", "intermediate", f"{source.order}_{source.pixel}.csv" - ) + tmp_path / "small_sky_association" / "intermediate" / f"{source.order}_{source.pixel}.csv" ) assert len(result) == 1 assert result["num_rows"].sum() > 0 @@ -32,16 +31,20 @@ def test_count_joins_with_leaf(small_sky_soap_args, small_sky_soap_maps): small_sky_soap_args.write_leaf_files = True small_sky_soap_args.source_id_column = "source_id" - intermediate_dir = small_sky_soap_args.tmp_path + intermediate_dir = Path(small_sky_soap_args.tmp_path) for source, objects in small_sky_soap_maps.items(): count_joins(small_sky_soap_args, source, objects) - result = pd.read_csv(os.path.join(intermediate_dir, f"{source.order}_{source.pixel}.csv")) + result = pd.read_csv(intermediate_dir / f"{source.order}_{source.pixel}.csv") assert len(result) == 1 assert result["num_rows"].sum() > 0 - parquet_file_name = os.path.join( - intermediate_dir, "order_0", "dir_0", "pixel_11", f"source_{source.order}_{source.pixel}.parquet" + parquet_file_name = ( + intermediate_dir + / "order_0" + / "dir_0" + / "pixel_11" + / f"source_{source.order}_{source.pixel}.parquet" ) assert os.path.exists(parquet_file_name), f"file not found [{parquet_file_name}]" @@ -69,9 +72,7 @@ def test_count_joins_missing(small_sky_source_catalog, tmp_path): source = HealpixPixel(2, 176) count_joins(args, source, [HealpixPixel(2, 177), HealpixPixel(2, 178)]) - result_csv = os.path.join( - tmp_path, "small_sky_association", "intermediate", f"{source.order}_{source.pixel}.csv" - ) + result_csv = tmp_path / "small_sky_association" / "intermediate" / f"{source.order}_{source.pixel}.csv" result = pd.read_csv(result_csv) assert len(result) == 3 @@ -91,11 +92,11 @@ def test_count_joins_missing(small_sky_source_catalog, tmp_path): def test_combine_results(tmp_path): """Test combining many CSVs into a single one""" - input_path = os.path.join(tmp_path, "input") - os.makedirs(input_path, exist_ok=True) + input_path = tmp_path / "input" + input_path.mkdir(parents=True) - output_path = os.path.join(tmp_path, "output") - os.makedirs(output_path, exist_ok=True) + output_path = tmp_path / "output" + output_path.mkdir(parents=True) join_info = pd.DataFrame( data=[ @@ -113,16 +114,16 @@ def 
test_combine_results(tmp_path): "num_rows", ], ) - partitions_csv_file = os.path.join(input_path, "0_11.csv") + partitions_csv_file = input_path / "0_11.csv" join_info.to_csv(partitions_csv_file, index=False) total_num_rows = combine_partial_results(input_path, output_path, None) assert total_num_rows == 131 - result = pd.read_csv(os.path.join(output_path, "partition_join_info.csv")) + result = pd.read_csv(output_path / "partition_join_info.csv") assert len(result) == 2 - result = pd.read_csv(os.path.join(output_path, "unmatched_sources.csv")) + result = pd.read_csv(output_path / "unmatched_sources.csv") assert len(result) == 1 diff --git a/tests/hipscat_import/test_pipeline_resume_plan.py b/tests/hipscat_import/test_pipeline_resume_plan.py index b694f33c..c5e0b77c 100644 --- a/tests/hipscat_import/test_pipeline_resume_plan.py +++ b/tests/hipscat_import/test_pipeline_resume_plan.py @@ -1,6 +1,5 @@ """Test resume file operations""" -import os from pathlib import Path import numpy.testing as npt @@ -13,7 +12,7 @@ def test_done_key(tmp_path): """Verify expected behavior of marking stage progress via done files.""" plan = PipelineResumePlan(tmp_path=tmp_path, progress_bar=False) stage = "testing" - os.makedirs(os.path.join(tmp_path, stage)) + (tmp_path / stage).mkdir(parents=True) keys = plan.read_done_keys(stage) assert len(keys) == 0 diff --git a/tests/hipscat_import/test_runtime_arguments.py b/tests/hipscat_import/test_runtime_arguments.py index 69aded26..cea801cc 100644 --- a/tests/hipscat_import/test_runtime_arguments.py +++ b/tests/hipscat_import/test_runtime_arguments.py @@ -1,7 +1,5 @@ """Tests of argument validation""" -import os - import pytest from hipscat_import.runtime_arguments import RuntimeArguments @@ -70,12 +68,12 @@ def test_good_paths(tmp_path): def test_tmp_path_creation(tmp_path): """Check that we create a new temp path for this catalog.""" - output_path = os.path.join(tmp_path, "unique_output_directory") - temp_path = os.path.join(tmp_path, "unique_tmp_directory") - dask_tmp_path = os.path.join(tmp_path, "unique_dask_directory") - os.makedirs(output_path, exist_ok=True) - os.makedirs(temp_path, exist_ok=True) - os.makedirs(dask_tmp_path, exist_ok=True) + output_path = tmp_path / "unique_output_directory" + temp_path = tmp_path / "unique_tmp_directory" + dask_tmp_path = tmp_path / "unique_dask_directory" + output_path.mkdir(parents=True) + temp_path.mkdir(parents=True) + dask_tmp_path.mkdir(parents=True) ## If no tmp paths are given, use the output directory args = RuntimeArguments( diff --git a/tests/hipscat_import/verification/test_verification_arguments.py b/tests/hipscat_import/verification/test_verification_arguments.py index 8ebd6c81..303a63f7 100644 --- a/tests/hipscat_import/verification/test_verification_arguments.py +++ b/tests/hipscat_import/verification/test_verification_arguments.py @@ -62,7 +62,7 @@ def test_catalog_object(tmp_path, small_sky_object_catalog): output_path=tmp_path, output_artifact_name="small_sky_object_verification_report", ) - assert args.input_catalog_path == small_sky_object_catalog + assert args.input_catalog_path == str(small_sky_object_catalog) assert str(args.output_path) == tmp_path_str assert str(args.tmp_path).startswith(tmp_path_str) From 33d70e79516009e14dee3d60d02d6a7b08171e31 Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Wed, 12 Jun 2024 13:54:42 -0400 Subject: [PATCH 05/10] Enable "index file" reads for catalog import (#334) * Indexed CSV reads. * Indexed parquet reads. 
* implement dataset.to_batches method in IndexedParquetReader * Use file_io dataset read for cloud URIs * Add documentation on index batching --------- Co-authored-by: Troy Raen --- docs/catalogs/arguments.rst | 36 ++++- src/hipscat_import/catalog/file_readers.py | 129 +++++++++++++++++- .../catalog/test_file_readers.py | 63 ++++++++- .../catalog/test_run_round_trip.py | 47 +++++++ tests/hipscat_import/conftest.py | 5 + .../indexed_files/csv_list_double_1_of_2.txt | 3 + .../indexed_files/csv_list_double_2_of_2.txt | 3 + .../data/indexed_files/csv_list_single.txt | 6 + .../indexed_files/parquet_list_single.txt | 5 + 9 files changed, 291 insertions(+), 6 deletions(-) create mode 100644 tests/hipscat_import/data/indexed_files/csv_list_double_1_of_2.txt create mode 100644 tests/hipscat_import/data/indexed_files/csv_list_double_2_of_2.txt create mode 100644 tests/hipscat_import/data/indexed_files/csv_list_single.txt create mode 100644 tests/hipscat_import/data/indexed_files/parquet_list_single.txt diff --git a/docs/catalogs/arguments.rst b/docs/catalogs/arguments.rst index ca86302b..86e378f8 100644 --- a/docs/catalogs/arguments.rst +++ b/docs/catalogs/arguments.rst @@ -97,7 +97,6 @@ Reading input files Catalog import reads through a list of files and converts them into a hipscatted catalog. - Which files? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -129,9 +128,10 @@ along to the map/reduce stages. We've provided reference implementations for reading CSV, FITS, and Parquet input files, but you can subclass the reader type to suit whatever input files you've got. -You only need to provide the ``file_reader`` argument if you are using a custom file reader +You only need to provide the ``file_reader`` argument as an object if you are using a custom file reader or passing parameters to the file reader. For example you might use ``file_reader=CsvReader(sep="\s+")`` -to parse a whitespace separated file. +to parse a whitespace separated file. Otherwise, you can use a short string to +specify an existing file reader type e.g. ``file_reader="csv"``. You can find the full API documentation for :py:class:`hipscat_import.catalog.file_readers.InputReader` @@ -171,6 +171,36 @@ You can find the full API documentation for If you're reading from cloud storage, or otherwise have some filesystem credential dict, put those in ``input_storage_options``. +Indexed batching strategy +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you have many small files (think 400k+ CSV files with a few rows each), you +may benefit from "indexed" file readers. These allow you to explicitly create +batches for tasks by providing a set of index files, where each file is a +text file that contains only paths to data files. + +Benefits: + +1. If you have 400k+ input files, you don't want to create 400k+ dask tasks + to process these files. +2. If the files are very small, batching them in this way allows the import + process to *combine* several small files into a single chunk for processing. + This will result in fewer intermediate files during the ``splitting`` stage. +3. If you have parquet files over a slow networked file system, we support + pyarrow's readahead protocol through indexed readers. + +Warnings: + +1. If you have 20 dask workers in your pool, you may be tempted to create + 20 index files. This is not always an efficient use of resources! + You'd be better served by 200 index files, so that: + + a. 
dask can spread the load if some lists of files take longer to process + than others + b. if the pipeline dies after successfully processing 15 lists, when you + retry the pipeline, you'll only be processing 5 lists with those same 20 + workers and many workers will be sitting idle. + Which fields? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/src/hipscat_import/catalog/file_readers.py b/src/hipscat_import/catalog/file_readers.py index f3f27bf0..e0202ddb 100644 --- a/src/hipscat_import/catalog/file_readers.py +++ b/src/hipscat_import/catalog/file_readers.py @@ -3,6 +3,9 @@ import abc from typing import Any, Dict, Union +import pandas as pd +import pyarrow +import pyarrow.dataset import pyarrow.parquet as pq from astropy.io import ascii as ascii_reader from astropy.table import Table @@ -30,8 +33,16 @@ def get_file_reader( includes `.csv.gz` and other compressed csv files - `fits`, flexible image transport system. often used for astropy tables. - `parquet`, compressed columnar data format + - `ecsv`, astropy's enhanced CSV + - `indexed_csv`, "index" style reader, that accepts a file with a list + of csv files that are appended in-memory + - `indexed_parquet`, "index" style reader, that accepts a file with a list + of parquet files that are appended in-memory chunksize (int): number of rows to read in a single iteration. + for single-file readers, large files are split into batches based on this value. + for index-style readers, we read files until we reach this chunksize and + create a single batch in-memory. schema_file (str): path to a parquet schema file. if provided, header names and column types will be pulled from the parquet schema metadata. column_names (list[str]): for CSV files, the names of columns if no header @@ -59,7 +70,16 @@ def get_file_reader( ) if file_format == "parquet": return ParquetReader(chunksize=chunksize, **kwargs) - + if file_format == "indexed_csv": + return IndexedCsvReader( + chunksize=chunksize, + schema_file=schema_file, + column_names=column_names, + type_map=type_map, + **kwargs, + ) + if file_format == "indexed_parquet": + return IndexedParquetReader(chunksize=chunksize, **kwargs) raise NotImplementedError(f"File Format: {file_format} not supported") @@ -96,7 +116,7 @@ def provenance_info(self) -> dict: def regular_file_exists(self, input_file, storage_options: Union[Dict[Any, Any], None] = None, **_kwargs): """Check that the `input_file` points to a single regular file - Raises + Raises: FileNotFoundError: if nothing exists at path, or directory found. """ if not file_io.does_file_or_directory_exist(input_file, storage_options=storage_options): @@ -104,6 +124,20 @@ def regular_file_exists(self, input_file, storage_options: Union[Dict[Any, Any], if not file_io.is_regular_file(input_file, storage_options=storage_options): raise FileNotFoundError(f"Directory found at path - requires regular file: {input_file}") + def read_index_file(self, input_file, storage_options: Union[Dict[Any, Any], None] = None, **kwargs): + """Read an "indexed" file. + + This should contain a list of paths to files to be read and batched. + + Raises: + FileNotFoundError: if nothing exists at path, or directory found. 
+ """ + self.regular_file_exists(input_file, **kwargs) + file_names = file_io.load_text_file(input_file, storage_options=storage_options) + file_names = [f.strip() for f in file_names] + file_names = [f for f in file_names if f] + return file_names + class CsvReader(InputReader): """CSV reader for the most common CSV reading arguments. @@ -180,6 +214,37 @@ def read(self, input_file, read_columns=None): yield from reader +class IndexedCsvReader(CsvReader): + """Reads an index file, containing paths to CSV files to be read and batched + + See CsvReader for additional configuration for reading CSV files. + """ + + def read(self, input_file, read_columns=None): + file_names = self.read_index_file(input_file=input_file, **self.kwargs) + + batch_size = 0 + batch_frames = [] + for file in file_names: + for single_frame in super().read(file, read_columns=read_columns): + if batch_size + len(single_frame) >= self.chunksize: + # We've hit our chunksize, send the batch off to the task. + if len(batch_frames) == 0: + yield single_frame + batch_size = 0 + else: + yield pd.concat(batch_frames, ignore_index=True) + batch_frames = [] + batch_frames.append(single_frame) + batch_size = len(single_frame) + else: + batch_frames.append(single_frame) + batch_size += len(single_frame) + + if len(batch_frames) > 0: + yield pd.concat(batch_frames, ignore_index=True) + + class AstropyEcsvReader(InputReader): """Reads astropy ascii .ecsv files. @@ -291,3 +356,63 @@ def read(self, input_file, read_columns=None): batch_size=self.chunksize, columns=columns, use_pandas_metadata=True ): yield smaller_table.to_pandas() + + +class IndexedParquetReader(InputReader): + """Reads an index file, containing paths to parquet files to be read and batched + + Attributes: + chunksize (int): maximum number of rows to process at once. + Large files will be processed in chunks. Small files will be concatenated. + Also passed to pyarrow.dataset.Dataset.to_batches as `batch_size`. + batch_readahead (int): number of batches to read ahead. + Passed to pyarrow.dataset.Dataset.to_batches. + fragment_readahead (int): number of fragments to read ahead. + Passed to pyarrow.dataset.Dataset.to_batches. + use_threads (bool): whether to use multiple threads for reading. + Passed to pyarrow.dataset.Dataset.to_batches. + column_names (list[str] or None): Names of columns to use from the input dataset. + If None, use all columns. + kwargs: additional arguments to pass along to InputReader.read_index_file. + """ + + def __init__( + self, + chunksize=500_000, + batch_readahead=16, + fragment_readahead=4, + use_threads=True, + column_names=None, + **kwargs, + ): + self.chunksize = chunksize + self.batch_readahead = batch_readahead + self.fragment_readahead = fragment_readahead + self.use_threads = use_threads + self.column_names = column_names + self.kwargs = kwargs + + def read(self, input_file, read_columns=None): + columns = read_columns or self.column_names + file_names = self.read_index_file(input_file=input_file, **self.kwargs) + (_, input_dataset) = file_io.read_parquet_dataset(file_names, **self.kwargs) + + batches, nrows = [], 0 + for batch in input_dataset.to_batches( + batch_size=self.chunksize, + batch_readahead=self.batch_readahead, + fragment_readahead=self.fragment_readahead, + use_threads=self.use_threads, + columns=columns, + ): + if nrows + batch.num_rows > self.chunksize: + # We've hit the chunksize so load to a DataFrame and yield. + # There should always be at least one batch in here since batch_size == self.chunksize above. 
+ yield pyarrow.Table.from_batches(batches).to_pandas() + batches, nrows = [], 0 + + batches.append(batch) + nrows += batch.num_rows + + if len(batches) > 0: + yield pyarrow.Table.from_batches(batches).to_pandas() diff --git a/tests/hipscat_import/catalog/test_file_readers.py b/tests/hipscat_import/catalog/test_file_readers.py index fbd056cc..5438f750 100644 --- a/tests/hipscat_import/catalog/test_file_readers.py +++ b/tests/hipscat_import/catalog/test_file_readers.py @@ -8,7 +8,14 @@ import pytest from hipscat.catalog.catalog import CatalogInfo -from hipscat_import.catalog.file_readers import CsvReader, FitsReader, ParquetReader, get_file_reader +from hipscat_import.catalog.file_readers import ( + CsvReader, + FitsReader, + IndexedCsvReader, + IndexedParquetReader, + ParquetReader, + get_file_reader, +) # pylint: disable=redefined-outer-name @@ -233,6 +240,32 @@ def test_csv_reader_provenance_info(tmp_path, basic_catalog_info): assert "SECRETS" not in data +def test_indexed_csv_reader(indexed_files_dir): + # Chunksize covers all the inputs. + total_chunks = 0 + for frame in IndexedCsvReader(chunksize=10_000).read(indexed_files_dir / "csv_list_single.txt"): + total_chunks += 1 + assert len(frame) == 131 + + assert total_chunks == 1 + + # Chunksize requires splitting into just a few batches. + total_chunks = 0 + for frame in IndexedCsvReader(chunksize=60).read(indexed_files_dir / "csv_list_single.txt"): + total_chunks += 1 + assert len(frame) < 60 + + assert total_chunks == 3 + + # Requesting a very small chunksize. This will split up reads on the CSV. + total_chunks = 0 + for frame in IndexedCsvReader(chunksize=5).read(indexed_files_dir / "csv_list_single.txt"): + total_chunks += 1 + assert len(frame) <= 5 + + assert total_chunks == 29 + + def test_parquet_reader(parquet_shards_shard_44_0): """Verify we can read the parquet file into a single data frame.""" total_chunks = 0 @@ -252,6 +285,34 @@ def test_parquet_reader_chunked(parquet_shards_shard_44_0): assert total_chunks == 7 +def test_indexed_parquet_reader(indexed_files_dir): + # Chunksize covers all the inputs. + total_chunks = 0 + for frame in get_file_reader("indexed_parquet", chunksize=10_000).read( + indexed_files_dir / "parquet_list_single.txt" + ): + total_chunks += 1 + assert len(frame) == 131 + + assert total_chunks == 1 + + # Chunksize requires splitting into just a few batches. + total_chunks = 0 + for frame in IndexedParquetReader(chunksize=60).read(indexed_files_dir / "parquet_list_single.txt"): + total_chunks += 1 + assert len(frame) < 60 + + assert total_chunks == 3 + + # Requesting a very small chunksize. This will split up reads on the CSV. 
+ total_chunks = 0 + for frame in IndexedParquetReader(chunksize=5).read(indexed_files_dir / "parquet_list_single.txt"): + total_chunks += 1 + assert len(frame) <= 5 + + assert total_chunks == 29 + + def test_parquet_reader_provenance_info(tmp_path, basic_catalog_info): """Test that we get some provenance info and it is parseable into JSON.""" reader = ParquetReader(chunksize=1) diff --git a/tests/hipscat_import/catalog/test_run_round_trip.py b/tests/hipscat_import/catalog/test_run_round_trip.py index d9862067..d80b3619 100644 --- a/tests/hipscat_import/catalog/test_run_round_trip.py +++ b/tests/hipscat_import/catalog/test_run_round_trip.py @@ -673,3 +673,50 @@ def test_gaia_ecsv( assert schema.equals(expected_parquet_schema, check_metadata=False) schema = pds.dataset(args.catalog_path, format="parquet").schema assert schema.equals(expected_parquet_schema, check_metadata=False) + + +@pytest.mark.dask +def test_import_indexed_csv( + dask_client, + indexed_files_dir, + tmp_path, +): + """Use indexed-style CSV reads. There are two index files, and we expect + to have two batches worth of intermediate files.""" + temp = tmp_path / "intermediate_files" + os.makedirs(temp) + + args = ImportArguments( + output_artifact_name="indexed_csv", + input_file_list=[ + indexed_files_dir / "csv_list_double_1_of_2.txt", + indexed_files_dir / "csv_list_double_2_of_2.txt", + ], + output_path=tmp_path, + file_reader="indexed_csv", + sort_columns="id", + tmp_dir=temp, + dask_tmp=temp, + highest_healpix_order=2, + delete_intermediate_parquet_files=False, + delete_resume_log_files=False, + pixel_threshold=3_000, + progress_bar=False, + ) + + runner.run(args, dask_client) + + # Check that the catalog metadata file exists + catalog = Catalog.read_from_hipscat(args.catalog_path) + assert catalog.on_disk + assert catalog.catalog_path == args.catalog_path + assert len(catalog.get_healpix_pixels()) == 1 + + # Check that there are TWO intermediate parquet file (two index files). 
+ assert_directory_contains( + temp / "indexed_csv" / "intermediate" / "order_0" / "dir_0" / "pixel_11", + [ + "shard_split_0_0.parquet", + "shard_split_1_0.parquet", + ], + ) diff --git a/tests/hipscat_import/conftest.py b/tests/hipscat_import/conftest.py index 7ef2fc20..e16e9fab 100644 --- a/tests/hipscat_import/conftest.py +++ b/tests/hipscat_import/conftest.py @@ -121,6 +121,11 @@ def formats_pandasindex(test_data_dir): return test_data_dir / "test_formats" / "pandasindex.parquet" +@pytest.fixture +def indexed_files_dir(test_data_dir): + return test_data_dir / "indexed_files" + + @pytest.fixture def small_sky_parts_dir(test_data_dir): return test_data_dir / "small_sky_parts" diff --git a/tests/hipscat_import/data/indexed_files/csv_list_double_1_of_2.txt b/tests/hipscat_import/data/indexed_files/csv_list_double_1_of_2.txt new file mode 100644 index 00000000..8e9c9d54 --- /dev/null +++ b/tests/hipscat_import/data/indexed_files/csv_list_double_1_of_2.txt @@ -0,0 +1,3 @@ +tests/hipscat_import/data/small_sky_parts/catalog_00_of_05.csv +tests/hipscat_import/data/small_sky_parts/catalog_01_of_05.csv + diff --git a/tests/hipscat_import/data/indexed_files/csv_list_double_2_of_2.txt b/tests/hipscat_import/data/indexed_files/csv_list_double_2_of_2.txt new file mode 100644 index 00000000..352c08ea --- /dev/null +++ b/tests/hipscat_import/data/indexed_files/csv_list_double_2_of_2.txt @@ -0,0 +1,3 @@ +tests/hipscat_import/data/small_sky_parts/catalog_02_of_05.csv +tests/hipscat_import/data/small_sky_parts/catalog_03_of_05.csv +tests/hipscat_import/data/small_sky_parts/catalog_04_of_05.csv \ No newline at end of file diff --git a/tests/hipscat_import/data/indexed_files/csv_list_single.txt b/tests/hipscat_import/data/indexed_files/csv_list_single.txt new file mode 100644 index 00000000..04817f83 --- /dev/null +++ b/tests/hipscat_import/data/indexed_files/csv_list_single.txt @@ -0,0 +1,6 @@ +tests/hipscat_import/data/small_sky_parts/catalog_00_of_05.csv +tests/hipscat_import/data/small_sky_parts/catalog_01_of_05.csv +tests/hipscat_import/data/small_sky_parts/catalog_02_of_05.csv +tests/hipscat_import/data/small_sky_parts/catalog_03_of_05.csv +tests/hipscat_import/data/small_sky_parts/catalog_04_of_05.csv + diff --git a/tests/hipscat_import/data/indexed_files/parquet_list_single.txt b/tests/hipscat_import/data/indexed_files/parquet_list_single.txt new file mode 100644 index 00000000..63e5b84f --- /dev/null +++ b/tests/hipscat_import/data/indexed_files/parquet_list_single.txt @@ -0,0 +1,5 @@ +tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_0_0.parquet +tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_1_0.parquet +tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_2_0.parquet +tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_3_0.parquet +tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_4_0.parquet From 22cba75c47416f07befd648952493ea938ae5b01 Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Wed, 12 Jun 2024 15:07:58 -0400 Subject: [PATCH 06/10] Enable std tqdm bar, refactor for re-use. (#330) * Enable std tqdm bar, refactor for re-use. * Test output of progress printing. * WTF isort. * Document new progress bar option. 
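For illustration, a minimal usage sketch of the text-only progress bar this patch introduces. It assumes ``ImportArguments`` inherits the new ``simple_progress_bar`` flag from ``RuntimeArguments`` (as the diff below suggests); the paths and names are placeholders, and the direct ``print_progress`` call mirrors the module-level helper added in ``pipeline_resume_plan.py``:

    from hipscat_import.catalog.arguments import ImportArguments
    from hipscat_import.pipeline_resume_plan import print_progress

    # Request a plain-text tqdm bar instead of the ipywidget flavor, which is
    # handy in notebooks where ipywidgets are not available.
    args = ImportArguments(
        output_artifact_name="my_catalog",             # placeholder
        input_file_list=["/data/input/part_000.csv"],  # placeholder
        file_reader="csv",
        output_path="/data/catalogs",                  # placeholder
        highest_healpix_order=7,
        progress_bar=True,
        simple_progress_bar=True,  # new flag from this patch
    )

    # The refactored helper can also wrap any iterable directly.
    for _ in print_progress(range(4), stage_name="demo", simple_progress_bar=True):
        pass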
--- docs/catalogs/arguments.rst | 6 ++ src/hipscat_import/catalog/resume_plan.py | 5 +- src/hipscat_import/catalog/run_import.py | 10 +-- src/hipscat_import/index/run_index.py | 10 ++- .../margin_cache/margin_cache.py | 3 +- .../margin_cache/margin_cache_resume_plan.py | 6 +- src/hipscat_import/pipeline_resume_plan.py | 88 +++++++++++++++---- src/hipscat_import/runtime_arguments.py | 6 +- src/hipscat_import/soap/resume_plan.py | 6 +- src/hipscat_import/soap/run_soap.py | 6 +- .../test_pipeline_resume_plan.py | 32 +++++-- 11 files changed, 124 insertions(+), 54 deletions(-) diff --git a/docs/catalogs/arguments.rst b/docs/catalogs/arguments.rst index 86e378f8..b7c7e0a9 100644 --- a/docs/catalogs/arguments.rst +++ b/docs/catalogs/arguments.rst @@ -275,6 +275,12 @@ reporting to look like the following: Reducing : 100%|██████████| 10895/10895 [7:46:07<00:00, 2.57s/it] Finishing: 100%|██████████| 6/6 [08:03<00:00, 80.65s/it] +``tqdm`` will try to make a guess about the type of output to provide: plain +text as for a command line, or a pretty ipywidget. If it tries to use a pretty +widget but your execution environment can't support the widget, you can +force the pipeline to use a simple progress bar with the ``simple_progress_bar`` +argument. + For very long-running pipelines (e.g. multi-TB inputs), you can get an email notification when the pipeline completes using the ``completion_email_address`` argument. This will send a brief email, diff --git a/src/hipscat_import/catalog/resume_plan.py b/src/hipscat_import/catalog/resume_plan.py index 30750d12..5ff33f2e 100644 --- a/src/hipscat_import/catalog/resume_plan.py +++ b/src/hipscat_import/catalog/resume_plan.py @@ -11,7 +11,6 @@ from hipscat import pixel_math from hipscat.io import FilePointer, file_io from hipscat.pixel_math.healpix_pixel import HealpixPixel -from tqdm.auto import tqdm from hipscat_import.catalog.sparse_histogram import SparseHistogram from hipscat_import.pipeline_resume_plan import PipelineResumePlan @@ -44,9 +43,7 @@ def __post_init__(self): def gather_plan(self): """Initialize the plan.""" - with tqdm( - total=5, desc=self.get_formatted_stage_name("Planning"), disable=not self.progress_bar - ) as step_progress: + with self.print_progress(total=5, stage_name="Planning") as step_progress: ## Make sure it's safe to use existing resume state. 
super().safe_to_resume() step_progress.update(1) diff --git a/src/hipscat_import/catalog/run_import.py b/src/hipscat_import/catalog/run_import.py index 7081670a..1e4054cc 100644 --- a/src/hipscat_import/catalog/run_import.py +++ b/src/hipscat_import/catalog/run_import.py @@ -11,11 +11,9 @@ from hipscat.catalog import PartitionInfo from hipscat.io import paths from hipscat.io.parquet_metadata import write_parquet_metadata -from tqdm.auto import tqdm import hipscat_import.catalog.map_reduce as mr from hipscat_import.catalog.arguments import ImportArguments -from hipscat_import.pipeline_resume_plan import PipelineResumePlan def run(args, client): @@ -47,9 +45,7 @@ def run(args, client): ) args.resume_plan.wait_for_mapping(futures) - with tqdm( - total=2, desc=PipelineResumePlan.get_formatted_stage_name("Binning"), disable=not args.progress_bar - ) as step_progress: + with args.resume_plan.print_progress(total=2, stage_name="Binning") as step_progress: raw_histogram = args.resume_plan.read_histogram(args.mapping_healpix_order) step_progress.update(1) alignment_file = args.resume_plan.get_alignment_file( @@ -115,9 +111,7 @@ def run(args, client): args.resume_plan.wait_for_reducing(futures) # All done - write out the metadata - with tqdm( - total=5, desc=PipelineResumePlan.get_formatted_stage_name("Finishing"), disable=not args.progress_bar - ) as step_progress: + with args.resume_plan.print_progress(total=5, stage_name="Finishing") as step_progress: catalog_info = args.to_catalog_info(int(raw_histogram.sum())) io.write_provenance_info( catalog_base_dir=args.catalog_path, diff --git a/src/hipscat_import/index/run_index.py b/src/hipscat_import/index/run_index.py index c4279623..bbf870d1 100644 --- a/src/hipscat_import/index/run_index.py +++ b/src/hipscat_import/index/run_index.py @@ -1,11 +1,10 @@ """Create columnar index of hipscat table using dask for parallelization""" from hipscat.io import file_io, parquet_metadata, write_metadata -from tqdm.auto import tqdm import hipscat_import.index.map_reduce as mr from hipscat_import.index.arguments import IndexArguments -from hipscat_import.pipeline_resume_plan import PipelineResumePlan +from hipscat_import.pipeline_resume_plan import print_progress def run(args, client): @@ -17,8 +16,11 @@ def run(args, client): rows_written = mr.create_index(args, client) # All done - write out the metadata - with tqdm( - total=4, desc=PipelineResumePlan.get_formatted_stage_name("Finishing"), disable=not args.progress_bar + with print_progress( + total=4, + stage_name="Finishing", + use_progress_bar=args.progress_bar, + simple_progress_bar=args.simple_progress_bar, ) as step_progress: index_catalog_info = args.to_catalog_info(int(rows_written)) write_metadata.write_provenance_info( diff --git a/src/hipscat_import/margin_cache/margin_cache.py b/src/hipscat_import/margin_cache/margin_cache.py index 217fcc56..8611393f 100644 --- a/src/hipscat_import/margin_cache/margin_cache.py +++ b/src/hipscat_import/margin_cache/margin_cache.py @@ -1,6 +1,5 @@ from hipscat.catalog import PartitionInfo from hipscat.io import file_io, parquet_metadata, paths, write_metadata -from tqdm.auto import tqdm import hipscat_import.margin_cache.margin_cache_map_reduce as mcmr from hipscat_import.margin_cache.margin_cache_resume_plan import MarginCachePlan @@ -59,7 +58,7 @@ def generate_margin_cache(args, client): ) resume_plan.wait_for_reducing(futures) - with tqdm(total=4, desc="Finishing", disable=not args.progress_bar) as step_progress: + with resume_plan.print_progress(total=4, 
stage_name="Finishing") as step_progress: parquet_metadata.write_parquet_metadata( args.catalog_path, storage_options=args.output_storage_options ) diff --git a/src/hipscat_import/margin_cache/margin_cache_resume_plan.py b/src/hipscat_import/margin_cache/margin_cache_resume_plan.py index 72aec80d..9a6274df 100644 --- a/src/hipscat_import/margin_cache/margin_cache_resume_plan.py +++ b/src/hipscat_import/margin_cache/margin_cache_resume_plan.py @@ -9,7 +9,6 @@ from hipscat import pixel_math from hipscat.io import file_io from hipscat.pixel_math.healpix_pixel import HealpixPixel -from tqdm.auto import tqdm from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments from hipscat_import.pipeline_resume_plan import PipelineResumePlan @@ -33,6 +32,7 @@ def __init__(self, args: MarginCacheArguments): super().__init__( resume=args.resume, progress_bar=args.progress_bar, + simple_progress_bar=args.simple_progress_bar, tmp_path=args.tmp_path, delete_resume_log_files=args.delete_resume_log_files, ) @@ -40,9 +40,7 @@ def __init__(self, args: MarginCacheArguments): def _gather_plan(self, args): """Initialize the plan.""" - with tqdm( - total=3, desc=self.get_formatted_stage_name("Planning"), disable=not self.progress_bar - ) as step_progress: + with self.print_progress(total=3, stage_name="Planning") as step_progress: ## Make sure it's safe to use existing resume state. super().safe_to_resume() mapping_done = self.is_mapping_done() diff --git a/src/hipscat_import/pipeline_resume_plan.py b/src/hipscat_import/pipeline_resume_plan.py index 7e859e91..84a1e603 100644 --- a/src/hipscat_import/pipeline_resume_plan.py +++ b/src/hipscat_import/pipeline_resume_plan.py @@ -10,7 +10,8 @@ from dask.distributed import print as dask_print from hipscat.io import FilePointer, file_io from hipscat.pixel_math.healpix_pixel import HealpixPixel -from tqdm.auto import tqdm +from tqdm.auto import tqdm as auto_tqdm +from tqdm.std import tqdm as std_tqdm @dataclass @@ -25,6 +26,10 @@ class PipelineResumePlan: progress_bar: bool = True """if true, a tqdm progress bar will be displayed for user feedback of planning progress""" + simple_progress_bar: bool = False + """if displaying a progress bar, use a text-only simple progress + bar instead of widget. this can be useful in some environments when running + in a notebook where ipywidgets cannot be used (see `progress_bar` argument)""" delete_resume_log_files: bool = True """should we delete task-level done files once each stage is complete? if False, we will keep all sub-histograms from the mapping stage, and all @@ -131,13 +136,7 @@ def wait_for_futures(self, futures, stage_name, fail_fast=False): RuntimeError: if any future returns an error status. """ some_error = False - formatted_stage_name = self.get_formatted_stage_name(stage_name) - for future in tqdm( - as_completed(futures), - desc=formatted_stage_name, - total=len(futures), - disable=(not self.progress_bar), - ): + for future in self.print_progress(as_completed(futures), stage_name=stage_name, total=len(futures)): if future.status == "error": some_error = True if fail_fast: @@ -146,18 +145,26 @@ def wait_for_futures(self, futures, stage_name, fail_fast=False): if some_error: raise RuntimeError(f"Some {stage_name} stages failed. See logs for details.") - @staticmethod - def get_formatted_stage_name(stage_name) -> str: - """Create a stage name of consistent minimum length. Ensures that the tqdm - progress bars can line up nicely when multiple stages must run. 
+ def print_progress(self, iterable=None, total=None, stage_name=None): + """Create a progress bar that will provide user with task feedback. + + This is a thin wrapper around the static ``print_progress`` method that uses + member variables for the caller's convenience. Args: - stage_name (str): name of the stage (e.g. mapping, reducing) + iterable (iterable): Optional. provides iterations to progress updates. + total (int): Optional. Expected iterations. + stage_name (str): name of the stage (e.g. mapping, reducing). this will + be further formatted with ``get_formatted_stage_name``, so the caller + doesn't need to worry about that. """ - if stage_name is None or len(stage_name) == 0: - stage_name = "progress" - - return f"{stage_name.capitalize(): <10}" + return print_progress( + iterable=iterable, + total=total, + stage_name=stage_name, + use_progress_bar=self.progress_bar, + simple_progress_bar=self.simple_progress_bar, + ) def check_original_input_paths(self, input_paths): """Validate that we're operating on the same file set as the original pipeline, @@ -230,3 +237,50 @@ def print_task_failure(custom_message, exception): except Exception: # pylint: disable=broad-exception-caught pass dask_print(exception) + + +def get_formatted_stage_name(stage_name) -> str: + """Create a stage name of consistent minimum length. Ensures that the tqdm + progress bars can line up nicely when multiple stages must run. + + Args: + stage_name (str): name of the stage (e.g. mapping, reducing) + """ + if stage_name is None or len(stage_name) == 0: + stage_name = "progress" + + return f"{stage_name.capitalize(): <10}" + + +def print_progress( + iterable=None, total=None, stage_name=None, use_progress_bar=True, simple_progress_bar=False +): + """Create a progress bar that will provide user with task feedback. + + Args: + iterable (iterable): Optional. provides iterations to progress updates. + total (int): Optional. Expected iterations. + stage_name (str): name of the stage (e.g. mapping, reducing). this will + be further formatted with ``get_formatted_stage_name``, so the caller + doesn't need to worry about that. + use_progress_bar (bool): should we display any progress. typically False + when no stdout is expected. + simple_progress_bar (bool): if displaying a progress bar, use a text-only + simple progress bar instead of widget. this can be useful when running + in a particular notebook where ipywidgets cannot be used + (only used when ``use_progress_bar`` is True) + """ + if simple_progress_bar: + return std_tqdm( + iterable, + desc=get_formatted_stage_name(stage_name), + total=total, + disable=not use_progress_bar, + ) + + return auto_tqdm( + iterable, + desc=get_formatted_stage_name(stage_name), + total=total, + disable=not use_progress_bar, + ) diff --git a/src/hipscat_import/runtime_arguments.py b/src/hipscat_import/runtime_arguments.py index 4f4d7126..9ce5e8ad 100644 --- a/src/hipscat_import/runtime_arguments.py +++ b/src/hipscat_import/runtime_arguments.py @@ -32,8 +32,12 @@ class RuntimeArguments: the pipeline where we left off. If False, we start the import from scratch, overwriting any content of the output directory.""" progress_bar: bool = True - """if true, a tqdm progress bar will be displayed for user + """if true, a progress bar will be displayed for user feedback of map reduce progress""" + simple_progress_bar: bool = False + """if displaying a progress bar, use a text-only simple progress + bar instead of widget. 
this can be useful in some environments when running + in a notebook where ipywidgets cannot be used (see `progress_bar` argument)""" dask_tmp: str = "" """directory for dask worker space. this should be local to the execution of the pipeline, for speed of reads and writes""" diff --git a/src/hipscat_import/soap/resume_plan.py b/src/hipscat_import/soap/resume_plan.py index be1a6f86..3afd8626 100644 --- a/src/hipscat_import/soap/resume_plan.py +++ b/src/hipscat_import/soap/resume_plan.py @@ -11,7 +11,6 @@ from hipscat.io import file_io from hipscat.pixel_math.healpix_pixel import HealpixPixel from hipscat.pixel_tree import PixelAlignment, align_trees -from tqdm.auto import tqdm from hipscat_import.pipeline_resume_plan import PipelineResumePlan from hipscat_import.soap.arguments import SoapArguments @@ -39,6 +38,7 @@ def __init__(self, args: SoapArguments): super().__init__( resume=args.resume, progress_bar=args.progress_bar, + simple_progress_bar=args.simple_progress_bar, tmp_path=args.tmp_path, delete_resume_log_files=args.delete_resume_log_files, ) @@ -46,9 +46,7 @@ def __init__(self, args: SoapArguments): def gather_plan(self, args): """Initialize the plan.""" - with tqdm( - total=3, desc=self.get_formatted_stage_name("Planning"), disable=not self.progress_bar - ) as step_progress: + with self.print_progress(total=3, stage_name="Planning") as step_progress: ## Make sure it's safe to use existing resume state. super().safe_to_resume() step_progress.update(1) diff --git a/src/hipscat_import/soap/run_soap.py b/src/hipscat_import/soap/run_soap.py index dafafae4..34d50b8a 100644 --- a/src/hipscat_import/soap/run_soap.py +++ b/src/hipscat_import/soap/run_soap.py @@ -5,9 +5,7 @@ from hipscat.catalog.association_catalog.partition_join_info import PartitionJoinInfo from hipscat.io import parquet_metadata, paths, write_metadata -from tqdm.auto import tqdm -from hipscat_import.pipeline_resume_plan import PipelineResumePlan from hipscat_import.soap.arguments import SoapArguments from hipscat_import.soap.map_reduce import combine_partial_results, count_joins, reduce_joins from hipscat_import.soap.resume_plan import SoapPlan @@ -50,9 +48,7 @@ def run(args, client): resume_plan.wait_for_reducing(futures) # All done - write out the metadata - with tqdm( - total=4, desc=PipelineResumePlan.get_formatted_stage_name("Finishing"), disable=not args.progress_bar - ) as step_progress: + with resume_plan.print_progress(total=4, stage_name="Finishing") as step_progress: if args.write_leaf_files: parquet_metadata.write_parquet_metadata( args.catalog_path, diff --git a/tests/hipscat_import/test_pipeline_resume_plan.py b/tests/hipscat_import/test_pipeline_resume_plan.py index c5e0b77c..7334d6a2 100644 --- a/tests/hipscat_import/test_pipeline_resume_plan.py +++ b/tests/hipscat_import/test_pipeline_resume_plan.py @@ -5,7 +5,7 @@ import numpy.testing as npt import pytest -from hipscat_import.pipeline_resume_plan import PipelineResumePlan +from hipscat_import.pipeline_resume_plan import PipelineResumePlan, get_formatted_stage_name def test_done_key(tmp_path): @@ -118,6 +118,28 @@ def error_on_even(argument): plan.wait_for_futures(futures, "test") +@pytest.mark.dask +def test_wait_for_futures_progress(tmp_path, dask_client, capsys): + """Test that we can wait around for futures to complete. 
+ + Additionally test that relevant parts of the traceback are printed to stdout.""" + plan = PipelineResumePlan(tmp_path=tmp_path, progress_bar=True, simple_progress_bar=True, resume=False) + + def error_on_even(argument): + """Silly little method used to test futures that fail under predictable conditions""" + if argument % 2 == 0: + raise RuntimeError("we are at odds with evens") + + ## Everything is fine if we're all odd, but use a silly name so it's + ## clear that the stage name is present, and well-formatted. + futures = [dask_client.submit(error_on_even, 1)] + plan.wait_for_futures(futures, "teeeest") + + captured = capsys.readouterr() + assert "Teeeest" in captured.err + assert "100%" in captured.err + + @pytest.mark.dask def test_wait_for_futures_fail_fast(tmp_path, dask_client): """Test that we can wait around for futures to complete. @@ -137,16 +159,16 @@ def error_on_even(argument): def test_formatted_stage_name(): """Test that we make pretty stage names for presenting in progress bars""" - formatted = PipelineResumePlan.get_formatted_stage_name(None) + formatted = get_formatted_stage_name(None) assert formatted == "Progress " - formatted = PipelineResumePlan.get_formatted_stage_name("") + formatted = get_formatted_stage_name("") assert formatted == "Progress " - formatted = PipelineResumePlan.get_formatted_stage_name("stage") + formatted = get_formatted_stage_name("stage") assert formatted == "Stage " - formatted = PipelineResumePlan.get_formatted_stage_name("very long stage name") + formatted = get_formatted_stage_name("very long stage name") assert formatted == "Very long stage name" From de8799c8bd51662911fdd2df6217ff846338270c Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Fri, 14 Jun 2024 13:29:44 -0400 Subject: [PATCH 07/10] Only specify the astropy table columns once. 
(#336) --- src/hipscat_import/catalog/file_readers.py | 2 +- tests/hipscat_import/catalog/test_file_readers.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/hipscat_import/catalog/file_readers.py b/src/hipscat_import/catalog/file_readers.py index e0202ddb..f94c2fc1 100644 --- a/src/hipscat_import/catalog/file_readers.py +++ b/src/hipscat_import/catalog/file_readers.py @@ -306,7 +306,7 @@ def read(self, input_file, read_columns=None): table = Table.read(input_file, memmap=True, **self.kwargs) if read_columns: table.keep_columns(read_columns) - if self.column_names: + elif self.column_names: table.keep_columns(self.column_names) elif self.skip_column_names: table.remove_columns(self.skip_column_names) diff --git a/tests/hipscat_import/catalog/test_file_readers.py b/tests/hipscat_import/catalog/test_file_readers.py index 5438f750..8ceedb3f 100644 --- a/tests/hipscat_import/catalog/test_file_readers.py +++ b/tests/hipscat_import/catalog/test_file_readers.py @@ -360,9 +360,17 @@ def test_read_fits_columns(formats_fits): frame = next(FitsReader(column_names=["id", "ra", "dec"]).read(formats_fits)) assert list(frame.columns) == ["id", "ra", "dec"] + frame = next(FitsReader(column_names=["id", "ra", "dec"]).read(formats_fits, read_columns=["ra", "dec"])) + assert list(frame.columns) == ["ra", "dec"] + frame = next(FitsReader(skip_column_names=["ra_error", "dec_error"]).read(formats_fits)) assert list(frame.columns) == ["id", "ra", "dec", "test_id"] + frame = next( + FitsReader(skip_column_names=["ra_error", "dec_error"]).read(formats_fits, read_columns=["ra", "dec"]) + ) + assert list(frame.columns) == ["ra", "dec"] + def test_fits_reader_provenance_info(tmp_path, basic_catalog_info): """Test that we get some provenance info and it is parseable into JSON.""" From a9e8048e3d312763b93863f94173c53d1cc4f82d Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Fri, 14 Jun 2024 14:55:34 -0400 Subject: [PATCH 08/10] Require recent hipscat (#337) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 931aa25e..9315e911 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "dask[complete]>=2024.3.0", # Includes dask expressions. "deprecated", "healpy", - "hipscat >=0.3.4", + "hipscat >=0.3.5", "ipykernel", # Support for Jupyter notebooks "numpy", "pandas", From 205818afb670636c401f613b3f2879afe1465c99 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 11:42:32 -0400 Subject: [PATCH 09/10] Bump actions/add-to-project from 1.0.1 to 1.0.2 (#340) Bumps [actions/add-to-project](https://github.com/actions/add-to-project) from 1.0.1 to 1.0.2. - [Release notes](https://github.com/actions/add-to-project/releases) - [Commits](https://github.com/actions/add-to-project/compare/v1.0.1...v1.0.2) --- updated-dependencies: - dependency-name: actions/add-to-project dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/add-issue-to-project-tracker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/add-issue-to-project-tracker.yml b/.github/workflows/add-issue-to-project-tracker.yml index 18b113a5..8f0a3694 100644 --- a/.github/workflows/add-issue-to-project-tracker.yml +++ b/.github/workflows/add-issue-to-project-tracker.yml @@ -10,7 +10,7 @@ jobs: name: Add issue to project runs-on: ubuntu-latest steps: - - uses: actions/add-to-project@v1.0.1 + - uses: actions/add-to-project@v1.0.2 with: project-url: https://github.com/orgs/astronomy-commons/projects/7 github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} From 72a76265b849dea2b752cc6f8d50c932fb3e0779 Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Tue, 16 Jul 2024 11:51:40 -0400 Subject: [PATCH 10/10] Use list of names, instead of pandas index. (#343) --- src/hipscat_import/catalog/file_readers.py | 2 +- .../catalog/test_argument_validation.py | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/hipscat_import/catalog/file_readers.py b/src/hipscat_import/catalog/file_readers.py index f94c2fc1..ba9004f4 100644 --- a/src/hipscat_import/catalog/file_readers.py +++ b/src/hipscat_import/catalog/file_readers.py @@ -192,7 +192,7 @@ def __init__( if self.column_names: self.kwargs["names"] = self.column_names elif not self.header and schema_parquet is not None: - self.kwargs["names"] = schema_parquet.columns + self.kwargs["names"] = list(schema_parquet.columns) if self.type_map: self.kwargs["dtype"] = self.type_map diff --git a/tests/hipscat_import/catalog/test_argument_validation.py b/tests/hipscat_import/catalog/test_argument_validation.py index ee81d07f..c39636d9 100644 --- a/tests/hipscat_import/catalog/test_argument_validation.py +++ b/tests/hipscat_import/catalog/test_argument_validation.py @@ -1,8 +1,10 @@ """Tests of argument validation""" import pytest +from hipscat.io import write_metadata from hipscat_import.catalog.arguments import ImportArguments, check_healpix_order_range +from hipscat_import.catalog.file_readers import CsvReader # pylint: disable=protected-access @@ -206,6 +208,38 @@ def test_provenance_info(blank_data_dir, tmp_path): assert "epoch" in runtime_args +def test_write_provenance_info(formats_dir, tmp_path): + """Verify that provenance info can be written to JSON file.""" + input_file = formats_dir / "gaia_minimum.csv" + schema_file = formats_dir / "gaia_minimum_schema.parquet" + + args = ImportArguments( + output_artifact_name="gaia_minimum", + input_file_list=[input_file], + file_reader=CsvReader( + comment="#", + header=None, + schema_file=schema_file, + ), + ra_column="ra", + dec_column="dec", + sort_columns="solution_id", + use_schema_file=schema_file, + output_path=tmp_path, + dask_tmp=tmp_path, + highest_healpix_order=2, + pixel_threshold=3_000, + progress_bar=False, + ) + + write_metadata.write_provenance_info( + catalog_base_dir=args.catalog_path, + dataset_info=args.to_catalog_info(0), + tool_args=args.provenance_info(), + storage_options=args.output_storage_options, + ) + + def test_check_healpix_order_range(): """Test method check_healpix_order_range""" check_healpix_order_range(5, "order_field")
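To illustrate the one-line reader change above: ``schema_parquet.columns`` is a ``pandas.Index``, and the reader now coerces it to a plain ``list`` before storing it in the CSV keyword arguments as ``names``. A minimal sketch of the type difference (file names below are hypothetical; this does not claim to reproduce the original failure mode):

    import pandas as pd

    schema_parquet = pd.read_parquet("schema.parquet")  # hypothetical schema file
    print(type(schema_parquet.columns))  # <class 'pandas.core.indexes.base.Index'>

    names = list(schema_parquet.columns)  # plain list of column name strings
    print(type(names))  # <class 'list'>

    # Downstream consumers of the reader kwargs now receive a builtin list
    # rather than a pandas Index object.
    frame = pd.read_csv("catalog.csv", header=None, names=names)  # hypothetical data file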