diff --git a/tests/hipscat_import/catalog/test_argument_validation.py b/tests/hipscat_import/catalog/test_argument_validation.py
index 65625eec..ee81d07f 100644
--- a/tests/hipscat_import/catalog/test_argument_validation.py
+++ b/tests/hipscat_import/catalog/test_argument_validation.py
@@ -78,7 +78,7 @@ def test_good_paths(blank_data_dir, blank_data_file, tmp_path):
     )
     assert args.input_path == blank_data_dir
     assert len(args.input_paths) == 1
-    assert blank_data_file in args.input_paths[0]
+    assert str(blank_data_file) in args.input_paths[0]


 def test_multiple_files_in_path(small_sky_parts_dir, tmp_path):
diff --git a/tests/hipscat_import/catalog/test_file_readers.py b/tests/hipscat_import/catalog/test_file_readers.py
index 677b4a7d..fbd056cc 100644
--- a/tests/hipscat_import/catalog/test_file_readers.py
+++ b/tests/hipscat_import/catalog/test_file_readers.py
@@ -1,7 +1,5 @@
 """Test dataframe-generating file readers"""

-import os
-
 import hipscat.io.write_metadata as io
 import numpy as np
 import pandas as pd
@@ -91,7 +89,7 @@ def test_csv_reader_parquet_metadata(small_sky_single_file, tmp_path):
             pa.field("dec_error", pa.float64()),
         ]
     )
-    schema_file = os.path.join(tmp_path, "metadata.parquet")
+    schema_file = tmp_path / "metadata.parquet"
     pq.write_metadata(
         small_sky_schema,
         schema_file,
@@ -187,7 +185,7 @@ def test_csv_reader_pipe_delimited(formats_pipe_csv, tmp_path):
             pa.field("numeric", pa.int64()),
         ]
     )
-    schema_file = os.path.join(tmp_path, "metadata.parquet")
+    schema_file = tmp_path / "metadata.parquet"
     pq.write_metadata(parquet_schema_types, schema_file)

     frame = next(
@@ -224,7 +222,7 @@ def test_csv_reader_provenance_info(tmp_path, basic_catalog_info):
     )
     provenance_info = reader.provenance_info()
     catalog_base_dir = tmp_path / "test_catalog"
-    os.makedirs(catalog_base_dir)
+    catalog_base_dir.mkdir(parents=True)
     io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info)

     with open(catalog_base_dir / "provenance_info.json", "r", encoding="utf-8") as file:
@@ -258,8 +256,8 @@ def test_parquet_reader_provenance_info(tmp_path, basic_catalog_info):
     """Test that we get some provenance info and it is parseable into JSON."""
     reader = ParquetReader(chunksize=1)
     provenance_info = reader.provenance_info()
-    catalog_base_dir = os.path.join(tmp_path, "test_catalog")
-    os.makedirs(catalog_base_dir)
+    catalog_base_dir = tmp_path / "test_catalog"
+    catalog_base_dir.mkdir(parents=True)
     io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info)


@@ -309,6 +307,6 @@ def test_fits_reader_provenance_info(tmp_path, basic_catalog_info):
     """Test that we get some provenance info and it is parseable into JSON."""
     reader = FitsReader()
     provenance_info = reader.provenance_info()
-    catalog_base_dir = os.path.join(tmp_path, "test_catalog")
-    os.makedirs(catalog_base_dir)
+    catalog_base_dir = tmp_path / "test_catalog"
+    catalog_base_dir.mkdir(parents=True)
     io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info)
diff --git a/tests/hipscat_import/catalog/test_map_reduce.py b/tests/hipscat_import/catalog/test_map_reduce.py
index 95a27347..34c72147 100644
--- a/tests/hipscat_import/catalog/test_map_reduce.py
+++ b/tests/hipscat_import/catalog/test_map_reduce.py
@@ -20,7 +20,7 @@ def pickle_file_reader(tmp_path, file_reader) -> str:
     """Utility method to pickle a file reader, and return path to pickle."""
-    pickled_reader_file = os.path.join(tmp_path, "reader.pickle")
+    pickled_reader_file = tmp_path / "reader.pickle"
     with open(pickled_reader_file, "wb") as pickle_file:
         pickle.dump(file_reader, pickle_file)
     return pickled_reader_file
@@ -86,14 +86,14 @@ def test_read_bad_fileformat(blank_data_file, capsys, tmp_path):

 def read_partial_histogram(tmp_path, mapping_key):
     """Helper to read in the former result of a map operation."""
-    histogram_file = os.path.join(tmp_path, "histograms", f"{mapping_key}.npz")
+    histogram_file = tmp_path / "histograms" / f"{mapping_key}.npz"
     hist = SparseHistogram.from_file(histogram_file)
     return hist.to_array()


 def test_read_single_fits(tmp_path, formats_fits):
     """Success case - fits file that exists being read as fits"""
-    os.makedirs(os.path.join(tmp_path, "histograms"))
+    (tmp_path / "histograms").mkdir(parents=True)
     mr.map_to_pixels(
         input_file=formats_fits,
         pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("fits")),
@@ -127,7 +127,7 @@ def test_map_headers_wrong(formats_headers_csv, tmp_path):

 def test_map_headers(tmp_path, formats_headers_csv):
     """Test loading the a file with non-default headers"""
-    os.makedirs(os.path.join(tmp_path, "histograms"))
+    (tmp_path / "histograms").mkdir(parents=True)
     mr.map_to_pixels(
         input_file=formats_headers_csv,
         pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")),
@@ -149,8 +149,8 @@ def test_map_headers(tmp_path, formats_headers_csv):


 def test_map_with_hipscat_index(tmp_path, formats_dir, small_sky_single_file):
-    os.makedirs(os.path.join(tmp_path, "histograms"))
-    input_file = os.path.join(formats_dir, "hipscat_index.csv")
+    (tmp_path / "histograms").mkdir(parents=True)
+    input_file = formats_dir / "hipscat_index.csv"
     mr.map_to_pixels(
         input_file=input_file,
         pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")),
@@ -183,8 +183,8 @@ def test_map_with_hipscat_index(tmp_path, formats_dir, small_sky_single_file):

 def test_map_with_schema(tmp_path, mixed_schema_csv_dir, mixed_schema_csv_parquet):
     """Test loading the a file when using a parquet schema file for dtypes"""
-    os.makedirs(os.path.join(tmp_path, "histograms"))
-    input_file = os.path.join(mixed_schema_csv_dir, "input_01.csv")
+    (tmp_path / "histograms").mkdir(parents=True)
+    input_file = mixed_schema_csv_dir / "input_01.csv"
     mr.map_to_pixels(
         input_file=input_file,
         pickled_reader_file=pickle_file_reader(
@@ -213,7 +213,7 @@ def test_map_with_schema(tmp_path, mixed_schema_csv_dir, mixed_schema_csv_parque

 def test_map_small_sky_order0(tmp_path, small_sky_single_file):
     """Test loading the small sky catalog and partitioning each object into the same large bucket"""
-    os.makedirs(os.path.join(tmp_path, "histograms"))
+    (tmp_path / "histograms").mkdir(parents=True)
     mr.map_to_pixels(
         input_file=small_sky_single_file,
         pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")),
@@ -239,7 +239,7 @@ def test_map_small_sky_part_order1(tmp_path, small_sky_file0):
     Test loading a small portion of the small sky catalog and
     partitioning objects into four smaller buckets
     """
-    os.makedirs(os.path.join(tmp_path, "histograms"))
+    (tmp_path / "histograms").mkdir(parents=True)
     mr.map_to_pixels(
         input_file=small_sky_file0,
         pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")),
@@ -279,7 +279,6 @@ def test_split_pixels_bad_format(blank_data_file, tmp_path, capsys):
     )
     captured = capsys.readouterr()
     assert "No such file or directory" in captured.out
-    os.makedirs(os.path.join(tmp_path, "splitting"))


 def test_split_pixels_headers(formats_headers_csv, assert_parquet_file_ids, tmp_path):
@@ -300,17 +299,17 @@ def test_split_pixels_headers(formats_headers_csv, assert_parquet_file_ids, tmp_
         alignment_file=alignment_file,
     )

-    file_name = os.path.join(tmp_path, "order_0", "dir_0", "pixel_11", "shard_0_0.parquet")
+    file_name = tmp_path / "order_0" / "dir_0" / "pixel_11" / "shard_0_0.parquet"
     expected_ids = [*range(700, 708)]
     assert_parquet_file_ids(file_name, "object_id", expected_ids)

-    file_name = os.path.join(tmp_path, "order_0", "dir_0", "pixel_1", "shard_0_0.parquet")
+    file_name = tmp_path / "order_0" / "dir_0" / "pixel_1" / "shard_0_0.parquet"
     assert not os.path.exists(file_name)


 def test_reduce_order0(parquet_shards_dir, assert_parquet_file_ids, tmp_path):
     """Test reducing into one large pixel"""
-    os.makedirs(os.path.join(tmp_path, "reducing"))
+    (tmp_path / "reducing").mkdir(parents=True)
     mr.reduce_pixel_shards(
         cache_shard_path=parquet_shards_dir,
         resume_path=tmp_path,
@@ -326,7 +325,7 @@ def test_reduce_order0(parquet_shards_dir, assert_parquet_file_ids, tmp_path):
         delete_input_files=False,
     )

-    output_file = os.path.join(tmp_path, "Norder=0", "Dir=0", "Npix=11.parquet")
+    output_file = tmp_path / "Norder=0" / "Dir=0" / "Npix=11.parquet"
     expected_ids = [*range(700, 831)]
     assert_parquet_file_ids(output_file, "id", expected_ids)

@@ -334,7 +333,7 @@ def test_reduce_order0(parquet_shards_dir, assert_parquet_file_ids, tmp_path):

 def test_reduce_hipscat_index(parquet_shards_dir, assert_parquet_file_ids, tmp_path):
     """Test reducing with or without a _hipscat_index field"""
-    os.makedirs(os.path.join(tmp_path, "reducing"))
+    (tmp_path / "reducing").mkdir(parents=True)
     mr.reduce_pixel_shards(
         cache_shard_path=parquet_shards_dir,
         resume_path=tmp_path,
@@ -349,7 +348,7 @@ def test_reduce_hipscat_index(parquet_shards_dir, assert_parquet_file_ids, tmp_p
         delete_input_files=False,
     )

-    output_file = os.path.join(tmp_path, "Norder=0", "Dir=0", "Npix=11.parquet")
+    output_file = tmp_path / "Norder=0" / "Dir=0" / "Npix=11.parquet"
     expected_ids = [*range(700, 831)]
     assert_parquet_file_ids(output_file, "id", expected_ids)

@@ -414,10 +413,10 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path):
     First, we take some time to set up these silly data points, then we test out
     reducing them into a single parquet file using a mix of reduction options.
     """
-    os.makedirs(os.path.join(tmp_path, "reducing"))
-    shard_dir = os.path.join(tmp_path, "reduce_shards", "order_0", "dir_0", "pixel_11")
-    os.makedirs(shard_dir)
-    output_file = os.path.join(tmp_path, "Norder=0", "Dir=0", "Npix=11.parquet")
+    (tmp_path / "reducing").mkdir(parents=True)
+    shard_dir = tmp_path / "reduce_shards" / "order_0" / "dir_0" / "pixel_11"
+    shard_dir.mkdir(parents=True)
+    output_file = tmp_path / "Norder=0" / "Dir=0" / "Npix=11.parquet"

     file1_string = """source_id,object_id,time,ra,dec
 1200,700,3000,282.5,-58.5
@@ -427,7 +426,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path):
 1404,702,3200,310.5,-27.5
 1505,703,4000,286.5,-69.5"""
     file1_data = pd.read_csv(StringIO(file1_string))
-    file1_data.to_parquet(os.path.join(shard_dir, "file_1_shard_1.parquet"))
+    file1_data.to_parquet(shard_dir / "file_1_shard_1.parquet")

     file2_string = """source_id,object_id,time,ra,dec
 1206,700,2000,282.5,-58.5
@@ -435,7 +434,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path):
 1308,701,2100,299.5,-48.5
 1309,701,2000,299.5,-48.5"""
     file2_data = pd.read_csv(StringIO(file2_string))
-    file2_data.to_parquet(os.path.join(shard_dir, "file_2_shard_1.parquet"))
+    file2_data.to_parquet(shard_dir / "file_2_shard_1.parquet")

     combined_data = pd.concat([file1_data, file2_data])
     combined_data["norder19_healpix"] = hp.ang2pix(
@@ -452,7 +451,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path):
     ## This will sort WITHIN an order 19 healpix pixel. In that ordering, the objects are
     ## (703, 700, 701, 702)
     mr.reduce_pixel_shards(
-        cache_shard_path=os.path.join(tmp_path, "reduce_shards"),
+        cache_shard_path=tmp_path / "reduce_shards",
         resume_path=tmp_path,
         reducing_key="0_11",
         destination_pixel_order=0,
@@ -489,7 +488,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path):
     ######################## Sort option 2: by object id and time
     ## sort order is effectively (norder19 healpix, object id, time)
     mr.reduce_pixel_shards(
-        cache_shard_path=os.path.join(tmp_path, "reduce_shards"),
+        cache_shard_path=tmp_path / "reduce_shards",
         resume_path=tmp_path,
         reducing_key="0_11",
         destination_pixel_order=0,
@@ -526,7 +525,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path):
     ## spatial properties for sorting, only numeric.
     ## sort order is effectively (object id, time)
     mr.reduce_pixel_shards(
-        cache_shard_path=os.path.join(tmp_path, "reduce_shards"),
+        cache_shard_path=tmp_path / "reduce_shards",
         resume_path=tmp_path,
         reducing_key="0_11",
         destination_pixel_order=0,
diff --git a/tests/hipscat_import/catalog/test_run_import.py b/tests/hipscat_import/catalog/test_run_import.py
index 587650c8..4821a0af 100644
--- a/tests/hipscat_import/catalog/test_run_import.py
+++ b/tests/hipscat_import/catalog/test_run_import.py
@@ -2,6 +2,7 @@

 import os
 import shutil
+from pathlib import Path

 import numpy as np
 import pandas as pd
@@ -41,13 +42,10 @@ def test_resume_dask_runner(
     """Test execution in the presence of some resume files."""
     ## First, copy over our intermediate files.
     ## This prevents overwriting source-controlled resume files.
-    intermediate_dir = os.path.join(tmp_path, "resume_catalog", "intermediate")
-    shutil.copytree(
-        os.path.join(resume_dir, "intermediate"),
-        intermediate_dir,
-    )
+    intermediate_dir = tmp_path / "resume_catalog" / "intermediate"
+    shutil.copytree(resume_dir / "intermediate", intermediate_dir)
     ## Now set up our resume files to match previous work.
-    resume_tmp = os.path.join(tmp_path, "tmp", "resume_catalog")
+    resume_tmp = tmp_path / "tmp" / "resume_catalog"
     plan = ResumePlan(tmp_path=resume_tmp, progress_bar=False)
     histogram = SparseHistogram.make_from_counts([11], [131], 0)
     empty = SparseHistogram.make_empty(0)
@@ -63,10 +61,7 @@ def test_resume_dask_runner(

     ResumePlan.touch_key_done_file(resume_tmp, ResumePlan.REDUCING_STAGE, "0_11")

-    shutil.copytree(
-        os.path.join(resume_dir, "Norder=0"),
-        os.path.join(tmp_path, "resume_catalog", "Norder=0"),
-    )
+    shutil.copytree(resume_dir / "Norder=0", tmp_path / "resume_catalog" / "Norder=0")

     args = ImportArguments(
         output_artifact_name="resume_catalog",
@@ -75,7 +70,7 @@ def test_resume_dask_runner(
         output_path=tmp_path,
         dask_tmp=tmp_path,
         tmp_dir=tmp_path,
-        resume_tmp=os.path.join(tmp_path, "tmp"),
+        resume_tmp=tmp_path / "tmp",
         highest_healpix_order=0,
         pixel_threshold=1000,
         progress_bar=False,
@@ -93,17 +88,14 @@ def test_resume_dask_runner(
     assert len(catalog.get_healpix_pixels()) == 1

     # Check that the catalog parquet file exists and contains correct object IDs
-    output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet")
+    output_file = Path(args.catalog_path) / "Norder=0" / "Dir=0" / "Npix=11.parquet"

     expected_ids = [*range(700, 831)]
     assert_parquet_file_ids(output_file, "id", expected_ids)

     ## Re-running the pipeline with fully done intermediate files
     ## should result in no changes to output files.
-    shutil.copytree(
-        os.path.join(resume_dir, "intermediate"),
-        resume_tmp,
-    )
+    shutil.copytree(resume_dir / "intermediate", resume_tmp)
     plan = args.resume_plan
     plan.touch_stage_done_file(ResumePlan.MAPPING_STAGE)
     plan.touch_stage_done_file(ResumePlan.SPLITTING_STAGE)
@@ -145,25 +137,17 @@ def test_resume_dask_runner_diff_pixel_order(
     with the current HEALPix order."""
     ## First, copy over our intermediate files.
     ## This prevents overwriting source-controlled resume files.
-    intermediate_dir = os.path.join(tmp_path, "resume_catalog", "intermediate")
-    shutil.copytree(
-        os.path.join(resume_dir, "intermediate"),
-        intermediate_dir,
-    )
+    intermediate_dir = tmp_path / "resume_catalog" / "intermediate"
+    shutil.copytree(resume_dir / "intermediate", intermediate_dir)
     ## Now set up our resume files to match previous work.
-    resume_tmp = os.path.join(tmp_path, "tmp", "resume_catalog")
+    resume_tmp = tmp_path / "tmp" / "resume_catalog"
     ResumePlan(tmp_path=resume_tmp, progress_bar=False)
-    SparseHistogram.make_from_counts([11], [131], 0).to_file(
-        os.path.join(resume_tmp, "mapping_histogram.npz")
-    )
+    SparseHistogram.make_from_counts([11], [131], 0).to_file(resume_tmp / "mapping_histogram.npz")
     for file_index in range(0, 5):
         ResumePlan.touch_key_done_file(resume_tmp, ResumePlan.SPLITTING_STAGE, f"split_{file_index}")

-    shutil.copytree(
-        os.path.join(resume_dir, "Norder=0"),
-        os.path.join(tmp_path, "resume_catalog", "Norder=0"),
-    )
+    shutil.copytree(resume_dir / "Norder=0", tmp_path / "resume_catalog" / "Norder=0")

     with pytest.raises(ValueError, match="incompatible with the highest healpix order"):
         args = ImportArguments(
@@ -173,7 +157,7 @@ def test_resume_dask_runner_diff_pixel_order(
             output_path=tmp_path,
             dask_tmp=tmp_path,
             tmp_dir=tmp_path,
-            resume_tmp=os.path.join(tmp_path, "tmp"),
+            resume_tmp=tmp_path / "tmp",
             constant_healpix_order=1,
             pixel_threshold=1000,
             progress_bar=False,
@@ -188,7 +172,7 @@ def test_resume_dask_runner_diff_pixel_order(
         output_path=tmp_path,
         dask_tmp=tmp_path,
         tmp_dir=tmp_path,
-        resume_tmp=os.path.join(tmp_path, "tmp"),
+        resume_tmp=tmp_path / "tmp",
         constant_healpix_order=1,
         pixel_threshold=1000,
         progress_bar=False,
@@ -220,7 +204,7 @@ def test_resume_dask_runner_histograms_diff_size(
     tmp_path,
 ):
     """Tests that the pipeline errors if the partial histograms have different sizes."""
-    resume_tmp = os.path.join(tmp_path, "tmp", "resume_catalog")
+    resume_tmp = tmp_path / "tmp" / "resume_catalog"
     ResumePlan(tmp_path=resume_tmp, progress_bar=False)

     # We'll create mock partial histograms of size 0 and 2
@@ -246,7 +230,7 @@ def test_resume_dask_runner_histograms_diff_size(
         output_path=tmp_path,
         dask_tmp=tmp_path,
         tmp_dir=tmp_path,
-        resume_tmp=os.path.join(tmp_path, "tmp"),
+        resume_tmp=tmp_path / "tmp",
         constant_healpix_order=1,
         pixel_threshold=1000,
         progress_bar=False,
diff --git a/tests/hipscat_import/catalog/test_run_round_trip.py b/tests/hipscat_import/catalog/test_run_round_trip.py
index 0d3a1e1c..d9862067 100644
--- a/tests/hipscat_import/catalog/test_run_round_trip.py
+++ b/tests/hipscat_import/catalog/test_run_round_trip.py
@@ -79,8 +79,8 @@ def test_import_mixed_schema_csv(
             Path(mixed_schema_csv_dir) / "input_01.csv",
             Path(mixed_schema_csv_dir) / "input_02.csv",
         ],
-        output_path=Path(tmp_path),
-        dask_tmp=Path(tmp_path),
+        output_path=tmp_path,
+        dask_tmp=tmp_path,
         highest_healpix_order=1,
         file_reader=get_file_reader(
             "csv",
@@ -260,8 +260,8 @@ def test_import_keep_intermediate_files(
     """Test that ALL intermediate files are still around on-disk after successful
     import, when setting the appropriate flags.
     """
-    temp = os.path.join(tmp_path, "intermediate_files")
-    os.makedirs(temp)
+    temp = tmp_path / "intermediate_files"
+    temp.mkdir(parents=True)
     args = ImportArguments(
         output_artifact_name="small_sky_object_catalog",
         input_path=small_sky_parts_dir,
@@ -282,7 +282,7 @@ def test_import_keep_intermediate_files(
     assert catalog.catalog_path == args.catalog_path

     ## Check that stage-level done files are still around.
-    base_intermediate_dir = os.path.join(temp, "small_sky_object_catalog", "intermediate")
+    base_intermediate_dir = temp / "small_sky_object_catalog" / "intermediate"
     expected_contents = [
         "alignment.pickle",
         "histograms",  # directory containing sub-histograms
@@ -298,21 +298,21 @@ def test_import_keep_intermediate_files(
     ]
     assert_directory_contains(base_intermediate_dir, expected_contents)

-    checking_dir = os.path.join(base_intermediate_dir, "histograms")
+    checking_dir = base_intermediate_dir / "histograms"
     assert_directory_contains(
         checking_dir, ["map_0.npz", "map_1.npz", "map_2.npz", "map_3.npz", "map_4.npz", "map_5.npz"]
     )

-    checking_dir = os.path.join(base_intermediate_dir, "splitting")
+    checking_dir = base_intermediate_dir / "splitting"
     assert_directory_contains(
         checking_dir,
         ["split_0_done", "split_1_done", "split_2_done", "split_3_done", "split_4_done", "split_5_done"],
     )
-    checking_dir = os.path.join(base_intermediate_dir, "reducing")
+    checking_dir = base_intermediate_dir / "reducing"
     assert_directory_contains(checking_dir, ["0_11_done"])

     # Check that all of the intermediate parquet shards are still around.
-    checking_dir = os.path.join(base_intermediate_dir, "order_0", "dir_0", "pixel_11")
+    checking_dir = base_intermediate_dir / "order_0" / "dir_0" / "pixel_11"
     assert_directory_contains(
         checking_dir,
         [
@@ -424,7 +424,7 @@ def test_import_hipscat_index(
     ## First, let's just check the assumptions we have about our input file:
     ## - should have _hipscat_index as the indexed column
     ## - should NOT have any columns like "ra" or "dec"
-    input_file = os.path.join(formats_dir, "hipscat_index.parquet")
+    input_file = formats_dir / "hipscat_index.parquet"

     expected_ids = [*range(700, 831)]
     assert_parquet_file_ids(input_file, "id", expected_ids)
@@ -475,7 +475,7 @@ def test_import_hipscat_index_no_pandas(
     tmp_path,
 ):
     """Test basic execution, using a previously-computed _hipscat_index column for spatial partitioning."""
-    input_file = os.path.join(formats_dir, "hipscat_index.csv")
+    input_file = formats_dir / "hipscat_index.csv"
     args = ImportArguments(
         output_artifact_name="using_hipscat_index",
         input_file_list=[input_file],
@@ -517,8 +517,8 @@ def test_import_gaia_minimum(
     tmp_path,
 ):
     """Test end-to-end import, using a representative chunk of gaia data."""
-    input_file = os.path.join(formats_dir, "gaia_minimum.csv")
-    schema_file = os.path.join(formats_dir, "gaia_minimum_schema.parquet")
+    input_file = formats_dir / "gaia_minimum.csv"
+    schema_file = formats_dir / "gaia_minimum_schema.parquet"

     args = ImportArguments(
         output_artifact_name="gaia_minimum",
@@ -569,7 +569,7 @@ def test_gaia_ecsv(
     tmp_path,
     assert_parquet_file_ids,
 ):
-    input_file = os.path.join(formats_dir, "gaia_epoch.ecsv")
+    input_file = formats_dir / "gaia_epoch.ecsv"

     args = ImportArguments(
         output_artifact_name="gaia_e_astropy",
@@ -661,7 +661,7 @@ def test_gaia_ecsv(
     # In-memory schema uses list naming convention, but pyarrow converts to
     # the parquet-compliant list convention when writing to disk.
     # Round trip the schema to get a schema with compliant nested naming convention.
-    schema_path = os.path.join(tmp_path, "temp_schema.parquet")
+    schema_path = tmp_path / "temp_schema.parquet"
     pq.write_table(expected_parquet_schema.empty_table(), where=schema_path)
     expected_parquet_schema = pq.read_metadata(schema_path).schema.to_arrow_schema()
diff --git a/tests/hipscat_import/catalog/test_sparse_histogram.py b/tests/hipscat_import/catalog/test_sparse_histogram.py
index 52e22164..57ce78f2 100644
--- a/tests/hipscat_import/catalog/test_sparse_histogram.py
+++ b/tests/hipscat_import/catalog/test_sparse_histogram.py
@@ -1,7 +1,5 @@
 """Test sparse histogram behavior."""

-import os
-
 import numpy as np
 import numpy.testing as npt
 import pytest
@@ -12,7 +10,7 @@ def test_read_write_round_trip(tmp_path):
     """Test that we can read what we write into a histogram file."""
-    file_name = os.path.join(tmp_path, "round_trip.npz")
+    file_name = tmp_path / "round_trip.npz"
     histogram = SparseHistogram.make_from_counts([11], [131], 0)
     histogram.to_file(file_name)
diff --git a/tests/hipscat_import/conftest.py b/tests/hipscat_import/conftest.py
index 303144bc..7ef2fc20 100644
--- a/tests/hipscat_import/conftest.py
+++ b/tests/hipscat_import/conftest.py
@@ -2,6 +2,7 @@

 import os
 import re
+from pathlib import Path

 import healpy as hp
 import numpy as np
@@ -52,119 +53,112 @@ def test_long_running():

 @pytest.fixture
 def test_data_dir():
-    return os.path.join(TEST_DIR, "data")
+    return Path(TEST_DIR) / "data"


 @pytest.fixture
 def small_sky_dir(test_data_dir):
-    return os.path.join(test_data_dir, "small_sky")
+    return test_data_dir / "small_sky"


 @pytest.fixture
 def small_sky_single_file(test_data_dir):
-    return os.path.join(test_data_dir, "small_sky", "catalog.csv")
+    return test_data_dir / "small_sky" / "catalog.csv"


 @pytest.fixture
 def small_sky_object_catalog(test_data_dir):
-    return os.path.join(test_data_dir, "small_sky_object_catalog")
+    return test_data_dir / "small_sky_object_catalog"


 @pytest.fixture
 def small_sky_source_dir(test_data_dir):
-    return os.path.join(test_data_dir, "small_sky_source")
+    return test_data_dir / "small_sky_source"


 @pytest.fixture
 def small_sky_source_catalog(test_data_dir):
-    return os.path.join(test_data_dir, "small_sky_source_catalog")
+    return test_data_dir / "small_sky_source_catalog"


 @pytest.fixture
 def blank_data_dir(test_data_dir):
-    return os.path.join(test_data_dir, "blank")
+    return test_data_dir / "blank"


 @pytest.fixture
 def blank_data_file(test_data_dir):
-    return os.path.join(test_data_dir, "blank", "blank.csv")
+    return test_data_dir / "blank" / "blank.csv"


 @pytest.fixture
 def empty_data_dir(test_data_dir):
-    return os.path.join(test_data_dir, "empty")
+    return test_data_dir / "empty"


 @pytest.fixture
 def formats_dir(test_data_dir):
-    return os.path.join(test_data_dir, "test_formats")
+    return test_data_dir / "test_formats"


 @pytest.fixture
 def formats_headers_csv(test_data_dir):
-    return os.path.join(test_data_dir, "test_formats", "headers.csv")
+    return test_data_dir / "test_formats" / "headers.csv"


 @pytest.fixture
 def formats_pipe_csv(test_data_dir):
-    return os.path.join(test_data_dir, "test_formats", "pipe_delimited.csv")
+    return test_data_dir / "test_formats" / "pipe_delimited.csv"


 @pytest.fixture
 def formats_fits(test_data_dir):
-    return os.path.join(test_data_dir, "test_formats", "small_sky.fits")
+    return test_data_dir / "test_formats" / "small_sky.fits"


 @pytest.fixture
 def formats_pandasindex(test_data_dir):
-    return os.path.join(test_data_dir, "test_formats", "pandasindex.parquet")
+    return test_data_dir / "test_formats" / "pandasindex.parquet"


 @pytest.fixture
 def small_sky_parts_dir(test_data_dir):
-    return os.path.join(test_data_dir, "small_sky_parts")
+    return test_data_dir / "small_sky_parts"


 @pytest.fixture
 def small_sky_file0(test_data_dir):
-    return os.path.join(test_data_dir, "small_sky_parts", "catalog_00_of_05.csv")
+    return test_data_dir / "small_sky_parts" / "catalog_00_of_05.csv"


 @pytest.fixture
 def parquet_shards_dir(test_data_dir):
-    return os.path.join(test_data_dir, "parquet_shards")
+    return test_data_dir / "parquet_shards"


 @pytest.fixture
 def soap_intermediate_dir(test_data_dir):
-    return os.path.join(test_data_dir, "soap_intermediate")
+    return test_data_dir / "soap_intermediate"


 @pytest.fixture
 def parquet_shards_shard_44_0(test_data_dir):
-    return os.path.join(
-        test_data_dir,
-        "parquet_shards",
-        "order_1",
-        "dir_0",
-        "pixel_44",
-        "shard_3_0.parquet",
-    )
+    return test_data_dir / "parquet_shards" / "order_1" / "dir_0" / "pixel_44" / "shard_3_0.parquet"


 @pytest.fixture
 def mixed_schema_csv_dir(test_data_dir):
-    return os.path.join(test_data_dir, "mixed_schema")
+    return test_data_dir / "mixed_schema"


 @pytest.fixture
 def mixed_schema_csv_parquet(test_data_dir):
-    return os.path.join(test_data_dir, "mixed_schema", "schema.parquet")
+    return test_data_dir / "mixed_schema" / "schema.parquet"


 @pytest.fixture
 def resume_dir(test_data_dir):
-    return os.path.join(test_data_dir, "resume")
+    return test_data_dir / "resume"


 @pytest.fixture
diff --git a/tests/hipscat_import/index/test_index_map_reduce.py b/tests/hipscat_import/index/test_index_map_reduce.py
index 0dbba81d..025d0f02 100644
--- a/tests/hipscat_import/index/test_index_map_reduce.py
+++ b/tests/hipscat_import/index/test_index_map_reduce.py
@@ -1,7 +1,5 @@
 """Tests of map reduce operations"""

-import os
-
 import numpy as np
 import numpy.testing as npt
 import pandas as pd
@@ -28,7 +26,7 @@ def test_create_index(
     )
     mr.create_index(args, dask_client)

-    output_file = os.path.join(tmp_path, "small_sky_object_index", "index", "part.0.parquet")
+    output_file = tmp_path / "small_sky_object_index" / "index" / "part.0.parquet"

     expected_ids = [*range(700, 831)]
     assert_parquet_file_index(output_file, expected_ids)
@@ -55,7 +53,7 @@ def test_create_index_no_hipscat_index(small_sky_object_catalog, tmp_path, dask_
     )
     mr.create_index(args, dask_client)

-    output_file = os.path.join(tmp_path, "small_sky_object_index", "index", "part.0.parquet")
+    output_file = tmp_path / "small_sky_object_index" / "index" / "part.0.parquet"

     data_frame = pd.read_parquet(output_file, engine="pyarrow")
     npt.assert_array_equal(data_frame.columns, ["Norder", "Dir", "Npix"])
@@ -76,7 +74,7 @@ def test_create_index_no_order_pixel(small_sky_object_catalog, tmp_path, dask_cl
     )
     mr.create_index(args, dask_client)

-    output_file = os.path.join(tmp_path, "small_sky_object_index", "index", "part.0.parquet")
+    output_file = tmp_path / "small_sky_object_index" / "index" / "part.0.parquet"

     data_frame = pd.read_parquet(output_file, engine="pyarrow")
     npt.assert_array_equal(data_frame.columns, ["_hipscat_index"])
@@ -95,7 +93,7 @@ def test_create_index_source(small_sky_source_catalog, assert_parquet_file_index
     )
     mr.create_index(args, dask_client)

-    output_file = os.path.join(tmp_path, "small_sky_source_index", "index", "part.0.parquet")
+    output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet"

     expected_ids = [*range(70_000, 87_161)]
     assert_parquet_file_index(output_file, expected_ids)
@@ -134,7 +132,7 @@ def test_create_index_with_divisions(
     )
     mr.create_index(args, dask_client)

-    output_file = os.path.join(tmp_path, "small_sky_source_index", "index", "part.0.parquet")
+    output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet"

     expected_ids = [*range(70_000, 87_161)]
     assert_parquet_file_index(output_file, expected_ids)
@@ -167,7 +165,7 @@ def test_create_index_source_by_object(
     )
     mr.create_index(args, dask_client)

-    output_file = os.path.join(tmp_path, "small_sky_source_index", "index", "part.0.parquet")
+    output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet"

     expected_ids = np.repeat([*range(700, 831)], 131)
     assert_parquet_file_index(output_file, expected_ids)
@@ -199,7 +197,7 @@ def test_create_index_extra_columns(
     )
     mr.create_index(args, dask_client)

-    output_file = os.path.join(tmp_path, "small_sky_source_index", "index", "part.0.parquet")
+    output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet"

     expected_ids = np.repeat([*range(700, 831)], 131)
     assert_parquet_file_index(output_file, expected_ids)
diff --git a/tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py b/tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py
index c6332c21..72b93dbc 100644
--- a/tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py
+++ b/tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py
@@ -46,7 +46,7 @@ def test_to_pixel_shard_equator(tmp_path, basic_data_shard_df):
         dec_column="weird_dec",
     )

-    path = os.path.join(tmp_path, "order_1", "dir_0", "pixel_21", "Norder=1", "Dir=0", "Npix=0.parquet")
+    path = tmp_path / "order_1" / "dir_0" / "pixel_21" / "Norder=1" / "Dir=0" / "Npix=0.parquet"

     assert os.path.exists(path)

@@ -63,7 +63,7 @@ def test_to_pixel_shard_polar(tmp_path, polar_data_shard_df):
         dec_column="weird_dec",
    )

-    path = os.path.join(tmp_path, "order_2", "dir_0", "pixel_15", "Norder=2", "Dir=0", "Npix=0.parquet")
+    path = tmp_path / "order_2" / "dir_0" / "pixel_15" / "Norder=2" / "Dir=0" / "Npix=0.parquet"

     assert os.path.exists(path)

@@ -92,12 +92,12 @@ def test_map_pixel_shards_error(tmp_path, capsys):


 def test_reduce_margin_shards(tmp_path):
-    intermediate_dir = os.path.join(tmp_path, "intermediate")
+    intermediate_dir = tmp_path / "intermediate"
     partition_dir = get_pixel_cache_directory(intermediate_dir, HealpixPixel(1, 21))
     shard_dir = paths.pixel_directory(partition_dir, 1, 21)
     os.makedirs(shard_dir)

-    os.makedirs(os.path.join(intermediate_dir, "reducing"))
+    os.makedirs(intermediate_dir / "reducing")

     first_shard_path = paths.pixel_catalog_file(partition_dir, 1, 0)
     second_shard_path = paths.pixel_catalog_file(partition_dir, 1, 1)
@@ -128,7 +128,7 @@ def test_reduce_margin_shards(tmp_path):
     )

     # Create a schema parquet file.
-    schema_path = os.path.join(tmp_path, "metadata.parquet")
+    schema_path = tmp_path / "metadata.parquet"
     schema_df = test_df.drop(columns=["margin_Norder", "margin_Dir", "margin_Npix"])
     schema_df.to_parquet(schema_path)

@@ -176,14 +176,14 @@ def test_reduce_margin_shards(tmp_path):

 def test_reduce_margin_shards_error(tmp_path, basic_data_shard_df, capsys):
     """Test error behavior on reduce stage. e.g. by not creating the original catalog metadata."""
-    intermediate_dir = os.path.join(tmp_path, "intermediate")
+    intermediate_dir = tmp_path / "intermediate"
     partition_dir = get_pixel_cache_directory(intermediate_dir, HealpixPixel(1, 21))
     shard_dir = paths.pixel_directory(partition_dir, 1, 21)
     os.makedirs(shard_dir)
-    os.makedirs(os.path.join(intermediate_dir, "reducing"))
+    os.makedirs(intermediate_dir / "reducing")

     # Don't write anything at the metadata path!
-    schema_path = os.path.join(tmp_path, "metadata.parquet")
+    schema_path = tmp_path / "metadata.parquet"

     basic_data_shard_df.to_parquet(paths.pixel_catalog_file(partition_dir, 1, 0))
     basic_data_shard_df.to_parquet(paths.pixel_catalog_file(partition_dir, 1, 1))
diff --git a/tests/hipscat_import/soap/test_soap_map_reduce.py b/tests/hipscat_import/soap/test_soap_map_reduce.py
index b95745e7..ab88f176 100644
--- a/tests/hipscat_import/soap/test_soap_map_reduce.py
+++ b/tests/hipscat_import/soap/test_soap_map_reduce.py
@@ -2,6 +2,7 @@

 import os
 import shutil
+from pathlib import Path

 import numpy.testing as npt
 import pandas as pd
@@ -19,9 +20,7 @@ def test_count_joins(small_sky_soap_args, tmp_path, small_sky_soap_maps):
         count_joins(small_sky_soap_args, source, objects)

         result = pd.read_csv(
-            os.path.join(
-                tmp_path, "small_sky_association", "intermediate", f"{source.order}_{source.pixel}.csv"
-            )
+            tmp_path / "small_sky_association" / "intermediate" / f"{source.order}_{source.pixel}.csv"
         )
         assert len(result) == 1
         assert result["num_rows"].sum() > 0
@@ -32,16 +31,20 @@ def test_count_joins_with_leaf(small_sky_soap_args, small_sky_soap_maps):
     small_sky_soap_args.write_leaf_files = True
     small_sky_soap_args.source_id_column = "source_id"

-    intermediate_dir = small_sky_soap_args.tmp_path
+    intermediate_dir = Path(small_sky_soap_args.tmp_path)
     for source, objects in small_sky_soap_maps.items():
         count_joins(small_sky_soap_args, source, objects)

-        result = pd.read_csv(os.path.join(intermediate_dir, f"{source.order}_{source.pixel}.csv"))
+        result = pd.read_csv(intermediate_dir / f"{source.order}_{source.pixel}.csv")
         assert len(result) == 1
         assert result["num_rows"].sum() > 0

-        parquet_file_name = os.path.join(
-            intermediate_dir, "order_0", "dir_0", "pixel_11", f"source_{source.order}_{source.pixel}.parquet"
+        parquet_file_name = (
+            intermediate_dir
+            / "order_0"
+            / "dir_0"
+            / "pixel_11"
+            / f"source_{source.order}_{source.pixel}.parquet"
         )
         assert os.path.exists(parquet_file_name), f"file not found [{parquet_file_name}]"
@@ -69,9 +72,7 @@ def test_count_joins_missing(small_sky_source_catalog, tmp_path):
     source = HealpixPixel(2, 176)
     count_joins(args, source, [HealpixPixel(2, 177), HealpixPixel(2, 178)])

-    result_csv = os.path.join(
-        tmp_path, "small_sky_association", "intermediate", f"{source.order}_{source.pixel}.csv"
-    )
+    result_csv = tmp_path / "small_sky_association" / "intermediate" / f"{source.order}_{source.pixel}.csv"
     result = pd.read_csv(result_csv)

     assert len(result) == 3
@@ -91,11 +92,11 @@ def test_count_joins_missing(small_sky_source_catalog, tmp_path):

 def test_combine_results(tmp_path):
     """Test combining many CSVs into a single one"""
-    input_path = os.path.join(tmp_path, "input")
-    os.makedirs(input_path, exist_ok=True)
+    input_path = tmp_path / "input"
+    input_path.mkdir(parents=True)

-    output_path = os.path.join(tmp_path, "output")
-    os.makedirs(output_path, exist_ok=True)
+    output_path = tmp_path / "output"
+    output_path.mkdir(parents=True)

     join_info = pd.DataFrame(
         data=[
@@ -113,16 +114,16 @@ def test_combine_results(tmp_path):
             "num_rows",
         ],
     )
-    partitions_csv_file = os.path.join(input_path, "0_11.csv")
+    partitions_csv_file = input_path / "0_11.csv"
     join_info.to_csv(partitions_csv_file, index=False)

     total_num_rows = combine_partial_results(input_path, output_path, None)
     assert total_num_rows == 131

-    result = pd.read_csv(os.path.join(output_path, "partition_join_info.csv"))
+    result = pd.read_csv(output_path / "partition_join_info.csv")
     assert len(result) == 2

-    result = pd.read_csv(os.path.join(output_path, "unmatched_sources.csv"))
+    result = pd.read_csv(output_path / "unmatched_sources.csv")
     assert len(result) == 1
diff --git a/tests/hipscat_import/test_pipeline_resume_plan.py b/tests/hipscat_import/test_pipeline_resume_plan.py
index b694f33c..c5e0b77c 100644
--- a/tests/hipscat_import/test_pipeline_resume_plan.py
+++ b/tests/hipscat_import/test_pipeline_resume_plan.py
@@ -1,6 +1,5 @@
 """Test resume file operations"""

-import os
 from pathlib import Path

 import numpy.testing as npt
@@ -13,7 +12,7 @@ def test_done_key(tmp_path):
     """Verify expected behavior of marking stage progress via done files."""
     plan = PipelineResumePlan(tmp_path=tmp_path, progress_bar=False)
     stage = "testing"
-    os.makedirs(os.path.join(tmp_path, stage))
+    (tmp_path / stage).mkdir(parents=True)

     keys = plan.read_done_keys(stage)
     assert len(keys) == 0
diff --git a/tests/hipscat_import/test_runtime_arguments.py b/tests/hipscat_import/test_runtime_arguments.py
index 69aded26..cea801cc 100644
--- a/tests/hipscat_import/test_runtime_arguments.py
+++ b/tests/hipscat_import/test_runtime_arguments.py
@@ -1,7 +1,5 @@
 """Tests of argument validation"""

-import os
-
 import pytest

 from hipscat_import.runtime_arguments import RuntimeArguments
@@ -70,12 +68,12 @@ def test_good_paths(tmp_path):

 def test_tmp_path_creation(tmp_path):
     """Check that we create a new temp path for this catalog."""
-    output_path = os.path.join(tmp_path, "unique_output_directory")
-    temp_path = os.path.join(tmp_path, "unique_tmp_directory")
-    dask_tmp_path = os.path.join(tmp_path, "unique_dask_directory")
-    os.makedirs(output_path, exist_ok=True)
-    os.makedirs(temp_path, exist_ok=True)
-    os.makedirs(dask_tmp_path, exist_ok=True)
+    output_path = tmp_path / "unique_output_directory"
+    temp_path = tmp_path / "unique_tmp_directory"
+    dask_tmp_path = tmp_path / "unique_dask_directory"
+    output_path.mkdir(parents=True)
+    temp_path.mkdir(parents=True)
+    dask_tmp_path.mkdir(parents=True)

     ## If no tmp paths are given, use the output directory
     args = RuntimeArguments(
diff --git a/tests/hipscat_import/verification/test_verification_arguments.py b/tests/hipscat_import/verification/test_verification_arguments.py
index 8ebd6c81..303a63f7 100644
--- a/tests/hipscat_import/verification/test_verification_arguments.py
+++ b/tests/hipscat_import/verification/test_verification_arguments.py
@@ -62,7 +62,7 @@ def test_catalog_object(tmp_path, small_sky_object_catalog):
         output_path=tmp_path,
         output_artifact_name="small_sky_object_verification_report",
     )
-    assert args.input_catalog_path == small_sky_object_catalog
+    assert args.input_catalog_path == str(small_sky_object_catalog)
    assert str(args.output_path) == tmp_path_str
    assert str(args.tmp_path).startswith(tmp_path_str)
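
Illustrative sketch of the pathlib idioms this change standardizes on. It assumes pytest's tmp_path fixture (which already yields a pathlib.Path); the test name and variables below are hypothetical and not part of the patch.

from pathlib import Path


def test_pathlib_equivalents(tmp_path: Path):
    # os.path.join(a, "b") becomes a / "b"
    histogram_dir = tmp_path / "histograms"

    # os.makedirs(path) becomes Path.mkdir(parents=True)
    histogram_dir.mkdir(parents=True)
    assert histogram_dir.is_dir()

    # Substring or equality checks against plain strings need an explicit str() cast,
    # mirroring `assert str(blank_data_file) in args.input_paths[0]` above.
    blank_file = tmp_path / "blank" / "blank.csv"
    input_paths = [str(blank_file)]
    assert str(blank_file) in input_paths[0]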