From 8253efc625f1ba3957ca93d53c9e2fd1d519f0cf Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Wed, 24 Jul 2024 13:51:11 -0400 Subject: [PATCH 1/4] Fix smoke tests --- requirements.txt | 6 +++--- src/hipscat_cloudtests/file_checks.py | 5 +++-- tests/hipscat/io/file_io/test_file_io_cloud.py | 9 +++++++-- tests/hipscat_import/test_run_catalog_import.py | 4 ++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index b3c81f2..df98186 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -git+https://github.com/astronomy-commons/hipscat.git@main -git+https://github.com/astronomy-commons/hipscat-import.git@main -git+https://github.com/astronomy-commons/lsdb.git@main \ No newline at end of file +git+https://github.com/astronomy-commons/hipscat.git@add-arrow-schema-to-catalog +git+https://github.com/astronomy-commons/hipscat-import.git@read-parquet-with-schema +git+https://github.com/astronomy-commons/lsdb.git@add-arrow-schema \ No newline at end of file diff --git a/src/hipscat_cloudtests/file_checks.py b/src/hipscat_cloudtests/file_checks.py index 7e366e9..7e3b210 100644 --- a/src/hipscat_cloudtests/file_checks.py +++ b/src/hipscat_cloudtests/file_checks.py @@ -4,6 +4,7 @@ import numpy.testing as npt import pandas as pd +import pyarrow as pa from hipscat.io.file_io.file_io import load_text_file from hipscat.io.file_io.file_pointer import does_file_or_directory_exist @@ -40,7 +41,7 @@ def assert_text_file_matches(expected_lines, file_name, storage_options: dict = def assert_parquet_file_ids( - file_name, id_column, expected_ids, resort_ids=True, storage_options: dict = None + file_name, id_column, schema: pa.Schema, expected_ids, resort_ids=True, storage_options: dict = None ): """ Convenience method to read a parquet file and compare the object IDs to @@ -54,7 +55,7 @@ def assert_parquet_file_ids( is the same between the read IDs and expected_ids storage_options (dict): dictionary of filesystem storage options """ - data_frame = pd.read_parquet(file_name, engine="pyarrow", storage_options=storage_options) + data_frame = pd.read_parquet(file_name, engine="pyarrow", schema=schema, storage_options=storage_options) assert id_column in data_frame.columns ids = data_frame[id_column].tolist() if resort_ids: diff --git a/tests/hipscat/io/file_io/test_file_io_cloud.py b/tests/hipscat/io/file_io/test_file_io_cloud.py index cbe0162..55c40e4 100644 --- a/tests/hipscat/io/file_io/test_file_io_cloud.py +++ b/tests/hipscat/io/file_io/test_file_io_cloud.py @@ -37,11 +37,16 @@ def test_load_json(small_sky_dir_local, small_sky_dir_cloud, storage_options): assert json_dict_cloud == json_dict_local -def test_load_parquet_to_pandas(small_sky_dir_local, small_sky_dir_cloud, storage_options): +def test_load_parquet_to_pandas( + small_sky_catalog_cloud, small_sky_dir_local, small_sky_dir_cloud, storage_options +): pixel_data_path = pixel_catalog_file(small_sky_dir_local, 0, 11) pixel_data_path_cloud = pixel_catalog_file(small_sky_dir_cloud, 0, 11) parquet_df = pd.read_parquet(pixel_data_path) - loaded_df = load_parquet_to_pandas(pixel_data_path_cloud, storage_options=storage_options) + catalog_schema = small_sky_catalog_cloud.hc_structure.schema + loaded_df = load_parquet_to_pandas( + pixel_data_path_cloud, schema=catalog_schema, storage_options=storage_options + ) pd.testing.assert_frame_equal(parquet_df, loaded_df) diff --git a/tests/hipscat_import/test_run_catalog_import.py b/tests/hipscat_import/test_run_catalog_import.py index 8d3e368..2c33d03 100644 --- a/tests/hipscat_import/test_run_catalog_import.py +++ b/tests/hipscat_import/test_run_catalog_import.py @@ -46,7 +46,7 @@ def test_catalog_import_write_to_cloud( output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") expected_ids = [*range(700, 831)] - assert_parquet_file_ids(output_file, "id", expected_ids, storage_options=storage_options) + assert_parquet_file_ids(output_file, "id", catalog.schema, expected_ids, storage_options=storage_options) @pytest.mark.dask @@ -85,7 +85,7 @@ def test_catalog_import_read_from_cloud( output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") expected_ids = [*range(700, 831)] - assert_parquet_file_ids(output_file, "id", expected_ids) + assert_parquet_file_ids(output_file, "id", catalog.schema, expected_ids) def test_read_csv_cloud(storage_options, small_sky_parts_dir_cloud): From f67c412a988ca2675e0d0d2d6ecb258c9b8d16e8 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Wed, 31 Jul 2024 10:46:41 -0400 Subject: [PATCH 2/4] Revert changes to requirements file --- requirements.txt | 6 +++--- tests/hipscat/io/test_write_metadata_cloud.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index df98186..b3c81f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -git+https://github.com/astronomy-commons/hipscat.git@add-arrow-schema-to-catalog -git+https://github.com/astronomy-commons/hipscat-import.git@read-parquet-with-schema -git+https://github.com/astronomy-commons/lsdb.git@add-arrow-schema \ No newline at end of file +git+https://github.com/astronomy-commons/hipscat.git@main +git+https://github.com/astronomy-commons/hipscat-import.git@main +git+https://github.com/astronomy-commons/lsdb.git@main \ No newline at end of file diff --git a/tests/hipscat/io/test_write_metadata_cloud.py b/tests/hipscat/io/test_write_metadata_cloud.py index e516624..66b0b88 100644 --- a/tests/hipscat/io/test_write_metadata_cloud.py +++ b/tests/hipscat/io/test_write_metadata_cloud.py @@ -6,9 +6,11 @@ import hipscat.pixel_math as hist import numpy.testing as npt import pyarrow as pa +import pyarrow.parquet as pq import pytest from hipscat.catalog.catalog_info import CatalogInfo from hipscat.io import file_io +from hipscat.io.file_io.file_pointer import get_fs from hipscat.io.parquet_metadata import write_parquet_metadata from hipscat_cloudtests import assert_text_file_matches @@ -175,7 +177,8 @@ def check_parquet_schema(file_name, expected_schema, expected_num_row_groups=1, assert schema.equals(expected_schema, check_metadata=False) - parquet_file = file_io.read_parquet_file(file_pointer=file_name, storage_options=storage_options) + file_system, file_pointer = get_fs(file_name, storage_options=storage_options) + parquet_file = pq.ParquetFile(file_pointer, filesystem=file_system) assert parquet_file.metadata.num_row_groups == expected_num_row_groups for row_index in range(0, parquet_file.metadata.num_row_groups): From 2bd031bc1b883d060040cde0e4dec6de7da89006 Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Wed, 14 Aug 2024 13:44:58 -0400 Subject: [PATCH 3/4] Address smoke test failures. (#35) * Address smoke test failures. * Remove unused import. --- .../hipscat/io/file_io/test_file_io_cloud.py | 6 +-- tests/hipscat_import/test_create_margin.py | 47 +++++++++---------- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/tests/hipscat/io/file_io/test_file_io_cloud.py b/tests/hipscat/io/file_io/test_file_io_cloud.py index 55c40e4..91e7f58 100644 --- a/tests/hipscat/io/file_io/test_file_io_cloud.py +++ b/tests/hipscat/io/file_io/test_file_io_cloud.py @@ -6,8 +6,8 @@ get_file_pointer_from_path, load_csv_to_pandas, load_json_file, - load_parquet_to_pandas, load_text_file, + read_parquet_file_to_pandas, write_dataframe_to_csv, write_string_to_file, ) @@ -37,14 +37,14 @@ def test_load_json(small_sky_dir_local, small_sky_dir_cloud, storage_options): assert json_dict_cloud == json_dict_local -def test_load_parquet_to_pandas( +def test_read_parquet_to_pandas( small_sky_catalog_cloud, small_sky_dir_local, small_sky_dir_cloud, storage_options ): pixel_data_path = pixel_catalog_file(small_sky_dir_local, 0, 11) pixel_data_path_cloud = pixel_catalog_file(small_sky_dir_cloud, 0, 11) parquet_df = pd.read_parquet(pixel_data_path) catalog_schema = small_sky_catalog_cloud.hc_structure.schema - loaded_df = load_parquet_to_pandas( + loaded_df = read_parquet_file_to_pandas( pixel_data_path_cloud, schema=catalog_schema, storage_options=storage_options ) pd.testing.assert_frame_equal(parquet_df, loaded_df) diff --git a/tests/hipscat_import/test_create_margin.py b/tests/hipscat_import/test_create_margin.py index 57ba5be..b264fff 100644 --- a/tests/hipscat_import/test_create_margin.py +++ b/tests/hipscat_import/test_create_margin.py @@ -1,5 +1,4 @@ import hipscat_import.margin_cache.margin_cache as mc -import pytest from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments @@ -17,18 +16,17 @@ def test_margin_cache_gen( - local origin catalog. - writing to CLOUD. """ - with pytest.warns(UserWarning, match="smaller resolution"): - args = MarginCacheArguments( - margin_threshold=7200.0, - input_catalog_path=small_sky_order1_dir_local, - output_path=tmp_cloud_path, - output_artifact_name="small_sky_order1_margin", - output_storage_options=storage_options, - dask_tmp=tmp_path, - tmp_dir=tmp_path, - margin_order=8, - progress_bar=False, - ) + args = MarginCacheArguments( + input_catalog_path=small_sky_order1_dir_local, + output_path=tmp_cloud_path, + output_artifact_name="small_sky_order1_margin", + output_storage_options=storage_options, + dask_tmp=tmp_path, + tmp_dir=tmp_path, + margin_order=8, + fine_filtering=False, + progress_bar=False, + ) assert args.catalog.catalog_info.ra_column == "ra" @@ -51,18 +49,17 @@ def test_margin_cache_gen_read_from_cloud( - CLOUD origin catalog - writing to local tmp """ - with pytest.warns(UserWarning, match="smaller resolution"): - args = MarginCacheArguments( - margin_threshold=7200.0, - input_catalog_path=small_sky_order1_dir_cloud, - input_storage_options=storage_options, - output_path=tmp_path, - output_artifact_name="small_sky_order1_margin", - dask_tmp=tmp_path, - tmp_dir=tmp_path, - margin_order=8, - progress_bar=False, - ) + args = MarginCacheArguments( + input_catalog_path=small_sky_order1_dir_cloud, + input_storage_options=storage_options, + output_path=tmp_path, + output_artifact_name="small_sky_order1_margin", + dask_tmp=tmp_path, + tmp_dir=tmp_path, + margin_order=8, + fine_filtering=False, + progress_bar=False, + ) assert args.catalog.catalog_info.ra_column == "ra" From 35598b24994e5954ff2dd69dd6379943601aa492 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Tue, 27 Aug 2024 11:08:02 -0400 Subject: [PATCH 4/4] Create temp directory for each soap args --- tests/hipscat_import/test_run_soap.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/hipscat_import/test_run_soap.py b/tests/hipscat_import/test_run_soap.py index 19c0033..f43c7df 100644 --- a/tests/hipscat_import/test_run_soap.py +++ b/tests/hipscat_import/test_run_soap.py @@ -10,7 +10,7 @@ @pytest.mark.dask def test_object_to_self_write_to_cloud( dask_client, - tmp_path, + tmp_path_factory, tmp_cloud_path, small_sky_dir_local, small_sky_order1_dir_local, @@ -35,7 +35,7 @@ def test_object_to_self_write_to_cloud( output_path=tmp_cloud_path, output_storage_options=storage_options, progress_bar=False, - tmp_dir=tmp_path, + tmp_dir=tmp_path_factory.mktemp("small_sky_order_to_order1"), ) runner.run(small_sky_soap_args, dask_client) @@ -66,7 +66,7 @@ def test_object_to_self_write_to_cloud( output_path=tmp_cloud_path, output_storage_options=storage_options, progress_bar=False, - tmp_dir=tmp_path, + tmp_dir=tmp_path_factory.mktemp("small_sky_to_order1_soft"), ) runner.run(small_sky_soap_args, dask_client)