diff --git a/src/hipscat_cloudtests/file_checks.py b/src/hipscat_cloudtests/file_checks.py
index 7e366e9..7e3b210 100644
--- a/src/hipscat_cloudtests/file_checks.py
+++ b/src/hipscat_cloudtests/file_checks.py
@@ -4,6 +4,7 @@
 
 import numpy.testing as npt
 import pandas as pd
+import pyarrow as pa
 from hipscat.io.file_io.file_io import load_text_file
 from hipscat.io.file_io.file_pointer import does_file_or_directory_exist
 
@@ -40,7 +41,7 @@ def assert_text_file_matches(expected_lines, file_name, storage_options: dict =
 
 
 def assert_parquet_file_ids(
-    file_name, id_column, expected_ids, resort_ids=True, storage_options: dict = None
+    file_name, id_column, schema: pa.Schema, expected_ids, resort_ids=True, storage_options: dict = None
 ):
     """
     Convenience method to read a parquet file and compare the object IDs to
@@ -54,7 +55,7 @@
             is the same between the read IDs and expected_ids
         storage_options (dict): dictionary of filesystem storage options
     """
-    data_frame = pd.read_parquet(file_name, engine="pyarrow", storage_options=storage_options)
+    data_frame = pd.read_parquet(file_name, engine="pyarrow", schema=schema, storage_options=storage_options)
     assert id_column in data_frame.columns
     ids = data_frame[id_column].tolist()
     if resort_ids:
diff --git a/tests/hipscat/io/file_io/test_file_io_cloud.py b/tests/hipscat/io/file_io/test_file_io_cloud.py
index cbe0162..91e7f58 100644
--- a/tests/hipscat/io/file_io/test_file_io_cloud.py
+++ b/tests/hipscat/io/file_io/test_file_io_cloud.py
@@ -6,8 +6,8 @@
     get_file_pointer_from_path,
     load_csv_to_pandas,
     load_json_file,
-    load_parquet_to_pandas,
     load_text_file,
+    read_parquet_file_to_pandas,
     write_dataframe_to_csv,
     write_string_to_file,
 )
@@ -37,11 +37,16 @@ def test_load_json(small_sky_dir_local, small_sky_dir_cloud, storage_options):
     assert json_dict_cloud == json_dict_local
 
 
-def test_load_parquet_to_pandas(small_sky_dir_local, small_sky_dir_cloud, storage_options):
+def test_read_parquet_to_pandas(
+    small_sky_catalog_cloud, small_sky_dir_local, small_sky_dir_cloud, storage_options
+):
     pixel_data_path = pixel_catalog_file(small_sky_dir_local, 0, 11)
     pixel_data_path_cloud = pixel_catalog_file(small_sky_dir_cloud, 0, 11)
     parquet_df = pd.read_parquet(pixel_data_path)
 
-    loaded_df = load_parquet_to_pandas(pixel_data_path_cloud, storage_options=storage_options)
+    catalog_schema = small_sky_catalog_cloud.hc_structure.schema
+    loaded_df = read_parquet_file_to_pandas(
+        pixel_data_path_cloud, schema=catalog_schema, storage_options=storage_options
+    )
 
     pd.testing.assert_frame_equal(parquet_df, loaded_df)
diff --git a/tests/hipscat/io/test_write_metadata_cloud.py b/tests/hipscat/io/test_write_metadata_cloud.py
index e516624..66b0b88 100644
--- a/tests/hipscat/io/test_write_metadata_cloud.py
+++ b/tests/hipscat/io/test_write_metadata_cloud.py
@@ -6,9 +6,11 @@
 import hipscat.pixel_math as hist
 import numpy.testing as npt
 import pyarrow as pa
+import pyarrow.parquet as pq
 import pytest
 from hipscat.catalog.catalog_info import CatalogInfo
 from hipscat.io import file_io
+from hipscat.io.file_io.file_pointer import get_fs
 from hipscat.io.parquet_metadata import write_parquet_metadata
 
 from hipscat_cloudtests import assert_text_file_matches
@@ -175,7 +177,8 @@ def check_parquet_schema(file_name, expected_schema, expected_num_row_groups=1,
 
     assert schema.equals(expected_schema, check_metadata=False)
 
-    parquet_file = file_io.read_parquet_file(file_pointer=file_name, storage_options=storage_options)
+    file_system, file_pointer = get_fs(file_name, storage_options=storage_options)
+    parquet_file = pq.ParquetFile(file_pointer, filesystem=file_system)
     assert parquet_file.metadata.num_row_groups == expected_num_row_groups
 
     for row_index in range(0, parquet_file.metadata.num_row_groups):
diff --git a/tests/hipscat_import/test_create_margin.py b/tests/hipscat_import/test_create_margin.py
index 57ba5be..b264fff 100644
--- a/tests/hipscat_import/test_create_margin.py
+++ b/tests/hipscat_import/test_create_margin.py
@@ -1,5 +1,4 @@
 import hipscat_import.margin_cache.margin_cache as mc
-import pytest
 from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset
 from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments
 
@@ -17,18 +16,17 @@ def test_margin_cache_gen(
     - local origin catalog.
     - writing to CLOUD.
     """
-    with pytest.warns(UserWarning, match="smaller resolution"):
-        args = MarginCacheArguments(
-            margin_threshold=7200.0,
-            input_catalog_path=small_sky_order1_dir_local,
-            output_path=tmp_cloud_path,
-            output_artifact_name="small_sky_order1_margin",
-            output_storage_options=storage_options,
-            dask_tmp=tmp_path,
-            tmp_dir=tmp_path,
-            margin_order=8,
-            progress_bar=False,
-        )
+    args = MarginCacheArguments(
+        input_catalog_path=small_sky_order1_dir_local,
+        output_path=tmp_cloud_path,
+        output_artifact_name="small_sky_order1_margin",
+        output_storage_options=storage_options,
+        dask_tmp=tmp_path,
+        tmp_dir=tmp_path,
+        margin_order=8,
+        fine_filtering=False,
+        progress_bar=False,
+    )
 
     assert args.catalog.catalog_info.ra_column == "ra"
 
@@ -51,18 +49,17 @@ def test_margin_cache_gen_read_from_cloud(
     - CLOUD origin catalog
     - writing to local tmp
     """
-    with pytest.warns(UserWarning, match="smaller resolution"):
-        args = MarginCacheArguments(
-            margin_threshold=7200.0,
-            input_catalog_path=small_sky_order1_dir_cloud,
-            input_storage_options=storage_options,
-            output_path=tmp_path,
-            output_artifact_name="small_sky_order1_margin",
-            dask_tmp=tmp_path,
-            tmp_dir=tmp_path,
-            margin_order=8,
-            progress_bar=False,
-        )
+    args = MarginCacheArguments(
+        input_catalog_path=small_sky_order1_dir_cloud,
+        input_storage_options=storage_options,
+        output_path=tmp_path,
+        output_artifact_name="small_sky_order1_margin",
+        dask_tmp=tmp_path,
+        tmp_dir=tmp_path,
+        margin_order=8,
+        fine_filtering=False,
+        progress_bar=False,
+    )
 
     assert args.catalog.catalog_info.ra_column == "ra"
 
diff --git a/tests/hipscat_import/test_run_catalog_import.py b/tests/hipscat_import/test_run_catalog_import.py
index 8d3e368..2c33d03 100644
--- a/tests/hipscat_import/test_run_catalog_import.py
+++ b/tests/hipscat_import/test_run_catalog_import.py
@@ -46,7 +46,7 @@ def test_catalog_import_write_to_cloud(
     output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet")
 
     expected_ids = [*range(700, 831)]
-    assert_parquet_file_ids(output_file, "id", expected_ids, storage_options=storage_options)
+    assert_parquet_file_ids(output_file, "id", catalog.schema, expected_ids, storage_options=storage_options)
 
 
 @pytest.mark.dask
@@ -85,7 +85,7 @@ def test_catalog_import_read_from_cloud(
     output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet")
 
     expected_ids = [*range(700, 831)]
-    assert_parquet_file_ids(output_file, "id", expected_ids)
+    assert_parquet_file_ids(output_file, "id", catalog.schema, expected_ids)
 
 
 def test_read_csv_cloud(storage_options, small_sky_parts_dir_cloud):
diff --git a/tests/hipscat_import/test_run_soap.py b/tests/hipscat_import/test_run_soap.py
index 19c0033..f43c7df 100644
--- a/tests/hipscat_import/test_run_soap.py
+++ b/tests/hipscat_import/test_run_soap.py
@@ -10,7 +10,7 @@
 @pytest.mark.dask
 def test_object_to_self_write_to_cloud(
     dask_client,
-    tmp_path,
+    tmp_path_factory,
     tmp_cloud_path,
     small_sky_dir_local,
     small_sky_order1_dir_local,
@@ -35,7 +35,7 @@ def test_object_to_self_write_to_cloud(
         output_path=tmp_cloud_path,
         output_storage_options=storage_options,
         progress_bar=False,
-        tmp_dir=tmp_path,
+        tmp_dir=tmp_path_factory.mktemp("small_sky_order_to_order1"),
     )
 
     runner.run(small_sky_soap_args, dask_client)
@@ -66,7 +66,7 @@ def test_object_to_self_write_to_cloud(
         output_path=tmp_cloud_path,
         output_storage_options=storage_options,
         progress_bar=False,
-        tmp_dir=tmp_path,
+        tmp_dir=tmp_path_factory.mktemp("small_sky_to_order1_soft"),
     )
 
     runner.run(small_sky_soap_args, dask_client)
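
Usage note (outside the diff): a minimal sketch of how the schema-aware assert_parquet_file_ids
helper is intended to be called after this change. The schema, file path, and storage options
below are illustrative stand-ins; in the tests above the schema comes from the catalog itself
(catalog.schema or small_sky_catalog_cloud.hc_structure.schema).

    import pyarrow as pa

    from hipscat_cloudtests.file_checks import assert_parquet_file_ids

    # Illustrative schema: pinning column types keeps the pandas dtypes stable
    # even when a partition is empty or was written with narrower types.
    schema = pa.schema([("id", pa.int64()), ("ra", pa.float64()), ("dec", pa.float64())])

    assert_parquet_file_ids(
        "Norder=0/Dir=0/Npix=11.parquet",  # illustrative local path
        "id",
        schema,
        [*range(700, 831)],
        storage_options=None,  # pass cloud credentials here when reading remote files
    )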
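
Usage note (outside the diff): a rough sketch of the get_fs + pq.ParquetFile pattern that
replaces file_io.read_parquet_file in check_parquet_schema. The _metadata path and the empty
storage options are hypothetical; only the call pattern mirrors the change above.

    import pyarrow.parquet as pq

    from hipscat.io.file_io.file_pointer import get_fs

    # Resolve an fsspec filesystem (local, S3, ABFS, ...) and a pointer usable with it.
    file_system, file_pointer = get_fs("/tmp/catalog/_metadata", storage_options=None)

    # Passing the filesystem explicitly lets ParquetFile expose row-group metadata
    # for local and cloud files through the same code path.
    parquet_file = pq.ParquetFile(file_pointer, filesystem=file_system)
    print(parquet_file.metadata.num_row_groups)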