Skip to content

Commit

Permalink
Merge branch 'main' into dependabot/github_actions/actions/add-to-project-1.0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
delucchi-cmu committed Sep 16, 2024
2 parents 1812e39 + ee85fe8 commit 9f66b6e
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 36 deletions.
5 changes: 3 additions & 2 deletions src/hipscat_cloudtests/file_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import numpy.testing as npt
import pandas as pd
import pyarrow as pa
from hipscat.io.file_io.file_io import load_text_file
from hipscat.io.file_io.file_pointer import does_file_or_directory_exist

Expand Down Expand Up @@ -40,7 +41,7 @@ def assert_text_file_matches(expected_lines, file_name, storage_options: dict =


def assert_parquet_file_ids(
file_name, id_column, expected_ids, resort_ids=True, storage_options: dict = None
file_name, id_column, schema: pa.Schema, expected_ids, resort_ids=True, storage_options: dict = None
):
"""
Convenience method to read a parquet file and compare the object IDs to
Expand All @@ -54,7 +55,7 @@ def assert_parquet_file_ids(
is the same between the read IDs and expected_ids
storage_options (dict): dictionary of filesystem storage options
"""
data_frame = pd.read_parquet(file_name, engine="pyarrow", storage_options=storage_options)
data_frame = pd.read_parquet(file_name, engine="pyarrow", schema=schema, storage_options=storage_options)
assert id_column in data_frame.columns
ids = data_frame[id_column].tolist()
if resort_ids:
Expand Down
11 changes: 8 additions & 3 deletions tests/hipscat/io/file_io/test_file_io_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
get_file_pointer_from_path,
load_csv_to_pandas,
load_json_file,
load_parquet_to_pandas,
load_text_file,
read_parquet_file_to_pandas,
write_dataframe_to_csv,
write_string_to_file,
)
Expand Down Expand Up @@ -37,11 +37,16 @@ def test_load_json(small_sky_dir_local, small_sky_dir_cloud, storage_options):
assert json_dict_cloud == json_dict_local


def test_load_parquet_to_pandas(small_sky_dir_local, small_sky_dir_cloud, storage_options):
def test_read_parquet_to_pandas(
small_sky_catalog_cloud, small_sky_dir_local, small_sky_dir_cloud, storage_options
):
pixel_data_path = pixel_catalog_file(small_sky_dir_local, 0, 11)
pixel_data_path_cloud = pixel_catalog_file(small_sky_dir_cloud, 0, 11)
parquet_df = pd.read_parquet(pixel_data_path)
loaded_df = load_parquet_to_pandas(pixel_data_path_cloud, storage_options=storage_options)
catalog_schema = small_sky_catalog_cloud.hc_structure.schema
loaded_df = read_parquet_file_to_pandas(
pixel_data_path_cloud, schema=catalog_schema, storage_options=storage_options
)
pd.testing.assert_frame_equal(parquet_df, loaded_df)


Expand Down
5 changes: 4 additions & 1 deletion tests/hipscat/io/test_write_metadata_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
import hipscat.pixel_math as hist
import numpy.testing as npt
import pyarrow as pa
import pyarrow.parquet as pq
import pytest
from hipscat.catalog.catalog_info import CatalogInfo
from hipscat.io import file_io
from hipscat.io.file_io.file_pointer import get_fs
from hipscat.io.parquet_metadata import write_parquet_metadata

from hipscat_cloudtests import assert_text_file_matches
Expand Down Expand Up @@ -175,7 +177,8 @@ def check_parquet_schema(file_name, expected_schema, expected_num_row_groups=1,

assert schema.equals(expected_schema, check_metadata=False)

parquet_file = file_io.read_parquet_file(file_pointer=file_name, storage_options=storage_options)
file_system, file_pointer = get_fs(file_name, storage_options=storage_options)
parquet_file = pq.ParquetFile(file_pointer, filesystem=file_system)
assert parquet_file.metadata.num_row_groups == expected_num_row_groups

for row_index in range(0, parquet_file.metadata.num_row_groups):
Expand Down
47 changes: 22 additions & 25 deletions tests/hipscat_import/test_create_margin.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import hipscat_import.margin_cache.margin_cache as mc
import pytest
from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset
from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments

Expand All @@ -17,18 +16,17 @@ def test_margin_cache_gen(
- local origin catalog.
- writing to CLOUD.
"""
with pytest.warns(UserWarning, match="smaller resolution"):
args = MarginCacheArguments(
margin_threshold=7200.0,
input_catalog_path=small_sky_order1_dir_local,
output_path=tmp_cloud_path,
output_artifact_name="small_sky_order1_margin",
output_storage_options=storage_options,
dask_tmp=tmp_path,
tmp_dir=tmp_path,
margin_order=8,
progress_bar=False,
)
args = MarginCacheArguments(
input_catalog_path=small_sky_order1_dir_local,
output_path=tmp_cloud_path,
output_artifact_name="small_sky_order1_margin",
output_storage_options=storage_options,
dask_tmp=tmp_path,
tmp_dir=tmp_path,
margin_order=8,
fine_filtering=False,
progress_bar=False,
)

assert args.catalog.catalog_info.ra_column == "ra"

Expand All @@ -51,18 +49,17 @@ def test_margin_cache_gen_read_from_cloud(
- CLOUD origin catalog
- writing to local tmp
"""
with pytest.warns(UserWarning, match="smaller resolution"):
args = MarginCacheArguments(
margin_threshold=7200.0,
input_catalog_path=small_sky_order1_dir_cloud,
input_storage_options=storage_options,
output_path=tmp_path,
output_artifact_name="small_sky_order1_margin",
dask_tmp=tmp_path,
tmp_dir=tmp_path,
margin_order=8,
progress_bar=False,
)
args = MarginCacheArguments(
input_catalog_path=small_sky_order1_dir_cloud,
input_storage_options=storage_options,
output_path=tmp_path,
output_artifact_name="small_sky_order1_margin",
dask_tmp=tmp_path,
tmp_dir=tmp_path,
margin_order=8,
fine_filtering=False,
progress_bar=False,
)

assert args.catalog.catalog_info.ra_column == "ra"

Expand Down
4 changes: 2 additions & 2 deletions tests/hipscat_import/test_run_catalog_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_catalog_import_write_to_cloud(
output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet")

expected_ids = [*range(700, 831)]
assert_parquet_file_ids(output_file, "id", expected_ids, storage_options=storage_options)
assert_parquet_file_ids(output_file, "id", catalog.schema, expected_ids, storage_options=storage_options)


@pytest.mark.dask
Expand Down Expand Up @@ -85,7 +85,7 @@ def test_catalog_import_read_from_cloud(
output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet")

expected_ids = [*range(700, 831)]
assert_parquet_file_ids(output_file, "id", expected_ids)
assert_parquet_file_ids(output_file, "id", catalog.schema, expected_ids)


def test_read_csv_cloud(storage_options, small_sky_parts_dir_cloud):
Expand Down
6 changes: 3 additions & 3 deletions tests/hipscat_import/test_run_soap.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
@pytest.mark.dask
def test_object_to_self_write_to_cloud(
dask_client,
tmp_path,
tmp_path_factory,
tmp_cloud_path,
small_sky_dir_local,
small_sky_order1_dir_local,
Expand All @@ -35,7 +35,7 @@ def test_object_to_self_write_to_cloud(
output_path=tmp_cloud_path,
output_storage_options=storage_options,
progress_bar=False,
tmp_dir=tmp_path,
tmp_dir=tmp_path_factory.mktemp("small_sky_order_to_order1"),
)
runner.run(small_sky_soap_args, dask_client)

Expand Down Expand Up @@ -66,7 +66,7 @@ def test_object_to_self_write_to_cloud(
output_path=tmp_cloud_path,
output_storage_options=storage_options,
progress_bar=False,
tmp_dir=tmp_path,
tmp_dir=tmp_path_factory.mktemp("small_sky_to_order1_soft"),
)
runner.run(small_sky_soap_args, dask_client)

Expand Down

0 comments on commit 9f66b6e

Please sign in to comment.