From 54802148b68ae91aca8f4988e8d2881a4131c174 Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Wed, 3 Apr 2024 16:16:34 -0400 Subject: [PATCH] Create a single tmp dir per test session (#20) * Add test for lsdb.to_hipscat * Only create one temp dir per execution. * Address lint introduced in merge * Remove comment. --- README.md | 4 +- src/hipscat_cloudtests/__init__.py | 1 - .../temp_cloud_directory.py | 20 +- tests/conftest.py | 64 +++-- .../dataset/test_base_catalog_info_cloud.py | 8 +- tests/hipscat/catalog/test_catalog_cloud.py | 69 +++--- tests/hipscat/catalog/test_index_catalog.py | 4 +- tests/hipscat/catalog/test_margin_catalog.py | 7 +- .../hipscat/inspection/test_almanac_cloud.py | 8 +- .../test_visualize_catalog_cloud.py | 4 +- .../hipscat/io/file_io/test_file_io_cloud.py | 64 +++-- .../io/file_io/test_file_pointers_cloud.py | 38 +-- tests/hipscat/io/test_write_metadata_cloud.py | 226 +++++++++--------- tests/hipscat_import/conftest.py | 4 +- tests/hipscat_import/test_create_margin.py | 48 ++-- .../hipscat_import/test_run_catalog_import.py | 77 +++--- tests/hipscat_import/test_run_index.py | 97 ++++---- tests/hipscat_import/test_run_soap.py | 123 +++++----- tests/lsdb/catalog/test_cone_search.py | 2 - tests/lsdb/catalog/test_crossmatch.py | 12 +- tests/lsdb/catalog/test_index_search.py | 6 +- tests/lsdb/conftest.py | 22 +- tests/lsdb/io/test_to_hipscat.py | 35 ++- .../lsdb/loaders/hipscat/test_read_hipscat.py | 8 +- 24 files changed, 458 insertions(+), 493 deletions(-) diff --git a/README.md b/README.md index 1e781b8..26ed7a7 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ There are various steps to have tests run on another cloud bucket provider (like ... #...line 38... @pytest.fixture -def example_cloud_path(cloud): +def cloud_path(cloud): if cloud == "abfs": return "abfs://hipscat/pytests/hipscat" @@ -73,7 +73,7 @@ def example_cloud_path(cloud): raise NotImplementedError("Cloud format not implemented for hipscat tests!") @pytest.fixture -def example_cloud_storage_options(cloud): +def storage_options(cloud): if cloud == "abfs": storage_options = { "account_key" : os.environ.get("ABFS_LINCCDATA_ACCOUNT_KEY"), diff --git a/src/hipscat_cloudtests/__init__.py b/src/hipscat_cloudtests/__init__.py index cf375ef..70e29f0 100644 --- a/src/hipscat_cloudtests/__init__.py +++ b/src/hipscat_cloudtests/__init__.py @@ -3,4 +3,3 @@ __all__ = ["greetings", "meaning"] from .file_checks import assert_parquet_file_ids, assert_text_file_matches -from .temp_cloud_directory import TempCloudDirectory diff --git a/src/hipscat_cloudtests/temp_cloud_directory.py b/src/hipscat_cloudtests/temp_cloud_directory.py index e052908..63e04e6 100644 --- a/src/hipscat_cloudtests/temp_cloud_directory.py +++ b/src/hipscat_cloudtests/temp_cloud_directory.py @@ -32,6 +32,15 @@ def __init__(self, prefix_path, method_name="", storage_options: dict = None): def __enter__(self): """Create a new temporary path + Returns: + string path that's been created. it will take the form of + / + """ + return self.open() + + def open(self): + """Create a new temporary path + Returns: string path that's been created. it will take the form of / @@ -45,15 +54,22 @@ def __exit__(self, exc_type, exc_val, exc_tb): This will try to delete 3 times, with exponential backoff. We give up after the third attempt.""" + self.close() + + def close(self, num_retries=4): + """Recursively delete the created resources. + + This will try to delete `num_retries` times, with exponential backoff. + We give up after the last attempt.""" sleep_time = 2 if self.temp_path: - for attempt_number in range(3): + for attempt_number in range(1, num_retries + 1): ## Try try: file_io.remove_directory(self.temp_path, storage_options=self.storage_options) return except RuntimeError: - if attempt_number == 2: + if attempt_number == num_retries: print(f"Failed to remove directory {self.temp_path}. Giving up.") return print(f"Failed to remove directory {self.temp_path}. Trying again.") diff --git a/tests/conftest.py b/tests/conftest.py index dbb8842..97d310a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,9 @@ import os import pytest +import shortuuid + +from hipscat_cloudtests.temp_cloud_directory import TempCloudDirectory ALMANAC_DIR_NAME = "almanac" SMALL_SKY_DIR_NAME = "small_sky" @@ -15,24 +18,21 @@ def pytest_addoption(parser): parser.addoption("--cloud", action="store", default="abfs") -def pytest_generate_tests(metafunc): - # This is called for every test. Only get/set command line arguments - # if the argument is specified in the list of test "fixturenames". - option_value = metafunc.config.option.cloud - if "cloud" in metafunc.fixturenames and option_value is not None: - metafunc.parametrize("cloud", [option_value]) +@pytest.fixture(scope="session", name="cloud") +def cloud(request): + return request.config.getoption("--cloud") -@pytest.fixture -def example_cloud_path(cloud): +@pytest.fixture(scope="session", name="cloud_path") +def cloud_path(cloud): if cloud == "abfs": return "abfs://hipscat/pytests/" raise NotImplementedError("Cloud format not implemented for tests!") -@pytest.fixture -def example_cloud_storage_options(cloud): +@pytest.fixture(scope="session", name="storage_options") +def storage_options(cloud): if cloud == "abfs": storage_options = { "account_key": os.environ.get("ABFS_LINCCDATA_ACCOUNT_KEY"), @@ -65,35 +65,51 @@ def small_sky_parts_dir_local(local_data_dir): @pytest.fixture -def tmp_dir_cloud(example_cloud_path): - return os.path.join(example_cloud_path, "tmp") +def test_data_dir_cloud(cloud_path): + return os.path.join(cloud_path, "data") @pytest.fixture -def test_data_dir_cloud(example_cloud_path): - return os.path.join(example_cloud_path, "data") +def almanac_dir_cloud(cloud_path): + return os.path.join(cloud_path, "data", ALMANAC_DIR_NAME) @pytest.fixture -def almanac_dir_cloud(example_cloud_path): - return os.path.join(example_cloud_path, "data", ALMANAC_DIR_NAME) +def small_sky_dir_cloud(cloud_path): + return os.path.join(cloud_path, "data", SMALL_SKY_DIR_NAME) @pytest.fixture -def small_sky_dir_cloud(example_cloud_path): - return os.path.join(example_cloud_path, "data", SMALL_SKY_DIR_NAME) +def small_sky_order1_dir_cloud(cloud_path): + return os.path.join(cloud_path, "data", SMALL_SKY_ORDER1_DIR_NAME) @pytest.fixture -def small_sky_order1_dir_cloud(example_cloud_path): - return os.path.join(example_cloud_path, "data", SMALL_SKY_ORDER1_DIR_NAME) +def small_sky_index_dir_cloud(cloud_path): + return os.path.join(cloud_path, "data", "small_sky_object_index") @pytest.fixture -def small_sky_index_dir_cloud(example_cloud_path): - return os.path.join(example_cloud_path, "data", "small_sky_object_index") +def small_sky_margin_dir_cloud(cloud_path): + return os.path.join(cloud_path, "data", "small_sky_order1_margin") + + +@pytest.fixture(scope="session", name="tmp_dir_cloud") +def tmp_dir_cloud(cloud_path, storage_options): + """Create a single client for use by all unit test cases.""" + tmp = TempCloudDirectory( + os.path.join(cloud_path, "tmp"), + method_name="full_test", + storage_options=storage_options, + ) + yield tmp.open() + tmp.close() @pytest.fixture -def small_sky_margin_dir_cloud(example_cloud_path): - return os.path.join(example_cloud_path, "data", "small_sky_order1_margin") +def tmp_cloud_path(request, tmp_dir_cloud): + name = request.node.name + my_uuid = shortuuid.uuid() + # Strip out the "test_" at the beginning of each method name, make it a little + # shorter, and add a disambuating UUID. + return f"{tmp_dir_cloud}/{name[5:25]}_{my_uuid}" diff --git a/tests/hipscat/catalog/dataset/test_base_catalog_info_cloud.py b/tests/hipscat/catalog/dataset/test_base_catalog_info_cloud.py index eb2e028..43f485f 100644 --- a/tests/hipscat/catalog/dataset/test_base_catalog_info_cloud.py +++ b/tests/hipscat/catalog/dataset/test_base_catalog_info_cloud.py @@ -4,13 +4,11 @@ from hipscat.io import file_io -def test_read_from_file(base_catalog_info_file_cloud, example_cloud_storage_options): +def test_read_from_file(base_catalog_info_file_cloud, storage_options): base_cat_info_fp = file_io.get_file_pointer_from_path(base_catalog_info_file_cloud) - catalog_info = BaseCatalogInfo.read_from_metadata_file( - base_cat_info_fp, storage_options=example_cloud_storage_options - ) + catalog_info = BaseCatalogInfo.read_from_metadata_file(base_cat_info_fp, storage_options=storage_options) catalog_info_json = file_io.file_io.load_json_file( - base_catalog_info_file_cloud, storage_options=example_cloud_storage_options + base_catalog_info_file_cloud, storage_options=storage_options ) catalog_info_dict = dataclasses.asdict(catalog_info) diff --git a/tests/hipscat/catalog/test_catalog_cloud.py b/tests/hipscat/catalog/test_catalog_cloud.py index 809200f..8f45ca1 100644 --- a/tests/hipscat/catalog/test_catalog_cloud.py +++ b/tests/hipscat/catalog/test_catalog_cloud.py @@ -9,57 +9,52 @@ from hipscat.loaders import read_from_hipscat from hipscat.pixel_math import HealpixPixel -from hipscat_cloudtests import TempCloudDirectory - -def test_load_catalog_small_sky(small_sky_dir_cloud, example_cloud_storage_options): +def test_load_catalog_small_sky(small_sky_dir_cloud, storage_options): """Instantiate a catalog with 1 pixel""" - cat = Catalog.read_from_hipscat(small_sky_dir_cloud, storage_options=example_cloud_storage_options) + cat = Catalog.read_from_hipscat(small_sky_dir_cloud, storage_options=storage_options) assert cat.catalog_name == "small_sky" assert len(cat.get_healpix_pixels()) == 1 - assert is_valid_catalog(small_sky_dir_cloud, storage_options=example_cloud_storage_options) + assert is_valid_catalog(small_sky_dir_cloud, storage_options=storage_options) -def test_load_catalog_small_sky_with_loader(small_sky_dir_cloud, example_cloud_storage_options): +def test_load_catalog_small_sky_with_loader(small_sky_dir_cloud, storage_options): """Instantiate a catalog with 1 pixel""" - cat = read_from_hipscat(small_sky_dir_cloud, storage_options=example_cloud_storage_options) + cat = read_from_hipscat(small_sky_dir_cloud, storage_options=storage_options) assert isinstance(cat, Catalog) assert cat.catalog_name == "small_sky" assert len(cat.get_healpix_pixels()) == 1 - assert is_valid_catalog(small_sky_dir_cloud, storage_options=example_cloud_storage_options) + assert is_valid_catalog(small_sky_dir_cloud, storage_options=storage_options) -def test_empty_directory(tmp_dir_cloud, example_cloud_storage_options): +def test_empty_directory(tmp_cloud_path, storage_options): """Test loading empty or incomplete data""" - with TempCloudDirectory(tmp_dir_cloud, "empty", example_cloud_storage_options) as temp_path: - catalog_path = temp_path - - ## Path exists but there's nothing there (which means it doesn't exist!) - with pytest.raises(FileNotFoundError, match="No directory"): - Catalog.read_from_hipscat(catalog_path, storage_options=example_cloud_storage_options) - - ## catalog_info file exists - getting closer - file_name = os.path.join(catalog_path, "catalog_info.json") - file_io.write_string_to_file( - file_name, - string='{"catalog_name":"empty", "catalog_type":"source"}', - storage_options=example_cloud_storage_options, - ) - - with pytest.raises(FileNotFoundError, match="metadata"): - Catalog.read_from_hipscat(catalog_path, storage_options=example_cloud_storage_options) - - ## partition_info file exists - enough to create a catalog - ## Now we create the needed _metadata and everything is right. - part_info = PartitionInfo.from_healpix([HealpixPixel(0, 11)]) - part_info.write_to_metadata_files( - catalog_path=catalog_path, storage_options=example_cloud_storage_options - ) - - with pytest.warns(UserWarning, match="slow"): - catalog = Catalog.read_from_hipscat(catalog_path, storage_options=example_cloud_storage_options) - assert catalog.catalog_name == "empty" + catalog_path = tmp_cloud_path + + ## Path exists but there's nothing there (which means it doesn't exist!) + with pytest.raises(FileNotFoundError, match="No directory"): + Catalog.read_from_hipscat(catalog_path, storage_options=storage_options) + + ## catalog_info file exists - getting closer + file_name = os.path.join(catalog_path, "catalog_info.json") + file_io.write_string_to_file( + file_name, + string='{"catalog_name":"empty", "catalog_type":"source"}', + storage_options=storage_options, + ) + + with pytest.raises(FileNotFoundError, match="metadata"): + Catalog.read_from_hipscat(catalog_path, storage_options=storage_options) + + ## partition_info file exists - enough to create a catalog + ## Now we create the needed _metadata and everything is right. + part_info = PartitionInfo.from_healpix([HealpixPixel(0, 11)]) + part_info.write_to_metadata_files(catalog_path=catalog_path, storage_options=storage_options) + + with pytest.warns(UserWarning, match="slow"): + catalog = Catalog.read_from_hipscat(catalog_path, storage_options=storage_options) + assert catalog.catalog_name == "empty" diff --git a/tests/hipscat/catalog/test_index_catalog.py b/tests/hipscat/catalog/test_index_catalog.py index c4573d1..cd11f65 100644 --- a/tests/hipscat/catalog/test_index_catalog.py +++ b/tests/hipscat/catalog/test_index_catalog.py @@ -4,8 +4,8 @@ from hipscat.pixel_math import HealpixPixel -def test_loc_partition(small_sky_index_dir_cloud, example_cloud_storage_options): - catalog = read_from_hipscat(small_sky_index_dir_cloud, storage_options=example_cloud_storage_options) +def test_loc_partition(small_sky_index_dir_cloud, storage_options): + catalog = read_from_hipscat(small_sky_index_dir_cloud, storage_options=storage_options) assert isinstance(catalog, IndexCatalog) assert catalog.on_disk diff --git a/tests/hipscat/catalog/test_margin_catalog.py b/tests/hipscat/catalog/test_margin_catalog.py index e5e77ea..2a71ad1 100644 --- a/tests/hipscat/catalog/test_margin_catalog.py +++ b/tests/hipscat/catalog/test_margin_catalog.py @@ -3,11 +3,8 @@ from hipscat.pixel_math.healpix_pixel import HealpixPixel -def test_read_margin_from_file( - small_sky_margin_dir_cloud, - example_cloud_storage_options, -): - catalog = read_from_hipscat(small_sky_margin_dir_cloud, storage_options=example_cloud_storage_options) +def test_read_margin_from_file(small_sky_margin_dir_cloud, storage_options): + catalog = read_from_hipscat(small_sky_margin_dir_cloud, storage_options=storage_options) assert isinstance(catalog, MarginCatalog) assert catalog.on_disk diff --git a/tests/hipscat/inspection/test_almanac_cloud.py b/tests/hipscat/inspection/test_almanac_cloud.py index c6386e6..2a58a61 100644 --- a/tests/hipscat/inspection/test_almanac_cloud.py +++ b/tests/hipscat/inspection/test_almanac_cloud.py @@ -3,19 +3,19 @@ from hipscat.inspection.almanac import Almanac -def test_default(almanac_dir_cloud, test_data_dir_cloud, example_cloud_storage_options): +def test_default(almanac_dir_cloud, test_data_dir_cloud, storage_options): """Test loading from a default directory""" os.environ["HIPSCAT_ALMANAC_DIR"] = "" os.environ["HIPSCAT_DEFAULT_DIR"] = test_data_dir_cloud - alms = Almanac(include_default_dir=True, storage_options=example_cloud_storage_options) + alms = Almanac(include_default_dir=True, storage_options=storage_options) assert len(alms.catalogs()) == 0 os.environ["HIPSCAT_ALMANAC_DIR"] = almanac_dir_cloud - alms = Almanac(include_default_dir=True, storage_options=example_cloud_storage_options) + alms = Almanac(include_default_dir=True, storage_options=storage_options) assert len(alms.catalogs()) == 2 os.environ.pop("HIPSCAT_ALMANAC_DIR") - alms = Almanac(include_default_dir=True, storage_options=example_cloud_storage_options) + alms = Almanac(include_default_dir=True, storage_options=storage_options) assert len(alms.catalogs()) == 0 diff --git a/tests/hipscat/inspection/test_visualize_catalog_cloud.py b/tests/hipscat/inspection/test_visualize_catalog_cloud.py index cde7068..82e4831 100644 --- a/tests/hipscat/inspection/test_visualize_catalog_cloud.py +++ b/tests/hipscat/inspection/test_visualize_catalog_cloud.py @@ -5,9 +5,9 @@ # pylint: disable=no-member -def test_generate_map_order1(small_sky_dir_cloud, example_cloud_storage_options, mocker): +def test_generate_map_order1(small_sky_dir_cloud, storage_options, mocker): """Basic test that map data can be generated (does not test that a plot is rendered)""" - cat = Catalog.read_from_hipscat(small_sky_dir_cloud, storage_options=example_cloud_storage_options) + cat = Catalog.read_from_hipscat(small_sky_dir_cloud, storage_options=storage_options) mocker.patch("healpy.mollview") plot_pixels(cat) diff --git a/tests/hipscat/io/file_io/test_file_io_cloud.py b/tests/hipscat/io/file_io/test_file_io_cloud.py index cc6845b..cbe0162 100644 --- a/tests/hipscat/io/file_io/test_file_io_cloud.py +++ b/tests/hipscat/io/file_io/test_file_io_cloud.py @@ -13,51 +13,47 @@ ) from hipscat.io.paths import pixel_catalog_file -from hipscat_cloudtests import TempCloudDirectory - -def test_write_string_to_file(tmp_dir_cloud, example_cloud_storage_options): - with TempCloudDirectory(tmp_dir_cloud, "write_string", example_cloud_storage_options) as temp_path: - test_file_path = os.path.join(temp_path, "text_file.txt") - test_file_pointer = get_file_pointer_from_path(test_file_path) - test_string = "this is a test" - write_string_to_file( - test_file_pointer, - test_string, - encoding="utf-8", - storage_options=example_cloud_storage_options, - ) - data = load_text_file(test_file_path, encoding="utf-8", storage_options=example_cloud_storage_options) - assert data[0] == test_string - - -def test_load_json(small_sky_dir_local, small_sky_dir_cloud, example_cloud_storage_options): +def test_write_string_to_file(tmp_cloud_path, storage_options): + test_file_path = os.path.join(tmp_cloud_path, "text_file.txt") + test_file_pointer = get_file_pointer_from_path(test_file_path) + test_string = "this is a test" + write_string_to_file( + test_file_pointer, + test_string, + encoding="utf-8", + storage_options=storage_options, + ) + data = load_text_file(test_file_path, encoding="utf-8", storage_options=storage_options) + assert data[0] == test_string + + +def test_load_json(small_sky_dir_local, small_sky_dir_cloud, storage_options): catalog_cloud_path = os.path.join(small_sky_dir_cloud, "catalog_info.json") catalog_info_path = os.path.join(small_sky_dir_local, "catalog_info.json") catalog_info_pointer = get_file_pointer_from_path(catalog_info_path) - json_dict_cloud = load_json_file(catalog_cloud_path, storage_options=example_cloud_storage_options) + json_dict_cloud = load_json_file(catalog_cloud_path, storage_options=storage_options) json_dict_local = load_json_file(catalog_info_pointer, encoding="utf-8") assert json_dict_cloud == json_dict_local -def test_load_parquet_to_pandas(small_sky_dir_local, small_sky_dir_cloud, example_cloud_storage_options): +def test_load_parquet_to_pandas(small_sky_dir_local, small_sky_dir_cloud, storage_options): pixel_data_path = pixel_catalog_file(small_sky_dir_local, 0, 11) pixel_data_path_cloud = pixel_catalog_file(small_sky_dir_cloud, 0, 11) parquet_df = pd.read_parquet(pixel_data_path) - loaded_df = load_parquet_to_pandas(pixel_data_path_cloud, storage_options=example_cloud_storage_options) + loaded_df = load_parquet_to_pandas(pixel_data_path_cloud, storage_options=storage_options) pd.testing.assert_frame_equal(parquet_df, loaded_df) -def test_write_df_to_csv(tmp_dir_cloud, example_cloud_storage_options): - with TempCloudDirectory(tmp_dir_cloud, "write_df_to_csv", example_cloud_storage_options) as temp_path: - random_df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD")) - test_file_path = os.path.join(temp_path, "test.csv") - test_file_pointer = get_file_pointer_from_path(test_file_path) - write_dataframe_to_csv( - random_df, - test_file_pointer, - index=False, - storage_options=example_cloud_storage_options, - ) - loaded_df = load_csv_to_pandas(test_file_pointer, storage_options=example_cloud_storage_options) - pd.testing.assert_frame_equal(loaded_df, random_df) +def test_write_df_to_csv(tmp_cloud_path, storage_options): + random_df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD")) + test_file_path = os.path.join(tmp_cloud_path, "test.csv") + test_file_pointer = get_file_pointer_from_path(test_file_path) + write_dataframe_to_csv( + random_df, + test_file_pointer, + index=False, + storage_options=storage_options, + ) + loaded_df = load_csv_to_pandas(test_file_pointer, storage_options=storage_options) + pd.testing.assert_frame_equal(loaded_df, random_df) diff --git a/tests/hipscat/io/file_io/test_file_pointers_cloud.py b/tests/hipscat/io/file_io/test_file_pointers_cloud.py index 284479d..3b8ea70 100644 --- a/tests/hipscat/io/file_io/test_file_pointers_cloud.py +++ b/tests/hipscat/io/file_io/test_file_pointers_cloud.py @@ -10,37 +10,37 @@ ) -def test_file_or_dir_exist(small_sky_dir_cloud, example_cloud_storage_options): +def test_file_or_dir_exist(small_sky_dir_cloud, storage_options): small_sky_pointer = get_file_pointer_from_path(small_sky_dir_cloud) - assert does_file_or_directory_exist(small_sky_pointer, storage_options=example_cloud_storage_options) + assert does_file_or_directory_exist(small_sky_pointer, storage_options=storage_options) catalog_info_string = os.path.join(small_sky_dir_cloud, "catalog_info.json") catalog_info_pointer = get_file_pointer_from_path(catalog_info_string) - assert does_file_or_directory_exist(catalog_info_pointer, storage_options=example_cloud_storage_options) + assert does_file_or_directory_exist(catalog_info_pointer, storage_options=storage_options) -def test_file_or_dir_exist_false(small_sky_dir_cloud, example_cloud_storage_options): +def test_file_or_dir_exist_false(small_sky_dir_cloud, storage_options): small_sky_pointer = get_file_pointer_from_path(small_sky_dir_cloud + "incorrect file") - assert not does_file_or_directory_exist(small_sky_pointer, storage_options=example_cloud_storage_options) + assert not does_file_or_directory_exist(small_sky_pointer, storage_options=storage_options) -def test_is_regular_file(small_sky_dir_cloud, example_cloud_storage_options): +def test_is_regular_file(small_sky_dir_cloud, storage_options): partition_info_file = os.path.join(small_sky_dir_cloud, "catalog_info.json") - assert is_regular_file(partition_info_file, storage_options=example_cloud_storage_options) + assert is_regular_file(partition_info_file, storage_options=storage_options) - assert not is_regular_file(small_sky_dir_cloud, storage_options=example_cloud_storage_options) + assert not is_regular_file(small_sky_dir_cloud, storage_options=storage_options) partition_dir = os.path.join(small_sky_dir_cloud, "Norder=0") - assert not is_regular_file(partition_dir, storage_options=example_cloud_storage_options) + assert not is_regular_file(partition_dir, storage_options=storage_options) -def test_find_files_matching_path(small_sky_dir_cloud, example_cloud_storage_options): +def test_find_files_matching_path(small_sky_dir_cloud, storage_options): ## no_wildcard assert ( len( find_files_matching_path( small_sky_dir_cloud, "catalog_info.json", - storage_options=example_cloud_storage_options, + storage_options=storage_options, ) ) == 1 @@ -52,19 +52,19 @@ def test_find_files_matching_path(small_sky_dir_cloud, example_cloud_storage_opt find_files_matching_path( small_sky_dir_cloud, "*.json", - storage_options=example_cloud_storage_options, + storage_options=storage_options, ) ) == 2 ) -def test_find_files_matching_path_directory(small_sky_order1_dir_cloud, example_cloud_storage_options): +def test_find_files_matching_path_directory(small_sky_order1_dir_cloud, storage_options): assert ( len( find_files_matching_path( small_sky_order1_dir_cloud, - storage_options=example_cloud_storage_options, + storage_options=storage_options, ) ) == 1 @@ -78,22 +78,22 @@ def test_find_files_matching_path_directory(small_sky_order1_dir_cloud, example_ "*", "*", "*", - storage_options=example_cloud_storage_options, + storage_options=storage_options, ) ) == 4 ) -def test_directory_has_contents(small_sky_order1_dir_cloud, example_cloud_storage_options): - assert directory_has_contents(small_sky_order1_dir_cloud, storage_options=example_cloud_storage_options) +def test_directory_has_contents(small_sky_order1_dir_cloud, storage_options): + assert directory_has_contents(small_sky_order1_dir_cloud, storage_options=storage_options) -def test_get_directory_contents(small_sky_order1_dir_cloud, example_cloud_storage_options): +def test_get_directory_contents(small_sky_order1_dir_cloud, storage_options): small_sky_contents = get_directory_contents( small_sky_order1_dir_cloud, include_protocol=True, - storage_options=example_cloud_storage_options, + storage_options=storage_options, ) expected = [ diff --git a/tests/hipscat/io/test_write_metadata_cloud.py b/tests/hipscat/io/test_write_metadata_cloud.py index 91393fc..e516624 100644 --- a/tests/hipscat/io/test_write_metadata_cloud.py +++ b/tests/hipscat/io/test_write_metadata_cloud.py @@ -11,7 +11,7 @@ from hipscat.io import file_io from hipscat.io.parquet_metadata import write_parquet_metadata -from hipscat_cloudtests import TempCloudDirectory, assert_text_file_matches +from hipscat_cloudtests import assert_text_file_matches @pytest.fixture @@ -48,127 +48,116 @@ def catalog_info(catalog_info_data) -> CatalogInfo: return CatalogInfo(**catalog_info_data) -def test_write_catalog_info(tmp_dir_cloud, catalog_info, example_cloud_storage_options): +def test_write_catalog_info(tmp_cloud_path, catalog_info, storage_options): """Test that we accurately write out catalog metadata""" - with TempCloudDirectory(tmp_dir_cloud, "write_catalog_info", example_cloud_storage_options) as temp_path: - catalog_base_dir = temp_path - expected_lines = [ - "{", - ' "catalog_name": "test_name",', - ' "catalog_type": "object",', - ' "total_rows": 10,', - ' "epoch": "J2000",', - ' "ra_column": "ra",', - ' "dec_column": "dec"', - "}", - ] - - io.write_catalog_info( - dataset_info=catalog_info, - catalog_base_dir=catalog_base_dir, - storage_options=example_cloud_storage_options, - ) - metadata_filename = os.path.join(catalog_base_dir, "catalog_info.json") - assert_text_file_matches( - expected_lines, metadata_filename, storage_options=example_cloud_storage_options - ) + catalog_base_dir = tmp_cloud_path + expected_lines = [ + "{", + ' "catalog_name": "test_name",', + ' "catalog_type": "object",', + ' "total_rows": 10,', + ' "epoch": "J2000",', + ' "ra_column": "ra",', + ' "dec_column": "dec"', + "}", + ] + + io.write_catalog_info( + dataset_info=catalog_info, + catalog_base_dir=catalog_base_dir, + storage_options=storage_options, + ) + metadata_filename = os.path.join(catalog_base_dir, "catalog_info.json") + assert_text_file_matches(expected_lines, metadata_filename, storage_options=storage_options) -def test_write_provenance_info(tmp_dir_cloud, catalog_info, example_cloud_storage_options): +def test_write_provenance_info(tmp_cloud_path, catalog_info, storage_options): """Test that we accurately write out tool-provided generation metadata""" - with TempCloudDirectory( - tmp_dir_cloud, "write_provenance_info", example_cloud_storage_options - ) as temp_path: - catalog_base_dir = temp_path - expected_lines = [ - "{", - ' "catalog_name": "test_name",', - ' "catalog_type": "object",', - ' "total_rows": 10,', - ' "epoch": "J2000",', - ' "ra_column": "ra",', - ' "dec_column": "dec",', - r' "version": ".*",', # version matches digits - r' "generation_date": "[.\d]+",', # date matches date format - ' "tool_args": {', - ' "tool_name": "hipscat-import",', - ' "tool_version": "1.0.0",', - r' "input_file_names": \[', - ' "file1",', - ' "file2",', - ' "file3"', - " ]", - " }", - "}", - ] - - tool_args = { - "tool_name": "hipscat-import", - "tool_version": "1.0.0", - "input_file_names": ["file1", "file2", "file3"], - } + catalog_base_dir = tmp_cloud_path + expected_lines = [ + "{", + ' "catalog_name": "test_name",', + ' "catalog_type": "object",', + ' "total_rows": 10,', + ' "epoch": "J2000",', + ' "ra_column": "ra",', + ' "dec_column": "dec",', + r' "version": ".*",', # version matches digits + r' "generation_date": "[.\d]+",', # date matches date format + ' "tool_args": {', + ' "tool_name": "hipscat-import",', + ' "tool_version": "1.0.0",', + r' "input_file_names": \[', + ' "file1",', + ' "file2",', + ' "file3"', + " ]", + " }", + "}", + ] + + tool_args = { + "tool_name": "hipscat-import", + "tool_version": "1.0.0", + "input_file_names": ["file1", "file2", "file3"], + } - io.write_provenance_info( - catalog_base_dir=catalog_base_dir, - dataset_info=catalog_info, - tool_args=tool_args, - storage_options=example_cloud_storage_options, - ) - metadata_filename = os.path.join(catalog_base_dir, "provenance_info.json") - assert_text_file_matches( - expected_lines, metadata_filename, storage_options=example_cloud_storage_options - ) + io.write_provenance_info( + catalog_base_dir=catalog_base_dir, + dataset_info=catalog_info, + tool_args=tool_args, + storage_options=storage_options, + ) + metadata_filename = os.path.join(catalog_base_dir, "provenance_info.json") + assert_text_file_matches(expected_lines, metadata_filename, storage_options=storage_options) def test_write_parquet_metadata( - tmp_dir_cloud, + tmp_cloud_path, small_sky_dir_cloud, basic_catalog_parquet_metadata, - example_cloud_storage_options, + storage_options, ): """Use existing catalog parquet files and create new metadata files for it""" - with TempCloudDirectory( - tmp_dir_cloud, "write_parquet_metadata", example_cloud_storage_options - ) as temp_path: - catalog_base_dir = temp_path - - write_parquet_metadata( - catalog_path=small_sky_dir_cloud, - storage_options=example_cloud_storage_options, - output_path=catalog_base_dir, - ) - - check_parquet_schema( - os.path.join(catalog_base_dir, "_metadata"), - basic_catalog_parquet_metadata, - storage_options=example_cloud_storage_options, - ) - ## _common_metadata has 0 row groups - check_parquet_schema( - os.path.join(catalog_base_dir, "_common_metadata"), - basic_catalog_parquet_metadata, - 0, - storage_options=example_cloud_storage_options, - ) - - ## Re-write - should still have the same properties. - write_parquet_metadata( - catalog_path=small_sky_dir_cloud, - storage_options=example_cloud_storage_options, - output_path=catalog_base_dir, - ) - check_parquet_schema( - os.path.join(catalog_base_dir, "_metadata"), - basic_catalog_parquet_metadata, - storage_options=example_cloud_storage_options, - ) - ## _common_metadata has 0 row groups - check_parquet_schema( - os.path.join(catalog_base_dir, "_common_metadata"), - basic_catalog_parquet_metadata, - 0, - storage_options=example_cloud_storage_options, - ) + catalog_base_dir = tmp_cloud_path + + write_parquet_metadata( + catalog_path=small_sky_dir_cloud, + storage_options=storage_options, + output_path=catalog_base_dir, + ) + + check_parquet_schema( + os.path.join(catalog_base_dir, "_metadata"), + basic_catalog_parquet_metadata, + storage_options=storage_options, + ) + ## _common_metadata has 0 row groups + check_parquet_schema( + os.path.join(catalog_base_dir, "_common_metadata"), + basic_catalog_parquet_metadata, + 0, + storage_options=storage_options, + ) + + ## Re-write - should still have the same properties. + write_parquet_metadata( + catalog_path=small_sky_dir_cloud, + storage_options=storage_options, + output_path=catalog_base_dir, + ) + check_parquet_schema( + os.path.join(catalog_base_dir, "_metadata"), + basic_catalog_parquet_metadata, + storage_options=storage_options, + ) + ## _common_metadata has 0 row groups + check_parquet_schema( + os.path.join(catalog_base_dir, "_common_metadata"), + basic_catalog_parquet_metadata, + 0, + storage_options=storage_options, + ) def check_parquet_schema(file_name, expected_schema, expected_num_row_groups=1, storage_options: dict = None): @@ -196,15 +185,14 @@ def check_parquet_schema(file_name, expected_schema, expected_num_row_groups=1, assert column_metadata.file_path.endswith(".parquet") -def test_read_write_fits_point_map(tmp_dir_cloud, example_cloud_storage_options): +def test_read_write_fits_point_map(tmp_cloud_path, storage_options): """Check that we write and can read a FITS file for spatial distribution.""" - with TempCloudDirectory(tmp_dir_cloud, "write_fits", example_cloud_storage_options) as temp_path: - initial_histogram = hist.empty_histogram(1) - filled_pixels = [51, 29, 51, 0] - initial_histogram[44:] = filled_pixels[:] - io.write_fits_map(temp_path, initial_histogram, storage_options=example_cloud_storage_options) + initial_histogram = hist.empty_histogram(1) + filled_pixels = [51, 29, 51, 0] + initial_histogram[44:] = filled_pixels[:] + io.write_fits_map(tmp_cloud_path, initial_histogram, storage_options=storage_options) - output_file = os.path.join(temp_path, "point_map.fits") + output_file = os.path.join(tmp_cloud_path, "point_map.fits") - output = file_io.read_fits_image(output_file, storage_options=example_cloud_storage_options) - npt.assert_array_equal(output, initial_histogram) + output = file_io.read_fits_image(output_file, storage_options=storage_options) + npt.assert_array_equal(output, initial_histogram) diff --git a/tests/hipscat_import/conftest.py b/tests/hipscat_import/conftest.py index 8e24ad2..05827fc 100644 --- a/tests/hipscat_import/conftest.py +++ b/tests/hipscat_import/conftest.py @@ -14,5 +14,5 @@ def dask_client(): @pytest.fixture -def small_sky_parts_dir_cloud(example_cloud_path): - return os.path.join(example_cloud_path, "hipscat_import", "data", "small_sky_parts") +def small_sky_parts_dir_cloud(cloud_path): + return os.path.join(cloud_path, "hipscat_import", "data", "small_sky_parts") diff --git a/tests/hipscat_import/test_create_margin.py b/tests/hipscat_import/test_create_margin.py index 2b1828c..57ba5be 100644 --- a/tests/hipscat_import/test_create_margin.py +++ b/tests/hipscat_import/test_create_margin.py @@ -3,14 +3,12 @@ from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments -from hipscat_cloudtests import TempCloudDirectory - def test_margin_cache_gen( small_sky_order1_dir_local, tmp_path, - tmp_dir_cloud, - example_cloud_storage_options, + tmp_cloud_path, + storage_options, dask_client, ): """Test that margin cache generation works end to end. @@ -19,36 +17,32 @@ def test_margin_cache_gen( - local origin catalog. - writing to CLOUD. """ + with pytest.warns(UserWarning, match="smaller resolution"): + args = MarginCacheArguments( + margin_threshold=7200.0, + input_catalog_path=small_sky_order1_dir_local, + output_path=tmp_cloud_path, + output_artifact_name="small_sky_order1_margin", + output_storage_options=storage_options, + dask_tmp=tmp_path, + tmp_dir=tmp_path, + margin_order=8, + progress_bar=False, + ) - with TempCloudDirectory(tmp_dir_cloud, "margin_cache_gen", example_cloud_storage_options) as temp_path: - with pytest.warns(UserWarning, match="smaller resolution"): - args = MarginCacheArguments( - margin_threshold=7200.0, - input_catalog_path=small_sky_order1_dir_local, - output_path=temp_path, - output_artifact_name="small_sky_order1_margin", - output_storage_options=example_cloud_storage_options, - dask_tmp=tmp_path, - tmp_dir=tmp_path, - margin_order=8, - progress_bar=False, - ) - - assert args.catalog.catalog_info.ra_column == "ra" + assert args.catalog.catalog_info.ra_column == "ra" - mc.generate_margin_cache(args, dask_client) + mc.generate_margin_cache(args, dask_client) - catalog = HealpixDataset.read_from_hipscat( - args.catalog_path, storage_options=example_cloud_storage_options - ) - assert catalog.on_disk - assert catalog.catalog_path == args.catalog_path + catalog = HealpixDataset.read_from_hipscat(args.catalog_path, storage_options=storage_options) + assert catalog.on_disk + assert catalog.catalog_path == args.catalog_path def test_margin_cache_gen_read_from_cloud( small_sky_order1_dir_cloud, tmp_path, - example_cloud_storage_options, + storage_options, dask_client, ): """Test that margin cache generation works end to end. @@ -61,7 +55,7 @@ def test_margin_cache_gen_read_from_cloud( args = MarginCacheArguments( margin_threshold=7200.0, input_catalog_path=small_sky_order1_dir_cloud, - input_storage_options=example_cloud_storage_options, + input_storage_options=storage_options, output_path=tmp_path, output_artifact_name="small_sky_order1_margin", dask_tmp=tmp_path, diff --git a/tests/hipscat_import/test_run_catalog_import.py b/tests/hipscat_import/test_run_catalog_import.py index be22892..7d92106 100644 --- a/tests/hipscat_import/test_run_catalog_import.py +++ b/tests/hipscat_import/test_run_catalog_import.py @@ -8,67 +8,62 @@ from hipscat_import.catalog.arguments import ImportArguments from hipscat_import.catalog.file_readers import CsvReader -from hipscat_cloudtests import TempCloudDirectory, assert_parquet_file_ids +from hipscat_cloudtests import assert_parquet_file_ids @pytest.mark.dask def test_catalog_import_write_to_cloud( dask_client, small_sky_parts_dir_local, - tmp_dir_cloud, - example_cloud_storage_options, + tmp_cloud_path, + storage_options, tmp_path, ): """Using local CSV files, write a new catalog to the cloud bucket.""" - with TempCloudDirectory( - tmp_dir_cloud, "write_catalog_to_cloud", example_cloud_storage_options - ) as temp_path: - args = ImportArguments( - output_artifact_name="small_sky_object_catalog", - input_path=small_sky_parts_dir_local, - output_storage_options=example_cloud_storage_options, - file_reader="csv", - output_path=temp_path, - dask_tmp=tmp_path, - highest_healpix_order=1, - progress_bar=False, - overwrite=True, - ) - - runner.run(args, dask_client) - - # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path, storage_options=example_cloud_storage_options) - assert catalog.on_disk - assert catalog.catalog_path == args.catalog_path - assert catalog.catalog_info.ra_column == "ra" - assert catalog.catalog_info.dec_column == "dec" - assert catalog.catalog_info.total_rows == 131 - assert len(catalog.get_healpix_pixels()) == 1 - - # Check that the catalog parquet file exists and contains correct object IDs - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") - - expected_ids = [*range(700, 831)] - assert_parquet_file_ids( - output_file, "id", expected_ids, storage_options=example_cloud_storage_options - ) + args = ImportArguments( + output_artifact_name="small_sky_object_catalog", + input_path=small_sky_parts_dir_local, + output_storage_options=storage_options, + file_reader="csv", + output_path=tmp_cloud_path, + dask_tmp=tmp_path, + highest_healpix_order=1, + progress_bar=False, + overwrite=True, + ) + + runner.run(args, dask_client) + + # Check that the catalog metadata file exists + catalog = Catalog.read_from_hipscat(args.catalog_path, storage_options=storage_options) + assert catalog.on_disk + assert catalog.catalog_path == args.catalog_path + assert catalog.catalog_info.ra_column == "ra" + assert catalog.catalog_info.dec_column == "dec" + assert catalog.catalog_info.total_rows == 131 + assert len(catalog.get_healpix_pixels()) == 1 + + # Check that the catalog parquet file exists and contains correct object IDs + output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + + expected_ids = [*range(700, 831)] + assert_parquet_file_ids(output_file, "id", expected_ids, storage_options=storage_options) @pytest.mark.dask def test_catalog_import_read_from_cloud( dask_client, small_sky_parts_dir_cloud, - example_cloud_storage_options, + storage_options, tmp_path, ): """Using cloud CSV files, write a new catalog to local disk.""" args = ImportArguments( output_artifact_name="small_sky_object_catalog", input_path=small_sky_parts_dir_cloud, - input_storage_options=example_cloud_storage_options, + input_storage_options=storage_options, file_reader=CsvReader( - storage_options=example_cloud_storage_options, + storage_options=storage_options, ), output_path=tmp_path, dask_tmp=tmp_path, @@ -94,11 +89,11 @@ def test_catalog_import_read_from_cloud( assert_parquet_file_ids(output_file, "id", expected_ids) -def test_read_csv_cloud(example_cloud_storage_options, small_sky_parts_dir_cloud): +def test_read_csv_cloud(storage_options, small_sky_parts_dir_cloud): """Verify we can read the csv file into a single data frame.""" single_file = os.path.join(small_sky_parts_dir_cloud, "catalog_00_of_05.csv") total_chunks = 0 - for frame in CsvReader(storage_options=example_cloud_storage_options).read(single_file): + for frame in CsvReader(storage_options=storage_options).read(single_file): total_chunks += 1 assert len(frame) == 25 diff --git a/tests/hipscat_import/test_run_index.py b/tests/hipscat_import/test_run_index.py index f441e03..dd6fb73 100644 --- a/tests/hipscat_import/test_run_index.py +++ b/tests/hipscat_import/test_run_index.py @@ -6,75 +6,70 @@ from hipscat.io.file_io import read_parquet_metadata from hipscat_import.index.arguments import IndexArguments -from hipscat_cloudtests import TempCloudDirectory - def test_run_index( small_sky_order1_dir_local, tmp_path, - tmp_dir_cloud, - example_cloud_storage_options, + tmp_cloud_path, + storage_options, dask_client, ): """Test appropriate metadata is written""" - with TempCloudDirectory(tmp_dir_cloud, "run_index", example_cloud_storage_options) as temp_path: - args = IndexArguments( - input_catalog_path=small_sky_order1_dir_local, - indexing_column="id", - output_path=temp_path, - output_artifact_name="small_sky_object_index", - output_storage_options=example_cloud_storage_options, - tmp_dir=tmp_path, - dask_tmp=tmp_path, - overwrite=True, - progress_bar=False, - ) - runner.run(args, dask_client) - - # Check that the catalog metadata file exists - catalog = Dataset.read_from_hipscat(args.catalog_path, storage_options=example_cloud_storage_options) - assert catalog.on_disk - assert catalog.catalog_path == args.catalog_path - - basic_index_parquet_schema = pa.schema( - [ - pa.field("_hipscat_index", pa.uint64()), - pa.field("Norder", pa.uint8()), - pa.field("Dir", pa.uint64()), - pa.field("Npix", pa.uint64()), - pa.field("id", pa.int64()), - ] - ) - - outfile = os.path.join(args.catalog_path, "index", "part.0.parquet") - schema = read_parquet_metadata( - outfile, storage_options=example_cloud_storage_options - ).schema.to_arrow_schema() - assert schema.equals(basic_index_parquet_schema, check_metadata=False) - - schema = read_parquet_metadata( - os.path.join(args.catalog_path, "_metadata"), storage_options=example_cloud_storage_options - ).schema.to_arrow_schema() - assert schema.equals(basic_index_parquet_schema, check_metadata=False) - - schema = read_parquet_metadata( - os.path.join(args.catalog_path, "_common_metadata"), storage_options=example_cloud_storage_options - ).schema.to_arrow_schema() - assert schema.equals(basic_index_parquet_schema, check_metadata=False) + args = IndexArguments( + input_catalog_path=small_sky_order1_dir_local, + indexing_column="id", + output_path=tmp_cloud_path, + output_artifact_name="small_sky_object_index", + output_storage_options=storage_options, + tmp_dir=tmp_path, + dask_tmp=tmp_path, + overwrite=True, + progress_bar=False, + ) + runner.run(args, dask_client) + + # Check that the catalog metadata file exists + catalog = Dataset.read_from_hipscat(args.catalog_path, storage_options=storage_options) + assert catalog.on_disk + assert catalog.catalog_path == args.catalog_path + + basic_index_parquet_schema = pa.schema( + [ + pa.field("_hipscat_index", pa.uint64()), + pa.field("Norder", pa.uint8()), + pa.field("Dir", pa.uint64()), + pa.field("Npix", pa.uint64()), + pa.field("id", pa.int64()), + ] + ) + + outfile = os.path.join(args.catalog_path, "index", "part.0.parquet") + schema = read_parquet_metadata(outfile, storage_options=storage_options).schema.to_arrow_schema() + assert schema.equals(basic_index_parquet_schema, check_metadata=False) + + schema = read_parquet_metadata( + os.path.join(args.catalog_path, "_metadata"), storage_options=storage_options + ).schema.to_arrow_schema() + assert schema.equals(basic_index_parquet_schema, check_metadata=False) + + schema = read_parquet_metadata( + os.path.join(args.catalog_path, "_common_metadata"), storage_options=storage_options + ).schema.to_arrow_schema() + assert schema.equals(basic_index_parquet_schema, check_metadata=False) def test_run_index_read_from_cloud( small_sky_order1_dir_cloud, tmp_path, - example_cloud_storage_options, + storage_options, dask_client, ): """Test appropriate metadata is written""" args = IndexArguments( input_catalog_path=small_sky_order1_dir_cloud, - input_storage_options=example_cloud_storage_options, + input_storage_options=storage_options, indexing_column="id", output_path=tmp_path, output_artifact_name="small_sky_object_index", @@ -86,7 +81,7 @@ def test_run_index_read_from_cloud( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Dataset.read_from_hipscat(args.catalog_path, storage_options=example_cloud_storage_options) + catalog = Dataset.read_from_hipscat(args.catalog_path, storage_options=storage_options) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path diff --git a/tests/hipscat_import/test_run_soap.py b/tests/hipscat_import/test_run_soap.py index 28fc179..19c0033 100644 --- a/tests/hipscat_import/test_run_soap.py +++ b/tests/hipscat_import/test_run_soap.py @@ -6,17 +6,15 @@ from hipscat.io.file_io import read_parquet_metadata from hipscat_import.soap.arguments import SoapArguments -from hipscat_cloudtests import TempCloudDirectory - @pytest.mark.dask def test_object_to_self_write_to_cloud( dask_client, tmp_path, - tmp_dir_cloud, + tmp_cloud_path, small_sky_dir_local, small_sky_order1_dir_local, - example_cloud_storage_options, + storage_options, ): """Test creating association between object catalogs. @@ -26,71 +24,66 @@ def test_object_to_self_write_to_cloud( First test creates leaf files, the second test does not (exercies different write calls). """ - with TempCloudDirectory(tmp_dir_cloud, "run_soap", example_cloud_storage_options) as temp_path: - small_sky_soap_args = SoapArguments( - object_catalog_dir=small_sky_dir_local, - object_id_column="id", - source_catalog_dir=small_sky_order1_dir_local, - source_object_id_column="id", - source_id_column="id", - write_leaf_files=True, - output_artifact_name="small_sky_to_order1", - output_path=temp_path, - output_storage_options=example_cloud_storage_options, - progress_bar=False, - tmp_dir=tmp_path, - ) - runner.run(small_sky_soap_args, dask_client) + small_sky_soap_args = SoapArguments( + object_catalog_dir=small_sky_dir_local, + object_id_column="id", + source_catalog_dir=small_sky_order1_dir_local, + source_object_id_column="id", + source_id_column="id", + write_leaf_files=True, + output_artifact_name="small_sky_to_order1", + output_path=tmp_cloud_path, + output_storage_options=storage_options, + progress_bar=False, + tmp_dir=tmp_path, + ) + runner.run(small_sky_soap_args, dask_client) - ## Check that the association data can be parsed as a valid association catalog. - catalog = AssociationCatalog.read_from_hipscat( - small_sky_soap_args.catalog_path, storage_options=example_cloud_storage_options - ) - assert catalog.on_disk - assert catalog.catalog_path == small_sky_soap_args.catalog_path - assert len(catalog.get_join_pixels()) == 4 - assert catalog.catalog_info.total_rows == 131 - assert catalog.catalog_info.contains_leaf_files + ## Check that the association data can be parsed as a valid association catalog. + catalog = AssociationCatalog.read_from_hipscat( + small_sky_soap_args.catalog_path, storage_options=storage_options + ) + assert catalog.on_disk + assert catalog.catalog_path == small_sky_soap_args.catalog_path + assert len(catalog.get_join_pixels()) == 4 + assert catalog.catalog_info.total_rows == 131 + assert catalog.catalog_info.contains_leaf_files - parquet_file_name = os.path.join( - small_sky_soap_args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet" - ) - parquet_file_metadata = read_parquet_metadata( - parquet_file_name, storage_options=example_cloud_storage_options - ) - assert parquet_file_metadata.num_row_groups == 4 - assert parquet_file_metadata.num_rows == 131 - assert parquet_file_metadata.num_columns == 8 + parquet_file_name = os.path.join(small_sky_soap_args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + parquet_file_metadata = read_parquet_metadata(parquet_file_name, storage_options=storage_options) + assert parquet_file_metadata.num_row_groups == 4 + assert parquet_file_metadata.num_rows == 131 + assert parquet_file_metadata.num_columns == 8 - small_sky_soap_args = SoapArguments( - object_catalog_dir=small_sky_dir_local, - object_id_column="id", - source_catalog_dir=small_sky_order1_dir_local, - source_object_id_column="id", - source_id_column="id", - write_leaf_files=False, - output_artifact_name="small_sky_to_order1_soft", - output_path=temp_path, - output_storage_options=example_cloud_storage_options, - progress_bar=False, - tmp_dir=tmp_path, - ) - runner.run(small_sky_soap_args, dask_client) + small_sky_soap_args = SoapArguments( + object_catalog_dir=small_sky_dir_local, + object_id_column="id", + source_catalog_dir=small_sky_order1_dir_local, + source_object_id_column="id", + source_id_column="id", + write_leaf_files=False, + output_artifact_name="small_sky_to_order1_soft", + output_path=tmp_cloud_path, + output_storage_options=storage_options, + progress_bar=False, + tmp_dir=tmp_path, + ) + runner.run(small_sky_soap_args, dask_client) - ## Check that the association data can be parsed as a valid association catalog. - catalog = AssociationCatalog.read_from_hipscat( - small_sky_soap_args.catalog_path, storage_options=example_cloud_storage_options - ) - assert catalog.on_disk - assert catalog.catalog_path == small_sky_soap_args.catalog_path - assert len(catalog.get_join_pixels()) == 4 - assert catalog.catalog_info.total_rows == 131 - assert not catalog.catalog_info.contains_leaf_files + ## Check that the association data can be parsed as a valid association catalog. + catalog = AssociationCatalog.read_from_hipscat( + small_sky_soap_args.catalog_path, storage_options=storage_options + ) + assert catalog.on_disk + assert catalog.catalog_path == small_sky_soap_args.catalog_path + assert len(catalog.get_join_pixels()) == 4 + assert catalog.catalog_info.total_rows == 131 + assert not catalog.catalog_info.contains_leaf_files @pytest.mark.dask def test_object_to_self_read_from_cloud( - dask_client, tmp_path, small_sky_dir_cloud, small_sky_order1_dir_cloud, example_cloud_storage_options + dask_client, tmp_path, small_sky_dir_cloud, small_sky_order1_dir_cloud, storage_options ): """Test creating association between object catalogs. @@ -103,11 +96,11 @@ def test_object_to_self_read_from_cloud( small_sky_soap_args = SoapArguments( object_catalog_dir=small_sky_dir_cloud, object_id_column="id", - object_storage_options=example_cloud_storage_options, + object_storage_options=storage_options, source_catalog_dir=small_sky_order1_dir_cloud, source_object_id_column="id", source_id_column="id", - source_storage_options=example_cloud_storage_options, + source_storage_options=storage_options, write_leaf_files=True, output_artifact_name="small_sky_to_order1", output_path=tmp_path, @@ -125,9 +118,7 @@ def test_object_to_self_read_from_cloud( assert catalog.catalog_info.contains_leaf_files parquet_file_name = os.path.join(small_sky_soap_args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") - parquet_file_metadata = read_parquet_metadata( - parquet_file_name, storage_options=example_cloud_storage_options - ) + parquet_file_metadata = read_parquet_metadata(parquet_file_name, storage_options=storage_options) assert parquet_file_metadata.num_row_groups == 4 assert parquet_file_metadata.num_rows == 131 assert parquet_file_metadata.num_columns == 8 diff --git a/tests/lsdb/catalog/test_cone_search.py b/tests/lsdb/catalog/test_cone_search.py index cb7104a..3eca14e 100644 --- a/tests/lsdb/catalog/test_cone_search.py +++ b/tests/lsdb/catalog/test_cone_search.py @@ -8,7 +8,6 @@ def test_cone_search_filters_correct_points(small_sky_order1_catalog_cloud): radius = radius_degrees * 3600 center_coord = SkyCoord(ra, dec, unit="deg") cone_search_catalog = small_sky_order1_catalog_cloud.cone_search(ra, dec, radius).compute() - print(len(cone_search_catalog)) for _, row in small_sky_order1_catalog_cloud.compute().iterrows(): row_ra = row[small_sky_order1_catalog_cloud.hc_structure.catalog_info.ra_column] row_dec = row[small_sky_order1_catalog_cloud.hc_structure.catalog_info.dec_column] @@ -28,6 +27,5 @@ def test_cone_search_filters_partitions(small_sky_order1_catalog_cloud): consearch_catalog = small_sky_order1_catalog_cloud.cone_search(ra, dec, radius) assert len(hc_conesearch.get_healpix_pixels()) == len(consearch_catalog.get_healpix_pixels()) assert len(hc_conesearch.get_healpix_pixels()) == consearch_catalog._ddf.npartitions - print(hc_conesearch.get_healpix_pixels()) for pixel in hc_conesearch.get_healpix_pixels(): assert pixel in consearch_catalog._ddf_pixel_map diff --git a/tests/lsdb/catalog/test_crossmatch.py b/tests/lsdb/catalog/test_crossmatch.py index 680dd99..387eb79 100644 --- a/tests/lsdb/catalog/test_crossmatch.py +++ b/tests/lsdb/catalog/test_crossmatch.py @@ -21,19 +21,15 @@ def test_crossmatch_with_margin( small_sky_xmatch_dir_cloud, small_sky_margin_dir_cloud, xmatch_with_margin, - example_cloud_storage_options, + storage_options, ): - small_sky_margin_catalog = lsdb.read_hipscat( - small_sky_margin_dir_cloud, storage_options=example_cloud_storage_options - ) + small_sky_margin_catalog = lsdb.read_hipscat(small_sky_margin_dir_cloud, storage_options=storage_options) small_sky_order1_catalog = lsdb.read_hipscat( small_sky_order1_dir_cloud, margin_cache=small_sky_margin_catalog, - storage_options=example_cloud_storage_options, - ) - small_sky_xmatch_catalog = lsdb.read_hipscat( - small_sky_xmatch_dir_cloud, storage_options=example_cloud_storage_options + storage_options=storage_options, ) + small_sky_xmatch_catalog = lsdb.read_hipscat(small_sky_xmatch_dir_cloud, storage_options=storage_options) xmatched = small_sky_xmatch_catalog.crossmatch( small_sky_order1_catalog, n_neighbors=3, radius_arcsec=2 * 3600, algo=KdTreeCrossmatch ).compute() diff --git a/tests/lsdb/catalog/test_index_search.py b/tests/lsdb/catalog/test_index_search.py index 01c1536..2e40a88 100644 --- a/tests/lsdb/catalog/test_index_search.py +++ b/tests/lsdb/catalog/test_index_search.py @@ -4,11 +4,9 @@ def test_index_search( small_sky_order1_catalog_cloud, small_sky_index_dir_cloud, - example_cloud_storage_options, + storage_options, ): - catalog_index = IndexCatalog.read_from_hipscat( - small_sky_index_dir_cloud, storage_options=example_cloud_storage_options - ) + catalog_index = IndexCatalog.read_from_hipscat(small_sky_index_dir_cloud, storage_options=storage_options) index_search_catalog = small_sky_order1_catalog_cloud.index_search([900], catalog_index) index_search_df = index_search_catalog.compute() diff --git a/tests/lsdb/conftest.py b/tests/lsdb/conftest.py index 755b52e..f5c9c55 100644 --- a/tests/lsdb/conftest.py +++ b/tests/lsdb/conftest.py @@ -10,30 +10,28 @@ @pytest.fixture -def small_sky_xmatch_dir_cloud(example_cloud_path): - return os.path.join(example_cloud_path, "data", SMALL_SKY_XMATCH_NAME) +def small_sky_xmatch_dir_cloud(cloud_path): + return os.path.join(cloud_path, "data", SMALL_SKY_XMATCH_NAME) @pytest.fixture -def small_sky_catalog_cloud(small_sky_dir_cloud, example_cloud_storage_options): - return lsdb.read_hipscat(small_sky_dir_cloud, storage_options=example_cloud_storage_options) +def small_sky_catalog_cloud(small_sky_dir_cloud, storage_options): + return lsdb.read_hipscat(small_sky_dir_cloud, storage_options=storage_options) @pytest.fixture -def small_sky_xmatch_catalog_cloud(small_sky_xmatch_dir_cloud, example_cloud_storage_options): - return lsdb.read_hipscat(small_sky_xmatch_dir_cloud, storage_options=example_cloud_storage_options) +def small_sky_xmatch_catalog_cloud(small_sky_xmatch_dir_cloud, storage_options): + return lsdb.read_hipscat(small_sky_xmatch_dir_cloud, storage_options=storage_options) @pytest.fixture -def small_sky_order1_hipscat_catalog_cloud(small_sky_order1_dir_cloud, example_cloud_storage_options): - return hc.catalog.Catalog.read_from_hipscat( - small_sky_order1_dir_cloud, storage_options=example_cloud_storage_options - ) +def small_sky_order1_hipscat_catalog_cloud(small_sky_order1_dir_cloud, storage_options): + return hc.catalog.Catalog.read_from_hipscat(small_sky_order1_dir_cloud, storage_options=storage_options) @pytest.fixture -def small_sky_order1_catalog_cloud(small_sky_order1_dir_cloud, example_cloud_storage_options): - return lsdb.read_hipscat(small_sky_order1_dir_cloud, storage_options=example_cloud_storage_options) +def small_sky_order1_catalog_cloud(small_sky_order1_dir_cloud, storage_options): + return lsdb.read_hipscat(small_sky_order1_dir_cloud, storage_options=storage_options) @pytest.fixture diff --git a/tests/lsdb/io/test_to_hipscat.py b/tests/lsdb/io/test_to_hipscat.py index e46a6e8..633cf65 100644 --- a/tests/lsdb/io/test_to_hipscat.py +++ b/tests/lsdb/io/test_to_hipscat.py @@ -3,31 +3,26 @@ import lsdb import pandas as pd -from hipscat_cloudtests import TempCloudDirectory - -def test_save_catalog_and_margin(local_data_dir, example_cloud_storage_options, tmp_dir_cloud): +def test_save_catalog_and_margin(local_data_dir, storage_options, tmp_cloud_path): pathway = os.path.join(local_data_dir, "xmatch", "xmatch_catalog_raw.csv") input_df = pd.read_csv(pathway) catalog = lsdb.from_dataframe( input_df, margin_threshold=5000, catalog_name="small_sky_from_dataframe", catalog_type="object" ) - with TempCloudDirectory( - tmp_dir_cloud, "lsdb_save_catalog_and_margin", example_cloud_storage_options - ) as temp_path: - base_catalog_path = f"{temp_path}/new_catalog_name" - catalog.to_hipscat(base_catalog_path, storage_options=example_cloud_storage_options) - expected_catalog = lsdb.read_hipscat(base_catalog_path, storage_options=example_cloud_storage_options) - assert expected_catalog.hc_structure.catalog_name == catalog.hc_structure.catalog_name - assert expected_catalog.hc_structure.catalog_info == catalog.hc_structure.catalog_info - assert expected_catalog.get_healpix_pixels() == catalog.get_healpix_pixels() - pd.testing.assert_frame_equal(expected_catalog.compute(), catalog._ddf.compute()) + base_catalog_path = f"{tmp_cloud_path}/new_catalog_name" + catalog.to_hipscat(base_catalog_path, storage_options=storage_options) + expected_catalog = lsdb.read_hipscat(base_catalog_path, storage_options=storage_options) + assert expected_catalog.hc_structure.catalog_name == catalog.hc_structure.catalog_name + assert expected_catalog.hc_structure.catalog_info == catalog.hc_structure.catalog_info + assert expected_catalog.get_healpix_pixels() == catalog.get_healpix_pixels() + pd.testing.assert_frame_equal(expected_catalog.compute(), catalog._ddf.compute()) - base_catalog_path = f"{temp_path}/new_margin_name" - catalog.margin.to_hipscat(base_catalog_path, storage_options=example_cloud_storage_options) - expected_catalog = lsdb.read_hipscat(base_catalog_path, storage_options=example_cloud_storage_options) - assert expected_catalog.hc_structure.catalog_name == catalog.margin.hc_structure.catalog_name - assert expected_catalog.hc_structure.catalog_info == catalog.margin.hc_structure.catalog_info - assert expected_catalog.get_healpix_pixels() == catalog.margin.get_healpix_pixels() - pd.testing.assert_frame_equal(expected_catalog.compute(), catalog.margin._ddf.compute()) + base_catalog_path = f"{tmp_cloud_path}/new_margin_name" + catalog.margin.to_hipscat(base_catalog_path, storage_options=storage_options) + expected_catalog = lsdb.read_hipscat(base_catalog_path, storage_options=storage_options) + assert expected_catalog.hc_structure.catalog_name == catalog.margin.hc_structure.catalog_name + assert expected_catalog.hc_structure.catalog_info == catalog.margin.hc_structure.catalog_info + assert expected_catalog.get_healpix_pixels() == catalog.margin.get_healpix_pixels() + pd.testing.assert_frame_equal(expected_catalog.compute(), catalog.margin._ddf.compute()) diff --git a/tests/lsdb/loaders/hipscat/test_read_hipscat.py b/tests/lsdb/loaders/hipscat/test_read_hipscat.py index f9bdb60..6f76291 100644 --- a/tests/lsdb/loaders/hipscat/test_read_hipscat.py +++ b/tests/lsdb/loaders/hipscat/test_read_hipscat.py @@ -6,9 +6,9 @@ def test_read_hipscat( small_sky_order1_dir_cloud, small_sky_order1_hipscat_catalog_cloud, - example_cloud_storage_options, + storage_options, ): - catalog = lsdb.read_hipscat(small_sky_order1_dir_cloud, storage_options=example_cloud_storage_options) + catalog = lsdb.read_hipscat(small_sky_order1_dir_cloud, storage_options=storage_options) assert isinstance(catalog, lsdb.Catalog) assert catalog.hc_structure.catalog_base_dir == small_sky_order1_hipscat_catalog_cloud.catalog_base_dir assert catalog.get_healpix_pixels() == small_sky_order1_hipscat_catalog_cloud.get_healpix_pixels() @@ -19,9 +19,9 @@ def test_read_hipscat( def test_read_hipscat_margin( small_sky_margin_dir_cloud, - example_cloud_storage_options, + storage_options, ): - catalog = lsdb.read_hipscat(small_sky_margin_dir_cloud, storage_options=example_cloud_storage_options) + catalog = lsdb.read_hipscat(small_sky_margin_dir_cloud, storage_options=storage_options) assert isinstance(catalog, MarginCatalog) assert catalog.hc_structure.catalog_base_dir == small_sky_margin_dir_cloud assert catalog.get_healpix_pixels() == [