diff --git a/tests/conftest.py b/tests/conftest.py
index c72bed9..7b5ac65 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,5 @@
 import os
+from pathlib import Path
 
 import hipscat as hc
 import lsdb
@@ -52,7 +53,7 @@ def storage_options(cloud):
 @pytest.fixture
 def local_data_dir():
     local_data_path = os.path.dirname(__file__)
-    return os.path.join(local_data_path, "data")
+    return Path(local_data_path) / "data"
 
 
 @pytest.fixture
diff --git a/tests/data/indexed_files/csv_list_single.txt b/tests/data/indexed_files/csv_list_single.txt
new file mode 100644
index 0000000..129e81d
--- /dev/null
+++ b/tests/data/indexed_files/csv_list_single.txt
@@ -0,0 +1,5 @@
+abfs://hipscat/pytests/hipscat_import/data/small_sky_parts/catalog_00_of_05.csv
+abfs://hipscat/pytests/hipscat_import/data/small_sky_parts/catalog_01_of_05.csv
+abfs://hipscat/pytests/hipscat_import/data/small_sky_parts/catalog_02_of_05.csv
+abfs://hipscat/pytests/hipscat_import/data/small_sky_parts/catalog_03_of_05.csv
+abfs://hipscat/pytests/hipscat_import/data/small_sky_parts/catalog_04_of_05.csv
\ No newline at end of file
diff --git a/tests/data/indexed_files/parquet_list_single.txt b/tests/data/indexed_files/parquet_list_single.txt
new file mode 100644
index 0000000..74555bf
--- /dev/null
+++ b/tests/data/indexed_files/parquet_list_single.txt
@@ -0,0 +1,4 @@
+abfs://hipscat/pytests/hipscat/data/small_sky_order1/Norder=1/Dir=0/Npix=44.parquet
+abfs://hipscat/pytests/hipscat/data/small_sky_order1/Norder=1/Dir=0/Npix=45.parquet
+abfs://hipscat/pytests/hipscat/data/small_sky_order1/Norder=1/Dir=0/Npix=46.parquet
+abfs://hipscat/pytests/hipscat/data/small_sky_order1/Norder=1/Dir=0/Npix=47.parquet
\ No newline at end of file
diff --git a/tests/hipscat_import/test_input_readers.py b/tests/hipscat_import/test_input_readers.py
new file mode 100644
index 0000000..89c4cf7
--- /dev/null
+++ b/tests/hipscat_import/test_input_readers.py
@@ -0,0 +1,57 @@
+from hipscat_import.catalog.file_readers import IndexedCsvReader, IndexedParquetReader
+
+
+def test_indexed_parquet_reader(storage_options, local_data_dir):
+    # Chunksize covers all the inputs.
+    total_chunks = 0
+    total_len = 0
+    for frame in IndexedParquetReader(chunksize=10_000, storage_options=storage_options).read(
+        local_data_dir / "indexed_files" / "parquet_list_single.txt"
+    ):
+        total_chunks += 1
+        assert len(frame) == 131
+        total_len += len(frame)
+
+    assert total_chunks == 1
+    assert total_len == 131
+
+    # Requesting a very small chunksize. This will split up reads on the parquet.
+    total_chunks = 0
+    total_len = 0
+    for frame in IndexedParquetReader(chunksize=5, storage_options=storage_options).read(
+        local_data_dir / "indexed_files" / "parquet_list_single.txt"
+    ):
+        total_chunks += 1
+        assert len(frame) <= 5
+        total_len += len(frame)
+
+    assert total_chunks == 28
+    assert total_len == 131
+
+
+def test_indexed_csv_reader(storage_options, local_data_dir):
+    # Chunksize covers all the inputs.
+    total_chunks = 0
+    total_len = 0
+    for frame in IndexedCsvReader(chunksize=10_000, storage_options=storage_options).read(
+        local_data_dir / "indexed_files" / "csv_list_single.txt"
+    ):
+        total_chunks += 1
+        assert len(frame) == 131
+        total_len += len(frame)
+
+    assert total_chunks == 1
+    assert total_len == 131
+
+    # Requesting a very small chunksize. This will split up reads on the csv.
+    total_chunks = 0
+    total_len = 0
+    for frame in IndexedCsvReader(chunksize=5, storage_options=storage_options).read(
+        local_data_dir / "indexed_files" / "csv_list_single.txt"
+    ):
+        total_chunks += 1
+        assert len(frame) <= 5
+        total_len += len(frame)
+
+    assert total_chunks == 29
+    assert total_len == 131
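
For context beyond the diff, a minimal usage sketch of the reader API the new tests exercise: IndexedCsvReader.read() takes a text file listing input data files and yields chunks of at most chunksize rows. The chunksize, the index-file path, the placeholder ABFS credentials, and the assumption that each yielded chunk is a pandas DataFrame are illustrative here, not guaranteed by this diff.

import pandas as pd

from hipscat_import.catalog.file_readers import IndexedCsvReader

# Placeholder credentials for abfs:// paths listed in the index file;
# storage_options can be omitted when the index lists local files.
storage_options = {"account_name": "...", "account_key": "..."}

# Stream the indexed file list in chunks of at most 5,000 rows each.
reader = IndexedCsvReader(chunksize=5_000, storage_options=storage_options)
chunks = list(reader.read("tests/data/indexed_files/csv_list_single.txt"))

# Combine the chunks, assuming each one is a pandas DataFrame.
combined = pd.concat(chunks, ignore_index=True)
print(f"{len(chunks)} chunk(s), {len(combined)} rows total")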