Tests for indexed readers
delucchi-cmu committed Jun 10, 2024
1 parent 4b2e63b commit da0a69c
Showing 4 changed files with 68 additions and 1 deletion.
3 changes: 2 additions & 1 deletion tests/conftest.py
@@ -1,4 +1,5 @@
 import os
+from pathlib import Path
 
 import hipscat as hc
 import lsdb
@@ -52,7 +53,7 @@ def storage_options(cloud):
 @pytest.fixture
 def local_data_dir():
     local_data_path = os.path.dirname(__file__)
-    return os.path.join(local_data_path, "data")
+    return Path(local_data_path) / "data"
 
 
 @pytest.fixture
5 changes: 5 additions & 0 deletions tests/data/indexed_files/csv_list_single.txt
@@ -0,0 +1,5 @@
abfs://hipscat/pytests/hipscat_import/data/small_sky_parts/catalog_00_of_05.csv
abfs://hipscat/pytests/hipscat_import/data/small_sky_parts/catalog_01_of_05.csv
abfs://hipscat/pytests/hipscat_import/data/small_sky_parts/catalog_02_of_05.csv
abfs://hipscat/pytests/hipscat_import/data/small_sky_parts/catalog_03_of_05.csv
abfs://hipscat/pytests/hipscat_import/data/small_sky_parts/catalog_04_of_05.csv
4 changes: 4 additions & 0 deletions tests/data/indexed_files/parquet_list_single.txt
@@ -0,0 +1,4 @@
abfs://hipscat/pytests/hipscat/data/small_sky_order1/Norder=1/Dir=0/Npix=44.parquet
abfs://hipscat/pytests/hipscat/data/small_sky_order1/Norder=1/Dir=0/Npix=45.parquet
abfs://hipscat/pytests/hipscat/data/small_sky_order1/Norder=1/Dir=0/Npix=46.parquet
abfs://hipscat/pytests/hipscat/data/small_sky_order1/Norder=1/Dir=0/Npix=47.parquet
57 changes: 57 additions & 0 deletions tests/hipscat_import/test_input_readers.py
@@ -0,0 +1,57 @@
from hipscat_import.catalog.file_readers import IndexedCsvReader, IndexedParquetReader


def test_indexed_parquet_reader(storage_options, local_data_dir):
    # Chunksize covers all the inputs.
    total_chunks = 0
    total_len = 0
    for frame in IndexedParquetReader(chunksize=10_000, storage_options=storage_options).read(
        local_data_dir / "indexed_files" / "parquet_list_single.txt"
    ):
        total_chunks += 1
        assert len(frame) == 131
        total_len += len(frame)

    assert total_chunks == 1
    assert total_len == 131

    # Requesting a very small chunksize. This will split up reads on the parquet.
    total_chunks = 0
    total_len = 0
    for frame in IndexedParquetReader(chunksize=5, storage_options=storage_options).read(
        local_data_dir / "indexed_files" / "parquet_list_single.txt"
    ):
        total_chunks += 1
        assert len(frame) <= 5
        total_len += len(frame)

    assert total_chunks == 28
    assert total_len == 131


def test_indexed_csv_reader(storage_options, local_data_dir):
    # Chunksize covers all the inputs.
    total_chunks = 0
    total_len = 0
    for frame in IndexedCsvReader(chunksize=10_000, storage_options=storage_options).read(
        local_data_dir / "indexed_files" / "csv_list_single.txt"
    ):
        total_chunks += 1
        assert len(frame) == 131
        total_len += len(frame)

    assert total_chunks == 1
    assert total_len == 131

    # Requesting a very small chunksize. This will split up reads on the CSV files.
    total_chunks = 0
    total_len = 0
    for frame in IndexedCsvReader(chunksize=5, storage_options=storage_options).read(
        local_data_dir / "indexed_files" / "csv_list_single.txt"
    ):
        total_chunks += 1
        assert len(frame) <= 5
        total_len += len(frame)

    assert total_chunks == 29
    assert total_len == 131
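
For reference, a minimal usage sketch (not part of this commit) of how an indexed list file is consumed outside the test suite. The IndexedCsvReader class, its chunksize argument, and the read() call are the same ones exercised in the tests above; the local index file name is hypothetical, and it is assumed that storage_options can be omitted when the listed CSV paths are local.

from hipscat_import.catalog.file_readers import IndexedCsvReader

# Hypothetical index file: one CSV path per line, mirroring csv_list_single.txt.
index_file = "my_csv_list.txt"

total_rows = 0
for frame in IndexedCsvReader(chunksize=1_000).read(index_file):
    # Each yielded frame holds at most `chunksize` rows drawn from the listed files.
    total_rows += len(frame)
print(total_rows)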
