add safety around parsing empty CSV; close #414

gerlichlab · Jan 17, 2025 · a0682cf · a0682cf
1 parent 6ccd6ea
commit a0682cf
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 1 deletion.
diff --git a/looptrace/ImageHandler.py b/looptrace/ImageHandler.py
@@ -27,6 +27,7 @@
 from looptrace.filepaths import SPOT_IMAGES_SUBFOLDER, FilePathLike, FolderPathLike, get_analysis_path, simplify_path
 from looptrace.image_io import ignore_path, NPZ_wrapper
 from looptrace.numeric_types import NumberLike
+from looptrace.utilities import read_csv_maybe_empty
 
 __author__ = "Kai Sandvold Beckwith"
 __credits__ = ["Kai Sandvold Beckwith", "Vince Reuter"]
@@ -481,7 +482,7 @@ def zarr_conversions(self) -> Mapping[str, str]:
     def load_tables(self):
         # TODO: the CSV parse needs to depend on whether the first column really is the index or not.
         # See: https://github.com/gerlichlab/looptrace/issues/261
-        parsers = {".csv": pd.read_csv, ".pkl": pd.read_pickle}
+        parsers = {".csv": read_csv_maybe_empty, ".pkl": pd.read_pickle}
         try:
             table_files = os.scandir(self.analysis_path)
         except FileNotFoundError:

diff --git a/looptrace/utilities.py b/looptrace/utilities.py
@@ -2,11 +2,15 @@
 
 from collections import Counter
 import functools
+import logging
+from os import PathLike
+from pathlib import Path
 from typing import Callable, Iterable, Mapping, Optional, ParamSpec, TypeAlias, TypeVar
 from expression import Option, Result, compose, curry_flip, snd
 from expression.collections import Seq
 from expression import result
 from numpydoc_decorator import doc
+import pandas as pd
 
 _A = TypeVar("_A")
 _B = TypeVar("_B")
@@ -17,6 +21,8 @@
 
 _Exception = TypeVar("_Exception", bound=Exception)
 
+# Types accepted for the read target of read_csv_maybe_empty.
+CsvReadable: TypeAlias = str | Path | PathLike[str]
+
 
 # Courtesy of @Hugovdberg in Issues discussion on dbrattli/Expression repo
 @curry_flip(1)
@@ -113,6 +119,14 @@ def list_from_object(obj: object) -> Result[list[object], str]:
     return list(obj)
 
 
+def read_csv_maybe_empty(f: CsvReadable) -> pd.DataFrame:
+    """Read a CSV file with pandas, tolerating a completely empty file.
+
+    The target is passed to ``pd.read_csv`` with default settings. If pandas
+    raises ``pd.errors.EmptyDataError``, the event is logged at INFO level
+    and a ``pd.DataFrame()`` with no rows and no columns is returned
+    instead. Any other exception propagates unchanged.
+    """
+    try:
+        return pd.read_csv(f)
+    except pd.errors.EmptyDataError:
+        # Logged through the module-level ``logging.info`` call (root logger).
+        logging.info("Empty CSV read target: %s", f)
+        return pd.DataFrame()
+
+
 @curry_flip(1)
 def traverse_through_either(inputs: Iterable[_A], f: Callable[[_A], Result[_B, _E]]) -> Result[Seq[_B], Seq[_E]]:
     State: TypeAlias = Result[Seq[_B], Seq[_E]]

diff --git a/tests/test_read_csv_maybe_empty.py b/tests/test_read_csv_maybe_empty.py
@@ -0,0 +1,85 @@
+"""Tests for the functionality of the maybe-empty CSV reader utility function"""
+
+import pathlib
+import string
+from typing import Iterable, TypeAlias
+
+import hypothesis as hyp
+from hypothesis import strategies as st
+from hypothesis.extra import pandas as hyp_pd
+from hypothesis.extra.pandas import column as PandasColumn, data_frames
+from hypothesis.strategies import SearchStrategy
+import pandas as pd
+from pandas.testing import assert_frame_equal
+import pytest
+
+from looptrace.utilities import read_csv_maybe_empty
+
+
+# A list of column names.
+Colnames: TypeAlias = list[str]
+# Either a number of columns, or an explicit list of column names.
+ColumnsSpecLike: TypeAlias = int | Colnames
+
+
+def gen_colnames(gen_one: SearchStrategy[str] = st.text()) -> SearchStrategy[Colnames]:
+    """Generate a nonempty list of distinct column names.
+
+    Names are drawn from ``gen_one`` into a set of at least one element
+    (so there are no duplicates), which is then converted to a list.
+    """
+    return st.sets(gen_one, min_size=1).map(list)
+
+
+def gen_col_spec_like(gen_one: SearchStrategy[str] = st.text()) -> SearchStrategy[ColumnsSpecLike]:
+    """Generate either a number of columns, or a sequence of column names.
+
+    A generated number lies between 1 and 10 (the bounds given to
+    ``st.integers``); generated names come from ``gen_colnames(gen_one)``.
+    """
+    return st.one_of(gen_colnames(gen_one), st.integers(min_value=1, max_value=10))
+
+
+def gen_colspec_and_dtype(gen_one_name: SearchStrategy[str] = st.text()) -> SearchStrategy[tuple[ColumnsSpecLike, type]]:
+    """Generate either a column count or sequence of column names, along with a data type.
+
+    The result is a 2-tuple ``(column spec, dtype)``; the dtype is currently always ``int``.
+    """
+    return st.tuples(
+        gen_col_spec_like(gen_one_name), 
+        st.just(int), # Peg to int for now, as using general scalar_dtypes() was causing generation problems.
+    )
+
+
+def gen_columns(gen_one_name: SearchStrategy[str] = st.text()) -> SearchStrategy[Iterable[PandasColumn]]:
+    """Generate a collection of columns with which to populate a pandas DataFrame.
+
+    Each generated ``(column spec, dtype)`` pair is handed to
+    ``hypothesis.extra.pandas.columns`` as ``names_or_number`` and ``dtype``.
+    """
+    return gen_colspec_and_dtype(gen_one_name).map(
+        lambda spec_and_dtype: hyp_pd.columns(
+            names_or_number=spec_and_dtype[0],
+            dtype=spec_and_dtype[1],
+        )
+    )
+
+
+def gen_random_frame(gen_one_name: SearchStrategy[str] = st.text()) -> SearchStrategy[pd.DataFrame]:
+    """Generate a random pandas DataFrame.
+
+    Columns are first drawn from ``gen_columns(gen_one_name)``, then used to
+    build a ``data_frames`` strategy (hence the ``flatmap``).
+    """
+    return gen_columns(gen_one_name).flatmap(lambda cols: data_frames(columns=cols))
+
+
+@pytest.fixture
+def tmp_file(tmp_path: pathlib.Path) -> pathlib.Path:
+    """Provide the path ``table.csv`` inside pytest's ``tmp_path``; the fixture itself does not create the file."""
+    return tmp_path / "table.csv"
+
+
+@hyp.given(frame=gen_random_frame())
+@hyp.settings(suppress_health_check=(hyp.HealthCheck.function_scoped_fixture, ))
+def test_equivalence_to_read_csv_when_input_is_not_empty(tmp_file, frame: pd.DataFrame):
+    """For a file written by ``DataFrame.to_csv``, the wrapper must parse exactly as ``pd.read_csv`` does."""
+    # The file is rewritten on every generated example before being read back.
+    frame.to_csv(tmp_file)
+    from_custom = read_csv_maybe_empty(tmp_file)
+    from_pandas = pd.read_csv(tmp_file)
+    assert_frame_equal(from_custom, from_pandas)
+
+
+def test_empty_frame_with_no_columns_results_when_file_is_totally_empty(tmp_file):
+    """A file created by ``touch`` and never written to must yield an empty frame with no columns."""
+    tmp_file.touch()
+    from_custom = read_csv_maybe_empty(tmp_file)
+    assert from_custom.empty
+    assert list(from_custom.columns) == []
+
+
+# Prevent edge cases due to single empty column name, or cases where column names are numbers and parsed as data.
+@hyp.given(colnames=gen_colnames(st.text(min_size=1, alphabet=string.ascii_letters)))
+@hyp.settings(suppress_health_check=(hyp.HealthCheck.function_scoped_fixture, ))
+def test_empty_frame_with_correct_columns_results_when_input_is_file_with_just_header(tmp_file, colnames):
+    """A file holding only a comma-joined header must yield an empty frame whose columns match the header, in order."""
+    # Only the header is written: no data rows and no trailing newline.
+    with tmp_file.open(mode='w') as fh:
+        fh.write(",".join(colnames))
+    from_custom = read_csv_maybe_empty(tmp_file)
+    assert from_custom.empty
+    assert list(from_custom.columns) == colnames