Skip to content

Commit

Permalink
add safety around parsing empty CSV; close #414
Browse files Browse the repository at this point in the history
  • Loading branch information
vreuter committed Jan 17, 2025
1 parent 6ccd6ea commit a0682cf
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 1 deletion.
3 changes: 2 additions & 1 deletion looptrace/ImageHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from looptrace.filepaths import SPOT_IMAGES_SUBFOLDER, FilePathLike, FolderPathLike, get_analysis_path, simplify_path
from looptrace.image_io import ignore_path, NPZ_wrapper
from looptrace.numeric_types import NumberLike
from looptrace.utilities import read_csv_maybe_empty

__author__ = "Kai Sandvold Beckwith"
__credits__ = ["Kai Sandvold Beckwith", "Vince Reuter"]
Expand Down Expand Up @@ -481,7 +482,7 @@ def zarr_conversions(self) -> Mapping[str, str]:
def load_tables(self):
# TODO: the CSV parse needs to depend on whether the first column really is the index or not.
# See: https://github.com/gerlichlab/looptrace/issues/261
parsers = {".csv": pd.read_csv, ".pkl": pd.read_pickle}
parsers = {".csv": read_csv_maybe_empty, ".pkl": pd.read_pickle}
try:
table_files = os.scandir(self.analysis_path)
except FileNotFoundError:
Expand Down
14 changes: 14 additions & 0 deletions looptrace/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@

from collections import Counter
import functools
import logging
from os import PathLike
from pathlib import Path
from typing import Callable, Iterable, Mapping, Optional, ParamSpec, TypeAlias, TypeVar
from expression import Option, Result, compose, curry_flip, snd
from expression.collections import Seq
from expression import result
from numpydoc_decorator import doc
import pandas as pd

_A = TypeVar("_A")
_B = TypeVar("_B")
Expand All @@ -17,6 +21,8 @@

_Exception = TypeVar("_Exception", bound=Exception)

# Types accepted as the file argument of read_csv_maybe_empty
CsvReadable: TypeAlias = str | Path | PathLike[str]


# Courtesy of @Hugovdberg in Issues discussion on dbrattli/Expression repo
@curry_flip(1)
Expand Down Expand Up @@ -113,6 +119,14 @@ def list_from_object(obj: object) -> Result[list[object], str]:
return list(obj)


def read_csv_maybe_empty(f: CsvReadable) -> pd.DataFrame:
try:
return pd.read_csv(f)
except pd.errors.EmptyDataError:
logging.info("Empty CSV read target: %s", f)
return pd.DataFrame()


@curry_flip(1)
def traverse_through_either(inputs: Iterable[_A], f: Callable[[_A], Result[_B, _E]]) -> Result[Seq[_B], Seq[_E]]:
State: TypeAlias = Result[Seq[_B], Seq[_E]]
Expand Down
85 changes: 85 additions & 0 deletions tests/test_read_csv_maybe_empty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""Tests for the functionality of the maybe-empty CSV reader utility function"""

import pathlib
import string
from typing import Iterable, TypeAlias

import hypothesis as hyp
from hypothesis import strategies as st
from hypothesis.extra import pandas as hyp_pd
from hypothesis.extra.pandas import column as PandasColumn, data_frames
from hypothesis.strategies import SearchStrategy
import pandas as pd
from pandas.testing import assert_frame_equal
import pytest

from looptrace.utilities import read_csv_maybe_empty


# A collection of column names for a data frame
Colnames: TypeAlias = list[str]
# Either a number of columns, or the explicit column names
ColumnsSpecLike: TypeAlias = int | Colnames


def gen_colnames(gen_one: SearchStrategy[str] = st.text()) -> SearchStrategy[Colnames]:
    """Build a strategy yielding a nonempty list of distinct column names.

    Each name is drawn from ``gen_one``; drawing them as a set keeps them unique.
    """
    unique_names = st.sets(gen_one, min_size=1)
    return unique_names.map(list)


def gen_col_spec_like(gen_one: SearchStrategy[str] = st.text()) -> SearchStrategy[ColumnsSpecLike]:
    """Build a strategy yielding either explicit column names or a column count from 1 to 10."""
    by_names = gen_colnames(gen_one)
    by_count = st.integers(min_value=1, max_value=10)
    return st.one_of(by_names, by_count)


def gen_colspec_and_dtype(gen_one_name: SearchStrategy[str] = st.text()) -> SearchStrategy[tuple[ColumnsSpecLike, type]]:
    """Build a strategy yielding a pair of column spec (count or names) and a data type."""
    # Peg to int for now, as using general scalar_dtypes() was causing generation problems.
    dtype_strategy = st.just(int)
    colspec_strategy = gen_col_spec_like(gen_one_name)
    return st.tuples(colspec_strategy, dtype_strategy)


def gen_columns(gen_one_name: SearchStrategy[str] = st.text()) -> SearchStrategy[Iterable[PandasColumn]]:
    """Build a strategy yielding the columns with which to populate a pandas DataFrame."""

    def build_columns(spec_and_dtype):
        # First component is the count-or-names spec, second is the dtype.
        colspec, dtype = spec_and_dtype
        return hyp_pd.columns(names_or_number=colspec, dtype=dtype)

    return gen_colspec_and_dtype(gen_one_name).map(build_columns)


def gen_random_frame(gen_one_name: SearchStrategy[str] = st.text()) -> SearchStrategy[pd.DataFrame]:
    """Build a strategy yielding a pandas DataFrame over randomly generated columns."""

    def frames_over(cols):
        return data_frames(columns=cols)

    return gen_columns(gen_one_name).flatmap(frames_over)


@pytest.fixture
def tmp_file(tmp_path: pathlib.Path) -> pathlib.Path:
    """Provide the path ``table.csv`` inside the ``tmp_path`` folder; the file itself is not created here."""
    return tmp_path.joinpath("table.csv")


@hyp.given(frame=gen_random_frame())
@hyp.settings(suppress_health_check=(hyp.HealthCheck.function_scoped_fixture, ))
def test_equivalence_to_read_csv_when_input_is_not_empty(tmp_file, frame: pd.DataFrame):
    """A frame written to disk must parse the same through the custom reader as through pandas."""
    frame.to_csv(tmp_file)
    expected = pd.read_csv(tmp_file)
    observed = read_csv_maybe_empty(tmp_file)
    assert_frame_equal(observed, expected)


def test_empty_frame_with_no_columns_results_when_file_is_totally_empty(tmp_file):
    """A zero-byte file must parse to a frame with neither rows nor columns."""
    tmp_file.touch()
    parsed = read_csv_maybe_empty(tmp_file)
    assert parsed.empty
    assert parsed.columns.tolist() == []


# Prevent edge cases due to single empty column name, or cases where column names are numbers and parsed as data.
@hyp.given(colnames=gen_colnames(st.text(min_size=1, alphabet=string.ascii_letters)))
@hyp.settings(suppress_health_check=(hyp.HealthCheck.function_scoped_fixture, ))
def test_empty_frame_with_correct_columns_results_when_input_is_file_with_just_header(tmp_file, colnames):
    """A file holding only a header line must parse to a rowless frame with exactly those columns."""
    header_line = ",".join(colnames)
    tmp_file.write_text(header_line)
    parsed = read_csv_maybe_empty(tmp_file)
    assert parsed.empty
    assert parsed.columns.tolist() == colnames

0 comments on commit a0682cf

Please sign in to comment.