ref: Introduce a class for golden tests (#608)
This is much clearer than using a `format` argument to determine how
the golden file should be produced.

Change `json` to `jsonl` to reflect that it is line-delimited.
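
For illustration, a sketch of how a call site changes under this refactor, assembled from the diffs below (the `kt.record` expression is borrowed from aggregation_test.py; the `format=` keyword comes from the removed handler signature):

    # Before: a single callable; a `format` argument selected the output
    # format, defaulting to "json".
    golden(kt.record({"m": m, "sum_m": m.sum()}))
    golden(kt.record({"m": m, "sum_m": m.sum()}), format="csv")

    # After: one explicitly named method per format.
    golden.jsonl(kt.record({"m": m, "sum_m": m.sum()}))
    golden.csv(kt.record({"m": m, "sum_m": m.sum()}))
    golden.parquet(kt.record({"m": m, "sum_m": m.sum()}))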
bjchambers authored Aug 7, 2023
1 parent a1e1ca8 commit 4ba4936
Showing 25 changed files with 107 additions and 98 deletions.
4 changes: 2 additions & 2 deletions sparrow-py/pytests/aggregation_test.py
@@ -21,13 +21,13 @@ def source() -> kt.sources.CsvSource:
 def test_sum_unwindowed(source, golden) -> None:
     m = source["m"]
     n = source["n"]
-    golden(kt.record({"m": m, "sum_m": m.sum(), "n": n, "sum_n": n.sum()}))
+    golden.jsonl(kt.record({"m": m, "sum_m": m.sum(), "n": n, "sum_n": n.sum()}))


 def test_sum_windowed(source, golden) -> None:
     m = source["m"]
     n = source["n"]
-    golden(
+    golden.jsonl(
         kt.record(
             {
                 "m": m,
6 changes: 3 additions & 3 deletions sparrow-py/pytests/collect_test.py
@@ -21,7 +21,7 @@ def source() -> kt.sources.CsvSource:
 def test_collect_basic(source, golden) -> None:
     m = source["m"]
     n = source["n"]
-    golden(
+    golden.jsonl(
         kt.record(
             {
                 "m": m,
@@ -36,7 +36,7 @@ def test_collect_basic(source, golden) -> None:
 def test_collect_with_max(source, golden) -> None:
     m = source["m"]
     n = source["n"]
-    golden(
+    golden.jsonl(
         kt.record(
             {
                 "m": m,
@@ -50,4 +50,4 @@ def test_collect_with_max(source, golden) -> None:

 def test_collect_since_window(source, golden) -> None:
     m = source["m"]
-    golden(kt.record({"m": m, "since_m": m.sum(window=kt.SinceWindow(m > 10))}))
+    golden.jsonl(kt.record({"m": m, "since_m": m.sum(window=kt.SinceWindow(m > 10))}))
169 changes: 91 additions & 78 deletions sparrow-py/pytests/conftest.py
@@ -17,89 +17,102 @@ def pytest_addoption(parser: pytest.Parser):
     parser.addoption("--save-golden", action="store_true", help="update golden files")


-@pytest.fixture
-def golden(request: pytest.FixtureRequest, pytestconfig: pytest.Config):  # noqa: C901
-    """Test fixture for checking results against a golden file."""
-    output = 0
-
-    def handler(
-        query: Union[kt.Timestream, pd.DataFrame],
-        format: Union[Literal["csv"], Literal["parquet"], Literal["json"]] = "json",
-    ):
+class GoldenFixture(object):
+    def __init__(self, dirname: str, test_name: str, save: bool):
+        self._output = 0
+        self._dirname = dirname
+        self._test_name = test_name
+        self._save = save
+
+    def csv(self, data: Union[kt.Timestream, pd.DataFrame]) -> None:
         """
-        Check query results against a golden file.
-
-        Parameters
-        ----------
-        query : kt.Timestream to execute or pd.DataFrame
-            The query to run (or a result to use).
-        format : str, optional
-            The format to store the golden file in.
-            Defaults to "json".
-
-        Raises
-        ------
-        ValueError
-            If the `format` is not recognized.
+        Golden test against a CSV file.
+
+        Uses Pandas to save and load the CSV file. The schema of the `data`
+        is used to align types.
         """
-        nonlocal output
-
-        if isinstance(query, pd.DataFrame):
-            df = query
-        elif isinstance(query, kt.Timestream):
-            df = query.run().to_pandas()
-        else:
-            raise ValueError(
-                f"query must be a Timestream or a DataFrame, was {type(query)}"
-            )
-
-        test_name = request.node.name
-        module_name = request.node.module.__name__
-        dirname = os.path.join("pytests", "golden", module_name)
+        df = _data_to_dataframe(data)
+        filename = self._filename("csv")
+
+        # Save the CSV file, if requested.
+        if self._save:
+            df.to_csv(filename, index=False)
+
+        # Load the CSV file, using the schema of the data to set expected types.
+        dtypes = {}
+        parse_dates = []
+        for name, dtype in df.dtypes.to_dict().items():
+            if pd.api.types.is_datetime64_dtype(dtype):
+                parse_dates.append(name)
+            else:
+                dtypes[name] = dtype
+        golden = pd.read_csv(filename, dtype=dtypes, parse_dates=parse_dates)
+
+        pd.testing.assert_frame_equal(df, golden)
+
+    def jsonl(self, data: Union[kt.Timestream, pd.DataFrame]) -> None:
+        """Golden test against a newline-delimited JSON (JSON Lines) file."""
+        df = _data_to_dataframe(data)
+        filename = self._filename("jsonl")
+
+        if self._save:
+            df.to_json(filename, orient="records", lines=True, date_unit="ns")
+
+        golden = pd.read_json(
+            filename,
+            orient="records",
+            lines=True,
+            dtype=df.dtypes.to_dict(),
+            date_unit="ns",
+        )
+
+        pd.testing.assert_frame_equal(df, golden)
+
+    def parquet(self, data: Union[kt.Timestream, pd.DataFrame]) -> None:
+        """Golden test against a Parquet file."""
+        df = _data_to_dataframe(data)
+        filename = self._filename("parquet")
+
+        if self._save:
+            df.to_parquet(filename)
+
+        golden = pd.read_parquet(filename)
+
+        pd.testing.assert_frame_equal(df, golden)
+
+    def _filename(self, suffix: str) -> str:
         filename = (
-            f"{test_name}.{format}" if output == 0 else f"{test_name}_{output}.{format}"
+            f"{self._test_name}.{suffix}"
+            if self._output == 0
+            else f"{self._test_name}_{self._output}.{suffix}"
         )
-        filename = os.path.join(dirname, filename)
-        output += 1
-
-        save = pytestconfig.getoption("--save-golden", default=False)
-
-        if save:
-            os.makedirs(dirname, exist_ok=True)
-            if format == "csv":
-                df.to_csv(filename, index=False)
-            elif format == "parquet":
-                df.to_parquet(filename)
-            elif format == "json":
-                df.to_json(filename, orient="records", lines=True, date_unit="ns")
-            else:
-                raise ValueError(f"Unknown format {format}")
-        else:
+        filename = os.path.join(self._dirname, filename)
+        self._output += 1
+
+        if not self._save:
             assert os.path.exists(
                 filename
             ), f"Golden file {filename} does not exist. Run with `--save-golden` to create it."
+        return filename
+
+
+def _data_to_dataframe(data: Union[kt.Timestream, pd.DataFrame]) -> pd.DataFrame:
+    if isinstance(data, pd.DataFrame):
+        return data
+    elif isinstance(data, kt.Timestream):
+        return data.run().to_pandas()
+    else:
+        raise ValueError(f"data must be a Timestream or a DataFrame, was {type(data)}")
+
+
+@pytest.fixture
+def golden(request: pytest.FixtureRequest, pytestconfig: pytest.Config) -> GoldenFixture:
+    """Test fixture for checking results against a golden file."""
+    test_name = request.node.name
+    module_name = request.node.module.__name__
+    dirname = os.path.join("pytests", "golden", module_name)
+
+    save = pytestconfig.getoption("--save-golden", default=False)
+    if save:
+        os.makedirs(dirname, exist_ok=True)
+    else:
+        assert os.path.isdir(
+            dirname
+        ), f"Golden directory {dirname} does not exist. Run with `--save-golden` to create it."

-        if format == "csv":
-            dtypes = {}
-            parse_dates = []
-            for name, dtype in df.dtypes.to_dict().items():
-                if pd.api.types.is_datetime64_dtype(dtype):
-                    parse_dates.append(name)
-                else:
-                    dtypes[name] = dtype
-            correct = pd.read_csv(filename, dtype=dtypes, parse_dates=parse_dates)
-        elif format == "parquet":
-            correct = pd.read_parquet(filename)
-        elif format == "json":
-            correct = pd.read_json(
-                filename,
-                orient="records",
-                lines=True,
-                dtype=df.dtypes.to_dict(),
-                date_unit="ns",
-            )
-        else:
-            raise ValueError(f"Unknown format {format}")
-        pd.testing.assert_frame_equal(df, correct)
-
-    return handler
+    return GoldenFixture(dirname, test_name, save)
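
For context, here is a sketch of how a test consumes the new fixture (the test name and record contents are hypothetical; `source` stands for a fixture like those defined in the test files below):

    def test_example(source, golden) -> None:
        m = source["m"]
        # Compares against pytests/golden/<module>/test_example.jsonl;
        # with --save-golden, (re)writes that file instead of comparing.
        golden.jsonl(kt.record({"m": m, "sum_m": m.sum()}))
        # A second call in the same test is numbered by `_filename`:
        # pytests/golden/<module>/test_example_1.jsonl.
        golden.jsonl(kt.record({"m": m}))

Golden files are regenerated by running pytest with the `--save-golden` option registered in `pytest_addoption`.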

This file was deleted.

2 changes: 1 addition & 1 deletion sparrow-py/pytests/math_test.py
@@ -21,7 +21,7 @@ def source_int64() -> kt.sources.CsvSource:
 def test_math_int64(golden, source_int64) -> None:
     m = source_int64["m"]
     n = source_int64["n"]
-    golden(
+    golden.jsonl(
         kt.record(
             {
                 "m": m,
8 changes: 4 additions & 4 deletions sparrow-py/pytests/record_test.py
@@ -22,7 +22,7 @@ def test_record(source, golden) -> None:
     m = source["m"]
     n = source["n"]

-    golden(
+    golden.jsonl(
         kt.record(
             {
                 "m": m,
@@ -35,12 +35,12 @@ def test_record(source, golden) -> None:
 def test_extend_record(source, golden) -> None:
     m = source["m"]
     n = source["n"]
-    golden(source.extend({"add": m + n}))
+    golden.jsonl(source.extend({"add": m + n}))


 def test_select_record(source, golden) -> None:
-    golden(source.select("n"))
+    golden.jsonl(source.select("n"))


 def test_remove_record(source, golden) -> None:
-    golden(source.remove("n"))
+    golden.jsonl(source.remove("n"))
4 changes: 2 additions & 2 deletions sparrow-py/pytests/result_test.py
@@ -22,8 +22,8 @@ def test_iterate_pandas(golden, source_int64) -> None:
     results = source_int64.run(row_limit=4, max_batch_size=2).iter_pandas()

     # 4 rows, max 2 per batch = 2 batches
-    golden(next(results))
-    golden(next(results))
+    golden.jsonl(next(results))
+    golden.jsonl(next(results))
     with pytest.raises(StopIteration):
         next(results)

4 changes: 2 additions & 2 deletions sparrow-py/pytests/source_test.py
@@ -65,7 +65,7 @@ def test_add_dataframe(golden) -> None:
     dataset1 = pd.DataFrame(records)

     table = kt.sources.ArrowSource("time", "key", dataset1)
-    golden(table)
+    golden.jsonl(table)

     records.clear()
     for member_id in member_ids:
@@ -79,4 +79,4 @@ def test_add_dataframe(golden) -> None:
     )
     dataset2 = pd.DataFrame(records)
     table.add(dataset2)
-    golden(table)
+    golden.jsonl(table)
4 changes: 2 additions & 2 deletions sparrow-py/pytests/timestream_test.py
@@ -111,7 +111,7 @@ def test_timestream_preview(golden) -> None:
     )
     source = kt.sources.CsvSource("time", "key", content)

-    golden(source.preview(limit=4))
+    golden.jsonl(source.preview(limit=4))

 def test_timestream_run_non_record(golden) -> None:
     content = "\n".join(
@@ -126,4 +126,4 @@ def test_timestream_run_non_record(golden) -> None:
         ]
     )
     source = kt.sources.CsvSource("time", "key", content)
-    golden(source["m"])
+    golden.jsonl(source["m"])
