Skip to content

Commit

Permalink
added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
epinzur committed Sep 21, 2023
1 parent ba36bfa commit 8547e92
Show file tree
Hide file tree
Showing 14 changed files with 426 additions and 7 deletions.
10 changes: 5 additions & 5 deletions python/pysrc/kaskada/_timestream.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,16 +584,16 @@ def select(self, *args: str) -> Timestream:
"""
return Timestream._call("select_fields", self, *args)

def substring(self, start: Optional[int], end: Optional[int] = None) -> Timestream:
def substring(self, start: Optional[int] = None, end: Optional[int] = None) -> Timestream:
"""Return a Timestream with a substring between the start and end indices.
Args:
start: The inclusive index to start at. `None` indicates the beginning
of the string. Negative indices count backwards from the end of
the string.
end: (optional) The exclusive index to end at. `None` indicates the
length of the string. Negative indices count backwards from the
end of the string.
end: The exclusive index to end at. `None` indicates the length of
the string. Negative indices count backwards from the end of
the string.
Notes:
Returns the substring starting at `start` (inclusive) up to but not
Expand All @@ -602,7 +602,7 @@ def substring(self, start: Optional[int], end: Optional[int] = None) -> Timestre
If the input is `null`, returns `null`. If `end` <= `start` an empty
string is returned.
"""
return Timestream._call("substring", self)
return Timestream._call("substring", self, start, end)

def remove(self, *args: str) -> Timestream:
"""Return a Timestream removing the given fields from `self`.
Expand Down
8 changes: 6 additions & 2 deletions python/pysrc/kaskada/sources/arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,10 @@ def __init__(
strings_can_be_null=True,
)

_parse_options = pyarrow.csv.ParseOptions(
escape_char="\\",
)

@staticmethod
async def create(
csv_string: Optional[str | BytesIO] = None,
Expand Down Expand Up @@ -277,7 +281,7 @@ async def create(
if schema is None:
if csv_string is None:
raise ValueError("Must provide schema or csv_string")
schema = pa.csv.read_csv(csv_string).schema
schema = pa.csv.read_csv(csv_string, parse_options=CsvString._parse_options).schema
csv_string.seek(0)

source = CsvString(
Expand All @@ -297,7 +301,7 @@ async def add_string(self, csv_string: str | BytesIO) -> None:
"""Add data to the source."""
if isinstance(csv_string, str):
csv_string = BytesIO(csv_string.encode("utf-8"))
content = pa.csv.read_csv(csv_string, convert_options=self._convert_options)
content = pa.csv.read_csv(csv_string, convert_options=self._convert_options, parse_options=CsvString._parse_options)
for batch in content.to_batches():
await self._ffi_table.add_pyarrow(batch)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{"_time":"1996-12-19T16:39:57.000000000","_key":"A","a":true,"is_valid":true}
{"_time":"1996-12-19T16:39:58.000000000","_key":"B","a":false,"is_valid":true}
{"_time":"1996-12-19T16:39:59.000000000","_key":"B","a":null,"is_valid":false}
{"_time":"1996-12-19T16:40:00.000000000","_key":"B","a":true,"is_valid":true}
{"_time":"1996-12-19T16:40:01.000000000","_key":"B","a":false,"is_valid":true}
{"_time":"1996-12-19T16:40:02.000000000","_key":"B","a":false,"is_valid":true}
{"_time":"1996-12-19T16:40:02.000000000","_key":"B","a":null,"is_valid":false}
6 changes: 6 additions & 0 deletions python/pytests/golden/is_valid_test/test_is_valid_f64.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"_time":"1996-12-19T16:39:57.000000000","_key":"A","m":5.2,"is_valid":true}
{"_time":"1996-12-19T16:39:58.000000000","_key":"B","m":24.3,"is_valid":true}
{"_time":"1996-12-19T16:39:59.000000000","_key":"A","m":17.6,"is_valid":true}
{"_time":"1996-12-19T16:40:00.000000000","_key":"A","m":null,"is_valid":false}
{"_time":"1996-12-19T16:40:01.000000000","_key":"A","m":12.4,"is_valid":true}
{"_time":"1996-12-19T16:40:02.000000000","_key":"A","m":null,"is_valid":false}
6 changes: 6 additions & 0 deletions python/pytests/golden/is_valid_test/test_is_valid_i64.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"_time":"1996-12-19T16:39:57.000000000","_key":"A","m":5.0,"is_valid":true}
{"_time":"1996-12-19T16:39:58.000000000","_key":"B","m":24.0,"is_valid":true}
{"_time":"1996-12-19T16:39:59.000000000","_key":"A","m":17.0,"is_valid":true}
{"_time":"1996-12-19T16:40:00.000000000","_key":"A","m":null,"is_valid":false}
{"_time":"1996-12-19T16:40:01.000000000","_key":"A","m":12.0,"is_valid":true}
{"_time":"1996-12-19T16:40:02.000000000","_key":"A","m":null,"is_valid":false}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"_time":"1996-12-19T16:39:57.000000000","_key":"A","is_valid":{"time":"1996-12-19T16:39:57.000000000","key":"A","n":2.0,"m":4.0,"other_time":"2003-12-19T16:39:57.000000000","fruit":"pear"}}
{"_time":"1996-12-19T16:39:58.000000000","_key":"B","is_valid":{"time":"1996-12-19T16:39:58.000000000","key":"B","n":4.0,"m":3.0,"other_time":"1994-11-19T16:39:57.000000000","fruit":"watermelon"}}
{"_time":"1996-12-19T16:39:59.000000000","_key":"B","is_valid":{"time":"1996-12-19T16:39:59.000000000","key":"B","n":5.0,"m":null,"other_time":"1998-12-19T16:39:57.000000000","fruit":"mango"}}
{"_time":"1996-12-19T16:40:00.000000000","_key":"B","is_valid":{"time":"1996-12-19T16:40:00.000000000","key":"B","n":null,"m":null,"other_time":"1992-12-19T16:39:57.000000000","fruit":null}}
{"_time":"1996-12-19T16:40:01.000000000","_key":"B","is_valid":{"time":"1996-12-19T16:40:01.000000000","key":"B","n":8.0,"m":8.0,"other_time":null,"fruit":null}}
{"_time":"1996-12-19T16:40:02.000000000","_key":"B","is_valid":{"time":"1996-12-19T16:40:02.000000000","key":"B","n":23.0,"m":11.0,"other_time":"1994-12-19T16:39:57.000000000","fruit":"mango"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"_time":"1996-12-19T16:39:57.000000000","_key":"A","s":"hEllo","is_valid":true}
{"_time":"1996-12-19T16:39:58.000000000","_key":"B","s":"World","is_valid":true}
{"_time":"1996-12-19T16:39:59.000000000","_key":"B","s":"hello world","is_valid":true}
{"_time":"1996-12-19T16:40:00.000000000","_key":"B","s":null,"is_valid":false}
{"_time":"1996-12-19T16:40:01.000000000","_key":"B","s":null,"is_valid":false}
{"_time":"1996-12-19T16:40:02.000000000","_key":"B","s":"goodbye","is_valid":true}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"_time":"1996-12-19T16:39:57.000000000","_key":"A","n":2.0,"is_valid":true}
{"_time":"1996-12-19T16:39:58.000000000","_key":"B","n":4.0,"is_valid":true}
{"_time":"1996-12-19T16:39:59.000000000","_key":"B","n":5.0,"is_valid":true}
{"_time":"1996-12-19T16:40:00.000000000","_key":"B","n":null,"is_valid":false}
{"_time":"1996-12-19T16:40:01.000000000","_key":"B","n":8.0,"is_valid":true}
{"_time":"1996-12-19T16:40:02.000000000","_key":"B","n":23.0,"is_valid":true}
6 changes: 6 additions & 0 deletions python/pytests/golden/len_test/test_len.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"_time":"1996-12-19T16:39:57.000000000","_key":"A","len":5.0}
{"_time":"1996-12-19T16:39:58.000000000","_key":"B","len":5.0}
{"_time":"1996-12-19T16:39:59.000000000","_key":"B","len":11.0}
{"_time":"1996-12-19T16:40:00.000000000","_key":"B","len":null}
{"_time":"1996-12-19T16:40:01.000000000","_key":"B","len":null}
{"_time":"1996-12-19T16:40:02.000000000","_key":"B","len":7.0}
6 changes: 6 additions & 0 deletions python/pytests/golden/substring_test/test_substring.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"_time":"1996-12-19T16:39:57.000000000","_key":"A","substring_0_2":"hE","substring_1":"Ello","substring_0_i":"","substring_i":"hEllo"}
{"_time":"1996-12-19T16:39:58.000000000","_key":"B","substring_0_2":"Wo","substring_1":"orld","substring_0_i":"World","substring_i":""}
{"_time":"1996-12-19T16:39:59.000000000","_key":"B","substring_0_2":"he","substring_1":"ello world","substring_0_i":"hello wor","substring_i":"ld"}
{"_time":"1996-12-19T16:40:00.000000000","_key":"B","substring_0_2":null,"substring_1":null,"substring_0_i":null,"substring_i":null}
{"_time":"1996-12-19T16:40:01.000000000","_key":"B","substring_0_2":null,"substring_1":null,"substring_0_i":null,"substring_i":null}
{"_time":"1996-12-19T16:40:02.000000000","_key":"B","substring_0_2":"go","substring_1":"oodbye","substring_0_i":"goodbye","substring_i":"goodbye"}
152 changes: 152 additions & 0 deletions python/pytests/is_valid_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import kaskada as kd
import pytest

@pytest.fixture(scope="module")
async def boolean_source() -> kd.sources.CsvString:
    """Build a module-scoped CSV source with boolean columns `a` and `b`.

    Some cells are left empty on purpose so that the `is_valid` tests see
    missing values. The last two rows share the same timestamp.
    """
    header = "time,key,a,b"
    rows = [
        "1996-12-19T16:39:57,A,true,true",
        "1996-12-19T16:39:58,B,false,false",
        "1996-12-19T16:39:59,B,,true",
        "1996-12-19T16:40:00,B,true,false",
        "1996-12-19T16:40:01,B,false,true",
        "1996-12-19T16:40:02,B,false,",
        "1996-12-19T16:40:02,B,,",
    ]
    csv_text = "\n".join([header, *rows])
    return await kd.sources.CsvString.create(
        csv_text, time_column="time", key_column="key"
    )

@pytest.fixture(scope="module")
async def f64_source() -> kd.sources.CsvString:
    """Build a module-scoped CSV source with decimal columns `m` and `n`.

    Several cells are left empty so that the `is_valid` tests see missing
    values in either column.
    """
    header = "time,key,m,n"
    rows = [
        "1996-12-19T16:39:57,A,5.2,10",
        "1996-12-19T16:39:58,B,24.3,3.9",
        "1996-12-19T16:39:59,A,17.6,6.2",
        "1996-12-19T16:40:00,A,,9.25",
        "1996-12-19T16:40:01,A,12.4,",
        "1996-12-19T16:40:02,A,,",
    ]
    csv_text = "\n".join([header, *rows])
    return await kd.sources.CsvString.create(
        csv_text, time_column="time", key_column="key"
    )

@pytest.fixture(scope="module")
async def i64_source() -> kd.sources.CsvString:
    """Build a module-scoped CSV source with whole-number columns `m` and `n`.

    Several cells are left empty so that the `is_valid` tests see missing
    values in either column.
    """
    header = "time,key,m,n"
    rows = [
        "1996-12-19T16:39:57,A,5,10",
        "1996-12-19T16:39:58,B,24,3",
        "1996-12-19T16:39:59,A,17,6",
        "1996-12-19T16:40:00,A,,9",
        "1996-12-19T16:40:01,A,12,",
        "1996-12-19T16:40:02,A,,",
    ]
    csv_text = "\n".join([header, *rows])
    return await kd.sources.CsvString.create(
        csv_text, time_column="time", key_column="key"
    )

@pytest.fixture(scope="module")
async def string_source() -> kd.sources.CsvString:
    """Build a module-scoped CSV source with string columns `s` and `t`.

    Column `n` carries small (possibly negative) integers alongside the
    strings. Some cells in every data column are left empty so the tests
    see missing values.
    """
    header = "time,key,s,n,t"
    rows = [
        '1996-12-19T16:39:57,A,"hEllo",0,"hEllo"',
        '1996-12-19T16:39:58,B,"World",5,"world"',
        '1996-12-19T16:39:59,B,"hello world",-2,"hello world"',
        '1996-12-19T16:40:00,B,,-2,"greetings"',
        '1996-12-19T16:40:01,B,,2,"salutations"',
        '1996-12-19T16:40:02,B,"goodbye",,',
    ]
    csv_text = "\n".join([header, *rows])
    return await kd.sources.CsvString.create(
        csv_text, time_column="time", key_column="key"
    )

@pytest.fixture(scope="module")
async def timestamp_ns_source() -> kd.sources.CsvString:
    """Build a module-scoped CSV source mixing numbers, timestamps and text.

    Besides `time` and `key`, each row has numeric columns `n` and `m`, a
    second timestamp column `other_time`, and a text column `fruit`. Each
    of those four columns has at least one empty cell.
    """
    header = "time,key,n,m,other_time,fruit"
    rows = [
        "1996-12-19T16:39:57,A,2,4,2003-12-19T16:39:57,pear",
        "1996-12-19T16:39:58,B,4,3,1994-11-19T16:39:57,watermelon",
        "1996-12-19T16:39:59,B,5,,1998-12-19T16:39:57,mango",
        "1996-12-19T16:40:00,B,,,1992-12-19T16:39:57,",
        "1996-12-19T16:40:01,B,8,8,,",
        "1996-12-19T16:40:02,B,23,11,1994-12-19T16:39:57,mango",
    ]
    csv_text = "\n".join([header, *rows])
    return await kd.sources.CsvString.create(
        csv_text, time_column="time", key_column="key"
    )

async def test_is_valid_boolean(boolean_source, golden) -> None:
    """Golden-check `is_valid()` on boolean column `a`, shown next to the input."""
    column = boolean_source.col("a")
    result = kd.record({"a": column, "is_valid": column.is_valid()})
    golden.jsonl(result)

async def test_is_valid_f64(f64_source, golden) -> None:
    """Golden-check `is_valid()` on decimal column `m`, shown next to the input."""
    column = f64_source.col("m")
    result = kd.record({"m": column, "is_valid": column.is_valid()})
    golden.jsonl(result)

async def test_is_valid_i64(i64_source, golden) -> None:
    """Golden-check `is_valid()` on whole-number column `m`, shown next to the input."""
    column = i64_source.col("m")
    result = kd.record({"m": column, "is_valid": column.is_valid()})
    golden.jsonl(result)

async def test_is_valid_string(string_source, golden) -> None:
    """Golden-check `is_valid()` on string column `s`, shown next to the input."""
    column = string_source.col("s")
    result = kd.record({"s": column, "is_valid": column.is_valid()})
    golden.jsonl(result)

async def test_is_valid_timestamp_ns(timestamp_ns_source, golden) -> None:
    """Golden-check `is_valid()` on column `n` of the timestamp source.

    NOTE(review): despite the test name, the column exercised here is the
    numeric `n`, not the timestamp column `other_time`. Switching columns
    would require regenerating the golden file, so confirm the intent
    before changing it.
    """
    column = timestamp_ns_source.col("n")
    result = kd.record({"n": column, "is_valid": column.is_valid()})
    golden.jsonl(result)

async def test_is_valid_record(timestamp_ns_source, golden) -> None:
    """Golden-check `is_valid()` applied to a whole record.

    Previously the source record itself was stored under the `is_valid`
    key and `is_valid()` was never called, so the golden output merely
    echoed the input rows and the function under test was not exercised.

    NOTE(review): the golden file for this test (presumably
    `golden/is_valid_test/test_is_valid_record.jsonl`) must be regenerated
    after this change, because the `is_valid` field no longer contains the
    nested input record.
    """
    golden.jsonl(
        kd.record(
            {
                "is_valid": timestamp_ns_source.is_valid(),
            }
        )
    )
Loading

0 comments on commit 8547e92

Please sign in to comment.