From ae9ebaffeb4a1e361e645a65afff3640f30b0481 Mon Sep 17 00:00:00 2001 From: Ben Chambers <35960+bjchambers@users.noreply.github.com> Date: Wed, 9 Aug 2023 21:55:56 -0700 Subject: [PATCH] fix: allow null strings in CSVs (#646) --- sparrow-py/pysrc/sparrow_py/sources/arrow.py | 3 ++- sparrow-py/pytests/conftest.py | 4 +++- .../pytests/golden/length_test/test_length.jsonl | 12 ++++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/sparrow-py/pysrc/sparrow_py/sources/arrow.py b/sparrow-py/pysrc/sparrow_py/sources/arrow.py index e6cb9e10b..87a91852c 100644 --- a/sparrow-py/pysrc/sparrow_py/sources/arrow.py +++ b/sparrow-py/pysrc/sparrow_py/sources/arrow.py @@ -119,7 +119,8 @@ def __init__( super().__init__(schema, **kwargs) self._convert_options = pyarrow.csv.ConvertOptions( - column_types=schema + column_types=schema, + strings_can_be_null=True, ) self.add_string(csv_string) diff --git a/sparrow-py/pytests/conftest.py b/sparrow-py/pytests/conftest.py index f9434069e..8b54facfa 100644 --- a/sparrow-py/pytests/conftest.py +++ b/sparrow-py/pytests/conftest.py @@ -40,7 +40,9 @@ def csv(self, data: Union[kt.Timestream, pd.DataFrame]) -> None: # Load the CSV file. Use the schema of the data to set expected types. golden = pa.read_csv(filename, - convert_options = pa.csv.ConvertOptions(column_types=data.schema)) + convert_options = pa.csv.ConvertOptions( + column_types=data.schema, + strings_can_be_null=True,)) pd.testing.assert_frame_equal(df, golden) diff --git a/sparrow-py/pytests/golden/length_test/test_length.jsonl b/sparrow-py/pytests/golden/length_test/test_length.jsonl index 6497eb233..c17bf7583 100644 --- a/sparrow-py/pytests/golden/length_test/test_length.jsonl +++ b/sparrow-py/pytests/golden/length_test/test_length.jsonl @@ -1,6 +1,6 @@ -{"_time":"1996-12-19T16:39:57.000","_subsort":0,"_key_hash":12960666915911099378,"_key":"A","str":"apple","len_key":5,"list":["apple"],"len_list":1} -{"_time":"1996-12-19T16:39:58.000","_subsort":1,"_key_hash":2867199309159137213,"_key":"B","str":"dog","len_key":3,"list":["dog"],"len_list":1} -{"_time":"1996-12-19T16:39:59.000","_subsort":2,"_key_hash":12960666915911099378,"_key":"A","str":"carrot","len_key":6,"list":["apple","carrot"],"len_list":2} -{"_time":"1996-12-19T16:40:00.000","_subsort":3,"_key_hash":12960666915911099378,"_key":"A","str":"","len_key":0,"list":["apple","carrot",""],"len_list":3} -{"_time":"1996-12-19T16:40:01.000","_subsort":4,"_key_hash":12960666915911099378,"_key":"A","str":"eggplant","len_key":8,"list":["apple","carrot","","eggplant"],"len_list":4} -{"_time":"1996-12-19T16:40:02.000","_subsort":5,"_key_hash":12960666915911099378,"_key":"A","str":"fig","len_key":3,"list":["apple","carrot","","eggplant","fig"],"len_list":5} +{"_time":"1996-12-19T16:39:57.000","_subsort":0,"_key_hash":12960666915911099378,"_key":"A","str":"apple","len_key":5.0,"list":["apple"],"len_list":1} +{"_time":"1996-12-19T16:39:58.000","_subsort":1,"_key_hash":2867199309159137213,"_key":"B","str":"dog","len_key":3.0,"list":["dog"],"len_list":1} +{"_time":"1996-12-19T16:39:59.000","_subsort":2,"_key_hash":12960666915911099378,"_key":"A","str":"carrot","len_key":6.0,"list":["apple","carrot"],"len_list":2} +{"_time":"1996-12-19T16:40:00.000","_subsort":3,"_key_hash":12960666915911099378,"_key":"A","str":null,"len_key":null,"list":["apple","carrot"],"len_list":2} +{"_time":"1996-12-19T16:40:01.000","_subsort":4,"_key_hash":12960666915911099378,"_key":"A","str":"eggplant","len_key":8.0,"list":["apple","carrot","eggplant"],"len_list":3} +{"_time":"1996-12-19T16:40:02.000","_subsort":5,"_key_hash":12960666915911099378,"_key":"A","str":"fig","len_key":3.0,"list":["apple","carrot","eggplant","fig"],"len_list":4}