From b669358587d39aa3960fcb02d004c488f6f90a4f Mon Sep 17 00:00:00 2001
From: ion-elgreco <15728914+ion-elgreco@users.noreply.github.com>
Date: Sat, 11 Nov 2023 12:56:39 +0100
Subject: [PATCH] Add tests for more write inputs with large types

---
 python/deltalake/schema.py  |  4 ++--
 python/tests/test_writer.py | 39 +++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/python/deltalake/schema.py b/python/deltalake/schema.py
index 22f4507cd7..3378cc7e0a 100644
--- a/python/deltalake/schema.py
+++ b/python/deltalake/schema.py
@@ -111,7 +111,7 @@ def convert_pyarrow_table(data: pa.Table, large_dtypes: bool) -> pa.RecordBatchR
 def convert_pyarrow_dataset(
     data: ds.Dataset, large_dtypes: bool
 ) -> pa.RecordBatchReader:
-    """Converts a PyArrow table to a PyArrow RecordBatchReader with a compatible delta schema"""
+    """Converts a PyArrow dataset to a PyArrow RecordBatchReader; the converted schema is kept aside and applied during the write"""
     schema = _convert_pa_schema_to_delta(data.schema, large_dtypes=large_dtypes)
-    data = data.replace_schema(schema).scanner().to_reader()
+    data = data.scanner().to_reader()
     return data, schema
diff --git a/python/tests/test_writer.py b/python/tests/test_writer.py
index 00a53d3fb9..44cfe3eb0b 100644
--- a/python/tests/test_writer.py
+++ b/python/tests/test_writer.py
@@ -313,6 +313,15 @@ def test_write_dataset(
     assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
 
 
+def test_write_dataset_large_types(
+    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
+):
+    dataset = existing_table.to_pyarrow_dataset()
+
+    write_deltalake(tmp_path, dataset, mode="overwrite", large_dtypes=True)
+    assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
+
+
 def test_write_table(
     tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
 ):
@@ -322,6 +331,15 @@ def test_write_table(
     assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
 
 
+def test_write_table_large_dtypes(
+    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
+):
+    table = existing_table.to_pyarrow_table()
+
+    write_deltalake(tmp_path, table, mode="overwrite", large_dtypes=True)
+    assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
+
+
 def test_write_recordbatch(
     tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
 ):
@@ -332,6 +350,15 @@ def test_write_recordbatch(
     assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
 
 
+def test_write_recordbatch_large_dtypes(
+    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
+):
+    batches = existing_table.to_pyarrow_table().to_batches()
+
+    write_deltalake(tmp_path, batches[0], mode="overwrite", large_dtypes=True)
+    assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
+
+
 def test_write_recordbatchreader(
     tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
 ):
@@ -344,6 +371,18 @@ def test_write_recordbatchreader(
     assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
 
 
+def test_write_recordbatchreader_large_dtypes(
+    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
+):
+    batches = existing_table.to_pyarrow_dataset().to_batches()
+    reader = RecordBatchReader.from_batches(
+        existing_table.to_pyarrow_dataset().schema, batches
+    )
+
+    write_deltalake(tmp_path, reader, mode="overwrite", large_dtypes=True)
+    assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
+
+
 def test_writer_partitioning(tmp_path: pathlib.Path):
     test_strings = ["a=b", "hello world", "hello%20world"]
     data = pa.table(