From b669358587d39aa3960fcb02d004c488f6f90a4f Mon Sep 17 00:00:00 2001
From: ion-elgreco <15728914+ion-elgreco@users.noreply.github.com>
Date: Sat, 11 Nov 2023 12:56:39 +0100
Subject: [PATCH] Add tests for more write inputs with large types

---
 python/deltalake/schema.py  |  4 ++--
 python/tests/test_writer.py | 39 +++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/python/deltalake/schema.py b/python/deltalake/schema.py
index 22f4507cd7..3378cc7e0a 100644
--- a/python/deltalake/schema.py
+++ b/python/deltalake/schema.py
@@ -111,7 +111,7 @@ def convert_pyarrow_table(data: pa.Table, large_dtypes: bool) -> pa.RecordBatchR
 def convert_pyarrow_dataset(
     data: ds.Dataset, large_dtypes: bool
 ) -> pa.RecordBatchReader:
-    """Converts a PyArrow table to a PyArrow RecordBatchReader with a compatible delta schema"""
+    """Converts a PyArrow dataset to a PyArrow RecordBatchReader; the converted schema is kept aside and applied during the write"""
     schema = _convert_pa_schema_to_delta(data.schema, large_dtypes=large_dtypes)
-    data = data.replace_schema(schema).scanner().to_reader()
+    data = data.scanner().to_reader()
     return data, schema
diff --git a/python/tests/test_writer.py b/python/tests/test_writer.py
index 00a53d3fb9..44cfe3eb0b 100644
--- a/python/tests/test_writer.py
+++ b/python/tests/test_writer.py
@@ -313,6 +313,15 @@ def test_write_dataset(
     assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
 
 
+def test_write_dataset_large_types(
+    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
+):
+    dataset = existing_table.to_pyarrow_dataset()
+
+    write_deltalake(tmp_path, dataset, mode="overwrite", large_dtypes=True)
+    assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
+
+
 def test_write_table(
     tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
 ):
@@ -322,6 +331,15 @@ def test_write_table(
     assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
 
 
+def test_write_table_large_dtypes(
+    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
+):
+    table = existing_table.to_pyarrow_table()
+
+    write_deltalake(tmp_path, table, mode="overwrite", large_dtypes=True)
+    assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
+
+
 def test_write_recordbatch(
     tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
 ):
@@ -332,6 +350,15 @@ def test_write_recordbatch(
     assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
 
 
+def test_write_recordbatch_large_dtypes(
+    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
+):
+    batches = existing_table.to_pyarrow_table().to_batches()
+
+    write_deltalake(tmp_path, batches[0], mode="overwrite", large_dtypes=True)
+    assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
+
+
 def test_write_recordbatchreader(
     tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
 ):
@@ -344,6 +371,18 @@ def test_write_recordbatchreader(
     assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
 
 
+def test_write_recordbatchreader_large_dtypes(
+    tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
+):
+    batches = existing_table.to_pyarrow_dataset().to_batches()
+    reader = RecordBatchReader.from_batches(
+        existing_table.to_pyarrow_dataset().schema, batches
+    )
+
+    write_deltalake(tmp_path, reader, mode="overwrite", large_dtypes=True)
+    assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
+
+
 def test_writer_partitioning(tmp_path: pathlib.Path):
     test_strings = ["a=b", "hello world", "hello%20world"]
     data = pa.table(