Generate expected data for integration tests as feather files (#73)

datafusion-contrib · Mar 23, 2024 · fd23fdb · fd23fdb
1 parent bb885c0
commit fd23fdb
Show file tree

Hide file tree

Showing 29 changed files with 93 additions and 89 deletions.
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.columnProjection.feather b/tests/integration/data/expected_arrow/TestOrcFile.columnProjection.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.emptyFile.feather b/tests/integration/data/expected_arrow/TestOrcFile.emptyFile.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.metaData.feather b/tests/integration/data/expected_arrow/TestOrcFile.metaData.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.test1.feather b/tests/integration/data/expected_arrow/TestOrcFile.test1.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testDate1900.feather b/tests/integration/data/expected_arrow/TestOrcFile.testDate1900.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testDate2038.feather b/tests/integration/data/expected_arrow/TestOrcFile.testDate2038.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testMemoryManagementV11.feather b/tests/integration/data/expected_arrow/TestOrcFile.testMemoryManagementV11.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testMemoryManagementV12.feather b/tests/integration/data/expected_arrow/TestOrcFile.testMemoryManagementV12.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testPredicatePushdown.feather b/tests/integration/data/expected_arrow/TestOrcFile.testPredicatePushdown.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testSeek.feather b/tests/integration/data/expected_arrow/TestOrcFile.testSeek.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testSnappy.feather b/tests/integration/data/expected_arrow/TestOrcFile.testSnappy.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testStringAndBinaryStatistics.feather b/tests/integration/data/expected_arrow/TestOrcFile.testStringAndBinaryStatistics.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testStripeLevelStats.feather b/tests/integration/data/expected_arrow/TestOrcFile.testStripeLevelStats.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testUnionAndTimestamp.feather b/tests/integration/data/expected_arrow/TestOrcFile.testUnionAndTimestamp.feather
diff --git a/tests/integration/data/expected_arrow/TestOrcFile.testWithoutIndex.feather b/tests/integration/data/expected_arrow/TestOrcFile.testWithoutIndex.feather
diff --git a/tests/integration/data/expected_arrow/TestStringDictionary.testRowIndex.feather b/tests/integration/data/expected_arrow/TestStringDictionary.testRowIndex.feather
diff --git a/tests/integration/data/expected_arrow/TestVectorOrcFile.testLz4.feather b/tests/integration/data/expected_arrow/TestVectorOrcFile.testLz4.feather
diff --git a/tests/integration/data/expected_arrow/TestVectorOrcFile.testLzo.feather b/tests/integration/data/expected_arrow/TestVectorOrcFile.testLzo.feather
diff --git a/tests/integration/data/expected_arrow/decimal.feather b/tests/integration/data/expected_arrow/decimal.feather
diff --git a/tests/integration/data/expected_arrow/demo-12-zlib.feather b/tests/integration/data/expected_arrow/demo-12-zlib.feather
diff --git a/tests/integration/data/expected_arrow/nulls-at-end-snappy.feather b/tests/integration/data/expected_arrow/nulls-at-end-snappy.feather
diff --git a/tests/integration/data/expected_arrow/orc-file-11-format.feather b/tests/integration/data/expected_arrow/orc-file-11-format.feather
diff --git a/tests/integration/data/expected_arrow/orc_index_int_string.feather b/tests/integration/data/expected_arrow/orc_index_int_string.feather
diff --git a/tests/integration/data/expected_arrow/orc_split_elim.feather b/tests/integration/data/expected_arrow/orc_split_elim.feather
diff --git a/tests/integration/data/expected_arrow/orc_split_elim_cpp.feather b/tests/integration/data/expected_arrow/orc_split_elim_cpp.feather
diff --git a/tests/integration/data/expected_arrow/orc_split_elim_new.feather b/tests/integration/data/expected_arrow/orc_split_elim_new.feather
diff --git a/tests/integration/data/expected_arrow/over1k_bloom.feather b/tests/integration/data/expected_arrow/over1k_bloom.feather
diff --git a/tests/integration/generate_arrow.py b/tests/integration/generate_arrow.py
@@ -0,0 +1,17 @@
+# Requires pyarrow to be installed
+import glob
+from pyarrow import orc, feather
+
+files = glob.glob("data/expected/*")
+files = [file.removeprefix("data/expected/").removesuffix(".jsn.gz") for file in files]
+
+ignore_files = [
+    "TestOrcFile.testTimestamp" # Root data type isn't struct
+]
+
+files = [file for file in files if file not in ignore_files]
+
+for file in files:
+    print(f"Converting {file} from ORC to feather")
+    table = orc.read_table(f"data/{file}.orc")
+    feather.write_feather(table, f"data/expected_arrow/{file}.feather")
diff --git a/tests/integration/main.rs b/tests/integration/main.rs
@@ -1,195 +1,182 @@
-#![allow(non_snake_case)]
-
-/// Tests against `.orc` and `.jsn.gz` in the official test suite (`orc/examples/`)
+/// Tests ORC files from the official test suite (`orc/examples/`) against expected Arrow data,
+/// generated by reading the same ORC files with PyArrow and saving the result as feather files.
 use std::fs::File;
-use std::io::Read;
 
+use arrow::{ipc::reader::FileReader, util::pretty};
 use pretty_assertions::assert_eq;
 
-use arrow::array::StructArray;
-use arrow::record_batch::RecordBatch;
 use datafusion_orc::arrow_reader::ArrowReaderBuilder;
 
-/// Checks parsing a `.orc` file produces the expected result in the `.jsn.gz` path
+/// Checks a specific `.orc` file against its corresponding expected feather file
 fn test_expected_file(name: &str) {
     let dir = env!("CARGO_MANIFEST_DIR");
     let orc_path = format!("{}/tests/integration/data/{}.orc", dir, name);
-    let jsn_gz_path = format!("{}/tests/integration/data/expected/{}.jsn.gz", dir, name);
-    let f = File::open(orc_path).expect("Could not open .orc");
-    let builder = ArrowReaderBuilder::try_new(f).unwrap();
-    let orc_reader = builder.build();
-    let total_row_count = orc_reader.total_row_count();
-
-    // Read .orc into JSON objects
-    let batches: Vec<RecordBatch> = orc_reader.collect::<Result<Vec<_>, _>>().unwrap();
-    let objects: Vec<serde_json::Value> = batches
-        .into_iter()
-        .map(|batch| -> StructArray { batch.into() })
-        .flat_map(|array| {
-            arrow_json::writer::array_to_json_array(&array)
-                .expect("Could not serialize convert row from .orc to JSON value")
-        })
-        .collect();
-
-    // Read expected JSON objects
-    let mut expected_json = String::new();
-    flate2::read::GzDecoder::new(&File::open(jsn_gz_path).expect("Could not open .jsn.gz"))
-        .read_to_string(&mut expected_json)
-        .expect("Could not read .jsn.gz");
-
-    let objects_count = objects.len();
-
-    // Reencode the input to normalize it
-    let expected_lines = expected_json
-        .split('\n')
-        .filter(|line| !line.is_empty())
-        .map(|line| {
-            serde_json::from_str::<serde_json::Value>(line)
-                .expect("Could not parse line in .jsn.gz")
-        })
-        .map(|v| {
-            serde_json::to_string_pretty(&v).expect("Could not re-serialize line from .jsn.gz")
-        })
-        .collect::<Vec<_>>()
-        .join("\n");
-
-    let lines = objects
-        .into_iter()
-        .map(|v| serde_json::to_string_pretty(&v).expect("Could not serialize row from .orc"))
-        .collect::<Vec<_>>()
-        .join("\n");
-
-    if lines.len() < 1000 {
-        assert_eq!(lines, expected_lines);
-    } else {
-        // pretty_assertions consumes too much RAM and CPU on large diffs,
-        // and it's unreadable anyway
-        assert_eq!(lines[0..1000], expected_lines[0..1000]);
-        assert!(lines == expected_lines);
-    }
-
-    assert_eq!(total_row_count, objects_count as u64);
-}
-
-#[test]
-fn columnProjection() {
+    let feather_path = format!(
+        "{}/tests/integration/data/expected_arrow/{}.feather",
+        dir, name
+    );
+
+    let f = File::open(orc_path).unwrap();
+    let orc_reader = ArrowReaderBuilder::try_new(f).unwrap().build();
+    let actual_batches = orc_reader.collect::<Result<Vec<_>, _>>().unwrap();
+
+    let f = File::open(feather_path).unwrap();
+    let feather_reader = FileReader::try_new(f, None).unwrap();
+    let expected_batches = feather_reader.collect::<Result<Vec<_>, _>>().unwrap();
+
+    // TODO: better way of checking equality? This step is slow for the zlib test file
+    let formatted_actual = pretty::pretty_format_batches(&actual_batches)
+        .unwrap()
+        .to_string();
+    let actual_lines = formatted_actual.trim().lines().collect::<Vec<_>>();
+    let formatted_expected = pretty::pretty_format_batches(&expected_batches)
+        .unwrap()
+        .to_string();
+    let expected_lines = formatted_expected.trim().lines().collect::<Vec<_>>();
+
+    // TODO: Also test schema? Ignore nullability however?
+    assert_eq!(actual_lines, expected_lines);
+}
+
+#[test]
+fn column_projection() {
     test_expected_file("TestOrcFile.columnProjection");
 }
+
 #[test]
-fn emptyFile() {
+fn empty_file() {
     test_expected_file("TestOrcFile.emptyFile");
 }
+
 #[test]
 #[ignore] // TODO: Why?
-fn metaData() {
+fn meta_data() {
     test_expected_file("TestOrcFile.metaData");
 }
+
 #[test]
-#[ignore] // TODO: Why?
 fn test1() {
     test_expected_file("TestOrcFile.test1");
 }
+
 #[test]
 #[ignore] // TODO: Incorrect timezone + representation differs
-fn testDate1900() {
+fn test_date_1900() {
     test_expected_file("TestOrcFile.testDate1900");
 }
+
 #[test]
 #[ignore] // TODO: Incorrect timezone + representation differs
-fn testDate2038() {
+fn test_date_2038() {
     test_expected_file("TestOrcFile.testDate2038");
 }
+
 #[test]
-fn testMemoryManagementV11() {
+fn test_memory_management_v11() {
     test_expected_file("TestOrcFile.testMemoryManagementV11");
 }
+
 #[test]
-fn testMemoryManagementV12() {
+fn test_memory_management_v12() {
     test_expected_file("TestOrcFile.testMemoryManagementV12");
 }
+
 #[test]
-fn testPredicatePushdown() {
+fn test_predicate_pushdown() {
     test_expected_file("TestOrcFile.testPredicatePushdown");
 }
+
 #[test]
 #[ignore] // TODO: Why?
-fn testSeek() {
+fn test_seek() {
     test_expected_file("TestOrcFile.testSeek");
 }
+
 #[test]
-fn testSnappy() {
+fn test_snappy() {
     test_expected_file("TestOrcFile.testSnappy");
 }
+
 #[test]
-#[ignore] // TODO: arrow_json does not support binaries
-fn testStringAndBinaryStatistics() {
+fn test_string_and_binary_statistics() {
     test_expected_file("TestOrcFile.testStringAndBinaryStatistics");
 }
+
 #[test]
-fn testStripeLevelStats() {
+fn test_stripe_level_stats() {
     test_expected_file("TestOrcFile.testStripeLevelStats");
 }
+
 #[test]
 #[ignore] // TODO: Non-struct root type are not supported yet
-fn testTimestamp() {
+fn test_timestamp() {
     test_expected_file("TestOrcFile.testTimestamp");
 }
+
 #[test]
 #[ignore] // TODO: Unions are not supported yet
-fn testUnionAndTimestamp() {
+fn test_union_and_timestamp() {
     test_expected_file("TestOrcFile.testUnionAndTimestamp");
 }
+
 #[test]
-fn testWithoutIndex() {
+fn test_without_index() {
     test_expected_file("TestOrcFile.testWithoutIndex");
 }
+
 #[test]
-fn testLz4() {
+fn test_lz4() {
     test_expected_file("TestVectorOrcFile.testLz4");
 }
+
 #[test]
-fn testLzo() {
+fn test_lzo() {
     test_expected_file("TestVectorOrcFile.testLzo");
 }
+
 #[test]
-#[ignore] // TODO: Differs on representation of some Decimals
 fn decimal() {
     test_expected_file("decimal");
 }
+
 #[test]
-#[ignore] // TODO: Too slow
+#[ignore] // TODO: Too slow when generating pretty formatted batch strings
 fn zlib() {
     test_expected_file("demo-12-zlib");
 }
+
 #[test]
-#[ignore] // TODO: Why?
 fn nulls_at_end_snappy() {
     test_expected_file("nulls-at-end-snappy");
 }
+
 #[test]
 #[ignore] // TODO: Why?
 fn orc_11_format() {
     test_expected_file("orc-file-11-format");
 }
+
 #[test]
 fn orc_index_int_string() {
     test_expected_file("orc_index_int_string");
 }
+
 #[test]
 #[ignore] // TODO: not yet implemented
 fn orc_split_elim() {
     test_expected_file("orc_split_elim");
 }
+
 #[test]
-#[ignore] // TODO: not yet implemented
 fn orc_split_elim_cpp() {
     test_expected_file("orc_split_elim_cpp");
 }
+
 #[test]
-#[ignore] // TODO: not yet implemented
 fn orc_split_elim_new() {
     test_expected_file("orc_split_elim_new");
 }
+
 #[test]
 #[ignore] // TODO: not yet implemented
 fn over1k_bloom() {