Skip to content

Commit

Permalink
Generate expected data for integration tests as feather files (#73)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jefffrey authored Mar 23, 2024
1 parent bb885c0 commit fd23fdb
Show file tree
Hide file tree
Showing 29 changed files with 93 additions and 89 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
17 changes: 17 additions & 0 deletions tests/integration/generate_arrow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generates the expected data for the integration tests: each ORC file in
# data/ that has a matching entry in data/expected/ is read with PyArrow and
# persisted as a feather file in data/expected_arrow/.
#
# Requires pyarrow to be installed
from pathlib import Path

from pyarrow import orc, feather

# Resolve everything relative to this script so it can be run from any
# working directory (the Rust tests locate the same data via CARGO_MANIFEST_DIR).
data_dir = Path(__file__).resolve().parent / "data"
expected_dir = data_dir / "expected"
output_dir = data_dir / "expected_arrow"

# Test names are the entries of data/expected/ with the .jsn.gz suffix removed.
# Sorted so the conversion order (and log output) is stable between runs.
files = sorted(
    path.name.removesuffix(".jsn.gz") for path in expected_dir.glob("*.jsn.gz")
)

ignore_files = {
    "TestOrcFile.testTimestamp",  # Root data type isn't struct
}

files = [file for file in files if file not in ignore_files]

# Make sure the destination exists before writing the first feather file
output_dir.mkdir(parents=True, exist_ok=True)

for file in files:
    print(f"Converting {file} from ORC to feather")
    table = orc.read_table(str(data_dir / f"{file}.orc"))
    feather.write_feather(table, str(output_dir / f"{file}.feather"))
165 changes: 76 additions & 89 deletions tests/integration/main.rs
Original file line number Diff line number Diff line change
@@ -1,195 +1,182 @@
#![allow(non_snake_case)]

/// Tests against `.orc` and `.jsn.gz` in the official test suite (`orc/examples/`)
/// Tests ORC files from the official test suite (`orc/examples/`) against Arrow feather
/// expected data sourced by reading the ORC files with PyArrow and persisting as feather.
use std::fs::File;
use std::io::Read;

use arrow::{ipc::reader::FileReader, util::pretty};
use pretty_assertions::assert_eq;

use arrow::array::StructArray;
use arrow::record_batch::RecordBatch;
use datafusion_orc::arrow_reader::ArrowReaderBuilder;

/// Checks parsing a `.orc` file produces the expected result in the `.jsn.gz` path
/// Checks specific `.orc` file against corresponding expected feather file
fn test_expected_file(name: &str) {
let dir = env!("CARGO_MANIFEST_DIR");
let orc_path = format!("{}/tests/integration/data/{}.orc", dir, name);
let jsn_gz_path = format!("{}/tests/integration/data/expected/{}.jsn.gz", dir, name);
let f = File::open(orc_path).expect("Could not open .orc");
let builder = ArrowReaderBuilder::try_new(f).unwrap();
let orc_reader = builder.build();
let total_row_count = orc_reader.total_row_count();

// Read .orc into JSON objects
let batches: Vec<RecordBatch> = orc_reader.collect::<Result<Vec<_>, _>>().unwrap();
let objects: Vec<serde_json::Value> = batches
.into_iter()
.map(|batch| -> StructArray { batch.into() })
.flat_map(|array| {
arrow_json::writer::array_to_json_array(&array)
.expect("Could not serialize convert row from .orc to JSON value")
})
.collect();

// Read expected JSON objects
let mut expected_json = String::new();
flate2::read::GzDecoder::new(&File::open(jsn_gz_path).expect("Could not open .jsn.gz"))
.read_to_string(&mut expected_json)
.expect("Could not read .jsn.gz");

let objects_count = objects.len();

// Reencode the input to normalize it
let expected_lines = expected_json
.split('\n')
.filter(|line| !line.is_empty())
.map(|line| {
serde_json::from_str::<serde_json::Value>(line)
.expect("Could not parse line in .jsn.gz")
})
.map(|v| {
serde_json::to_string_pretty(&v).expect("Could not re-serialize line from .jsn.gz")
})
.collect::<Vec<_>>()
.join("\n");

let lines = objects
.into_iter()
.map(|v| serde_json::to_string_pretty(&v).expect("Could not serialize row from .orc"))
.collect::<Vec<_>>()
.join("\n");

if lines.len() < 1000 {
assert_eq!(lines, expected_lines);
} else {
// pretty_assertions consumes too much RAM and CPU on large diffs,
// and it's unreadable anyway
assert_eq!(lines[0..1000], expected_lines[0..1000]);
assert!(lines == expected_lines);
}

assert_eq!(total_row_count, objects_count as u64);
}

#[test]
fn columnProjection() {
let feather_path = format!(
"{}/tests/integration/data/expected_arrow/{}.feather",
dir, name
);

let f = File::open(orc_path).unwrap();
let orc_reader = ArrowReaderBuilder::try_new(f).unwrap().build();
let actual_batches = orc_reader.collect::<Result<Vec<_>, _>>().unwrap();

let f = File::open(feather_path).unwrap();
let feather_reader = FileReader::try_new(f, None).unwrap();
let expected_batches = feather_reader.collect::<Result<Vec<_>, _>>().unwrap();

// TODO: better way of checking equality? this step is slow for zlib
let formatted_actual = pretty::pretty_format_batches(&actual_batches)
.unwrap()
.to_string();
let actual_lines = formatted_actual.trim().lines().collect::<Vec<_>>();
let formatted_expected = pretty::pretty_format_batches(&expected_batches)
.unwrap()
.to_string();
let expected_lines = formatted_expected.trim().lines().collect::<Vec<_>>();

// TODO: Also test schema? Ignore nullability however?
assert_eq!(actual_lines, expected_lines);
}

#[test]
fn column_projection() {
test_expected_file("TestOrcFile.columnProjection");
}

#[test]
fn emptyFile() {
fn empty_file() {
test_expected_file("TestOrcFile.emptyFile");
}

#[test]
#[ignore] // TODO: Why?
fn metaData() {
fn meta_data() {
test_expected_file("TestOrcFile.metaData");
}

#[test]
#[ignore] // TODO: Why?
fn test1() {
test_expected_file("TestOrcFile.test1");
}

#[test]
#[ignore] // TODO: Incorrect timezone + representation differs
fn testDate1900() {
fn test_date_1900() {
test_expected_file("TestOrcFile.testDate1900");
}

#[test]
#[ignore] // TODO: Incorrect timezone + representation differs
fn testDate2038() {
fn test_date_2038() {
test_expected_file("TestOrcFile.testDate2038");
}

#[test]
fn testMemoryManagementV11() {
fn test_memory_management_v11() {
test_expected_file("TestOrcFile.testMemoryManagementV11");
}

#[test]
fn testMemoryManagementV12() {
fn test_memory_management_v12() {
test_expected_file("TestOrcFile.testMemoryManagementV12");
}

#[test]
fn testPredicatePushdown() {
fn test_predicate_pushdown() {
test_expected_file("TestOrcFile.testPredicatePushdown");
}

#[test]
#[ignore] // TODO: Why?
fn testSeek() {
fn test_seek() {
test_expected_file("TestOrcFile.testSeek");
}

#[test]
fn testSnappy() {
fn test_snappy() {
test_expected_file("TestOrcFile.testSnappy");
}

#[test]
#[ignore] // TODO: arrow_json does not support binaries
fn testStringAndBinaryStatistics() {
fn test_string_and_binary_statistics() {
test_expected_file("TestOrcFile.testStringAndBinaryStatistics");
}

#[test]
fn testStripeLevelStats() {
fn test_stripe_level_stats() {
test_expected_file("TestOrcFile.testStripeLevelStats");
}

#[test]
#[ignore] // TODO: Non-struct root type are not supported yet
fn testTimestamp() {
fn test_timestamp() {
test_expected_file("TestOrcFile.testTimestamp");
}

#[test]
#[ignore] // TODO: Unions are not supported yet
fn testUnionAndTimestamp() {
fn test_union_and_timestamp() {
test_expected_file("TestOrcFile.testUnionAndTimestamp");
}

#[test]
fn testWithoutIndex() {
fn test_without_index() {
test_expected_file("TestOrcFile.testWithoutIndex");
}

#[test]
fn testLz4() {
fn test_lz4() {
test_expected_file("TestVectorOrcFile.testLz4");
}

#[test]
fn testLzo() {
fn test_lzo() {
test_expected_file("TestVectorOrcFile.testLzo");
}

#[test]
#[ignore] // TODO: Differs on representation of some Decimals
fn decimal() {
test_expected_file("decimal");
}

#[test]
#[ignore] // TODO: Too slow
#[ignore] // TODO: Too slow when generating pretty formatted batch strings
fn zlib() {
test_expected_file("demo-12-zlib");
}

#[test]
#[ignore] // TODO: Why?
fn nulls_at_end_snappy() {
test_expected_file("nulls-at-end-snappy");
}

#[test]
#[ignore] // TODO: Why?
fn orc_11_format() {
test_expected_file("orc-file-11-format");
}

#[test]
fn orc_index_int_string() {
test_expected_file("orc_index_int_string");
}

#[test]
#[ignore] // TODO: not yet implemented
fn orc_split_elim() {
test_expected_file("orc_split_elim");
}

#[test]
#[ignore] // TODO: not yet implemented
fn orc_split_elim_cpp() {
test_expected_file("orc_split_elim_cpp");
}

#[test]
#[ignore] // TODO: not yet implemented
fn orc_split_elim_new() {
test_expected_file("orc_split_elim_new");
}

#[test]
#[ignore] // TODO: not yet implemented
fn over1k_bloom() {
Expand Down

0 comments on commit fd23fdb

Please sign in to comment.