Skip to content

Commit

Permalink
Initial ORC file format support
Browse files Browse the repository at this point in the history
  • Loading branch information
Jefffrey committed Oct 25, 2023
1 parent e79b3bc commit c4bbd56
Show file tree
Hide file tree
Showing 25 changed files with 2,404 additions and 0 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ members = [
"arrow-integration-testing",
"arrow-ipc",
"arrow-json",
"arrow-orc",
"arrow-orc/orc-gen",
"arrow-ord",
"arrow-row",
"arrow-schema",
Expand Down Expand Up @@ -86,6 +88,7 @@ arrow-csv = { version = "48.0.0", path = "./arrow-csv" }
arrow-data = { version = "48.0.0", path = "./arrow-data" }
arrow-ipc = { version = "48.0.0", path = "./arrow-ipc" }
arrow-json = { version = "48.0.0", path = "./arrow-json" }
arrow-orc = { version = "48.0.0", path = "./arrow-orc" }
arrow-ord = { version = "48.0.0", path = "./arrow-ord" }
arrow-row = { version = "48.0.0", path = "./arrow-row" }
arrow-schema = { version = "48.0.0", path = "./arrow-schema" }
Expand Down
51 changes: 51 additions & 0 deletions arrow-orc/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[package]
name = "arrow-orc"
version = { workspace = true }
description = "Support for parsing ORC format into the Arrow format"
homepage = { workspace = true }
repository = { workspace = true }
authors = { workspace = true }
license = { workspace = true }
keywords = { workspace = true }
include = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }

[lib]
name = "arrow_orc"
path = "src/lib.rs"
bench = false

[dependencies]
arrow-array = { workspace = true }
arrow-buffer = { workspace = true }
arrow-cast = { workspace = true }
arrow-data = { workspace = true }
arrow-schema = { workspace = true }

bytes = { version = "1", default-features = false, features = ["std"] }
snap = { version = "1.1", default-features = false }
flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] }
lz4_flex = { version = "0.11", default-features = false, features = ["std"] }
zstd = { version = "0.12", default-features = false }
lzokay-native = "0.1"

prost = "0.12.1"

32 changes: 32 additions & 0 deletions arrow-orc/orc-gen/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[package]
name = "orc-gen"
description = "Code generation for arrow-orc"
version = "0.1.0"
edition = { workspace = true }
rust-version = { workspace = true }
authors = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
license = { workspace = true }
publish = false


[dependencies]
prost-build = { version = "=0.12.1", default-features = false }
48 changes: 48 additions & 0 deletions arrow-orc/orc-gen/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::{
fs::{remove_file, OpenOptions},
io::{Read, Write},
};

/// Regenerates the Rust protobuf bindings for the ORC format.
///
/// Steps, all using paths relative to the current working directory:
/// 1. Compile `../format/orc_proto.proto` with `prost-build`, emitting into `src/`.
/// 2. Read the emitted `src/orc.proto.rs`.
/// 3. Write `src/proto.rs`, consisting of an "auto-generated" header followed
///    by the emitted code.
/// 4. Delete `src/orc.proto.rs`.
///
/// # Errors
///
/// Any failure in code generation or file I/O is propagated to the caller.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // NOTE(review): the `.google.protobuf` -> `::pbjson_types` mapping only
    // matters if the proto references well-known types; confirm it is needed.
    let mut config = prost_build::Config::new();
    config
        .out_dir("src/")
        .compile_well_known_types()
        .extern_path(".google.protobuf", "::pbjson_types");
    config.compile_protos(&["../format/orc_proto.proto"], &["../format"])?;

    // Load the freshly generated code into memory.
    let mut generated = String::new();
    let mut emitted = OpenOptions::new().read(true).open("src/orc.proto.rs")?;
    emitted.read_to_string(&mut generated)?;

    // Rewrite it under its final name, prefixed with a warning that the file
    // was auto-generated.
    let mut target = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open("src/proto.rs")?;
    target.write_all(b"// This file was automatically generated through the regen.sh script, and should not be edited.\n\n")?;
    target.write_all(generated.as_bytes())?;

    // The code now lives in proto.rs (a name without a period), so the file
    // emitted by prost-build is no longer needed.
    remove_file("src/orc.proto.rs")?;

    Ok(())
}
21 changes: 21 additions & 0 deletions arrow-orc/regen.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Resolve the directory containing this script so it can be invoked from any
# working directory.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quote the expansion so a checkout path containing spaces or glob characters
# is passed to `cd` as a single argument; only run the generator if `cd` succeeds.
cd -- "$SCRIPT_DIR" && cargo run --manifest-path orc-gen/Cargo.toml
29 changes: 29 additions & 0 deletions arrow-orc/src/array_reader/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Read ORC file columns as Arrow arrays.
use arrow_array::ArrayRef;

use crate::errors::Result;

pub mod struct_array_reader;

/// Reads batches of data from ORC columns into Arrow arrays.
pub trait ArrayReader {
/// Reads the next batch of values and returns it as an Arrow array.
///
/// `batch_size` is the requested batch size; presumably an upper bound on the
/// number of rows returned (TODO confirm against implementations).
///
/// Returns `Ok(None)` when the reader yields no array for this call;
/// presumably this signals that the column is exhausted (TODO confirm).
fn next_batch(&mut self, batch_size: usize) -> Result<Option<ArrayRef>>;
}
94 changes: 94 additions & 0 deletions arrow-orc/src/array_reader/struct_array_reader.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Read Struct Arrays from ORC file column
use std::sync::Arc;

use arrow_array::{Array, ArrayRef, StructArray};
use arrow_data::ArrayDataBuilder;
use arrow_schema::DataType;

use crate::errors::Result;

use super::ArrayReader;

/// Reads a struct column by combining the batches produced by its child
/// readers into a single `StructArray`.
pub struct StructArrayReader {
// Child readers; the arrays they return become the struct's child data,
// in this order.
children: Vec<Box<dyn ArrayReader>>,
// Data type used when building the output array data; presumably a
// `DataType::Struct` whose fields line up with `children` (TODO confirm).
data_type: DataType,
}

impl StructArrayReader {
pub fn new(children: Vec<Box<dyn ArrayReader>>, data_type: DataType) -> Self {
Self {
children,
data_type,
}
}

// For convenience when reading root of ORC file (expect Struct as root type)
pub fn next_struct_array_batch(
&mut self,
batch_size: usize,
) -> Result<Option<Arc<StructArray>>> {
if self.children.is_empty() {
return Ok(None);
}

let children_arrays = self
.children
.iter_mut()
.map(|reader| reader.next_batch(batch_size))
.collect::<Result<Vec<_>>>()?;
let expected_length = children_arrays
.first()
.and_then(|a| a.as_ref().map(Array::len));
let all_child_len_match = children_arrays
.iter()
.all(|array| array.as_ref().map(Array::len) == expected_length);
if !all_child_len_match {
return Err(general_err!(
"Struct array reader has children with mismatched lengths"
));
}

match expected_length {
None => Ok(None),
Some(length) => {
// TODO: account for nullability?
let array_data = ArrayDataBuilder::new(self.data_type.clone())
.len(length)
.child_data(
children_arrays
.iter()
.flatten()
.map(Array::to_data)
.collect::<Vec<_>>(),
);
let array_data = array_data.build()?;
Ok(Some(Arc::new(StructArray::from(array_data))))
}
}
}
}

impl ArrayReader for StructArrayReader {
    /// Reads the next struct batch and returns it as a type-erased
    /// [`ArrayRef`]; see [`StructArrayReader::next_struct_array_batch`].
    fn next_batch(&mut self, batch_size: usize) -> Result<Option<ArrayRef>> {
        let batch = self.next_struct_array_batch(batch_size)?;
        Ok(batch.map(|struct_array| struct_array as ArrayRef))
    }
}
Loading

0 comments on commit c4bbd56

Please sign in to comment.