From cf13577180e414c96a77588554231d0842361768 Mon Sep 17 00:00:00 2001 From: WenyXu Date: Sat, 4 Nov 2023 14:02:38 +0000 Subject: [PATCH 1/3] feat: support to read tinyint --- README.md | 2 +- src/arrow_reader.rs | 9 +- src/arrow_reader/column.rs | 1 + src/arrow_reader/column/tinyint.rs | 23 ++++++ src/reader/decode.rs | 1 + src/reader/decode/byte_rle.rs | 127 +++++++++++++++++++++++++++++ tests/basic/data/test.orc | Bin 2616 -> 2737 bytes tests/basic/data/write.py | 5 +- tests/basic/main.rs | 36 ++++---- 9 files changed, 183 insertions(+), 21 deletions(-) create mode 100644 src/arrow_reader/column/tinyint.rs create mode 100644 src/reader/decode/byte_rle.rs diff --git a/README.md b/README.md index c418287e..b28e7db4 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Read [Apache ORC](https://orc.apache.org/) in Rust. | Float, Double | ✓ | | f32, f64 | Float32, Float64 | | String, Char, and VarChar | ✓ | | string | Utf8 | | Boolean | ✓ | | bool | Boolean | -| TinyInt | ✗ | | | | +| TinyInt | ✓ | | u8 | Uint8 | | Binary | ✓ | | Vec\ | Binary | | Decimal | ✗ | | | | | Date | ✓ | | chrono::NaiveDate | Date32 | diff --git a/src/arrow_reader.rs b/src/arrow_reader.rs index 183b2406..d1faae20 100644 --- a/src/arrow_reader.rs +++ b/src/arrow_reader.rs @@ -15,6 +15,7 @@ use arrow::record_batch::{RecordBatch, RecordBatchReader}; use chrono::{Datelike, NaiveDate, NaiveDateTime}; use snafu::{OptionExt, ResultExt}; +use self::column::tinyint::new_u8_iter; use self::column::Column; use crate::arrow_reader::column::binary::new_binary_iterator; use crate::arrow_reader::column::boolean::new_boolean_iter; @@ -117,6 +118,7 @@ pub enum Decoder { Int64(NullableIterator), Int32(NullableIterator), Int16(NullableIterator), + Uint8(NullableIterator), Boolean(NullableIterator), Float32(NullableIterator), Float64(NullableIterator), @@ -207,6 +209,7 @@ macro_rules! impl_decode_next_batch_cast { impl_decode_next_batch_cast!(i64, Int64Type); impl_decode_next_batch_cast!(i32, Int32Type); impl_decode_next_batch_cast!(i16, Int16Type); +impl_decode_next_batch!(u8); impl_decode_next_batch!(f32); impl_decode_next_batch!(f64); @@ -238,6 +241,10 @@ impl NaiveStripeDecoder { Some(array) => fields.push(array), None => break, }, + Decoder::Uint8(decoder) => match decode_next_batch_u8(decoder, chunk)? { + Some(array) => fields.push(array), + None => break, + }, Decoder::Float32(decoder) => match decode_next_batch_f32(decoder, chunk)? { Some(array) => fields.push(array), None => break, @@ -328,7 +335,7 @@ impl NaiveStripeDecoder { for col in &stripe.columns { let decoder = match col.kind() { crate::proto::r#type::Kind::Boolean => Decoder::Boolean(new_boolean_iter(col)?), - crate::proto::r#type::Kind::Byte => todo!(), + crate::proto::r#type::Kind::Byte => Decoder::Uint8(new_u8_iter(col)?), crate::proto::r#type::Kind::Short => Decoder::Int16(new_i64_iter(col)?), crate::proto::r#type::Kind::Int => Decoder::Int32(new_i64_iter(col)?), crate::proto::r#type::Kind::Long => Decoder::Int64(new_i64_iter(col)?), diff --git a/src/arrow_reader/column.rs b/src/arrow_reader/column.rs index 9ed6a0d0..a0f453cc 100644 --- a/src/arrow_reader/column.rs +++ b/src/arrow_reader/column.rs @@ -20,6 +20,7 @@ pub mod int; pub mod present; pub mod string; pub mod timestamp; +pub mod tinyint; #[derive(Debug)] pub struct Column { diff --git a/src/arrow_reader/column/tinyint.rs b/src/arrow_reader/column/tinyint.rs new file mode 100644 index 00000000..0c91ea53 --- /dev/null +++ b/src/arrow_reader/column/tinyint.rs @@ -0,0 +1,23 @@ +use snafu::OptionExt; + +use crate::arrow_reader::column::present::new_present_iter; +use crate::arrow_reader::column::{Column, NullableIterator}; +use crate::error::{InvalidColumnSnafu, Result}; +use crate::proto::stream::Kind; +use crate::reader::decode::byte_rle::ByteRleIter; + +pub fn new_u8_iter(column: &Column) -> Result> { + let present = new_present_iter(column)?.collect::>>()?; + let rows: usize = present.iter().filter(|&p| *p).count(); + + let iter = column + .stream(Kind::Data) + .transpose()? + .map(|reader| Box::new(ByteRleIter::new(reader, rows)) as _) + .context(InvalidColumnSnafu { name: &column.name })?; + + Ok(NullableIterator { + present: Box::new(present.into_iter()), + iter, + }) +} diff --git a/src/reader/decode.rs b/src/reader/decode.rs index ab72f1ac..db18c350 100644 --- a/src/reader/decode.rs +++ b/src/reader/decode.rs @@ -1,4 +1,5 @@ pub mod boolean_rle; +pub mod byte_rle; pub mod float; pub mod rle_v2; mod util; diff --git a/src/reader/decode/byte_rle.rs b/src/reader/decode/byte_rle.rs new file mode 100644 index 00000000..2152fa64 --- /dev/null +++ b/src/reader/decode/byte_rle.rs @@ -0,0 +1,127 @@ +use crate::error::Result; +use std::io::Read; + +use super::util::read_u8; + +const MAX_LITERAL_SIZE: usize = 128; +const MIN_REPEAT_SIZE: usize = 3; + +pub struct ByteRleIter { + reader: R, + literals: [u8; MAX_LITERAL_SIZE], + next_byte: Option, + num_literals: usize, + used: usize, + repeat: bool, + min_repeat_size: usize, + remaining: usize, +} + +impl ByteRleIter { + pub fn new(reader: R, length: usize) -> Self { + Self { + reader, + literals: [0u8; MAX_LITERAL_SIZE], + next_byte: None, + num_literals: 0, + used: 0, + repeat: false, + min_repeat_size: MIN_REPEAT_SIZE, + remaining: length, + } + } + + pub fn into_inner(self) -> R { + self.reader + } + + fn read_byte(&mut self) -> Result { + if let Some(byt) = self.next_byte.take() { + Ok(byt) + } else { + read_u8(&mut self.reader) + } + } + + fn read_values(&mut self) -> Result<()> { + let control = self.read_byte()?; + self.used = 0; + if control < 0x80 { + self.repeat = true; + self.num_literals = control as usize + self.min_repeat_size; + let val = self.read_byte()?; + self.literals[0] = val; + } else { + self.repeat = false; + self.num_literals = 0x100 - control as usize; + for i in 0..self.num_literals { + let result = self.read_byte()?; + self.literals[i] = result; + } + } + Ok(()) + } +} + +impl Iterator for ByteRleIter { + type Item = Result; + + fn next(&mut self) -> Option { + if self.remaining == 0 { + return None; + } + if self.used == self.num_literals { + match self.read_values() { + Ok(_) => {} + Err(err) => return Some(Err(err)), + } + } + + let result = if self.repeat { + self.literals[0] + } else { + self.literals[self.used] + }; + self.used += 1; + self.remaining -= 1; + Some(Ok(result)) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn reader_test() { + let data = [0x61u8, 0x00]; + + let data = &mut data.as_ref(); + + let iter = ByteRleIter::new(data, 100) + .collect::>>() + .unwrap(); + + assert_eq!(iter, vec![0; 100]); + + let data = [0x01, 0x01]; + + let data = &mut data.as_ref(); + + let iter = ByteRleIter::new(data, 4) + .collect::>>() + .unwrap(); + + assert_eq!(iter, vec![1; 4]); + + let data = [0xfe, 0x44, 0x45]; + + let data = &mut data.as_ref(); + + let iter = ByteRleIter::new(data, 2) + .collect::>>() + .unwrap(); + + assert_eq!(iter, vec![0x44, 0x45]); + } +} diff --git a/tests/basic/data/test.orc b/tests/basic/data/test.orc index 204cb2911cd4aade263f4b04959782943c4c3d60..cfcc323f8d7ff2013e1ba4bda3b711a07058302a 100644 GIT binary patch delta 242 zcmdlXvQcz{0Fxw_I2S7e2ng|Uun2K*{AUzklK9UUz&Kfe=>z-!9e)@Y8~#sXUd+rU zAhLNLa}lEe7aIqIfQSSWkY)tZtdj*;D|m%ifPxHC%%LI9T#q;RvxYF*bD@ZlfST^W>MncVBEZuBbw2~3e9YX34R`oNvwu?=6dEFFW3YaBovgG7" diff --git a/tests/basic/main.rs b/tests/basic/main.rs index 6114bd40..77869c43 100644 --- a/tests/basic/main.rs +++ b/tests/basic/main.rs @@ -229,15 +229,15 @@ pub fn basic_test_0() { let reader = new_arrow_reader_root(&path); let batch = reader.collect::, _>>().unwrap(); - let expected = r#"+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+ -| a | b | str_direct | d | e | f | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple | date_simple | -+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+ -| 1.0 | true | a | a | ddd | aaaaa | 5 | -5 | 1 | 5 | 1 | -1 | 1 | -1 | 5 | a | eeeee | 2023-04-01T20:15:30.002 | 2023-04-01 | -| 2.0 | false | cccccc | bb | cc | bbbbb | 5 | -5 | 2 | 4 | 6 | -6 | 6 | -6 | -5 | bb | dddd | 2021-08-22T07:26:44.525777 | 2023-03-01 | -| | | | | | | | | | | | | | | 1 | ccc | ccc | 2023-01-01T00:00:00 | 2023-01-01 | -| 4.0 | true | ddd | ccc | bb | ccccc | 5 | -5 | 4 | 2 | 3 | -3 | 3 | -3 | 5 | dddd | bb | 2023-02-01T00:00:00 | 2023-02-01 | -| 5.0 | false | ee | ddd | a | ddddd | 5 | -5 | 5 | 1 | 2 | -2 | 2 | -2 | 5 | eeeee | a | 2023-03-01T00:00:00 | 2023-03-01 | -+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+"#; + let expected = r#"+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+ +| a | b | str_direct | d | e | f | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple | date_simple | tinyint_simple | ++-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+ +| 1.0 | true | a | a | ddd | aaaaa | 5 | -5 | 1 | 5 | 1 | -1 | 1 | -1 | 5 | a | eeeee | 2023-04-01T20:15:30.002 | 2023-04-01 | 0 | +| 2.0 | false | cccccc | bb | cc | bbbbb | 5 | -5 | 2 | 4 | 6 | -6 | 6 | -6 | -5 | bb | dddd | 2021-08-22T07:26:44.525777 | 2023-03-01 | | +| | | | | | | | | | | | | | | 1 | ccc | ccc | 2023-01-01T00:00:00 | 2023-01-01 | 1 | +| 4.0 | true | ddd | ccc | bb | ccccc | 5 | -5 | 4 | 2 | 3 | -3 | 3 | -3 | 5 | dddd | bb | 2023-02-01T00:00:00 | 2023-02-01 | 128 | +| 5.0 | false | ee | ddd | a | ddddd | 5 | -5 | 5 | 1 | 2 | -2 | 2 | -2 | 5 | eeeee | a | 2023-03-01T00:00:00 | 2023-03-01 | 255 | ++-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+"#; assert_eq!( expected, pretty::pretty_format_batches(&batch).unwrap().to_string() @@ -250,15 +250,15 @@ pub async fn async_basic_test_0() { let reader = new_arrow_stream_reader_root(&path).await; let batch = reader.try_collect::>().await.unwrap(); - let expected = r#"+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+ -| a | b | str_direct | d | e | f | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple | date_simple | -+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+ -| 1.0 | true | a | a | ddd | aaaaa | 5 | -5 | 1 | 5 | 1 | -1 | 1 | -1 | 5 | a | eeeee | 2023-04-01T20:15:30.002 | 2023-04-01 | -| 2.0 | false | cccccc | bb | cc | bbbbb | 5 | -5 | 2 | 4 | 6 | -6 | 6 | -6 | -5 | bb | dddd | 2021-08-22T07:26:44.525777 | 2023-03-01 | -| | | | | | | | | | | | | | | 1 | ccc | ccc | 2023-01-01T00:00:00 | 2023-01-01 | -| 4.0 | true | ddd | ccc | bb | ccccc | 5 | -5 | 4 | 2 | 3 | -3 | 3 | -3 | 5 | dddd | bb | 2023-02-01T00:00:00 | 2023-02-01 | -| 5.0 | false | ee | ddd | a | ddddd | 5 | -5 | 5 | 1 | 2 | -2 | 2 | -2 | 5 | eeeee | a | 2023-03-01T00:00:00 | 2023-03-01 | -+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+"#; + let expected = r#"+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+ +| a | b | str_direct | d | e | f | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple | date_simple | tinyint_simple | ++-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+ +| 1.0 | true | a | a | ddd | aaaaa | 5 | -5 | 1 | 5 | 1 | -1 | 1 | -1 | 5 | a | eeeee | 2023-04-01T20:15:30.002 | 2023-04-01 | 0 | +| 2.0 | false | cccccc | bb | cc | bbbbb | 5 | -5 | 2 | 4 | 6 | -6 | 6 | -6 | -5 | bb | dddd | 2021-08-22T07:26:44.525777 | 2023-03-01 | | +| | | | | | | | | | | | | | | 1 | ccc | ccc | 2023-01-01T00:00:00 | 2023-01-01 | 1 | +| 4.0 | true | ddd | ccc | bb | ccccc | 5 | -5 | 4 | 2 | 3 | -3 | 3 | -3 | 5 | dddd | bb | 2023-02-01T00:00:00 | 2023-02-01 | 128 | +| 5.0 | false | ee | ddd | a | ddddd | 5 | -5 | 5 | 1 | 2 | -2 | 2 | -2 | 5 | eeeee | a | 2023-03-01T00:00:00 | 2023-03-01 | 255 | ++-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+"#; assert_eq!( expected, From 3378874ee7b60491c184254cad613eddd630fa5c Mon Sep 17 00:00:00 2001 From: WenyXu Date: Sun, 5 Nov 2023 06:09:06 +0000 Subject: [PATCH 2/3] feat: map tinyint to i8 --- README.md | 2 +- src/arrow_reader.rs | 10 +++++----- src/arrow_reader/column/tinyint.rs | 7 +++++-- tests/basic/data/test.orc | Bin 2737 -> 2737 bytes tests/basic/data/write.py | 2 +- tests/basic/main.rs | 13 ++++++------- 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index b28e7db4..62badeb5 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Read [Apache ORC](https://orc.apache.org/) in Rust. | Float, Double | ✓ | | f32, f64 | Float32, Float64 | | String, Char, and VarChar | ✓ | | string | Utf8 | | Boolean | ✓ | | bool | Boolean | -| TinyInt | ✓ | | u8 | Uint8 | +| TinyInt | ✓ | | i8 | Int8 | | Binary | ✓ | | Vec\ | Binary | | Decimal | ✗ | | | | | Date | ✓ | | chrono::NaiveDate | Date32 | diff --git a/src/arrow_reader.rs b/src/arrow_reader.rs index d1faae20..d338e6e3 100644 --- a/src/arrow_reader.rs +++ b/src/arrow_reader.rs @@ -15,7 +15,7 @@ use arrow::record_batch::{RecordBatch, RecordBatchReader}; use chrono::{Datelike, NaiveDate, NaiveDateTime}; use snafu::{OptionExt, ResultExt}; -use self::column::tinyint::new_u8_iter; +use self::column::tinyint::new_i8_iter; use self::column::Column; use crate::arrow_reader::column::binary::new_binary_iterator; use crate::arrow_reader::column::boolean::new_boolean_iter; @@ -118,7 +118,7 @@ pub enum Decoder { Int64(NullableIterator), Int32(NullableIterator), Int16(NullableIterator), - Uint8(NullableIterator), + Int8(NullableIterator), Boolean(NullableIterator), Float32(NullableIterator), Float64(NullableIterator), @@ -209,7 +209,7 @@ macro_rules! impl_decode_next_batch_cast { impl_decode_next_batch_cast!(i64, Int64Type); impl_decode_next_batch_cast!(i32, Int32Type); impl_decode_next_batch_cast!(i16, Int16Type); -impl_decode_next_batch!(u8); +impl_decode_next_batch!(i8); impl_decode_next_batch!(f32); impl_decode_next_batch!(f64); @@ -241,7 +241,7 @@ impl NaiveStripeDecoder { Some(array) => fields.push(array), None => break, }, - Decoder::Uint8(decoder) => match decode_next_batch_u8(decoder, chunk)? { + Decoder::Int8(decoder) => match decode_next_batch_i8(decoder, chunk)? { Some(array) => fields.push(array), None => break, }, @@ -335,7 +335,7 @@ impl NaiveStripeDecoder { for col in &stripe.columns { let decoder = match col.kind() { crate::proto::r#type::Kind::Boolean => Decoder::Boolean(new_boolean_iter(col)?), - crate::proto::r#type::Kind::Byte => Decoder::Uint8(new_u8_iter(col)?), + crate::proto::r#type::Kind::Byte => Decoder::Int8(new_i8_iter(col)?), crate::proto::r#type::Kind::Short => Decoder::Int16(new_i64_iter(col)?), crate::proto::r#type::Kind::Int => Decoder::Int32(new_i64_iter(col)?), crate::proto::r#type::Kind::Long => Decoder::Int64(new_i64_iter(col)?), diff --git a/src/arrow_reader/column/tinyint.rs b/src/arrow_reader/column/tinyint.rs index 0c91ea53..03915dcc 100644 --- a/src/arrow_reader/column/tinyint.rs +++ b/src/arrow_reader/column/tinyint.rs @@ -6,14 +6,17 @@ use crate::error::{InvalidColumnSnafu, Result}; use crate::proto::stream::Kind; use crate::reader::decode::byte_rle::ByteRleIter; -pub fn new_u8_iter(column: &Column) -> Result> { +pub fn new_i8_iter(column: &Column) -> Result> { let present = new_present_iter(column)?.collect::>>()?; let rows: usize = present.iter().filter(|&p| *p).count(); let iter = column .stream(Kind::Data) .transpose()? - .map(|reader| Box::new(ByteRleIter::new(reader, rows)) as _) + .map(|reader| { + Box::new(ByteRleIter::new(reader, rows).map(|value| value.map(|value| value as i8))) + as _ + }) .context(InvalidColumnSnafu { name: &column.name })?; Ok(NullableIterator { diff --git a/tests/basic/data/test.orc b/tests/basic/data/test.orc index cfcc323f8d7ff2013e1ba4bda3b711a07058302a..135d527ca91b10a4e7e4c61fe22c9e6e4c598948 100644 GIT binary patch delta 54 ycmdlex>0n40u%dRMuC5f5)6~onZC08XRL4Btia5{0u$Y=&mPJQ<4o@6QUd_fh!60n40u%dxMgb;?|BRE>nZB|xFgE<(tia5{0u$Y=&mPJQ<4o@6QUd_I&JUOX diff --git a/tests/basic/data/write.py b/tests/basic/data/write.py index 4c6483ad..0ef46f7e 100644 --- a/tests/basic/data/write.py +++ b/tests/basic/data/write.py @@ -23,7 +23,7 @@ "utf8_decrease": ["eeeee", "dddd", "ccc", "bb", "a"], "timestamp_simple": [datetime.datetime(2023, 4, 1, 20, 15, 30, 2000), datetime.datetime.fromtimestamp(int('1629617204525777000')/1000000000), datetime.datetime(2023, 1, 1), datetime.datetime(2023, 2, 1), datetime.datetime(2023, 3, 1)], "date_simple": [datetime.date(2023, 4, 1), datetime.date(2023, 3, 1), datetime.date(2023, 1, 1), datetime.date(2023, 2, 1), datetime.date(2023, 3, 1)], - "tinyint_simple": [0, None, 1, 128, 255] + "tinyint_simple": [-1, None, 1, 127, -127] } def infer_schema(data): diff --git a/tests/basic/main.rs b/tests/basic/main.rs index 77869c43..14b49313 100644 --- a/tests/basic/main.rs +++ b/tests/basic/main.rs @@ -232,11 +232,11 @@ pub fn basic_test_0() { let expected = r#"+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+ | a | b | str_direct | d | e | f | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple | date_simple | tinyint_simple | +-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+ -| 1.0 | true | a | a | ddd | aaaaa | 5 | -5 | 1 | 5 | 1 | -1 | 1 | -1 | 5 | a | eeeee | 2023-04-01T20:15:30.002 | 2023-04-01 | 0 | +| 1.0 | true | a | a | ddd | aaaaa | 5 | -5 | 1 | 5 | 1 | -1 | 1 | -1 | 5 | a | eeeee | 2023-04-01T20:15:30.002 | 2023-04-01 | -1 | | 2.0 | false | cccccc | bb | cc | bbbbb | 5 | -5 | 2 | 4 | 6 | -6 | 6 | -6 | -5 | bb | dddd | 2021-08-22T07:26:44.525777 | 2023-03-01 | | | | | | | | | | | | | | | | | 1 | ccc | ccc | 2023-01-01T00:00:00 | 2023-01-01 | 1 | -| 4.0 | true | ddd | ccc | bb | ccccc | 5 | -5 | 4 | 2 | 3 | -3 | 3 | -3 | 5 | dddd | bb | 2023-02-01T00:00:00 | 2023-02-01 | 128 | -| 5.0 | false | ee | ddd | a | ddddd | 5 | -5 | 5 | 1 | 2 | -2 | 2 | -2 | 5 | eeeee | a | 2023-03-01T00:00:00 | 2023-03-01 | 255 | +| 4.0 | true | ddd | ccc | bb | ccccc | 5 | -5 | 4 | 2 | 3 | -3 | 3 | -3 | 5 | dddd | bb | 2023-02-01T00:00:00 | 2023-02-01 | 127 | +| 5.0 | false | ee | ddd | a | ddddd | 5 | -5 | 5 | 1 | 2 | -2 | 2 | -2 | 5 | eeeee | a | 2023-03-01T00:00:00 | 2023-03-01 | -127 | +-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+"#; assert_eq!( expected, @@ -253,13 +253,12 @@ pub async fn async_basic_test_0() { let expected = r#"+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+ | a | b | str_direct | d | e | f | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple | date_simple | tinyint_simple | +-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+ -| 1.0 | true | a | a | ddd | aaaaa | 5 | -5 | 1 | 5 | 1 | -1 | 1 | -1 | 5 | a | eeeee | 2023-04-01T20:15:30.002 | 2023-04-01 | 0 | +| 1.0 | true | a | a | ddd | aaaaa | 5 | -5 | 1 | 5 | 1 | -1 | 1 | -1 | 5 | a | eeeee | 2023-04-01T20:15:30.002 | 2023-04-01 | -1 | | 2.0 | false | cccccc | bb | cc | bbbbb | 5 | -5 | 2 | 4 | 6 | -6 | 6 | -6 | -5 | bb | dddd | 2021-08-22T07:26:44.525777 | 2023-03-01 | | | | | | | | | | | | | | | | | 1 | ccc | ccc | 2023-01-01T00:00:00 | 2023-01-01 | 1 | -| 4.0 | true | ddd | ccc | bb | ccccc | 5 | -5 | 4 | 2 | 3 | -3 | 3 | -3 | 5 | dddd | bb | 2023-02-01T00:00:00 | 2023-02-01 | 128 | -| 5.0 | false | ee | ddd | a | ddddd | 5 | -5 | 5 | 1 | 2 | -2 | 2 | -2 | 5 | eeeee | a | 2023-03-01T00:00:00 | 2023-03-01 | 255 | +| 4.0 | true | ddd | ccc | bb | ccccc | 5 | -5 | 4 | 2 | 3 | -3 | 3 | -3 | 5 | dddd | bb | 2023-02-01T00:00:00 | 2023-02-01 | 127 | +| 5.0 | false | ee | ddd | a | ddddd | 5 | -5 | 5 | 1 | 2 | -2 | 2 | -2 | 5 | eeeee | a | 2023-03-01T00:00:00 | 2023-03-01 | -127 | +-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+----------------+"#; - assert_eq!( expected, pretty::pretty_format_batches(&batch).unwrap().to_string() From 995e0f34aa6e6a89c1c5f10fecf124344db3e753 Mon Sep 17 00:00:00 2001 From: WenyXu Date: Sun, 5 Nov 2023 06:33:04 +0000 Subject: [PATCH 3/3] chore: apply suggestions from CR --- src/reader/decode/byte_rle.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/reader/decode/byte_rle.rs b/src/reader/decode/byte_rle.rs index 2152fa64..8eb6969e 100644 --- a/src/reader/decode/byte_rle.rs +++ b/src/reader/decode/byte_rle.rs @@ -9,11 +9,9 @@ const MIN_REPEAT_SIZE: usize = 3; pub struct ByteRleIter { reader: R, literals: [u8; MAX_LITERAL_SIZE], - next_byte: Option, num_literals: usize, used: usize, repeat: bool, - min_repeat_size: usize, remaining: usize, } @@ -22,11 +20,9 @@ impl ByteRleIter { Self { reader, literals: [0u8; MAX_LITERAL_SIZE], - next_byte: None, num_literals: 0, used: 0, repeat: false, - min_repeat_size: MIN_REPEAT_SIZE, remaining: length, } } @@ -36,11 +32,7 @@ impl ByteRleIter { } fn read_byte(&mut self) -> Result { - if let Some(byt) = self.next_byte.take() { - Ok(byt) - } else { - read_u8(&mut self.reader) - } + read_u8(&mut self.reader) } fn read_values(&mut self) -> Result<()> { @@ -48,7 +40,7 @@ impl ByteRleIter { self.used = 0; if control < 0x80 { self.repeat = true; - self.num_literals = control as usize + self.min_repeat_size; + self.num_literals = control as usize + MIN_REPEAT_SIZE; let val = self.read_byte()?; self.literals[0] = val; } else {