Skip to content

Commit

Permalink
cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 4, 2024
1 parent 200c8ee commit d85e951
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 7 deletions.
23 changes: 19 additions & 4 deletions crates/polars-io/src/csv/read/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ pub(super) struct SplitLines<'a> {
#[cfg(feature = "simd")]
simd_quote_char: SimdVec,
#[cfg(feature = "simd")]
previous_valid_eol: u64,
previous_valid_eols: u64,
total_index: usize,
quoting: bool,
}
Expand Down Expand Up @@ -376,7 +376,7 @@ impl<'a> SplitLines<'a> {
#[cfg(feature = "simd")]
simd_quote_char,
#[cfg(feature = "simd")]
previous_valid_eol: 0,
previous_valid_eols: 0,
total_index: 0,
quoting,
}
Expand Down Expand Up @@ -439,6 +439,21 @@ impl<'a> Iterator for SplitLines<'a> {
self.total_index = 0;
let mut not_in_field_previous_iter = true;

if self.previous_valid_eols != 0 {
let pos = self.previous_valid_eols.trailing_zeros() as usize;
self.previous_valid_eols >>= (pos + 1) as u64;

unsafe {
debug_assert!((pos) <= self.v.len());

// return line up to this position
let ret = Some(self.v.get_unchecked(..pos));
// skip the '\n' token and update slice.
self.v = self.v.get_unchecked_release(pos + 1..);
return ret;
}
}

loop {
let bytes = unsafe { self.v.get_unchecked_release(self.total_index..) };
if bytes.len() > SIMD_SIZE {
Expand Down Expand Up @@ -468,9 +483,9 @@ impl<'a> Iterator for SplitLines<'a> {
if valid_eols != 0 {
let pos = valid_eols.trailing_zeros() as usize;
if pos == SIMD_SIZE - 1 {
self.previous_valid_eol = 0;
self.previous_valid_eols = 0;
} else {
self.previous_valid_eol = valid_eols >> (pos + 1) as u64;
self.previous_valid_eols = valid_eols >> (pos + 1) as u64;
}

unsafe {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-io/src/csv/read/read_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ impl<'a> CoreReader<'a> {
let mut total_line_count = 0;

// let t = std::time::Instant::now();
// let mut iter = SplitLines::new(bytes, self.quote_char.unwrap_or(b'"'), self.eol_char);
// let mut iter = SplitLines::new(bytes, self.quote_char, self.eol_char);
// let c = iter.count();
// dbg!(c);
// dbg!(t.elapsed());
Expand Down
5 changes: 3 additions & 2 deletions py-polars/polars/io/csv/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,6 @@ def read_csv(
skip_rows_after_header=skip_rows_after_header,
row_index_name=row_index_name,
row_index_offset=row_index_offset,
sample_size=sample_size,
eol_char=eol_char,
raise_if_empty=raise_if_empty,
truncate_ragged_lines=truncate_ragged_lines,
Expand Down Expand Up @@ -674,7 +673,6 @@ def _read_csv_impl(
try_parse_dates,
skip_rows_after_header,
parse_row_index_args(row_index_name, row_index_offset),
sample_size=sample_size,
eol_char=eol_char,
raise_if_empty=raise_if_empty,
truncate_ragged_lines=truncate_ragged_lines,
Expand Down Expand Up @@ -814,6 +812,9 @@ def read_csv_batched(
sample_size
Set the sample size. This is used to sample statistics to estimate the
allocation needed.
.. deprecated:: 1.10.0
This parameter is a no-op and has no effect.
eol_char
Single byte end of line character (default: `\n`). When encountering a file
with windows line endings (`\r\n`), one can go with the default `\n`. The extra
Expand Down

0 comments on commit d85e951

Please sign in to comment.