Skip to content

Commit

Permalink
fix: search for first non-empty chunk
Browse files Browse the repository at this point in the history
  • Loading branch information
jonasspinner committed Sep 14, 2024
1 parent 2632065 commit 0c47798
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 10 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed

- `make_sample` now searches (with wrap-around) for the first non-empty chunk instead of looping forever when the randomly chosen line is empty ([#35](https://github.com/spiraldb/fsst/pull/35))

## [0.4.1](https://github.com/spiraldb/fsst/compare/v0.4.0...v0.4.1) - 2024-09-12

### Other
Expand Down
19 changes: 9 additions & 10 deletions src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -551,26 +551,25 @@ fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec<u8>, str_in: &Vec<&'b [u8]>)

while sample_buf_offset < sample_lim {
sample_rnd = fsst_hash(sample_rnd);
let mut line_nr = (sample_rnd as usize) % str_in.len();
let line_nr = (sample_rnd as usize) % str_in.len();

// Find the first non-empty chunk starting at line_nr, wrapping around if
// necessary.
//
// TODO: this will loop infinitely if there are no non-empty lines in the sample
while str_in[line_nr].is_empty() {
if line_nr == str_in.len() {
line_nr = 0;
}
}
let Some(line) = (line_nr..str_in.len())
.chain(0..line_nr)
.map(|line_nr| str_in[line_nr])
.find(|line| !line.is_empty())
else {
return sample;
};

let line = str_in[line_nr];
let chunks = 1 + ((line.len() - 1) / FSST_SAMPLELINE);
sample_rnd = fsst_hash(sample_rnd);
let chunk = FSST_SAMPLELINE * ((sample_rnd as usize) % chunks);

let len = FSST_SAMPLELINE.min(line.len() - chunk);

sample_buf.extend_from_slice(&str_in[line_nr][chunk..chunk + len]);
sample_buf.extend_from_slice(&line[chunk..chunk + len]);

// SAFETY: this is the data we just placed into `sample_buf` in the line above.
let slice =
Expand Down

0 comments on commit 0c47798

Please sign in to comment.