diff --git a/CHANGELOG.md b/CHANGELOG.md index dae59f4..209d438 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- make_sample search for first non-empty chunk ([#35](https://github.com/spiraldb/fsst/pull/35)) + ## [0.4.1](https://github.com/spiraldb/fsst/compare/v0.4.0...v0.4.1) - 2024-09-12 ### Other diff --git a/src/builder.rs b/src/builder.rs index d801dd5..1dbe02f 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -551,26 +551,25 @@ fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec<u8>, str_in: &Vec<&'b [u8]>) while sample_buf_offset < sample_lim { sample_rnd = fsst_hash(sample_rnd); - let mut line_nr = (sample_rnd as usize) % str_in.len(); + let line_nr = (sample_rnd as usize) % str_in.len(); // Find the first non-empty chunk starting at line_nr, wrapping around if // necessary. - // - // TODO: this will loop infinitely if there are no non-empty lines in the sample - while str_in[line_nr].is_empty() { - if line_nr == str_in.len() { - line_nr = 0; - } - } + let Some(line) = (line_nr..str_in.len()) + .chain(0..line_nr) + .map(|line_nr| str_in[line_nr]) + .find(|line| !line.is_empty()) + else { + return sample; + }; - let line = str_in[line_nr]; let chunks = 1 + ((line.len() - 1) / FSST_SAMPLELINE); sample_rnd = fsst_hash(sample_rnd); let chunk = FSST_SAMPLELINE * ((sample_rnd as usize) % chunks); let len = FSST_SAMPLELINE.min(line.len() - chunk); - sample_buf.extend_from_slice(&str_in[line_nr][chunk..chunk + len]); + sample_buf.extend_from_slice(&line[chunk..chunk + len]); // SAFETY: this is the data we just placed into `sample_buf` in the line above. let slice =