From 5229d3910e86af15b53a7724a7d5c037bc7df047 Mon Sep 17 00:00:00 2001 From: Jesse Szwedko Date: Thu, 8 Jul 2021 20:38:52 +0000 Subject: [PATCH] fix(file source): fix regression in fingerprint calculations 0.14.0 included an upgrade to the `crc` crate, with associated updates to use the new API, however it appears to calculate different checksums. I put a comment here asking about how to calculate an equivalent checksum using the new API: https://github.com/mrhooray/crc-rs/issues/62#issuecomment-876720733 This also adds an alias to ensure that checkpoints written by 0.14.0 will be able to be read by 0.15.0 when it is released. Fixes: #8182 Signed-off-by: Jesse Szwedko --- Cargo.lock | 19 +++++++++++++++-- lib/file-source/Cargo.toml | 2 +- lib/file-source/src/fingerprinter.rs | 31 +++++++++++++++++----------- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1e4c677da0aca..1b033d08ef3d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -941,6 +941,12 @@ dependencies = [ "tracing 0.1.26", ] +[[package]] +name = "build_const" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ae4235e6dac0694637c763029ecea1a2ec9e4e06ec2729bd21ba4d9c863eb7" + [[package]] name = "bumpalo" version = "3.6.1" @@ -1370,6 +1376,15 @@ dependencies = [ "wasmparser 0.58.0", ] +[[package]] +name = "crc" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" +dependencies = [ + "build_const", +] + [[package]] name = "crc" version = "2.0.0" @@ -2245,7 +2260,7 @@ dependencies = [ "bstr", "bytes 1.0.1", "chrono", - "crc", + "crc 1.8.1", "criterion", "dashmap", "flate2", @@ -5274,7 +5289,7 @@ dependencies = [ "bit-vec 0.6.3", "bytes 1.0.1", "chrono", - "crc", + "crc 2.0.0", "futures 0.3.15", "futures-io", "futures-timer", diff --git a/lib/file-source/Cargo.toml b/lib/file-source/Cargo.toml index 42e87157f2ab9..f4eb1e7a76824 100644 --- a/lib/file-source/Cargo.toml +++ b/lib/file-source/Cargo.toml @@ -11,7 +11,7 @@ libc = "0.2" winapi = { version = "0.3", features = ["winioctl"] } [dependencies] -crc = "2.0.0" +crc = "1.8.1" glob = "0.3.0" scan_fmt = "0.2.6" diff --git a/lib/file-source/src/fingerprinter.rs b/lib/file-source/src/fingerprinter.rs index d6ba7744e4d3e..6a3db656704a5 100644 --- a/lib/file-source/src/fingerprinter.rs +++ b/lib/file-source/src/fingerprinter.rs @@ -1,5 +1,4 @@ use crate::{metadata_ext::PortableFileExt, FileSourceInternalEvents}; -use crc::Crc; use serde::{Deserialize, Serialize}; use std::{ collections::HashSet, @@ -9,8 +8,6 @@ use std::{ }; use tracing::trace_span; -const FINGERPRINT_CRC: Crc = Crc::::new(&crc::CRC_64_ECMA_182); - #[derive(Clone)] pub struct Fingerprinter { pub strategy: FingerprintStrategy, @@ -37,6 +34,7 @@ pub enum FingerprintStrategy { pub enum FileFingerprint { #[serde(rename = "checksum")] BytesChecksum(u64), + #[serde(alias = "first_line_checksum")] FirstLinesChecksum(u64), DevInode(u64, u64), Unknown(u64), @@ -53,7 +51,7 @@ impl FileFingerprint { let mut buf = Vec::with_capacity(std::mem::size_of_val(dev) * 2); buf.write_all(&dev.to_be_bytes()).expect("writing to array"); buf.write_all(&ino.to_be_bytes()).expect("writing to array"); - FINGERPRINT_CRC.checksum(&buf[..]) + crc::crc64::checksum_ecma(&buf[..]) } Unknown(c) => *c, } @@ -94,7 +92,7 @@ impl Fingerprinter { let mut fp = fs::File::open(path)?; fp.seek(SeekFrom::Start(ignored_header_bytes as u64))?; fingerprinter_read_until(fp, b'\n', lines, buffer)?; - let fingerprint = FINGERPRINT_CRC.checksum(&buffer[..]); + let fingerprint = crc::crc64::checksum_ecma(&buffer[..]); Ok(FirstLinesChecksum(fingerprint)) } } @@ -151,7 +149,7 @@ impl Fingerprinter { let mut fp = fs::File::open(path)?; fp.seek(io::SeekFrom::Start(ignored_header_bytes as u64))?; fp.read_exact(&mut buffer[..bytes])?; - let fingerprint = FINGERPRINT_CRC.checksum(&buffer[..]); + let fingerprint = crc::crc64::checksum_ecma(&buffer[..]); Ok(Some(FileFingerprint::BytesChecksum(fingerprint))) } _ => Ok(None), @@ -193,7 +191,7 @@ fn fingerprinter_read_until( #[cfg(test)] mod test { - use super::{FileSourceInternalEvents, FingerprintStrategy, Fingerprinter}; + use super::{FileFingerprint, FileSourceInternalEvents, FingerprintStrategy, Fingerprinter}; use std::{collections::HashSet, fs, io::Error, path::Path, time::Duration}; use tempfile::tempdir; @@ -226,9 +224,12 @@ mod test { assert!(fingerprinter .get_fingerprint_of_file(&empty_path, &mut buf) .is_err()); - assert!(fingerprinter - .get_fingerprint_of_file(&full_line_path, &mut buf) - .is_ok()); + assert_eq!( + fingerprinter + .get_fingerprint_of_file(&full_line_path, &mut buf) + .unwrap(), + FileFingerprint::FirstLinesChecksum(8302183670541403209), + ); assert!(fingerprinter .get_fingerprint_of_file(¬_full_line_path, &mut buf) .is_err()); @@ -298,7 +299,10 @@ mod test { assert!(run(&incomlete_line).is_err()); assert!(run(&incomplete_under_max_line_length_by_one).is_err()); - assert!(run(&one_line).is_ok()); + assert_eq!( + run(&one_line).unwrap(), + FileFingerprint::FirstLinesChecksum(12790833211255586118) + ); assert!(run(&one_line_duplicate).is_ok()); assert!(run(&one_line_continued).is_ok()); assert!(run(&different_two_lines).is_ok()); @@ -356,7 +360,10 @@ mod test { assert!(run(&incomlete_lines).is_err()); - assert!(run(&two_lines).is_ok()); + assert_eq!( + run(&two_lines).unwrap(), + FileFingerprint::FirstLinesChecksum(8288549968916239272) + ); assert!(run(&two_lines_duplicate).is_ok()); assert!(run(&two_lines_continued).is_ok()); assert!(run(&different_three_lines).is_ok());