diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ddce31eb4..ae52b1278 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -250,11 +250,11 @@ jobs: - uses: actions-rs/toolchain@v1 with: - toolchain: "1.65.0" + toolchain: "1.74.0" override: true - name: check if README matches MSRV defined here - run: grep '1.65.0' src/core/README.md + run: grep '1.74.0' src/core/README.md - name: Check if it builds properly uses: actions-rs/cargo@v1 diff --git a/Cargo.lock b/Cargo.lock index 2db96670d..8e5121d3f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -261,6 +261,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + [[package]] name = "chrono" version = "0.4.34" @@ -339,15 +350,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "codepage-437" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e40c1169585d8d08e5675a39f2fc056cd19a258fc4cba5e3bbf4a9c1026de535" -dependencies = [ - "csv", -] - [[package]] name = "codespan-reporting" version = "0.11.1" @@ -533,6 +535,15 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + [[package]] name = "enum_dispatch" version = "0.3.12" @@ -545,6 +556,12 @@ dependencies = [ "syn 2.0.46", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.8" @@ -593,7 +610,7 @@ dependencies = [ "cfg-if", "crc32fast", "libc", - "miniz_oxide", + "miniz_oxide 0.4.4", ] [[package]] @@ -648,6 +665,12 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" + [[package]] name = "heck" version = "0.4.1" @@ -693,6 +716,16 @@ dependencies = [ "cxx-build", ] +[[package]] +name = "indexmap" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233cf39063f058ea2caae4091bf4a3ef70a653afbc026f5c4a4135d114e3c177" +dependencies = [ + "equivalent", + "hashbrown 0.14.3", +] + [[package]] name = "inplace-vec-builder" version = "0.1.1" @@ -894,9 +927,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.4.1" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "memmap" @@ -942,6 +975,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "miniz_oxide" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +dependencies = [ + "adler", +] + [[package]] name = "murmurhash3" version = "0.0.5" @@ -1069,6 +1111,39 @@ dependencies = [ "libm", ] +[[package]] +name = "num_enum" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" +dependencies = [ + "num_enum_derive", +] + +[[package]] +name = "num_enum_derive" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "oem_cp" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "330138902ab4dab09a86e6b7ab7ddeffb5f8435d52fe0df1bce8b06a17b10ee4" +dependencies = [ + "phf", + "phf_codegen", + "serde", + "serde_json", +] + [[package]] name = "once_cell" version = "1.19.0" @@ -1106,6 +1181,33 @@ dependencies = [ "syn 2.0.46", ] +[[package]] +name = "oval" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135cef32720c6746450d910890b0b69bcba2bbf6f85c9f4583df13fe415de828" + +[[package]] +name = "ownable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dcba94d1536fcc470287d96fd26356c38da8215fdb9a74285b09621f35d9350" +dependencies = [ + "ownable-macro", +] + +[[package]] +name = "ownable-macro" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2c91d2781624dec1234581a1a01e63638f36546ad72ee82873ac1b84f41117b" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.46", +] + [[package]] name = "paste" version = "1.0.14" @@ -1119,21 +1221,49 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] -name = "piz" -version = "0.5.1" +name = "phf" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "898b071c1938a2c92b95c18708cbf38f2566a01f0ab9dd7bdf4329987e5c2e17" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" dependencies = [ - "camino", - "chrono", - "codepage-437", - "crc32fast", - "flate2", - "log", - "memchr", - "thiserror", + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", ] +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + [[package]] name = "pkg-config" version = "0.3.24" @@ -1168,6 +1298,17 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "positioned-io" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccabfeeb89c73adf4081f0dca7f8e28dbda90981a222ceea37f619e93ea6afe9" +dependencies = [ + "byteorder", + "libc", + "winapi", +] + [[package]] name = "ppv-lite86" version = "0.2.16" @@ -1193,6 +1334,15 @@ dependencies = [ "num-integer", ] +[[package]] +name = "proc-macro-crate" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -1365,6 +1515,39 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rc-zip" +version = "5.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29dd39582d138524d58f6d50b440e9bcc707ebf13807b70ff77b12cf3341dd19" +dependencies = [ + "cfg-if", + "chardetng", + "chrono", + "crc32fast", + "encoding_rs", + "miniz_oxide 0.7.2", + "num_enum", + "oem_cp", + "oval", + "ownable", + "thiserror", + "tracing", + "winnow", +] + +[[package]] +name = "rc-zip-sync" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b119fa4c811a1b9112daaef1437db7ec4394fc15972487c1ce74fd94a6682d" +dependencies = [ + "oval", + "positioned-io", + "rc-zip", + "tracing", +] + [[package]] name = "regex" version = "1.5.6" @@ -1404,7 +1587,7 @@ dependencies = [ "bitvec", "bytecheck", "bytes", - "hashbrown", + "hashbrown 0.12.1", "ptr_meta", "rend", "rkyv_derive", @@ -1593,6 +1776,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "smallvec" version = "1.8.0" @@ -1634,11 +1823,12 @@ dependencies = [ "num-iter", "once_cell", "ouroboros", - "piz", "primal-check", "proptest", "rand", "rayon", + "rc-zip", + "rc-zip-sync", "rkyv", "roaring", "rocksdb", @@ -1779,6 +1969,54 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "toml_datetime" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" + +[[package]] +name = "toml_edit" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" +dependencies = [ + "indexmap", + "toml_datetime", + "winnow", +] + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -2160,6 +2398,15 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +[[package]] +name = "winnow" +version = "0.5.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +dependencies = [ + "memchr", +] + [[package]] name = "wyz" version = "0.5.1" diff --git a/include/sourmash.h b/include/sourmash.h index 410a07eab..2b5e3bd47 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -45,6 +45,7 @@ enum SourmashErrorCode { SOURMASH_ERROR_CODE_NIFFLER_ERROR = 100005, SOURMASH_ERROR_CODE_CSV_ERROR = 100006, SOURMASH_ERROR_CODE_ROCKS_DB_ERROR = 100007, + SOURMASH_ERROR_CODE_ZIP_ERROR = 100008, }; typedef uint32_t SourmashErrorCode; diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 0f292db6d..65cd6190c 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -11,7 +11,7 @@ edition = "2021" readme = "README.md" autoexamples = false autobins = false -rust-version = "1.65.0" +rust-version = "1.74.0" [lib] name = "sourmash" @@ -48,7 +48,6 @@ nohash-hasher = "0.2.0" num-iter = "0.1.44" once_cell = "1.18.0" ouroboros = "0.18.3" -piz = "0.5.0" primal-check = "0.3.1" rayon = { version = "1.9.0", optional = true } rkyv = { version = "0.7.44", optional = true } @@ -113,3 +112,5 @@ wasm-bindgen-test = "0.3.42" ### These crates don't compile on wasm [target.'cfg(not(target_arch = "wasm32"))'.dependencies] rocksdb = { version = "0.21.0", optional = true } +rc-zip = { version = "5.1.0", default-features = false } +rc-zip-sync = "4.1.0" diff --git a/src/core/README.md b/src/core/README.md index b71baaabc..a17319868 100644 --- a/src/core/README.md +++ b/src/core/README.md @@ -38,4 +38,4 @@ Development happens on github at ## Minimum supported Rust version -Currently the minimum supported Rust version is 1.65.0. +Currently the minimum supported Rust version is 1.74.0. diff --git a/src/core/src/errors.rs b/src/core/src/errors.rs index 90c028eb3..3f93244f5 100644 --- a/src/core/src/errors.rs +++ b/src/core/src/errors.rs @@ -74,6 +74,10 @@ pub enum SourmashError { #[error(transparent)] Panic(#[from] crate::ffi::utils::Panic), + #[cfg(not(target_arch = "wasm32"))] + #[error(transparent)] + ZipError(#[from] rc_zip::error::Error), + #[cfg(not(target_arch = "wasm32"))] #[cfg(feature = "branchwater")] #[error(transparent)] @@ -124,6 +128,7 @@ pub enum SourmashErrorCode { NifflerError = 100_005, CsvError = 100_006, RocksDBError = 100_007, + ZipError = 100_008, } #[cfg(not(all(target_arch = "wasm32", target_os = "unknown")))] @@ -155,6 +160,7 @@ impl SourmashErrorCode { SourmashError::NifflerError { .. } => SourmashErrorCode::NifflerError, SourmashError::Utf8Error { .. } => SourmashErrorCode::Utf8Error, SourmashError::CsvError { .. } => SourmashErrorCode::CsvError, + SourmashError::ZipError { .. } => SourmashErrorCode::ZipError, #[cfg(not(target_arch = "wasm32"))] #[cfg(feature = "branchwater")] diff --git a/src/core/src/storage.rs b/src/core/src/storage.rs index 26ca5ba6f..6aa7c85f5 100644 --- a/src/core/src/storage.rs +++ b/src/core/src/storage.rs @@ -1,5 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; -use std::ffi::OsStr; +use std::collections::HashMap; use std::fs::{DirBuilder, File}; use std::io::{BufReader, BufWriter, Read, Write}; use std::ops::Deref; @@ -8,6 +7,7 @@ use std::sync::{Arc, RwLock}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; use once_cell::sync::OnceCell; +use rc_zip_sync::{ArchiveHandle, ReadZip}; use serde::{Deserialize, Serialize}; use thiserror::Error; use typed_builder::TypedBuilder; @@ -108,18 +108,14 @@ pub struct FSStorage { #[ouroboros::self_referencing] pub struct ZipStorage { - mapping: Option, + file: std::fs::File, - #[borrows(mapping)] + #[borrows(file)] #[covariant] - archive: piz::ZipArchive<'this>, + archive: ArchiveHandle<'this, std::fs::File>, subdir: Option, path: Option, - - #[borrows(archive)] - #[covariant] - metadata: Metadata<'this>, } /// Store data in memory (no permanent storage) @@ -129,8 +125,6 @@ pub struct MemStorage { sigs: Arc>>, } -pub type Metadata<'a> = BTreeMap<&'a OsStr, &'a piz::read::FileMetadata<'a>>; - // ========================================= impl InnerStorage { @@ -286,56 +280,24 @@ impl Storage for FSStorage { } } -fn lookup<'a, P: AsRef>( - metadata: &'a Metadata, - path: P, -) -> Result<&'a piz::read::FileMetadata<'a>> { - let path = path.as_ref(); - metadata - .get(&path.as_os_str()) - .ok_or_else(|| StorageError::PathNotFoundError(path.to_string()).into()) - .copied() -} - -fn find_subdirs<'a>(archive: &'a piz::ZipArchive<'a>) -> Result> { - let subdirs: Vec<_> = archive - .entries() - .iter() - .filter(|entry| entry.is_dir()) - .collect(); - if subdirs.len() == 1 { - Ok(Some(subdirs[0].path.as_str().into())) - } else { - Ok(None) - } -} - impl Storage for ZipStorage { fn save(&self, _path: &str, _content: &[u8]) -> Result { unimplemented!(); } fn load(&self, path: &str) -> Result> { - let metadata = self.borrow_metadata(); + let archive = self.borrow_archive(); + if let Some(entry) = archive.by_name(path) { + return Ok(entry.bytes()?); + } - let entry = lookup(metadata, path).or_else(|_| { - if let Some(subdir) = self.borrow_subdir() { - lookup(metadata, subdir.to_owned() + path) - .map_err(|_| StorageError::PathNotFoundError(path.into())) - } else { - Err(StorageError::PathNotFoundError(path.into())) + if let Some(subdir) = &self.borrow_subdir() { + if let Some(entry) = archive.by_name(subdir.to_owned() + path) { + return Ok(entry.bytes()?); } - })?; - - let mut reader = BufReader::new( - self.borrow_archive() - .read(entry) - .map_err(|_| StorageError::DataReadError(path.into()))?, - ); - let mut contents = Vec::new(); - reader.read_to_end(&mut contents)?; + } - Ok(contents) + Err(StorageError::PathNotFoundError(path.into()).into()) } fn args(&self) -> StorageArgs { @@ -352,35 +314,41 @@ impl Storage for ZipStorage { } fn spec(&self) -> String { - format!("zip://{}", self.path().unwrap_or_else(|| "".into())) + format!("zip://{}", self.borrow_path().clone().unwrap_or("".into())) } } impl ZipStorage { pub fn from_file>(location: P) -> Result { - let zip_file = File::open(location.as_ref())?; - let mapping = unsafe { memmap2::Mmap::map(&zip_file)? }; + let file = File::open(location.as_ref())?; let mut storage = ZipStorageBuilder { - mapping: Some(mapping), - archive_builder: |mapping: &Option| { - piz::ZipArchive::new(mapping.as_ref().unwrap()).unwrap() - }, - metadata_builder: |archive: &piz::ZipArchive| { - archive - .entries() - .iter() - .map(|entry| (entry.path.as_os_str(), entry)) - .collect() - }, + file, + archive_builder: |file: &std::fs::File| file.read_zip().expect("Error loading zipfile"), subdir: None, path: Some(location.as_ref().into()), } .build(); - let subdir = find_subdirs(storage.borrow_archive())?; - storage.with_mut(|fields| *fields.subdir = subdir); + let subdir = { + let subdirs: Vec<_> = storage + .borrow_archive() + .entries() + .filter(|entry| matches!(entry.kind(), rc_zip::parse::EntryKind::Directory)) + .collect(); + if subdirs.len() == 1 { + Some( + subdirs[0] + .sanitized_name() + .expect("TODO throw right error") + .into(), + ) + } else { + None + } + }; + storage.with_mut(|fields| *fields.subdir = subdir); Ok(storage) } @@ -400,9 +368,8 @@ impl ZipStorage { Ok(self .borrow_archive() .entries() - .iter() .filter_map(|entry| { - let path = entry.path.as_str(); + let path = entry.sanitized_name().expect("TODO throw right error"); if path.ends_with(".sbt.json") { Some(path.into()) } else { @@ -416,8 +383,12 @@ impl ZipStorage { Ok(self .borrow_archive() .entries() - .iter() - .map(|entry| entry.path.as_str().into()) + .map(|entry| { + entry + .sanitized_name() + .expect("TODO throw right error") + .into() + }) .collect()) } }