From e95b1bcc035b098f5ba9bf99f40303f1cefb742a Mon Sep 17 00:00:00 2001 From: Plamen Hristov Date: Fri, 7 Jun 2024 11:50:28 -0400 Subject: [PATCH] MIME type sniffing (#30) * Preliminary implementation of MIME type sniffing * Some refactoring and improvements. * Fixing erroneous commit * Made all tests pass * Removed comment * Applying sad formatting * Addressed some comments * Adding banyan fork of mime * Fixed a small bug and added one expect * Reformat * Variable rename * Fixed test issue. --- Cargo.lock | 35 ++- Cargo.toml | 3 + src/codec/crypto/authentication_tag.rs | 2 +- src/codec/crypto/nonce.rs | 2 +- src/codec/data_storage/data_block.rs | 4 +- .../data_storage/encrypted_data_chunk.rs | 4 +- src/filesystem/drive/directory_entry.rs | 15 ++ src/filesystem/drive/directory_handle.rs | 210 +++++++++++++++- src/filesystem/drive/inner.rs | 14 +- src/filesystem/nodes/metadata/mime_type.rs | 234 ++++++++++++++++++ src/filesystem/nodes/metadata/mod.rs | 59 +++++ src/filesystem/nodes/mod.rs | 28 ++- src/filesystem/nodes/node_builder.rs | 3 +- 13 files changed, 576 insertions(+), 37 deletions(-) create mode 100644 src/filesystem/nodes/metadata/mime_type.rs create mode 100644 src/filesystem/nodes/metadata/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 8881738..7ab309c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -258,6 +258,8 @@ dependencies = [ "futures", "getrandom", "js-sys", + "mime 0.4.0-a.0", + "mime_guess", "p384", "rand", "rand_chacha", @@ -1093,13 +1095,27 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "mime" +version = "0.4.0-a.0" +source = "git+https://github.com/banyancomputer/mime.git?rev=d1a1744#d1a1744cbe6b87e33a4258e6bef555efc99016dd" +dependencies = [ + "mime-parse", + "quoted-string", +] + +[[package]] +name = "mime-parse" +version = "0.0.0" +source = "git+https://github.com/banyancomputer/mime.git?rev=d1a1744#d1a1744cbe6b87e33a4258e6bef555efc99016dd" + [[package]] name = "mime_guess" version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" dependencies = [ - "mime", + "mime 0.3.17", "unicase", ] @@ -1433,6 +1449,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.36" @@ -1442,6 +1464,15 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "quoted-string" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9586f8867f39941d8e796c18340a9cb5221a018df021169dc3e61c87d9f5f567" +dependencies = [ + "quick-error", +] + [[package]] name = "rand" version = "0.8.5" @@ -1544,7 +1575,7 @@ dependencies = [ "ipnet", "js-sys", "log", - "mime", + "mime 0.3.17", "mime_guess", "once_cell", "percent-encoding", diff --git a/Cargo.toml b/Cargo.toml index 414fc07..061468e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,9 @@ reqwest = { version = "^0.12", default-features = false, optional = true, featur serde = { version = "^1", features = ["derive"], optional = true } serde_json = { version = "^1", optional = true } url = { version = "^2", optional = true } +mime = {git = "https://github.com/banyancomputer/mime.git", rev = "d1a1744"} +mime_guess = "2.0.4" + [[example]] name = "full_fs_exercise" diff --git a/src/codec/crypto/authentication_tag.rs b/src/codec/crypto/authentication_tag.rs index bf14288..3bb374d 100644 --- a/src/codec/crypto/authentication_tag.rs +++ b/src/codec/crypto/authentication_tag.rs @@ -18,7 +18,7 @@ impl AuthenticationTag { } pub fn from_bytes(data: &[u8; TAG_LENGTH]) -> Self { - Self(data.clone()) + Self(*data) } pub async fn encode( diff --git a/src/codec/crypto/nonce.rs b/src/codec/crypto/nonce.rs index dc7bb7c..9a8d978 100644 --- a/src/codec/crypto/nonce.rs +++ b/src/codec/crypto/nonce.rs @@ -18,7 +18,7 @@ impl Nonce { } pub fn from_bytes(data: &[u8; NONCE_LENGTH]) -> Self { - Self(data.clone()) + Self(*data) } pub(crate) async fn encode( diff --git a/src/codec/data_storage/data_block.rs b/src/codec/data_storage/data_block.rs index 565e319..ab63e8d 100644 --- a/src/codec/data_storage/data_block.rs +++ b/src/codec/data_storage/data_block.rs @@ -122,7 +122,7 @@ impl DataBlock { self.contents.len() >= self.data_options.chunk_count().into() } - pub fn parse<'a>(input: Stream<'a>) -> ParserResult<'a, Self> { + pub fn parse(input: Stream<'_>) -> ParserResult<'_, Self> { let (input, version) = le_u8.parse_peek(input)?; if version != 0x01 { @@ -172,7 +172,7 @@ impl DataBlock { Ok((input, block)) } - pub fn parse_with_magic<'a>(input: Stream<'a>) -> ParserResult<'a, Self> { + pub fn parse_with_magic(input: Stream<'_>) -> ParserResult<'_, Self> { let (input, _magic) = banyan_data_magic_tag(input)?; Self::parse(input) } diff --git a/src/codec/data_storage/encrypted_data_chunk.rs b/src/codec/data_storage/encrypted_data_chunk.rs index bf36a04..462c708 100644 --- a/src/codec/data_storage/encrypted_data_chunk.rs +++ b/src/codec/data_storage/encrypted_data_chunk.rs @@ -47,8 +47,8 @@ impl EncryptedDataChunk { Ok((self.0.len(), cid)) } - pub fn decrypt<'a>( - &'a self, + pub fn decrypt( + &self, options: &DataOptions, access_key: &AccessKey, ) -> Result { diff --git a/src/filesystem/drive/directory_entry.rs b/src/filesystem/drive/directory_entry.rs index 2bcbd87..ed03f47 100644 --- a/src/filesystem/drive/directory_entry.rs +++ b/src/filesystem/drive/directory_entry.rs @@ -14,6 +14,8 @@ pub struct DirectoryEntry { name: NodeName, kind: NodeKind, + mime_type: Option, + size: u64, } @@ -48,6 +50,17 @@ impl DirectoryEntry { pub fn size(&self) -> u64 { self.size } + + pub fn mime_type(&self) -> Option { + match self.kind { + NodeKind::File => self.mime_type.clone(), + NodeKind::Directory => None, + NodeKind::AssociatedData => None, + NodeKind::InternalLink => None, + NodeKind::NativeMount => None, + NodeKind::Unknown(_) => None, + } + } } impl TryFrom<&Node> for DirectoryEntry { @@ -63,6 +76,8 @@ impl TryFrom<&Node> for DirectoryEntry { name: node.name().clone(), kind: node.kind().clone(), + mime_type: node.mime_type(), + size: node.size(), }) } diff --git a/src/filesystem/drive/directory_handle.rs b/src/filesystem/drive/directory_handle.rs index 1dc00db..e7003c0 100644 --- a/src/filesystem/drive/directory_handle.rs +++ b/src/filesystem/drive/directory_handle.rs @@ -13,7 +13,7 @@ use crate::codec::crypto::{AccessKey, SigningKey}; use crate::codec::data_storage::{data_chunk::DataChunk, DataBlock}; use crate::codec::filesystem::BlockKind; use crate::filesystem::drive::{DirectoryEntry, InnerDrive, OperationError, WalkState}; -use crate::filesystem::nodes::{Node, NodeData, NodeId, NodeName}; +use crate::filesystem::nodes::{MetadataKey, MimeGuesser, Node, NodeData, NodeId, NodeName}; use crate::filesystem::{ContentLocation, ContentReference, FileContent, NodeBuilder}; use crate::stores::DataStore; @@ -693,8 +693,15 @@ impl DirectoryHandle { let mut inner_write = self.inner.write().await; let node = inner_write.by_perm_id_mut(&new_permanent_id).await?; + if let Some(mime_type) = MimeGuesser::default() + .with_name(node.name().clone()) + .with_data(data) + .guess_mime_type() + { + node.set_attribute(MetadataKey::MimeType, mime_type.to_string().into()) + .await; + } let node_data = node.data_mut().await; - let file_content = FileContent::encrypted(locked_key, plaintext_cid, data_size, content_references); *node_data = NodeData::full_file(file_content); @@ -767,12 +774,13 @@ fn walk_path<'a>( mod test { use super::*; use crate::filesystem::drive::inner::test::build_interesting_inner; + use crate::prelude::MemoryDataStore; #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] async fn mv_dir_from_dir_to_cwd_specify_name() { let mut rng = crate::utils::crypto_rng(); - let mut handle = interesting_handle().await; + let mut handle = interesting_handle(None).await; handle .mv(&mut rng, &["dir_1", "dir_2"], &["dir_2_new"]) .await @@ -792,7 +800,7 @@ mod test { #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] async fn mv_dir_from_dir_to_dir_specify_name() { let mut rng = crate::utils::crypto_rng(); - let mut handle = interesting_handle().await; + let mut handle = interesting_handle(None).await; handle .mv( &mut rng, @@ -816,7 +824,7 @@ mod test { #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] async fn mv_file_from_dir_to_cwd_specify_name() { let mut rng = crate::utils::crypto_rng(); - let mut handle = interesting_handle().await; + let mut handle = interesting_handle(None).await; handle .mv( &mut rng, @@ -840,7 +848,7 @@ mod test { #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] async fn mv_file_from_dir_to_dir_specify_name() { let mut rng = crate::utils::crypto_rng(); - let mut handle = interesting_handle().await; + let mut handle = interesting_handle(None).await; handle .mv( &mut rng, @@ -864,7 +872,7 @@ mod test { #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] async fn mv_dir_from_dir_to_cwd_no_name() { let mut rng = crate::utils::crypto_rng(); - let mut handle = interesting_handle().await; + let mut handle = interesting_handle(None).await; handle.mv(&mut rng, &["dir_1", "dir_2"], &[]).await.unwrap(); let cwd_ls = handle.ls(&[]).await.unwrap(); @@ -881,7 +889,7 @@ mod test { #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] async fn mv_dir_from_dir_to_dir_no_name() { let mut rng = crate::utils::crypto_rng(); - let mut handle = interesting_handle().await; + let mut handle = interesting_handle(None).await; handle .mv(&mut rng, &["dir_1", "dir_2", "dir_3"], &["dir_1"]) .await @@ -901,7 +909,7 @@ mod test { #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] async fn mv_file_from_dir_to_cwd_no_name() { let mut rng = crate::utils::crypto_rng(); - let mut handle = interesting_handle().await; + let mut handle = interesting_handle(None).await; handle .mv(&mut rng, &["dir_1", "dir_2", "dir_3", "file_3"], &[]) .await @@ -921,7 +929,7 @@ mod test { #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] async fn mv_file_from_dir_to_dir_no_name() { let mut rng = crate::utils::crypto_rng(); - let mut handle = interesting_handle().await; + let mut handle = interesting_handle(None).await; handle .mv(&mut rng, &["dir_1", "dir_2", "dir_3", "file_3"], &["dir_1"]) .await @@ -937,7 +945,7 @@ mod test { ); } - async fn interesting_handle() -> DirectoryHandle { + async fn interesting_handle(current_key: Option) -> DirectoryHandle { // -----file_1 // / // root ---------file_2 @@ -948,13 +956,189 @@ mod test { // \ // ----file_5 let mut rng = crate::utils::crypto_rng(); - let inner = build_interesting_inner().await; + let inner = build_interesting_inner(current_key.clone()).await; let root_id = inner.root_node().unwrap().id(); let inner = Arc::new(RwLock::new(inner)); DirectoryHandle { - current_key: Arc::new(SigningKey::generate(&mut rng)), + current_key: Arc::new(current_key.unwrap_or_else(|| SigningKey::generate(&mut rng))), inner, cwd_id: root_id, } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] + async fn sniff_html_mime_type() { + let mut rng = crate::utils::crypto_rng(); + let current_key = SigningKey::generate(&mut rng); + let mut handle = interesting_handle(Some(current_key)).await; + let mut store = MemoryDataStore::default(); + + let test_cases = vec![ + (b"Test File

Hello World!

".to_vec(), "test.html"), + (b"Test File

Hello World!

".to_vec(), "TEST.HTML"), + (b"

Heading

Paragraph

".to_vec(), "file.htm"), + (b"
Some text
".to_vec(), "page.php"), + ( + b"Content".to_vec(), + "invalid_file_name", + ), + ]; + for (data, file_name) in test_cases { + handle + .write(&mut rng, &mut store, &[file_name], &data) + .await + .unwrap(); + + let cwd_ls = handle.ls(&[]).await.unwrap(); + assert_eq!( + cwd_ls + .iter() + .filter(|entry| entry.name() == NodeName::try_from(file_name).unwrap()) + .count(), + 1 + ); + + let file_entry = cwd_ls + .iter() + .find(|entry| entry.name() == NodeName::try_from(file_name).unwrap()) + .unwrap(); + + assert_eq!(file_entry.kind(), NodeKind::File); + + let file_data = handle.read(&mut store, &[file_name]).await.unwrap(); + assert_eq!(file_data.as_slice(), data); + + let mime_type = file_entry.mime_type().unwrap(); + assert_eq!(mime_type, "text/html"); + } + } + + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] + async fn sniff_mp3_file_mime_type() { + let mut rng = crate::utils::crypto_rng(); + let current_key = SigningKey::generate(&mut rng); + let mut handle = interesting_handle(Some(current_key)).await; + let mut store = MemoryDataStore::default(); + let mp3_test_case: &[u8] = &[ + 0x49, 0x44, 0x33, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x22, 0x54, 0x53, 0x53, 0x45, + 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x03, 0x4c, 0x61, 0x76, 0x66, 0x36, 0x30, 0x2e, + 0x33, 0x2e, 0x31, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xff, 0xfb, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + let file_name = "the_audio.mp4"; + handle + .write(&mut rng, &mut store, &[file_name], mp3_test_case) + .await + .unwrap(); + + let cwd_ls = handle.ls(&[]).await.unwrap(); + assert_eq!( + cwd_ls + .iter() + .filter(|entry| entry.name() == NodeName::try_from(file_name).unwrap()) + .count(), + 1 + ); + + let file_entry = cwd_ls + .iter() + .find(|entry| entry.name() == NodeName::try_from(file_name).unwrap()) + .unwrap(); + + assert_eq!(file_entry.kind(), NodeKind::File); + + let file_data = handle.read(&mut store, &[file_name]).await.unwrap(); + assert_eq!(file_data.as_slice(), mp3_test_case); + + let mime_type = file_entry.mime_type().unwrap(); + assert_eq!(mime_type, "audio/mpeg"); + } + + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] + async fn sniff_mp4_file_mime_type() { + let mut rng = crate::utils::crypto_rng(); + let current_key = SigningKey::generate(&mut rng); + let mut handle = interesting_handle(Some(current_key)).await; + let mut store = MemoryDataStore::default(); + let mp4_test_case: &[u8] = &[ + 0x00, 0x00, 0x00, 0x1c, 0x66, 0x74, 0x79, 0x70, 0x69, 0x73, 0x6f, 0x6d, 0x00, 0x00, + 0x02, 0x00, 0x69, 0x73, 0x6f, 0x6d, 0x69, 0x73, 0x6f, 0x32, 0x6d, 0x70, 0x34, 0x31, + 0x00, 0x00, 0x00, 0x08, + ]; + let file_name = "the_audio.mp3"; + handle + .write(&mut rng, &mut store, &[file_name], mp4_test_case) + .await + .unwrap(); + + let cwd_ls = handle.ls(&[]).await.unwrap(); + assert_eq!( + cwd_ls + .iter() + .filter(|entry| entry.name() == NodeName::try_from(file_name).unwrap()) + .count(), + 1 + ); + + let file_entry = cwd_ls + .iter() + .find(|entry| entry.name() == NodeName::try_from(file_name).unwrap()) + .unwrap(); + + assert_eq!(file_entry.kind(), NodeKind::File); + + let file_data = handle.read(&mut store, &[file_name]).await.unwrap(); + assert_eq!(file_data.as_slice(), mp4_test_case); + + let mime_type = file_entry.mime_type().unwrap(); + assert_eq!(mime_type, "video/mp4"); + } + + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] + async fn sniff_webm_file_mime_type() { + let mut rng = crate::utils::crypto_rng(); + let current_key = SigningKey::generate(&mut rng); + let mut handle = interesting_handle(Some(current_key)).await; + let mut store = MemoryDataStore::default(); + let webm_test_case: &[u8] = &[ + 0x1a, 0x45, 0xdf, 0xa3, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x42, 0x86, + 0x81, 0x01, 0x42, 0xf7, 0x81, 0x01, 0x42, 0xf2, 0x81, 0x04, 0x42, 0xf3, 0x81, 0x08, + 0x42, 0x82, 0x84, 0x77, 0x65, 0x62, 0x6d, 0x42, 0x87, 0x81, 0x02, 0x42, 0x85, 0x81, + 0x02, 0x18, 0x53, 0x80, 0x67, 0x01, 0x00, 0x00, 0x00, 0x00, 0x0d, 0xc0, 0x0a, 0x11, + 0x4d, 0x9b, 0x74, 0x40, 0x3c, 0x4d, 0xbb, 0x8b, 0x53, 0xab, 0x84, 0x15, 0x49, 0xa9, + 0x66, 0x53, 0xac, 0x81, 0xe5, 0x4d, 0xbb, 0x8c, 0x53, 0xab, + ]; + let file_name = "the_audio.mp4"; + handle + .write(&mut rng, &mut store, &[file_name], webm_test_case) + .await + .unwrap(); + + let cwd_ls = handle.ls(&[]).await.unwrap(); + assert_eq!( + cwd_ls + .iter() + .filter(|entry| entry.name() == NodeName::try_from(file_name).unwrap()) + .count(), + 1 + ); + + let file_entry = cwd_ls + .iter() + .find(|entry| entry.name() == NodeName::try_from(file_name).unwrap()) + .unwrap(); + + assert_eq!(file_entry.kind(), NodeKind::File); + + let file_data = handle.read(&mut store, &[file_name]).await.unwrap(); + assert_eq!(file_data.as_slice(), webm_test_case); + + let mime_type = file_entry.mime_type().unwrap(); + assert_eq!(mime_type, "video/webm"); + } } diff --git a/src/filesystem/drive/inner.rs b/src/filesystem/drive/inner.rs index 205fd90..34b830c 100644 --- a/src/filesystem/drive/inner.rs +++ b/src/filesystem/drive/inner.rs @@ -494,10 +494,10 @@ pub(crate) mod test { use super::*; - fn initialize_inner_drive() -> (ActorId, InnerDrive) { + fn initialize_inner_drive(signing_key: Option) -> (ActorId, InnerDrive) { let mut rng = crate::utils::crypto_rng(); - let signing_key = SigningKey::generate(&mut rng); + let signing_key = signing_key.unwrap_or_else(|| SigningKey::generate(&mut rng)); let verifying_key = signing_key.verifying_key(); let actor_id = verifying_key.actor_id(); @@ -511,7 +511,7 @@ pub(crate) mod test { #[test] fn test_drive_initialization() { - let (_, inner) = initialize_inner_drive(); + let (_, inner) = initialize_inner_drive(None); assert!(inner.nodes.capacity() == 32); assert!(inner.nodes.len() == 1); } @@ -520,7 +520,7 @@ pub(crate) mod test { #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] async fn test_node_creation() { let mut rng = crate::utils::crypto_rng(); - let (actor_id, mut inner) = initialize_inner_drive(); + let (actor_id, mut inner) = initialize_inner_drive(None); let create_node_res = inner .create_node( @@ -549,7 +549,7 @@ pub(crate) mod test { #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[cfg_attr(not(target_arch = "wasm32"), tokio::test)] async fn test_drive_round_tripping() { - let inner = build_interesting_inner().await; + let inner = build_interesting_inner(None).await; let access = inner.access(); let journal = inner.journal_start(); @@ -570,9 +570,9 @@ pub(crate) mod test { } // A fixture to make a relatively interesting inner - pub(crate) async fn build_interesting_inner() -> InnerDrive { + pub(crate) async fn build_interesting_inner(current_key: Option) -> InnerDrive { let mut rng = crate::utils::crypto_rng(); - let (actor_id, mut inner) = initialize_inner_drive(); + let (actor_id, mut inner) = initialize_inner_drive(current_key); // -----file_1 // / diff --git a/src/filesystem/nodes/metadata/mime_type.rs b/src/filesystem/nodes/metadata/mime_type.rs new file mode 100644 index 0000000..eee2a41 --- /dev/null +++ b/src/filesystem/nodes/metadata/mime_type.rs @@ -0,0 +1,234 @@ +use crate::prelude::nodes::NodeName; + +#[derive(Default)] +pub struct MimeGuesser { + name: Option, + data: Vec, +} + +impl MimeGuesser { + pub fn with_name(mut self, name: NodeName) -> Self { + match name { + NodeName::Named(name) => self.name = Some(name.clone()), + NodeName::Root => {} + } + self + } + + pub fn with_data(mut self, data: &[u8]) -> Self { + self.data.extend_from_slice(data); + self + } + + pub fn guess_mime_type(&self) -> Option { + self.pattern_match() + .or_else(|| self.algorithm_match()) + .or_else(|| self.extension_match()) + } + + fn extension_match(&self) -> Option { + if let Some(name) = self.name.as_ref() { + let guess = mime_guess::from_path(name); + if !guess.is_empty() { + return mime::MediaType::parse(guess.first()?.as_ref()).ok(); + } + } + None + } + + fn pattern_match(&self) -> Option { + let magic_bytes = &self.data[..]; + + // Taken from https://mimesniff.spec.whatwg.org/ + match magic_bytes { + &[0xFF, 0xD8, 0xFF, ..] => Some(mime::IMAGE_JPEG), + &[0x89, b'P', b'N', b'G', 0x0D, 0x0A, 0x1A, 0x0A, ..] => Some(mime::IMAGE_PNG), + &[b'G', b'I', b'F', b'8', b'7', b'a', ..] + | &[b'G', b'I', b'F', b'8', b'9', b'a', ..] => Some(mime::IMAGE_GIF), + &[b'B', b'M', ..] => Some(mime::IMAGE_BMP), + &[b'<', b'?', b'x', b'm', b'l', ..] => Some(mime::TEXT_XML), + &[b'<', b's', b'v', b'g', ..] => Some(mime::IMAGE_SVG), + &[b'w', b'O', b'F', b'F', ..] => Some(mime::FONT_WOFF), + &[b'w', b'O', b'F', b'2', ..] => Some(mime::FONT_WOFF2), + &[b'%', b'P', b'D', b'F', b'-', ..] => Some(mime::APPLICATION_PDF), + &[b'{', ..] => Some(mime::APPLICATION_JSON), + &[b'F', b'O', b'R', b'M', _, _, _, _, b'A', b'I', b'F', b'F', ..] => { + Some(mime::AUDIO_AIFF) + } + &[b'I', b'D', b'3', ..] => Some(mime::AUDIO_MPEG), + &[b'O', b'g', b'g', b'S', 0, ..] => Some(mime::AUDIO_OGG), + &[b'M', b'T', b'h', b'd', 0, 0, 0, 0x06, ..] => Some(mime::AUDIO_MIDI), + &[b'R', b'I', b'F', b'F', _, _, _, _, b'A', b'V', b'I', b' ', ..] => { + Some(mime::VIDEO_AVI) + } + &[b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'A', b'V', b'E', ..] => { + Some(mime::AUDIO_WAVE) + } + &[0x1F, 0x8B, 0x08, ..] => Some(mime::APPLICATION_GZIP), + &[b'P', b'K', 0x03, 0x04, ..] => Some(mime::APPLICATION_ZIP), + &[b'R', b'a', b'r', b' ', 0x1A, 0x07, 0, ..] => Some(mime::APPLICATION_RAR), + &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, b'L', b'P'] => { + Some(mime::APPLICATION_VND_MS_FONTOBJECT) + } + &[0, 0x01, 0, 0, ..] => Some(mime::FONT_TTF), + &[b'O', b'T', b'T', b'O', ..] => Some(mime::FONT_OTF), + &[b't', b't', b'c', b'f', ..] => Some(mime::FONT_COLLECTION), + &[b'%', b'!', b'P', b'S', b'-', b'A', b'd', b'o', b'b', b'e', b'-', ..] => { + Some(mime::APPLICATION_POSTSCRIPT) + } + &[0xFE, 0xFF, 0, 0, ..] | &[0xFF, 0xFE, 0, 0, ..] | &[0xEF, 0xBB, 0xBF, 0, ..] => { + Some(mime::TEXT_PLAIN) + } + [b'<', ..] => { + match &magic_bytes[1..] + .iter() + .map(|&b| b.to_ascii_uppercase()) + .collect::>() + .as_slice() + { + [b'!', b'D', b'O', b'C', b'T', b'Y', b'P', b'E', b' ', b'H', b'T', b'M', b'L', tt, ..] + if is_whitespace_or_tag_terminator(*tt) => + { + Some(mime::TEXT_HTML) + } + [b'H', b'T', b'M', b'L', tt, ..] if is_whitespace_or_tag_terminator(*tt) => { + Some(mime::TEXT_HTML) + } + [b'H', b'E', b'A', b'D', tt, ..] if is_whitespace_or_tag_terminator(*tt) => { + Some(mime::TEXT_HTML) + } + [b'S', b'C', b'R', b'I', b'P', b'T', tt, ..] + if is_whitespace_or_tag_terminator(*tt) => + { + Some(mime::TEXT_HTML) + } + [b'I', b'F', b'R', b'A', b'M', b'E', tt, ..] + if is_whitespace_or_tag_terminator(*tt) => + { + Some(mime::TEXT_HTML) + } + [b'H', b'1', tt, ..] if is_whitespace_or_tag_terminator(*tt) => { + Some(mime::TEXT_HTML) + } + [b'D', b'I', b'V', tt, ..] if is_whitespace_or_tag_terminator(*tt) => { + Some(mime::TEXT_HTML) + } + [b'F', b'O', b'N', b'T', tt, ..] if is_whitespace_or_tag_terminator(*tt) => { + Some(mime::TEXT_HTML) + } + [b'T', b'A', b'B', b'L', b'E', tt, ..] + if is_whitespace_or_tag_terminator(*tt) => + { + Some(mime::TEXT_HTML) + } + [b'A', tt, ..] if is_whitespace_or_tag_terminator(*tt) => Some(mime::TEXT_HTML), + [b'S', b'T', b'Y', b'L', b'E', tt, ..] + if is_whitespace_or_tag_terminator(*tt) => + { + Some(mime::TEXT_HTML) + } + [b'T', b'I', b'T', b'L', b'E', tt, ..] + if is_whitespace_or_tag_terminator(*tt) => + { + Some(mime::TEXT_HTML) + } + [b'B', tt, ..] if is_whitespace_or_tag_terminator(*tt) => Some(mime::TEXT_HTML), + [b'B', b'O', b'D', b'Y', tt, ..] if is_whitespace_or_tag_terminator(*tt) => { + Some(mime::TEXT_HTML) + } + [b'B', b'R', tt, ..] if is_whitespace_or_tag_terminator(*tt) => { + Some(mime::TEXT_HTML) + } + [b'P', tt, ..] if is_whitespace_or_tag_terminator(*tt) => Some(mime::TEXT_HTML), + _ => None, + } + } + _ => None, + } + } + + fn algorithm_match(&self) -> Option { + if self.is_mp4() { + return Some(mime::VIDEO_MP4); + } + if self.is_webm() { + return Some(mime::VIDEO_WEBM); + } + + None + } + + fn is_mp4(&self) -> bool { + let data = &self.data; + if data.len() < 12 { + return false; + } + let box_size = u32::from_be_bytes([data[0], data[1], data[2], data[3]]); + if data.len() < box_size as usize || box_size % 4 != 0 { + return false; + } + + data.get(4..8) == Some(b"ftyp") + && (data.get(8..11) == Some(b"mp4") + || data[16..] + .chunks_exact(4) + .any(|chunk| chunk.starts_with(b"mp4"))) + } + fn is_webm(&self) -> bool { + let data = &self.data; + if data.len() < 4 || data[..4] != [0x1A, 0x45, 0xDF, 0xA3] { + return false; + } + + let skip_first_bytes = 4; + let chunk_size = 2; + let magic_bytes_delim = [0x42, 0x82]; + for (chunk_idx, chunk) in data[skip_first_bytes..].chunks(chunk_size).enumerate() { + // went over 4 + 2 * 17 = 38 bytes + if chunk_idx >= 17 { + break; + } + + if chunk != magic_bytes_delim { + continue; + } + + let offset = skip_first_bytes + chunk_idx * chunk_size + magic_bytes_delim.len(); + if let Some((_, number_size)) = data.get(offset..).map(|d| parse_vint(d, 0)) { + let start = offset + number_size; + let end = start + 4; + if data.get(start..end) == Some(b"webm") { + return true; + } + } + } + + false + } +} + +fn parse_vint(data: &[u8], offset: usize) -> (usize, usize) { + let mut mask = 128; + let max_vint_length = 8; + let mut number_size = 1; + + while number_size < max_vint_length + && data.get(offset).is_some() + && (data.get(offset).expect("already checked") & mask == 0) + { + mask >>= 1; + number_size += 1; + } + + let mut parsed_number = data.get(offset).map_or(0, |&b| (b & !mask) as usize); + + for &b in data.get(offset + 1..offset + number_size).unwrap_or(&[]) { + parsed_number = (parsed_number << 8) | b as usize; + } + + (parsed_number, number_size) +} + +fn is_whitespace_or_tag_terminator(byte: u8) -> bool { + byte == b' ' || byte == b'>' +} diff --git a/src/filesystem/nodes/metadata/mod.rs b/src/filesystem/nodes/metadata/mod.rs new file mode 100644 index 0000000..03a3db2 --- /dev/null +++ b/src/filesystem/nodes/metadata/mod.rs @@ -0,0 +1,59 @@ +use std::str::FromStr; +mod mime_type; + +pub use mime_type::MimeGuesser; + +#[derive(Hash, Eq, PartialEq, Debug)] +pub enum MetadataKey { + MimeType, + Custom(String), +} + +impl MetadataKey { + pub fn as_str(&self) -> &str { + match self { + MetadataKey::MimeType => "mime", + MetadataKey::Custom(s) => s.as_str(), + } + } + + pub fn as_bytes(&self) -> Vec { + match self { + MetadataKey::MimeType => b"mime".to_vec(), + MetadataKey::Custom(s) => s.as_bytes().to_vec(), + } + } + + pub fn from_bytes(key: &[u8]) -> Option { + match key { + b"mime" => Some(MetadataKey::MimeType), + _ => { + if key.len() > 255 { + return None; + } + + match std::str::from_utf8(key) { + Ok(s) => Some(MetadataKey::Custom(s.to_string())), + Err(_) => None, + } + } + } + } +} + +impl FromStr for MetadataKey { + type Err = winnow::error::ErrorKind; + + fn from_str(s: &str) -> Result { + match s { + "mime" => Ok(MetadataKey::MimeType), + _ => { + if s.len() > 255 { + return Err(winnow::error::ErrorKind::Verify); + } + + Ok(MetadataKey::Custom(s.to_string())) + } + } + } +} diff --git a/src/filesystem/nodes/mod.rs b/src/filesystem/nodes/mod.rs index 93bb1fb..0576212 100644 --- a/src/filesystem/nodes/mod.rs +++ b/src/filesystem/nodes/mod.rs @@ -13,6 +13,7 @@ //! guarantee the major version will be increased when a breaking change is made). mod cid_cache; +mod metadata; mod node_builder; mod node_data; mod node_name; @@ -25,8 +26,10 @@ pub use node_name::{NodeName, NodeNameError}; use std::collections::HashMap; use std::io::{Error as StdError, ErrorKind as StdErrorKind}; +use std::str::FromStr; use futures::{AsyncWrite, AsyncWriteExt}; +use mime; use winnow::binary::{le_i64, le_u32, le_u8}; use winnow::stream::Offset; use winnow::token::take; @@ -36,6 +39,7 @@ use crate::codec::filesystem::NodeKind; use crate::codec::meta::{ActorId, Cid, PermanentId}; use crate::codec::{ParserResult, Stream, VectorClock}; use crate::filesystem::drive::OperationError; +pub use crate::prelude::nodes::metadata::{MetadataKey, MimeGuesser}; pub(crate) type NodeId = usize; @@ -75,7 +79,7 @@ pub struct Node { modified_at: i64, name: NodeName, - metadata: HashMap>, + metadata: HashMap>, inner: NodeData, } @@ -193,7 +197,7 @@ impl Node { node_data.write_all(&[entry_count]).await?; let mut sorted_metadata = self.metadata.iter().collect::>(); - sorted_metadata.sort_by(|(a, _), (b, _)| a.as_bytes().cmp(b.as_bytes())); + sorted_metadata.sort_by(|(a, _), (b, _)| a.as_bytes().cmp(&b.as_bytes())); for (key, val) in sorted_metadata.into_iter() { let key_bytes = key.as_bytes(); @@ -204,7 +208,7 @@ impl Node { } node_data.write_all(&[key_bytes_len as u8]).await?; - node_data.write_all(key_bytes).await?; + node_data.write_all(&key_bytes).await?; let val_bytes_len = val.len(); if val_bytes_len > u8::MAX as usize { @@ -250,7 +254,7 @@ impl Node { self.inner.kind() } - pub fn metadata(&self) -> &HashMap> { + pub fn metadata(&self) -> &HashMap> { &self.metadata } @@ -355,18 +359,17 @@ impl Node { for _ in 0..metadata_entries { let (meta_buf, key_len) = le_u8.parse_peek(input)?; let (meta_buf, key) = take(key_len).parse_peek(meta_buf)?; - let key_str = String::from_utf8(key.to_vec()).map_err(|_| { + let key_metadata = MetadataKey::from_bytes(key).ok_or_else(|| { winnow::error::ErrMode::Cut(winnow::error::ParserError::from_error_kind( &input, winnow::error::ErrorKind::Token, )) })?; - let (meta_buf, val_len) = le_u8.parse_peek(meta_buf)?; let (meta_buf, val) = take(val_len).parse_peek(meta_buf)?; let val = val.to_vec(); - metadata.insert(key_str, val); + metadata.insert(key_metadata, val); input = meta_buf; } @@ -440,7 +443,16 @@ impl Node { self.permanent_id } - pub async fn set_attribute(&mut self, key: String, value: Vec) -> Option> { + pub fn mime_type(&self) -> Option { + self.metadata + .get(&MetadataKey::MimeType) + .and_then(|mime_str| match std::str::from_utf8(mime_str) { + Ok(s) => Some(mime::MediaType::from_str(s).ok()?), + Err(_) => None, + }) + } + + pub async fn set_attribute(&mut self, key: MetadataKey, value: Vec) -> Option> { let old_value = self.metadata.insert(key, value); self.notify_of_change().await; old_value diff --git a/src/filesystem/nodes/node_builder.rs b/src/filesystem/nodes/node_builder.rs index 181a222..43c95e4 100644 --- a/src/filesystem/nodes/node_builder.rs +++ b/src/filesystem/nodes/node_builder.rs @@ -7,6 +7,7 @@ use crate::codec::meta::{ActorId, VectorClock}; use crate::filesystem::nodes::{ CidCache, Node, NodeData, NodeId, NodeName, NodeNameError, PermanentId, }; +use crate::prelude::nodes::metadata::MetadataKey; pub(crate) struct NodeBuilder { id: Option, @@ -17,7 +18,7 @@ pub(crate) struct NodeBuilder { size_hint: Option, kind: NodeKind, - metadata: HashMap>, + metadata: HashMap>, } impl NodeBuilder {