diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 69db9d97bc2c..3e9b021e2ef4 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -1471,6 +1471,23 @@ mod tests { storage.delete(&path).await.unwrap(); + // Test handling of unicode paths + let path = Path::parse("🇦🇺/$shenanigans@@~.txt").unwrap(); + storage.put(&path, "test".into()).await.unwrap(); + + let r = storage.get(&path).await.unwrap(); + assert_eq!(r.bytes().await.unwrap(), "test"); + + let dir = Path::parse("🇦🇺").unwrap(); + let r = storage.list_with_delimiter(None).await.unwrap(); + assert!(r.common_prefixes.contains(&dir)); + + let r = storage.list_with_delimiter(Some(&dir)).await.unwrap(); + assert_eq!(r.objects.len(), 1); + assert_eq!(r.objects[0].location, path); + + storage.delete(&path).await.unwrap(); + // Can also write non-percent encoded sequences let path = Path::parse("%Q.parquet").unwrap(); storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 919baf71b0a8..009f7d3f075f 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -144,6 +144,11 @@ pub(crate) enum Error { path: PathBuf, source: io::Error, }, + + #[snafu(display("Filenames containing trailing '/#\\d+/' are not supported: {}", path))] + InvalidPath { + path: String, + }, } impl From for super::Error { @@ -176,6 +181,30 @@ impl From for super::Error { /// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme /// [RFC 3986]: https://www.rfc-editor.org/rfc/rfc3986 /// +/// # Path Semantics +/// +/// [`LocalFileSystem`] will expose the path semantics of the underlying filesystem, which may +/// have additional restrictions beyond those enforced by [`Path`]. +/// +/// For example: +/// +/// * Windows forbids certain filenames, e.g. `COM0`, +/// * Windows forbids folders with trailing `.` +/// * Windows forbids certain ASCII characters, e.g. `<` or `|` +/// * OS X forbids filenames containing `:` +/// * Leading `-` are discouraged on Unix systems where they may be interpreted as CLI flags +/// * Filesystems may have restrictions on the maximum path or path segment length +/// * Filesystem support for non-ASCII characters is inconsistent +/// +/// Additionally some filesystems, such as NTFS, are case-insensitive, whilst others like +/// FAT don't preserve case at all. Further some filesystems support non-unicode character +/// sequences, such as unpaired UTF-16 surrogates, [`LocalFileSystem`] will error on +/// encountering such sequences. +/// +/// Finally, filenames matching the regex `/.*#\d+/`, e.g. `foo.parquet#123`, are not supported +/// by [`LocalFileSystem`] as they are used to provide atomic writes. Such files will be ignored +/// for listing operations, and attempting to address such a file will yield an error. +/// /// # Tokio Compatibility /// /// Tokio discourages performing blocking IO on a tokio worker thread, however, @@ -196,6 +225,11 @@ impl From for super::Error { /// * Mutating a file through one or more symlinks will mutate the underlying file /// * Deleting a path that resolves to a symlink will only delete the symlink /// +/// # Cross-Filesystem Copy +/// +/// [`LocalFileSystem::copy`] is implemented using [`std::fs::hard_link`], and therefore +/// does not support copying across filesystem boundaries. +/// #[derive(Debug)] pub struct LocalFileSystem { config: Arc, @@ -246,8 +280,19 @@ impl LocalFileSystem { } impl Config { - /// Return an absolute filesystem path of the given location + /// Return an absolute filesystem path of the given file location fn path_to_filesystem(&self, location: &Path) -> Result { + ensure!( + is_valid_file_path(location), + InvalidPathSnafu { + path: location.as_ref() + } + ); + self.prefix_to_filesystem(location) + } + + /// Return an absolute filesystem path of the given location + fn prefix_to_filesystem(&self, location: &Path) -> Result { let mut url = self.root.clone(); url.path_segments_mut() .expect("url path") @@ -269,6 +314,19 @@ impl Config { } } +fn is_valid_file_path(path: &Path) -> bool { + match path.filename() { + Some(p) => match p.split_once('#') { + Some((_, suffix)) if !suffix.is_empty() => { + // Valid if contains non-digits + !suffix.as_bytes().iter().all(|x| x.is_ascii_digit()) + } + _ => true, + }, + None => false, + } +} + #[async_trait] impl ObjectStore for LocalFileSystem { async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { @@ -445,7 +503,7 @@ impl ObjectStore for LocalFileSystem { let config = Arc::clone(&self.config); let root_path = match prefix { - Some(prefix) => match config.path_to_filesystem(prefix) { + Some(prefix) => match config.prefix_to_filesystem(prefix) { Ok(path) => path, Err(e) => return futures::future::ready(Err(e)).into_stream().boxed(), }, @@ -458,20 +516,21 @@ impl ObjectStore for LocalFileSystem { .follow_links(true); let s = walkdir.into_iter().flat_map(move |result_dir_entry| { - match convert_walkdir_result(result_dir_entry) { + let entry = match convert_walkdir_result(result_dir_entry).transpose()? { + Ok(entry) => entry, + Err(e) => return Some(Err(e)), + }; + + if !entry.path().is_file() { + return None; + } + + match config.filesystem_to_path(entry.path()) { + Ok(path) => match is_valid_file_path(&path) { + true => Some(convert_entry(entry, path)), + false => None, + }, Err(e) => Some(Err(e)), - Ok(None) => None, - Ok(entry @ Some(_)) => entry - .filter(|dir_entry| { - dir_entry.file_type().is_file() - // Ignore file names with # in them, since they might be in-progress uploads. - // They would be rejected anyways by filesystem_to_path below. - && !dir_entry.file_name().to_string_lossy().contains('#') - }) - .map(|entry| { - let location = config.filesystem_to_path(entry.path())?; - convert_entry(entry, location) - }), } }); @@ -512,7 +571,7 @@ impl ObjectStore for LocalFileSystem { let config = Arc::clone(&self.config); let prefix = prefix.cloned().unwrap_or_default(); - let resolved_prefix = config.path_to_filesystem(&prefix)?; + let resolved_prefix = config.prefix_to_filesystem(&prefix)?; maybe_spawn_blocking(move || { let walkdir = WalkDir::new(&resolved_prefix) @@ -525,15 +584,11 @@ impl ObjectStore for LocalFileSystem { for entry_res in walkdir.into_iter().map(convert_walkdir_result) { if let Some(entry) = entry_res? { - if entry.file_type().is_file() - // Ignore file names with # in them, since they might be in-progress uploads. - // They would be rejected anyways by filesystem_to_path below. - && entry.file_name().to_string_lossy().contains('#') - { - continue; - } let is_directory = entry.file_type().is_dir(); let entry_location = config.filesystem_to_path(entry.path())?; + if !is_directory && !is_valid_file_path(&entry_location) { + continue; + } let mut parts = match entry_location.prefix_match(&prefix) { Some(parts) => parts, @@ -1364,26 +1419,18 @@ mod tests { assert!(result.common_prefixes.is_empty()); assert_eq!(result.objects[0].location, object); - let illegal = root.join("💀"); - std::fs::write(illegal, "foo").unwrap(); - - // Can list directory that doesn't contain illegal path - flatten_list_stream(&integration, Some(&directory)) - .await - .unwrap(); + let emoji = root.join("💀"); + std::fs::write(emoji, "foo").unwrap(); - // Cannot list illegal file - let err = flatten_list_stream(&integration, None) - .await - .unwrap_err() - .to_string(); + // Can list illegal file + let paths = flatten_list_stream(&integration, None).await.unwrap(); - assert!( - err.contains( - "Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\"" - ), - "{}", - err + assert_eq!( + paths, + vec![ + Path::parse("💀").unwrap(), + Path::parse("directory/child.txt").unwrap() + ] ); } @@ -1442,6 +1489,51 @@ mod tests { let path = Path::from_filesystem_path(".").unwrap(); integration.list_with_delimiter(Some(&path)).await.unwrap(); } + + #[test] + fn test_valid_path() { + let cases = [ + ("foo#123/test.txt", true), + ("foo#123/test#23.txt", true), + ("foo#123/test#34", false), + ("foo😁/test#34", false), + ("foo/test#😁34", true), + ]; + + for (case, expected) in cases { + let path = Path::parse(case).unwrap(); + assert_eq!(is_valid_file_path(&path), expected); + } + } + + #[tokio::test] + async fn test_intermediate_files() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let a = Path::parse("foo#123/test.txt").unwrap(); + integration.put(&a, "test".into()).await.unwrap(); + + let list = flatten_list_stream(&integration, None).await.unwrap(); + assert_eq!(list, vec![a.clone()]); + + std::fs::write(root.path().join("bar#123"), "test").unwrap(); + + // Should ignore file + let list = flatten_list_stream(&integration, None).await.unwrap(); + assert_eq!(list, vec![a.clone()]); + + let b = Path::parse("bar#123").unwrap(); + let err = integration.get(&b).await.unwrap_err().to_string(); + assert_eq!(err, "Generic LocalFileSystem error: Filenames containing trailing '/#\\d+/' are not supported: bar#123"); + + let c = Path::parse("foo#123.txt").unwrap(); + integration.put(&c, "test".into()).await.unwrap(); + + let mut list = flatten_list_stream(&integration, None).await.unwrap(); + list.sort_unstable(); + assert_eq!(list, vec![c, a]); + } } #[cfg(not(target_arch = "wasm32"))] diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index e065c31d3145..77b3cf7d0d4f 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -65,7 +65,15 @@ pub enum Error { /// A parsed path representation that can be safely written to object storage /// -/// # Path Safety +/// A [`Path`] maintains the following invariants: +/// +/// * Paths are delimited by `/` +/// * Paths do not contain leading or trailing `/` +/// * Paths do not contain relative path segments, i.e. `.` or `..` +/// * Paths do not contain empty path segments +/// * Paths do not contain any ASCII control characters +/// +/// # Encode /// /// In theory object stores support any UTF-8 character sequence, however, certain character /// sequences cause compatibility problems with some applications and protocols. As such the @@ -76,34 +84,16 @@ pub enum Error { /// [GCS]: https://cloud.google.com/storage/docs/naming-objects /// [Azure Blob Storage]: https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names /// -/// This presents libraries with two options for consistent path handling: -/// -/// 1. Allow constructing unsafe paths, allowing for both reading and writing of data to paths -/// that may not be consistently understood or supported -/// 2. Disallow constructing unsafe paths, ensuring data written can be consistently handled by -/// all other systems, but preventing interaction with objects at unsafe paths -/// -/// This library takes the second approach, in particular: -/// -/// * Paths are delimited by `/` -/// * Paths do not start with a `/` -/// * Empty path segments are discarded (e.g. `//` is treated as though it were `/`) -/// * Relative path segments, i.e. `.` and `..` are percent encoded -/// * Unsafe characters are percent encoded, as described by [RFC 1738] -/// * All paths are relative to the root of the object store -/// -/// In order to provide these guarantees there are two ways to safely construct a [`Path`] -/// -/// # Encode -/// -/// A string containing potentially illegal path segments can be encoded to a [`Path`] -/// using [`Path::from`] or [`Path::from_iter`]. +/// A string containing potentially problematic path segments can therefore be encoded to a [`Path`] +/// using [`Path::from`] or [`Path::from_iter`]. This will percent encode any problematic +/// segments according to [RFC 1738]. /// /// ``` /// # use object_store::path::Path; /// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar"); /// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar"); /// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar"); +/// assert_eq!(Path::from("/").as_ref(), ""); /// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar"); /// ``` /// @@ -116,17 +106,16 @@ pub enum Error { /// /// # Parse /// -/// Alternatively a [`Path`] can be created from an existing string, returning an -/// error if it is invalid. Unlike the encoding methods, this will permit -/// valid percent encoded sequences. +/// Alternatively a [`Path`] can be parsed from an existing string, returning an +/// error if it is invalid. Unlike the encoding methods above, this will permit +/// arbitrary unicode, including percent encoded sequences. /// /// ``` /// # use object_store::path::Path; -/// /// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(), "foo/foo%2Fbar"); -/// Path::parse("..").unwrap_err(); -/// Path::parse("/foo//").unwrap_err(); -/// Path::parse("😀").unwrap_err(); +/// Path::parse("..").unwrap_err(); // Relative path segments are disallowed +/// Path::parse("/foo//").unwrap_err(); // Empty path segments are disallowed +/// Path::parse("\x00").unwrap_err(); // ASCII control characters are disallowed /// ``` /// /// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt @@ -236,7 +225,7 @@ impl Path { pub fn filename(&self) -> Option<&str> { match self.raw.is_empty() { true => None, - false => self.raw.split(DELIMITER).last(), + false => self.raw.rsplit(DELIMITER).next(), } } diff --git a/object_store/src/path/parts.rs b/object_store/src/path/parts.rs index 9da4815712db..df7097cbe9db 100644 --- a/object_store/src/path/parts.rs +++ b/object_store/src/path/parts.rs @@ -37,8 +37,10 @@ pub struct InvalidPart { /// The PathPart type exists to validate the directory/file names that form part /// of a path. /// -/// A PathPart instance is guaranteed to to contain no illegal characters (e.g. `/`) -/// as it can only be constructed by going through the `from` impl. +/// A [`PathPart`] is guaranteed to: +/// +/// * Contain no ASCII control characters or `/` +/// * Not be a relative path segment, i.e. `.` or `..` #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] pub struct PathPart<'a> { pub(super) raw: Cow<'a, str>, @@ -54,19 +56,12 @@ impl<'a> PathPart<'a> { }); } - for (idx, b) in segment.as_bytes().iter().cloned().enumerate() { - // A percent character is always valid, even if not - // followed by a valid 2-digit hex code - // https://url.spec.whatwg.org/#percent-encoded-bytes - if b == b'%' { - continue; - } - - if !b.is_ascii() || should_percent_encode(b) { + for c in segment.chars() { + if c.is_ascii_control() || c == '/' { return Err(InvalidPart { segment: segment.to_string(), // This is correct as only single byte characters up to this point - illegal: segment.chars().nth(idx).unwrap().to_string(), + illegal: c.to_string(), }); } } @@ -77,10 +72,6 @@ impl<'a> PathPart<'a> { } } -fn should_percent_encode(c: u8) -> bool { - percent_encode(&[c], INVALID).next().unwrap().len() != 1 -} - /// Characters we want to encode. const INVALID: &AsciiSet = &CONTROLS // The delimiter we are reserving for internal hierarchy