Skip to content

Commit

Permalink
Relax path safety (#5019)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Nov 1, 2023
1 parent 65f7be8 commit 16ec475
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 88 deletions.
17 changes: 17 additions & 0 deletions object_store/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1471,6 +1471,23 @@ mod tests {

storage.delete(&path).await.unwrap();

// Test handling of unicode paths
let path = Path::parse("🇦🇺/$shenanigans@@~.txt").unwrap();
storage.put(&path, "test".into()).await.unwrap();

let r = storage.get(&path).await.unwrap();
assert_eq!(r.bytes().await.unwrap(), "test");

let dir = Path::parse("🇦🇺").unwrap();
let r = storage.list_with_delimiter(None).await.unwrap();
assert!(r.common_prefixes.contains(&dir));

let r = storage.list_with_delimiter(Some(&dir)).await.unwrap();
assert_eq!(r.objects.len(), 1);
assert_eq!(r.objects[0].location, path);

storage.delete(&path).await.unwrap();

// Can also write non-percent encoded sequences
let path = Path::parse("%Q.parquet").unwrap();
storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap();
Expand Down
174 changes: 133 additions & 41 deletions object_store/src/local.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ pub(crate) enum Error {
path: PathBuf,
source: io::Error,
},

#[snafu(display("Filenames containing trailing '/#\\d+/' are not supported: {}", path))]
InvalidPath {
path: String,
},
}

impl From<Error> for super::Error {
Expand Down Expand Up @@ -176,6 +181,30 @@ impl From<Error> for super::Error {
/// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme
/// [RFC 3986]: https://www.rfc-editor.org/rfc/rfc3986
///
/// # Path Semantics
///
/// [`LocalFileSystem`] will expose the path semantics of the underlying filesystem, which may
/// have additional restrictions beyond those enforced by [`Path`].
///
/// For example:
///
/// * Windows forbids certain filenames, e.g. `COM0`,
/// * Windows forbids folders with trailing `.`
/// * Windows forbids certain ASCII characters, e.g. `<` or `|`
/// * OS X forbids filenames containing `:`
/// * Leading `-` are discouraged on Unix systems where they may be interpreted as CLI flags
/// * Filesystems may have restrictions on the maximum path or path segment length
/// * Filesystem support for non-ASCII characters is inconsistent
///
/// Additionally some filesystems, such as NTFS, are case-insensitive, whilst others like
/// FAT don't preserve case at all. Further some filesystems support non-unicode character
/// sequences, such as unpaired UTF-16 surrogates, [`LocalFileSystem`] will error on
/// encountering such sequences.
///
/// Finally, filenames matching the regex `/.*#\d+/`, e.g. `foo.parquet#123`, are not supported
/// by [`LocalFileSystem`] as they are used to provide atomic writes. Such files will be ignored
/// for listing operations, and attempting to address such a file will yield an error.
///
/// # Tokio Compatibility
///
/// Tokio discourages performing blocking IO on a tokio worker thread, however,
Expand All @@ -196,6 +225,11 @@ impl From<Error> for super::Error {
/// * Mutating a file through one or more symlinks will mutate the underlying file
/// * Deleting a path that resolves to a symlink will only delete the symlink
///
/// # Cross-Filesystem Copy
///
/// [`LocalFileSystem::copy`] is implemented using [`std::fs::hard_link`], and therefore
/// does not support copying across filesystem boundaries.
///
#[derive(Debug)]
pub struct LocalFileSystem {
config: Arc<Config>,
Expand Down Expand Up @@ -246,8 +280,19 @@ impl LocalFileSystem {
}

impl Config {
/// Return an absolute filesystem path of the given location
/// Return an absolute filesystem path of the given file location
fn path_to_filesystem(&self, location: &Path) -> Result<PathBuf> {
ensure!(
is_valid_file_path(location),
InvalidPathSnafu {
path: location.as_ref()
}
);
self.prefix_to_filesystem(location)
}

/// Return an absolute filesystem path of the given location
fn prefix_to_filesystem(&self, location: &Path) -> Result<PathBuf> {
let mut url = self.root.clone();
url.path_segments_mut()
.expect("url path")
Expand All @@ -269,6 +314,19 @@ impl Config {
}
}

fn is_valid_file_path(path: &Path) -> bool {
match path.filename() {
Some(p) => match p.split_once('#') {
Some((_, suffix)) if !suffix.is_empty() => {
// Valid if contains non-digits
!suffix.as_bytes().iter().all(|x| x.is_ascii_digit())
}
_ => true,
},
None => false,
}
}

#[async_trait]
impl ObjectStore for LocalFileSystem {
async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result<PutResult> {
Expand Down Expand Up @@ -445,7 +503,7 @@ impl ObjectStore for LocalFileSystem {
let config = Arc::clone(&self.config);

let root_path = match prefix {
Some(prefix) => match config.path_to_filesystem(prefix) {
Some(prefix) => match config.prefix_to_filesystem(prefix) {
Ok(path) => path,
Err(e) => return futures::future::ready(Err(e)).into_stream().boxed(),
},
Expand All @@ -458,20 +516,21 @@ impl ObjectStore for LocalFileSystem {
.follow_links(true);

let s = walkdir.into_iter().flat_map(move |result_dir_entry| {
match convert_walkdir_result(result_dir_entry) {
let entry = match convert_walkdir_result(result_dir_entry).transpose()? {
Ok(entry) => entry,
Err(e) => return Some(Err(e)),
};

if !entry.path().is_file() {
return None;
}

match config.filesystem_to_path(entry.path()) {
Ok(path) => match is_valid_file_path(&path) {
true => Some(convert_entry(entry, path)),
false => None,
},
Err(e) => Some(Err(e)),
Ok(None) => None,
Ok(entry @ Some(_)) => entry
.filter(|dir_entry| {
dir_entry.file_type().is_file()
// Ignore file names with # in them, since they might be in-progress uploads.
// They would be rejected anyways by filesystem_to_path below.
&& !dir_entry.file_name().to_string_lossy().contains('#')
})
.map(|entry| {
let location = config.filesystem_to_path(entry.path())?;
convert_entry(entry, location)
}),
}
});

Expand Down Expand Up @@ -512,7 +571,7 @@ impl ObjectStore for LocalFileSystem {
let config = Arc::clone(&self.config);

let prefix = prefix.cloned().unwrap_or_default();
let resolved_prefix = config.path_to_filesystem(&prefix)?;
let resolved_prefix = config.prefix_to_filesystem(&prefix)?;

maybe_spawn_blocking(move || {
let walkdir = WalkDir::new(&resolved_prefix)
Expand All @@ -525,15 +584,11 @@ impl ObjectStore for LocalFileSystem {

for entry_res in walkdir.into_iter().map(convert_walkdir_result) {
if let Some(entry) = entry_res? {
if entry.file_type().is_file()
// Ignore file names with # in them, since they might be in-progress uploads.
// They would be rejected anyways by filesystem_to_path below.
&& entry.file_name().to_string_lossy().contains('#')
{
continue;
}
let is_directory = entry.file_type().is_dir();
let entry_location = config.filesystem_to_path(entry.path())?;
if !is_directory && !is_valid_file_path(&entry_location) {
continue;
}

let mut parts = match entry_location.prefix_match(&prefix) {
Some(parts) => parts,
Expand Down Expand Up @@ -1364,26 +1419,18 @@ mod tests {
assert!(result.common_prefixes.is_empty());
assert_eq!(result.objects[0].location, object);

let illegal = root.join("💀");
std::fs::write(illegal, "foo").unwrap();

// Can list directory that doesn't contain illegal path
flatten_list_stream(&integration, Some(&directory))
.await
.unwrap();
let emoji = root.join("💀");
std::fs::write(emoji, "foo").unwrap();

// Cannot list illegal file
let err = flatten_list_stream(&integration, None)
.await
.unwrap_err()
.to_string();
// Can list illegal file
let paths = flatten_list_stream(&integration, None).await.unwrap();

assert!(
err.contains(
"Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\""
),
"{}",
err
assert_eq!(
paths,
vec![
Path::parse("💀").unwrap(),
Path::parse("directory/child.txt").unwrap()
]
);
}

Expand Down Expand Up @@ -1442,6 +1489,51 @@ mod tests {
let path = Path::from_filesystem_path(".").unwrap();
integration.list_with_delimiter(Some(&path)).await.unwrap();
}

#[test]
fn test_valid_path() {
let cases = [
("foo#123/test.txt", true),
("foo#123/test#23.txt", true),
("foo#123/test#34", false),
("foo😁/test#34", false),
("foo/test#😁34", true),
];

for (case, expected) in cases {
let path = Path::parse(case).unwrap();
assert_eq!(is_valid_file_path(&path), expected);
}
}

#[tokio::test]
async fn test_intermediate_files() {
let root = TempDir::new().unwrap();
let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap();

let a = Path::parse("foo#123/test.txt").unwrap();
integration.put(&a, "test".into()).await.unwrap();

let list = flatten_list_stream(&integration, None).await.unwrap();
assert_eq!(list, vec![a.clone()]);

std::fs::write(root.path().join("bar#123"), "test").unwrap();

// Should ignore file
let list = flatten_list_stream(&integration, None).await.unwrap();
assert_eq!(list, vec![a.clone()]);

let b = Path::parse("bar#123").unwrap();
let err = integration.get(&b).await.unwrap_err().to_string();
assert_eq!(err, "Generic LocalFileSystem error: Filenames containing trailing '/#\\d+/' are not supported: bar#123");

let c = Path::parse("foo#123.txt").unwrap();
integration.put(&c, "test".into()).await.unwrap();

let mut list = flatten_list_stream(&integration, None).await.unwrap();
list.sort_unstable();
assert_eq!(list, vec![c, a]);
}
}

#[cfg(not(target_arch = "wasm32"))]
Expand Down
51 changes: 20 additions & 31 deletions object_store/src/path/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,15 @@ pub enum Error {

/// A parsed path representation that can be safely written to object storage
///
/// # Path Safety
/// A [`Path`] maintains the following invariants:
///
/// * Paths are delimited by `/`
/// * Paths do not contain leading or trailing `/`
/// * Paths do not contain relative path segments, i.e. `.` or `..`
/// * Paths do not contain empty path segments
/// * Paths do not contain any ASCII control characters
///
/// # Encode
///
/// In theory object stores support any UTF-8 character sequence, however, certain character
/// sequences cause compatibility problems with some applications and protocols. As such the
Expand All @@ -76,34 +84,16 @@ pub enum Error {
/// [GCS]: https://cloud.google.com/storage/docs/naming-objects
/// [Azure Blob Storage]: https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names
///
/// This presents libraries with two options for consistent path handling:
///
/// 1. Allow constructing unsafe paths, allowing for both reading and writing of data to paths
/// that may not be consistently understood or supported
/// 2. Disallow constructing unsafe paths, ensuring data written can be consistently handled by
/// all other systems, but preventing interaction with objects at unsafe paths
///
/// This library takes the second approach, in particular:
///
/// * Paths are delimited by `/`
/// * Paths do not start with a `/`
/// * Empty path segments are discarded (e.g. `//` is treated as though it were `/`)
/// * Relative path segments, i.e. `.` and `..` are percent encoded
/// * Unsafe characters are percent encoded, as described by [RFC 1738]
/// * All paths are relative to the root of the object store
///
/// In order to provide these guarantees there are two ways to safely construct a [`Path`]
///
/// # Encode
///
/// A string containing potentially illegal path segments can be encoded to a [`Path`]
/// using [`Path::from`] or [`Path::from_iter`].
/// A string containing potentially problematic path segments can therefore be encoded to a [`Path`]
/// using [`Path::from`] or [`Path::from_iter`]. This will percent encode any problematic
/// segments according to [RFC 1738].
///
/// ```
/// # use object_store::path::Path;
/// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar");
/// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar");
/// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar");
/// assert_eq!(Path::from("/").as_ref(), "");
/// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar");
/// ```
///
Expand All @@ -116,17 +106,16 @@ pub enum Error {
///
/// # Parse
///
/// Alternatively a [`Path`] can be created from an existing string, returning an
/// error if it is invalid. Unlike the encoding methods, this will permit
/// valid percent encoded sequences.
/// Alternatively a [`Path`] can be parsed from an existing string, returning an
/// error if it is invalid. Unlike the encoding methods above, this will permit
/// arbitrary unicode, including percent encoded sequences.
///
/// ```
/// # use object_store::path::Path;
///
/// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(), "foo/foo%2Fbar");
/// Path::parse("..").unwrap_err();
/// Path::parse("/foo//").unwrap_err();
/// Path::parse("😀").unwrap_err();
/// Path::parse("..").unwrap_err(); // Relative path segments are disallowed
/// Path::parse("/foo//").unwrap_err(); // Empty path segments are disallowed
/// Path::parse("\x00").unwrap_err(); // ASCII control characters are disallowed
/// ```
///
/// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt
Expand Down Expand Up @@ -236,7 +225,7 @@ impl Path {
pub fn filename(&self) -> Option<&str> {
match self.raw.is_empty() {
true => None,
false => self.raw.split(DELIMITER).last(),
false => self.raw.rsplit(DELIMITER).next(),
}
}

Expand Down
Loading

0 comments on commit 16ec475

Please sign in to comment.