-
Notifications
You must be signed in to change notification settings - Fork 839
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Relax path safety (#5019) #5020
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -144,6 +144,11 @@ pub(crate) enum Error { | |
path: PathBuf, | ||
source: io::Error, | ||
}, | ||
|
||
#[snafu(display("Filenames containing trailing '/#\\d+/' are not supported: {}", path))] | ||
InvalidPath { | ||
path: String, | ||
}, | ||
} | ||
|
||
impl From<Error> for super::Error { | ||
|
@@ -176,6 +181,30 @@ impl From<Error> for super::Error { | |
/// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme | ||
/// [RFC 3986]: https://www.rfc-editor.org/rfc/rfc3986 | ||
/// | ||
/// # Path Semantics | ||
/// | ||
/// [`LocalFileSystem`] will expose the path semantics of the underlying filesystem, which may | ||
/// have additional restrictions beyond those enforced by [`Path`]. | ||
/// | ||
/// For example: | ||
/// | ||
/// * Windows forbids certain filenames, e.g. `COM0`, | ||
/// * Windows forbids folders with trailing `.` | ||
/// * Windows forbids certain ASCII characters, e.g. `<` or `|` | ||
/// * OS X forbids filenames containing `:` | ||
/// * Leading `-` are discouraged on Unix systems where they may be interpreted as CLI flags | ||
/// * Filesystems may have restrictions on the maximum path or path segment length | ||
/// * Filesystem support for non-ASCII characters is inconsistent | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW: what we could do for FS (although that wouldn't be backwards compatible) is to percent-encode the path segments. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, the challenge is knowing ahead of time what character are permitted. We don't want to be in something similar to the current situation because we assume that people won't create paths with weird characters |
||
/// | ||
/// Additionally some filesystems, such as NTFS, are case-insensitive, whilst others like | ||
/// FAT don't preserve case at all. Further some filesystems support non-unicode character | ||
/// sequences, such as unpaired UTF-16 surrogates, [`LocalFileSystem`] will error on | ||
/// encountering such sequences. | ||
/// | ||
/// Finally, filenames matching the regex `/.*#\d+/`, e.g. `foo.parquet#123`, are not supported | ||
/// by [`LocalFileSystem`] as they are used to provide atomic writes. Such files will be ignored | ||
/// for listing operations, and attempting to address such a file will yield an error. | ||
/// | ||
/// # Tokio Compatibility | ||
/// | ||
/// Tokio discourages performing blocking IO on a tokio worker thread, however, | ||
|
@@ -196,6 +225,11 @@ impl From<Error> for super::Error { | |
/// * Mutating a file through one or more symlinks will mutate the underlying file | ||
/// * Deleting a path that resolves to a symlink will only delete the symlink | ||
/// | ||
/// # Cross-Filesystem Copy | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not a change in this PR, but is something I felt worth documenting whilst I remembered |
||
/// | ||
/// [`LocalFileSystem::copy`] is implemented using [`std::fs::hard_link`], and therefore | ||
/// does not support copying across filesystem boundaries. | ||
/// | ||
#[derive(Debug)] | ||
pub struct LocalFileSystem { | ||
config: Arc<Config>, | ||
|
@@ -246,8 +280,19 @@ impl LocalFileSystem { | |
} | ||
|
||
impl Config { | ||
/// Return an absolute filesystem path of the given location | ||
/// Return an absolute filesystem path of the given file location | ||
fn path_to_filesystem(&self, location: &Path) -> Result<PathBuf> { | ||
ensure!( | ||
is_valid_file_path(location), | ||
InvalidPathSnafu { | ||
path: location.as_ref() | ||
} | ||
); | ||
self.prefix_to_filesystem(location) | ||
} | ||
|
||
/// Return an absolute filesystem path of the given location | ||
fn prefix_to_filesystem(&self, location: &Path) -> Result<PathBuf> { | ||
let mut url = self.root.clone(); | ||
url.path_segments_mut() | ||
.expect("url path") | ||
|
@@ -269,6 +314,19 @@ impl Config { | |
} | ||
} | ||
|
||
fn is_valid_file_path(path: &Path) -> bool { | ||
match path.filename() { | ||
Some(p) => match p.split_once('#') { | ||
Some((_, suffix)) if !suffix.is_empty() => { | ||
// Valid if contains non-digits | ||
!suffix.as_bytes().iter().all(|x| x.is_ascii_digit()) | ||
} | ||
_ => true, | ||
}, | ||
None => false, | ||
} | ||
} | ||
|
||
#[async_trait] | ||
impl ObjectStore for LocalFileSystem { | ||
async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result<PutResult> { | ||
|
@@ -445,7 +503,7 @@ impl ObjectStore for LocalFileSystem { | |
let config = Arc::clone(&self.config); | ||
|
||
let root_path = match prefix { | ||
Some(prefix) => match config.path_to_filesystem(prefix) { | ||
Some(prefix) => match config.prefix_to_filesystem(prefix) { | ||
Ok(path) => path, | ||
Err(e) => return futures::future::ready(Err(e)).into_stream().boxed(), | ||
}, | ||
|
@@ -458,20 +516,21 @@ impl ObjectStore for LocalFileSystem { | |
.follow_links(true); | ||
|
||
let s = walkdir.into_iter().flat_map(move |result_dir_entry| { | ||
match convert_walkdir_result(result_dir_entry) { | ||
let entry = match convert_walkdir_result(result_dir_entry).transpose()? { | ||
Ok(entry) => entry, | ||
Err(e) => return Some(Err(e)), | ||
}; | ||
|
||
if !entry.path().is_file() { | ||
return None; | ||
} | ||
|
||
match config.filesystem_to_path(entry.path()) { | ||
Ok(path) => match is_valid_file_path(&path) { | ||
true => Some(convert_entry(entry, path)), | ||
false => None, | ||
}, | ||
Err(e) => Some(Err(e)), | ||
Ok(None) => None, | ||
Ok(entry @ Some(_)) => entry | ||
.filter(|dir_entry| { | ||
dir_entry.file_type().is_file() | ||
// Ignore file names with # in them, since they might be in-progress uploads. | ||
// They would be rejected anyways by filesystem_to_path below. | ||
&& !dir_entry.file_name().to_string_lossy().contains('#') | ||
}) | ||
.map(|entry| { | ||
let location = config.filesystem_to_path(entry.path())?; | ||
convert_entry(entry, location) | ||
}), | ||
} | ||
}); | ||
|
||
|
@@ -512,7 +571,7 @@ impl ObjectStore for LocalFileSystem { | |
let config = Arc::clone(&self.config); | ||
|
||
let prefix = prefix.cloned().unwrap_or_default(); | ||
let resolved_prefix = config.path_to_filesystem(&prefix)?; | ||
let resolved_prefix = config.prefix_to_filesystem(&prefix)?; | ||
|
||
maybe_spawn_blocking(move || { | ||
let walkdir = WalkDir::new(&resolved_prefix) | ||
|
@@ -525,15 +584,11 @@ impl ObjectStore for LocalFileSystem { | |
|
||
for entry_res in walkdir.into_iter().map(convert_walkdir_result) { | ||
if let Some(entry) = entry_res? { | ||
if entry.file_type().is_file() | ||
// Ignore file names with # in them, since they might be in-progress uploads. | ||
// They would be rejected anyways by filesystem_to_path below. | ||
&& entry.file_name().to_string_lossy().contains('#') | ||
{ | ||
continue; | ||
} | ||
let is_directory = entry.file_type().is_dir(); | ||
let entry_location = config.filesystem_to_path(entry.path())?; | ||
if !is_directory && !is_valid_file_path(&entry_location) { | ||
continue; | ||
} | ||
|
||
let mut parts = match entry_location.prefix_match(&prefix) { | ||
Some(parts) => parts, | ||
|
@@ -1364,26 +1419,18 @@ mod tests { | |
assert!(result.common_prefixes.is_empty()); | ||
assert_eq!(result.objects[0].location, object); | ||
|
||
let illegal = root.join("💀"); | ||
std::fs::write(illegal, "foo").unwrap(); | ||
|
||
// Can list directory that doesn't contain illegal path | ||
flatten_list_stream(&integration, Some(&directory)) | ||
.await | ||
.unwrap(); | ||
let emoji = root.join("💀"); | ||
std::fs::write(emoji, "foo").unwrap(); | ||
|
||
// Cannot list illegal file | ||
let err = flatten_list_stream(&integration, None) | ||
.await | ||
.unwrap_err() | ||
.to_string(); | ||
// Can list illegal file | ||
let paths = flatten_list_stream(&integration, None).await.unwrap(); | ||
|
||
assert!( | ||
err.contains( | ||
"Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\"" | ||
), | ||
"{}", | ||
err | ||
assert_eq!( | ||
paths, | ||
vec![ | ||
Path::parse("💀").unwrap(), | ||
Path::parse("directory/child.txt").unwrap() | ||
] | ||
); | ||
} | ||
|
||
|
@@ -1442,6 +1489,51 @@ mod tests { | |
let path = Path::from_filesystem_path(".").unwrap(); | ||
integration.list_with_delimiter(Some(&path)).await.unwrap(); | ||
} | ||
|
||
#[test] | ||
fn test_valid_path() { | ||
let cases = [ | ||
("foo#123/test.txt", true), | ||
("foo#123/test#23.txt", true), | ||
("foo#123/test#34", false), | ||
("foo😁/test#34", false), | ||
("foo/test#😁34", true), | ||
]; | ||
|
||
for (case, expected) in cases { | ||
let path = Path::parse(case).unwrap(); | ||
assert_eq!(is_valid_file_path(&path), expected); | ||
} | ||
} | ||
|
||
#[tokio::test] | ||
async fn test_intermediate_files() { | ||
let root = TempDir::new().unwrap(); | ||
let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); | ||
|
||
let a = Path::parse("foo#123/test.txt").unwrap(); | ||
integration.put(&a, "test".into()).await.unwrap(); | ||
|
||
let list = flatten_list_stream(&integration, None).await.unwrap(); | ||
assert_eq!(list, vec![a.clone()]); | ||
|
||
std::fs::write(root.path().join("bar#123"), "test").unwrap(); | ||
|
||
// Should ignore file | ||
let list = flatten_list_stream(&integration, None).await.unwrap(); | ||
assert_eq!(list, vec![a.clone()]); | ||
|
||
let b = Path::parse("bar#123").unwrap(); | ||
let err = integration.get(&b).await.unwrap_err().to_string(); | ||
assert_eq!(err, "Generic LocalFileSystem error: Filenames containing trailing '/#\\d+/' are not supported: bar#123"); | ||
|
||
let c = Path::parse("foo#123.txt").unwrap(); | ||
integration.put(&c, "test".into()).await.unwrap(); | ||
|
||
let mut list = flatten_list_stream(&integration, None).await.unwrap(); | ||
list.sort_unstable(); | ||
assert_eq!(list, vec![c, a]); | ||
} | ||
} | ||
|
||
#[cfg(not(target_arch = "wasm32"))] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is actually a real issue I've ran into in a past project.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍 I will add this to the docs on Path proper