From ff8699e042981adf41a83894df3823bc44daeb97 Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Thu, 28 Sep 2023 09:49:55 -0700 Subject: [PATCH] move prepared files to ~/.cache dir --- Cargo.lock | 28 ++++++++++++++ Cargo.toml | 1 + crates/sparrow-runtime/Cargo.toml | 1 + .../sparrow-runtime/src/prepare/preparer.rs | 37 +++++++++++-------- python/Cargo.lock | 28 ++++++++++++++ 5 files changed, 80 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 98b1b48c2..2025bb210 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1432,6 +1432,15 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys", +] + [[package]] name = "dirs-next" version = "2.0.0" @@ -1442,6 +1451,18 @@ dependencies = [ "dirs-sys-next", ] +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + [[package]] name = "dirs-sys-next" version = "0.1.2" @@ -2999,6 +3020,12 @@ dependencies = [ "tokio-stream", ] +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "ordered-float" version = "2.10.0" @@ -4739,6 +4766,7 @@ dependencies = [ "dashmap", "data-encoding", "derive_more", + "dirs", "enum-map", "erased-serde", "error-stack", diff --git a/Cargo.toml b/Cargo.toml index baed629a3..d88f0b7ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ dashmap = "5.4.0" data-encoding = "2.3.3" decorum = "0.3.1" derive_more = "0.99.17" +dirs = "5.0.1" edit-distance = "2.1.0" egg = "0.9.3" enum-as-inner = 
"0.6.0" diff --git a/crates/sparrow-runtime/Cargo.toml b/crates/sparrow-runtime/Cargo.toml index fee2c4a66..ac76d1dbf 100644 --- a/crates/sparrow-runtime/Cargo.toml +++ b/crates/sparrow-runtime/Cargo.toml @@ -34,6 +34,7 @@ clap.workspace = true dashmap.workspace = true data-encoding.workspace = true derive_more.workspace = true +dirs.workspace = true enum-map.workspace = true erased-serde.workspace = true error-stack.workspace = true diff --git a/crates/sparrow-runtime/src/prepare/preparer.rs b/crates/sparrow-runtime/src/prepare/preparer.rs index c9421783a..da58c0d96 100644 --- a/crates/sparrow-runtime/src/prepare/preparer.rs +++ b/crates/sparrow-runtime/src/prepare/preparer.rs @@ -18,6 +18,9 @@ use crate::PreparedMetadata; use super::{prepared_batches, write_parquet}; +const KASKADA_PATH: &str = ".cache/kaskada"; +const PREPARED_FILE_PREFIX: &str = "part"; + #[derive(derive_more::Display, Debug)] pub enum Error { #[display(fmt = "batch missing required column '{_0}'")] @@ -93,20 +96,9 @@ impl Preparer { ) -> error_stack::Result, Error> { // TODO: Support Slicing - // Prepared files are stored in the following format: - // file:////tables//prepared//part-.parquet - let cur_dir = std::env::current_dir().expect("current dir"); - let cur_dir = cur_dir.to_string_lossy(); - - let uuid = Uuid::new_v4(); - let output_path_prefix = format!( - "file:///{}/tables/{}/prepare/{uuid}/", - cur_dir, self.table_config.uuid - ); - let output_file_prefix = "part"; - + let output_path_prefix = self.prepared_output_prefix()?; let output_url = ObjectStoreUrl::from_str(&output_path_prefix) - .change_context_lazy(|| Error::InvalidUrl(path.to_string_lossy().to_string()))?; + .change_context_lazy(|| Error::InvalidUrl(output_path_prefix))?; let object_store = self .object_stores @@ -140,10 +132,10 @@ impl Preparer { let (data, metadata) = next.change_context(Error::Internal)?; let data_url = output_url - .join(&format!("{output_file_prefix}-{n}.parquet")) + 
.join(&format!("{PREPARED_FILE_PREFIX}-{n}.parquet")) .change_context(Error::Internal)?; let metadata_url = output_url - .join(&format!("{output_file_prefix}-{n}-metadata.parquet")) + .join(&format!("{PREPARED_FILE_PREFIX}-{n}-metadata.parquet")) .change_context(Error::Internal)?; // Create the prepared file via PreparedMetadata. @@ -185,6 +177,21 @@ impl Preparer { self.time_multiplier.as_ref(), ) } + // Prepared files are stored in the following format: + // file:///{home_dir}/{KASKADA_PATH}/tables/{table_uuid}/prepare/{uuid}/part-{n}.parquet + pub fn prepared_output_prefix(&self) -> error_stack::Result { + let uuid = Uuid::new_v4(); + let home_dir = dirs::home_dir(); + if let Some(home_dir) = home_dir.map(|p| p.display().to_string()) { + Ok(format!( + "file:///{}/{}/tables/{}/prepare/{uuid}/", + home_dir, KASKADA_PATH, self.table_config.uuid + )) + } else { + tracing::error!("Failed to get home directory"); + error_stack::bail!(Error::Internal) + } + } } pub fn prepare_batch( diff --git a/python/Cargo.lock b/python/Cargo.lock index 1a17ce41a..1ae66e179 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1224,6 +1224,15 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys", +] + [[package]] name = "dirs-next" version = "2.0.0" @@ -1234,6 +1243,18 @@ dependencies = [ "dirs-sys-next", ] +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys", +] + [[package]] name = "dirs-sys-next" version = "0.1.2" @@ -2560,6 +2581,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "ordered-float" version = "2.10.0" @@ -3924,6 +3951,7 @@ dependencies = [ "dashmap", "data-encoding", "derive_more", + "dirs", "enum-map", "erased-serde", "error-stack",