From 8b0d9cc2436ed81df9ba1db8043bd821462909f6 Mon Sep 17 00:00:00 2001
From: hoslo
Date: Wed, 8 Jan 2025 21:28:12 +0800
Subject: [PATCH] feat(core): Implement list with deleted and versions for gcs

---
 core/src/services/gcs/backend.rs | 33 +++++++++----
 core/src/services/gcs/config.rs  |  2 +
 core/src/services/gcs/core.rs    | 84 ++++++++++++++++++++++++++++----
 core/src/services/gcs/delete.rs  |  8 +--
 core/src/services/gcs/lister.rs  | 45 ++++++++++++-----
 5 files changed, 136 insertions(+), 36 deletions(-)

diff --git a/core/src/services/gcs/backend.rs b/core/src/services/gcs/backend.rs
index c61ff75ca290..a4d5f217140a 100644
--- a/core/src/services/gcs/backend.rs
+++ b/core/src/services/gcs/backend.rs
@@ -192,6 +192,13 @@ impl GcsBuilder {
         self
     }
 
+    /// Set bucket versioning status for this backend
+    pub fn enable_versioning(mut self, enabled: bool) -> Self {
+        self.config.enable_versioning = enabled;
+
+        self
+    }
+
     /// Set the predefined acl for GCS.
     ///
     /// Available values are:
@@ -326,6 +333,7 @@ impl Builder for GcsBuilder {
                 predefined_acl: self.config.predefined_acl.clone(),
                 default_storage_class: self.config.default_storage_class.clone(),
                 allow_anonymous: self.config.allow_anonymous,
+                enable_versioning: self.config.enable_versioning,
             }),
         };
 
@@ -362,6 +370,7 @@ impl Access for GcsBackend {
                 stat_has_content_md5: true,
                 stat_has_content_length: true,
                 stat_has_content_type: true,
+                stat_with_version: self.core.enable_versioning,
                 stat_has_last_modified: true,
                 stat_has_user_metadata: true,
 
                 read: true,
 
                 read_with_if_match: true,
                 read_with_if_none_match: true,
+                read_with_version: self.core.enable_versioning,
 
                 write: true,
                 write_can_empty: true,
                 write_can_multi: true,
                 write_with_content_type: true,
                 write_with_user_metadata: true,
-                write_with_if_not_exists: true,
+                write_with_if_not_exists: !self.core.enable_versioning,
 
                 // The min multipart size of Gcs is 5 MiB.
                 //
@@ -392,6 +402,7 @@ impl Access for GcsBackend {
 
                 delete: true,
                 delete_max_size: Some(100),
+                delete_with_version: self.core.enable_versioning,
                 copy: true,
 
                 list: true,
@@ -403,6 +414,8 @@ impl Access for GcsBackend {
                 list_has_content_length: true,
                 list_has_content_type: true,
                 list_has_last_modified: true,
+                list_with_versions: self.core.enable_versioning,
+                list_with_deleted: self.core.enable_versioning,
 
                 presign: true,
                 presign_stat: true,
@@ -432,6 +445,7 @@ impl Access for GcsBackend {
 
         m.set_etag(&meta.etag);
         m.set_content_md5(&meta.md5_hash);
+        m.set_version(&meta.generation);
 
         let size = meta
             .size
@@ -485,15 +499,10 @@ impl Access for GcsBackend {
     }
 
     async fn list(&self, path: &str, args: OpList) -> Result<(RpList, Self::Lister)> {
-        let l = GcsLister::new(
-            self.core.clone(),
-            path,
-            args.recursive(),
-            args.limit(),
-            args.start_after(),
-        );
-
-        Ok((RpList::default(), oio::PageLister::new(l)))
+        Ok((
+            RpList::default(),
+            oio::PageLister::new(GcsLister::new(self.core.clone(), path, args)),
+        ))
     }
 
     async fn copy(&self, from: &str, to: &str, _: OpCopy) -> Result<RpCopy> {
@@ -554,6 +563,10 @@ struct GetObjectJsonResponse {
     ///
     /// For example: `"contentType": "image/png",`
     content_type: String,
+    /// Generation of this object.
+    ///
+    /// For example: `"generation": "1660563214863653"`
+    generation: String,
     /// Custom metadata of this object.
     ///
     /// For example: `"metadata" : { "my-key": "my-value" }`
diff --git a/core/src/services/gcs/config.rs b/core/src/services/gcs/config.rs
index 43ff15175942..fb12871dc0ee 100644
--- a/core/src/services/gcs/config.rs
+++ b/core/src/services/gcs/config.rs
@@ -53,6 +53,8 @@ pub struct GcsConfig {
     pub disable_vm_metadata: bool,
     /// Disable loading configuration from the environment.
     pub disable_config_load: bool,
+    /// Enable versioning for the bucket.
+    pub enable_versioning: bool,
     /// A Google Cloud OAuth2 token.
     ///
     /// Takes precedence over `credential` and `credential_path`.
diff --git a/core/src/services/gcs/core.rs b/core/src/services/gcs/core.rs
index 3e06bf03b575..655a5ce82364 100644
--- a/core/src/services/gcs/core.rs
+++ b/core/src/services/gcs/core.rs
@@ -51,6 +51,8 @@ pub mod constants {
     pub const X_GOOG_ACL: &str = "x-goog-acl";
     pub const X_GOOG_STORAGE_CLASS: &str = "x-goog-storage-class";
     pub const X_GOOG_META_PREFIX: &str = "x-goog-meta-";
+    pub const X_GOOG_IF_GENERATION_MATCH: &str = "x-goog-if-generation-match";
+    pub const GENERATION: &str = "generation";
 }
 
 pub struct GcsCore {
@@ -69,6 +71,7 @@ pub struct GcsCore {
     pub default_storage_class: Option<String>,
 
     pub allow_anonymous: bool,
+    pub enable_versioning: bool,
 }
 
 impl Debug for GcsCore {
@@ -184,13 +187,25 @@
     ) -> Result<Request<Buffer>> {
         let p = build_abs_path(&self.root, path);
 
-        let url = format!(
+        let mut url = format!(
             "{}/storage/v1/b/{}/o/{}?alt=media",
             self.endpoint,
             self.bucket,
             percent_encode_path(&p)
         );
 
+        let mut query_args = Vec::new();
+        if let Some(version) = args.version() {
+            query_args.push(format!(
+                "{}={}",
+                constants::GENERATION,
+                percent_decode_path(version)
+            ))
+        }
+        if !query_args.is_empty() {
+            url.push_str(&format!("&{}", query_args.join("&")));
+        }
+
         let mut req = Request::get(&url);
 
         if let Some(if_match) = args.if_match() {
@@ -216,6 +231,10 @@
 
         let mut req = Request::get(&url);
 
+        if let Some(version) = args.version() {
+            req = req.header(constants::X_GOOG_IF_GENERATION_MATCH, version);
+        }
+
         if let Some(if_match) = args.if_match() {
             req = req.header(IF_MATCH, if_match);
         }
@@ -363,13 +382,25 @@
     pub fn gcs_head_object_request(&self, path: &str, args: &OpStat) -> Result<Request<Buffer>> {
         let p = build_abs_path(&self.root, path);
 
-        let url = format!(
+        let mut url = format!(
            "{}/storage/v1/b/{}/o/{}",
            self.endpoint,
            self.bucket,
            percent_encode_path(&p)
        );
 
+        let mut query_args = Vec::new();
+        if let Some(version) = args.version() {
+            query_args.push(format!(
+                "{}={}",
+                constants::GENERATION,
+                percent_decode_path(version)
+            ))
+        }
+        if !query_args.is_empty() {
+            url.push_str(&format!("?{}", query_args.join("&")));
+        }
+
         let mut req = Request::get(&url);
 
         if let Some(if_none_match) = args.if_none_match() {
@@ -393,7 +424,19 @@ impl GcsCore {
     ) -> Result<Request<Buffer>> {
         let p = build_abs_path(&self.root, path);
 
-        let url = format!("{}/{}/{}", self.endpoint, self.bucket, p);
+        let mut url = format!("{}/{}/{}", self.endpoint, self.bucket, p);
+
+        let mut query_args = Vec::new();
+        if let Some(version) = args.version() {
+            query_args.push(format!(
+                "{}={}",
+                constants::GENERATION,
+                percent_decode_path(version)
+            ))
+        }
+        if !query_args.is_empty() {
+            url.push_str(&format!("?{}", query_args.join("&")));
+        }
 
         let mut req = Request::head(&url);
 
@@ -422,35 +465,50 @@ impl GcsCore {
         self.send(req).await
     }
 
-    pub async fn gcs_delete_object(&self, path: &str) -> Result<Response<Buffer>> {
-        let mut req = self.gcs_delete_object_request(path)?;
+    pub async fn gcs_delete_object(&self, path: &str, args: OpDelete) -> Result<Response<Buffer>> {
+        let mut req = self.gcs_delete_object_request(path, args)?;
 
         self.sign(&mut req).await?;
         self.send(req).await
     }
 
-    pub fn gcs_delete_object_request(&self, path: &str) -> Result<Request<Buffer>> {
+    pub fn gcs_delete_object_request(&self, path: &str, args: OpDelete) -> Result<Request<Buffer>> {
         let p = build_abs_path(&self.root, path);
 
-        let url = format!(
+        let mut url = format!(
             "{}/storage/v1/b/{}/o/{}",
             self.endpoint,
             self.bucket,
             percent_encode_path(&p)
         );
 
+        let mut query_args = Vec::new();
+        if let Some(version) = args.version() {
+            query_args.push(format!(
+                "{}={}",
+                constants::GENERATION,
+                percent_decode_path(version)
+            ))
+        }
+        if !query_args.is_empty() {
+            url.push_str(&format!("?{}", query_args.join("&")));
+        }
+
         Request::delete(&url)
             .body(Buffer::new())
             .map_err(new_request_build_error)
     }
 
-    pub async fn gcs_delete_objects(&self, paths: Vec<String>) -> Result<Response<Buffer>> {
+    pub async fn gcs_delete_objects(
+        &self,
+        batch: Vec<(String, OpDelete)>,
+    ) -> Result<Response<Buffer>> {
         let uri = format!("{}/batch/storage/v1", self.endpoint);
 
         let mut multipart = Multipart::new();
 
-        for (idx, path) in paths.iter().enumerate() {
-            let req = self.gcs_delete_object_request(path)?;
+        for (idx, (path, args)) in batch.iter().enumerate() {
+            let req = self.gcs_delete_object_request(path, args.clone())?;
 
             multipart = multipart.part(
                 MixedPart::from_request(req).part_header("content-id".parse().unwrap(), idx.into()),
@@ -493,6 +551,7 @@
         delimiter: &str,
         limit: Option<usize>,
         start_after: Option<String>,
+        versions: bool,
     ) -> Result<Response<Buffer>> {
         let p = build_abs_path(&self.root, path);
@@ -502,6 +561,9 @@
             self.bucket,
             percent_encode_path(&p)
         );
+        if versions {
+            write!(url, "&versions=true").expect("write into string must succeed");
+        }
         if !delimiter.is_empty() {
             write!(url, "&delimiter={delimiter}").expect("write into string must succeed");
         }
@@ -681,6 +743,8 @@ pub struct ListResponseItem {
     pub md5_hash: String,
     pub updated: String,
     pub content_type: String,
+    pub time_deleted: Option<String>,
+    pub generation: String,
 }
 
 /// Result of CreateMultipartUpload
diff --git a/core/src/services/gcs/delete.rs b/core/src/services/gcs/delete.rs
index 241b6152edc3..03968fd36228 100644
--- a/core/src/services/gcs/delete.rs
+++ b/core/src/services/gcs/delete.rs
@@ -34,8 +34,8 @@ impl GcsDeleter {
 }
 
 impl oio::BatchDelete for GcsDeleter {
-    async fn delete_once(&self, path: String, _: OpDelete) -> Result<()> {
-        let resp = self.core.gcs_delete_object(&path).await?;
+    async fn delete_once(&self, path: String, args: OpDelete) -> Result<()> {
+        let resp = self.core.gcs_delete_object(&path, args).await?;
 
         // deleting not existing objects is ok
         if resp.status().is_success() || resp.status() == StatusCode::NOT_FOUND {
@@ -46,8 +46,8 @@ impl oio::BatchDelete for GcsDeleter {
     }
 
     async fn delete_batch(&self, batch: Vec<(String, OpDelete)>) -> Result<BatchDeleteResult> {
-        let paths: Vec<String> = batch.into_iter().map(|(p, _)| p).collect();
-        let resp = self.core.gcs_delete_objects(paths.clone()).await?;
+        let paths: Vec<String> = batch.clone().into_iter().map(|(p, _)| p).collect();
+        let resp = self.core.gcs_delete_objects(batch).await?;
 
         let status = resp.status();
diff --git a/core/src/services/gcs/lister.rs b/core/src/services/gcs/lister.rs
index cd66e964f77b..6236632a234b 100644
--- a/core/src/services/gcs/lister.rs
+++ b/core/src/services/gcs/lister.rs
@@ -33,6 +33,7 @@ pub struct GcsLister {
     path: String,
     delimiter: &'static str,
     limit: Option<usize>,
+    args: OpList,
 
     /// Filter results to objects whose names are lexicographically
     /// **equal to or after** startOffset
@@ -41,21 +42,17 @@ impl GcsLister {
     /// Generate a new directory walker
-    pub fn new(
-        core: Arc<GcsCore>,
-        path: &str,
-        recursive: bool,
-        limit: Option<usize>,
-        start_after: Option<&str>,
-    ) -> Self {
-        let delimiter = if recursive { "" } else { "/" };
+    pub fn new(core: Arc<GcsCore>, path: &str, args: OpList) -> Self {
+        let delimiter = if args.recursive() { "" } else { "/" };
+        let start_after = args.start_after().map(String::from);
         Self {
             core,
             path: path.to_string(),
             delimiter,
-            limit,
-            start_after: start_after.map(String::from),
+            limit: args.limit(),
+            args,
+            start_after,
         }
     }
 }
@@ -74,6 +71,7 @@
             } else {
                 None
             },
+            self.args.versions() || self.args.deleted(),
         )
         .await?;
@@ -100,7 +98,7 @@
             ctx.entries.push_back(de);
         }
 
-        for object in output.items {
+        for (index, object) in output.items.iter().enumerate() {
             // exclude the inclusive start_after itself
             let mut path = build_rel_path(&self.core.root, &object.name);
             if path.is_empty() {
@@ -115,6 +113,7 @@
             // set metadata fields
             meta.set_content_md5(object.md5_hash.as_str());
             meta.set_etag(object.etag.as_str());
+            meta.set_version(object.generation.as_str());
 
             let size = object.size.parse().map_err(|e| {
                 Error::new(ErrorKind::Unexpected, "parse u64 from list response").set_source(e)
@@ -126,9 +125,31 @@
 
             meta.set_last_modified(parse_datetime_from_rfc3339(object.updated.as_str())?);
 
+            let (mut is_latest, mut is_deleted) = (false, false);
+            // ref: https://cloud.google.com/storage/docs/json_api/v1/objects/list
+            // if versions is true, lists all versions of an object as distinct results in order of increasing generation number.
+            // so we need to check if the next item is not the same object, and the object is not deleted.
+            // then it is the current version.
+            if (index == output.items.len() - 1 || output.items[index + 1].name != object.name)
+                && object.time_deleted.is_none()
+            {
+                meta.set_is_current(true);
+                is_latest = true;
+            }
+            if object.time_deleted.is_some() {
+                meta.set_is_deleted(true);
+                is_deleted = true;
+            }
+
             let de = oio::Entry::with(path, meta);
 
-            ctx.entries.push_back(de);
+            // if deleted is true, we need to include all deleted versions of an object.
+            //
+            // if versions is true, we need to include all versions of an object.
+            // if versions is false, we only include the latest version of an object.
+            if (self.args.deleted() && is_deleted) || (self.args.versions() || is_latest) {
+                ctx.entries.push_back(de);
+            }
         }
 
         Ok(())
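
A rough usage sketch for review context only, not part of the patch: assuming the public Operator::list_with API forwards the versions/deleted options wired up above, and that Metadata exposes version(), is_current() and is_deleted() getters matching the setters used in the lister, enabling versioning and listing object generations could look roughly like this. The bucket name is a placeholder.

// Hypothetical example, not included in this patch: exercising the new
// GCS versioning capabilities through the public Operator API.
use opendal::services::Gcs;
use opendal::{Operator, Result};

async fn list_all_generations() -> Result<()> {
    // `enable_versioning` is the builder flag added by this patch; the bucket
    // itself must also have object versioning turned on in GCS.
    let builder = Gcs::default()
        .bucket("my-versioned-bucket") // placeholder bucket name
        .enable_versioning(true);
    let op = Operator::new(builder)?.finish();

    // Ask for every generation under `dir/`, including delete markers.
    let entries = op.list_with("dir/").versions(true).deleted(true).await?;

    for entry in entries {
        let meta = entry.metadata();
        // `version()` carries the GCS generation set by the lister above;
        // `is_current()` / `is_deleted()` mirror set_is_current / set_is_deleted.
        println!(
            "{} generation={:?} current={:?} deleted={:?}",
            entry.path(),
            meta.version(),
            meta.is_current(),
            meta.is_deleted(),
        );
    }
    Ok(())
}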