From dcd6af0b32eb914120f9861a4d9d74e292328e28 Mon Sep 17 00:00:00 2001
From: Marko Malenic
Date: Wed, 25 Sep 2024 14:40:34 +1000
Subject: [PATCH 1/8] feat(filemanager): move_id column and get object tagging
 logic

---
 .../database/migrations/0002_s3_move_id.sql   |   1 +
 .../filemanager/src/clients/aws/s3.rs         |  33 ++
 .../src/database/entities/s3_object.rs        |   1 +
 .../stacks/filemanager/filemanager/src/env.rs |  57 ++---
 .../filemanager/filemanager/src/error.rs      |   4 +-
 .../filemanager/src/events/aws/collecter.rs   | 157 ++++++++++++++++--
 .../filemanager/src/events/aws/inventory.rs   |  33 ++--
 .../filemanager/src/events/aws/mod.rs         |   8 +
 .../filemanager/src/handlers/aws.rs           |   6 +-
 .../filemanager/src/queries/mod.rs            |   1 +
 10 files changed, 235 insertions(+), 66 deletions(-)
 create mode 100644 lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_move_id.sql

diff --git a/lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_move_id.sql b/lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_move_id.sql
new file mode 100644
index 000000000..5c982b3d0
--- /dev/null
+++ b/lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_move_id.sql
@@ -0,0 +1 @@
+alter table s3_object add column move_id uuid;
\ No newline at end of file
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/clients/aws/s3.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/clients/aws/s3.rs
index 30a26b833..6e1ff8f05 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/clients/aws/s3.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/clients/aws/s3.rs
@@ -6,10 +6,13 @@ use std::result;
 use aws_sdk_s3 as s3;
 use aws_sdk_s3::error::SdkError;
 use aws_sdk_s3::operation::get_object::{GetObjectError, GetObjectOutput};
+use aws_sdk_s3::operation::get_object_tagging::{GetObjectTaggingError, GetObjectTaggingOutput};
 use aws_sdk_s3::operation::head_object::{HeadObjectError, HeadObjectOutput};
 use aws_sdk_s3::operation::list_buckets::{ListBucketsError, ListBucketsOutput};
+use aws_sdk_s3::operation::put_object_tagging::{PutObjectTaggingError, PutObjectTaggingOutput};
 use aws_sdk_s3::presigning::{PresignedRequest, PresigningConfig};
 use aws_sdk_s3::types::ChecksumMode::Enabled;
+use aws_sdk_s3::types::Tagging;
 use chrono::Duration;
 use mockall::automock;
 
@@ -70,6 +73,36 @@ impl Client {
             .await
     }
 
+    /// Execute the `GetObjectTagging` operation.
+    pub async fn get_object_tagging(
+        &self,
+        key: &str,
+        bucket: &str,
+    ) -> Result<GetObjectTaggingOutput, GetObjectTaggingError> {
+        self.inner
+            .get_object_tagging()
+            .key(key)
+            .bucket(bucket)
+            .send()
+            .await
+    }
+
+    /// Execute the `PutObjectTagging` operation.
+    pub async fn put_object_tagging(
+        &self,
+        key: &str,
+        bucket: &str,
+        tagging: Tagging,
+    ) -> Result<PutObjectTaggingOutput, PutObjectTaggingError> {
+        self.inner
+            .put_object_tagging()
+            .key(key)
+            .bucket(bucket)
+            .tagging(tagging)
+            .send()
+            .await
+    }
+
     /// Execute the `GetObject` operation and generate a presigned url for the object.
pub async fn presign_url( &self, diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/entities/s3_object.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/entities/s3_object.rs index a914a730a..d3a9865f9 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/entities/s3_object.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/entities/s3_object.rs @@ -37,6 +37,7 @@ pub struct Model { #[sea_orm(column_type = "Text", nullable)] pub deleted_sequencer: Option, pub number_reordered: i64, + pub move_id: Option, } #[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] pub enum Relation {} diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/env.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/env.rs index 9e1591943..6223ec301 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/env.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/env.rs @@ -16,6 +16,7 @@ use crate::error::Result; /// Configuration environment variables for filemanager. #[serde_as] #[derive(Debug, Clone, Deserialize, Eq, PartialEq)] +#[serde(default)] pub struct Config { pub(crate) database_url: Option, pub(crate) pgpassword: Option, @@ -24,8 +25,12 @@ pub struct Config { pub(crate) pguser: Option, #[serde(rename = "filemanager_sqs_url")] pub(crate) sqs_url: Option, - #[serde(default, rename = "filemanager_paired_ingest_mode")] + #[serde(rename = "filemanager_paired_ingest_mode")] pub(crate) paired_ingest_mode: bool, + #[serde(rename = "filemanager_ingester_track_moves")] + pub(crate) ingester_track_moves: bool, + #[serde(rename = "filemanager_ingester_tag_name")] + pub(crate) ingester_tag_name: String, #[serde(default, rename = "filemanager_api_links_url")] pub(crate) api_links_url: Option, #[serde(rename = "filemanager_api_presign_limit")] @@ -35,15 +40,9 @@ pub struct Config { pub(crate) api_presign_expiry: Option, #[serde(rename = "filemanager_api_cors_allow_origins")] pub(crate) api_cors_allow_origins: Option>, - #[serde( - default = "default_allow_methods", - rename = "filemanager_api_cors_allow_methods" - )] + #[serde(rename = "filemanager_api_cors_allow_methods")] pub(crate) api_cors_allow_methods: Vec, - #[serde( - default = "default_allow_headers", - rename = "filemanager_api_cors_allow_headers" - )] + #[serde(rename = "filemanager_api_cors_allow_headers")] pub(crate) api_cors_allow_headers: Vec, } @@ -57,30 +56,24 @@ impl Default for Config { pguser: None, sqs_url: None, paired_ingest_mode: false, + ingester_track_moves: true, + ingester_tag_name: "filemanager_id".to_string(), api_links_url: None, api_presign_limit: None, api_presign_expiry: None, api_cors_allow_origins: None, - api_cors_allow_methods: default_allow_methods(), - api_cors_allow_headers: default_allow_headers(), + api_cors_allow_methods: vec![ + Method::GET.to_string(), + Method::HEAD.to_string(), + Method::OPTIONS.to_string(), + Method::POST.to_string(), + Method::PATCH.to_string(), + ], + api_cors_allow_headers: vec![AUTHORIZATION.to_string()], } } } -fn default_allow_methods() -> Vec { - vec![ - Method::GET.to_string(), - Method::HEAD.to_string(), - Method::OPTIONS.to_string(), - Method::POST.to_string(), - Method::PATCH.to_string(), - ] -} - -fn default_allow_headers() -> Vec { - vec![AUTHORIZATION.to_string()] -} - impl Config { /// Load environment variables into a `Config` struct. 
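
The two new knobs above default to move tracking enabled with a `filemanager_id` tag name. As a sketch of how they surface at runtime, mirroring the test further down (the `set_var` wiring here is illustrative, not part of the patch; module paths are assumed from this repository):

```rust
use filemanager::env::Config;

// Illustrative only: `Config::load` deserializes `FILEMANAGER_*` variables
// from the environment, so the ingester knobs can be overridden like this.
fn example() -> filemanager::error::Result<()> {
    std::env::set_var("FILEMANAGER_INGESTER_TRACK_MOVES", "false");
    std::env::set_var("FILEMANAGER_INGESTER_TAG_NAME", "tag");

    let config = Config::load()?;
    assert!(!config.ingester_track_moves());
    assert_eq!(config.ingester_tag_name(), "tag");

    Ok(())
}
```
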
    pub fn load() -> Result<Self> {
@@ -133,6 +126,16 @@ impl Config {
         self.paired_ingest_mode
     }
 
+    /// Whether the ingester should track moves.
+    pub fn ingester_track_moves(&self) -> bool {
+        self.ingester_track_moves
+    }
+
+    /// Get the ingester tag name.
+    pub fn ingester_tag_name(&self) -> &str {
+        &self.ingester_tag_name
+    }
+
     /// Get the API links URL.
     pub fn api_links_url(&self) -> Option<&Url> {
         self.api_links_url.as_ref()
     }
@@ -192,6 +195,8 @@ mod tests {
             ("PGUSER", "user"),
             ("FILEMANAGER_SQS_URL", "url"),
             ("FILEMANAGER_PAIRED_INGEST_MODE", "true"),
+            ("FILEMANAGER_INGESTER_TRACK_MOVES", "false"),
+            ("FILEMANAGER_INGESTER_TAG_NAME", "tag"),
             ("FILEMANAGER_API_LINKS_URL", "https://localhost:8000"),
             ("FILEMANAGER_API_PRESIGN_LIMIT", "123"),
             ("FILEMANAGER_API_PRESIGN_EXPIRY", "60"),
@@ -217,6 +222,8 @@ mod tests {
                 pguser: Some("user".to_string()),
                 sqs_url: Some("url".to_string()),
                 paired_ingest_mode: true,
+                ingester_track_moves: false,
+                ingester_tag_name: "tag".to_string(),
                 api_links_url: Some("https://localhost:8000".parse().unwrap()),
                 api_presign_limit: Some(123),
                 api_presign_expiry: Some(Duration::seconds(60)),
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/error.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/error.rs
index 4a1f47082..73ae9942c 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/error.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/error.rs
@@ -26,8 +26,8 @@ pub enum Error {
     ConfigError(String),
     #[error("credential generator error: `{0}`")]
     CredentialGeneratorError(String),
-    #[error("S3 inventory error: `{0}`")]
-    S3InventoryError(String),
+    #[error("S3 error: `{0}`")]
+    S3Error(String),
     #[error("{0}")]
     IoError(#[from] io::Error),
     #[error("numerical operation overflowed")]
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs
index ce560e57b..66f923534 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs
@@ -2,9 +2,12 @@
 //!
 
 use async_trait::async_trait;
+use aws_sdk_s3::error::BuildError;
+use aws_sdk_s3::operation::get_object_tagging::GetObjectTaggingOutput;
 use aws_sdk_s3::operation::head_object::HeadObjectOutput;
 use aws_sdk_s3::primitives;
 use aws_sdk_s3::types::StorageClass::Standard;
+use aws_sdk_s3::types::{Tag, Tagging};
 use chrono::{DateTime, Utc};
 use futures::future::join_all;
 use futures::TryFutureExt;
@@ -18,12 +21,13 @@ use crate::clients::aws::s3::Client as S3Client;
 #[double]
 use crate::clients::aws::sqs::Client as SQSClient;
 use crate::env::Config;
-use crate::error::Error::{SQSError, SerdeError};
-use crate::error::Result;
+use crate::error::Error::{S3Error, SQSError, SerdeError};
+use crate::error::{Error, Result};
 use crate::events::aws::{
     EventType, FlatS3EventMessage, FlatS3EventMessages, StorageClass, TransposedS3EventMessages,
 };
 use crate::events::{Collect, EventSource, EventSourceType};
+use crate::uuid::UuidGenerator;
 
 /// Build an AWS collector struct.
 #[derive(Default, Debug)]
@@ -164,8 +168,60 @@ impl<'a> Collecter<'a> {
             .ok()
     }
 
+    /// Gets S3 tags from objects.
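
The helper added below fetches an object's current tag set and, when the configured tag is missing, writes a `filemanager_id` UUID tag back so the object can be followed across moves. A hedged sketch of the call shape, with a fixture borrowed from the tests further down (module paths assumed from this repository):

```rust
use filemanager::env::Config;
use filemanager::error::Result;
use filemanager::events::aws::collecter::Collecter;
use filemanager::events::aws::FlatS3EventMessage;

// Sketch only: drives `Collecter::tagging` for a single event. On a
// GetObjectTagging or PutObjectTagging failure the event is returned
// unchanged, so ingestion never blocks on the tag round-trip.
async fn tag_one(
    config: &Config,
    client: &filemanager::clients::aws::s3::Client,
) -> Result<FlatS3EventMessage> {
    let event = FlatS3EventMessage::new_with_generated_id()
        .with_key("key".to_string())
        .with_bucket("bucket".to_string());

    Collecter::tagging(config, client, event).await
}
```
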
+    pub async fn tagging(
+        config: &Config,
+        client: &Client,
+        event: FlatS3EventMessage,
+    ) -> Result<FlatS3EventMessage> {
+        let tagging = client
+            .get_object_tagging(&event.key, &event.bucket)
+            .map_err(|err| {
+                let err = err.into_service_error();
+                warn!("Error received from GetObjectTagging: {}", err);
+                err
+            })
+            .await
+            .ok();
+
+        if let Some(tagging) = tagging {
+            trace!(tagging = ?tagging, "received GetObjectTagging output");
+
+            let GetObjectTaggingOutput { mut tag_set, .. } = tagging;
+
+            if !tag_set
+                .iter()
+                .any(|tag| tag.key == config.ingester_tag_name())
+            {
+                // Add the tag to the object if it's not already there.
+                let tag = Tag::builder()
+                    .key(config.ingester_tag_name())
+                    .value(UuidGenerator::generate())
+                    .build()?;
+                tag_set.push(tag);
+
+                // Try to push the tags to S3, only proceed if successful.
+                let result = client
+                    .put_object_tagging(
+                        &event.key,
+                        &event.bucket,
+                        Tagging::builder().set_tag_set(Some(tag_set)).build()?,
+                    )
+                    .await;
+                if let Err(err) = result {
+                    let err = err.into_service_error();
+                    warn!("Error received from PutObjectTagging: {}", err);
+                    return Ok(event);
+                }
+            }
+        }
+
+        Ok(event)
+    }
+
     /// Process events and add header and datetime fields.
     pub async fn update_events(
+        config: &Config,
         client: &Client,
         events: FlatS3EventMessages,
     ) -> Result<FlatS3EventMessages> {
@@ -180,10 +236,10 @@
             trace!(key = ?event.key, bucket = ?event.bucket, "updating event");
 
             // Race condition: it's possible that an object gets deleted so quickly that it
-            // occurs before calling head. This means that there may be cases where the storage
-            // class and other fields are not known.
+            // occurs before calling head/tagging. This means that there may be cases where the
+            // storage class and other fields are not known, or object moves cannot be tracked.
             if let Some(head) = Self::head(client, &event.key, &event.bucket).await {
-                trace!(head = ?head, "received head object output");
+                trace!(head = ?head, "received HeadObject output");
 
                 let HeadObjectOutput {
                     storage_class,
                     last_modified,
@@ -191,6 +247,7 @@
                     content_length,
                     e_tag,
                     checksum_sha256,
+                    delete_marker,
                     ..
                 } = head;
 
@@ -203,10 +260,11 @@
                     .update_last_modified_date(Self::convert_datetime(last_modified))
                     .update_size(content_length)
                     .update_e_tag(e_tag)
-                    .update_sha256(checksum_sha256);
+                    .update_sha256(checksum_sha256)
+                    .update_delete_marker(delete_marker);
             }
 
-            Ok(event)
+            Self::tagging(config, client, event).await
         }))
         .await
         .into_iter()
@@ -220,17 +278,23 @@
     }
 }
 
+impl From<BuildError> for Error {
+    fn from(err: BuildError) -> Self {
+        S3Error(err.to_string())
+    }
+}
+
 #[async_trait]
 impl<'a> Collect for Collecter<'a> {
     async fn collect(mut self) -> Result<EventSource> {
         let (client, events, config) = self.into_inner();
-        let n_records = events.0.len();
 
         let events = events.sort_and_dedup();
-        let events = Self::update_events(&client, events).await?;
+        let events = Self::update_events(config, &client, events).await?;
 
         // Get only the known event types.
let events = events.filter_known(); + let n_records = events.0.len(); if config.paired_ingest_mode() { Ok(EventSource::new( @@ -251,7 +315,11 @@ pub(crate) mod tests { use std::result; use aws_sdk_s3::error::SdkError; + use aws_sdk_s3::operation::get_object_tagging::GetObjectTaggingError; use aws_sdk_s3::operation::head_object::HeadObjectError; + use aws_sdk_s3::operation::put_object_tagging::{ + PutObjectTaggingError, PutObjectTaggingOutput, + }; use aws_sdk_s3::primitives::{DateTimeFormat, SdkBody}; use aws_sdk_s3::types; use aws_sdk_s3::types::error::NotFound; @@ -259,7 +327,7 @@ pub(crate) mod tests { use aws_sdk_sqs::types::builders::MessageBuilder; use aws_smithy_runtime_api::client::orchestrator::HttpResponse; use aws_smithy_runtime_api::client::result::ServiceError; - use mockall::predicate::eq; + use mockall::predicate::{eq, function}; use crate::events::aws::tests::{ expected_event_record_simple, expected_flat_events_simple, EXPECTED_SHA256, @@ -295,7 +363,7 @@ pub(crate) mod tests { let mut s3_client = S3Client::default(); set_sqs_client_expectations(&mut sqs_client); - set_s3_client_expectations(&mut s3_client, vec![|| Ok(expected_head_object())]); + set_s3_client_expectations(&mut s3_client); let events = CollecterBuilder::default() .with_sqs_client(sqs_client) @@ -328,7 +396,7 @@ pub(crate) mod tests { let config = Default::default(); let mut collecter = test_collecter(&config).await; - set_s3_client_expectations(&mut collecter.client, vec![|| Ok(expected_head_object())]); + set_s3_head_expectations(&mut collecter.client, vec![|| Ok(expected_head_object())]); let result = Collecter::head(&collecter.client, "key", "bucket").await; assert_eq!(result, Some(expected_head_object())); @@ -339,7 +407,7 @@ pub(crate) mod tests { let config = Default::default(); let mut collecter = test_collecter(&config).await; - set_s3_client_expectations( + set_s3_head_expectations( &mut collecter.client, vec![|| Err(expected_head_object_not_found())], ); @@ -355,9 +423,9 @@ pub(crate) mod tests { let events = expected_flat_events_simple().sort_and_dedup(); - set_s3_client_expectations(&mut collecter.client, vec![|| Ok(expected_head_object())]); + set_s3_client_expectations(&mut collecter.client); - let mut result = Collecter::update_events(&collecter.client, events) + let mut result = Collecter::update_events(&config, &collecter.client, events) .await .unwrap() .into_inner() @@ -377,7 +445,7 @@ pub(crate) mod tests { let config = Default::default(); let mut collecter = test_collecter(&config).await; - set_s3_client_expectations(&mut collecter.client, vec![|| Ok(expected_head_object())]); + set_s3_client_expectations(&mut collecter.client); let result = collecter.collect().await.unwrap().into_inner().0; @@ -397,7 +465,7 @@ pub(crate) mod tests { } } - pub(crate) fn set_s3_client_expectations(client: &mut Client, expectations: Vec) + pub(crate) fn set_s3_head_expectations(client: &mut Client, expectations: Vec) where F: Fn() -> result::Result> + Send + 'static, { @@ -411,6 +479,41 @@ pub(crate) mod tests { } } + pub(crate) fn set_s3_tagging_expectations( + client: &mut Client, + get_tagging_expectations: Vec, + put_tagging_expectations: Vec, + ) where + F: Fn() -> result::Result> + + Send + + 'static, + T: Fn() -> result::Result> + + Send + + 'static, + { + let get_tagging = client + .expect_get_object_tagging() + .with(eq("key"), eq("bucket")) + .times(get_tagging_expectations.len()); + + for expectation in get_tagging_expectations { + get_tagging.returning(move |_, _| expectation()); + } + + let 
put_tagging = client + .expect_put_object_tagging() + .with( + eq("key"), + eq("bucket"), + function(|t: &Tagging| t.tag_set().first().unwrap().key == "filemanager_id"), + ) + .times(put_tagging_expectations.len()); + + for expectation in put_tagging_expectations { + put_tagging.returning(move |_, _, _| expectation()); + } + } + pub(crate) fn set_sqs_client_expectations(sqs_client: &mut SQSClient) { sqs_client .expect_receive_message() @@ -430,6 +533,26 @@ pub(crate) mod tests { .build() } + pub(crate) fn expected_get_object_tagging() -> GetObjectTaggingOutput { + GetObjectTaggingOutput::builder() + .set_tag_set(Some(vec![])) + .build() + .unwrap() + } + + pub(crate) fn expected_put_object_tagging() -> PutObjectTaggingOutput { + PutObjectTaggingOutput::builder().build() + } + + pub(crate) fn set_s3_client_expectations(s3_client: &mut Client) { + set_s3_head_expectations(s3_client, vec![|| Ok(expected_head_object())]); + set_s3_tagging_expectations( + s3_client, + vec![|| Ok(expected_get_object_tagging())], + vec![|| Ok(expected_put_object_tagging())], + ); + } + pub(crate) fn expected_head_object_not_found() -> SdkError { SdkError::ServiceError( ServiceError::builder() diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs index cab5c468e..9bd429550 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs @@ -26,7 +26,7 @@ use std::result; #[double] use crate::clients::aws::s3::Client; -use crate::error::Error::S3InventoryError; +use crate::error::Error::S3Error; use crate::error::{Error, Result}; use crate::events::aws::message::{default_version_id, quote_e_tag, EventType::Created}; use crate::events::aws::{FlatS3EventMessage, FlatS3EventMessages, StorageClass}; @@ -69,7 +69,7 @@ impl Inventory { let mut inventory_bytes = vec![]; MultiGzDecoder::new(BufReader::new(body)) .read_to_end(&mut inventory_bytes) - .map_err(|err| S3InventoryError(format!("decompressing CSV: {}", err)))?; + .map_err(|err| S3Error(format!("decompressing CSV: {}", err)))?; // AWS seems to return extra newlines at the end of the CSV, so we remove these let inventory_bytes = Self::trim_whitespace(inventory_bytes.as_slice()); @@ -132,9 +132,8 @@ impl Inventory { // https://github.com/Swoorup/arrow-convert // This is should be similar to arrow2_convert::TryIntoArrow in the above performance graph, // as it is a port of arrow2_convert with arrow-rs as the dependency. - from_slice::>(buf.as_slice()).map_err(|err| { - S3InventoryError(format!("failed to deserialize json from arrow: {}", err)) - }) + from_slice::>(buf.as_slice()) + .map_err(|err| S3Error(format!("failed to deserialize json from arrow: {}", err))) } /// Parse a parquet manifest file into records. @@ -162,11 +161,11 @@ impl Inventory { .client .get_object(key.as_ref(), bucket.as_ref()) .await - .map_err(|err| S3InventoryError(err.to_string()))? + .map_err(|err| S3Error(err.to_string()))? .body .collect() .await - .map_err(|err| S3InventoryError(err.to_string()))? + .map_err(|err| S3Error(err.to_string()))? .to_vec()) } @@ -174,10 +173,10 @@ impl Inventory { fn verify_md5>(data: T, verify_with: T) -> Result<()> { if md5::compute(data).0 != hex::decode(&verify_with) - .map_err(|err| S3InventoryError(format!("decoding hex string: {}", err)))? + .map_err(|err| S3Error(format!("decoding hex string: {}", err)))? 
.as_slice() { - return Err(S3InventoryError( + return Err(S3Error( "mismatched MD5 checksums in inventory manifest".to_string(), )); } @@ -232,9 +231,7 @@ impl Inventory { InventoryFormat::Csv => self.parse_csv(schema, body.as_slice()).await, InventoryFormat::Parquet => self.parse_parquet(body).await, InventoryFormat::Orc => self.parse_orc(body).await, - _ => Err(S3InventoryError( - "unsupported manifest file type".to_string(), - )), + _ => Err(S3Error("unsupported manifest file type".to_string())), } } @@ -246,9 +243,7 @@ impl Inventory { // Proper arn, parse out the bucket. Ok(arn) => { if arn.service != Service::S3.into() { - return Err(S3InventoryError( - "destination bucket ARN is not S3".to_string(), - )); + return Err(S3Error("destination bucket ARN is not S3".to_string())); } arn.resource.to_string() } @@ -286,25 +281,25 @@ impl Inventory { impl From for Error { fn from(error: csv::Error) -> Self { - S3InventoryError(error.to_string()) + S3Error(error.to_string()) } } impl From for Error { fn from(error: ParquetError) -> Self { - S3InventoryError(error.to_string()) + S3Error(error.to_string()) } } impl From for Error { fn from(error: ArrowError) -> Self { - S3InventoryError(error.to_string()) + S3Error(error.to_string()) } } impl From for Error { fn from(error: OrcError) -> Self { - S3InventoryError(error.to_string()) + S3Error(error.to_string()) } } diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs index 82979617f..cfc13e81f 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs @@ -487,6 +487,14 @@ impl FlatS3EventMessage { self } + /// Update the delete marker if not None. + pub fn update_delete_marker(mut self, delete_marker: Option) -> Self { + delete_marker + .into_iter() + .for_each(|is_delete_marker| self.is_delete_marker = is_delete_marker); + self + } + /// Set the s3 object id. 
pub fn with_s3_object_id(mut self, s3_object_id: Uuid) -> Self { self.s3_object_id = s3_object_id; diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs index 9ffe59068..272537c9b 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs @@ -217,7 +217,7 @@ pub(crate) mod tests { }; use crate::database::aws::migration::tests::MIGRATOR; use crate::events::aws::collecter::tests::{ - expected_head_object, set_s3_client_expectations, set_sqs_client_expectations, + set_s3_client_expectations, set_sqs_client_expectations, }; use crate::events::aws::inventory::tests::{ csv_manifest_from_key_expectations, EXPECTED_LAST_MODIFIED_ONE, @@ -252,7 +252,7 @@ pub(crate) mod tests { async fn test_ingest_event(pool: PgPool) { let mut s3_client = S3Client::default(); - set_s3_client_expectations(&mut s3_client, vec![|| Ok(expected_head_object())]); + set_s3_client_expectations(&mut s3_client); let event = SqsEvent { records: vec![SqsMessage { @@ -376,7 +376,7 @@ pub(crate) mod tests { let mut s3_client = S3Client::default(); set_sqs_client_expectations(&mut sqs_client); - set_s3_client_expectations(&mut s3_client, vec![|| Ok(expected_head_object())]); + set_s3_client_expectations(&mut s3_client); f(sqs_client, s3_client).await; diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs index d0834633c..a6fe4d888 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs @@ -82,6 +82,7 @@ impl Entries { ActiveS3Object { s3_object_id: Set(UuidGenerator::generate()), + move_id: Set(Some(UuidGenerator::generate())), event_type: Set(event.clone()), bucket: Set((index / bucket_divisor).to_string()), key: Set((index / key_divisor).to_string()), From f87882b008668fb331b0a30f3024e16588761e4c Mon Sep 17 00:00:00 2001 From: Marko Malenic Date: Tue, 1 Oct 2024 13:48:34 +1000 Subject: [PATCH 2/8] feat(filemanager): add move_id logic to database when ingesting --- .../api/select_existing_by_bucket_key.sql | 1 + .../aws/insert_s3_created_objects.sql | 6 +- .../ingester/aws/insert_s3_objects.sql | 6 +- .../aws/update_reordered_for_created.sql | 11 +- .../aws/update_reordered_for_deleted.sql | 1 + .../filemanager/src/database/aws/ingester.rs | 1 + .../src/database/aws/ingester_paired.rs | 37 +++- .../filemanager/src/database/mod.rs | 1 + .../filemanager/src/events/aws/collecter.rs | 175 +++++++++++------- .../filemanager/src/events/aws/inventory.rs | 1 + .../filemanager/src/events/aws/message.rs | 1 + .../filemanager/src/events/aws/mod.rs | 16 ++ 12 files changed, 176 insertions(+), 81 deletions(-) diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql b/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql index 16e5909a7..c5549b6a4 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql @@ -34,6 +34,7 @@ select size, is_delete_marker, event_type, + move_id, 0::bigint as "number_reordered" from input -- Grab the most recent object in each input group. 
diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql index a182e1bef..16b0bf3e1 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql @@ -12,7 +12,8 @@ insert into s3_object ( version_id, sequencer, is_delete_marker, - event_type + event_type, + move_id ) values ( unnest($1::uuid[]), @@ -27,7 +28,8 @@ values ( unnest($10::text[]), unnest($11::text[]), unnest($12::boolean[]), - unnest($13::event_type[]) + unnest($13::event_type[]), + unnest($14::uuid[]) ) on conflict on constraint sequencer_unique do update set number_duplicate_events = s3_object.number_duplicate_events + 1 returning s3_object_id, number_duplicate_events; diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql index a182e1bef..16b0bf3e1 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql @@ -12,7 +12,8 @@ insert into s3_object ( version_id, sequencer, is_delete_marker, - event_type + event_type, + move_id ) values ( unnest($1::uuid[]), @@ -27,7 +28,8 @@ values ( unnest($10::text[]), unnest($11::text[]), unnest($12::boolean[]), - unnest($13::event_type[]) + unnest($13::event_type[]), + unnest($14::uuid[]) ) on conflict on constraint sequencer_unique do update set number_duplicate_events = s3_object.number_duplicate_events + 1 returning s3_object_id, number_duplicate_events; diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql index f136c4731..ac9af57d7 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql @@ -18,7 +18,8 @@ with input as ( $10::text[], $11::text[], $12::boolean[], - $13::event_type[] + $13::event_type[], + $14::uuid[] ) as input ( s3_object_id, bucket, @@ -32,7 +33,8 @@ with input as ( version_id, created_sequencer, is_delete_marker, - event_type + event_type, + move_id ) ), -- Then, select the objects that need to be updated. @@ -51,7 +53,8 @@ current_objects as ( input.e_tag as input_e_tag, input.storage_class as input_storage_class, input.is_delete_marker as input_is_delete_marker, - input.event_type as input_event_type + input.event_type as input_event_type, + input.move_id as input_move_id from s3_object -- Grab the relevant values to update with. join input on @@ -100,6 +103,7 @@ update as ( is_delete_marker = objects_to_update.input_is_delete_marker, storage_class = objects_to_update.input_storage_class, event_type = objects_to_update.input_event_type, + move_id = objects_to_update.input_move_id, number_reordered = s3_object.number_reordered + -- Note the asymmetry between this and the reorder for deleted query. 
case when objects_to_update.deleted_sequencer is not null or objects_to_update.sequencer is not null then @@ -127,6 +131,7 @@ select number_duplicate_events, size, is_delete_marker, + move_id, -- This is used to simplify re-constructing the FlatS3EventMessages in the Lambda. I.e. this update detected an -- out of order created event, so return a created event back. 'Created'::event_type as "event_type" diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql index 41f720502..8c3073d6c 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql @@ -98,6 +98,7 @@ select number_duplicate_events, size, is_delete_marker, + move_id, -- This is used to simplify re-constructing the FlatS3EventMessages in the Lambda. I.e. this update detected an -- out of order deleted event, so return a deleted event back. 'Deleted'::event_type as "event_type" diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs index ed784d1d0..6ae7c6c7e 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs @@ -53,6 +53,7 @@ impl Ingester { .bind(&events.sequencers) .bind(&events.is_delete_markers) .bind(&events.event_types) + .bind(&events.move_ids) .fetch_all(&mut *tx) .await?; diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs index aaab78780..6b9ad565e 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs @@ -104,6 +104,7 @@ impl IngesterPaired { .bind(&object_created.sequencers) .bind(&object_created.is_delete_markers) .bind(vec![Other; object_created.s3_object_ids.len()]) + .bind(&object_created.move_ids) .fetch_all(&mut *tx) .await?; @@ -124,6 +125,7 @@ impl IngesterPaired { .bind(&object_created.sequencers) .bind(&object_created.is_delete_markers) .bind(vec![Other; object_created.s3_object_ids.len()]) + .bind(&object_created.move_ids) .fetch_all(&mut *tx) .await?; @@ -179,12 +181,6 @@ impl IngesterPaired { pub(crate) mod tests { use std::ops::Add; - use chrono::{DateTime, Utc}; - use itertools::Itertools; - use sqlx::postgres::PgRow; - use sqlx::{Executor, PgPool, Row}; - use tokio::time::Instant; - use crate::database::aws::ingester::tests::fetch_results; use crate::database::aws::migration::tests::MIGRATOR; use crate::database::{Client, Ingest}; @@ -202,6 +198,13 @@ pub(crate) mod tests { use crate::events::aws::{Events, FlatS3EventMessage, FlatS3EventMessages, StorageClass}; use crate::events::EventSourceType; use crate::events::EventSourceType::S3Paired; + use crate::uuid::UuidGenerator; + use chrono::{DateTime, Utc}; + use itertools::Itertools; + use sqlx::postgres::PgRow; + use sqlx::{Executor, PgPool, Row}; + use tokio::time::Instant; + use uuid::Uuid; #[sqlx::test(migrator = "MIGRATOR")] async fn ingest_object_created(pool: PgPool) { @@ -213,6 
+216,9 @@ pub(crate) mod tests { let s3_object_results = fetch_results(&ingester).await; assert_eq!(s3_object_results.len(), 1); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_some()); assert_created(&s3_object_results[0]); } @@ -264,6 +270,9 @@ pub(crate) mod tests { let s3_object_results = fetch_results(&ingester).await; assert_eq!(s3_object_results.len(), 1); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_some()); assert_with( &s3_object_results[0], Some(i64::MAX), @@ -301,6 +310,9 @@ pub(crate) mod tests { let s3_object_results = fetch_results(&ingester).await; assert_eq!(s3_object_results.len(), 1); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_some()); assert_ingest_events(&s3_object_results[0], EXPECTED_VERSION_ID); } @@ -319,6 +331,9 @@ pub(crate) mod tests { let s3_object_results = fetch_results(&ingester).await; assert_eq!(s3_object_results.len(), 1); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_some()); assert_eq!( 2, s3_object_results[0].get::("number_duplicate_events") @@ -444,6 +459,9 @@ pub(crate) mod tests { let s3_object_results = fetch_results(&ingester).await; assert_eq!(s3_object_results.len(), 1); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_some()); assert_eq!(2, s3_object_results[0].get::("number_reordered")); assert_ingest_events(&s3_object_results[0], EXPECTED_VERSION_ID); } @@ -1633,6 +1651,13 @@ pub(crate) mod tests { update_last_modified(&mut events.object_created.last_modified_dates); update_storage_class(&mut events.object_created.storage_classes); update_sha256(&mut events.object_created.sha256s); + events + .object_created + .move_ids + .iter_mut() + .for_each(|move_id| { + *move_id = Some(UuidGenerator::generate()); + }); update_last_modified(&mut events.object_deleted.last_modified_dates); update_storage_class(&mut events.object_deleted.storage_classes); diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/mod.rs index 037c770e9..03bc2caea 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/mod.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/mod.rs @@ -240,6 +240,7 @@ pub(crate) mod tests { .bind(vec![EXPECTED_SEQUENCER_CREATED_ONE.to_string()]) .bind(vec![false]) .bind(vec![event_type]) + .bind(vec![UuidGenerator::generate()]) .fetch_all(pool) .await .unwrap(); diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs index 66f923534..cf5ed0705 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs @@ -1,21 +1,6 @@ //! Definition and implementation of the aws Collecter. //! 
-use async_trait::async_trait; -use aws_sdk_s3::error::BuildError; -use aws_sdk_s3::operation::get_object_tagging::GetObjectTaggingOutput; -use aws_sdk_s3::operation::head_object::HeadObjectOutput; -use aws_sdk_s3::primitives; -use aws_sdk_s3::types::StorageClass::Standard; -use aws_sdk_s3::types::{Tag, Tagging}; -use chrono::{DateTime, Utc}; -use futures::future::join_all; -use futures::TryFutureExt; -use mockall_double::double; -use tracing::{trace, warn}; - -#[double] -use crate::clients::aws::s3::Client; #[double] use crate::clients::aws::s3::Client as S3Client; #[double] @@ -28,6 +13,20 @@ use crate::events::aws::{ }; use crate::events::{Collect, EventSource, EventSourceType}; use crate::uuid::UuidGenerator; +use async_trait::async_trait; +use aws_sdk_s3::error::BuildError; +use aws_sdk_s3::operation::get_object_tagging::GetObjectTaggingOutput; +use aws_sdk_s3::operation::head_object::HeadObjectOutput; +use aws_sdk_s3::primitives; +use aws_sdk_s3::types::StorageClass::Standard; +use aws_sdk_s3::types::{Tag, Tagging}; +use chrono::{DateTime, Utc}; +use futures::future::join_all; +use futures::TryFutureExt; +use mockall_double::double; +use std::str::FromStr; +use tracing::{trace, warn}; +use uuid::Uuid; /// Build an AWS collector struct. #[derive(Default, Debug)] @@ -124,7 +123,7 @@ impl CollecterBuilder { /// records is None before any events have been processed. #[derive(Debug)] pub struct Collecter<'a> { - client: Client, + client: S3Client, raw_events: FlatS3EventMessages, config: &'a Config, n_records: Option, @@ -132,7 +131,11 @@ pub struct Collecter<'a> { impl<'a> Collecter<'a> { /// Create a new collector. - pub(crate) fn new(client: Client, raw_events: FlatS3EventMessages, config: &'a Config) -> Self { + pub(crate) fn new( + client: S3Client, + raw_events: FlatS3EventMessages, + config: &'a Config, + ) -> Self { Self { client, raw_events, @@ -142,7 +145,7 @@ impl<'a> Collecter<'a> { } /// Get the inner values. - pub fn into_inner(self) -> (Client, FlatS3EventMessages, &'a Config) { + pub fn into_inner(self) -> (S3Client, FlatS3EventMessages, &'a Config) { (self.client, self.raw_events, self.config) } @@ -156,23 +159,55 @@ impl<'a> Collecter<'a> { } /// Gets S3 metadata from HeadObject such as creation/archival timestamps and statuses. - pub async fn head(client: &Client, key: &str, bucket: &str) -> Option { - client - .head_object(key, bucket) + pub async fn head( + client: &S3Client, + mut event: FlatS3EventMessage, + ) -> Result { + let head = client + .head_object(&event.key, &event.bucket) .map_err(|err| { let err = err.into_service_error(); warn!("Error received from HeadObject: {}", err); err }) .await - .ok() + .ok(); + + // Race condition: it's possible that an object gets deleted so quickly that it + // occurs before calling head/tagging. This means that there may be cases where the + // storage class and other fields are not known, or object moves cannot be tracked. + if let Some(head) = head { + trace!(head = ?head, "received HeadObject output"); + + let HeadObjectOutput { + storage_class, + last_modified, + content_length, + e_tag, + checksum_sha256, + delete_marker, + .. + } = head; + + // S3 does not return a storage class for standard, which means this is the + // default. 
See https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax + event = event + .update_storage_class(StorageClass::from_aws(storage_class.unwrap_or(Standard))) + .update_last_modified_date(Self::convert_datetime(last_modified)) + .update_size(content_length) + .update_e_tag(e_tag) + .update_sha256(checksum_sha256) + .update_delete_marker(delete_marker); + } + + Ok(event) } /// Gets S3 tags from objects. pub async fn tagging( config: &Config, - client: &Client, - event: FlatS3EventMessage, + client: &S3Client, + mut event: FlatS3EventMessage, ) -> Result { let tagging = client .get_object_tagging(&event.key, &event.bucket) @@ -185,15 +220,15 @@ impl<'a> Collecter<'a> { .ok(); if let Some(tagging) = tagging { - trace!(tagging = ?tagging, "received GetObjectTagging output"); + trace!(tagging = ?tagging, "received tagging output"); let GetObjectTaggingOutput { mut tag_set, .. } = tagging; + // Add the tag to the object if it's not already there. if !tag_set .iter() .any(|tag| tag.key == config.ingester_tag_name()) { - // Add the tag to the object if it's not already there. let tag = Tag::builder() .key(config.ingester_tag_name()) .value(UuidGenerator::generate()) @@ -205,15 +240,28 @@ impl<'a> Collecter<'a> { .put_object_tagging( &event.key, &event.bucket, - Tagging::builder().set_tag_set(Some(tag_set)).build()?, + Tagging::builder() + .set_tag_set(Some(tag_set.clone())) + .build()?, ) .await; + if let Err(err) = result { - let err = err.into_service_error(); - warn!("Error received from PutObjectTagging: {}", err); + warn!( + "Error received from PutObjectTagging: {}", + err.into_service_error() + ); return Ok(event); } } + + let tag = tag_set + .iter() + .find(|value| value.key == config.ingester_tag_name()) + .unwrap(); + let move_id = Uuid::from_str(tag.value()).unwrap(); + + event = event.update_move_id(Some(move_id)); } Ok(event) @@ -222,11 +270,11 @@ impl<'a> Collecter<'a> { /// Process events and add header and datetime fields. pub async fn update_events( config: &Config, - client: &Client, + client: &S3Client, events: FlatS3EventMessages, ) -> Result { Ok(FlatS3EventMessages( - join_all(events.into_inner().into_iter().map(|mut event| async move { + join_all(events.into_inner().into_iter().map(|event| async move { // No need to run this unnecessarily on removed events. match event.event_type { EventType::Deleted | EventType::Other => return Ok(event), @@ -235,35 +283,7 @@ impl<'a> Collecter<'a> { trace!(key = ?event.key, bucket = ?event.bucket, "updating event"); - // Race condition: it's possible that an object gets deleted so quickly that it - // occurs before calling head/tagging. This means that there may be cases where the - // storage class and other fields are not known, or object moves cannot be tracked. - if let Some(head) = Self::head(client, &event.key, &event.bucket).await { - trace!(head = ?head, "received HeadObject output"); - - let HeadObjectOutput { - storage_class, - last_modified, - content_length, - e_tag, - checksum_sha256, - delete_marker, - .. - } = head; - - // S3 does not return a storage class for standard, which means this is the - // default. 
See https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax - event = event - .update_storage_class(StorageClass::from_aws( - storage_class.unwrap_or(Standard), - )) - .update_last_modified_date(Self::convert_datetime(last_modified)) - .update_size(content_length) - .update_e_tag(e_tag) - .update_sha256(checksum_sha256) - .update_delete_marker(delete_marker); - } - + let event = Self::head(client, event).await?; Self::tagging(config, client, event).await })) .await @@ -398,8 +418,21 @@ pub(crate) mod tests { set_s3_head_expectations(&mut collecter.client, vec![|| Ok(expected_head_object())]); - let result = Collecter::head(&collecter.client, "key", "bucket").await; - assert_eq!(result, Some(expected_head_object())); + let result = Collecter::head( + &collecter.client, + FlatS3EventMessage::new_with_generated_id() + .with_key("key".to_string()) + .with_bucket("bucket".to_string()), + ) + .await + .unwrap(); + let expected = result + .clone() + .with_sha256(Some(EXPECTED_SHA256.to_string())) + .with_storage_class(Some(IntelligentTiering)) + .with_last_modified_date(Some(Default::default())); + + assert_eq!(result, expected); } #[tokio::test] @@ -412,8 +445,14 @@ pub(crate) mod tests { vec![|| Err(expected_head_object_not_found())], ); - let result = Collecter::head(&collecter.client, "key", "bucket").await; - assert!(result.is_none()); + let result = Collecter::head( + &collecter.client, + FlatS3EventMessage::new_with_generated_id() + .with_key("key".to_string()) + .with_bucket("bucket".to_string()), + ) + .await; + assert!(result.is_ok()); } #[tokio::test] @@ -465,7 +504,7 @@ pub(crate) mod tests { } } - pub(crate) fn set_s3_head_expectations(client: &mut Client, expectations: Vec) + pub(crate) fn set_s3_head_expectations(client: &mut S3Client, expectations: Vec) where F: Fn() -> result::Result> + Send + 'static, { @@ -480,7 +519,7 @@ pub(crate) mod tests { } pub(crate) fn set_s3_tagging_expectations( - client: &mut Client, + client: &mut S3Client, get_tagging_expectations: Vec, put_tagging_expectations: Vec, ) where @@ -544,7 +583,7 @@ pub(crate) mod tests { PutObjectTaggingOutput::builder().build() } - pub(crate) fn set_s3_client_expectations(s3_client: &mut Client) { + pub(crate) fn set_s3_client_expectations(s3_client: &mut S3Client) { set_s3_head_expectations(s3_client, vec![|| Ok(expected_head_object())]); set_s3_tagging_expectations( s3_client, @@ -563,7 +602,7 @@ pub(crate) mod tests { } async fn test_collecter(config: &Config) -> Collecter<'_> { - Collecter::new(Client::default(), expected_flat_events_simple(), config) + Collecter::new(S3Client::default(), expected_flat_events_simple(), config) } fn expected_receive_message() -> ReceiveMessageOutput { diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs index 9bd429550..824c77b58 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs @@ -528,6 +528,7 @@ impl From for FlatS3EventMessage { // Anything in an inventory report is always a created event. 
             event_type: Created,
             is_delete_marker: is_delete_marker.unwrap_or_default(),
+            move_id: None,
             number_duplicate_events: 0,
             number_reordered: 0,
         }
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs
index 3fe939148..287371d2a 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs
@@ -198,6 +198,7 @@ impl From for FlatS3EventMessage {
             sha256: None,
             event_type,
             is_delete_marker,
+            move_id: None,
             number_duplicate_events: 0,
             number_reordered: 0,
         }
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs
index cfc13e81f..58945b939 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs
@@ -79,6 +79,7 @@ pub struct TransposedS3EventMessages {
     pub last_modified_dates: Vec<Option<DateTime<Utc>>>,
     pub event_types: Vec<EventType>,
     pub is_delete_markers: Vec<bool>,
+    pub move_ids: Vec<Option<Uuid>>,
 }
 
 impl TransposedS3EventMessages {
@@ -99,6 +100,7 @@ impl TransposedS3EventMessages {
             last_modified_dates: Vec::with_capacity(capacity),
             event_types: Vec::with_capacity(capacity),
             is_delete_markers: Vec::with_capacity(capacity),
+            move_ids: Vec::with_capacity(capacity),
         }
     }
 
@@ -118,6 +120,7 @@ impl TransposedS3EventMessages {
             last_modified_date,
             event_type,
             is_delete_marker,
+            move_id,
             ..
         } = message;
 
@@ -134,6 +137,7 @@ impl TransposedS3EventMessages {
         self.last_modified_dates.push(last_modified_date);
         self.event_types.push(event_type);
         self.is_delete_markers.push(is_delete_marker);
+        self.move_ids.push(move_id);
     }
 }
 
@@ -171,6 +175,7 @@ impl From<TransposedS3EventMessages> for FlatS3EventMessages {
             messages.last_modified_dates,
             messages.event_types,
             messages.is_delete_markers,
+            messages.move_ids
         )
         .map(
             |(
@@ -187,6 +192,7 @@ impl From<TransposedS3EventMessages> for FlatS3EventMessages {
                 last_modified_date,
                 event_type,
                 is_delete_marker,
+                move_id,
             )| {
                 FlatS3EventMessage {
                     s3_object_id,
@@ -202,6 +208,7 @@ impl From<TransposedS3EventMessages> for FlatS3EventMessages {
                     event_time,
                     event_type,
                     is_delete_marker,
+                    move_id,
                     number_duplicate_events: 0,
                     number_reordered: 0,
                 }
@@ -435,6 +442,7 @@ pub struct FlatS3EventMessage {
     pub event_time: Option<DateTime<Utc>>,
     pub event_type: EventType,
     pub is_delete_marker: bool,
+    pub move_id: Option<Uuid>,
     pub number_duplicate_events: i64,
     pub number_reordered: i64,
 }
@@ -495,6 +503,14 @@ impl FlatS3EventMessage {
         self
     }
 
+    /// Update the move id if not None.
+    pub fn update_move_id(mut self, move_id: Option<Uuid>) -> Self {
+        move_id
+            .into_iter()
+            .for_each(|move_id| self.move_id = Some(move_id));
+        self
+    }
+
     /// Set the s3 object id.
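
The `update_move_id` builder above follows the same update-if-`Some` convention as the other `update_*` methods. A small sketch of the semantics (fixture values are illustrative; module paths assumed from this repository):

```rust
use filemanager::events::aws::FlatS3EventMessage;
use filemanager::uuid::UuidGenerator;

// `None` leaves the current move_id untouched; `Some` overwrites it.
fn example() {
    let event = FlatS3EventMessage::new_with_generated_id();
    assert!(event.move_id.is_none());

    let event = event.update_move_id(None);
    assert!(event.move_id.is_none());

    let event = event.update_move_id(Some(UuidGenerator::generate()));
    assert!(event.move_id.is_some());
}
```
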
pub fn with_s3_object_id(mut self, s3_object_id: Uuid) -> Self { self.s3_object_id = s3_object_id; From 15f778b686442812aa6f1ea98daebcc0270360ad Mon Sep 17 00:00:00 2001 From: Marko Malenic Date: Tue, 1 Oct 2024 19:31:16 +1000 Subject: [PATCH 3/8] feat(filemanager): update attributes from existing records when ingesting a moved object --- .../api/select_existing_by_bucket_key.sql | 1 + .../aws/update_reordered_for_created.sql | 1 + .../aws/update_reordered_for_deleted.sql | 1 + .../filemanager/src/database/aws/ingester.rs | 24 +- .../src/database/aws/ingester_paired.rs | 46 +--- .../filemanager/src/events/aws/collecter.rs | 217 ++++++++++++------ .../filemanager/src/events/aws/inventory.rs | 1 + .../filemanager/src/events/aws/message.rs | 1 + .../filemanager/src/events/aws/mod.rs | 34 ++- .../filemanager/src/handlers/aws.rs | 4 +- .../filemanager/src/queries/list.rs | 3 +- .../filemanager/src/routes/filter/mod.rs | 9 +- 12 files changed, 217 insertions(+), 125 deletions(-) diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql b/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql index c5549b6a4..3818b4725 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql @@ -35,6 +35,7 @@ select is_delete_marker, event_type, move_id, + attributes, 0::bigint as "number_reordered" from input -- Grab the most recent object in each input group. diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql index ac9af57d7..d6ecd1342 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql @@ -132,6 +132,7 @@ select size, is_delete_marker, move_id, + attributes, -- This is used to simplify re-constructing the FlatS3EventMessages in the Lambda. I.e. this update detected an -- out of order created event, so return a created event back. 'Created'::event_type as "event_type" diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql index 8c3073d6c..e13e50083 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql @@ -99,6 +99,7 @@ select size, is_delete_marker, move_id, + attributes, -- This is used to simplify re-constructing the FlatS3EventMessages in the Lambda. I.e. this update detected an -- out of order deleted event, so return a deleted event back. 
'Deleted'::event_type as "event_type" diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs index 6ae7c6c7e..14bffe147 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs @@ -75,6 +75,7 @@ pub(crate) mod tests { use sqlx::postgres::PgRow; use sqlx::{Executor, PgPool, Row}; use tokio::time::Instant; + use uuid::Uuid; use crate::database::aws::migration::tests::MIGRATOR; use crate::database::{Client, Ingest}; @@ -91,6 +92,7 @@ pub(crate) mod tests { }; use crate::events::EventSourceType; use crate::events::EventSourceType::S3; + use crate::uuid::UuidGenerator; #[sqlx::test(migrator = "MIGRATOR")] async fn ingest_object_created(pool: PgPool) { @@ -102,6 +104,9 @@ pub(crate) mod tests { let s3_object_results = fetch_results(&ingester).await; assert_eq!(s3_object_results.len(), 1); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_some()); assert_created(&s3_object_results[0]); } @@ -148,6 +153,9 @@ pub(crate) mod tests { let s3_object_results = fetch_results(&ingester).await; assert_eq!(s3_object_results.len(), 1); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_some()); assert_with( &s3_object_results[0], Some(i64::MAX), @@ -184,6 +192,9 @@ pub(crate) mod tests { let s3_object_results = fetch_results(&ingester).await; assert_eq!(s3_object_results.len(), 2); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_some()); assert_eq!( 1, s3_object_results[0].get::("number_duplicate_events") @@ -288,6 +299,9 @@ pub(crate) mod tests { let s3_object_results = fetch_results(&ingester).await; assert_eq!(s3_object_results.len(), 2); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_some()); // Order should be different here. 
assert_ingest_events( &s3_object_results[1], @@ -1065,7 +1079,9 @@ pub(crate) mod tests { assert_row(s3_object_results, message, sequencer, event_time); } - fn update_test_events(mut events: TransposedS3EventMessages) -> TransposedS3EventMessages { + pub(crate) fn update_test_events( + mut events: TransposedS3EventMessages, + ) -> TransposedS3EventMessages { let update_last_modified = |dates: &mut Vec>>| { dates.iter_mut().for_each(|last_modified| { *last_modified = Some(DateTime::default()); @@ -1081,10 +1097,16 @@ pub(crate) mod tests { *sha256 = Some(EXPECTED_SHA256.to_string()); }); }; + let update_move_ids = |move_ids: &mut Vec>| { + move_ids.iter_mut().for_each(|move_id| { + *move_id = Some(UuidGenerator::generate()); + }); + }; update_last_modified(&mut events.last_modified_dates); update_storage_class(&mut events.storage_classes); update_sha256(&mut events.sha256s); + update_move_ids(&mut events.move_ids); events } diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs index 6b9ad565e..f5d4e763e 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs @@ -182,6 +182,7 @@ pub(crate) mod tests { use std::ops::Add; use crate::database::aws::ingester::tests::fetch_results; + use crate::database::aws::ingester::tests::update_test_events; use crate::database::aws::migration::tests::MIGRATOR; use crate::database::{Client, Ingest}; use crate::events::aws::message::default_version_id; @@ -195,10 +196,9 @@ pub(crate) mod tests { EXPECTED_SEQUENCER_DELETED_ONE, EXPECTED_SEQUENCER_DELETED_TWO, EXPECTED_SHA256, EXPECTED_VERSION_ID, }; - use crate::events::aws::{Events, FlatS3EventMessage, FlatS3EventMessages, StorageClass}; + use crate::events::aws::{Events, FlatS3EventMessage, FlatS3EventMessages}; use crate::events::EventSourceType; use crate::events::EventSourceType::S3Paired; - use crate::uuid::UuidGenerator; use chrono::{DateTime, Utc}; use itertools::Itertools; use sqlx::postgres::PgRow; @@ -530,7 +530,7 @@ pub(crate) mod tests { let mut events = vec![event]; events.extend(expected_flat_events_simple().sort_and_dedup().into_inner()); - let events = update_test_events(FlatS3EventMessages(events).into()); + let events = update_test_events_paired(FlatS3EventMessages(events).into()); // This also checks to make sure that the update duplicate constraint succeeds. 
ingester .ingest(EventSourceType::S3Paired(events)) @@ -700,7 +700,7 @@ pub(crate) mod tests { let mut events = vec![event]; events.extend(expected_flat_events_simple().sort_and_dedup().into_inner()); - let events = update_test_events(FlatS3EventMessages(events).into()); + let events = update_test_events_paired(FlatS3EventMessages(events).into()); ingester .ingest(EventSourceType::S3Paired(events)) @@ -1631,37 +1631,9 @@ pub(crate) mod tests { ); } - fn update_test_events(mut events: Events) -> Events { - let update_last_modified = |dates: &mut Vec>>| { - dates.iter_mut().for_each(|last_modified| { - *last_modified = Some(DateTime::default()); - }); - }; - let update_storage_class = |classes: &mut Vec>| { - classes.iter_mut().for_each(|storage_class| { - *storage_class = Some(StorageClass::Standard); - }); - }; - let update_sha256 = |sha256s: &mut Vec>| { - sha256s.iter_mut().for_each(|sha256| { - *sha256 = Some(EXPECTED_SHA256.to_string()); - }); - }; - - update_last_modified(&mut events.object_created.last_modified_dates); - update_storage_class(&mut events.object_created.storage_classes); - update_sha256(&mut events.object_created.sha256s); - events - .object_created - .move_ids - .iter_mut() - .for_each(|move_id| { - *move_id = Some(UuidGenerator::generate()); - }); - - update_last_modified(&mut events.object_deleted.last_modified_dates); - update_storage_class(&mut events.object_deleted.storage_classes); - update_sha256(&mut events.object_deleted.sha256s); + fn update_test_events_paired(mut events: Events) -> Events { + events.object_created = update_test_events(events.object_created); + events.object_deleted = update_test_events(events.object_deleted); events } @@ -1680,11 +1652,11 @@ pub(crate) mod tests { } pub(crate) fn test_events() -> Events { - update_test_events(expected_events_simple()) + update_test_events_paired(expected_events_simple()) } pub(crate) fn test_events_delete_marker() -> Events { - update_test_events(expected_events_simple_delete_marker()) + update_test_events_paired(expected_events_simple_delete_marker()) } pub(crate) fn test_created_events() -> Events { diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs index cf5ed0705..6f8617acf 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs @@ -5,6 +5,7 @@ use crate::clients::aws::s3::Client as S3Client; #[double] use crate::clients::aws::sqs::Client as SQSClient; +use crate::database; use crate::env::Config; use crate::error::Error::{S3Error, SQSError, SerdeError}; use crate::error::{Error, Result}; @@ -12,6 +13,8 @@ use crate::events::aws::{ EventType, FlatS3EventMessage, FlatS3EventMessages, StorageClass, TransposedS3EventMessages, }; use crate::events::{Collect, EventSource, EventSourceType}; +use crate::queries::list::ListQueryBuilder; +use crate::routes::filter::S3ObjectsFilter; use crate::uuid::UuidGenerator; use async_trait::async_trait; use aws_sdk_s3::error::BuildError; @@ -61,11 +64,16 @@ impl CollecterBuilder { } /// Build a collector using the raw events. 
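
Patch 3's builder change below threads a database handle into the collecter so `tagging` can recover attributes for moved objects. A hedged sketch of the resulting wiring (module paths assumed from this repository; client construction elided):

```rust
use filemanager::database;
use filemanager::env::Config;
use filemanager::events::aws::collecter::{Collecter, CollecterBuilder};
use filemanager::events::aws::FlatS3EventMessages;

// Hypothetical wiring for the new `build` signature shown below: the
// database client is borrowed for the collecter's lifetime.
async fn make_collecter<'a>(
    s3_client: filemanager::clients::aws::s3::Client,
    database_client: &'a database::Client,
    config: &'a Config,
    raw_events: FlatS3EventMessages,
) -> Collecter<'a> {
    CollecterBuilder::default()
        .with_s3_client(s3_client)
        .build(raw_events, config, database_client)
        .await
}
```
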
- pub async fn build(self, raw_events: FlatS3EventMessages, config: &Config) -> Collecter<'_> { + pub async fn build<'a>( + self, + raw_events: FlatS3EventMessages, + config: &'a Config, + client: &'a database::Client, + ) -> Collecter<'a> { if let Some(s3_client) = self.s3_client { - Collecter::new(s3_client, raw_events, config) + Collecter::new(s3_client, client, raw_events, config) } else { - Collecter::new(S3Client::with_defaults().await, raw_events, config) + Collecter::new(S3Client::with_defaults().await, client, raw_events, config) } } @@ -98,20 +106,29 @@ impl CollecterBuilder { } /// Build a collector by manually calling receive to obtain the raw events. - pub async fn build_receive(mut self, config: &Config) -> Result { + pub async fn build_receive<'a>( + mut self, + config: &'a Config, + database_client: &'a database::Client, + ) -> Result> { let url = self.sqs_url.take(); let url = Config::value_or_else(url.as_deref(), config.sqs_url())?; let client = self.sqs_client.take(); if let Some(sqs_client) = &client { Ok(self - .build(Self::receive(sqs_client, url).await?, config) + .build( + Self::receive(sqs_client, url).await?, + config, + database_client, + ) .await) } else { Ok(self .build( Self::receive(&SQSClient::with_defaults().await, url).await?, config, + database_client, ) .await) } @@ -124,6 +141,7 @@ impl CollecterBuilder { #[derive(Debug)] pub struct Collecter<'a> { client: S3Client, + database_client: &'a database::Client, raw_events: FlatS3EventMessages, config: &'a Config, n_records: Option, @@ -133,11 +151,13 @@ impl<'a> Collecter<'a> { /// Create a new collector. pub(crate) fn new( client: S3Client, + database_client: &'a database::Client, raw_events: FlatS3EventMessages, config: &'a Config, ) -> Self { Self { client, + database_client, raw_events, config, n_records: None, @@ -145,8 +165,20 @@ impl<'a> Collecter<'a> { } /// Get the inner values. - pub fn into_inner(self) -> (S3Client, FlatS3EventMessages, &'a Config) { - (self.client, self.raw_events, self.config) + pub fn into_inner( + self, + ) -> ( + S3Client, + &'a database::Client, + FlatS3EventMessages, + &'a Config, + ) { + ( + self.client, + self.database_client, + self.raw_events, + self.config, + ) } /// Converts an AWS datetime to a standard database format. @@ -207,6 +239,7 @@ impl<'a> Collecter<'a> { pub async fn tagging( config: &Config, client: &S3Client, + database_client: &database::Client, mut event: FlatS3EventMessage, ) -> Result { let tagging = client @@ -224,45 +257,76 @@ impl<'a> Collecter<'a> { let GetObjectTaggingOutput { mut tag_set, .. } = tagging; - // Add the tag to the object if it's not already there. - if !tag_set - .iter() - .any(|tag| tag.key == config.ingester_tag_name()) - { - let tag = Tag::builder() - .key(config.ingester_tag_name()) - .value(UuidGenerator::generate()) - .build()?; - tag_set.push(tag); - - // Try to push the tags to S3, only proceed if successful. 
- let result = client - .put_object_tagging( - &event.key, - &event.bucket, - Tagging::builder() - .set_tag_set(Some(tag_set.clone())) - .build()?, - ) - .await; - - if let Err(err) = result { - warn!( - "Error received from PutObjectTagging: {}", - err.into_service_error() - ); - return Ok(event); + let tag = tag_set + .clone() + .into_iter() + .find(|tag| tag.key == config.ingester_tag_name()); + + match tag { + None => { + let move_id = UuidGenerator::generate(); + let tag = Tag::builder() + .key(config.ingester_tag_name()) + .value(move_id) + .build()?; + tag_set.push(tag); + + // Try to push the tags to S3, only proceed if successful. + let result = client + .put_object_tagging( + &event.key, + &event.bucket, + Tagging::builder().set_tag_set(Some(tag_set)).build()?, + ) + .await; + + if let Err(err) = result { + warn!( + "Error received from PutObjectTagging: {}", + err.into_service_error() + ); + return Ok(event); + } + + return Ok(event.with_move_id(Some(move_id))); + } + Some(tag) => { + let move_id = Uuid::from_str(tag.value()); + + match move_id { + Ok(move_id) => { + let filter = S3ObjectsFilter { + move_id: Some(move_id), + ..Default::default() + }; + let moved_object = + ListQueryBuilder::new(database_client.connection_ref()) + .filter_all(filter, true)? + .one() + .await + .ok() + .flatten(); + + match moved_object { + None => { + warn!("Object with move_id {} not found in database", move_id); + return Ok(event.with_move_id(Some(move_id))); + } + Some(moved_object) => { + event = event + .with_move_id(Some(move_id)) + .with_attributes(moved_object.attributes); + } + } + } + Err(err) => { + warn!("Failed parsing move_id from tag: {}", err); + return Ok(event); + } + } } } - - let tag = tag_set - .iter() - .find(|value| value.key == config.ingester_tag_name()) - .unwrap(); - let move_id = Uuid::from_str(tag.value()).unwrap(); - - event = event.update_move_id(Some(move_id)); - } + }; Ok(event) } @@ -271,6 +335,7 @@ impl<'a> Collecter<'a> { pub async fn update_events( config: &Config, client: &S3Client, + database_client: &database::Client, events: FlatS3EventMessages, ) -> Result { Ok(FlatS3EventMessages( @@ -284,7 +349,7 @@ impl<'a> Collecter<'a> { trace!(key = ?event.key, bucket = ?event.bucket, "updating event"); let event = Self::head(client, event).await?; - Self::tagging(config, client, event).await + Self::tagging(config, client, database_client, event).await })) .await .into_iter() @@ -307,11 +372,11 @@ impl From for Error { #[async_trait] impl<'a> Collect for Collecter<'a> { async fn collect(mut self) -> Result { - let (client, events, config) = self.into_inner(); + let (client, database_client, events, config) = self.into_inner(); let events = events.sort_and_dedup(); - let events = Self::update_events(config, &client, events).await?; + let events = Self::update_events(config, &client, database_client, events).await?; // Get only the known event types. 
let events = events.filter_known(); let n_records = events.0.len(); @@ -334,6 +399,10 @@ impl<'a> Collect for Collecter<'a> { pub(crate) mod tests { use std::result; + use crate::events::aws::tests::{ + expected_event_record_simple, expected_flat_events_simple, EXPECTED_SHA256, + }; + use crate::events::aws::StorageClass::IntelligentTiering; use aws_sdk_s3::error::SdkError; use aws_sdk_s3::operation::get_object_tagging::GetObjectTaggingError; use aws_sdk_s3::operation::head_object::HeadObjectError; @@ -348,11 +417,7 @@ pub(crate) mod tests { use aws_smithy_runtime_api::client::orchestrator::HttpResponse; use aws_smithy_runtime_api::client::result::ServiceError; use mockall::predicate::{eq, function}; - - use crate::events::aws::tests::{ - expected_event_record_simple, expected_flat_events_simple, EXPECTED_SHA256, - }; - use crate::events::aws::StorageClass::IntelligentTiering; + use sqlx::PgPool; use super::*; @@ -377,8 +442,8 @@ pub(crate) mod tests { assert_eq!(events, expected); } - #[tokio::test] - async fn build_receive() { + #[sqlx::test] + async fn build_receive(pool: PgPool) { let mut sqs_client = SQSClient::default(); let mut s3_client = S3Client::default(); @@ -389,7 +454,7 @@ pub(crate) mod tests { .with_sqs_client(sqs_client) .with_s3_client(s3_client) .with_sqs_url("url") - .build_receive(&Default::default()) + .build_receive(&Default::default(), &database::Client::from_pool(pool)) .await .unwrap() .collect() @@ -411,10 +476,11 @@ pub(crate) mod tests { assert_eq!(result, Some(DateTime::::default())); } - #[tokio::test] - async fn head() { + #[sqlx::test] + async fn head(pool: PgPool) { let config = Default::default(); - let mut collecter = test_collecter(&config).await; + let client = database::Client::from_pool(pool); + let mut collecter = test_collecter(&config, &client).await; set_s3_head_expectations(&mut collecter.client, vec![|| Ok(expected_head_object())]); @@ -435,10 +501,11 @@ pub(crate) mod tests { assert_eq!(result, expected); } - #[tokio::test] - async fn head_not_found() { + #[sqlx::test] + async fn head_not_found(pool: PgPool) { let config = Default::default(); - let mut collecter = test_collecter(&config).await; + let client = database::Client::from_pool(pool); + let mut collecter = test_collecter(&config, &client).await; set_s3_head_expectations( &mut collecter.client, @@ -455,16 +522,17 @@ pub(crate) mod tests { assert!(result.is_ok()); } - #[tokio::test] - async fn update_events() { + #[sqlx::test] + async fn update_events(pool: PgPool) { let config = Default::default(); - let mut collecter = test_collecter(&config).await; + let client = database::Client::from_pool(pool); + let mut collecter = test_collecter(&config, &client).await; let events = expected_flat_events_simple().sort_and_dedup(); set_s3_client_expectations(&mut collecter.client); - let mut result = Collecter::update_events(&config, &collecter.client, events) + let mut result = Collecter::update_events(&config, &collecter.client, &client, events) .await .unwrap() .into_inner() @@ -479,10 +547,11 @@ pub(crate) mod tests { assert_eq!(second.last_modified_date, None); } - #[tokio::test] - async fn collect() { + #[sqlx::test] + async fn collect(pool: PgPool) { let config = Default::default(); - let mut collecter = test_collecter(&config).await; + let client = database::Client::from_pool(pool); + let mut collecter = test_collecter(&config, &client).await; set_s3_client_expectations(&mut collecter.client); @@ -601,8 +670,16 @@ pub(crate) mod tests { ) } - async fn test_collecter(config: &Config) -> 
Collecter<'_> { - Collecter::new(S3Client::default(), expected_flat_events_simple(), config) + async fn test_collecter<'a>( + config: &'a Config, + database_client: &'a database::Client, + ) -> Collecter<'a> { + Collecter::new( + S3Client::default(), + database_client, + expected_flat_events_simple(), + config, + ) } fn expected_receive_message() -> ReceiveMessageOutput { diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs index 824c77b58..584468ae5 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs @@ -529,6 +529,7 @@ impl From for FlatS3EventMessage { event_type: Created, is_delete_marker: is_delete_marker.unwrap_or_default(), move_id: None, + attributes: None, number_duplicate_events: 0, number_reordered: 0, } diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs index 287371d2a..8cc53f20b 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs @@ -199,6 +199,7 @@ impl From for FlatS3EventMessage { event_type, is_delete_marker, move_id: None, + attributes: None, number_duplicate_events: 0, number_reordered: 0, } diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs index 58945b939..8eae436a9 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs @@ -14,6 +14,7 @@ use message::EventMessage; use crate::events::aws::message::{default_version_id, EventType}; use crate::events::aws::EventType::{Created, Deleted, Other}; use crate::uuid::UuidGenerator; +use sea_orm::prelude::Json; pub mod collecter; pub mod inventory; @@ -80,6 +81,7 @@ pub struct TransposedS3EventMessages { pub event_types: Vec, pub is_delete_markers: Vec, pub move_ids: Vec>, + pub attributes: Vec>, } impl TransposedS3EventMessages { @@ -101,6 +103,7 @@ impl TransposedS3EventMessages { event_types: Vec::with_capacity(capacity), is_delete_markers: Vec::with_capacity(capacity), move_ids: Vec::with_capacity(capacity), + attributes: Vec::with_capacity(capacity), } } @@ -121,6 +124,7 @@ impl TransposedS3EventMessages { event_type, is_delete_marker, move_id, + attributes, .. 
} = message; @@ -138,6 +142,7 @@ impl TransposedS3EventMessages { self.event_types.push(event_type); self.is_delete_markers.push(is_delete_marker); self.move_ids.push(move_id); + self.attributes.push(attributes); } } @@ -175,7 +180,8 @@ impl From for FlatS3EventMessages { messages.last_modified_dates, messages.event_types, messages.is_delete_markers, - messages.move_ids + messages.move_ids, + messages.attributes, ) .map( |( @@ -193,6 +199,7 @@ impl From for FlatS3EventMessages { event_type, is_delete_marker, move_id, + attributes, )| { FlatS3EventMessage { s3_object_id, @@ -209,6 +216,7 @@ impl From for FlatS3EventMessages { event_type, is_delete_marker, move_id, + attributes, number_duplicate_events: 0, number_reordered: 0, } @@ -351,7 +359,6 @@ impl FlatS3EventMessages { pub fn sort(self) -> Self { let mut messages = self.into_inner(); - messages.sort(); messages.sort_by(|a, b| { if let (Some(a_sequencer), Some(b_sequencer)) = (a.sequencer.as_ref(), b.sequencer.as_ref()) @@ -427,7 +434,7 @@ impl FlatS3EventMessages { } /// A flattened AWS S3 record -#[derive(Debug, Eq, PartialEq, Ord, PartialOrd, Clone, Default, FromRow)] +#[derive(Debug, Eq, PartialEq, Clone, Default, FromRow)] pub struct FlatS3EventMessage { pub s3_object_id: Uuid, pub sequencer: Option, @@ -443,6 +450,7 @@ pub struct FlatS3EventMessage { pub event_type: EventType, pub is_delete_marker: bool, pub move_id: Option, + pub attributes: Option, pub number_duplicate_events: i64, pub number_reordered: i64, } @@ -503,14 +511,6 @@ impl FlatS3EventMessage { self } - /// Update the delete marker if not None. - pub fn update_move_id(mut self, move_id: Option) -> Self { - move_id - .into_iter() - .for_each(|move_id| self.move_id = Some(move_id)); - self - } - /// Set the s3 object id. pub fn with_s3_object_id(mut self, s3_object_id: Uuid) -> Self { self.s3_object_id = s3_object_id; @@ -583,6 +583,18 @@ impl FlatS3EventMessage { self } + /// Set the move id. + pub fn with_move_id(mut self, move_id: Option) -> Self { + self.move_id = move_id; + self + } + + /// Set the attributes. + pub fn with_attributes(mut self, attributes: Option) -> Self { + self.attributes = attributes; + self + } + /// Set the event type. pub fn with_event_type(mut self, event_type: EventType) -> Self { self.event_type = event_type; diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs index 272537c9b..afdd36e13 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs @@ -39,7 +39,7 @@ pub async fn receive_and_ingest<'a>( .with_s3_client(s3_client) .with_sqs_client(sqs_client) .set_sqs_url(sqs_url) - .build_receive(env_config) + .build_receive(env_config, database_client) .await? .collect() .await? @@ -74,7 +74,7 @@ pub async fn ingest_event( let events = CollecterBuilder::default() .with_s3_client(s3_client) - .build(events, env_config) + .build(events, env_config, &database_client) .await .collect() .await? 
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/list.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/list.rs
index b7cee82a8..7b2d93878 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/list.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/list.rs
@@ -149,7 +149,8 @@ where
             filter
                 .is_delete_marker
                 .map(|v| s3_object::Column::IsDeleteMarker.eq(v)),
-            );
+            )
+            .add_option(filter.move_id.map(|v| s3_object::Column::MoveId.eq(v)));
 
         if let Some(attributes) = filter.attributes {
             let json_conditions = Self::json_conditions(
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/routes/filter/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/routes/filter/mod.rs
index 18d536647..e3782d345 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/routes/filter/mod.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/routes/filter/mod.rs
@@ -1,13 +1,13 @@
 //! Routing logic for query filtering.
 //!
 
+use crate::database::entities::sea_orm_active_enums::{EventType, StorageClass};
+use crate::routes::filter::wildcard::{Wildcard, WildcardEither};
 use sea_orm::prelude::{DateTimeWithTimeZone, Json};
 use serde::{Deserialize, Serialize};
 use serde_json::Map;
 use utoipa::IntoParams;
-
-use crate::database::entities::sea_orm_active_enums::{EventType, StorageClass};
-use crate::routes::filter::wildcard::{Wildcard, WildcardEither};
+use uuid::Uuid;
 
 pub mod wildcard;
 
@@ -68,6 +68,9 @@ pub struct S3ObjectsFilter {
     /// Query by the object delete marker.
     pub(crate) is_delete_marker: Option<bool>,
     #[param(required = false)]
+    /// Query by the move id, which tracks moved objects.
+    pub(crate) move_id: Option<Uuid>,
+    #[param(required = false)]
     /// Query by JSON attributes. Supports nested syntax to access inner
     /// fields, e.g. `attributes[attribute_id]=...`. This only deserializes
     /// into string fields, and does not support other JSON types. E.g.
From 3e26a49dc75b569c7fe1ba7f2db9a5e0e0f298bd Mon Sep 17 00:00:00 2001
From: Marko Malenic
Date: Tue, 1 Oct 2024 19:57:39 +1000
Subject: [PATCH 4/8] refactor(filemanager): flatten nested if let conditions
 for better readability

---
 .../filemanager/src/events/aws/collecter.rs   | 211 +++++++++---------
 1 file changed, 101 insertions(+), 110 deletions(-)

diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs
index 6f8617acf..2c578560f 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs
@@ -191,16 +191,11 @@ impl<'a> Collecter<'a> {
     }
 
     /// Gets S3 metadata from HeadObject such as creation/archival timestamps and statuses.
-    pub async fn head(
-        client: &S3Client,
-        mut event: FlatS3EventMessage,
-    ) -> Result<FlatS3EventMessage> {
+    pub async fn head(client: &S3Client, event: FlatS3EventMessage) -> Result<FlatS3EventMessage> {
         let head = client
             .head_object(&event.key, &event.bucket)
-            .map_err(|err| {
-                let err = err.into_service_error();
+            .inspect_err(|err| {
                 warn!("Error received from HeadObject: {}", err);
-                err
             })
             .await
             .ok();
@@ -208,31 +203,31 @@ impl<'a> Collecter<'a> {
         // Race condition: it's possible that an object gets deleted so quickly that it
         // occurs before calling head/tagging. This means that there may be cases where the
         // storage class and other fields are not known, or object moves cannot be tracked.
- if let Some(head) = head { - trace!(head = ?head, "received HeadObject output"); - - let HeadObjectOutput { - storage_class, - last_modified, - content_length, - e_tag, - checksum_sha256, - delete_marker, - .. - } = head; - - // S3 does not return a storage class for standard, which means this is the - // default. See https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax - event = event - .update_storage_class(StorageClass::from_aws(storage_class.unwrap_or(Standard))) - .update_last_modified_date(Self::convert_datetime(last_modified)) - .update_size(content_length) - .update_e_tag(e_tag) - .update_sha256(checksum_sha256) - .update_delete_marker(delete_marker); - } + let Some(head) = head else { + return Ok(event); + }; - Ok(event) + trace!(head = ?head, "received HeadObject output"); + + let HeadObjectOutput { + storage_class, + last_modified, + content_length, + e_tag, + checksum_sha256, + delete_marker, + .. + } = head; + + // S3 does not return a storage class for standard, which means this is the + // default. See https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax + Ok(event + .update_storage_class(StorageClass::from_aws(storage_class.unwrap_or(Standard))) + .update_last_modified_date(Self::convert_datetime(last_modified)) + .update_size(content_length) + .update_e_tag(e_tag) + .update_sha256(checksum_sha256) + .update_delete_marker(delete_marker)) } /// Gets S3 tags from objects. @@ -240,95 +235,91 @@ impl<'a> Collecter<'a> { config: &Config, client: &S3Client, database_client: &database::Client, - mut event: FlatS3EventMessage, + event: FlatS3EventMessage, ) -> Result { let tagging = client .get_object_tagging(&event.key, &event.bucket) - .map_err(|err| { - let err = err.into_service_error(); + .inspect_err(|err| { warn!("Error received from GetObjectTagging: {}", err); - err }) .await .ok(); - if let Some(tagging) = tagging { - trace!(tagging = ?tagging, "received tagging output"); - - let GetObjectTaggingOutput { mut tag_set, .. } = tagging; - - let tag = tag_set - .clone() - .into_iter() - .find(|tag| tag.key == config.ingester_tag_name()); - - match tag { - None => { - let move_id = UuidGenerator::generate(); - let tag = Tag::builder() - .key(config.ingester_tag_name()) - .value(move_id) - .build()?; - tag_set.push(tag); - - // Try to push the tags to S3, only proceed if successful. - let result = client - .put_object_tagging( - &event.key, - &event.bucket, - Tagging::builder().set_tag_set(Some(tag_set)).build()?, - ) - .await; - - if let Err(err) = result { - warn!( - "Error received from PutObjectTagging: {}", - err.into_service_error() - ); - return Ok(event); - } - - return Ok(event.with_move_id(Some(move_id))); - } - Some(tag) => { - let move_id = Uuid::from_str(tag.value()); - - match move_id { - Ok(move_id) => { - let filter = S3ObjectsFilter { - move_id: Some(move_id), - ..Default::default() - }; - let moved_object = - ListQueryBuilder::new(database_client.connection_ref()) - .filter_all(filter, true)? 
-                                    .one()
-                                    .await
-                                    .ok()
-                                    .flatten();
-
-                            match moved_object {
-                                None => {
-                                    warn!("Object with move_id {} not found in database", move_id);
-                                    return Ok(event.with_move_id(Some(move_id)));
-                                }
-                                Some(moved_object) => {
-                                    event = event
-                                        .with_move_id(Some(move_id))
-                                        .with_attributes(moved_object.attributes);
-                                }
-                            }
-                        }
-                        Err(err) => {
-                            warn!("Failed parsing move_id from tag: {}", err);
-                            return Ok(event);
-                        }
-                    }
-                }
-            }
+        let Some(tagging) = tagging else {
+            return Ok(event);
+        };
+
+        trace!(tagging = ?tagging, "received tagging output");
+
+        let GetObjectTaggingOutput { mut tag_set, .. } = tagging;
+
+        // Check if the object contains the move_id tag.
+        let tag = tag_set
+            .clone()
+            .into_iter()
+            .find(|tag| tag.key == config.ingester_tag_name());
+
+        let Some(tag) = tag else {
+            // If it doesn't, then a new tag needs to be generated.
+            let move_id = UuidGenerator::generate();
+            let tag = Tag::builder()
+                .key(config.ingester_tag_name())
+                .value(move_id)
+                .build()?;
+            tag_set.push(tag);
+
+            // Try to push the tags to S3, and only proceed if successful.
+            let result = client
+                .put_object_tagging(
+                    &event.key,
+                    &event.bucket,
+                    Tagging::builder().set_tag_set(Some(tag_set)).build()?,
+                )
+                .await
+                .inspect_err(|err| {
+                    warn!("Error received from PutObjectTagging: {}", err);
+                });
+
+            // Only add a move_id to the new record if the tagging was successful.
+            return if result.is_ok() {
+                Ok(event.with_move_id(Some(move_id)))
+            } else {
+                Ok(event)
+            };
         };
 
-        Ok(event)
+        // The object has a move_id tag. Grab the existing tag, returning a new record without
+        // the move_id if the tag is not a valid UUID.
+        let move_id = Uuid::from_str(tag.value()).inspect_err(|err| {
+            warn!("Failed to parse move_id from tag: {}", err);
+        });
+        let Ok(move_id) = move_id else {
+            return Ok(event);
+        };
+
+        // From here, the new record must be a valid, moved object.
+        let event = event.with_move_id(Some(move_id));
+
+        // Get the attributes from the old record to update the new record with.
+        let filter = S3ObjectsFilter {
+            move_id: Some(move_id),
+            ..Default::default()
+        };
+        let moved_object = ListQueryBuilder::new(database_client.connection_ref())
+            .filter_all(filter, true)?
+            .one()
+            .await
+            .ok()
+            .flatten();
+
+        // Update the new record with the attributes if possible, or return the new record without
+        // the attributes if not possible.
+        if let Some(moved_object) = moved_object {
+            Ok(event.with_attributes(moved_object.attributes))
+        } else {
+            warn!("Object with move_id {} not found in database", move_id);
+            Ok(event)
+        }
     }
 
     /// Process events and add header and datetime fields.
From 01227cf8a8ba942a36df440f73543dd395018221 Mon Sep 17 00:00:00 2001 From: Marko Malenic Date: Wed, 2 Oct 2024 13:29:09 +1000 Subject: [PATCH 5/8] test(filemanager): move id tests --- .../aws/insert_s3_created_objects.sql | 6 +- .../ingester/aws/insert_s3_objects.sql | 6 +- .../aws/update_reordered_for_created.sql | 6 +- .../filemanager/src/database/aws/ingester.rs | 1 + .../src/database/aws/ingester_paired.rs | 2 + .../filemanager/src/database/mod.rs | 2 + .../filemanager/filemanager/src/error.rs | 12 +- .../filemanager/src/events/aws/collecter.rs | 192 ++++++++++++++++-- .../filemanager/src/events/aws/message.rs | 9 +- .../filemanager/src/events/aws/mod.rs | 9 +- .../filemanager/filemanager/src/events/mod.rs | 4 +- .../filemanager/src/handlers/aws.rs | 5 +- .../filemanager/src/queries/mod.rs | 27 ++- 13 files changed, 218 insertions(+), 63 deletions(-) diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql index 16b0bf3e1..b8bfbdde4 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql @@ -13,7 +13,8 @@ insert into s3_object ( sequencer, is_delete_marker, event_type, - move_id + move_id, + attributes ) values ( unnest($1::uuid[]), @@ -29,7 +30,8 @@ values ( unnest($11::text[]), unnest($12::boolean[]), unnest($13::event_type[]), - unnest($14::uuid[]) + unnest($14::uuid[]), + unnest($15::jsonb[]) ) on conflict on constraint sequencer_unique do update set number_duplicate_events = s3_object.number_duplicate_events + 1 returning s3_object_id, number_duplicate_events; diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql index 16b0bf3e1..b8bfbdde4 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql @@ -13,7 +13,8 @@ insert into s3_object ( sequencer, is_delete_marker, event_type, - move_id + move_id, + attributes ) values ( unnest($1::uuid[]), @@ -29,7 +30,8 @@ values ( unnest($11::text[]), unnest($12::boolean[]), unnest($13::event_type[]), - unnest($14::uuid[]) + unnest($14::uuid[]), + unnest($15::jsonb[]) ) on conflict on constraint sequencer_unique do update set number_duplicate_events = s3_object.number_duplicate_events + 1 returning s3_object_id, number_duplicate_events; diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql index d6ecd1342..793587e76 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql @@ -19,7 +19,8 @@ with input as ( $11::text[], $12::boolean[], $13::event_type[], - $14::uuid[] + $14::uuid[], + $15::jsonb[] ) as input ( s3_object_id, bucket, @@ -34,7 +35,8 @@ with input as ( created_sequencer, is_delete_marker, event_type, - move_id + move_id, + attributes ) ), -- Then, select the objects that need to 
be updated. diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs index 14bffe147..7833c2a72 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs @@ -54,6 +54,7 @@ impl Ingester { .bind(&events.is_delete_markers) .bind(&events.event_types) .bind(&events.move_ids) + .bind(&events.attributes) .fetch_all(&mut *tx) .await?; diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs index f5d4e763e..648bafedf 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs @@ -105,6 +105,7 @@ impl IngesterPaired { .bind(&object_created.is_delete_markers) .bind(vec![Other; object_created.s3_object_ids.len()]) .bind(&object_created.move_ids) + .bind(&object_created.attributes) .fetch_all(&mut *tx) .await?; @@ -126,6 +127,7 @@ impl IngesterPaired { .bind(&object_created.is_delete_markers) .bind(vec![Other; object_created.s3_object_ids.len()]) .bind(&object_created.move_ids) + .bind(&object_created.attributes) .fetch_all(&mut *tx) .await?; diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/mod.rs index 03bc2caea..48fcee8e1 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/mod.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/mod.rs @@ -152,6 +152,7 @@ pub trait Migrate { #[cfg(test)] pub(crate) mod tests { use chrono::{DateTime, Utc}; + use sea_orm::prelude::Json; use sqlx::{query, PgPool, Row}; use crate::database::aws::migration::tests::MIGRATOR; @@ -241,6 +242,7 @@ pub(crate) mod tests { .bind(vec![false]) .bind(vec![event_type]) .bind(vec![UuidGenerator::generate()]) + .bind(vec![None::]) .fetch_all(pool) .await .unwrap(); diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/error.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/error.rs index 73ae9942c..7b16d71cd 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/error.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/error.rs @@ -4,7 +4,6 @@ use std::{io, result}; use sea_orm::{DbErr, RuntimeErr}; -use sqlx::migrate::MigrateError; use thiserror::Error; use url::ParseError; use uuid::Uuid; @@ -16,8 +15,6 @@ pub type Result = result::Result; pub enum Error { #[error("database error: `{0}`")] DatabaseError(DbErr), - #[error("SQL migrate error: `{0}`")] - MigrateError(String), #[error("SQS error: `{0}`")] SQSError(String), #[error("serde error: `{0}`")] @@ -48,6 +45,9 @@ pub enum Error { PresignedUrlError(String), #[error("configuring API: `{0}`")] ApiConfigurationError(String), + #[cfg(feature = "migrate")] + #[error("SQL migrate error: `{0}`")] + MigrateError(String), } impl From for Error { @@ -62,12 +62,6 @@ impl From for Error { } } -impl From for Error { - fn from(err: MigrateError) -> Self { - Self::DatabaseError(DbErr::Migration(err.to_string())) - } -} - impl From for Error { fn from(err: serde_json::Error) -> Self { Self::SerdeError(err.to_string()) diff --git 
a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs index 2c578560f..29dc0cb70 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs @@ -390,10 +390,12 @@ impl<'a> Collect for Collecter<'a> { pub(crate) mod tests { use std::result; + use crate::database::aws::migration::tests::MIGRATOR; use crate::events::aws::tests::{ expected_event_record_simple, expected_flat_events_simple, EXPECTED_SHA256, }; use crate::events::aws::StorageClass::IntelligentTiering; + use aws_sdk_s3::error::SdkError; use aws_sdk_s3::operation::get_object_tagging::GetObjectTaggingError; use aws_sdk_s3::operation::head_object::HeadObjectError; @@ -408,9 +410,15 @@ pub(crate) mod tests { use aws_smithy_runtime_api::client::orchestrator::HttpResponse; use aws_smithy_runtime_api::client::result::ServiceError; use mockall::predicate::{eq, function}; - use sqlx::PgPool; + use sea_orm::prelude::Json; + use serde_json::json; + use sqlx::{PgPool, Row}; use super::*; + use crate::database::{Client, Ingest}; + use crate::events::aws::message::EventType::Created; + use crate::handlers::aws::tests::s3_object_results; + use crate::queries::EntriesBuilder; #[tokio::test] async fn receive() { @@ -433,7 +441,7 @@ pub(crate) mod tests { assert_eq!(events, expected); } - #[sqlx::test] + #[sqlx::test(migrator = "MIGRATOR")] async fn build_receive(pool: PgPool) { let mut sqs_client = SQSClient::default(); let mut s3_client = S3Client::default(); @@ -467,10 +475,10 @@ pub(crate) mod tests { assert_eq!(result, Some(DateTime::::default())); } - #[sqlx::test] + #[sqlx::test(migrator = "MIGRATOR")] async fn head(pool: PgPool) { let config = Default::default(); - let client = database::Client::from_pool(pool); + let client = Client::from_pool(pool); let mut collecter = test_collecter(&config, &client).await; set_s3_head_expectations(&mut collecter.client, vec![|| Ok(expected_head_object())]); @@ -492,10 +500,10 @@ pub(crate) mod tests { assert_eq!(result, expected); } - #[sqlx::test] + #[sqlx::test(migrator = "MIGRATOR")] async fn head_not_found(pool: PgPool) { let config = Default::default(); - let client = database::Client::from_pool(pool); + let client = Client::from_pool(pool); let mut collecter = test_collecter(&config, &client).await; set_s3_head_expectations( @@ -513,10 +521,10 @@ pub(crate) mod tests { assert!(result.is_ok()); } - #[sqlx::test] + #[sqlx::test(migrator = "MIGRATOR")] async fn update_events(pool: PgPool) { let config = Default::default(); - let client = database::Client::from_pool(pool); + let client = Client::from_pool(pool); let mut collecter = test_collecter(&config, &client).await; let events = expected_flat_events_simple().sort_and_dedup(); @@ -538,10 +546,148 @@ pub(crate) mod tests { assert_eq!(second.last_modified_date, None); } - #[sqlx::test] + #[sqlx::test(migrator = "MIGRATOR")] + async fn tagging_without_move(pool: PgPool) { + let config = Default::default(); + let client = Client::from_pool(pool.clone()); + let mut collecter = test_collecter(&config, &client).await; + + collecter.raw_events = + FlatS3EventMessages(vec![FlatS3EventMessage::new_with_generated_id() + .with_event_type(Created) + .with_key("key".to_string()) + .with_bucket("bucket".to_string())]); + + set_s3_client_expectations(&mut collecter.client); + + let mut result = collecter.collect().await.unwrap(); 
+ let EventSourceType::S3(events) = &mut result.event_type else { + panic!(); + }; + assert!(events.move_ids[0].is_some()); + + client.ingest(result.event_type).await.unwrap(); + + let s3_object_results = s3_object_results(&pool).await; + assert_eq!(s3_object_results.len(), 1); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_some()); + } + + #[sqlx::test(migrator = "MIGRATOR")] + async fn tagging_with_move(pool: PgPool) { + let config = Default::default(); + let client = Client::from_pool(pool.clone()); + let mut collecter = test_collecter(&config, &client).await; + + let move_id = UuidGenerator::generate(); + EntriesBuilder::default() + .with_move_id(move_id) + .with_n(1) + .build(&client) + .await; + + collecter.raw_events = + FlatS3EventMessages(vec![FlatS3EventMessage::new_with_generated_id() + .with_event_type(Created) + .with_key("key".to_string()) + .with_bucket("bucket".to_string())]); + + set_s3_head_expectations(&mut collecter.client, vec![|| Ok(expected_head_object())]); + set_s3_get_tagging_expectations( + &mut collecter.client, + vec![move || { + Ok(GetObjectTaggingOutput::builder() + .set_tag_set(Some(vec![Tag::builder() + .key("filemanager_id") + .value(move_id.to_string()) + .build() + .unwrap()])) + .build() + .unwrap()) + }], + ); + + let mut result = collecter.collect().await.unwrap(); + let EventSourceType::S3(events) = &mut result.event_type else { + panic!(); + }; + assert!(events.move_ids[0].is_some()); + + client.ingest(result.event_type).await.unwrap(); + + let s3_object_results = s3_object_results(&pool).await; + assert_eq!(s3_object_results.len(), 2); + assert_eq!( + s3_object_results[0].get::, _>("move_id"), + Some(move_id) + ); + assert_eq!( + s3_object_results[1].get::, _>("move_id"), + Some(move_id) + ); + + let expected_attributes = json!({ + "attributeId": "0", + "nestedId": { + "attributeId": "0" + } + }); + assert_eq!( + s3_object_results[0].get::, _>("attributes"), + Some(expected_attributes.clone()) + ); + assert_eq!( + s3_object_results[1].get::, _>("attributes"), + Some(expected_attributes) + ); + } + + #[sqlx::test(migrator = "MIGRATOR")] + async fn tagging_on_fail(pool: PgPool) { + let config = Default::default(); + let client = Client::from_pool(pool.clone()); + let mut collecter = test_collecter(&config, &client).await; + + collecter.raw_events = + FlatS3EventMessages(vec![FlatS3EventMessage::new_with_generated_id() + .with_event_type(Created) + .with_key("key".to_string()) + .with_bucket("bucket".to_string())]); + + set_s3_head_expectations(&mut collecter.client, vec![|| Ok(expected_head_object())]); + set_s3_get_tagging_expectations( + &mut collecter.client, + vec![move || { + Err(SdkError::ServiceError( + ServiceError::builder() + .source(GetObjectTaggingError::unhandled("unhandled")) + .raw(HttpResponse::new(404.try_into().unwrap(), SdkBody::empty())) + .build(), + )) + }], + ); + + let mut result = collecter.collect().await.unwrap(); + let EventSourceType::S3(events) = &mut result.event_type else { + panic!(); + }; + assert!(events.move_ids[0].is_none()); + + client.ingest(result.event_type).await.unwrap(); + + let s3_object_results = s3_object_results(&pool).await; + assert_eq!(s3_object_results.len(), 1); + assert!(s3_object_results[0] + .get::, _>("move_id") + .is_none()); + } + + #[sqlx::test(migrator = "MIGRATOR")] async fn collect(pool: PgPool) { let config = Default::default(); - let client = database::Client::from_pool(pool); + let client = Client::from_pool(pool); let mut collecter = test_collecter(&config, 
&client).await; set_s3_client_expectations(&mut collecter.client); @@ -578,17 +724,13 @@ pub(crate) mod tests { } } - pub(crate) fn set_s3_tagging_expectations( + pub(crate) fn set_s3_get_tagging_expectations( client: &mut S3Client, get_tagging_expectations: Vec, - put_tagging_expectations: Vec, ) where F: Fn() -> result::Result> + Send + 'static, - T: Fn() -> result::Result> - + Send - + 'static, { let get_tagging = client .expect_get_object_tagging() @@ -598,6 +740,21 @@ pub(crate) mod tests { for expectation in get_tagging_expectations { get_tagging.returning(move |_, _| expectation()); } + } + + pub(crate) fn set_s3_tagging_expectations( + client: &mut S3Client, + get_tagging_expectations: Vec, + put_tagging_expectations: Vec, + ) where + F: Fn() -> result::Result> + + Send + + 'static, + T: Fn() -> result::Result> + + Send + + 'static, + { + set_s3_get_tagging_expectations(client, get_tagging_expectations); let put_tagging = client .expect_put_object_tagging() @@ -661,10 +818,7 @@ pub(crate) mod tests { ) } - async fn test_collecter<'a>( - config: &'a Config, - database_client: &'a database::Client, - ) -> Collecter<'a> { + async fn test_collecter<'a>(config: &'a Config, database_client: &'a Client) -> Collecter<'a> { Collecter::new( S3Client::default(), database_client, diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs index 8cc53f20b..966821aee 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs @@ -3,14 +3,13 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; -use sqlx::postgres::{PgHasArrayType, PgTypeInfo}; use crate::events::aws::{FlatS3EventMessage, FlatS3EventMessages}; use crate::uuid::UuidGenerator; /// The type of S3 event. #[derive(Debug, Default, Eq, PartialEq, Ord, PartialOrd, Clone, Hash, sqlx::Type)] -#[sqlx(type_name = "event_type", no_pg_array)] +#[sqlx(type_name = "event_type")] pub enum EventType { #[default] Created, @@ -18,12 +17,6 @@ pub enum EventType { Other, } -impl PgHasArrayType for EventType { - fn array_type_info() -> PgTypeInfo { - PgTypeInfo::with_name("_event_type") - } -} - /// Data for converting from S3 events to the internal filemanager event type. #[derive(Debug)] pub struct EventTypeData { diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs index 8eae436a9..268349d96 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs @@ -5,7 +5,6 @@ use aws_sdk_s3::types::StorageClass as AwsStorageClass; use chrono::{DateTime, Utc}; use itertools::{izip, Itertools}; use serde::{Deserialize, Serialize}; -use sqlx::postgres::{PgHasArrayType, PgTypeInfo}; use sqlx::FromRow; use uuid::Uuid; @@ -23,7 +22,7 @@ pub mod message; /// A wrapper around AWS storage types with sqlx support. 
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Clone, sqlx::Type, Serialize, Deserialize)] #[serde(rename_all = "SCREAMING_SNAKE_CASE")] -#[sqlx(type_name = "storage_class", no_pg_array)] +#[sqlx(type_name = "storage_class")] pub enum StorageClass { DeepArchive, Glacier, @@ -37,12 +36,6 @@ pub enum StorageClass { StandardIa, } -impl PgHasArrayType for StorageClass { - fn array_type_info() -> PgTypeInfo { - PgTypeInfo::with_name("_storage_class") - } -} - impl StorageClass { /// Convert from the AWS storage class type to the filemanager storage class. pub fn from_aws(storage_class: AwsStorageClass) -> Option { diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/mod.rs index 0f65f9942..0922deb75 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/mod.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/mod.rs @@ -16,7 +16,7 @@ pub trait Collect { } /// The event source with a type and the number of (potentially duplicate) records contained. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct EventSource { event_type: EventSourceType, n_records: usize, @@ -39,7 +39,7 @@ impl EventSource { /// The type of event. #[allow(clippy::large_enum_variant)] -#[derive(Debug)] +#[derive(Debug, Clone)] #[non_exhaustive] pub enum EventSourceType { S3(TransposedS3EventMessages), diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs index afdd36e13..1b4277ea1 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/handlers/aws.rs @@ -211,6 +211,7 @@ pub(crate) mod tests { use sqlx::postgres::PgRow; use sqlx::PgPool; + use super::*; use crate::database::aws::ingester::tests::{ assert_row, expected_message, fetch_results, remove_version_ids, replace_sequencers, test_events, test_ingester, @@ -234,8 +235,6 @@ pub(crate) mod tests { use crate::events::aws::FlatS3EventMessage; use crate::events::EventSourceType::S3; - use super::*; - #[sqlx::test(migrator = "MIGRATOR")] async fn test_receive_and_ingest(pool: PgPool) { let client = Client::from_pool(pool); @@ -528,7 +527,7 @@ pub(crate) mod tests { assert_row(row, message, Some("".to_string()), None); } - async fn s3_object_results(pool: &PgPool) -> Vec { + pub(crate) async fn s3_object_results(pool: &PgPool) -> Vec { sqlx::query("select * from s3_object order by sequencer, key") .fetch_all(pool) .await diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs index a6fe4d888..0244a0531 100644 --- a/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs +++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs @@ -3,6 +3,11 @@ use std::ops::Add; +use crate::database::entities::s3_object::ActiveModel as ActiveS3Object; +use crate::database::entities::s3_object::Model as S3Object; +use crate::database::entities::sea_orm_active_enums::{EventType, StorageClass}; +use crate::database::Client; +use crate::uuid::UuidGenerator; use chrono::{DateTime, Days}; use rand::seq::SliceRandom; use rand::thread_rng; @@ -10,12 +15,7 @@ use sea_orm::Set; use sea_orm::{ActiveModelTrait, TryIntoModel}; use serde_json::json; use strum::EnumCount; - -use 
crate::database::entities::s3_object::ActiveModel as ActiveS3Object;
-use crate::database::entities::s3_object::Model as S3Object;
-use crate::database::entities::sea_orm_active_enums::{EventType, StorageClass};
-use crate::database::Client;
-use crate::uuid::UuidGenerator;
+use uuid::Uuid;
 
 pub mod get;
 pub mod list;
@@ -41,11 +41,12 @@ impl Entries {
         shuffle: bool,
         bucket_divisor: usize,
         key_divisor: usize,
+        move_id: Option<Uuid>,
     ) -> Vec<S3Object> {
         let mut output = vec![];
 
         let mut entries: Vec<_> = (0..n)
-            .map(|index| Self::generate_entry(index, bucket_divisor, key_divisor))
+            .map(|index| Self::generate_entry(index, bucket_divisor, key_divisor, move_id))
            .collect();
 
         if shuffle {
@@ -70,6 +71,7 @@ impl Entries {
         index: usize,
         bucket_divisor: usize,
         key_divisor: usize,
+        move_id: Option<Uuid>,
     ) -> ActiveS3Object {
         let event = Self::event_type(index);
         let date = || Set(Some(DateTime::default().add(Days::new(index as u64))));
@@ -82,7 +84,7 @@ impl Entries {
 
         ActiveS3Object {
             s3_object_id: Set(UuidGenerator::generate()),
-            move_id: Set(Some(UuidGenerator::generate())),
+            move_id: Set(Some(move_id.unwrap_or_else(UuidGenerator::generate))),
             event_type: Set(event.clone()),
             bucket: Set((index / bucket_divisor).to_string()),
             key: Set((index / key_divisor).to_string()),
@@ -127,6 +129,7 @@ pub struct EntriesBuilder {
     bucket_divisor: usize,
     key_divisor: usize,
     shuffle: bool,
+    move_id: Option<Uuid>,
 }
 
 impl EntriesBuilder {
@@ -154,6 +157,12 @@ impl EntriesBuilder {
         self
     }
 
+    /// Set the move id to use for all entries.
+    pub fn with_move_id(mut self, move_id: Uuid) -> Self {
+        self.move_id = Some(move_id);
+        self
+    }
+
     /// Build the entries and initialize the database.
     pub async fn build(self, client: &Client) -> Entries {
         let mut entries = Entries::initialize_database(
@@ -162,6 +171,7 @@ impl EntriesBuilder {
             self.shuffle,
             self.bucket_divisor,
             self.key_divisor,
+            self.move_id,
         )
         .await;
 
@@ -179,6 +189,7 @@ impl Default for EntriesBuilder {
             bucket_divisor: 2,
             key_divisor: 1,
             shuffle: false,
+            move_id: None,
         }
     }
 }
From 6560b5eb850673e3b3642ba99c17d5ba3e6af121 Mon Sep 17 00:00:00 2001
From: Marko Malenic
Date: Wed, 2 Oct 2024 16:21:01 +1000
Subject: [PATCH 6/8] docs(filemanager): add explanation of object moving
 logic

---
 .../stacks/filemanager/docs/MOVED_OBJECTS.md  | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md

diff --git a/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md b/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md
new file mode 100644
index 000000000..ea4375fa9
--- /dev/null
+++ b/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md
@@ -0,0 +1,49 @@
+# Tracking moved objects
+
+The filemanager tracks records from S3 events, which do not describe how objects move from one location to another. Using
+S3 events alone, it's impossible to tell whether an object that has been deleted in one place and created in another is
+the same object that has been moved, or two different objects. This is because S3 only tracks `Created` or `Deleted`
+events.
+
+To track moved objects, the filemanager has to store additional information on objects that gets copied when the object
+is moved. The design involves using object tagging to store an identifier on all objects that is copied when the
+object is moved. This id can be used to track how objects move.
+
+When records are ingested, the filemanager first checks if the object contains the tag with the id field. If the tag is
+present, then the object has been moved, and the new record reuses that id. If not, a new id is generated and the object
+is tagged with it. Later, the database can be queried to find all records matching the id. This represents a sequence of moved
+objects.
+
+## Tagging process
+
+The process of tagging is:
+
+* When an object record is ingested, the filemanager queries `Created` events for tags.
+* If the tags can be retrieved, the filemanager looks for a tag called `filemanager_id`. The key name can be
+  configured in the environment variable `FILEMANAGER_INGESTER_TAG_NAME`.
+* The tag is parsed as a UUID, and stored in the `move_id` column of `s3_object` for that record.
+* If the tag does not exist, then a new UUID is generated, and the object is tagged on S3 by calling `PutObjectTagging`.
+  The new tag is also stored in the `move_id` column.
+* The database is also queried for any records with the same `move_id` so that attributes can be copied to the new record.
+
+This logic is enabled by default, but it can be switched off by setting `FILEMANAGER_INGESTER_TRACK_MOVES` to false. The
+filemanager API provides a way to query the database for records with a given `move_id`; an example query is shown at the
+end of this document.
+
+## Design considerations
+
+Object tags on S3 are limited to 10 tags per object, where each tag value can only store 256 unicode characters. This means
+that it is not possible to store a large amount of data or attributes in tags. Instead, the filemanager stores a single UUID
+in the tag, which is linked to object records that store the attributes and data.
+
+The object tagging process cannot be atomic, so there is a chance for concurrency errors to occur. Tagging can also
+fail due to database or network errors. The filemanager only tracks `move_id`s if it knows that a tag has been
+successfully created on S3 and successfully stored in the database. If tagging fails, or it's not enabled, then the `move_id`
+column will be null.
+
+## Alternative designs
+
+Alternatively, S3 object metadata could also be used to track moves using a similar mechanism. However, metadata can
+only be updated by deleting and recreating the object, so tagging was chosen instead. Another mechanism which could track
+moved objects is to compare object checksums or etags. This works, but may be limited if a checksum is not present, or
+if the etag was computed using a different part-size. Both these approaches could be used in addition to object tagging
+to provide the filemanager with more ways to track moves.
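+
+## Example: querying moved objects
+
+As a rough sketch of how the id can be used (the UUID below is only a placeholder for an id that the ingester has
+stored), the sequence of locations that an object has moved through can be queried directly from the `s3_object` table:
+
+```sql
+-- List every record that shares the same move_id, in event order.
+-- The UUID is a placeholder for a real id written by the ingester.
+select bucket, key, event_type, sequencer from s3_object
+where move_id = '00000000-0000-0000-0000-000000000000'
+order by sequencer;
+```
+
+Each pair of `Deleted` and `Created` records sharing the id represents one move of the object.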
From 3388fdc33a2f5b7cd11a3ccde5309148a58ae2a3 Mon Sep 17 00:00:00 2001 From: Marko Malenic Date: Thu, 3 Oct 2024 13:51:34 +1000 Subject: [PATCH 7/8] refactor(filemanager): add correct ingest function permissions and rename ingest_id tag --- .../database/migrations/0002_s3_ingest_id.sql | 2 + .../database/migrations/0002_s3_move_id.sql | 1 - .../api/select_existing_by_bucket_key.sql | 2 +- .../aws/insert_s3_created_objects.sql | 2 +- .../ingester/aws/insert_s3_objects.sql | 2 +- .../aws/update_reordered_for_created.sql | 8 +-- .../aws/update_reordered_for_deleted.sql | 2 +- .../deploy/constructs/functions/api.ts | 1 + .../deploy/constructs/functions/function.ts | 4 +- .../deploy/constructs/functions/ingest.ts | 17 +++++- .../stacks/filemanager/docs/MOVED_OBJECTS.md | 44 ++++++++------- .../filemanager/src/database/aws/ingester.rs | 18 +++---- .../src/database/aws/ingester_paired.rs | 14 ++--- .../src/database/entities/s3_object.rs | 2 +- .../stacks/filemanager/filemanager/src/env.rs | 2 +- .../filemanager/src/events/aws/collecter.rs | 54 +++++++++---------- .../filemanager/src/events/aws/inventory.rs | 2 +- .../filemanager/src/events/aws/message.rs | 2 +- .../filemanager/src/events/aws/mod.rs | 20 +++---- .../filemanager/src/queries/list.rs | 2 +- .../filemanager/src/queries/mod.rs | 18 +++---- .../filemanager/src/routes/filter/mod.rs | 4 +- 22 files changed, 118 insertions(+), 105 deletions(-) create mode 100644 lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_ingest_id.sql delete mode 100644 lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_move_id.sql diff --git a/lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_ingest_id.sql b/lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_ingest_id.sql new file mode 100644 index 000000000..b468d1987 --- /dev/null +++ b/lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_ingest_id.sql @@ -0,0 +1,2 @@ +alter table s3_object add column ingest_id uuid; +create index ingest_id_index on s3_object (ingest_id); diff --git a/lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_move_id.sql b/lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_move_id.sql deleted file mode 100644 index 5c982b3d0..000000000 --- a/lib/workload/stateless/stacks/filemanager/database/migrations/0002_s3_move_id.sql +++ /dev/null @@ -1 +0,0 @@ -alter table s3_object add column move_id uuid; \ No newline at end of file diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql b/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql index 3818b4725..cdee98d76 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/api/select_existing_by_bucket_key.sql @@ -34,7 +34,7 @@ select size, is_delete_marker, event_type, - move_id, + ingest_id, attributes, 0::bigint as "number_reordered" from input diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql index b8bfbdde4..cd02c7b33 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql +++ 
b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_created_objects.sql @@ -13,7 +13,7 @@ insert into s3_object ( sequencer, is_delete_marker, event_type, - move_id, + ingest_id, attributes ) values ( diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql index b8bfbdde4..cd02c7b33 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/insert_s3_objects.sql @@ -13,7 +13,7 @@ insert into s3_object ( sequencer, is_delete_marker, event_type, - move_id, + ingest_id, attributes ) values ( diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql index 793587e76..abe546c19 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_created.sql @@ -35,7 +35,7 @@ with input as ( created_sequencer, is_delete_marker, event_type, - move_id, + ingest_id, attributes ) ), @@ -56,7 +56,7 @@ current_objects as ( input.storage_class as input_storage_class, input.is_delete_marker as input_is_delete_marker, input.event_type as input_event_type, - input.move_id as input_move_id + input.ingest_id as input_ingest_id from s3_object -- Grab the relevant values to update with. join input on @@ -105,7 +105,7 @@ update as ( is_delete_marker = objects_to_update.input_is_delete_marker, storage_class = objects_to_update.input_storage_class, event_type = objects_to_update.input_event_type, - move_id = objects_to_update.input_move_id, + ingest_id = objects_to_update.input_ingest_id, number_reordered = s3_object.number_reordered + -- Note the asymmetry between this and the reorder for deleted query. case when objects_to_update.deleted_sequencer is not null or objects_to_update.sequencer is not null then @@ -133,7 +133,7 @@ select number_duplicate_events, size, is_delete_marker, - move_id, + ingest_id, attributes, -- This is used to simplify re-constructing the FlatS3EventMessages in the Lambda. I.e. this update detected an -- out of order created event, so return a created event back. diff --git a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql index e13e50083..6f435c01c 100644 --- a/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql +++ b/lib/workload/stateless/stacks/filemanager/database/queries/ingester/aws/update_reordered_for_deleted.sql @@ -98,7 +98,7 @@ select number_duplicate_events, size, is_delete_marker, - move_id, + ingest_id, attributes, -- This is used to simplify re-constructing the FlatS3EventMessages in the Lambda. I.e. this update detected an -- out of order deleted event, so return a deleted event back. 
diff --git a/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/api.ts b/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/api.ts
index 2466dd508..58ee7f088 100644
--- a/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/api.ts
+++ b/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/api.ts
@@ -20,6 +20,7 @@ export class ApiFunction extends fn.Function {
         // See https://github.com/awslabs/aws-lambda-rust-runtime/tree/main/lambda-http#integration-with-api-gateway-stages
         // for more info.
         AWS_LAMBDA_HTTP_IGNORE_STAGE_IN_PATH: 'true',
+        ...props.environment,
       },
       ...props,
     });
diff --git a/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/function.ts b/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/function.ts
index d4f502751..a186dd8b5 100644
--- a/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/function.ts
+++ b/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/function.ts
@@ -165,11 +165,11 @@ export class Function extends Construct {
   /**
-   * Add policies for 's3:List*' and 's3:Get*' on the buckets to this function's role.
+   * Add policies for 's3:ListBucket' and 's3:GetObject', plus any additional actions, on the buckets to this function's role.
    */
-  addPoliciesForBuckets(buckets: string[]) {
+  addPoliciesForBuckets(buckets: string[], additionalActions?: string[]) {
     buckets.map((bucket) => {
       this.addToPolicy(
         new PolicyStatement({
-          actions: ['s3:ListBucket', 's3:GetObject'],
+          actions: ['s3:ListBucket', 's3:GetObject', ...(additionalActions ?? [])],
           resources: [`arn:aws:s3:::${bucket}`, `arn:aws:s3:::${bucket}/*`],
         })
       );
diff --git a/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/ingest.ts b/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/ingest.ts
index 9ff5eece4..14c1f6f9b 100644
--- a/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/ingest.ts
+++ b/lib/workload/stateless/stacks/filemanager/deploy/constructs/functions/ingest.ts
@@ -3,6 +3,7 @@ import { IQueue } from 'aws-cdk-lib/aws-sqs';
 import * as fn from './function';
 import { SqsEventSource } from 'aws-cdk-lib/aws-lambda-event-sources';
 import { DatabaseProps } from './function';
+import { FILEMANAGER_SERVICE_NAME } from '../../stack';

 /**
  * Props for controlling access to buckets.
@@ -35,7 +36,14 @@ export type IngestFunctionProps = fn.FunctionPropsConfigurable & DatabaseProps &
  */
 export class IngestFunction extends fn.Function {
   constructor(scope: Construct, id: string, props: IngestFunctionProps) {
-    super(scope, id, { package: 'filemanager-ingest-lambda', ...props });
+    super(scope, id, {
+      package: 'filemanager-ingest-lambda',
+      environment: {
+        FILEMANAGER_INGESTER_TAG_NAME: 'umccr-org:OrcaBusFileManagerIngestId',
+        ...props.environment,
+      },
+      ...props,
+    });

     this.addAwsManagedPolicy('service-role/AWSLambdaSQSQueueExecutionRole');

@@ -43,6 +51,11 @@
       const eventSource = new SqsEventSource(source);
       this.function.addEventSource(eventSource);
     });
-    this.addPoliciesForBuckets(props.buckets);
+    this.addPoliciesForBuckets(props.buckets, [
+      's3:GetObjectTagging',
+      's3:GetObjectVersionTagging',
+      's3:PutObjectTagging',
+      's3:PutObjectVersionTagging',
+    ]);
   }
 }
diff --git a/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md b/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md
index ea4375fa9..4de4ebc13 100644
--- a/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md
+++ b/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md
@@ -1,49 +1,47 @@
 # Tracking moved objects

 The filemanager tracks records from S3 events, which do not describe how objects move from one location to another. Using
-S3 events alone, it's impossible to tell whether an object that has been deleted in one place and created in another is
+S3 events alone, it's not possible to tell whether an object that has been deleted in one place and created in another is
 the same object that has been moved, or two different objects. This is because S3 only tracks `Created` or `Deleted`
 events.

-To track moved objects, the filemanager has to store additional information on objects, that gets copied when the object
-is moved. The design involves using object tagging to store an identifier on all objects that is copied when the
-object is moved. This id can be used to track how object moves.
-
-When records are ingested, the filemanager first checks if the object contains the tag with the id field. If the tag is
-present, then the object has been moved, and the new record reuses that id. If not, a new id is generated and the object
-is tagged with it. Later, the database can be queried to find all record matching the id. This represents a sequence of moved
-objects.
+To track moved objects, the filemanager stores additional information in S3 tags that get copied when the object
+is moved. This allows the filemanager to track how objects move and also allows it to copy attributes when an object
+is moved/copied.

 ## Tagging process

 The process of tagging is:

 * When an object record is ingested, the filemanager queries `Created` events for tags.
-* If the tags can be retrieved, the filemanager looks for a tag called `filemanager_id`. The key name can be
+* If the tags can be retrieved, the filemanager looks for a key called `ingest_id`. The key name can be
   configured in the environment variable `FILEMANAGER_INGESTER_TAG_NAME`.
-* The tag is parsed as a UUID, and stored in the `move_id` column of `s3_object` for that record.
+* The tag is parsed as a UUID, and stored in the `ingest_id` column of `s3_object` for that record.
 * If the tag does not exist, then a new UUID is generated, and the object is tagged on S3 by calling `PutObjectTagging`.
-  The new tag is also stored in the `move_id` column.
-* The database is also queried for any records with the same `move_id` so that attributes can be copied to the new record.
+  The new tag is also stored in the `ingest_id` column.
+* The database is also queried for any records with the same `ingest_id` so that attributes can be copied to the new record.

-This logic is enabled by default, but it can be switched off by setting `FILEMANAGER_INGESTER_TRACK_MOVES`. The filemanager
-API provides a way to query the database for records with a given `move_id`.
+This logic is enabled by default, but it can be switched off by setting `FILEMANAGER_INGESTER_TRACK_MOVES` to `false`. The filemanager
+API provides a way to query the database for records with a given `ingest_id`.

 ## Design considerations

-Object tags on S3 are limited to 10 tags per object, where each tag can only store 258 unicode characters. This means that it
-is not possible a large amount of data or attributes in tags. Instead, filemanager stores a single UUID in the tag, which is
-linked to object records that store the attributes and data.
+Object tags on S3 are limited to 10 tags per object, and each tag can only store 256 unicode characters. The filemanager
+avoids storing a large amount of data by using a UUID as the value of the tag, which is linked to object records that
+store attributes and data.

 The object tagging process cannot be atomic, so there is a chance for concurrency errors to occur. Tagging can also
-fail due to database or network errors. The filemanager only tracks `move_id`s if it knows that a tag has been
-successfully created on S3, and successfully stored in the database. If tagging fails, or it's not enabled then the `move_id`
+fail due to database or network errors. The filemanager only tracks `ingest_id`s if it knows that a tag has been
+successfully created on S3, and successfully stored in the database. If tagging fails, or it's not enabled, then the `ingest_id`
 column will be null.

+The object tagging mechanism also doesn't differentiate between moved objects and copied objects with the same tags.
+If an object is copied with tags, the `ingest_id` will also be copied and the above logic will apply.
+
 ## Alternative designs

 Alternatively, S3 object metadata could also be used to track moves using a similar mechanism. However, metadata can
-only be updated by deleting and recreated the object, so tagging was chosen instead. Another mechanism which could track
-moved objects is to compare object checksums or etags. This works but may also be limited if checksum is not present, or
-if the etag was computed using a different part-size. Both these approaches could be used in addition to object tagging
-to provide the filemanager more ways to track moves.
+only be updated by deleting and recreating the object. This process would be much more costly, so tagging was chosen instead.
+Another approach is to compare object checksums or etags. However, this would also be limited if the checksum is not present,
+or if the etag was computed using a different part-size. Both these approaches could be used in addition to object tagging
+to provide more ways to track moves.
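The tagging flow that MOVED_OBJECTS.md describes maps closely onto the `GetObjectTagging`/`PutObjectTagging` client methods added earlier in this series. Below is a simplified sketch of that flow against the raw `aws-sdk-s3` client; it is illustrative rather than the patch's actual `Collecter` logic, uses a v4 UUID where filemanager has its own `UuidGenerator`, and collapses error handling into `Option` (a failed tagging attempt yields `None`, matching the null `ingest_id` behaviour described above).

```rust
use aws_sdk_s3::types::{Tag, Tagging};
use aws_sdk_s3::Client;
use uuid::Uuid;

/// Resolve the ingest id for an object: reuse a valid existing tag, otherwise
/// mint and write a new one. Returns `None` when anything fails, so the
/// caller can leave the `ingest_id` column null.
async fn resolve_ingest_id(client: &Client, bucket: &str, key: &str, tag_name: &str) -> Option<Uuid> {
    // Fetch the current tag set with `GetObjectTagging`.
    let tags = client
        .get_object_tagging()
        .bucket(bucket)
        .key(key)
        .send()
        .await
        .ok()?;

    // If the object already carries the tag, parse and reuse it.
    if let Some(tag) = tags.tag_set().iter().find(|tag| tag.key() == tag_name) {
        return Uuid::parse_str(tag.value()).ok();
    }

    // Otherwise generate a new id and write it back with `PutObjectTagging`,
    // preserving any unrelated tags already on the object.
    let ingest_id = Uuid::new_v4();
    let mut tag_set = tags.tag_set().to_vec();
    tag_set.push(
        Tag::builder()
            .key(tag_name)
            .value(ingest_id.to_string())
            .build()
            .ok()?,
    );
    let tagging = Tagging::builder().set_tag_set(Some(tag_set)).build().ok()?;

    client
        .put_object_tagging()
        .bucket(bucket)
        .key(key)
        .tagging(tagging)
        .send()
        .await
        .ok()
        .map(|_| ingest_id)
}
```

Because `GetObjectTagging` and `PutObjectTagging` are separate calls, two concurrent ingesters can race here, which is exactly the non-atomicity caveat the document raises.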
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs
index 7833c2a72..b11a16a9e 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester.rs
@@ -53,7 +53,7 @@ impl Ingester {
             .bind(&events.sequencers)
             .bind(&events.is_delete_markers)
             .bind(&events.event_types)
-            .bind(&events.move_ids)
+            .bind(&events.ingest_ids)
             .bind(&events.attributes)
             .fetch_all(&mut *tx)
             .await?;
@@ -106,7 +106,7 @@ pub(crate) mod tests {

         assert_eq!(s3_object_results.len(), 1);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_some());
         assert_created(&s3_object_results[0]);
     }
@@ -155,7 +155,7 @@ pub(crate) mod tests {

         assert_eq!(s3_object_results.len(), 1);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_some());
         assert_with(
             &s3_object_results[0],
@@ -194,7 +194,7 @@ pub(crate) mod tests {

         assert_eq!(s3_object_results.len(), 2);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_some());
         assert_eq!(
             1,
@@ -301,7 +301,7 @@ pub(crate) mod tests {

         assert_eq!(s3_object_results.len(), 2);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_some());
         // Order should be different here.
         assert_ingest_events(
@@ -1098,16 +1098,16 @@ pub(crate) mod tests {
                 *sha256 = Some(EXPECTED_SHA256.to_string());
             });
         };
-        let update_move_ids = |move_ids: &mut Vec<Option<Uuid>>| {
-            move_ids.iter_mut().for_each(|move_id| {
-                *move_id = Some(UuidGenerator::generate());
+        let update_ingest_ids = |ingest_ids: &mut Vec<Option<Uuid>>| {
+            ingest_ids.iter_mut().for_each(|ingest_id| {
+                *ingest_id = Some(UuidGenerator::generate());
             });
         };

         update_last_modified(&mut events.last_modified_dates);
         update_storage_class(&mut events.storage_classes);
         update_sha256(&mut events.sha256s);
-        update_move_ids(&mut events.move_ids);
+        update_ingest_ids(&mut events.ingest_ids);

         events
     }
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs
index 648bafedf..eb5caf786 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/aws/ingester_paired.rs
@@ -104,7 +104,7 @@ impl IngesterPaired {
             .bind(&object_created.sequencers)
             .bind(&object_created.is_delete_markers)
             .bind(vec![Other; object_created.s3_object_ids.len()])
-            .bind(&object_created.move_ids)
+            .bind(&object_created.ingest_ids)
             .bind(&object_created.attributes)
             .fetch_all(&mut *tx)
             .await?;
@@ -126,7 +126,7 @@
             .bind(&object_created.sequencers)
             .bind(&object_created.is_delete_markers)
             .bind(vec![Other; object_created.s3_object_ids.len()])
-            .bind(&object_created.move_ids)
+            .bind(&object_created.ingest_ids)
             .bind(&object_created.attributes)
             .fetch_all(&mut *tx)
             .await?;
@@ -219,7 +219,7 @@ pub(crate) mod tests {

         assert_eq!(s3_object_results.len(), 1);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_some());
         assert_created(&s3_object_results[0]);
     }
@@ -273,7 +273,7 @@ pub(crate) mod tests {

         assert_eq!(s3_object_results.len(), 1);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_some());
         assert_with(
            &s3_object_results[0],
@@ -313,7 +313,7 @@ pub(crate) mod tests {

         assert_eq!(s3_object_results.len(), 1);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_some());
         assert_ingest_events(&s3_object_results[0], EXPECTED_VERSION_ID);
     }
@@ -334,7 +334,7 @@ pub(crate) mod tests {

         assert_eq!(s3_object_results.len(), 1);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_some());
         assert_eq!(
             2,
@@ -462,7 +462,7 @@ pub(crate) mod tests {

         assert_eq!(s3_object_results.len(), 1);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_some());
         assert_eq!(2, s3_object_results[0].get::<i64, _>("number_reordered"));
         assert_ingest_events(&s3_object_results[0], EXPECTED_VERSION_ID);
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/entities/s3_object.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/entities/s3_object.rs
index d3a9865f9..c978c2cc1 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/database/entities/s3_object.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/database/entities/s3_object.rs
@@ -37,7 +37,7 @@ pub struct Model {
     #[sea_orm(column_type = "Text", nullable)]
     pub deleted_sequencer: Option<String>,
     pub number_reordered: i64,
-    pub move_id: Option<Uuid>,
+    pub ingest_id: Option<Uuid>,
 }

 #[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)]
 pub enum Relation {}
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/env.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/env.rs
index 6223ec301..4363710f0 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/env.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/env.rs
@@ -57,7 +57,7 @@ impl Default for Config {
             sqs_url: None,
             paired_ingest_mode: false,
             ingester_track_moves: true,
-            ingester_tag_name: "filemanager_id".to_string(),
+            ingester_tag_name: "ingest_id".to_string(),
             api_links_url: None,
             api_presign_limit: None,
             api_presign_expiry: None,
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs
index 29dc0cb70..e1abc169f 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/collecter.rs
@@ -253,7 +253,7 @@ impl<'a> Collecter<'a> {

         let GetObjectTaggingOutput { mut tag_set, .. } = tagging;

-        // Check if the object contains the move_id tag.
+        // Check if the object contains the ingest_id tag.
         let tag = tag_set
             .clone()
             .into_iter()
@@ -261,10 +261,10 @@

         let Some(tag) = tag else {
             // If it doesn't, then a new tag needs to be generated.
-            let move_id = UuidGenerator::generate();
+            let ingest_id = UuidGenerator::generate();
             let tag = Tag::builder()
                 .key(config.ingester_tag_name())
-                .value(move_id)
+                .value(ingest_id)
                 .build()?;

             tag_set.push(tag);
@@ -280,29 +280,29 @@
                     warn!("Error received from PutObjectTagging: {}", err);
                 });

-            // Only add a move_id to the new record if the tagging was successful.
+            // Only add an ingest_id to the new record if the tagging was successful.
             return if result.is_ok() {
-                Ok(event.with_move_id(Some(move_id)))
+                Ok(event.with_ingest_id(Some(ingest_id)))
             } else {
                 Ok(event)
             };
         };

-        // The object has a move_id tag. Grab the existing the tag, returning a new record without
-        // the move_id if the is not valid.
-        let move_id = Uuid::from_str(tag.value()).inspect_err(|err| {
-            warn!("Failed to parse move_id from tag: {}", err);
+        // The object has an ingest_id tag. Grab the existing tag, returning a new record without
+        // the ingest_id if it is not valid.
+        let ingest_id = Uuid::from_str(tag.value()).inspect_err(|err| {
+            warn!("Failed to parse ingest_id from tag: {}", err);
         });
-        let Ok(move_id) = move_id else {
+        let Ok(ingest_id) = ingest_id else {
             return Ok(event);
         };

         // From here, the new record must be a valid, moved object.
-        let event = event.with_move_id(Some(move_id));
+        let event = event.with_ingest_id(Some(ingest_id));

         // Get the attributes from the old record to update the new record with.
         let filter = S3ObjectsFilter {
-            move_id: Some(move_id),
+            ingest_id: Some(ingest_id),
             ..Default::default()
         };
         let moved_object = ListQueryBuilder::new(database_client.connection_ref())
@@ -317,7 +317,7 @@
         if let Some(moved_object) = moved_object {
             Ok(event.with_attributes(moved_object.attributes))
         } else {
-            warn!("Object with move_id {} not found in database", move_id);
+            warn!("Object with ingest_id {} not found in database", ingest_id);
             Ok(event)
         }
     }
@@ -564,14 +564,14 @@ pub(crate) mod tests {
         let EventSourceType::S3(events) = &mut result.event_type else {
             panic!();
         };
-        assert!(events.move_ids[0].is_some());
+        assert!(events.ingest_ids[0].is_some());

         client.ingest(result.event_type).await.unwrap();

         let s3_object_results = s3_object_results(&pool).await;
         assert_eq!(s3_object_results.len(), 1);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_some());
     }

@@ -581,9 +581,9 @@
         let client = Client::from_pool(pool.clone());
         let mut collecter = test_collecter(&config, &client).await;

-        let move_id = UuidGenerator::generate();
+        let ingest_id = UuidGenerator::generate();
         EntriesBuilder::default()
-            .with_move_id(move_id)
+            .with_ingest_id(ingest_id)
             .with_n(1)
             .build(&client)
             .await;
@@ -600,8 +600,8 @@
             vec![move || {
                 Ok(GetObjectTaggingOutput::builder()
                     .set_tag_set(Some(vec![Tag::builder()
-                        .key("filemanager_id")
-                        .value(move_id.to_string())
+                        .key("ingest_id")
+                        .value(ingest_id.to_string())
                         .build()
                         .unwrap()]))
                     .build()
@@ -613,19 +613,19 @@
         let EventSourceType::S3(events) = &mut result.event_type else {
             panic!();
         };
-        assert!(events.move_ids[0].is_some());
+        assert!(events.ingest_ids[0].is_some());

         client.ingest(result.event_type).await.unwrap();

         let s3_object_results = s3_object_results(&pool).await;
         assert_eq!(s3_object_results.len(), 2);
         assert_eq!(
-            s3_object_results[0].get::<Option<Uuid>, _>("move_id"),
-            Some(move_id)
+            s3_object_results[0].get::<Option<Uuid>, _>("ingest_id"),
+            Some(ingest_id)
         );
         assert_eq!(
-            s3_object_results[1].get::<Option<Uuid>, _>("move_id"),
-            Some(move_id)
+            s3_object_results[1].get::<Option<Uuid>, _>("ingest_id"),
+            Some(ingest_id)
         );

         let expected_attributes = json!({
@@ -673,14 +673,14 @@
         let EventSourceType::S3(events) = &mut result.event_type else {
             panic!();
         };
-        assert!(events.move_ids[0].is_none());
+        assert!(events.ingest_ids[0].is_none());

         client.ingest(result.event_type).await.unwrap();

         let s3_object_results = s3_object_results(&pool).await;
         assert_eq!(s3_object_results.len(), 1);
         assert!(s3_object_results[0]
-            .get::<Option<Uuid>, _>("move_id")
+            .get::<Option<Uuid>, _>("ingest_id")
             .is_none());
     }

@@ -761,7 +761,7 @@
             .with(
                 eq("key"),
                 eq("bucket"),
-                function(|t: &Tagging| t.tag_set().first().unwrap().key == "filemanager_id"),
+                function(|t: &Tagging| t.tag_set().first().unwrap().key == "ingest_id"),
            )
            .times(put_tagging_expectations.len());
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs
index 584468ae5..8d2aee699 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/inventory.rs
@@ -528,7 +528,7 @@ impl From for FlatS3EventMessage {
             // Anything in an inventory report is always a created event.
             event_type: Created,
             is_delete_marker: is_delete_marker.unwrap_or_default(),
-            move_id: None,
+            ingest_id: None,
             attributes: None,
             number_duplicate_events: 0,
             number_reordered: 0,
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs
index 966821aee..771dd1bc0 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/message.rs
@@ -191,7 +191,7 @@ impl From for FlatS3EventMessage {
             sha256: None,
             event_type,
             is_delete_marker,
-            move_id: None,
+            ingest_id: None,
             attributes: None,
             number_duplicate_events: 0,
             number_reordered: 0,
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs
index 268349d96..4a6595cad 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/events/aws/mod.rs
@@ -73,7 +73,7 @@ pub struct TransposedS3EventMessages {
     pub last_modified_dates: Vec<Option<DateTime<Utc>>>,
     pub event_types: Vec<EventType>,
     pub is_delete_markers: Vec<bool>,
-    pub move_ids: Vec<Option<Uuid>>,
+    pub ingest_ids: Vec<Option<Uuid>>,
     pub attributes: Vec<Option<Value>>,
 }

@@ -95,7 +95,7 @@ impl TransposedS3EventMessages {
             last_modified_dates: Vec::with_capacity(capacity),
             event_types: Vec::with_capacity(capacity),
             is_delete_markers: Vec::with_capacity(capacity),
-            move_ids: Vec::with_capacity(capacity),
+            ingest_ids: Vec::with_capacity(capacity),
             attributes: Vec::with_capacity(capacity),
         }
     }
@@ -116,7 +116,7 @@
             last_modified_date,
             event_type,
             is_delete_marker,
-            move_id,
+            ingest_id,
             attributes,
             ..
        } = message;
@@ -134,7 +134,7 @@
         self.last_modified_dates.push(last_modified_date);
         self.event_types.push(event_type);
         self.is_delete_markers.push(is_delete_marker);
-        self.move_ids.push(move_id);
+        self.ingest_ids.push(ingest_id);
         self.attributes.push(attributes);
     }
 }
@@ -173,7 +173,7 @@ impl From<TransposedS3EventMessages> for FlatS3EventMessages {
             messages.last_modified_dates,
             messages.event_types,
             messages.is_delete_markers,
-            messages.move_ids,
+            messages.ingest_ids,
             messages.attributes,
         )
             .map(
@@ -191,7 +191,7 @@
                     last_modified_date,
                     event_type,
                     is_delete_marker,
-                    move_id,
+                    ingest_id,
                     attributes,
                 )| {
                     FlatS3EventMessage {
@@ -208,7 +208,7 @@
                         event_time,
                         event_type,
                         is_delete_marker,
-                        move_id,
+                        ingest_id,
                         attributes,
                         number_duplicate_events: 0,
                         number_reordered: 0,
@@ -442,7 +442,7 @@ pub struct FlatS3EventMessage {
     pub event_time: Option<DateTime<Utc>>,
     pub event_type: EventType,
     pub is_delete_marker: bool,
-    pub move_id: Option<Uuid>,
+    pub ingest_id: Option<Uuid>,
     pub attributes: Option<Value>,
     pub number_duplicate_events: i64,
     pub number_reordered: i64,
@@ -577,8 +577,8 @@ impl FlatS3EventMessage {
     }

-    /// Set the move id.
-    pub fn with_move_id(mut self, move_id: Option<Uuid>) -> Self {
-        self.move_id = move_id;
+    /// Set the ingest id.
+    pub fn with_ingest_id(mut self, ingest_id: Option<Uuid>) -> Self {
+        self.ingest_id = ingest_id;
         self
     }
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/list.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/list.rs
index 7b2d93878..4e05f03a7 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/list.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/list.rs
@@ -150,7 +150,7 @@ where
                     .is_delete_marker
                     .map(|v| s3_object::Column::IsDeleteMarker.eq(v)),
             )
-            .add_option(filter.move_id.map(|v| s3_object::Column::MoveId.eq(v)));
+            .add_option(filter.ingest_id.map(|v| s3_object::Column::IngestId.eq(v)));

         if let Some(attributes) = filter.attributes {
             let json_conditions = Self::json_conditions(
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs
index 0244a0531..a5749a739 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/queries/mod.rs
@@ -41,12 +41,12 @@ impl Entries {
         shuffle: bool,
         bucket_divisor: usize,
         key_divisor: usize,
-        move_id: Option<Uuid>,
+        ingest_id: Option<Uuid>,
     ) -> Vec<ActiveS3Object> {
         let mut output = vec![];

         let mut entries: Vec<_> = (0..n)
-            .map(|index| Self::generate_entry(index, bucket_divisor, key_divisor, move_id))
+            .map(|index| Self::generate_entry(index, bucket_divisor, key_divisor, ingest_id))
             .collect();

         if shuffle {
@@ -71,7 +71,7 @@ impl Entries {
         index: usize,
         bucket_divisor: usize,
         key_divisor: usize,
-        move_id: Option<Uuid>,
+        ingest_id: Option<Uuid>,
     ) -> ActiveS3Object {
         let event = Self::event_type(index);
         let date = || Set(Some(DateTime::default().add(Days::new(index as u64))));
@@ -84,7 +84,7 @@

         ActiveS3Object {
             s3_object_id: Set(UuidGenerator::generate()),
-            move_id: Set(Some(move_id.unwrap_or_else(UuidGenerator::generate))),
+            ingest_id: Set(Some(ingest_id.unwrap_or_else(UuidGenerator::generate))),
             event_type: Set(event.clone()),
             bucket: Set((index / bucket_divisor).to_string()),
             key: Set((index / key_divisor).to_string()),
@@ -129,7 +129,7 @@ pub struct EntriesBuilder {
     bucket_divisor: usize,
     key_divisor: usize,
     shuffle: bool,
-    move_id: Option<Uuid>,
+    ingest_id: Option<Uuid>,
 }

 impl EntriesBuilder {
@@ -158,8 +158,8 @@
     }

-    /// Set whether to shuffle.
-    pub fn with_move_id(mut self, move_id: Uuid) -> Self {
-        self.move_id = Some(move_id);
+    /// Set the ingest id.
+    pub fn with_ingest_id(mut self, ingest_id: Uuid) -> Self {
+        self.ingest_id = Some(ingest_id);
         self
     }

@@ -171,7 +171,7 @@
             self.shuffle,
             self.bucket_divisor,
             self.key_divisor,
-            self.move_id,
+            self.ingest_id,
         )
         .await;

@@ -189,7 +189,7 @@ impl Default for EntriesBuilder {
             bucket_divisor: 2,
             key_divisor: 1,
             shuffle: false,
-            move_id: None,
+            ingest_id: None,
         }
     }
 }
diff --git a/lib/workload/stateless/stacks/filemanager/filemanager/src/routes/filter/mod.rs b/lib/workload/stateless/stacks/filemanager/filemanager/src/routes/filter/mod.rs
index e3782d345..af96c854b 100644
--- a/lib/workload/stateless/stacks/filemanager/filemanager/src/routes/filter/mod.rs
+++ b/lib/workload/stateless/stacks/filemanager/filemanager/src/routes/filter/mod.rs
@@ -68,8 +68,8 @@ pub struct S3ObjectsFilter {
     /// Query by the object delete marker.
     pub(crate) is_delete_marker: Option<bool>,
     #[param(required = false)]
-    /// Query by the object delete marker.
-    pub(crate) move_id: Option<Uuid>,
+    /// Query by the ingest id that objects get tagged with.
+    pub(crate) ingest_id: Option<Uuid>,
     #[param(required = false)]
     /// Query by JSON attributes. Supports nested syntax to access inner
     /// fields, e.g. `attributes[attribute_id]=...`. This only deserializes

From 5cf5f71ef20a7fe1dcebe4cee6fa46342e5af13c Mon Sep 17 00:00:00 2001
From: Marko Malenic
Date: Fri, 4 Oct 2024 10:48:37 +1000
Subject: [PATCH 8/8] docs(filemanager): doc improvements and clarifications

---
 .../stacks/filemanager/docs/MOVED_OBJECTS.md  | 25 +++++++++++++------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md b/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md
index 4de4ebc13..3a656fcf3 100644
--- a/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md
+++ b/lib/workload/stateless/stacks/filemanager/docs/MOVED_OBJECTS.md
@@ -5,9 +5,9 @@ S3 events alone, it's not possible to tell whether an object that has been delet
 the same object that has been moved, or two different objects. This is because S3 only tracks `Created` or `Deleted`
 events.

-To track moved objects, the filemanager stores additional information in S3 tags that get copied when the object
+To track moved objects, filemanager stores additional information in S3 tags. The tag gets updated when the object
 is moved. This allows the filemanager to track how objects move and also allows it to copy attributes when an object
-is moved/copied.
+is moved/copied. This process is described [below](#tagging-process).

@@ -26,22 +26,33 @@ API provides a way to query the database for records with a given `ingest_id`.

 ## Design considerations

-Object tags on S3 are limited to 10 tags per object, and each tag can only store 256 unicode characters. The filemanager
-avoids storing a large amount of data by using a UUID as the value of the tag, which is linked to object records that
-store attributes and data.
+Object tags on S3 are [limited][s3-tagging] to 10 tags per object, and each tag can only store 256 unicode characters.
+The filemanager avoids storing a large amount of data by using a UUID as the value of the tag, which is linked to object
+records that store attributes and data.
 The object tagging process cannot be atomic, so there is a chance for concurrency errors to occur. Tagging can also
 fail due to database or network errors. The filemanager only tracks `ingest_id`s if it knows that a tag has been
 successfully created on S3, and successfully stored in the database. If tagging fails, or it's not enabled, then the `ingest_id`
 column will be null.

+The act of tagging the object is what allows it to be tracked, so ideally it happens as soon as possible. Currently,
+this is done at ingestion; however, that could have performance implications. Alternative approaches should consider
+asynchronous tagging. For example, [`s3:ObjectTagging:*`][s3-tagging-event] events could be used for this purpose.
+
 The object tagging mechanism also doesn't differentiate between moved objects and copied objects with the same tags.
 If an object is copied with tags, the `ingest_id` will also be copied and the above logic will apply.

 ## Alternative designs

 Alternatively, S3 object metadata could also be used to track moves using a similar mechanism. However, metadata can
-only be updated by deleting and recreating the object. This process would be much more costly, so tagging was chosen instead.
+[only be updated][s3-metadata] by deleting and recreating the object. This process would be much more costly, so tagging
+was chosen instead.
+
 Another approach is to compare object checksums or etags. However, this would also be limited if the checksum is not present,
-or if the etag was computed using a different part-size. Both these approaches could be used in addition to object tagging
-to provide more ways to track moves.
+or if the etag was computed using a different part-size. It also fails if the checksum is not expected to be the same,
+for example, when tracking a set of files that are compressed. Both these approaches could be used in addition to object
+tagging to provide more ways to track moves.
+
+[s3-tagging]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-tagging.html
+[s3-tagging-event]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/notification-how-to-event-types-and-destinations.html#supported-notification-event-types
+[s3-metadata]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html
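To make the part-size caveat above concrete: S3 computes a multipart etag as the MD5 of the concatenated per-part MD5 digests, suffixed with the part count, so identical bytes uploaded with different part sizes produce different etags. The hypothetical helper below (assuming the `md5` crate) demonstrates this; it is an illustration only and not part of this patch.

```rust
/// Compute an S3-style multipart etag: MD5 of the concatenated per-part MD5
/// digests, suffixed with the number of parts. Hypothetical helper,
/// assuming the `md5` crate.
fn multipart_etag(data: &[u8], part_size: usize) -> String {
    // Hash each part individually, as a multipart upload would.
    let part_digests: Vec<[u8; 16]> = data
        .chunks(part_size)
        .map(|part| md5::compute(part).0)
        .collect();

    // Hash the concatenation of the part digests and append the part count.
    let combined = md5::compute(part_digests.concat());
    format!("{:x}-{}", combined, part_digests.len())
}

fn main() {
    let object = vec![0u8; 20 * 1024 * 1024];

    // Same bytes, different part sizes: the etags disagree, so etag equality
    // alone cannot prove that two objects are the same moved object.
    let a = multipart_etag(&object, 8 * 1024 * 1024);
    let b = multipart_etag(&object, 16 * 1024 * 1024);
    assert_ne!(a, b);
    println!("{a} vs {b}");
}
```

This is why the document treats checksum and etag comparison only as a supplement to tagging rather than a replacement for it.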