Skip to content

Commit ad7dd6b

Browse files
authored
[ENH] Purge dirty log in background at the end of scheduled compaction (#4915)
## Description of changes _Summarize the changes made by this PR._ - Improvements & Bug fixes - Updates the `verify_and_enrich_collections` method in the compaction scheduler so that it no longer directly emits purge-dirty-log requests. Instead, the IDs of deleted collections are stored in a temporary field on the scheduler, which the compaction manager periodically drains - Updates the compaction manager so that it drains the deleted-collections field in the compaction scheduler at the end of each scheduled compaction. The purge of the dirty log then runs as a background task - Updates the purge-dirty-log implementation so that it can accept multiple collection IDs per request, instead of only one at a time - New functionality - Adds a `PurgeDirtyLog` operator that invokes `<log_client>.purge_dirty_logs(...)` with a timeout. ## Test plan _How are these changes tested?_ - [ ] Tests pass locally with `pytest` for python, `yarn test` for js, `cargo test` for rust ## Documentation Changes _Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs section](https://github.com/chroma-core/chroma/tree/main/docs/docs.trychroma.com)?_
1 parent c7c257f commit ad7dd6b

File tree

12 files changed

+202
-45
lines changed

12 files changed

+202
-45
lines changed

idl/chromadb/proto/logservice.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ message UpdateCollectionLogOffsetResponse {
9090
}
9191

9292
message PurgeDirtyForCollectionRequest {
93-
string collection_id = 1;
93+
repeated string collection_ids = 1;
9494
}
9595

9696
message PurgeDirtyForCollectionResponse {

rust/log-service/src/bin/chroma-purge-dirty-log.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ async fn main() {
2323
let mut client = LogServiceClient::new(logservice);
2424
let _resp = client
2525
.purge_dirty_for_collection(PurgeDirtyForCollectionRequest {
26-
collection_id: collection_id.to_string(),
26+
collection_ids: vec![collection_id.to_string()],
2727
})
2828
.await
2929
.expect("purge-dirty-log request should succeed");

rust/log-service/src/lib.rs

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
use std::collections::hash_map::Entry;
44
use std::collections::HashMap;
55
use std::collections::HashSet;
6+
use std::str::FromStr;
67
use std::sync::Arc;
78
use std::time::{Duration, Instant, SystemTime};
89

@@ -1524,19 +1525,27 @@ impl LogServer {
15241525
);
15251526
async move {
15261527
let request = request.into_inner();
1527-
let collection_id = Uuid::parse_str(&request.collection_id)
1528-
.map(CollectionUuid)
1529-
.map_err(|_| Status::invalid_argument("Failed to parse collection id"))?;
1530-
tracing::info!("purge_dirty_for_collection {collection_id}");
1531-
let dirty_marker = DirtyMarker::Purge { collection_id };
1532-
let dirty_marker_json = serde_json::to_string(&dirty_marker)
1528+
let collection_ids = request
1529+
.collection_ids
1530+
.iter()
1531+
.map(|id| CollectionUuid::from_str(id))
1532+
.collect::<Result<Vec<_>, _>>()
15331533
.map_err(|err| {
1534-
tracing::error!("Failed to serialize dirty marker: {}", err);
1535-
wal3::Error::Internal
1534+
Status::invalid_argument(format!("Failed to parse collection id: {err}"))
1535+
})?;
1536+
tracing::info!("Purging collections in dirty log: [{collection_ids:?}]");
1537+
let dirty_marker_json_blobs = collection_ids
1538+
.into_iter()
1539+
.map(|collection_id| {
1540+
serde_json::to_string(&DirtyMarker::Purge { collection_id })
1541+
.map(String::into_bytes)
15361542
})
1537-
.map_err(|err| Status::new(err.code().into(), err.to_string()))?;
1543+
.collect::<Result<_, _>>()
1544+
.map_err(|err| {
1545+
Status::internal(format!("Failed to serialize dirty marker: {err}"))
1546+
})?;
15381547
self.dirty_log
1539-
.append(Vec::from(dirty_marker_json))
1548+
.append_many(dirty_marker_json_blobs)
15401549
.await
15411550
.map_err(|err| Status::new(err.code().into(), err.to_string()))?;
15421551
Ok(Response::new(PurgeDirtyForCollectionResponse {}))

rust/log/src/grpc_log.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,7 @@ impl GrpcLog {
671671

672672
pub(super) async fn purge_dirty_for_collection(
673673
&mut self,
674-
collection_id: CollectionUuid,
674+
collection_ids: Vec<CollectionUuid>,
675675
) -> Result<(), GrpcPurgeDirtyForCollectionError> {
676676
let Some(assigner) = self.alt_client_assigner.as_mut() else {
677677
return Ok(());
@@ -681,14 +681,18 @@ impl GrpcLog {
681681
for client in assigner.all().into_iter() {
682682
let mut client = client.clone();
683683
let limiter = Arc::clone(&limiter);
684+
let collection_ids_clone = collection_ids.clone();
684685
let request = async move {
685686
// NOTE(rescrv): This can never fail and the result is to fail open. Don't
686687
// error-check.
687688
let _permit = limiter.acquire().await;
688689
client
689690
.purge_dirty_for_collection(chroma_proto::PurgeDirtyForCollectionRequest {
690691
// NOTE(rescrv): Use the untyped string representation of the collection ID.
691-
collection_id: collection_id.0.to_string(),
692+
collection_ids: collection_ids_clone
693+
.iter()
694+
.map(ToString::to_string)
695+
.collect(),
692696
})
693697
.await
694698
.map_err(GrpcPurgeDirtyForCollectionError::FailedToPurgeDirty)

rust/log/src/log.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,12 +161,12 @@ impl Log {
161161
#[tracing::instrument(skip(self), err(Display))]
162162
pub async fn purge_dirty_for_collection(
163163
&mut self,
164-
collection_id: CollectionUuid,
164+
collection_ids: Vec<CollectionUuid>,
165165
) -> Result<(), Box<dyn ChromaError>> {
166166
match self {
167167
Log::Sqlite(_) => unimplemented!("not implemented for sqlite"),
168168
Log::Grpc(log) => Ok(log
169-
.purge_dirty_for_collection(collection_id)
169+
.purge_dirty_for_collection(collection_ids)
170170
.await
171171
.map_err(|err| Box::new(err) as Box<dyn ChromaError>)?),
172172
Log::InMemory(_) => unimplemented!("not implemented for in memory"),

rust/sysdb/src/sysdb.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -476,12 +476,12 @@ impl SysDb {
476476

477477
pub async fn get_last_compaction_time(
478478
&mut self,
479-
tanant_ids: Vec<String>,
479+
tenant_ids: Vec<String>,
480480
) -> Result<Vec<Tenant>, GetLastCompactionTimeError> {
481481
match self {
482-
SysDb::Grpc(grpc) => grpc.get_last_compaction_time(tanant_ids).await,
482+
SysDb::Grpc(grpc) => grpc.get_last_compaction_time(tenant_ids).await,
483483
SysDb::Sqlite(_) => todo!(),
484-
SysDb::Test(test) => test.get_last_compaction_time(tanant_ids).await,
484+
SysDb::Test(test) => test.get_last_compaction_time(tenant_ids).await,
485485
}
486486
}
487487

rust/sysdb/src/test_sysdb.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -239,11 +239,10 @@ impl TestSysDb {
239239
let inner = self.inner.lock();
240240
let mut tenants = Vec::new();
241241
for tenant_id in tenant_ids {
242-
let last_compaction_time = match inner.tenant_last_compaction_time.get(&tenant_id) {
243-
Some(last_compaction_time) => *last_compaction_time,
244-
None => {
245-
return Err(GetLastCompactionTimeError::TenantNotFound);
246-
}
242+
let Some(last_compaction_time) =
243+
inner.tenant_last_compaction_time.get(&tenant_id).cloned()
244+
else {
245+
continue;
247246
};
248247
tenants.push(Tenant {
249248
id: tenant_id,

rust/worker/src/compactor/compaction_manager.rs

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ use super::OneOffCompactMessage;
44
use super::RebuildMessage;
55
use crate::compactor::types::ScheduledCompactMessage;
66
use crate::config::CompactionServiceConfig;
7+
use crate::execution::operators::purge_dirty_log::PurgeDirtyLog;
8+
use crate::execution::operators::purge_dirty_log::PurgeDirtyLogError;
9+
use crate::execution::operators::purge_dirty_log::PurgeDirtyLogInput;
10+
use crate::execution::operators::purge_dirty_log::PurgeDirtyLogOutput;
711
use crate::execution::orchestration::CompactOrchestrator;
812
use crate::execution::orchestration::CompactionResponse;
913
use async_trait::async_trait;
@@ -18,8 +22,10 @@ use chroma_memberlist::memberlist_provider::Memberlist;
1822
use chroma_segment::spann_provider::SpannProvider;
1923
use chroma_storage::Storage;
2024
use chroma_sysdb::SysDb;
25+
use chroma_system::wrap;
2126
use chroma_system::Dispatcher;
2227
use chroma_system::Orchestrator;
28+
use chroma_system::TaskResult;
2329
use chroma_system::{Component, ComponentContext, ComponentHandle, Handler, System};
2430
use chroma_types::CollectionUuid;
2531
use futures::stream::FuturesUnordered;
@@ -58,6 +64,7 @@ pub(crate) struct CompactionManager {
5864
max_compaction_size: usize,
5965
max_partition_size: usize,
6066
fetch_log_batch_size: u32,
67+
purge_dirty_log_timeout_seconds: u64,
6168
on_next_memberlist_signal: Option<oneshot::Sender<()>>,
6269
}
6370

@@ -92,6 +99,7 @@ impl CompactionManager {
9299
max_compaction_size: usize,
93100
max_partition_size: usize,
94101
fetch_log_batch_size: u32,
102+
purge_dirty_log_timeout_seconds: u64,
95103
) -> Self {
96104
CompactionManager {
97105
system,
@@ -110,6 +118,7 @@ impl CompactionManager {
110118
max_partition_size,
111119
on_next_memberlist_signal: None,
112120
fetch_log_batch_size,
121+
purge_dirty_log_timeout_seconds,
113122
}
114123
}
115124

@@ -187,7 +196,7 @@ impl CompactionManager {
187196
}
188197

189198
#[instrument(name = "CompactionManager::rebuild_batch")]
190-
pub(crate) async fn rebuild_batch(&mut self, collection_ids: Vec<CollectionUuid>) {
199+
pub(crate) async fn rebuild_batch(&mut self, collection_ids: &[CollectionUuid]) {
191200
let _ = collection_ids
192201
.iter()
193202
.map(|id| self.compact(*id, true))
@@ -196,6 +205,41 @@ impl CompactionManager {
196205
.await;
197206
}
198207

208+
#[instrument(name = "CompactionManager::purge_dirty_log", skip(ctx))]
209+
pub(crate) async fn purge_dirty_log(&mut self, ctx: &ComponentContext<Self>) {
210+
let deleted_collection_uuids = self.scheduler.drain_deleted_collections();
211+
if deleted_collection_uuids.is_empty() {
212+
tracing::info!("Skipping purge dirty log because there is no deleted collections");
213+
return;
214+
}
215+
let purge_dirty_log = PurgeDirtyLog {
216+
log_client: self.log.clone(),
217+
timeout: Duration::from_secs(self.purge_dirty_log_timeout_seconds),
218+
};
219+
let purge_dirty_log_input = PurgeDirtyLogInput {
220+
collection_uuids: deleted_collection_uuids.clone(),
221+
};
222+
let purge_dirty_log_task = wrap(
223+
Box::new(purge_dirty_log),
224+
purge_dirty_log_input,
225+
ctx.receiver(),
226+
);
227+
let Some(mut dispatcher) = self.dispatcher.clone() else {
228+
tracing::error!("Unable to create background task to purge dirty log: Dispatcher is not set for compaction manager");
229+
return;
230+
};
231+
if let Err(err) = dispatcher
232+
.send(purge_dirty_log_task, Some(Span::current()))
233+
.await
234+
{
235+
tracing::error!("Unable to create background task to purge dirty log: {err}");
236+
return;
237+
};
238+
tracing::info!(
239+
"Purging dirty logs for deleted collections: [{deleted_collection_uuids:?}]",
240+
);
241+
}
242+
199243
pub(crate) fn set_dispatcher(&mut self, dispatcher: ComponentHandle<Dispatcher>) {
200244
self.dispatcher = Some(dispatcher);
201245
}
@@ -240,6 +284,7 @@ impl Configurable<(CompactionServiceConfig, System)> for CompactionManager {
240284
let max_compaction_size = config.compactor.max_compaction_size;
241285
let max_partition_size = config.compactor.max_partition_size;
242286
let fetch_log_batch_size = config.compactor.fetch_log_batch_size;
287+
let purge_dirty_log_timeout_seconds = config.compactor.purge_dirty_log_timeout_seconds;
243288
let mut disabled_collections =
244289
HashSet::with_capacity(config.compactor.disabled_collections.len());
245290
for collection_id_str in &config.compactor.disabled_collections {
@@ -298,6 +343,7 @@ impl Configurable<(CompactionServiceConfig, System)> for CompactionManager {
298343
max_compaction_size,
299344
max_partition_size,
300345
fetch_log_batch_size,
346+
purge_dirty_log_timeout_seconds,
301347
))
302348
}
303349
}
@@ -342,6 +388,7 @@ impl Handler<ScheduledCompactMessage> for CompactionManager {
342388
) {
343389
tracing::info!("CompactionManager: Performing scheduled compaction");
344390
let _ = self.compact_batch().await;
391+
self.purge_dirty_log(ctx).await;
345392

346393
// Compaction is done, schedule the next compaction
347394
ctx.scheduler.schedule(
@@ -382,7 +429,11 @@ impl Handler<RebuildMessage> for CompactionManager {
382429
"Rebuild started for collections: {:?}",
383430
message.collection_ids
384431
);
385-
self.rebuild_batch(message.collection_ids).await;
432+
self.rebuild_batch(&message.collection_ids).await;
433+
tracing::info!(
434+
"Rebuild completed for collections: {:?}",
435+
message.collection_ids
436+
);
386437
}
387438
}
388439

@@ -400,6 +451,21 @@ impl Handler<Memberlist> for CompactionManager {
400451
}
401452
}
402453

454+
#[async_trait]
455+
impl Handler<TaskResult<PurgeDirtyLogOutput, PurgeDirtyLogError>> for CompactionManager {
456+
type Result = ();
457+
458+
async fn handle(
459+
&mut self,
460+
message: TaskResult<PurgeDirtyLogOutput, PurgeDirtyLogError>,
461+
_ctx: &ComponentContext<CompactionManager>,
462+
) {
463+
if let Err(err) = message.into_inner() {
464+
tracing::error!("Error when purging dirty log: {err}");
465+
}
466+
}
467+
}
468+
403469
pub struct RegisterOnReadySignal {
404470
pub on_ready_tx: oneshot::Sender<()>,
405471
}
@@ -618,6 +684,7 @@ mod tests {
618684
let max_compaction_size = 1000;
619685
let max_partition_size = 1000;
620686
let fetch_log_batch_size = 100;
687+
let purge_dirty_log_timeout_seconds = 60;
621688

622689
// Set assignment policy
623690
let mut assignment_policy = Box::new(RendezvousHashingAssignmentPolicy::default());
@@ -682,6 +749,7 @@ mod tests {
682749
max_compaction_size,
683750
max_partition_size,
684751
fetch_log_batch_size,
752+
purge_dirty_log_timeout_seconds,
685753
);
686754

687755
let dispatcher = Dispatcher::new(DispatcherConfig {

rust/worker/src/compactor/config.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ pub struct CompactorConfig {
1818
pub disabled_collections: Vec<String>,
1919
#[serde(default = "CompactorConfig::default_fetch_log_batch_size")]
2020
pub fetch_log_batch_size: u32,
21+
#[serde(default = "CompactorConfig::default_purge_dirty_log_timeout_seconds")]
22+
pub purge_dirty_log_timeout_seconds: u64,
2123
}
2224

2325
impl CompactorConfig {
@@ -52,6 +54,10 @@ impl CompactorConfig {
5254
fn default_fetch_log_batch_size() -> u32 {
5355
100
5456
}
57+
58+
fn default_purge_dirty_log_timeout_seconds() -> u64 {
59+
60
60+
}
5561
}
5662

5763
impl Default for CompactorConfig {
@@ -65,6 +71,8 @@ impl Default for CompactorConfig {
6571
max_partition_size: CompactorConfig::default_max_partition_size(),
6672
disabled_collections: CompactorConfig::default_disabled_collections(),
6773
fetch_log_batch_size: CompactorConfig::default_fetch_log_batch_size(),
74+
purge_dirty_log_timeout_seconds:
75+
CompactorConfig::default_purge_dirty_log_timeout_seconds(),
6876
}
6977
}
7078
}

0 commit comments

Comments (0)