Skip to content
This repository was archived by the owner on Feb 21, 2025. It is now read-only.

Better files events processing #496

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions src/file_filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use std::fs;
#[cfg(not(windows))]
use std::os::unix::fs::PermissionsExt;
use std::path::PathBuf;
use std::sync::{Arc, Mutex as StdMutex};

const LARGE_FILE_SIZE_THRESHOLD: u64 = 180*1024; // 180k files (180k is ~0.2% of all files on our dataset)
const SMALL_FILE_SIZE_THRESHOLD: u64 = 5; // 5 Bytes
Expand Down Expand Up @@ -76,3 +77,30 @@ pub fn is_this_inside_blacklisted_dir(path: &PathBuf) -> bool {
false
}

/// Returns true when `path` sits directly inside one of the workspace
/// root folders — i.e. its parent directory is itself a workspace root.
///
/// A path with no parent (e.g. the filesystem root) yields false, and a
/// poisoned `workspace_folders` lock is treated the same as "not found".
pub fn is_in_one_of_the_workspaces_root(path: &PathBuf, workspace_folders: Arc<StdMutex<Vec<PathBuf>>>) -> bool {
    let parent = match path.parent() {
        Some(p) => p,
        None => return false,
    };
    workspace_folders
        .lock()
        .map(|folders| folders.iter().any(|folder| folder.as_path() == parent))
        .unwrap_or(false)
}

/// Returns true when some file already tracked in `workspace_files`
/// lives in the same directory as `path` (their parents compare equal).
///
/// A path with no parent yields false; a poisoned `workspace_files`
/// lock is treated the same as "no match".
pub fn has_the_same_parent_as_one_of_the_others(path: &PathBuf, workspace_files: Arc<StdMutex<Vec<PathBuf>>>) -> bool {
    let parent = match path.parent() {
        Some(p) => p,
        None => return false,
    };
    workspace_files
        .lock()
        .map(|files| files.iter().any(|f| f.parent() == Some(parent)))
        .unwrap_or(false)
}
68 changes: 48 additions & 20 deletions src/files_in_workspace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use tracing::info;
use crate::git::git_ls_files;
use crate::global_context::GlobalContext;
use crate::telemetry;
use crate::file_filter::{is_this_inside_blacklisted_dir, is_valid_file, BLACKLISTED_DIRS, SOURCE_FILE_EXTENSIONS};
use crate::file_filter::{has_the_same_parent_as_one_of_the_others, is_in_one_of_the_workspaces_root, is_this_inside_blacklisted_dir, is_valid_file, BLACKLISTED_DIRS, SOURCE_FILE_EXTENSIONS};
use crate::ast::ast_indexer_thread::ast_indexer_enqueue_files;
use crate::privacy::{check_file_privacy, load_privacy_if_needed, PrivacySettings, FilePrivacyLevel};

Expand Down Expand Up @@ -428,19 +428,24 @@ async fn enqueue_some_docs(
let cpaths: Vec<String> = docs.iter().map(|doc| doc.doc_path.to_string_lossy().to_string()).collect();
ast_indexer_enqueue_files(ast.clone(), cpaths, force).await;
}
let (cache_correction_arc, _) = crate::files_correction::files_cache_rebuild_as_needed(gcx.clone()).await;
let mut moar_files: Vec<PathBuf> = Vec::new();
for doc in docs {
let doc_path_str = doc.doc_path.to_string_lossy().to_string();
if !cache_correction_arc.contains_key(&doc_path_str) {
moar_files.push(doc.doc_path.clone());
if !doc.doc_path.exists() {
let workspace_files_arc = gcx.read().await.documents_state.workspace_files.clone();
let mut workspace_files = workspace_files_arc.lock().unwrap();
if let Some(idx) = workspace_files.iter().position(|file| file == &doc.doc_path) {
workspace_files.remove(idx);
}
} else {
let workspace_files_arc = gcx.read().await.documents_state.workspace_files.clone();
let mut workspace_files = workspace_files_arc.lock().unwrap();
workspace_files.push(doc.doc_path.clone());
}
}
if moar_files.len() > 0 {
crate::files_correction::files_cache_rebuild_as_needed(gcx.clone()).await;
if !docs.is_empty() {
info!("this made file cache dirty");
let dirty_arc = {
let gcx_locked = gcx.write().await;
gcx_locked.documents_state.workspace_files.lock().unwrap().extend(moar_files);
gcx_locked.documents_state.cache_dirty.clone()
};
let now = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_secs_f64();
Expand Down Expand Up @@ -630,10 +635,32 @@ pub async fn remove_folder(gcx: Arc<ARwLock<GlobalContext>>, path: &PathBuf)

pub async fn file_watcher_event(event: Event, gcx_weak: Weak<ARwLock<GlobalContext>>)
{
// Decides whether a file-watcher path should be processed further.
// Paths sitting directly in a workspace root, or sharing a directory with
// an already-known workspace file, bypass the blacklisted-dir filter;
// everything else is dropped when it lies inside a blacklisted directory.
fn is_valid_path(
    path: &PathBuf,
    workspace_folders: Arc<StdMutex<Vec<PathBuf>>>,
    workspace_files: Arc<StdMutex<Vec<PathBuf>>>
) -> bool {
    // Short-circuits exactly like the original nested `if`s: the second
    // check and the blacklist test only run when the earlier ones fail.
    let exempt_from_blacklist = is_in_one_of_the_workspaces_root(path, workspace_folders)
        || has_the_same_parent_as_one_of_the_others(path, workspace_files);
    exempt_from_blacklist || !is_this_inside_blacklisted_dir(path)
}

async fn on_create_modify(gcx_weak: Weak<ARwLock<GlobalContext>>, event: Event) {
let mut docs = vec![];
let (workspace_folders, workspace_files) = if let Some(gcx) = gcx_weak.clone().upgrade() {
let gcx_locked = gcx.read().await;
(gcx_locked.documents_state.workspace_folders.clone(),gcx_locked.documents_state.workspace_files.clone())
} else {
tracing::warn!("gcx is not available, cannot process file watcher signal");
return;
};

for p in &event.paths {
if is_this_inside_blacklisted_dir(&p) { // important to filter BEFORE canonical_path
if !is_valid_path(&p, workspace_folders.clone(), workspace_files.clone()) { // important to filter BEFORE canonical_path
continue;
}

Expand Down Expand Up @@ -661,19 +688,20 @@ pub async fn file_watcher_event(event: Event, gcx_weak: Weak<ARwLock<GlobalConte
}

async fn on_remove(gcx_weak: Weak<ARwLock<GlobalContext>>, event: Event) {
let mut never_mind = true;
for p in &event.paths {
never_mind &= is_this_inside_blacklisted_dir(&p);
}
let (workspace_folders, workspace_files) = if let Some(gcx) = gcx_weak.clone().upgrade() {
let gcx_locked = gcx.read().await;
(gcx_locked.documents_state.workspace_folders.clone(),gcx_locked.documents_state.workspace_files.clone())
} else {
tracing::warn!("gcx is not available, cannot process file watcher signal");
return;
};
let mut docs = vec![];
if !never_mind {
for p in &event.paths {
if is_this_inside_blacklisted_dir(&p) {
continue;
}
let cpath = crate::files_correction::canonical_path(&p.to_string_lossy().to_string());
docs.push(Document { doc_path: cpath, doc_text: None });
for p in &event.paths {
if !is_valid_path(&p, workspace_folders.clone(), workspace_files.clone()) {
continue;
}
let cpath = crate::files_correction::canonical_path(&p.to_string_lossy().to_string());
docs.push(Document { doc_path: cpath, doc_text: None });
}
if docs.is_empty() {
return;
Expand Down
2 changes: 1 addition & 1 deletion src/vecdb/vdb_thread.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ pub async fn vectorizer_enqueue_files(
documents: &Vec<Document>,
process_immediately: bool,
) {
info!("adding {} files", documents.len());
info!("adding or removing {} files", documents.len());
let documents = filter_docs_to_enqueue(documents);
let (vecdb_todo, vstatus, vstatus_notify, vecdb_max_files) = {
let service = vservice.lock().await;
Expand Down