Skip to content

Commit

Permalink
Delete intermediate report files used in parallel processing
Browse files Browse the repository at this point in the history
This makes sure the following intermediate files are being cleaned up / deleted after parallel processing:

- The "partial Report" files, after merging
- The copy of the *initial* and the *final* "master Report" used in parallel verification
- The parallel version of the "master Report" used for verification
  • Loading branch information
Swatinem committed Oct 4, 2024
1 parent 98629d2 commit dcfeff8
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 10 deletions.
11 changes: 9 additions & 2 deletions services/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ def get_path(self, **kwaargs) -> str:
# Service class for performing archive operations. Meant to work against the
# underlying StorageService
class ArchiveService(object):
root = None
root: str
"""
The root level of the archive. In s3 terms,
this would be the name of the bucket
"""

storage_hash = None
storage_hash: str
"""
A hash key of the repo for internal storage
"""
Expand Down Expand Up @@ -252,6 +252,13 @@ def delete_file(self, path) -> None:
"""
self.storage.delete_file(self.root, path)

@sentry_sdk.trace()
def delete_files(self, paths: list[str]) -> list[bool]:
    """
    Batch-deletes the given list of files.

    Delegates to the underlying storage service, scoped to this archive's
    ``root`` bucket. Returns a ``list[bool]`` — presumably one per-path
    success flag, in input order; confirm against the storage service's
    ``delete_files`` contract.
    """
    return self.storage.delete_files(self.root, paths)

def delete_repo_files(self) -> int:
"""
Deletes an entire repository's contents
Expand Down
29 changes: 21 additions & 8 deletions tasks/parallel_verification.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,23 +72,36 @@ def run_impl(
)

# Retrieve serial results
serial_fas_path = parallel_path_to_serial_path(
parallel_paths["files_and_sessions_path"], last_upload_pk
)
serial_files_and_sessions = json.loads(
archive_service.read_file(
parallel_path_to_serial_path(
parallel_paths["files_and_sessions_path"], last_upload_pk
)
)
archive_service.read_file(serial_fas_path)
)
serial_chunks_path = parallel_path_to_serial_path(
parallel_paths["chunks_path"], last_upload_pk
)
serial_chunks = archive_service.read_file(serial_chunks_path).decode(
errors="replace"
)
serial_chunks = archive_service.read_file(
parallel_path_to_serial_path(parallel_paths["chunks_path"], last_upload_pk)
).decode(errors="replace")
serial_report = report_service.build_report(
serial_chunks,
serial_files_and_sessions["files"],
serial_files_and_sessions["sessions"],
None,
)

# after the comparison is done, these files are not needed anymore,
# and should be cleaned up
archive_service.delete_files(
[
parallel_paths["files_and_sessions_path"],
parallel_paths["chunks_path"],
serial_fas_path,
serial_chunks_path,
]
)

top_level_totals_match = True
file_level_totals_match = True
file_level_mismatched_files = []
Expand Down
39 changes: 39 additions & 0 deletions tasks/upload_finisher.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,13 +647,52 @@ def merge_report(cumulative_report: Report, obj):
)
report = functools.reduce(merge_report, unmerged_reports, report)
commit.get_db_session().flush()

cleanup_partial_reports(
repository, commit, processing_results["parallel_incremental_result"]
)

return report


# Register the finisher task with the celery app, and keep a module-level
# handle to the registered instance so callers can enqueue it by reference.
RegisteredUploadTask = celery_app.register_task(UploadFinisherTask())
upload_finisher_task = celery_app.tasks[RegisteredUploadTask.name]


def cleanup_partial_reports(
    repository: Repository, commit: Commit, partial_reports: list[dict]
):
    """
    Deletes the storage files holding the "partial Report"s produced by
    parallel processing, along with the copy of the "master Report" that
    the "experiment" mode wrote for this commit.
    """
    archive = ArchiveService(repository)
    archive_hash = archive.get_archive_hash(repository)

    def experiment_path(name: str) -> str:
        # Storage path of a "master Report" copy; these copies only exist
        # when running in "experiment" mode.
        return MinioEndpoints.parallel_upload_experiment.get_path(
            version="v4",
            repo_hash=archive_hash,
            commitid=commit.commitid,
            file_name=name,
        )

    # Experiment-mode "master Report" copies first, then every partial
    # report's pair of files, in the same order they were collected.
    doomed = [
        experiment_path("files_and_sessions"),
        experiment_path("chunks"),
    ]
    for partial in partial_reports:
        doomed.extend(
            (partial["chunks_path"], partial["files_and_sessions_path"])
        )

    archive.delete_files(doomed)


# TODO: maybe move this to `shared` if it turns out to be a better place for this
def change_sessionid(report: Report, old_id: int, new_id: int):
"""
Expand Down

0 comments on commit dcfeff8

Please sign in to comment.