Skip to content

Commit

Permalink
Delete intermediate report files used in parallel processing
Browse files Browse the repository at this point in the history
This makes sure the following intermediate files are being cleaned up / deleted after parallel processing:

- The "partial Report" files, after merging
- The copy of the *initial* and the *final* "master Report" used in parallel verification
- The parallel version of the "master Report" used for verification
  • Loading branch information
Swatinem committed Oct 4, 2024
1 parent 98629d2 commit dcfeff8
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 10 deletions.
11 changes: 9 additions & 2 deletions services/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ def get_path(self, **kwaargs) -> str:
# Service class for performing archive operations. Meant to work against the
# underlying StorageService
class ArchiveService(object):
root = None
root: str
"""
The root level of the archive. In s3 terms,
this would be the name of the bucket
"""

storage_hash = None
storage_hash: str
"""
A hash key of the repo for internal storage
"""
Expand Down Expand Up @@ -252,6 +252,13 @@ def delete_file(self, path) -> None:
"""
self.storage.delete_file(self.root, path)

@sentry_sdk.trace()
def delete_files(self, paths: list[str]) -> list[bool]:
    """
    Batch-deletes the given list of files.

    Delegates to the underlying storage service, scoped to this archive's
    ``root`` bucket. Returns a ``list[bool]`` — presumably one per-path
    success flag, in input order; confirm against the storage service's
    ``delete_files`` contract.
    """
    return self.storage.delete_files(self.root, paths)

def delete_repo_files(self) -> int:
"""
Deletes an entire repository's contents
Expand Down
29 changes: 21 additions & 8 deletions tasks/parallel_verification.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,23 +72,36 @@ def run_impl(
)

# Retrieve serial results
serial_fas_path = parallel_path_to_serial_path(
parallel_paths["files_and_sessions_path"], last_upload_pk
)
serial_files_and_sessions = json.loads(
archive_service.read_file(
parallel_path_to_serial_path(
parallel_paths["files_and_sessions_path"], last_upload_pk
)
)
archive_service.read_file(serial_fas_path)
)
serial_chunks_path = parallel_path_to_serial_path(
parallel_paths["chunks_path"], last_upload_pk
)
serial_chunks = archive_service.read_file(serial_chunks_path).decode(
errors="replace"
)
serial_chunks = archive_service.read_file(
parallel_path_to_serial_path(parallel_paths["chunks_path"], last_upload_pk)
).decode(errors="replace")
serial_report = report_service.build_report(
serial_chunks,
serial_files_and_sessions["files"],
serial_files_and_sessions["sessions"],
None,
)

# after the comparison is done, these files are not needed anymore,
# and should be cleaned up
archive_service.delete_files(
[
parallel_paths["files_and_sessions_path"],
parallel_paths["chunks_path"],
serial_fas_path,
serial_chunks_path,
]
)

top_level_totals_match = True
file_level_totals_match = True
file_level_mismatched_files = []
Expand Down
39 changes: 39 additions & 0 deletions tasks/upload_finisher.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,13 +647,52 @@ def merge_report(cumulative_report: Report, obj):
)
report = functools.reduce(merge_report, unmerged_reports, report)
commit.get_db_session().flush()

cleanup_partial_reports(
repository, commit, processing_results["parallel_incremental_result"]
)

return report


# Register the finisher task with the celery app, and keep a module-level
# handle to the registered instance so callers can enqueue it by reference.
RegisteredUploadTask = celery_app.register_task(UploadFinisherTask())
upload_finisher_task = celery_app.tasks[RegisteredUploadTask.name]


def cleanup_partial_reports(
    repository: Repository, commit: Commit, partial_reports: list[dict]
):
    """
    Deletes the storage files holding the "partial Report"s produced by
    parallel processing, along with the copy of the "master Report" that
    the "experiment" mode wrote for this commit.
    """
    archive = ArchiveService(repository)
    archive_hash = archive.get_archive_hash(repository)

    def experiment_path(name: str) -> str:
        # Storage path of a "master Report" copy; these copies only exist
        # when running in "experiment" mode.
        return MinioEndpoints.parallel_upload_experiment.get_path(
            version="v4",
            repo_hash=archive_hash,
            commitid=commit.commitid,
            file_name=name,
        )

    # Experiment-mode "master Report" copies first, then every partial
    # report's pair of files, in the same order they were collected.
    doomed = [
        experiment_path("files_and_sessions"),
        experiment_path("chunks"),
    ]
    for partial in partial_reports:
        doomed.extend(
            (partial["chunks_path"], partial["files_and_sessions_path"])
        )

    archive.delete_files(doomed)


# TODO: maybe move this to `shared` if it turns out to be a better place for this
def change_sessionid(report: Report, old_id: int, new_id: int):
"""
Expand Down

0 comments on commit dcfeff8

Please sign in to comment.