From db6b602a7beded8de0a7cfe8af65fee6bc070433 Mon Sep 17 00:00:00 2001 From: Martin Weber Date: Tue, 3 Sep 2024 13:26:31 +0200 Subject: [PATCH] feat: add management command to clean up unreferenced bucket objects --- .../commands/clean_dangling_objects.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 document_merge_service/api/management/commands/clean_dangling_objects.py diff --git a/document_merge_service/api/management/commands/clean_dangling_objects.py b/document_merge_service/api/management/commands/clean_dangling_objects.py new file mode 100644 index 00000000..c44fbdb4 --- /dev/null +++ b/document_merge_service/api/management/commands/clean_dangling_objects.py @@ -0,0 +1,82 @@ +import boto3 +import botocore +from django.conf import settings +from django.core.management.base import BaseCommand +from document_merge_service.api.models import Template + + +S3_ENDPOINT_URL = settings.S3_STORAGE_OPTIONS["endpoint_url"] +S3_ACCESS_KEY_ID = settings.S3_STORAGE_OPTIONS["access_key"] +S3_SECRET_ACCESS_KEY = settings.S3_STORAGE_OPTIONS["secret_key"] +S3_BUCKET = settings.S3_STORAGE_OPTIONS["bucket_name"] + + +class Command(BaseCommand): + help = "Cleans up unreferenced files and warns about unencrypted files in the storage bucket" + + def get_object_keys(self, s3_client): + keys = [] + have_next = True + continuation_token = None + while have_next: + data = s3_client.list_objects_v2( + Bucket=S3_BUCKET, + **( + {"ContinuationToken": continuation_token} + if continuation_token + else {} + ), + ) + if data["KeyCount"] > 0: + keys += [o["Key"] for o in data["Contents"]] + have_next = data["IsTruncated"] + if have_next: + continuation_token = data["NextContinuationToken"] + return keys + + def delete_file(self, s3_client, stats: dict, fname: str): + print(f"Deleting unreferenced file: {fname}") + # TODO: delete for real + # s3_client.delete_object(Bucket=S3_BUCKET, Key=fname) + stats["deleted"] += 1 + + def check_encryption(self, s3_client, stats: dict, fname: str): + # Our approach is rather unsophisticated: we try to download an object + # without encryption key, and if that succeeds, it means the object is + # not encrypted, and therefore we print a warning. + try: + s3_client.get_object(Bucket=S3_BUCKET, Key=fname) + print(f"WARNING, unencrypted file: {fname}") + stats["unencrypted"] += 1 + except botocore.exceptions.ClientError: + stats["encrypted"] += 1 + + def handle(self, *args, **options): + s3_session = boto3.session.Session() + s3_client = s3_session.client( + service_name="s3", + aws_access_key_id=S3_ACCESS_KEY_ID, + aws_secret_access_key=S3_SECRET_ACCESS_KEY, + endpoint_url=S3_ENDPOINT_URL, + ) + bucket_files = self.get_object_keys(s3_client) + used_files = Template.objects.all().values_list("template", flat=True) + + print( + f"Cleaning up among {len(bucket_files)} bucket objects and {len(used_files)} DB entries" + ) + stats = { + "total": len(bucket_files), + "encrypted": 0, + "unencrypted": 0, + "deleted": 0, + } + for f in bucket_files: + if f in used_files: + # file found in db, check if it's encrypted + self.check_encryption(s3_client, stats, f) + else: + # file not found in db, can be deleted + self.delete_file(s3_client, stats, f) + + print(f"stats: {stats}")