From 062af47c2b6e94609af2f76d325289d80f2f5a0f Mon Sep 17 00:00:00 2001 From: Luca Vari Date: Fri, 29 Nov 2024 17:27:30 +1100 Subject: [PATCH] Reduce memory use when obfuscating staging environment data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Django caches query results by default. See this comment from the Django docs: In a newly created QuerySet, the cache is empty. The first time a QuerySet is evaluated – and, hence, a database query happens – Django saves the query results in the QuerySet’s cache and returns the results that have been explicitly requested (e.g., the next element, if the QuerySet is being iterated over). Subsequent evaluations of the QuerySet reuse the cached results. -- https://docs.djangoproject.com/en/5.1/topics/db/queries/#caching-and-querysets This can result in significant memory use. The live box was running out of memory when we were attempting to obfuscate email details, presumably because that model contains some large text fields. We only need to iterate over each queryset once, so we now loop with QuerySet.iterator(), which bypasses the result cache, to reduce memory use. 
--- .../management/commands/obfuscate_data.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/app/core/management/commands/obfuscate_data.py b/app/core/management/commands/obfuscate_data.py index 75f17fa9..23b2d86a 100644 --- a/app/core/management/commands/obfuscate_data.py +++ b/app/core/management/commands/obfuscate_data.py @@ -27,7 +27,7 @@ def handle(self, *args, **kwargs): self.stdout.write("\nObfuscating personal information...") disable_signals() - fake = Faker('en_AU') + fake = Faker("en_AU") clients = Client.objects.all() emails = Email.objects.all() @@ -46,27 +46,27 @@ def handle(self, *args, **kwargs): | Q(is_superuser=True) ).distinct() - for t in tenancies: + for t in tenancies.iterator(): t.address = fake.street_address() t.suburb = fake.city() t.postcode = fake.postcode() t.save() - for u in users: + for u in users.iterator(): u.email = fake.unique.email() - u.username = u.email # Username same as fake email. + u.username = u.email # Username same as fake email. 
u.first_name = fake.first_name() u.last_name = fake.last_name() u.save() - for p in people: + for p in people.iterator(): p.full_name = fake.name() p.email = fake.email() p.address = fake.address() p.phone_number = fake.phone_number() p.save() - for c in clients: + for c in clients.iterator(): c.first_name = fake.first_name() c.last_name = fake.last_name() if c.preferred_name: @@ -75,7 +75,7 @@ def handle(self, *args, **kwargs): c.phone_number = fake.phone_number() c.save() - for i in issues: + for i in issues.iterator(): if i.outcome_notes: i.outcome_notes = " ".join(fake.sentences()) @@ -87,11 +87,11 @@ def handle(self, *args, **kwargs): } i.save() - for n in notes: + for n in notes.iterator(): n.text = " ".join(fake.sentences()) n.save() - for e in emails: + for e in emails.iterator(): e.received_data = None e.subject = fake.sentence() e.text = "\n\n".join(fake.paragraphs()) @@ -104,12 +104,10 @@ def handle(self, *args, **kwargs): e.from_address = fake.email() if e.to_address: e.to_address = fake.email() - e.cc_addresses = [ - fake.email() for _ in e.cc_addresses - ] + e.cc_addresses = [fake.email() for _ in e.cc_addresses] e.save() - for s in services: + for s in services.iterator(): if s.notes: s.notes = " ".join(fake.sentences()) s.save() @@ -118,7 +116,7 @@ def handle(self, *args, **kwargs): email_attachment = os.path.join(EmailAttachment.UPLOAD_KEY, file_name) file_upload = os.path.join(FileUpload.UPLOAD_KEY, file_name) - bytes = fake.image(image_format='pdf') + bytes = fake.image(image_format="pdf") default_storage.save(email_attachment, BytesIO(bytes)) default_storage.save(file_upload, BytesIO(bytes)) @@ -126,6 +124,7 @@ def handle(self, *args, **kwargs): EmailAttachment.objects.update( file=email_attachment, content_type="application/pdf" ) + # Replace uploaded files FileUpload.objects.update(file=file_upload)