Skip to content

Commit

Permalink
Reduce memory use when obfuscating staging environment data
Browse files Browse the repository at this point in the history
Django caches query results by default. See this comment from the Django docs:

  In a newly created QuerySet, the cache is empty. The first time a QuerySet is
  evaluated – and, hence, a database query happens – Django saves the query
  results in the QuerySet’s cache and returns the results that have been
  explicitly requested (e.g., the next element, if the QuerySet is being
  iterated over). Subsequent evaluations of the QuerySet reuse the cached
  results.

-- https://docs.djangoproject.com/en/5.1/topics/db/queries/#caching-and-querysets

This can result in significant memory use. The live box was running out of
memory when we were attempting to obfuscate email details, presumably because
that model contains some large text fields.

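We disable caching by iterating over each queryset with `.iterator()`, which
reads results from the database without populating the QuerySet's cache. This
is safe because we only need to iterate over each queryset once, and it
addresses the issue of running out of memory.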
  • Loading branch information
luca-vari committed Nov 29, 2024
1 parent 8d2d4d5 commit 062af47
Showing 1 changed file with 13 additions and 14 deletions.
27 changes: 13 additions & 14 deletions app/core/management/commands/obfuscate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def handle(self, *args, **kwargs):
self.stdout.write("\nObfuscating personal information...")
disable_signals()

fake = Faker('en_AU')
fake = Faker("en_AU")

clients = Client.objects.all()
emails = Email.objects.all()
Expand All @@ -46,27 +46,27 @@ def handle(self, *args, **kwargs):
| Q(is_superuser=True)
).distinct()

for t in tenancies:
for t in tenancies.iterator():
t.address = fake.street_address()
t.suburb = fake.city()
t.postcode = fake.postcode()
t.save()

for u in users:
for u in users.iterator():
u.email = fake.unique.email()
u.username = u.email # Username same as fake email.
u.username = u.email # Username same as fake email.
u.first_name = fake.first_name()
u.last_name = fake.last_name()
u.save()

for p in people:
for p in people.iterator():
p.full_name = fake.name()
p.email = fake.email()
p.address = fake.address()
p.phone_number = fake.phone_number()
p.save()

for c in clients:
for c in clients.iterator():
c.first_name = fake.first_name()
c.last_name = fake.last_name()
if c.preferred_name:
Expand All @@ -75,7 +75,7 @@ def handle(self, *args, **kwargs):
c.phone_number = fake.phone_number()
c.save()

for i in issues:
for i in issues.iterator():
if i.outcome_notes:
i.outcome_notes = " ".join(fake.sentences())

Expand All @@ -87,11 +87,11 @@ def handle(self, *args, **kwargs):
}
i.save()

for n in notes:
for n in notes.iterator():
n.text = " ".join(fake.sentences())
n.save()

for e in emails:
for e in emails.iterator():
e.received_data = None
e.subject = fake.sentence()
e.text = "\n\n".join(fake.paragraphs())
Expand All @@ -104,12 +104,10 @@ def handle(self, *args, **kwargs):
e.from_address = fake.email()
if e.to_address:
e.to_address = fake.email()
e.cc_addresses = [
fake.email() for _ in e.cc_addresses
]
e.cc_addresses = [fake.email() for _ in e.cc_addresses]
e.save()

for s in services:
for s in services.iterator():
if s.notes:
s.notes = " ".join(fake.sentences())
s.save()
Expand All @@ -118,14 +116,15 @@ def handle(self, *args, **kwargs):
email_attachment = os.path.join(EmailAttachment.UPLOAD_KEY, file_name)
file_upload = os.path.join(FileUpload.UPLOAD_KEY, file_name)

bytes = fake.image(image_format='pdf')
bytes = fake.image(image_format="pdf")
default_storage.save(email_attachment, BytesIO(bytes))
default_storage.save(file_upload, BytesIO(bytes))

# Replace files attached to emails
EmailAttachment.objects.update(
file=email_attachment, content_type="application/pdf"
)

# Replace uploaded files
FileUpload.objects.update(file=file_upload)

Expand Down

0 comments on commit 062af47

Please sign in to comment.