Skip to content

Commit

Permalink
[affiliation] Improve performance of affiliation jobs
Browse files Browse the repository at this point in the history
The current algorithm was making unnecessary
calls to 'find_individual_by_uuid' when the
generic affiliation job was run; that is,
when the list of individuals to affiliate was
every individual in the database.

To improve it, we have followed the same approach
we used for the matching recommendations: we get
the full list of individuals with a direct query
using Django's API.

Signed-off-by: Santiago Dueñas <[email protected]>
  • Loading branch information
sduenas committed Dec 11, 2023
1 parent e884ae7 commit 544343e
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 38 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
title: Performance on affiliation recommendations improved
category: performance
author: Santiago Dueñas <[email protected]>
issue: null
notes: >
We have improved the affiliation performance by
one order of magnitude by removing unnecessary
queries to the database.
49 changes: 25 additions & 24 deletions sortinghat/core/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ def recommend_affiliations(ctx, uuids=None, last_modified=MIN_PERIOD_DATE):

if not uuids:
logger.info(f"Running job {job.id} 'recommend affiliations'; uuids='all'; ...")
uuids = Individual.objects.filter(last_modified__gte=last_modified).values_list('mk', flat=True).iterator()
else:
logger.info(f"Running job {job.id} 'recommend affiliations'; uuids={uuids}; ...")
uuids = iter(uuids)
Expand All @@ -168,25 +167,29 @@ def recommend_affiliations(ctx, uuids=None, last_modified=MIN_PERIOD_DATE):
job_ctx = SortingHatContext(ctx.user, job.id, ctx.tenant)

# Create an empty transaction to log which job
# will generate the enroll transactions.
# will generate the 'enroll' transactions.
trxl = TransactionsLog.open('recommend_affiliations', job_ctx)

for chunk in _iter_split(uuids, size=MAX_CHUNK_SIZE):
for rec in engine.recommend('affiliation', chunk):
results[rec.key] = rec.options
for rec in engine.recommend('affiliation', uuids, last_modified):
results[rec.key] = rec.options

for org_name in rec.options:
try:
individual = find_individual_by_uuid(rec.key)
org = find_organization(org_name)
except NotFoundError:
logger.warning(f"Job {job.id} 'Individual {rec.key} not found'")
logger.warning(f"Job {job.id} 'Organization {org_name} not found'")
continue
for org_name in rec.options:
try:
org = find_organization(org_name)
except NotFoundError:
logger.warning(f"Job {job.id} 'Organization {org_name} not found'")
continue
AffiliationRecommendation.objects.get_or_create(individual=individual,
organization=org)

try:
with transaction.atomic():
AffiliationRecommendation.objects.create(individual_id=rec.mk,
organization=org)
except IntegrityError:
logger.debug(
f"Job {job.id} 'Unable to create affiliation recommendation for"
f"Individual {rec.key} and Organization {org_name}"
)

trxl.close()

logger.info(
Expand Down Expand Up @@ -377,7 +380,6 @@ def affiliate(ctx, uuids=None, last_modified=MIN_PERIOD_DATE):

if not uuids:
logger.info(f"Running job {job.id} 'affiliate'; uuids='all'; ...")
uuids = Individual.objects.filter(last_modified__gte=last_modified).values_list('mk', flat=True).iterator()
else:
logger.info(f"Running job {job.id} 'affiliate'; uuids={uuids}; ...")
uuids = iter(uuids)
Expand All @@ -396,19 +398,18 @@ def affiliate(ctx, uuids=None, last_modified=MIN_PERIOD_DATE):
job_ctx = SortingHatContext(ctx.user, job.id, ctx.tenant)

# Create an empty transaction to log which job
# will generate the enroll transactions.
# will generate the 'enroll' transactions.
trxl = TransactionsLog.open('affiliate', job_ctx)

nsuccess = 0

for chunk in _iter_split(uuids, size=MAX_CHUNK_SIZE):
for rec in engine.recommend('affiliation', chunk):
affiliated, errs = _affiliate_individual(job_ctx, rec.key, rec.options)
results[rec.key] = affiliated
errors.extend(errs)
for rec in engine.recommend('affiliation', uuids, last_modified):
affiliated, errs = _affiliate_individual(job_ctx, rec.key, rec.options)
results[rec.key] = affiliated
errors.extend(errs)

if affiliated:
nsuccess += 1
if affiliated:
nsuccess += 1

trxl.close()

Expand Down
44 changes: 30 additions & 14 deletions sortinghat/core/recommendations/affiliation.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
find_domain,
search_enrollments_in_period)
from ..errors import NotFoundError
from ..models import (Individual,
MIN_PERIOD_DATE)


EMAIL_ADDRESS_PATTERN = re.compile(r"^(?P<email>[^\s@]+@[^\s@.]+\.[^\s@]+)$")
Expand All @@ -35,7 +37,7 @@
logger = logging.getLogger(__name__)


def recommend_affiliations(uuids):
def recommend_affiliations(uuids, last_modified=MIN_PERIOD_DATE):
"""Recommend organizations for a list of individuals.
Returns a generator of affiliation recommendations
Expand All @@ -59,23 +61,37 @@ def recommend_affiliations(uuids):
the individual is already enrolled.
:param uuids: list of individual keys
:param last_modified: when no uuids are given, only consider
individuals modified on or after this date
:returns: a generator of recommendations
"""
logger.debug(
f"Generating affiliation recommendations; "
f"uuids={uuids}; ..."
)
if uuids:
logger.debug(
f"Generating affiliation recommendations; "
f"uuids={uuids}; ..."
)

for uuid in uuids:
try:
individual = find_individual_by_uuid(uuid)
except NotFoundError:
continue
else:
yield uuid, individual.mk, _suggest_affiliations(individual)

for uuid in uuids:
try:
individual = find_individual_by_uuid(uuid)
except NotFoundError:
continue
else:
yield (uuid, individual.mk, _suggest_affiliations(individual))
logger.info(f"Affiliation recommendations generated; uuids='{uuids}'")
else:
logger.debug(
"Generating affiliation recommendations; uuids='all'; ..."
)

individuals = Individual.objects.filter(
last_modified__gte=last_modified).order_by('mk').iterator()

logger.info(f"Affiliation recommendations generated; uuids='{uuids}'")
for individual in individuals:
yield individual.mk, individual.mk, _suggest_affiliations(individual)
logger.info("Affiliation recommendations generated; uuids=all")


def _suggest_affiliations(individual):
Expand Down Expand Up @@ -128,7 +144,7 @@ def _is_enrolled(individual, org_name):
return len(result) > 0


@functools.lru_cache()
@functools.lru_cache(512)
def _find_matching_domain(domain):
"""Look for domains and sub-domains that match with the given one."""

Expand Down

0 comments on commit 544343e

Please sign in to comment.