From 4f4951cdcd76b57813c85f7dd80d918751f6fea3 Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Wed, 6 Nov 2024 08:09:05 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=B8(backend)=20improve=20users=20simil?= =?UTF-8?q?arity=20search=20and=20sort=20results?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some edge cases, the domain part the email addresse is longer than the name part. Users searches by email similarity then return a lot of unsorted results. We can improve this by being more demanding on similarity when the query looks like an email. Sorting results by the similarity score is also an obvious improvement. At the moment, we still think it is good to propose results with a weak similarity on the name part because we want to avoid as much as possible creating duplicate users by inviting one of is many emails, a user who is already in our database. Fixes 399 --- CHANGELOG.md | 4 +++ src/backend/core/api/viewsets.py | 14 ++++++++ src/backend/core/tests/test_api_users.py | 42 ++++++++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23e673caf..935ca21c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to ## [Unreleased] +## Changed + +- 🚸(backend) improve users similarity search and sort results #391 + ## [1.7.0] - 2024-10-24 ## Added diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 98560eb84..25d56c627 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -6,6 +6,7 @@ from django.conf import settings from django.contrib.postgres.aggregates import ArrayAgg +from django.contrib.postgres.search import TrigramSimilarity from django.core.exceptions import ValidationError from django.core.files.storage import default_storage from django.db.models import ( @@ -156,8 +157,21 @@ def get_queryset(self): # Filter users by email similarity if query := self.request.GET.get("q", ""): + # For performance reasons we filter first by similarity, which relies on an index, + # then only calculate precise similarity scores for sorting purposes queryset = queryset.filter(email__trigram_word_similar=query) + queryset = queryset.annotate( + similarity=TrigramSimilarity("email", query) + ) + # When the query only is on the name part, we should try to make many proposals + # But when the query looks like an email we should only propose serious matches + threshold = 0.6 if "@" in query else 0.1 + + queryset = queryset.filter(similarity__gt=threshold).order_by( + "-similarity" + ) + return queryset @decorators.action( diff --git a/src/backend/core/tests/test_api_users.py b/src/backend/core/tests/test_api_users.py index d95153f20..e739d4d1f 100644 --- a/src/backend/core/tests/test_api_users.py +++ b/src/backend/core/tests/test_api_users.py @@ -69,6 +69,48 @@ def test_api_users_list_query_email(): assert user_ids == [str(nicole.id), str(frank.id)] +def test_api_users_list_query_email_matching(): + """While filtering by email, results should be filtered and sorted by similarity""" + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + alice = factories.UserFactory(email="alice.johnson@example.gouv.fr") + factories.UserFactory(email="jane.smith@example.gouv.fr") + michael_wilson = factories.UserFactory(email="michael.wilson@example.gouv.fr") + factories.UserFactory(email="david.jones@example.gouv.fr") + michael_brown = factories.UserFactory(email="michael.brown@example.gouv.fr") + factories.UserFactory(email="sophia.taylor@example.gouv.fr") + + response = client.get( + "/api/v1.0/users/?q=michael.johnson@example.gouv.f", + ) + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()["results"]] + assert user_ids == [str(michael_wilson.id)] + + response = client.get("/api/v1.0/users/?q=michael.johnson@example.gouv.fr") + + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()["results"]] + assert user_ids == [str(michael_wilson.id), str(alice.id), str(michael_brown.id)] + + response = client.get( + "/api/v1.0/users/?q=ajohnson@example.gouv.f", + ) + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()["results"]] + assert user_ids == [str(alice.id)] + + response = client.get( + "/api/v1.0/users/?q=michael.wilson@example.gouv.f", + ) + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()["results"]] + assert user_ids == [str(michael_wilson.id)] + + def test_api_users_list_query_email_exclude_doc_user(): """ Authenticated users should be able to list users