Commit
chore: remove pipeline legacy2023 (#2028)
* [back] chore: remove usage of legacy2023 pipeline in backend

* [solidago] chore: remove pipeline legacy2023

* remove trust_algo from 'vouch' Django app; it is now part of the Solidago pipeline

* remove redundant computation for scaling calibration users in PipelineInput

* bump solidago version
amatissart authored Nov 28, 2024
1 parent 633184f commit f7d6093
Showing 23 changed files with 186 additions and 1,382 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/backend-ci.yml
@@ -38,7 +38,7 @@ jobs:
- uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: '3.9'

67 changes: 7 additions & 60 deletions backend/ml/inputs.py
@@ -2,8 +2,7 @@
from typing import Optional

import pandas as pd
from django.db.models import Case, F, Q, QuerySet, When
from django.db.models.expressions import RawSQL
from django.db.models import F, Q
from solidago.pipeline import PipelineInput

from core.models import User
@@ -12,52 +11,14 @@
ContributorRating,
ContributorRatingCriteriaScore,
ContributorScaling,
Entity,
)
from vouch.models import Voucher


class MlInputFromDb(PipelineInput):
SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE = 20

def __init__(self, poll_name: str):
self.poll_name = poll_name

def get_scaling_calibration_users(self) -> QuerySet[User]:
n_alternatives = (
Entity.objects.filter(comparisons_entity_1__poll__name=self.poll_name)
.union(Entity.objects.filter(comparisons_entity_2__poll__name=self.poll_name))
.count()
)
users = User.objects.alias(
n_compared_entities=RawSQL(
"""
SELECT COUNT(DISTINCT e.id)
FROM tournesol_entity e
INNER JOIN tournesol_comparison c
ON (c.entity_1_id = e.id OR c.entity_2_id = e.id)
INNER JOIN tournesol_poll p
ON (p.id = c.poll_id AND p.name = %s)
WHERE c.user_id = "core_user"."id"
""",
(self.poll_name,),
)
)
if n_alternatives <= self.SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE:
# The number of alternatives is low enough to consider as calibration users
# all trusted users who have compared all alternatives.
return users.filter(
is_active=True,
trust_score__gt=self.SCALING_CALIBRATION_MIN_TRUST_SCORE,
n_compared_entities__gte=n_alternatives,
)

return users.filter(
is_active=True,
trust_score__gt=self.SCALING_CALIBRATION_MIN_TRUST_SCORE,
n_compared_entities__gte=self.SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE,
).order_by("-n_compared_entities")[: self.MAX_SCALING_CALIBRATION_USERS]

def get_comparisons(self, criterion=None, user_id=None) -> pd.DataFrame:
scores_queryset = ComparisonCriteriaScore.objects.filter(
comparison__poll__name=self.poll_name,
@@ -100,33 +61,19 @@ def get_comparisons(self, criterion=None, user_id=None) -> pd.DataFrame:
def ratings_properties(self):
# This makes sure that `get_scaling_calibration_users()` is evaluated separately, as the
# table names mentioned in its RawSQL query could conflict with the current queryset.
scaling_calibration_user_ids = list(self.get_scaling_calibration_users().values_list("id"))
values = (
ContributorRating.objects.filter(
poll__name=self.poll_name,
)
.annotate(
is_scaling_calibration_user=Case(
When(user__in=scaling_calibration_user_ids, then=True),
default=False,
),
)
.values(
"user_id",
"entity_id",
"is_public",
"is_scaling_calibration_user",
trust_score=F("user__trust_score"),
)
values = ContributorRating.objects.filter(
poll__name=self.poll_name,
).values(
"user_id",
"entity_id",
"is_public",
)
if len(values) == 0:
return pd.DataFrame(
columns=[
"user_id",
"entity_id",
"is_public",
"is_scaling_calibration_user",
"trust_score",
]
)
return pd.DataFrame(values)
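For context, a minimal usage sketch of the simplified input class (the poll name below is an illustrative assumption; trust scores and calibration users are now handled inside the Solidago pipeline rather than in this queryset):

from ml.inputs import MlInputFromDb

# Illustrative poll name; any existing poll works the same way.
ml_input = MlInputFromDb(poll_name="videos")
ratings = ml_input.ratings_properties()
# After this change the ratings are fetched without the trust-score join;
# the DataFrame exposes "user_id", "entity_id" and "is_public".
print(ratings.head())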
7 changes: 0 additions & 7 deletions backend/ml/mehestan/parameters.py

This file was deleted.

108 changes: 0 additions & 108 deletions backend/ml/mehestan/run.py

This file was deleted.

71 changes: 2 additions & 69 deletions backend/ml/outputs.py
@@ -6,7 +6,6 @@
import numpy as np
import pandas as pd
from django.db import transaction
from solidago.pipeline.legacy2023.global_scores import get_squash_function
from solidago.pipeline.outputs import PipelineOutput

from core.models import User
@@ -21,9 +20,6 @@
)
from tournesol.models.poll import ALGORITHM_MEHESTAN

from .inputs import MlInputFromDb
from .mehestan.parameters import MehestanParameters

logger = logging.getLogger(__name__)


@@ -40,8 +36,8 @@ def __init__(

@cached_property
def poll(self) -> Poll:
# Retrieving the poll instance lazily allows to be use this instance
# in a forked process. See the function `run_mehestan()`.
# Retrieving the poll instance lazily allows this instance to be used
# in a forked process (e.g. with multiprocessing).
return Poll.objects.get(name=self.poll_name)

def save_trust_scores(self, trusts: pd.DataFrame):
@@ -92,11 +88,6 @@ def save_individual_scores(
scores: pd.DataFrame,
single_user_id: Optional[int] = None,
):
if "score" not in scores:
# Scaled "score" and "uncertainty" need to be computed
# based on raw_score and raw_uncertainty
scores = apply_score_scalings(self.poll, scores)

if "voting_right" not in scores:
# Row contains `voting_right` when it comes from a full ML run, but not in the
# case of online individual updates. As online updates do not update the
@@ -246,61 +237,3 @@ def entities_iterator():
[ent.single_poll_rating for ent in batch],
fields=["tournesol_score"],
)


def apply_score_scalings(poll: Poll, contributor_scores: pd.DataFrame):
"""
Apply individual and poll-level scalings based on input "raw_score" and "raw_uncertainty".
Params:
poll: Poll,
contributor_scores: DataFrame with columns:
user_id: int
entity_id: int
criterion: str
raw_score: float
raw_uncertainty: float
Returns:
DataFrame with additional columns "score" and "uncertainty".
"""
if poll.algorithm != ALGORITHM_MEHESTAN:
contributor_scores["score"] = contributor_scores["raw_score"]
contributor_scores["uncertainty"] = contributor_scores["raw_uncertainty"]
return contributor_scores

ml_input = MlInputFromDb(poll_name=poll.name)
scalings = ml_input.get_user_scalings().set_index(["user_id", "criterion"])
contributor_scores = contributor_scores.join(
scalings, on=["user_id", "criterion"], how="left"
).fillna(
{
"scale": 1.0,
"translation": 0.0,
"scale_uncertainty": 0.0,
"translation_uncertatinty": 0.0,
}
)

# Apply individual scaling
contributor_scores["uncertainty"] = (
contributor_scores["scale"] * contributor_scores["raw_uncertainty"]
+ contributor_scores["scale_uncertainty"]
* contributor_scores["raw_score"].abs()
+ contributor_scores["translation_uncertainty"]
)
contributor_scores["score"] = (
contributor_scores["raw_score"] * contributor_scores["scale"]
+ contributor_scores["translation"]
)

# Apply score squashing
squash_function = get_squash_function(MehestanParameters())
contributor_scores["uncertainty"] = 0.5 * (
squash_function(contributor_scores["score"] + contributor_scores["uncertainty"])
- squash_function(
contributor_scores["score"] - contributor_scores["uncertainty"]
)
)
contributor_scores["score"] = squash_function(contributor_scores["score"])
return contributor_scores
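For reference, the deleted squashing step mapped the unbounded scaled scores into a fixed interval. A minimal sketch of such a squash function, assuming the usual Mehestan form bounded by the maximum scaled score (the exact implementation lived in solidago.pipeline.legacy2023.global_scores.get_squash_function, now removed, so the formula here is an assumption):

import numpy as np

def squash(score: np.ndarray, max_scaled_score: float = 100.0) -> np.ndarray:
    # Odd, strictly increasing map from the real line onto
    # (-max_scaled_score, +max_scaled_score).
    return max_scaled_score * score / np.sqrt(1.0 + score ** 2)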
25 changes: 6 additions & 19 deletions backend/tournesol/lib/public_dataset.py
@@ -10,14 +10,14 @@
from datetime import datetime
from typing import Optional

import solidago
from django.conf import settings
from django.db.models import QuerySet
from django.utils import timezone

from ml.mehestan.run import MehestanParameters
from ml.management.commands.ml_train import get_solidago_pipeline
from tournesol.entities.base import UID_DELIMITER
from vouch.models import Voucher
from vouch.trust_algo import SINK_VOUCH, TRUSTED_EMAIL_PRETRUST, VOUCH_DECAY

# The standard decimal precision of floating point numbers appearing in the
# dataset. Very small numbers can use a higher precision.
@@ -245,28 +245,15 @@ def write_metadata_file(write_target, data_until: datetime) -> None:
Write the metadata as JSON in `write_target`, an
object supporting the Python file API.
"""
mehestan_params = MehestanParameters()

solidago_pipeline = get_solidago_pipeline()
metadata_dict = {
"data_included_until": data_until.isoformat(),
"generated_by": settings.MAIN_URL,
"tournesol_version": settings.TOURNESOL_VERSION,
"license": "ODC-By-1.0",
"algorithms_parameters": {
"byztrust": {
"SINK_VOUCH": SINK_VOUCH,
"TRUSTED_EMAIL_PRETRUST": TRUSTED_EMAIL_PRETRUST,
"VOUCH_DECAY": VOUCH_DECAY,
},
"individual_scores": mehestan_params.indiv_algo.get_metadata(),
"mehestan": {
"W": mehestan_params.W,
"VOTE_WEIGHT_PUBLIC_RATINGS": mehestan_params.vote_weight_public_ratings,
"VOTE_WEIGHT_PRIVATE_RATINGS": mehestan_params.vote_weight_private_ratings,
"OVER_TRUST_BIAS": mehestan_params.over_trust_bias,
"OVER_TRUST_SCALE": mehestan_params.over_trust_scale,
"MAX_SCALED_SCORE": mehestan_params.max_squashed_score,
},
"solidago": {
"version": solidago.__version__,
"pipeline": solidago_pipeline.to_json()
},
}
json.dump(metadata_dict, write_target, indent=2)
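For illustration, the metadata written by the new code has roughly the following shape (every value below is a placeholder; the exact "pipeline" payload depends on Pipeline.to_json() in the installed Solidago version):

metadata_example = {
    "data_included_until": "2024-11-28T00:00:00+00:00",
    "generated_by": "https://tournesol.app",  # settings.MAIN_URL (placeholder)
    "tournesol_version": "x.y.z",             # placeholder
    "license": "ODC-By-1.0",
    "algorithms_parameters": {
        "solidago": {
            "version": "0.0.0",  # solidago.__version__ (placeholder)
            "pipeline": {},      # solidago_pipeline.to_json(), structure omitted
        },
    },
}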
2 changes: 2 additions & 0 deletions backend/tournesol/tests/test_api_comparison.py
@@ -1,5 +1,6 @@
import datetime
from copy import deepcopy
from unittest import skip
from unittest.mock import patch

from django.core.management import call_command
@@ -1371,6 +1372,7 @@ def setUp(self):

self.client = APIClient()

@skip("Online updates not implemented in Solidago")
@override_settings(
UPDATE_MEHESTAN_SCORES_ON_COMPARISON=True,
MEHESTAN_MULTIPROCESSING=False,