Commit
chore: remove pipeline legacy2023 (#2028)
* [back] chore: remove usage of legacy2023 pipeline in backend

* [solidago] chore: remove pipeline legacy2023

* remove trust_algo from 'vouch' Django app; it is now part of the Solidago pipeline

* remove redundant computation for scaling calibration users in PipelineInput

* bump solidago version
amatissart authored Nov 28, 2024
1 parent 633184f commit f7d6093
Showing 23 changed files with 186 additions and 1,382 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/backend-ci.yml
@@ -38,7 +38,7 @@ jobs:
- uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: '3.9'

67 changes: 7 additions & 60 deletions backend/ml/inputs.py
@@ -2,8 +2,7 @@
from typing import Optional

import pandas as pd
from django.db.models import Case, F, Q, QuerySet, When
from django.db.models.expressions import RawSQL
from django.db.models import F, Q
from solidago.pipeline import PipelineInput

from core.models import User
@@ -12,52 +11,14 @@
ContributorRating,
ContributorRatingCriteriaScore,
ContributorScaling,
Entity,
)
from vouch.models import Voucher


class MlInputFromDb(PipelineInput):
SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE = 20

def __init__(self, poll_name: str):
self.poll_name = poll_name

def get_scaling_calibration_users(self) -> QuerySet[User]:
n_alternatives = (
Entity.objects.filter(comparisons_entity_1__poll__name=self.poll_name)
.union(Entity.objects.filter(comparisons_entity_2__poll__name=self.poll_name))
.count()
)
users = User.objects.alias(
n_compared_entities=RawSQL(
"""
SELECT COUNT(DISTINCT e.id)
FROM tournesol_entity e
INNER JOIN tournesol_comparison c
ON (c.entity_1_id = e.id OR c.entity_2_id = e.id)
INNER JOIN tournesol_poll p
ON (p.id = c.poll_id AND p.name = %s)
WHERE c.user_id = "core_user"."id"
""",
(self.poll_name,),
)
)
if n_alternatives <= self.SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE:
# The number of alternatives is low enough to consider as calibration users
# all trusted users who have compared all alternatives.
return users.filter(
is_active=True,
trust_score__gt=self.SCALING_CALIBRATION_MIN_TRUST_SCORE,
n_compared_entities__gte=n_alternatives,
)

return users.filter(
is_active=True,
trust_score__gt=self.SCALING_CALIBRATION_MIN_TRUST_SCORE,
n_compared_entities__gte=self.SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE,
).order_by("-n_compared_entities")[: self.MAX_SCALING_CALIBRATION_USERS]

def get_comparisons(self, criterion=None, user_id=None) -> pd.DataFrame:
scores_queryset = ComparisonCriteriaScore.objects.filter(
comparison__poll__name=self.poll_name,
@@ -100,33 +61,19 @@ def get_comparisons(self, criterion=None, user_id=None) -> pd.DataFrame:
def ratings_properties(self):
# This makes sure that `get_scaling_calibration_users()` is evaluated separately, as the
# table names mentioned in its RawSQL query could conflict with the current queryset.
scaling_calibration_user_ids = list(self.get_scaling_calibration_users().values_list("id"))
values = (
ContributorRating.objects.filter(
poll__name=self.poll_name,
)
.annotate(
is_scaling_calibration_user=Case(
When(user__in=scaling_calibration_user_ids, then=True),
default=False,
),
)
.values(
"user_id",
"entity_id",
"is_public",
"is_scaling_calibration_user",
trust_score=F("user__trust_score"),
)
values = ContributorRating.objects.filter(
poll__name=self.poll_name,
).values(
"user_id",
"entity_id",
"is_public",
)
if len(values) == 0:
return pd.DataFrame(
columns=[
"user_id",
"entity_id",
"is_public",
"is_scaling_calibration_user",
"trust_score",
]
)
return pd.DataFrame(values)
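For context, a minimal usage sketch of the simplified input class (the poll name below is an illustrative assumption; trust scores and calibration users are now handled inside the Solidago pipeline rather than in this queryset):

from ml.inputs import MlInputFromDb

# Illustrative poll name; any existing poll works the same way.
ml_input = MlInputFromDb(poll_name="videos")
ratings = ml_input.ratings_properties()
# After this change the ratings are fetched without the trust-score join;
# the DataFrame exposes "user_id", "entity_id" and "is_public".
print(ratings.head())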
7 changes: 0 additions & 7 deletions backend/ml/mehestan/parameters.py

This file was deleted.

108 changes: 0 additions & 108 deletions backend/ml/mehestan/run.py

This file was deleted.

71 changes: 2 additions & 69 deletions backend/ml/outputs.py
@@ -6,7 +6,6 @@
import numpy as np
import pandas as pd
from django.db import transaction
from solidago.pipeline.legacy2023.global_scores import get_squash_function
from solidago.pipeline.outputs import PipelineOutput

from core.models import User
@@ -21,9 +20,6 @@
)
from tournesol.models.poll import ALGORITHM_MEHESTAN

from .inputs import MlInputFromDb
from .mehestan.parameters import MehestanParameters

logger = logging.getLogger(__name__)


@@ -40,8 +36,8 @@ def __init__(

@cached_property
def poll(self) -> Poll:
# Retrieving the poll instance lazily allows to be use this instance
# in a forked process. See the function `run_mehestan()`.
# Retrieving the poll instance lazily allows this instance to be used
# in a forked process (e.g. with multiprocessing).
return Poll.objects.get(name=self.poll_name)

def save_trust_scores(self, trusts: pd.DataFrame):
@@ -92,11 +88,6 @@ def save_individual_scores(
scores: pd.DataFrame,
single_user_id: Optional[int] = None,
):
if "score" not in scores:
# Scaled "score" and "uncertainty" need to be computed
# based on raw_score and raw_uncertainty
scores = apply_score_scalings(self.poll, scores)

if "voting_right" not in scores:
# Row contains `voting_right` when it comes from a full ML run, but not in the
# case of online individual updates. As online updates do not update the
@@ -246,61 +237,3 @@ def entities_iterator():
[ent.single_poll_rating for ent in batch],
fields=["tournesol_score"],
)


def apply_score_scalings(poll: Poll, contributor_scores: pd.DataFrame):
"""
Apply individual and poll-level scalings based on input "raw_score" and "raw_uncertainty".
Params:
poll: Poll,
contributor_scores: DataFrame with columns:
user_id: int
entity_id: int
criterion: str
raw_score: float
raw_uncertainty: float
Returns:
DataFrame with additional columns "score" and "uncertainty".
"""
if poll.algorithm != ALGORITHM_MEHESTAN:
contributor_scores["score"] = contributor_scores["raw_score"]
contributor_scores["uncertainty"] = contributor_scores["raw_uncertainty"]
return contributor_scores

ml_input = MlInputFromDb(poll_name=poll.name)
scalings = ml_input.get_user_scalings().set_index(["user_id", "criterion"])
contributor_scores = contributor_scores.join(
scalings, on=["user_id", "criterion"], how="left"
).fillna(
{
"scale": 1.0,
"translation": 0.0,
"scale_uncertainty": 0.0,
"translation_uncertatinty": 0.0,
}
)

# Apply individual scaling
contributor_scores["uncertainty"] = (
contributor_scores["scale"] * contributor_scores["raw_uncertainty"]
+ contributor_scores["scale_uncertainty"]
* contributor_scores["raw_score"].abs()
+ contributor_scores["translation_uncertainty"]
)
contributor_scores["score"] = (
contributor_scores["raw_score"] * contributor_scores["scale"]
+ contributor_scores["translation"]
)

# Apply score squashing
squash_function = get_squash_function(MehestanParameters())
contributor_scores["uncertainty"] = 0.5 * (
squash_function(contributor_scores["score"] + contributor_scores["uncertainty"])
- squash_function(
contributor_scores["score"] - contributor_scores["uncertainty"]
)
)
contributor_scores["score"] = squash_function(contributor_scores["score"])
return contributor_scores
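For reference, the deleted squashing step mapped the unbounded scaled scores into a fixed interval. A minimal sketch of such a squash function, assuming the usual Mehestan form bounded by the maximum scaled score (the exact implementation lived in solidago.pipeline.legacy2023.global_scores.get_squash_function, now removed, so the formula here is an assumption):

import numpy as np

def squash(score: np.ndarray, max_scaled_score: float = 100.0) -> np.ndarray:
    # Odd, strictly increasing map from the real line onto
    # (-max_scaled_score, +max_scaled_score).
    return max_scaled_score * score / np.sqrt(1.0 + score ** 2)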
25 changes: 6 additions & 19 deletions backend/tournesol/lib/public_dataset.py
@@ -10,14 +10,14 @@
from datetime import datetime
from typing import Optional

import solidago
from django.conf import settings
from django.db.models import QuerySet
from django.utils import timezone

from ml.mehestan.run import MehestanParameters
from ml.management.commands.ml_train import get_solidago_pipeline
from tournesol.entities.base import UID_DELIMITER
from vouch.models import Voucher
from vouch.trust_algo import SINK_VOUCH, TRUSTED_EMAIL_PRETRUST, VOUCH_DECAY

# The standard decimal precision of floating point numbers appearing in the
# dataset. Very small numbers can use a higher precision.
@@ -245,28 +245,15 @@ def write_metadata_file(write_target, data_until: datetime) -> None:
Write the metadata as JSON in `write_target`, an
object supporting the Python file API.
"""
mehestan_params = MehestanParameters()

solidago_pipeline = get_solidago_pipeline()
metadata_dict = {
"data_included_until": data_until.isoformat(),
"generated_by": settings.MAIN_URL,
"tournesol_version": settings.TOURNESOL_VERSION,
"license": "ODC-By-1.0",
"algorithms_parameters": {
"byztrust": {
"SINK_VOUCH": SINK_VOUCH,
"TRUSTED_EMAIL_PRETRUST": TRUSTED_EMAIL_PRETRUST,
"VOUCH_DECAY": VOUCH_DECAY,
},
"individual_scores": mehestan_params.indiv_algo.get_metadata(),
"mehestan": {
"W": mehestan_params.W,
"VOTE_WEIGHT_PUBLIC_RATINGS": mehestan_params.vote_weight_public_ratings,
"VOTE_WEIGHT_PRIVATE_RATINGS": mehestan_params.vote_weight_private_ratings,
"OVER_TRUST_BIAS": mehestan_params.over_trust_bias,
"OVER_TRUST_SCALE": mehestan_params.over_trust_scale,
"MAX_SCALED_SCORE": mehestan_params.max_squashed_score,
},
"solidago": {
"version": solidago.__version__,
"pipeline": solidago_pipeline.to_json()
},
}
json.dump(metadata_dict, write_target, indent=2)
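For illustration, the metadata written by the new code has roughly the following shape (every value below is a placeholder; the exact "pipeline" payload depends on Pipeline.to_json() in the installed Solidago version):

metadata_example = {
    "data_included_until": "2024-11-28T00:00:00+00:00",
    "generated_by": "https://tournesol.app",  # settings.MAIN_URL (placeholder)
    "tournesol_version": "x.y.z",             # placeholder
    "license": "ODC-By-1.0",
    "algorithms_parameters": {
        "solidago": {
            "version": "0.0.0",  # solidago.__version__ (placeholder)
            "pipeline": {},      # solidago_pipeline.to_json(), structure omitted
        },
    },
}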
2 changes: 2 additions & 0 deletions backend/tournesol/tests/test_api_comparison.py
@@ -1,5 +1,6 @@
import datetime
from copy import deepcopy
from unittest import skip
from unittest.mock import patch

from django.core.management import call_command
@@ -1371,6 +1372,7 @@ def setUp(self):

self.client = APIClient()

@skip("Online updates not implemented in Solidago")
@override_settings(
UPDATE_MEHESTAN_SCORES_ON_COMPARISON=True,
MEHESTAN_MULTIPROCESSING=False,