Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improved autofix strategy #148

Open
wants to merge 34 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
4615637
make pull request
aditya1503 Nov 16, 2023
2a7cf91
cleaned skeleton code
aditya1503 Nov 17, 2023
e7a3d07
cleanup
aditya1503 Nov 17, 2023
72fc919
add type hinting
aditya1503 Nov 17, 2023
d67bbc3
address PR comments
aditya1503 Nov 18, 2023
fc4bf7c
Update cleanlab_studio/internal/util.py
aditya1503 Nov 20, 2023
9f00909
linting + doc change
aditya1503 Nov 20, 2023
d2a3432
set ambiguous to 0
aditya1503 Nov 22, 2023
6bcec4c
things to port to backend
aditya1503 Nov 22, 2023
cc52ce2
Updated code for different strategies
sanjanag Dec 1, 2023
62efa2d
Fixed apply method
sanjanag Dec 1, 2023
e5c4872
Added test for computing rows for exclusion
sanjanag Dec 2, 2023
02294c8
Improved formatting
sanjanag Dec 2, 2023
1d644a0
Added tests for updating label issue rows based on threshold
sanjanag Dec 2, 2023
3ff2507
Fixed mypy issue
sanjanag Dec 2, 2023
7235b40
Added test for checking right rows are dropped for non near duplicate…
sanjanag Dec 2, 2023
1b99d60
Added test for checking right rows are dropped for near duplicate issues
sanjanag Dec 2, 2023
330aa44
Added get defaults method
sanjanag Dec 5, 2023
a19c88c
Return cleanset with original indices
sanjanag Dec 5, 2023
69ccda6
Merge branch 'main' into improve_autofix
sanjanag Dec 5, 2023
19143a3
Removed unimplemented test
sanjanag Dec 5, 2023
e5b97f5
removed unnecessary merge change
sanjanag Dec 5, 2023
20a532c
Fixed tests
sanjanag Dec 5, 2023
3bbfc1c
Fixed mypy error
sanjanag Dec 5, 2023
b892e87
Added newline
sanjanag Dec 5, 2023
b54a0a7
Fixed formatting
sanjanag Dec 5, 2023
f870e04
added tests for dropped indices
sanjanag Dec 6, 2023
eb106d1
Added docs for user facing method
sanjanag Dec 6, 2023
a7acfa6
Black formatting
sanjanag Dec 6, 2023
1f0344d
Merge remote-tracking branch 'origin/main' into improve_autofix
aditya1503 Dec 13, 2023
692efe4
merge main
aditya1503 Dec 13, 2023
afbe4a9
add github change request
aditya1503 Dec 13, 2023
7b96faa
Update cleanlab_studio/studio/studio.py
aditya1503 Dec 18, 2023
b31674c
linting
aditya1503 Dec 18, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 149 additions & 2 deletions cleanlab_studio/internal/util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import pathlib
from typing import Any, Optional, TypeVar, Union
from typing import Any, Optional, TypeVar, Union, List
import math

import numpy as np
import copy

import pandas as pd

try:
Expand Down Expand Up @@ -63,3 +64,149 @@ def check_none(x: Any) -> bool:

def check_not_none(x: Any) -> bool:
    """Return True when *x* is not a null-like value (logical inverse of check_none)."""
    is_null = check_none(x)
    return not is_null


def _get_autofix_default_thresholds(strategy: str) -> dict: # Studio team port to backend
"""returns default percentage-wise params of autofix"""

strategy_defaults = {
"optimized_training_data": {
"drop_ambiguous": 0.0,
"drop_label_issue": 0.5,
"drop_near_duplicate": 0.2,
"drop_outlier": 0.5,
"relabel_confidence_threshold": 0.95,
},
"drop_all_issues": {
"drop_ambiguous": 1.0,
"drop_label_issue": 1.0,
"drop_near_duplicate": 1.0,
"drop_outlier": 1.0,
},
"suggested_actions": {
"drop_near_duplicate": 1.0,
"drop_outlier": 1.0,
"relabel_confidence_threshold": 0.0,
},
}
return strategy_defaults[strategy]


def get_autofix_defaults(
    cleanset_df: pd.DataFrame, strategy: str
) -> dict:  # Studio team port to backend
    """
    Generate default values for autofix parameters based on the size of the cleaned dataset.

    Args:
        cleanset_df (pd.DataFrame): Cleanset containing boolean ``is_<issue>``
            columns (e.g. ``is_label_issue``) for each issue type.
        strategy (str): Name of a default strategy understood by
            ``_get_autofix_default_thresholds``.

    Returns:
        dict: Parameter values where each ``drop_*`` fraction has been
        converted into an absolute number of rows (rounded up); all other
        parameters are passed through unchanged.
    """
    default_thresholds = _get_autofix_default_thresholds(strategy)
    default_values = {}

    for param_type, param_value in default_thresholds.items():
        if param_type.startswith("drop_"):
            # Convert the drop fraction into a row count for this issue type,
            # rounding up so any non-zero fraction drops at least one flagged row.
            issue_name = param_type[5:]
            num_rows = cleanset_df[f"is_{issue_name}"].sum()
            default_values[param_type] = math.ceil(num_rows * param_value)
        else:
            default_values[param_type] = param_value
    return default_values


def _get_top_fraction_ids( # Studio team port to backend
cleanset_df: pd.DataFrame, issue_name: str, num_rows: int, asc=True
) -> List[str]:
"""
This will only return the IDs of datapoints to drop for a given setting of the num_rows to drop during autofix.
Parameters:
- cleanset_df (pd.DataFrame): The input DataFrame containing the cleanset.
- name_col (str): The name of the column indicating the category for which the top rows should be extracted.
- num_rows (int): The number of rows to be extracted.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In autofix, we can simply multiply the fraction of issues that are the cleanset defaults by the number of datapoints to get this.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right when we spoke originally, we wanted this call to be similar to the Studio web interface call, hence I rewrote it this way, it was floating percentage before.
the function _get_autofix_defaults does the multiplication by number of datapoints

- asc (bool, optional): If True, the rows are sorted in ascending order based on the score column; if False, in descending order.
Default is True.

Returns:
- list: A list of row indices representing the top specified number of rows based on the specified score column.
"""
bool_column_name = f"is_{issue_name}"

# Construct a filter based on the 'label_issue' variable
filter_condition = cleanset_df[bool_column_name]

# Create a new DataFrame based on the filter
filtered_df = cleanset_df[filter_condition]
if issue_name == "near_duplicate":
# Group by the 'near_duplicate_cluster_ID' column
df_n = filtered_df.sort_values(by="near_duplicate_score").reset_index(drop=True)
sorted_df = df_n.head(num_rows)
grouped_df = sorted_df.groupby("near_duplicate_cluster_id")

# Initialize an empty list to store the aggregated indices
aggregated_indices = []

# Iterate over each group
for group_name, group_df in grouped_df:
# Sort the group DataFrame by the 'near_duplicate_score' column in ascending order
sorted_group_df = group_df.sort_values(
by=f"{issue_name}_score", ascending=asc
).reset_index(drop=True)

# Extract every other index and append to the aggregated indices list
selected_indices = sorted_group_df.loc[::2, "cleanlab_row_ID"]
aggregated_indices.extend(selected_indices)

return aggregated_indices
else:
# Construct the boolean column name with 'is_' prefix and 'label_issue_score' suffix
score_col_name = f"{issue_name}_score"

# Sort the filtered DataFrame by the constructed boolean column in descending order
sorted_df = filtered_df.sort_values(by=score_col_name, ascending=asc)

# Extract the top specified number of rows and return the 'cleanlab_row_ID' column
top_rows_ids = sorted_df["cleanlab_row_ID"].head(num_rows)

return top_rows_ids


def _update_label_based_on_confidence(row, conf_threshold): # Studio team port to backend
"""Update the label and is_issue based on confidence threshold if there is a label issue.

Args:
row (pd.Series): The row containing label information.
conf_threshold (float): The confidence threshold for updating the label.

Returns:
pd.Series: The updated row.
"""
if row["is_label_issue"] and row["suggested_label_confidence_score"] > conf_threshold:
row[
"is_issue"
] = False # make sure this does not affect back end. We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled
row["label"] = row["suggested_label"]
return row


def apply_autofixed_cleanset_to_new_dataframe(  # Studio team port to backend
    original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: dict
) -> pd.DataFrame:
    """Apply a cleanset to update original dataset labels and remove top rows based on specified parameters.

    Args:
        original_df (pd.DataFrame): The original dataset (indexed 0..n-1 to match
            ``cleanlab_row_ID`` in the cleanset).
        cleanset_df (pd.DataFrame): Cleanlab columns for the dataset, including
            ``cleanlab_row_ID`` and per-issue flag/score columns.
        parameters (dict): ``drop_<issue>`` row counts and, optionally,
            ``relabel_confidence_threshold``.

    Returns:
        pd.DataFrame: A new dataframe restricted to the original columns, with
        confident relabels applied and issue rows dropped.
    """
    # A pandas copy is sufficient here (the original used copy.deepcopy);
    # the merge below produces a new frame and never mutates original_df.
    original_df_copy = original_df.copy()
    original_columns = original_df_copy.columns
    merged_df = pd.merge(original_df_copy, cleanset_df, left_index=True, right_on="cleanlab_row_ID")

    # Bug fix: strategies such as "drop_all_issues" produce no
    # "relabel_confidence_threshold" key; previously this raised KeyError.
    # Skip the relabelling pass entirely when the threshold is absent.
    conf_threshold = parameters.get("relabel_confidence_threshold")
    if conf_threshold is not None:
        merged_df = merged_df.apply(
            lambda row: _update_label_based_on_confidence(row, conf_threshold=conf_threshold),
            axis=1,
        )

    indices_to_drop = set()
    for param_name, top_num in parameters.items():
        if param_name.startswith("drop_"):
            issue_name = param_name.replace("drop_", "")
            # asc=False: drop the highest-scoring (worst) rows for this issue.
            top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=False)
            indices_to_drop.update(top_percent_ids)

    merged_df = merged_df.drop(list(indices_to_drop), axis=0).reset_index(drop=True)
    return merged_df[original_columns]
50 changes: 43 additions & 7 deletions cleanlab_studio/studio/studio.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,28 @@
"""
Python API for Cleanlab Studio.
"""
from typing import Any, List, Literal, Optional, Union
import warnings
from typing import Any, List, Literal, Optional, Union, Dict

import numpy as np
import numpy.typing as npt
import pandas as pd

from . import inference
from . import trustworthy_language_model
from cleanlab_studio.errors import CleansetError
from cleanlab_studio.internal import clean_helpers, upload_helpers
from cleanlab_studio.internal.api import api
from cleanlab_studio.internal.settings import CleanlabSettings
from cleanlab_studio.internal.types import FieldSchemaDict
from cleanlab_studio.internal.util import (
init_dataset_source,
apply_autofixed_cleanset_to_new_dataframe,
_get_autofix_default_thresholds,
check_none,
check_not_none,
get_autofix_defaults,
init_dataset_source,
)
from cleanlab_studio.internal.settings import CleanlabSettings
from cleanlab_studio.internal.types import FieldSchemaDict

from . import inference, trustworthy_language_model

_pyspark_exists = api.pyspark_exists
if _pyspark_exists:
Expand Down Expand Up @@ -131,7 +134,7 @@ def apply_corrections(self, cleanset_id: str, dataset: Any, keep_excluded: bool
label_column = api.get_label_column_of_project(self._api_key, project_id)
id_col = api.get_id_column(self._api_key, cleanset_id)
if _pyspark_exists and isinstance(dataset, pyspark.sql.DataFrame):
from pyspark.sql.functions import row_number, monotonically_increasing_id, when, col
from pyspark.sql.functions import col, monotonically_increasing_id, row_number, when
from pyspark.sql.window import Window

cl_cols = self.download_cleanlab_columns(
Expand Down Expand Up @@ -383,3 +386,36 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None)

except (TimeoutError, CleansetError):
return False

def autofix_dataset(
    self,
    original_df: pd.DataFrame,
    cleanset_id: str,
    params: Optional[Dict[str, Union[int, float]]] = None,
    strategy: str = "optimized_training_data",
) -> pd.DataFrame:
    """
    This method returns the auto-fixed dataset.

    The dataset must be a pandas DataFrame (text or tabular datasets only).

    Args:
        original_df (pd.DataFrame): The original dataset the cleanset was computed for.
        cleanset_id (str): ID of cleanset.
        params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and
            number of rows to drop for each issue type. If not provided, default values will be used.

            Example:
                {
                    'drop_ambiguous': 9,
                    'drop_label_issue': 92,
                    'drop_near_duplicate': 1,
                    'drop_outlier': 3,
                    'relabel_confidence_threshold': 0.95
                }
        strategy (str): Name of the default-parameter strategy passed straight through to
            the defaults helper: "optimized_training_data", "drop_all_issues", or
            "suggested_actions". Ignored when ``params`` is provided.

    Returns:
        pd.DataFrame: A new dataframe after applying auto-fixes to the cleanset.

    """
    cleanset_df = self.download_cleanlab_columns(cleanset_id)
    if params is None:
        params = get_autofix_defaults(cleanset_df, strategy)
    print("Using autofix values:", params)
    return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params)
153 changes: 153 additions & 0 deletions tests/test_autofix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import pandas as pd
import pytest
from cleanlab_studio.internal.util import (
get_autofix_defaults,
_update_label_based_on_confidence,
_get_top_fraction_ids,
)
import numpy as np


class TestAutofix:
    """Unit tests for the autofix helpers in cleanlab_studio.internal.util."""

    @pytest.mark.parametrize(
        "strategy, expected_results",
        [
            (
                "optimized_training_data",
                {
                    "drop_ambiguous": 0,
                    "drop_label_issue": 2,
                    "drop_near_duplicate": 2,
                    "drop_outlier": 3,
                    "relabel_confidence_threshold": 0.95,
                },
            ),
            (
                "drop_all_issues",
                {
                    "drop_ambiguous": 10,
                    "drop_label_issue": 3,
                    "drop_near_duplicate": 6,
                    "drop_outlier": 6,
                },
            ),
            (
                "suggested_actions",
                {
                    "drop_near_duplicate": 6,
                    "drop_outlier": 6,
                    "relabel_confidence_threshold": 0.0,
                },
            ),
        ],
        ids=["optimized_training_data", "drop_all_issues", "suggested_actions"],
    )
    def test_get_autofix_defaults(self, strategy, expected_results):
        """Drop counts are derived from each strategy's fractions times the issue counts."""
        cleanlab_columns = pd.DataFrame()
        # Fixture: 3 label issues, 6 near duplicates, 6 outliers, 10 ambiguous rows.
        cleanlab_columns["is_label_issue"] = [True] * 3 + [False] * 7
        cleanlab_columns["is_near_duplicate"] = [True] * 6 + [False] * 4
        cleanlab_columns["is_outlier"] = [True] * 6 + [False] * 4
        cleanlab_columns["is_ambiguous"] = [True] * 10

        params = get_autofix_defaults(cleanlab_columns, strategy)
        assert params == expected_results

    @pytest.mark.parametrize(
        "row, expected_updated_row",
        [
            (
                # Confidence above threshold: relabel and clear is_issue.
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.6,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.6,
                    "label": "label_1",
                    "suggested_label": "label_1",
                    "is_issue": False,
                },
            ),
            (
                # Confidence exactly at threshold: row is left unchanged
                # (the comparison is strictly greater-than).
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.5,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.5,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
            ),
            (
                # Confidence below threshold: row is left unchanged.
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.4,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.4,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
            ),
            (
                # Not a label issue: row is left unchanged regardless of confidence.
                {
                    "is_label_issue": False,
                    "suggested_label_confidence_score": 0.4,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
                {
                    "is_label_issue": False,
                    "suggested_label_confidence_score": 0.4,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
            ),
        ],
        ids=[
            "is a label issue with confidence score greater than threshold",
            "is a label issue with confidence score equal to threshold",
            "is a label issue with confidence score less than threshold",
            "is not a label issue",
        ],
    )
    def test_update_label_based_on_confidence(self, row, expected_updated_row):
        """Rows are relabeled only when flagged AND confidence strictly exceeds the threshold."""
        conf_threshold = 0.5
        updated_row = _update_label_based_on_confidence(row, conf_threshold)
        assert updated_row == expected_updated_row

    def test_get_top_fraction_ids(self):
        """For a generic issue, the lowest-scoring flagged rows (asc=True default) are returned."""
        cleanlab_columns = pd.DataFrame()

        cleanlab_columns["cleanlab_row_ID"] = np.arange(10)
        cleanlab_columns["is_dummy"] = [False] * 5 + [True] * 5
        cleanlab_columns["dummy_score"] = np.arange(10) * 0.1
        top_ids = _get_top_fraction_ids(cleanlab_columns, "dummy", 3)
        assert set(top_ids) == set([5, 6, 7])

    def test_get_top_fraction_ids_near_duplicate(self):
        """Near duplicates keep one representative per cluster (every other row is selected)."""
        cleanlab_columns = pd.DataFrame()

        cleanlab_columns["cleanlab_row_ID"] = np.arange(12)
        cleanlab_columns["is_near_duplicate"] = [False] * 6 + [True] * 6
        cleanlab_columns["near_duplicate_score"] = np.arange(12) * 0.1
        cleanlab_columns["near_duplicate_cluster_id"] = [None] * 6 + [0, 0, 1, 1, 1, 1]

        top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5)
        assert set(top_ids) == set([6, 8, 10])
Loading