From 461563753c7499ac06b3e37132552e450b5ceeed Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Fri, 17 Nov 2023 02:10:18 +0530 Subject: [PATCH 01/32] make pull request From 2a7cf915998f7e431eb117bbc1843276df38e0a3 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Sat, 18 Nov 2023 01:52:24 +0530 Subject: [PATCH 02/32] cleaned skeleton code --- cleanlab_studio/internal/util.py | 125 ++++++++++++++++++++++++++++++- cleanlab_studio/studio/studio.py | 15 ++++ 2 files changed, 139 insertions(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 08684427..11eeea15 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -2,7 +2,8 @@ from typing import Any, Optional, TypeVar, Union import math -import numpy as np +import copy + import pandas as pd try: @@ -63,3 +64,125 @@ def check_none(x: Any) -> bool: def check_not_none(x: Any) -> bool: return not check_none(x) + + +def _get_autofix_default_params(): + """returns default params of autofix""" + return { + "ambiguous": 0.2, + "label_issue": 0.5, + "near_duplicate": 0.2, + "outlier": 0.5, + "confidence_threshold": 0.95, + } + + +def _get_autofix_defaults(cleanset_df): + """ + Generate default values for autofix parameters based on the size of the cleaned dataset. + """ + default_params = _get_autofix_default_params() + default_values = {} + + for param_name, param_value in default_params.items(): + if param_name != "confidence_threshold": + num_rows = cleanset_df[f"is_{param_name}"].sum() + default_values[param_name] = math.ceil(num_rows * param_value) + else: + default_values[param_name] = param_value + return default_values + + +def _get_top_fraction_ids(cleanset_df, name_col, num_rows, asc=True): + """ + Extracts the top specified number of rows based on a specified score column from a DataFrame. + + Parameters: + - cleanset_df (pd.DataFrame): The input DataFrame containing the cleanset. + - name_col (str): The name of the column indicating the category for which the top rows should be extracted. + - num_rows (int): The number of rows to be extracted. + - asc (bool, optional): If True, the rows are sorted in ascending order based on the score column; if False, in descending order. + Default is True. + + Returns: + - list: A list of row indices representing the top specified number of rows based on the specified score column. 
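+
+    Example (hypothetical scores, for illustration only):
+    - If the cleanset flags four label issues whose label_issue_score values are
+      0.9, 0.8, 0.2 and 0.1, then _get_top_fraction_ids(cleanset_df, "label_issue", 2, asc=False)
+      returns the 'cleanlab_row_ID' values of the two highest-scoring flagged rows (0.9 and 0.8).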
+    """
+    bool_column_name = f"is_{name_col}"
+
+    # Build a boolean mask from the is_<issue> column
+    filter_condition = cleanset_df[bool_column_name]
+
+    # Create a new DataFrame containing only the flagged rows
+    filtered_df = cleanset_df[filter_condition]
+    if name_col == "near_duplicate":
+        # Keep the num_rows lowest-scoring rows, then group them by their
+        # 'near_duplicate_cluster_id'
+        df_n = filtered_df.sort_values(by="near_duplicate_score").reset_index(drop=True)
+        sorted_df = df_n.head(num_rows)
+        grouped_df = sorted_df.groupby("near_duplicate_cluster_id")
+
+        # Initialize an empty list to store the aggregated indices
+        aggregated_indices = []
+
+        # Iterate over each group
+        for group_name, group_df in grouped_df:
+            # Sort the group DataFrame by the score column in the requested order
+            sorted_group_df = group_df.sort_values(
+                by=f"{name_col}_score", ascending=asc
+            ).reset_index(drop=True)
+
+            # Extract every other index and append to the aggregated indices list
+            selected_indices = sorted_group_df.loc[::2, "cleanlab_row_ID"]
+            aggregated_indices.extend(selected_indices)
+
+        return aggregated_indices
+    else:
+        # Construct the score column name with the '_score' suffix
+        score_col_name = f"{name_col}_score"
+
+        # Sort the filtered DataFrame by the score column (order controlled by `asc`)
+        sorted_df = filtered_df.sort_values(by=score_col_name, ascending=asc)
+
+        # Extract the top specified number of rows and return the 'cleanlab_row_ID' column
+        top_rows_ids = sorted_df["cleanlab_row_ID"].head(num_rows)
+
+        return top_rows_ids
+
+
+def _update_label_based_on_confidence(row, conf_threshold):
+    """Update the label and is_issue based on confidence threshold if there is a label issue.
+
+    Args:
+        row (pd.Series): The row containing label information.
+        conf_threshold (float): The confidence threshold for updating the label.
+
+    Returns:
+        pd.Series: The updated row.
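+
+    Example (hypothetical row, for illustration only):
+        With conf_threshold=0.95, a row with is_label_issue=True,
+        suggested_label_confidence_score=0.97, label="cat" and suggested_label="dog"
+        comes back with label="dog" and is_issue=False; at or below the threshold
+        the row is returned unchanged.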
+ """ + if row["is_label_issue"] and row["suggested_label_confidence_score"] > conf_threshold: + row["is_issue"] = False + row["label"] = row["suggested_label"] + return row + + +def _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, parameters): + """Apply a cleanset to update original dataaset labels and remove top rows based on specified parameters.""" + original_df_copy = copy.deepcopy(original_df) + original_columns = original_df_copy.columns + merged_df = pd.merge(original_df_copy, cleanset_df, left_index=True, right_on="cleanlab_row_ID") + + merged_df = merged_df.apply( + lambda row: _update_label_based_on_confidence( + row, conf_threshold=parameters["confidence_threshold"] + ), + axis=1, + ) + + indices_to_drop = set() + for column_name, top_num in parameters.items(): + if column_name == "confidence_threshold": + continue + top_percent_ids = _get_top_fraction_ids(merged_df, column_name, top_num, asc=False) + indices_to_drop.update(top_percent_ids) + + merged_df = merged_df.drop(list(indices_to_drop), axis=0).reset_index(drop=True) + return merged_df[original_columns] diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 21f0e0e6..7d570b8d 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -17,6 +17,9 @@ init_dataset_source, check_none, check_not_none, + _get_autofix_default_params, + _get_autofix_defaults, + _apply_autofixed_cleanset_to_new_dataframe, ) from cleanlab_studio.internal.settings import CleanlabSettings from cleanlab_studio.internal.types import FieldSchemaDict @@ -383,3 +386,15 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) except (TimeoutError, CleansetError): return False + + def get_autofix_defaults(self, project_id): + cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) + cleaned_df = self.download_cleanlab_columns(cleanset_id) + return _get_autofix_defaults(cleaned_df) + + def autofix_dataset(self, project_id): + cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) + cleaned_df = self.download_cleanlab_columns(cleanset_id) + original_df = get_original_df() # Studio team + parameters = _get_autofix_defaults(cleaned_df) + return _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, parameters) From e7a3d07414203dc04be43d8cd1108830527b28a2 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Sat, 18 Nov 2023 02:02:06 +0530 Subject: [PATCH 03/32] cleanup --- cleanlab_studio/internal/util.py | 9 +++++---- cleanlab_studio/studio/studio.py | 13 +++++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 11eeea15..7cdb8535 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -87,9 +87,9 @@ def _get_autofix_defaults(cleanset_df): for param_name, param_value in default_params.items(): if param_name != "confidence_threshold": num_rows = cleanset_df[f"is_{param_name}"].sum() - default_values[param_name] = math.ceil(num_rows * param_value) + default_values[f"drop_{param_name}"] = math.ceil(num_rows * param_value) else: - default_values[param_name] = param_value + default_values[f"drop_{param_name}"] = param_value return default_values @@ -172,13 +172,14 @@ def _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, paramet merged_df = merged_df.apply( lambda row: _update_label_based_on_confidence( - row, conf_threshold=parameters["confidence_threshold"] + row, 
conf_threshold=parameters["drop_confidence_threshold"] ), axis=1, ) indices_to_drop = set() - for column_name, top_num in parameters.items(): + for drop_name, top_num in parameters.items(): + column_name = drop_name.replace("drop_", "") if column_name == "confidence_threshold": continue top_percent_ids = _get_top_fraction_ids(merged_df, column_name, top_num, asc=False) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 7d570b8d..bd1eaeb4 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -388,13 +388,18 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) return False def get_autofix_defaults(self, project_id): + """ + Returns the default parameters for autofix. + """ cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) cleaned_df = self.download_cleanlab_columns(cleanset_id) return _get_autofix_defaults(cleaned_df) - def autofix_dataset(self, project_id): + def autofix_dataset(self, project_id, params=None): cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) - cleaned_df = self.download_cleanlab_columns(cleanset_id) + cleanset_df = self.download_cleanlab_columns(cleanset_id) original_df = get_original_df() # Studio team - parameters = _get_autofix_defaults(cleaned_df) - return _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, parameters) + if params is None: + params = _get_autofix_defaults(cleanset_df) + print("Using autofix parameters:", params) + return _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params) From 72fc9196ea8c6a38ebfab005252cbe70f35a566b Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Sat, 18 Nov 2023 02:13:50 +0530 Subject: [PATCH 04/32] add type hinting --- cleanlab_studio/internal/util.py | 16 ++++++++++------ cleanlab_studio/studio/studio.py | 14 ++++++++++++-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 7cdb8535..3d03c346 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -1,5 +1,5 @@ import pathlib -from typing import Any, Optional, TypeVar, Union +from typing import Any, Optional, TypeVar, Union, List import math import copy @@ -66,8 +66,8 @@ def check_not_none(x: Any) -> bool: return not check_none(x) -def _get_autofix_default_params(): - """returns default params of autofix""" +def _get_autofix_default_params() -> dict: + """returns default percentage-wise params of autofix""" return { "ambiguous": 0.2, "label_issue": 0.5, @@ -77,7 +77,7 @@ def _get_autofix_default_params(): } -def _get_autofix_defaults(cleanset_df): +def _get_autofix_defaults(cleanset_df: pd.DataFrame) -> dict: """ Generate default values for autofix parameters based on the size of the cleaned dataset. """ @@ -93,7 +93,9 @@ def _get_autofix_defaults(cleanset_df): return default_values -def _get_top_fraction_ids(cleanset_df, name_col, num_rows, asc=True): +def _get_top_fraction_ids( + cleanset_df: pd.DataFrame, name_col: str, num_rows: int, asc=True +) -> List[str]: """ Extracts the top specified number of rows based on a specified score column from a DataFrame. 
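+
+    Example (hypothetical near-duplicate cluster, for illustration only):
+        If one cluster's rows sort to cleanlab_row_ID order [4, 9, 7, 2], the
+        .loc[::2] step in the near-duplicate branch marks rows 4 and 7 to drop, so
+        9 and 2 survive; about half of each duplicate cluster is kept rather than
+        dropping the whole cluster.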
@@ -164,7 +166,9 @@ def _update_label_based_on_confidence(row, conf_threshold): return row -def _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, parameters): +def _apply_autofixed_cleanset_to_new_dataframe( + original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: pd.DataFrame +) -> pd.DataFrame: """Apply a cleanset to update original dataaset labels and remove top rows based on specified parameters.""" original_df_copy = copy.deepcopy(original_df) original_columns = original_df_copy.columns diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index bd1eaeb4..d8dca397 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -387,15 +387,25 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) except (TimeoutError, CleansetError): return False - def get_autofix_defaults(self, project_id): + def get_autofix_defaults(self, project_id: str) -> dict: """ Returns the default parameters for autofix. + Args: + project_id: ID of project. + + Returns: + A dictionary containing number of rows to drop for each issue type. """ cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) cleaned_df = self.download_cleanlab_columns(cleanset_id) return _get_autofix_defaults(cleaned_df) - def autofix_dataset(self, project_id, params=None): + def autofix_dataset(self, project_id: str, params: dict = None) -> pd.DataFrame: + """ + Args: + project_id: ID of project. + params: Default parameter dictionary showing number of rows to drop for each issue type. + """ cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) cleanset_df = self.download_cleanlab_columns(cleanset_id) original_df = get_original_df() # Studio team From d67bbc37c052929a2fbd134c57dcd62f2e24bb27 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Sat, 18 Nov 2023 11:22:22 +0530 Subject: [PATCH 05/32] address PR comments --- cleanlab_studio/studio/studio.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index d8dca397..4fe20a1e 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -387,28 +387,30 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) except (TimeoutError, CleansetError): return False - def get_autofix_defaults(self, project_id: str) -> dict: + def autofix_dataset( + self, original_df: pd.DataFrame, cleanset_id: str, params: dict = None + ) -> pd.DataFrame: """ - Returns the default parameters for autofix. + This method returns the auto-fixed dataset. Args: - project_id: ID of project. + cleanset_id (str): ID of cleanset. + params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and + number of rows to drop for each issue type. If not provided, default values will be used. + + Example: + { + 'drop_ambiguous': 9, + 'drop_label_issue': 92, + 'drop_near_duplicate': 1, + 'drop_outlier': 3, + 'drop_confidence_threshold': 0.95 + } Returns: - A dictionary containing number of rows to drop for each issue type. - """ - cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) - cleaned_df = self.download_cleanlab_columns(cleanset_id) - return _get_autofix_defaults(cleaned_df) + pd.DataFrame: A new dataframe after applying auto-fixes to the cleanset. 
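+
+            Usage (a minimal sketch; assumes `studio` is an initialized Studio client,
+            `original_df` is the dataset the cleanset was computed on, and `cleanset_id`
+            is an existing cleanset ID):
+
+                fixed_df = studio.autofix_dataset(original_df, cleanset_id)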
- def autofix_dataset(self, project_id: str, params: dict = None) -> pd.DataFrame: - """ - Args: - project_id: ID of project. - params: Default parameter dictionary showing number of rows to drop for each issue type. """ - cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) cleanset_df = self.download_cleanlab_columns(cleanset_id) - original_df = get_original_df() # Studio team if params is None: params = _get_autofix_defaults(cleanset_df) print("Using autofix parameters:", params) From fc4bf7c3038620291b76b0096a1039c2f3c0739d Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Tue, 21 Nov 2023 02:09:35 +0530 Subject: [PATCH 06/32] Update cleanlab_studio/internal/util.py Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab_studio/internal/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 3d03c346..ac6798b0 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -161,7 +161,7 @@ def _update_label_based_on_confidence(row, conf_threshold): pd.Series: The updated row. """ if row["is_label_issue"] and row["suggested_label_confidence_score"] > conf_threshold: - row["is_issue"] = False + row["is_issue"] = False # make sure this does not affect back end. We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled row["label"] = row["suggested_label"] return row From 9f00909e75a1f46e48eec4017b6870d876f07011 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Tue, 21 Nov 2023 02:15:17 +0530 Subject: [PATCH 07/32] linting + doc change --- cleanlab_studio/internal/util.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index ac6798b0..9595bf3f 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -97,8 +97,7 @@ def _get_top_fraction_ids( cleanset_df: pd.DataFrame, name_col: str, num_rows: int, asc=True ) -> List[str]: """ - Extracts the top specified number of rows based on a specified score column from a DataFrame. - + This will only return the IDs of datapoints to drop for a given setting of the num_rows to drop during autofix. Parameters: - cleanset_df (pd.DataFrame): The input DataFrame containing the cleanset. - name_col (str): The name of the column indicating the category for which the top rows should be extracted. @@ -161,7 +160,9 @@ def _update_label_based_on_confidence(row, conf_threshold): pd.Series: The updated row. """ if row["is_label_issue"] and row["suggested_label_confidence_score"] > conf_threshold: - row["is_issue"] = False # make sure this does not affect back end. We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled + row[ + "is_issue" + ] = False # make sure this does not affect back end. 
We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled row["label"] = row["suggested_label"] return row From d2a34321af9c77e8f69ba50685f6f552420708e6 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 22 Nov 2023 22:56:37 +0530 Subject: [PATCH 08/32] set ambiguous to 0 --- cleanlab_studio/internal/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 9595bf3f..7592dd76 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -69,7 +69,7 @@ def check_not_none(x: Any) -> bool: def _get_autofix_default_params() -> dict: """returns default percentage-wise params of autofix""" return { - "ambiguous": 0.2, + "ambiguous": 0.0, "label_issue": 0.5, "near_duplicate": 0.2, "outlier": 0.5, From 6bcec4c22ecdf7eb94a674078b39403abc6aefa1 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 22 Nov 2023 23:30:09 +0530 Subject: [PATCH 09/32] things to port to backend --- cleanlab_studio/internal/util.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 7592dd76..0c02a89e 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -66,7 +66,7 @@ def check_not_none(x: Any) -> bool: return not check_none(x) -def _get_autofix_default_params() -> dict: +def _get_autofix_default_params() -> dict: # Studio team port to backend """returns default percentage-wise params of autofix""" return { "ambiguous": 0.0, @@ -77,7 +77,7 @@ def _get_autofix_default_params() -> dict: } -def _get_autofix_defaults(cleanset_df: pd.DataFrame) -> dict: +def _get_autofix_defaults(cleanset_df: pd.DataFrame) -> dict: # Studio team port to backend """ Generate default values for autofix parameters based on the size of the cleaned dataset. """ @@ -93,7 +93,7 @@ def _get_autofix_defaults(cleanset_df: pd.DataFrame) -> dict: return default_values -def _get_top_fraction_ids( +def _get_top_fraction_ids( # Studio team port to backend cleanset_df: pd.DataFrame, name_col: str, num_rows: int, asc=True ) -> List[str]: """ @@ -149,7 +149,7 @@ def _get_top_fraction_ids( return top_rows_ids -def _update_label_based_on_confidence(row, conf_threshold): +def _update_label_based_on_confidence(row, conf_threshold): # Studio team port to backend """Update the label and is_issue based on confidence threshold if there is a label issue. 
Args: @@ -167,7 +167,7 @@ def _update_label_based_on_confidence(row, conf_threshold): return row -def _apply_autofixed_cleanset_to_new_dataframe( +def _apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: pd.DataFrame ) -> pd.DataFrame: """Apply a cleanset to update original dataaset labels and remove top rows based on specified parameters.""" From cc52ce2dee8e3919efd2e610fa60f1d76b499a6d Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Fri, 1 Dec 2023 18:49:15 +0530 Subject: [PATCH 10/32] Updated code for different strategies --- cleanlab_studio/internal/util.py | 48 +++++++++++++++++++++----------- cleanlab_studio/studio/studio.py | 30 +++++++++++--------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 0c02a89e..c97db08f 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -66,30 +66,46 @@ def check_not_none(x: Any) -> bool: return not check_none(x) -def _get_autofix_default_params() -> dict: # Studio team port to backend +def _get_autofix_default_thresholds(strategy: str) -> dict: # Studio team port to backend """returns default percentage-wise params of autofix""" - return { - "ambiguous": 0.0, - "label_issue": 0.5, - "near_duplicate": 0.2, - "outlier": 0.5, - "confidence_threshold": 0.95, + + strategy_defaults = { + "optimized_training_data": { + "drop_ambiguous": 0.0, + "drop_label_issue": 0.5, + "drop_near_duplicate": 0.2, + "drop_outlier": 0.5, + "relabel_confidence_threshold": 0.95, + }, + "drop_all_issues": { + "drop_ambiguous": 1.0, + "drop_label_issue": 1.5, + "drop_near_duplicate": 1.0, + "drop_outlier": 1.0, + }, + "suggested_actions": { + "drop_near_duplicate": 1.0, + "drop_outlier": 1.0, + "relabel_confidence_threshold": 0.5, + }, } + return strategy_defaults[strategy] -def _get_autofix_defaults(cleanset_df: pd.DataFrame) -> dict: # Studio team port to backend +def get_autofix_defaults( + cleanset_df: pd.DataFrame, strategy +) -> dict: # Studio team port to backend """ Generate default values for autofix parameters based on the size of the cleaned dataset. 
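+
+    Example (hypothetical counts, for illustration only):
+        With strategy="optimized_training_data" and a cleanset flagging 10 label
+        issues, drop_label_issue resolves to math.ceil(10 * 0.5) == 5 rows, while
+        relabel_confidence_threshold is passed through unchanged as 0.95.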
""" - default_params = _get_autofix_default_params() + default_thresholds = _get_autofix_default_thresholds(strategy) default_values = {} - for param_name, param_value in default_params.items(): - if param_name != "confidence_threshold": - num_rows = cleanset_df[f"is_{param_name}"].sum() - default_values[f"drop_{param_name}"] = math.ceil(num_rows * param_value) - else: - default_values[f"drop_{param_name}"] = param_value + for param_type, param_value in default_thresholds.items(): + if param_type.startswith("drop_"): + issue_name = param_type[5:] + num_rows = cleanset_df[f"is_{issue_name}"].sum() + default_values[param_type] = math.ceil(num_rows * param_value) return default_values @@ -167,7 +183,7 @@ def _update_label_based_on_confidence(row, conf_threshold): # Studio team port return row -def _apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend +def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: pd.DataFrame ) -> pd.DataFrame: """Apply a cleanset to update original dataaset labels and remove top rows based on specified parameters.""" diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 4fe20a1e..97e6e9ce 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -1,28 +1,28 @@ """ Python API for Cleanlab Studio. """ -from typing import Any, List, Literal, Optional, Union import warnings +from typing import Any, List, Literal, Optional, Union import numpy as np import numpy.typing as npt import pandas as pd -from . import inference -from . import trustworthy_language_model from cleanlab_studio.errors import CleansetError from cleanlab_studio.internal import clean_helpers, upload_helpers from cleanlab_studio.internal.api import api +from cleanlab_studio.internal.settings import CleanlabSettings +from cleanlab_studio.internal.types import FieldSchemaDict from cleanlab_studio.internal.util import ( - init_dataset_source, + apply_autofixed_cleanset_to_new_dataframe, + _get_autofix_default_thresholds, check_none, check_not_none, - _get_autofix_default_params, - _get_autofix_defaults, - _apply_autofixed_cleanset_to_new_dataframe, + get_autofix_defaults, + init_dataset_source, ) -from cleanlab_studio.internal.settings import CleanlabSettings -from cleanlab_studio.internal.types import FieldSchemaDict + +from . import inference, trustworthy_language_model _pyspark_exists = api.pyspark_exists if _pyspark_exists: @@ -134,7 +134,7 @@ def apply_corrections(self, cleanset_id: str, dataset: Any, keep_excluded: bool label_column = api.get_label_column_of_project(self._api_key, project_id) id_col = api.get_id_column(self._api_key, cleanset_id) if _pyspark_exists and isinstance(dataset, pyspark.sql.DataFrame): - from pyspark.sql.functions import row_number, monotonically_increasing_id, when, col + from pyspark.sql.functions import col, monotonically_increasing_id, row_number, when from pyspark.sql.window import Window cl_cols = self.download_cleanlab_columns( @@ -388,7 +388,11 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) return False def autofix_dataset( - self, original_df: pd.DataFrame, cleanset_id: str, params: dict = None + self, + original_df: pd.DataFrame, + cleanset_id: str, + params: dict = None, + strategy="optimized_training_data", ) -> pd.DataFrame: """ This method returns the auto-fixed dataset. 
@@ -412,6 +416,6 @@ def autofix_dataset( """ cleanset_df = self.download_cleanlab_columns(cleanset_id) if params is None: - params = _get_autofix_defaults(cleanset_df) + params = get_autofix_defaults(cleanset_df, strategy) print("Using autofix parameters:", params) - return _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params) + return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params) From 62efa2d1f97e176ee1ffa9960667197da408c0b0 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Fri, 1 Dec 2023 19:11:44 +0530 Subject: [PATCH 11/32] Fixed apply method --- cleanlab_studio/internal/util.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index c97db08f..ed4d8f6a 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -110,7 +110,7 @@ def get_autofix_defaults( def _get_top_fraction_ids( # Studio team port to backend - cleanset_df: pd.DataFrame, name_col: str, num_rows: int, asc=True + cleanset_df: pd.DataFrame, issue_name: str, num_rows: int, asc=True ) -> List[str]: """ This will only return the IDs of datapoints to drop for a given setting of the num_rows to drop during autofix. @@ -124,14 +124,14 @@ def _get_top_fraction_ids( # Studio team port to backend Returns: - list: A list of row indices representing the top specified number of rows based on the specified score column. """ - bool_column_name = f"is_{name_col}" + bool_column_name = f"is_{issue_name}" # Construct a filter based on the 'label_issue' variable filter_condition = cleanset_df[bool_column_name] # Create a new DataFrame based on the filter filtered_df = cleanset_df[filter_condition] - if name_col == "near_duplicate": + if issue_name == "near_duplicate": # Group by the 'near_duplicate_cluster_ID' column df_n = filtered_df.sort_values(by="near_duplicate_score").reset_index(drop=True) sorted_df = df_n.head(num_rows) @@ -144,7 +144,7 @@ def _get_top_fraction_ids( # Studio team port to backend for group_name, group_df in grouped_df: # Sort the group DataFrame by the 'near_duplicate_score' column in ascending order sorted_group_df = group_df.sort_values( - by=f"{name_col}_score", ascending=asc + by=f"{issue_name}_score", ascending=asc ).reset_index(drop=True) # Extract every other index and append to the aggregated indices list @@ -154,7 +154,7 @@ def _get_top_fraction_ids( # Studio team port to backend return aggregated_indices else: # Construct the boolean column name with 'is_' prefix and 'label_issue_score' suffix - score_col_name = f"{name_col}_score" + score_col_name = f"{issue_name}_score" # Sort the filtered DataFrame by the constructed boolean column in descending order sorted_df = filtered_df.sort_values(by=score_col_name, ascending=asc) @@ -184,7 +184,7 @@ def _update_label_based_on_confidence(row, conf_threshold): # Studio team port def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend - original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: pd.DataFrame + original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: dict ) -> pd.DataFrame: """Apply a cleanset to update original dataaset labels and remove top rows based on specified parameters.""" original_df_copy = copy.deepcopy(original_df) @@ -193,18 +193,17 @@ def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend merged_df = merged_df.apply( lambda row: _update_label_based_on_confidence( - row, 
conf_threshold=parameters["drop_confidence_threshold"] + row, conf_threshold=parameters["relabel_confidence_threshold"] ), axis=1, ) indices_to_drop = set() - for drop_name, top_num in parameters.items(): - column_name = drop_name.replace("drop_", "") - if column_name == "confidence_threshold": - continue - top_percent_ids = _get_top_fraction_ids(merged_df, column_name, top_num, asc=False) - indices_to_drop.update(top_percent_ids) + for param_name, top_num in parameters.items(): + if param_name.startswith('drop_'): + issue_name = param_name.replace("drop_", "") + top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=False) + indices_to_drop.update(top_percent_ids) merged_df = merged_df.drop(list(indices_to_drop), axis=0).reset_index(drop=True) return merged_df[original_columns] From e5c48720bcc156cbad95b2fdda340020a80440ac Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 07:32:29 +0530 Subject: [PATCH 12/32] Added test for computing rows for exclusion --- cleanlab_studio/internal/util.py | 7 +++-- cleanlab_studio/studio/studio.py | 2 +- tests/test_autofix.py | 48 ++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 tests/test_autofix.py diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index ed4d8f6a..9241535c 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -79,14 +79,14 @@ def _get_autofix_default_thresholds(strategy: str) -> dict: # Studio team port }, "drop_all_issues": { "drop_ambiguous": 1.0, - "drop_label_issue": 1.5, + "drop_label_issue": 1.0, "drop_near_duplicate": 1.0, "drop_outlier": 1.0, }, "suggested_actions": { "drop_near_duplicate": 1.0, "drop_outlier": 1.0, - "relabel_confidence_threshold": 0.5, + "relabel_confidence_threshold": 0.0, }, } return strategy_defaults[strategy] @@ -102,10 +102,13 @@ def get_autofix_defaults( default_values = {} for param_type, param_value in default_thresholds.items(): + # Convert drop fractions to number of rows and leave rest of the parameters as is if param_type.startswith("drop_"): issue_name = param_type[5:] num_rows = cleanset_df[f"is_{issue_name}"].sum() default_values[param_type] = math.ceil(num_rows * param_value) + else: + default_values[param_type] = param_value return default_values diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 97e6e9ce..a83071c0 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -417,5 +417,5 @@ def autofix_dataset( cleanset_df = self.download_cleanlab_columns(cleanset_id) if params is None: params = get_autofix_defaults(cleanset_df, strategy) - print("Using autofix parameters:", params) + print("Using autofix values:", params) return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params) diff --git a/tests/test_autofix.py b/tests/test_autofix.py new file mode 100644 index 00000000..501b1250 --- /dev/null +++ b/tests/test_autofix.py @@ -0,0 +1,48 @@ +import pandas as pd +import pytest +from cleanlab_studio.internal.util import get_autofix_defaults + + +class TestAutofix: + @pytest.mark.parametrize( + "strategy, expected_results", + [ + ( + "optimized_training_data", + { + "drop_ambiguous": 0, + "drop_label_issue": 2, + "drop_near_duplicate": 2, + "drop_outlier": 3, + "relabel_confidence_threshold": 0.95, + }, + ), + ( + "drop_all_issues", + { + "drop_ambiguous": 10, + "drop_label_issue": 3, + "drop_near_duplicate": 6, + "drop_outlier": 6, + }, + ), + ( + 
"suggested_actions", + { + "drop_near_duplicate": 6, + "drop_outlier": 6, + "relabel_confidence_threshold": 0.0, + }, + ), + ], + ids=["optimized_training_data", "drop_all_issues", "suggested_actions"], + ) + def test_get_autofix_defaults(self, strategy, expected_results): + cleanlab_columns = pd.DataFrame() + cleanlab_columns["is_label_issue"] = [True] * 3 + [False] * 7 + cleanlab_columns["is_near_duplicate"] = [True] * 6 + [False] * 4 + cleanlab_columns["is_outlier"] = [True] * 6 + [False] * 4 + cleanlab_columns["is_ambiguous"] = [True] * 10 + + params = get_autofix_defaults(cleanlab_columns, strategy) + assert params == expected_results From 02294c876a14b3b040b5fa6bedde835666eb1a32 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 07:34:50 +0530 Subject: [PATCH 13/32] Improved formatting --- cleanlab_studio/internal/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 9241535c..19957c9d 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -203,7 +203,7 @@ def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend indices_to_drop = set() for param_name, top_num in parameters.items(): - if param_name.startswith('drop_'): + if param_name.startswith("drop_"): issue_name = param_name.replace("drop_", "") top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=False) indices_to_drop.update(top_percent_ids) From 1d644a0c63d75e1c0861ca361e315114c24a6afd Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 07:51:51 +0530 Subject: [PATCH 14/32] Added tests for updating label issue rows based on threshold --- tests/test_autofix.py | 82 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/tests/test_autofix.py b/tests/test_autofix.py index 501b1250..79b455e7 100644 --- a/tests/test_autofix.py +++ b/tests/test_autofix.py @@ -1,6 +1,6 @@ import pandas as pd import pytest -from cleanlab_studio.internal.util import get_autofix_defaults +from cleanlab_studio.internal.util import get_autofix_defaults, _update_label_based_on_confidence class TestAutofix: @@ -46,3 +46,83 @@ def test_get_autofix_defaults(self, strategy, expected_results): params = get_autofix_defaults(cleanlab_columns, strategy) assert params == expected_results + + @pytest.mark.parametrize( + "row, expected_updated_row", + [ + ( + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.6, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.6, + "label": "label_1", + "suggested_label": "label_1", + "is_issue": False, + }, + ), + ( + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.5, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.5, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + ), + ( + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.4, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.4, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + ), + ( + { + "is_label_issue": False, + "suggested_label_confidence_score": 0.4, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, 
+ }, + { + "is_label_issue": False, + "suggested_label_confidence_score": 0.4, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + ), + ], + ids=[ + "is a label issue with confidence score greater than threshold", + "is a label issue with confidence score equal to threshold", + "is a label issue with confidence score less than threshold", + "is not a label issue", + ], + ) + def test_update_label_based_on_confidence(self, row, expected_updated_row): + conf_threshold = 0.5 + updated_row = _update_label_based_on_confidence(row, conf_threshold) + assert updated_row == expected_updated_row From 3ff2507ed9cd9047e14472b8c55ed6957ee2e119 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 08:00:27 +0530 Subject: [PATCH 15/32] Fixed mypy issue --- cleanlab_studio/studio/studio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index a83071c0..5b3ce1da 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -2,7 +2,7 @@ Python API for Cleanlab Studio. """ import warnings -from typing import Any, List, Literal, Optional, Union +from typing import Any, List, Literal, Optional, Union, Dict import numpy as np import numpy.typing as npt @@ -391,7 +391,7 @@ def autofix_dataset( self, original_df: pd.DataFrame, cleanset_id: str, - params: dict = None, + params: Optional[Dict[str, Union[int, float]]] = None, strategy="optimized_training_data", ) -> pd.DataFrame: """ From 7235b4079747c69f5759e4e49665c658c1a6e322 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 08:09:48 +0530 Subject: [PATCH 16/32] Added test for checking right rows are dropped for non near duplicate issues --- tests/test_autofix.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_autofix.py b/tests/test_autofix.py index 79b455e7..1f6ccd06 100644 --- a/tests/test_autofix.py +++ b/tests/test_autofix.py @@ -1,6 +1,11 @@ import pandas as pd import pytest -from cleanlab_studio.internal.util import get_autofix_defaults, _update_label_based_on_confidence +from cleanlab_studio.internal.util import ( + get_autofix_defaults, + _update_label_based_on_confidence, + _get_top_fraction_ids, +) +import numpy as np class TestAutofix: @@ -126,3 +131,12 @@ def test_update_label_based_on_confidence(self, row, expected_updated_row): conf_threshold = 0.5 updated_row = _update_label_based_on_confidence(row, conf_threshold) assert updated_row == expected_updated_row + + def test_get_top_fraction_ids(self): + cleanlab_columns = pd.DataFrame() + + cleanlab_columns["cleanlab_row_ID"] = np.arange(10) + cleanlab_columns["is_dummy"] = [False] * 5 + [True] * 5 + cleanlab_columns["dummy_score"] = np.arange(10) * 0.1 + top_ids = _get_top_fraction_ids(cleanlab_columns, "dummy", 3) + assert set(top_ids) == set([5, 6, 7]) From 1b99d602558cdc03a7f0f27ba8456ab8d32d858b Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 08:28:28 +0530 Subject: [PATCH 17/32] Added test for checking right rows are dropped for near duplicate issues --- tests/test_autofix.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_autofix.py b/tests/test_autofix.py index 1f6ccd06..b5a1ef6f 100644 --- a/tests/test_autofix.py +++ b/tests/test_autofix.py @@ -140,3 +140,14 @@ def test_get_top_fraction_ids(self): cleanlab_columns["dummy_score"] = np.arange(10) * 0.1 top_ids = _get_top_fraction_ids(cleanlab_columns, "dummy", 3) assert set(top_ids) == 
set([5, 6, 7]) + + def test_get_top_fraction_ids_near_duplicate(self): + cleanlab_columns = pd.DataFrame() + + cleanlab_columns["cleanlab_row_ID"] = np.arange(12) + cleanlab_columns["is_near_duplicate"] = [False] * 6 + [True] * 6 + cleanlab_columns["near_duplicate_score"] = np.arange(12) * 0.1 + cleanlab_columns["near_duplicate_cluster_id"] = [None] * 6 + [0, 0, 1, 1, 1, 1] + + top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5) + assert set(top_ids) == set([6, 8, 10]) From 330aa44e595f293f5077066ddfa30a64199c9ea3 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 18:43:28 +0530 Subject: [PATCH 18/32] Added get defaults method --- cleanlab_studio/internal/util.py | 85 +++++++++---------- cleanlab_studio/studio/studio.py | 17 ++-- ...{test_autofix.py => test_autofix_utils.py} | 11 ++- 3 files changed, 58 insertions(+), 55 deletions(-) rename tests/{test_autofix.py => test_autofix_utils.py} (95%) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 19957c9d..0f76b0e2 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -1,5 +1,5 @@ import pathlib -from typing import Any, Optional, TypeVar, Union, List +from typing import Any, Optional, TypeVar, Union, List, Dict import math import copy @@ -27,6 +27,27 @@ DatasetSourceType = TypeVar("DatasetSourceType", bound=dataset_source_types) # type: ignore +# Studio team port to backend +AUTOFIX_DEFAULTS = { + "optimized_training_data": { + "drop_ambiguous": 0.0, + "drop_label_issue": 0.5, + "drop_near_duplicate": 0.2, + "drop_outlier": 0.5, + "relabel_confidence_threshold": 0.95, + }, + "drop_all_issues": { + "drop_ambiguous": 1.0, + "drop_label_issue": 1.0, + "drop_near_duplicate": 1.0, + "drop_outlier": 1.0, + }, + "suggested_actions": { + "drop_near_duplicate": 1.0, + "drop_outlier": 1.0, + "relabel_confidence_threshold": 0.0, + }, +} def init_dataset_source( dataset_source: DatasetSourceType, dataset_name: Optional[str] = None @@ -66,51 +87,22 @@ def check_not_none(x: Any) -> bool: return not check_none(x) -def _get_autofix_default_thresholds(strategy: str) -> dict: # Studio team port to backend - """returns default percentage-wise params of autofix""" - - strategy_defaults = { - "optimized_training_data": { - "drop_ambiguous": 0.0, - "drop_label_issue": 0.5, - "drop_near_duplicate": 0.2, - "drop_outlier": 0.5, - "relabel_confidence_threshold": 0.95, - }, - "drop_all_issues": { - "drop_ambiguous": 1.0, - "drop_label_issue": 1.0, - "drop_near_duplicate": 1.0, - "drop_outlier": 1.0, - }, - "suggested_actions": { - "drop_near_duplicate": 1.0, - "drop_outlier": 1.0, - "relabel_confidence_threshold": 0.0, - }, - } - return strategy_defaults[strategy] - - -def get_autofix_defaults( - cleanset_df: pd.DataFrame, strategy -) -> dict: # Studio team port to backend - """ - Generate default values for autofix parameters based on the size of the cleaned dataset. 
- """ - default_thresholds = _get_autofix_default_thresholds(strategy) - default_values = {} +# Studio team port to backend +def get_autofix_defaults_for_strategy(strategy): + return AUTOFIX_DEFAULTS[strategy] - for param_type, param_value in default_thresholds.items(): +def get_param_values(cleanset_df, params, strategy): + thresholds = get_autofix_defaults_for_strategy(strategy) if params is None else params + param_values = {} + for param_type, param_value in thresholds.items(): # Convert drop fractions to number of rows and leave rest of the parameters as is if param_type.startswith("drop_"): issue_name = param_type[5:] num_rows = cleanset_df[f"is_{issue_name}"].sum() - default_values[param_type] = math.ceil(num_rows * param_value) + param_values[param_type] = math.ceil(num_rows * param_value) else: - default_values[param_type] = param_value - return default_values - + param_values[param_type] = param_value + return param_values def _get_top_fraction_ids( # Studio team port to backend cleanset_df: pd.DataFrame, issue_name: str, num_rows: int, asc=True @@ -201,12 +193,17 @@ def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend axis=1, ) + indices_to_drop = _get_indices_to_drop(merged_df, parameters) + + merged_df = merged_df.drop(indices_to_drop, axis=0).reset_index(drop=True) + return merged_df[original_columns] + + +def _get_indices_to_drop(merged_df, parameters): indices_to_drop = set() for param_name, top_num in parameters.items(): if param_name.startswith("drop_"): issue_name = param_name.replace("drop_", "") - top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=False) + top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=True) indices_to_drop.update(top_percent_ids) - - merged_df = merged_df.drop(list(indices_to_drop), axis=0).reset_index(drop=True) - return merged_df[original_columns] + return list(indices_to_drop) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 5b3ce1da..388530de 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -15,10 +15,8 @@ from cleanlab_studio.internal.types import FieldSchemaDict from cleanlab_studio.internal.util import ( apply_autofixed_cleanset_to_new_dataframe, - _get_autofix_default_thresholds, - check_none, - check_not_none, - get_autofix_defaults, + get_autofix_defaults_for_strategy, + get_param_values, init_dataset_source, ) @@ -415,7 +413,10 @@ def autofix_dataset( """ cleanset_df = self.download_cleanlab_columns(cleanset_id) - if params is None: - params = get_autofix_defaults(cleanset_df, strategy) - print("Using autofix values:", params) - return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params) + if params is not None and strategy is not None: + raise ValueError("Please provide only of params or strategy for autofix") + param_values = get_param_values(cleanset_df, params, strategy) + return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, param_values) + + def get_autofix_defaults(self, strategy="optimized_training_data"): + return get_autofix_defaults_for_strategy(strategy) diff --git a/tests/test_autofix.py b/tests/test_autofix_utils.py similarity index 95% rename from tests/test_autofix.py rename to tests/test_autofix_utils.py index b5a1ef6f..e5526719 100644 --- a/tests/test_autofix.py +++ b/tests/test_autofix_utils.py @@ -1,9 +1,10 @@ import pandas as pd import pytest from cleanlab_studio.internal.util import ( - get_autofix_defaults, + 
get_param_values, _update_label_based_on_confidence, _get_top_fraction_ids, + _get_indices_to_drop ) import numpy as np @@ -42,14 +43,14 @@ class TestAutofix: ], ids=["optimized_training_data", "drop_all_issues", "suggested_actions"], ) - def test_get_autofix_defaults(self, strategy, expected_results): + def test_get_param_values(self, strategy, expected_results): cleanlab_columns = pd.DataFrame() cleanlab_columns["is_label_issue"] = [True] * 3 + [False] * 7 cleanlab_columns["is_near_duplicate"] = [True] * 6 + [False] * 4 cleanlab_columns["is_outlier"] = [True] * 6 + [False] * 4 cleanlab_columns["is_ambiguous"] = [True] * 10 - params = get_autofix_defaults(cleanlab_columns, strategy) + params = get_param_values(cleanlab_columns, None, strategy) assert params == expected_results @pytest.mark.parametrize( @@ -151,3 +152,7 @@ def test_get_top_fraction_ids_near_duplicate(self): top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5) assert set(top_ids) == set([6, 8, 10]) + + + def test_get_indices_to_drop(self): + pass \ No newline at end of file From a19c88c34ccc78422014ccc91392834e2ab9a823 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 20:21:39 +0530 Subject: [PATCH 19/32] Return cleanset with original indices --- cleanlab_studio/internal/util.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 0f76b0e2..52e7e036 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -49,6 +49,7 @@ }, } + def init_dataset_source( dataset_source: DatasetSourceType, dataset_name: Optional[str] = None ) -> DatasetSource: @@ -91,6 +92,7 @@ def check_not_none(x: Any) -> bool: def get_autofix_defaults_for_strategy(strategy): return AUTOFIX_DEFAULTS[strategy] + def get_param_values(cleanset_df, params, strategy): thresholds = get_autofix_defaults_for_strategy(strategy) if params is None else params param_values = {} @@ -104,6 +106,7 @@ def get_param_values(cleanset_df, params, strategy): param_values[param_type] = param_value return param_values + def _get_top_fraction_ids( # Studio team port to backend cleanset_df: pd.DataFrame, issue_name: str, num_rows: int, asc=True ) -> List[str]: @@ -171,9 +174,9 @@ def _update_label_based_on_confidence(row, conf_threshold): # Studio team port pd.Series: The updated row. """ if row["is_label_issue"] and row["suggested_label_confidence_score"] > conf_threshold: - row[ - "is_issue" - ] = False # make sure this does not affect back end. We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled + # make sure this does not affect back end. 
We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled + row["is_issue"] = False + row["is_label_issue"] = False row["label"] = row["suggested_label"] return row @@ -195,7 +198,7 @@ def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend indices_to_drop = _get_indices_to_drop(merged_df, parameters) - merged_df = merged_df.drop(indices_to_drop, axis=0).reset_index(drop=True) + merged_df = merged_df.drop(indices_to_drop, axis=0) return merged_df[original_columns] From 19143a3752aaf4f75cc67a2c17d5b78387c0eaba Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 20:36:22 +0530 Subject: [PATCH 20/32] Removed unimplemented test --- tests/test_autofix_utils.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index e5526719..de5e89da 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -151,8 +151,4 @@ def test_get_top_fraction_ids_near_duplicate(self): cleanlab_columns["near_duplicate_cluster_id"] = [None] * 6 + [0, 0, 1, 1, 1, 1] top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5) - assert set(top_ids) == set([6, 8, 10]) - - - def test_get_indices_to_drop(self): - pass \ No newline at end of file + assert set(top_ids) == set([6, 8, 10]) \ No newline at end of file From e5b97f51bd50ee9d35354c7964bcb5f64a37c967 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 20:40:59 +0530 Subject: [PATCH 21/32] removed unncessary merge change --- cleanlab_studio/studio/studio.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 70ea83ac..d02f180a 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -8,6 +8,8 @@ import numpy.typing as npt import pandas as pd +from . import inference +from . import trustworthy_language_model from cleanlab_studio.errors import CleansetError from cleanlab_studio.internal import clean_helpers, upload_helpers from cleanlab_studio.internal.api import api @@ -23,7 +25,6 @@ from cleanlab_studio.internal.settings import CleanlabSettings from cleanlab_studio.internal.types import FieldSchemaDict -from . 
import inference, trustworthy_language_model _snowflake_exists = api.snowflake_exists if _snowflake_exists: From 20a532c5e221edc49f2e036ec4c24abfb03b88c0 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 20:43:45 +0530 Subject: [PATCH 22/32] Fixed tests --- tests/test_autofix_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index de5e89da..9b7928c6 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -65,7 +65,7 @@ def test_get_param_values(self, strategy, expected_results): "is_issue": True, }, { - "is_label_issue": True, + "is_label_issue": False, "suggested_label_confidence_score": 0.6, "label": "label_1", "suggested_label": "label_1", From 3bbfc1ca4324ffcb07ef0c2b1cb355e0b855e9c2 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 20:50:53 +0530 Subject: [PATCH 23/32] Fixed mypy error --- cleanlab_studio/studio/studio.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index d02f180a..67e69ce9 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -144,19 +144,19 @@ def apply_corrections(self, cleanset_id: str, dataset: Any, keep_excluded: bool cl_cols = self.download_cleanlab_columns( cleanset_id, to_spark=False, include_project_details=True ) - corrected_ds: snowpark.DataFrame = apply_corrections_snowpark_df( + snowflake_corrected_ds: snowpark.DataFrame = apply_corrections_snowpark_df( dataset, cl_cols, id_col, label_col, keep_excluded ) - return corrected_ds + return snowflake_corrected_ds elif _pyspark_exists and isinstance(dataset, pyspark.sql.DataFrame): cl_cols = self.download_cleanlab_columns( cleanset_id, to_spark=True, include_project_details=True ) - corrected_ds: pyspark.sql.DataFrame = apply_corrections_spark_df( + pyspark_corrected_ds: pyspark.sql.DataFrame = apply_corrections_spark_df( dataset, cl_cols, id_col, label_col, keep_excluded ) - return corrected_ds + return pyspark_corrected_ds elif isinstance(dataset, pd.DataFrame): cl_cols = self.download_cleanlab_columns(cleanset_id, include_project_details=True) From b892e87dbeb9bc01e5ebb35e8d8bbf676fe2fad1 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 22:07:04 +0530 Subject: [PATCH 24/32] Added newline --- tests/test_autofix_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index 9b7928c6..8bee3736 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -151,4 +151,4 @@ def test_get_top_fraction_ids_near_duplicate(self): cleanlab_columns["near_duplicate_cluster_id"] = [None] * 6 + [0, 0, 1, 1, 1, 1] top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5) - assert set(top_ids) == set([6, 8, 10]) \ No newline at end of file + assert set(top_ids) == set([6, 8, 10]) From b54a0a7679994e53ba0fa574a9ff7234623bb2a9 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 22:15:52 +0530 Subject: [PATCH 25/32] Fixed formatting --- tests/test_autofix_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index 8bee3736..e642c0ec 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -4,7 +4,7 @@ get_param_values, _update_label_based_on_confidence, _get_top_fraction_ids, - _get_indices_to_drop + _get_indices_to_drop, ) import 
numpy as np From f870e04c555f1a087eb51150ce6122f9827f80c0 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Wed, 6 Dec 2023 18:18:44 +0530 Subject: [PATCH 26/32] added tests for dropped indices --- cleanlab_studio/internal/util.py | 2 +- tests/test_autofix_utils.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 8da09467..71969cc8 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -340,7 +340,7 @@ def _get_indices_to_drop(merged_df, parameters): for param_name, top_num in parameters.items(): if param_name.startswith("drop_"): issue_name = param_name.replace("drop_", "") - top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=True) + top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=False) indices_to_drop.update(top_percent_ids) return list(indices_to_drop) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index e642c0ec..0c2bdd0a 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -152,3 +152,23 @@ def test_get_top_fraction_ids_near_duplicate(self): top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5) assert set(top_ids) == set([6, 8, 10]) + + def test_get_indices_to_drop(self): + cleanlab_columns = pd.DataFrame() + cleanlab_columns['cleanlab_row_ID'] = np.arange(10) + cleanlab_columns["is_issue1"] = [True] * 2 + [False] * 8 + cleanlab_columns["issue1_score"] = [1.0, 0.9] + [0] * 8 + cleanlab_columns["is_issue2"] = [False] * 2 + [True] * 4 + [False] * 4 + cleanlab_columns["issue2_score"] = [0] * 2 + [1.0, 0.9, 0.8, 0.7] + [0] * 4 + cleanlab_columns["is_issue3"] = [False] * 4 + [True] * 3 + [False] * 3 + cleanlab_columns["issue3_score"] = [0] * 4 + [1.0, 0.9, 0.8] + [0] * 3 + + params = { + "drop_issue1": 1, + "drop_issue2": 3, + "drop_issue3": 2, + } + expected_indices = [0, 2, 3, 4, 5] + + indices = _get_indices_to_drop(cleanlab_columns, params) + assert set(indices) == set(expected_indices) From eb106d13c94ec578a6ff02b364863b2baa0f7ebf Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Wed, 6 Dec 2023 18:26:29 +0530 Subject: [PATCH 27/32] Added docs for user facing method s --- cleanlab_studio/studio/studio.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 67e69ce9..732388ca 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -374,16 +374,18 @@ def autofix_dataset( Args: cleanset_id (str): ID of cleanset. params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and - number of rows to drop for each issue type. If not provided, default values will be used. + fraction of rows to drop for each issue type. If not provided, default values will be used. Example: { - 'drop_ambiguous': 9, - 'drop_label_issue': 92, - 'drop_near_duplicate': 1, - 'drop_outlier': 3, - 'drop_confidence_threshold': 0.95 + 'drop_ambiguous': 0.0, + 'drop_label_issue': 0.5, + 'drop_near_duplicate': 0.5, + 'drop_outlier': 0.2, + 'relabel_confidence_threshold': 0.95 } + strategy (str): Auto-fixing strategy to use, + Possible strategies: optimized_training_data, drop_all_issues, suggested_actions Returns: pd.DataFrame: A new dataframe after applying auto-fixes to the cleanset. 
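+
+            Usage (a minimal sketch; assumes `studio` is an initialized Studio client
+            and `original_df` is the dataset this cleanset was computed on):
+
+                fixed_df = studio.autofix_dataset(original_df, cleanset_id, strategy="drop_all_issues")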
@@ -395,5 +397,16 @@ def autofix_dataset(
         param_values = get_param_values(cleanset_df, params, strategy)
         return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, param_values)

-    def get_autofix_defaults(self, strategy="optimized_training_data"):
+    def get_autofix_defaults(self, strategy="optimized_training_data")-> Dict[str, float]:
+        """
+        This method returns the default params for the auto-fixed dataset.
+        Args:
+            strategy (str): Auto-fixing strategy
+                Possible strategies: optimized_training_data, drop_all_issues, suggested_actions
+
+        Returns:
+            dict[str, float]: parameter dictionary containing confidence threshold for auto-relabelling, and
+                fraction of rows to drop for each issue type.
+        """
         return get_autofix_defaults_for_strategy(strategy)
+

From a7acfa62cc93451c6556f4f99a9bb0eb3b3ff0e8 Mon Sep 17 00:00:00 2001
From: Sanjana Garg
Date: Wed, 6 Dec 2023 18:27:17 +0530
Subject: [PATCH 28/32] Black formatting

---
 cleanlab_studio/studio/studio.py | 3 +--
 tests/test_autofix_utils.py      | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py
index 732388ca..18086f24 100644
--- a/cleanlab_studio/studio/studio.py
+++ b/cleanlab_studio/studio/studio.py
@@ -397,7 +397,7 @@ def autofix_dataset(
         param_values = get_param_values(cleanset_df, params, strategy)
         return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, param_values)

-    def get_autofix_defaults(self, strategy="optimized_training_data")-> Dict[str, float]:
+    def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str, float]:
         """
         This method returns the default params for the auto-fixed dataset.
         Args:
@@ -409,4 +409,3 @@ def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str, f
                 fraction of rows to drop for each issue type.
""" return get_autofix_defaults_for_strategy(strategy) - diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index 0c2bdd0a..b6357b08 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -155,7 +155,7 @@ def test_get_top_fraction_ids_near_duplicate(self): def test_get_indices_to_drop(self): cleanlab_columns = pd.DataFrame() - cleanlab_columns['cleanlab_row_ID'] = np.arange(10) + cleanlab_columns["cleanlab_row_ID"] = np.arange(10) cleanlab_columns["is_issue1"] = [True] * 2 + [False] * 8 cleanlab_columns["issue1_score"] = [1.0, 0.9] + [0] * 8 cleanlab_columns["is_issue2"] = [False] * 2 + [True] * 4 + [False] * 4 From 692efe4d880f2bc29b2ccc5c9f268a47b4f10cfc Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 13 Dec 2023 22:19:45 +0530 Subject: [PATCH 29/32] merge main --- cleanlab_studio/studio/studio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 387db76c..b2806d69 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -144,10 +144,10 @@ def apply_corrections(self, cleanset_id: str, dataset: Any, keep_excluded: bool cl_cols = self.download_cleanlab_columns( cleanset_id, to_spark=False, include_project_details=True ) - snowflake_corrected_ds: snowpark.DataFrame = apply_corrections_snowpark_df( + corrected_ds: snowpark.DataFrame = apply_corrections_snowpark_df( dataset, cl_cols, id_col, label_col, keep_excluded ) - return snowflake_corrected_ds + return corrected_ds elif _pyspark_exists and isinstance(dataset, pyspark.sql.DataFrame): cl_cols = self.download_cleanlab_columns( From afbe4a9bf8fba1c9eee8963bef042a57ef202090 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 13 Dec 2023 22:32:05 +0530 Subject: [PATCH 30/32] add github change request --- cleanlab_studio/internal/util.py | 6 +++--- cleanlab_studio/studio/studio.py | 24 +++++++++++++++++------- tests/test_autofix_utils.py | 4 ++-- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 71969cc8..280171a1 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -222,12 +222,12 @@ def check_not_none(x: Any) -> bool: # Studio team port to backend -def get_autofix_defaults_for_strategy(strategy): +def _get_autofix_defaults_for_strategy(strategy): return AUTOFIX_DEFAULTS[strategy] -def get_param_values(cleanset_df, params, strategy): - thresholds = get_autofix_defaults_for_strategy(strategy) if params is None else params +def _get_param_values(cleanset_df, params, strategy): + thresholds = _get_autofix_defaults_for_strategy(strategy) if params is None else params param_values = {} for param_type, param_value in thresholds.items(): # Convert drop fractions to number of rows and leave rest of the parameters as is diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index b2806d69..1dade64b 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -19,8 +19,8 @@ apply_corrections_spark_df, apply_corrections_pd_df, apply_autofixed_cleanset_to_new_dataframe, - get_autofix_defaults_for_strategy, - get_param_values, + _get_autofix_defaults_for_strategy, + _get_param_values, ) from cleanlab_studio.internal.settings import CleanlabSettings from cleanlab_studio.internal.types import FieldSchemaDict @@ -370,13 +370,20 @@ def autofix_dataset( strategy="optimized_training_data", ) -> 
pd.DataFrame: """ - This method returns the auto-fixed dataset. + This method returns the auto-fixed dataset. It works for text or tabular dataset only. Args: cleanset_id (str): ID of cleanset. + original_df (pd.DataFrame): The original dataset in DataFrame format. params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and fraction of rows to drop for each issue type. If not provided, default values will be used. - - Example: + This dictionary includes the following options: + + * drop_ambiguous (float): Fraction of rows to drop when encountering ambiguous data. Default is 0.0 (no rows dropped). + * drop_label_issue (float): Fraction of rows to drop when facing label-related issues. Default is 0.5 (50% of rows dropped). + * drop_near_duplicate (float): Fraction of rows to drop for near-duplicate data. Default is 0.5 (50% of rows dropped). + * drop_outlier (float): Fraction of rows to drop for outlier data. Default is 0.2 (20% of rows dropped). + * relabel_confidence_threshold (float): Confidence threshold for auto-relabelling. Default is 0.95. + For example, the default values are: { 'drop_ambiguous': 0.0, 'drop_label_issue': 0.5, @@ -384,6 +391,9 @@ def autofix_dataset( 'drop_outlier': 0.2, 'relabel_confidence_threshold': 0.95 } + + Specify values in params to customize the behavior for specific scenarios. If params are provided, the values in params take precedence over default ones. + strategy (str): Auto-fixing strategy to use, Possible strategies: optimized_training_data, drop_all_issues, suggested_actions @@ -394,7 +404,7 @@ def autofix_dataset( cleanset_df = self.download_cleanlab_columns(cleanset_id) if params is not None and strategy is not None: raise ValueError("Please provide only of params or strategy for autofix") - param_values = get_param_values(cleanset_df, params, strategy) + param_values = _get_param_values(cleanset_df, params, strategy) return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, param_values) def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str, float]: @@ -408,4 +418,4 @@ def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str, dict[str, float]: parameter dictionary containing confidence threshold for auto-relabelling, and fraction of rows to drop for each issue type. 
""" - return get_autofix_defaults_for_strategy(strategy) + return _get_autofix_defaults_for_strategy(strategy) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index b6357b08..69fa04ba 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -1,7 +1,7 @@ import pandas as pd import pytest from cleanlab_studio.internal.util import ( - get_param_values, + _get_param_values, _update_label_based_on_confidence, _get_top_fraction_ids, _get_indices_to_drop, @@ -50,7 +50,7 @@ def test_get_param_values(self, strategy, expected_results): cleanlab_columns["is_outlier"] = [True] * 6 + [False] * 4 cleanlab_columns["is_ambiguous"] = [True] * 10 - params = get_param_values(cleanlab_columns, None, strategy) + params = _get_param_values(cleanlab_columns, None, strategy) assert params == expected_results @pytest.mark.parametrize( From 7b96faa57a4d9b512044dbdc8d7e1ebcbcab6ed4 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Mon, 18 Dec 2023 17:01:14 +0530 Subject: [PATCH 31/32] Update cleanlab_studio/studio/studio.py Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab_studio/studio/studio.py | 43 +++++++++++++------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 1dade64b..30db37a9 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -370,32 +370,25 @@ def autofix_dataset( strategy="optimized_training_data", ) -> pd.DataFrame: """ - This method returns the auto-fixed dataset. It works for text or tabular dataset only. + Improves a dataset by applying automatically-suggested corrections based on issues detected by Cleanlab. Args: - cleanset_id (str): ID of cleanset. - original_df (pd.DataFrame): The original dataset in DataFrame format. - params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and - fraction of rows to drop for each issue type. If not provided, default values will be used. - This dictionary includes the following options: - - * drop_ambiguous (float): Fraction of rows to drop when encountering ambiguous data. Default is 0.0 (no rows dropped). - * drop_label_issue (float): Fraction of rows to drop when facing label-related issues. Default is 0.5 (50% of rows dropped). - * drop_near_duplicate (float): Fraction of rows to drop for near-duplicate data. Default is 0.5 (50% of rows dropped). - * drop_outlier (float): Fraction of rows to drop for outlier data. Default is 0.2 (20% of rows dropped). - * relabel_confidence_threshold (float): Confidence threshold for auto-relabelling. Default is 0.95. - For example, the default values are: - { - 'drop_ambiguous': 0.0, - 'drop_label_issue': 0.5, - 'drop_near_duplicate': 0.5, - 'drop_outlier': 0.2, - 'relabel_confidence_threshold': 0.95 - } - - Specify values in params to customize the behavior for specific scenarios. If params are provided, the values in params take precedence over default ones. - - strategy (str): Auto-fixing strategy to use, - Possible strategies: optimized_training_data, drop_all_issues, suggested_actions + cleanset_id (str): ID of the cleanset from the Project for this Dataset. + original_df (pd.DataFrame): The original dataset (must be a DataFrame, so only text and tabular datasets are currently supported). 
+            params (dict, optional): Optional parameters to control how many data points from each type of detected data issue are auto-corrected or filtered (prioritizing the more severe instances of each issue). If not provided, default `params` values will be used.
+                The `params` dictionary includes the following options:
+
+                * drop_ambiguous (float): Fraction of the data points detected as ambiguous to exclude from the dataset.
+                * drop_label_issue (float): Fraction of the data points with label issues to exclude from the dataset.
+                * drop_near_duplicate (float): Fraction of the data points detected as near duplicates to exclude from the dataset.
+                * drop_outlier (float): Fraction of the data points detected as outliers to exclude from the dataset.
+                * relabel_confidence_threshold (float): Confidence threshold for the suggested label, data points with label issues that also exceed this threshold are re-labeled as the suggested label.
+
+            strategy (str): What strategy to use for auto-fixing the dataset out of the following possibilities: 
+                ['optimized_training_data', 'drop_all_issues', 'suggested_actions'].
+                Each of these possibilities corresponds to a default setting of the `params` dictionary, designed to be used in different scenarios.
+                If specified, the `params` argument will override this argument. Specify 'optimized_training_data' when your goal is to auto-fix training data to achieve the best ML performance on randomly split test data.
+                Specify 'drop_all_issues' to instead exclude all datapoints detected to have issues from the dataset.
+                Specify 'suggested_actions' to instead apply the suggested action to each data point that is displayed in the Cleanlab Studio Web Application (e.g. relabeling for label issues, dropping for outliers, etc).

         Returns:
             pd.DataFrame: A new dataframe after applying auto-fixes to the cleanset.

From b31674ce9540224264b60f77b0118b8b4661b75f Mon Sep 17 00:00:00 2001
From: Aditya Thyagarajan
Date: Mon, 18 Dec 2023 17:03:54 +0530
Subject: [PATCH 32/32] linting

---
 cleanlab_studio/studio/studio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py
index 30db37a9..5d13c769 100644
--- a/cleanlab_studio/studio/studio.py
+++ b/cleanlab_studio/studio/studio.py
@@ -383,7 +383,7 @@ def autofix_dataset(
             * drop_outlier (float): Fraction of the data points detected as outliers to exclude from the dataset.
             * relabel_confidence_threshold (float): Confidence threshold for the suggested label, data points with label issues that also exceed this threshold are re-labeled as the suggested label.

-            strategy (str): What strategy to use for auto-fixing the dataset out of the following possibilities: 
+            strategy (str): What strategy to use for auto-fixing the dataset out of the following possibilities:
                 ['optimized_training_data', 'drop_all_issues', 'suggested_actions'].
                 Each of these possibilities corresponds to a default setting of the `params` dictionary, designed to be used in different scenarios.
                 If specified, the `params` argument will override this argument. Specify 'optimized_training_data' when your goal is to auto-fix training data to achieve the best ML performance on randomly split test data.
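After patch 32, Studio.autofix_dataset and Studio.get_autofix_defaults are the user-facing entry points this series adds. A minimal usage sketch based only on the docstrings in the patches above: the API key, cleanset ID, and CSV path are placeholders, keyword arguments are used because the full method signature is not visible in these hunks, and `from cleanlab_studio import Studio` assumes the package's usual top-level export.

import pandas as pd
from cleanlab_studio import Studio  # assumed top-level export

studio = Studio("<YOUR_API_KEY>")          # placeholder API key
original_df = pd.read_csv("dataset.csv")   # placeholder text/tabular dataset

# Inspect the default params associated with a strategy before applying anything.
defaults = studio.get_autofix_defaults(strategy="optimized_training_data")
print(defaults)  # e.g. {'drop_ambiguous': 0.0, 'drop_label_issue': 0.5, ...}

# Apply a named strategy...
fixed_df = studio.autofix_dataset(
    cleanset_id="<CLEANSET_ID>",  # placeholder cleanset ID
    original_df=original_df,
    strategy="drop_all_issues",
)

# ...or pass explicit params instead. Patch 30 raises a ValueError when both
# params and strategy are non-None, so strategy is set to None here.
fixed_df = studio.autofix_dataset(
    cleanset_id="<CLEANSET_ID>",
    original_df=original_df,
    params={
        "drop_ambiguous": 0.0,
        "drop_label_issue": 0.5,
        "drop_near_duplicate": 0.5,
        "drop_outlier": 0.2,
        "relabel_confidence_threshold": 0.95,
    },
    strategy=None,
)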