From 461563753c7499ac06b3e37132552e450b5ceeed Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Fri, 17 Nov 2023 02:10:18 +0530 Subject: [PATCH 01/32] make pull request From 2a7cf915998f7e431eb117bbc1843276df38e0a3 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Sat, 18 Nov 2023 01:52:24 +0530 Subject: [PATCH 02/32] cleaned skeleton code --- cleanlab_studio/internal/util.py | 125 ++++++++++++++++++++++++++++++- cleanlab_studio/studio/studio.py | 15 ++++ 2 files changed, 139 insertions(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 08684427..11eeea15 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -2,7 +2,8 @@ from typing import Any, Optional, TypeVar, Union import math -import numpy as np +import copy + import pandas as pd try: @@ -63,3 +64,125 @@ def check_none(x: Any) -> bool: def check_not_none(x: Any) -> bool: return not check_none(x) + + +def _get_autofix_default_params(): + """returns default params of autofix""" + return { + "ambiguous": 0.2, + "label_issue": 0.5, + "near_duplicate": 0.2, + "outlier": 0.5, + "confidence_threshold": 0.95, + } + + +def _get_autofix_defaults(cleanset_df): + """ + Generate default values for autofix parameters based on the size of the cleaned dataset. + """ + default_params = _get_autofix_default_params() + default_values = {} + + for param_name, param_value in default_params.items(): + if param_name != "confidence_threshold": + num_rows = cleanset_df[f"is_{param_name}"].sum() + default_values[param_name] = math.ceil(num_rows * param_value) + else: + default_values[param_name] = param_value + return default_values + + +def _get_top_fraction_ids(cleanset_df, name_col, num_rows, asc=True): + """ + Extracts the top specified number of rows based on a specified score column from a DataFrame. + + Parameters: + - cleanset_df (pd.DataFrame): The input DataFrame containing the cleanset. + - name_col (str): The name of the column indicating the category for which the top rows should be extracted. + - num_rows (int): The number of rows to be extracted. + - asc (bool, optional): If True, the rows are sorted in ascending order based on the score column; if False, in descending order. + Default is True. + + Returns: + - list: A list of row indices representing the top specified number of rows based on the specified score column. 
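+
+    Example (hypothetical scores, for illustration only):
+    - If the cleanset flags four label issues whose label_issue_score values are
+      0.9, 0.8, 0.2 and 0.1, then _get_top_fraction_ids(cleanset_df, "label_issue", 2, asc=False)
+      returns the 'cleanlab_row_ID' values of the two highest-scoring flagged rows (0.9 and 0.8).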
+    """
+    bool_column_name = f"is_{name_col}"
+
+    # Build a boolean mask from the is_<issue> column
+    filter_condition = cleanset_df[bool_column_name]
+
+    # Create a new DataFrame containing only the flagged rows
+    filtered_df = cleanset_df[filter_condition]
+    if name_col == "near_duplicate":
+        # Keep the num_rows lowest-scoring rows, then group them by their
+        # 'near_duplicate_cluster_id'
+        df_n = filtered_df.sort_values(by="near_duplicate_score").reset_index(drop=True)
+        sorted_df = df_n.head(num_rows)
+        grouped_df = sorted_df.groupby("near_duplicate_cluster_id")
+
+        # Initialize an empty list to store the aggregated indices
+        aggregated_indices = []
+
+        # Iterate over each group
+        for group_name, group_df in grouped_df:
+            # Sort the group DataFrame by the score column in the requested order
+            sorted_group_df = group_df.sort_values(
+                by=f"{name_col}_score", ascending=asc
+            ).reset_index(drop=True)
+
+            # Extract every other index and append to the aggregated indices list
+            selected_indices = sorted_group_df.loc[::2, "cleanlab_row_ID"]
+            aggregated_indices.extend(selected_indices)
+
+        return aggregated_indices
+    else:
+        # Construct the score column name with the '_score' suffix
+        score_col_name = f"{name_col}_score"
+
+        # Sort the filtered DataFrame by the score column (order controlled by `asc`)
+        sorted_df = filtered_df.sort_values(by=score_col_name, ascending=asc)
+
+        # Extract the top specified number of rows and return the 'cleanlab_row_ID' column
+        top_rows_ids = sorted_df["cleanlab_row_ID"].head(num_rows)
+
+        return top_rows_ids
+
+
+def _update_label_based_on_confidence(row, conf_threshold):
+    """Update the label and is_issue based on confidence threshold if there is a label issue.
+
+    Args:
+        row (pd.Series): The row containing label information.
+        conf_threshold (float): The confidence threshold for updating the label.
+
+    Returns:
+        pd.Series: The updated row.
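+
+    Example (hypothetical row, for illustration only):
+        With conf_threshold=0.95, a row with is_label_issue=True,
+        suggested_label_confidence_score=0.97, label="cat" and suggested_label="dog"
+        comes back with label="dog" and is_issue=False; at or below the threshold
+        the row is returned unchanged.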
+ """ + if row["is_label_issue"] and row["suggested_label_confidence_score"] > conf_threshold: + row["is_issue"] = False + row["label"] = row["suggested_label"] + return row + + +def _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, parameters): + """Apply a cleanset to update original dataaset labels and remove top rows based on specified parameters.""" + original_df_copy = copy.deepcopy(original_df) + original_columns = original_df_copy.columns + merged_df = pd.merge(original_df_copy, cleanset_df, left_index=True, right_on="cleanlab_row_ID") + + merged_df = merged_df.apply( + lambda row: _update_label_based_on_confidence( + row, conf_threshold=parameters["confidence_threshold"] + ), + axis=1, + ) + + indices_to_drop = set() + for column_name, top_num in parameters.items(): + if column_name == "confidence_threshold": + continue + top_percent_ids = _get_top_fraction_ids(merged_df, column_name, top_num, asc=False) + indices_to_drop.update(top_percent_ids) + + merged_df = merged_df.drop(list(indices_to_drop), axis=0).reset_index(drop=True) + return merged_df[original_columns] diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 21f0e0e6..7d570b8d 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -17,6 +17,9 @@ init_dataset_source, check_none, check_not_none, + _get_autofix_default_params, + _get_autofix_defaults, + _apply_autofixed_cleanset_to_new_dataframe, ) from cleanlab_studio.internal.settings import CleanlabSettings from cleanlab_studio.internal.types import FieldSchemaDict @@ -383,3 +386,15 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) except (TimeoutError, CleansetError): return False + + def get_autofix_defaults(self, project_id): + cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) + cleaned_df = self.download_cleanlab_columns(cleanset_id) + return _get_autofix_defaults(cleaned_df) + + def autofix_dataset(self, project_id): + cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) + cleaned_df = self.download_cleanlab_columns(cleanset_id) + original_df = get_original_df() # Studio team + parameters = _get_autofix_defaults(cleaned_df) + return _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, parameters) From e7a3d07414203dc04be43d8cd1108830527b28a2 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Sat, 18 Nov 2023 02:02:06 +0530 Subject: [PATCH 03/32] cleanup --- cleanlab_studio/internal/util.py | 9 +++++---- cleanlab_studio/studio/studio.py | 13 +++++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 11eeea15..7cdb8535 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -87,9 +87,9 @@ def _get_autofix_defaults(cleanset_df): for param_name, param_value in default_params.items(): if param_name != "confidence_threshold": num_rows = cleanset_df[f"is_{param_name}"].sum() - default_values[param_name] = math.ceil(num_rows * param_value) + default_values[f"drop_{param_name}"] = math.ceil(num_rows * param_value) else: - default_values[param_name] = param_value + default_values[f"drop_{param_name}"] = param_value return default_values @@ -172,13 +172,14 @@ def _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, paramet merged_df = merged_df.apply( lambda row: _update_label_based_on_confidence( - row, conf_threshold=parameters["confidence_threshold"] + row, 
conf_threshold=parameters["drop_confidence_threshold"] ), axis=1, ) indices_to_drop = set() - for column_name, top_num in parameters.items(): + for drop_name, top_num in parameters.items(): + column_name = drop_name.replace("drop_", "") if column_name == "confidence_threshold": continue top_percent_ids = _get_top_fraction_ids(merged_df, column_name, top_num, asc=False) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 7d570b8d..bd1eaeb4 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -388,13 +388,18 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) return False def get_autofix_defaults(self, project_id): + """ + Returns the default parameters for autofix. + """ cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) cleaned_df = self.download_cleanlab_columns(cleanset_id) return _get_autofix_defaults(cleaned_df) - def autofix_dataset(self, project_id): + def autofix_dataset(self, project_id, params=None): cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) - cleaned_df = self.download_cleanlab_columns(cleanset_id) + cleanset_df = self.download_cleanlab_columns(cleanset_id) original_df = get_original_df() # Studio team - parameters = _get_autofix_defaults(cleaned_df) - return _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, parameters) + if params is None: + params = _get_autofix_defaults(cleanset_df) + print("Using autofix parameters:", params) + return _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params) From 72fc9196ea8c6a38ebfab005252cbe70f35a566b Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Sat, 18 Nov 2023 02:13:50 +0530 Subject: [PATCH 04/32] add type hinting --- cleanlab_studio/internal/util.py | 16 ++++++++++------ cleanlab_studio/studio/studio.py | 14 ++++++++++++-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 7cdb8535..3d03c346 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -1,5 +1,5 @@ import pathlib -from typing import Any, Optional, TypeVar, Union +from typing import Any, Optional, TypeVar, Union, List import math import copy @@ -66,8 +66,8 @@ def check_not_none(x: Any) -> bool: return not check_none(x) -def _get_autofix_default_params(): - """returns default params of autofix""" +def _get_autofix_default_params() -> dict: + """returns default percentage-wise params of autofix""" return { "ambiguous": 0.2, "label_issue": 0.5, @@ -77,7 +77,7 @@ def _get_autofix_default_params(): } -def _get_autofix_defaults(cleanset_df): +def _get_autofix_defaults(cleanset_df: pd.DataFrame) -> dict: """ Generate default values for autofix parameters based on the size of the cleaned dataset. """ @@ -93,7 +93,9 @@ def _get_autofix_defaults(cleanset_df): return default_values -def _get_top_fraction_ids(cleanset_df, name_col, num_rows, asc=True): +def _get_top_fraction_ids( + cleanset_df: pd.DataFrame, name_col: str, num_rows: int, asc=True +) -> List[str]: """ Extracts the top specified number of rows based on a specified score column from a DataFrame. 
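+
+    Example (hypothetical near-duplicate cluster, for illustration only):
+        If one cluster's rows sort to cleanlab_row_ID order [4, 9, 7, 2], the
+        .loc[::2] step in the near-duplicate branch marks rows 4 and 7 to drop, so
+        9 and 2 survive; about half of each duplicate cluster is kept rather than
+        dropping the whole cluster.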
@@ -164,7 +166,9 @@ def _update_label_based_on_confidence(row, conf_threshold): return row -def _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, parameters): +def _apply_autofixed_cleanset_to_new_dataframe( + original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: pd.DataFrame +) -> pd.DataFrame: """Apply a cleanset to update original dataaset labels and remove top rows based on specified parameters.""" original_df_copy = copy.deepcopy(original_df) original_columns = original_df_copy.columns diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index bd1eaeb4..d8dca397 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -387,15 +387,25 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) except (TimeoutError, CleansetError): return False - def get_autofix_defaults(self, project_id): + def get_autofix_defaults(self, project_id: str) -> dict: """ Returns the default parameters for autofix. + Args: + project_id: ID of project. + + Returns: + A dictionary containing number of rows to drop for each issue type. """ cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) cleaned_df = self.download_cleanlab_columns(cleanset_id) return _get_autofix_defaults(cleaned_df) - def autofix_dataset(self, project_id, params=None): + def autofix_dataset(self, project_id: str, params: dict = None) -> pd.DataFrame: + """ + Args: + project_id: ID of project. + params: Default parameter dictionary showing number of rows to drop for each issue type. + """ cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) cleanset_df = self.download_cleanlab_columns(cleanset_id) original_df = get_original_df() # Studio team From d67bbc37c052929a2fbd134c57dcd62f2e24bb27 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Sat, 18 Nov 2023 11:22:22 +0530 Subject: [PATCH 05/32] address PR comments --- cleanlab_studio/studio/studio.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index d8dca397..4fe20a1e 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -387,28 +387,30 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) except (TimeoutError, CleansetError): return False - def get_autofix_defaults(self, project_id: str) -> dict: + def autofix_dataset( + self, original_df: pd.DataFrame, cleanset_id: str, params: dict = None + ) -> pd.DataFrame: """ - Returns the default parameters for autofix. + This method returns the auto-fixed dataset. Args: - project_id: ID of project. + cleanset_id (str): ID of cleanset. + params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and + number of rows to drop for each issue type. If not provided, default values will be used. + + Example: + { + 'drop_ambiguous': 9, + 'drop_label_issue': 92, + 'drop_near_duplicate': 1, + 'drop_outlier': 3, + 'drop_confidence_threshold': 0.95 + } Returns: - A dictionary containing number of rows to drop for each issue type. - """ - cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) - cleaned_df = self.download_cleanlab_columns(cleanset_id) - return _get_autofix_defaults(cleaned_df) + pd.DataFrame: A new dataframe after applying auto-fixes to the cleanset. 
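+
+            Usage (a minimal sketch; assumes `studio` is an initialized Studio client,
+            `original_df` is the dataset the cleanset was computed on, and `cleanset_id`
+            is an existing cleanset ID):
+
+                fixed_df = studio.autofix_dataset(original_df, cleanset_id)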
- def autofix_dataset(self, project_id: str, params: dict = None) -> pd.DataFrame: - """ - Args: - project_id: ID of project. - params: Default parameter dictionary showing number of rows to drop for each issue type. """ - cleanset_id = api.get_latest_cleanset_id(self._api_key, project_id) cleanset_df = self.download_cleanlab_columns(cleanset_id) - original_df = get_original_df() # Studio team if params is None: params = _get_autofix_defaults(cleanset_df) print("Using autofix parameters:", params) From fc4bf7c3038620291b76b0096a1039c2f3c0739d Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Tue, 21 Nov 2023 02:09:35 +0530 Subject: [PATCH 06/32] Update cleanlab_studio/internal/util.py Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab_studio/internal/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 3d03c346..ac6798b0 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -161,7 +161,7 @@ def _update_label_based_on_confidence(row, conf_threshold): pd.Series: The updated row. """ if row["is_label_issue"] and row["suggested_label_confidence_score"] > conf_threshold: - row["is_issue"] = False + row["is_issue"] = False # make sure this does not affect back end. We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled row["label"] = row["suggested_label"] return row From 9f00909e75a1f46e48eec4017b6870d876f07011 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Tue, 21 Nov 2023 02:15:17 +0530 Subject: [PATCH 07/32] linting + doc change --- cleanlab_studio/internal/util.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index ac6798b0..9595bf3f 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -97,8 +97,7 @@ def _get_top_fraction_ids( cleanset_df: pd.DataFrame, name_col: str, num_rows: int, asc=True ) -> List[str]: """ - Extracts the top specified number of rows based on a specified score column from a DataFrame. - + This will only return the IDs of datapoints to drop for a given setting of the num_rows to drop during autofix. Parameters: - cleanset_df (pd.DataFrame): The input DataFrame containing the cleanset. - name_col (str): The name of the column indicating the category for which the top rows should be extracted. @@ -161,7 +160,9 @@ def _update_label_based_on_confidence(row, conf_threshold): pd.Series: The updated row. """ if row["is_label_issue"] and row["suggested_label_confidence_score"] > conf_threshold: - row["is_issue"] = False # make sure this does not affect back end. We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled + row[ + "is_issue" + ] = False # make sure this does not affect back end. 
We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled row["label"] = row["suggested_label"] return row From d2a34321af9c77e8f69ba50685f6f552420708e6 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 22 Nov 2023 22:56:37 +0530 Subject: [PATCH 08/32] set ambiguous to 0 --- cleanlab_studio/internal/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 9595bf3f..7592dd76 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -69,7 +69,7 @@ def check_not_none(x: Any) -> bool: def _get_autofix_default_params() -> dict: """returns default percentage-wise params of autofix""" return { - "ambiguous": 0.2, + "ambiguous": 0.0, "label_issue": 0.5, "near_duplicate": 0.2, "outlier": 0.5, From 6bcec4c22ecdf7eb94a674078b39403abc6aefa1 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 22 Nov 2023 23:30:09 +0530 Subject: [PATCH 09/32] things to port to backend --- cleanlab_studio/internal/util.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 7592dd76..0c02a89e 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -66,7 +66,7 @@ def check_not_none(x: Any) -> bool: return not check_none(x) -def _get_autofix_default_params() -> dict: +def _get_autofix_default_params() -> dict: # Studio team port to backend """returns default percentage-wise params of autofix""" return { "ambiguous": 0.0, @@ -77,7 +77,7 @@ def _get_autofix_default_params() -> dict: } -def _get_autofix_defaults(cleanset_df: pd.DataFrame) -> dict: +def _get_autofix_defaults(cleanset_df: pd.DataFrame) -> dict: # Studio team port to backend """ Generate default values for autofix parameters based on the size of the cleaned dataset. """ @@ -93,7 +93,7 @@ def _get_autofix_defaults(cleanset_df: pd.DataFrame) -> dict: return default_values -def _get_top_fraction_ids( +def _get_top_fraction_ids( # Studio team port to backend cleanset_df: pd.DataFrame, name_col: str, num_rows: int, asc=True ) -> List[str]: """ @@ -149,7 +149,7 @@ def _get_top_fraction_ids( return top_rows_ids -def _update_label_based_on_confidence(row, conf_threshold): +def _update_label_based_on_confidence(row, conf_threshold): # Studio team port to backend """Update the label and is_issue based on confidence threshold if there is a label issue. 
Args: @@ -167,7 +167,7 @@ def _update_label_based_on_confidence(row, conf_threshold): return row -def _apply_autofixed_cleanset_to_new_dataframe( +def _apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: pd.DataFrame ) -> pd.DataFrame: """Apply a cleanset to update original dataaset labels and remove top rows based on specified parameters.""" From cc52ce2dee8e3919efd2e610fa60f1d76b499a6d Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Fri, 1 Dec 2023 18:49:15 +0530 Subject: [PATCH 10/32] Updated code for different strategies --- cleanlab_studio/internal/util.py | 48 +++++++++++++++++++++----------- cleanlab_studio/studio/studio.py | 30 +++++++++++--------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 0c02a89e..c97db08f 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -66,30 +66,46 @@ def check_not_none(x: Any) -> bool: return not check_none(x) -def _get_autofix_default_params() -> dict: # Studio team port to backend +def _get_autofix_default_thresholds(strategy: str) -> dict: # Studio team port to backend """returns default percentage-wise params of autofix""" - return { - "ambiguous": 0.0, - "label_issue": 0.5, - "near_duplicate": 0.2, - "outlier": 0.5, - "confidence_threshold": 0.95, + + strategy_defaults = { + "optimized_training_data": { + "drop_ambiguous": 0.0, + "drop_label_issue": 0.5, + "drop_near_duplicate": 0.2, + "drop_outlier": 0.5, + "relabel_confidence_threshold": 0.95, + }, + "drop_all_issues": { + "drop_ambiguous": 1.0, + "drop_label_issue": 1.5, + "drop_near_duplicate": 1.0, + "drop_outlier": 1.0, + }, + "suggested_actions": { + "drop_near_duplicate": 1.0, + "drop_outlier": 1.0, + "relabel_confidence_threshold": 0.5, + }, } + return strategy_defaults[strategy] -def _get_autofix_defaults(cleanset_df: pd.DataFrame) -> dict: # Studio team port to backend +def get_autofix_defaults( + cleanset_df: pd.DataFrame, strategy +) -> dict: # Studio team port to backend """ Generate default values for autofix parameters based on the size of the cleaned dataset. 
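+
+    Example (hypothetical counts, for illustration only):
+        With strategy="optimized_training_data" and a cleanset flagging 10 label
+        issues, drop_label_issue resolves to math.ceil(10 * 0.5) == 5 rows, while
+        relabel_confidence_threshold is passed through unchanged as 0.95.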
""" - default_params = _get_autofix_default_params() + default_thresholds = _get_autofix_default_thresholds(strategy) default_values = {} - for param_name, param_value in default_params.items(): - if param_name != "confidence_threshold": - num_rows = cleanset_df[f"is_{param_name}"].sum() - default_values[f"drop_{param_name}"] = math.ceil(num_rows * param_value) - else: - default_values[f"drop_{param_name}"] = param_value + for param_type, param_value in default_thresholds.items(): + if param_type.startswith("drop_"): + issue_name = param_type[5:] + num_rows = cleanset_df[f"is_{issue_name}"].sum() + default_values[param_type] = math.ceil(num_rows * param_value) return default_values @@ -167,7 +183,7 @@ def _update_label_based_on_confidence(row, conf_threshold): # Studio team port return row -def _apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend +def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: pd.DataFrame ) -> pd.DataFrame: """Apply a cleanset to update original dataaset labels and remove top rows based on specified parameters.""" diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 4fe20a1e..97e6e9ce 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -1,28 +1,28 @@ """ Python API for Cleanlab Studio. """ -from typing import Any, List, Literal, Optional, Union import warnings +from typing import Any, List, Literal, Optional, Union import numpy as np import numpy.typing as npt import pandas as pd -from . import inference -from . import trustworthy_language_model from cleanlab_studio.errors import CleansetError from cleanlab_studio.internal import clean_helpers, upload_helpers from cleanlab_studio.internal.api import api +from cleanlab_studio.internal.settings import CleanlabSettings +from cleanlab_studio.internal.types import FieldSchemaDict from cleanlab_studio.internal.util import ( - init_dataset_source, + apply_autofixed_cleanset_to_new_dataframe, + _get_autofix_default_thresholds, check_none, check_not_none, - _get_autofix_default_params, - _get_autofix_defaults, - _apply_autofixed_cleanset_to_new_dataframe, + get_autofix_defaults, + init_dataset_source, ) -from cleanlab_studio.internal.settings import CleanlabSettings -from cleanlab_studio.internal.types import FieldSchemaDict + +from . import inference, trustworthy_language_model _pyspark_exists = api.pyspark_exists if _pyspark_exists: @@ -134,7 +134,7 @@ def apply_corrections(self, cleanset_id: str, dataset: Any, keep_excluded: bool label_column = api.get_label_column_of_project(self._api_key, project_id) id_col = api.get_id_column(self._api_key, cleanset_id) if _pyspark_exists and isinstance(dataset, pyspark.sql.DataFrame): - from pyspark.sql.functions import row_number, monotonically_increasing_id, when, col + from pyspark.sql.functions import col, monotonically_increasing_id, row_number, when from pyspark.sql.window import Window cl_cols = self.download_cleanlab_columns( @@ -388,7 +388,11 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) return False def autofix_dataset( - self, original_df: pd.DataFrame, cleanset_id: str, params: dict = None + self, + original_df: pd.DataFrame, + cleanset_id: str, + params: dict = None, + strategy="optimized_training_data", ) -> pd.DataFrame: """ This method returns the auto-fixed dataset. 
@@ -412,6 +416,6 @@ def autofix_dataset( """ cleanset_df = self.download_cleanlab_columns(cleanset_id) if params is None: - params = _get_autofix_defaults(cleanset_df) + params = get_autofix_defaults(cleanset_df, strategy) print("Using autofix parameters:", params) - return _apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params) + return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params) From 62efa2d1f97e176ee1ffa9960667197da408c0b0 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Fri, 1 Dec 2023 19:11:44 +0530 Subject: [PATCH 11/32] Fixed apply method --- cleanlab_studio/internal/util.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index c97db08f..ed4d8f6a 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -110,7 +110,7 @@ def get_autofix_defaults( def _get_top_fraction_ids( # Studio team port to backend - cleanset_df: pd.DataFrame, name_col: str, num_rows: int, asc=True + cleanset_df: pd.DataFrame, issue_name: str, num_rows: int, asc=True ) -> List[str]: """ This will only return the IDs of datapoints to drop for a given setting of the num_rows to drop during autofix. @@ -124,14 +124,14 @@ def _get_top_fraction_ids( # Studio team port to backend Returns: - list: A list of row indices representing the top specified number of rows based on the specified score column. """ - bool_column_name = f"is_{name_col}" + bool_column_name = f"is_{issue_name}" # Construct a filter based on the 'label_issue' variable filter_condition = cleanset_df[bool_column_name] # Create a new DataFrame based on the filter filtered_df = cleanset_df[filter_condition] - if name_col == "near_duplicate": + if issue_name == "near_duplicate": # Group by the 'near_duplicate_cluster_ID' column df_n = filtered_df.sort_values(by="near_duplicate_score").reset_index(drop=True) sorted_df = df_n.head(num_rows) @@ -144,7 +144,7 @@ def _get_top_fraction_ids( # Studio team port to backend for group_name, group_df in grouped_df: # Sort the group DataFrame by the 'near_duplicate_score' column in ascending order sorted_group_df = group_df.sort_values( - by=f"{name_col}_score", ascending=asc + by=f"{issue_name}_score", ascending=asc ).reset_index(drop=True) # Extract every other index and append to the aggregated indices list @@ -154,7 +154,7 @@ def _get_top_fraction_ids( # Studio team port to backend return aggregated_indices else: # Construct the boolean column name with 'is_' prefix and 'label_issue_score' suffix - score_col_name = f"{name_col}_score" + score_col_name = f"{issue_name}_score" # Sort the filtered DataFrame by the constructed boolean column in descending order sorted_df = filtered_df.sort_values(by=score_col_name, ascending=asc) @@ -184,7 +184,7 @@ def _update_label_based_on_confidence(row, conf_threshold): # Studio team port def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend - original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: pd.DataFrame + original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: dict ) -> pd.DataFrame: """Apply a cleanset to update original dataaset labels and remove top rows based on specified parameters.""" original_df_copy = copy.deepcopy(original_df) @@ -193,18 +193,17 @@ def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend merged_df = merged_df.apply( lambda row: _update_label_based_on_confidence( - row, 
conf_threshold=parameters["drop_confidence_threshold"] + row, conf_threshold=parameters["relabel_confidence_threshold"] ), axis=1, ) indices_to_drop = set() - for drop_name, top_num in parameters.items(): - column_name = drop_name.replace("drop_", "") - if column_name == "confidence_threshold": - continue - top_percent_ids = _get_top_fraction_ids(merged_df, column_name, top_num, asc=False) - indices_to_drop.update(top_percent_ids) + for param_name, top_num in parameters.items(): + if param_name.startswith('drop_'): + issue_name = param_name.replace("drop_", "") + top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=False) + indices_to_drop.update(top_percent_ids) merged_df = merged_df.drop(list(indices_to_drop), axis=0).reset_index(drop=True) return merged_df[original_columns] From e5c48720bcc156cbad95b2fdda340020a80440ac Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 07:32:29 +0530 Subject: [PATCH 12/32] Added test for computing rows for exclusion --- cleanlab_studio/internal/util.py | 7 +++-- cleanlab_studio/studio/studio.py | 2 +- tests/test_autofix.py | 48 ++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 tests/test_autofix.py diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index ed4d8f6a..9241535c 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -79,14 +79,14 @@ def _get_autofix_default_thresholds(strategy: str) -> dict: # Studio team port }, "drop_all_issues": { "drop_ambiguous": 1.0, - "drop_label_issue": 1.5, + "drop_label_issue": 1.0, "drop_near_duplicate": 1.0, "drop_outlier": 1.0, }, "suggested_actions": { "drop_near_duplicate": 1.0, "drop_outlier": 1.0, - "relabel_confidence_threshold": 0.5, + "relabel_confidence_threshold": 0.0, }, } return strategy_defaults[strategy] @@ -102,10 +102,13 @@ def get_autofix_defaults( default_values = {} for param_type, param_value in default_thresholds.items(): + # Convert drop fractions to number of rows and leave rest of the parameters as is if param_type.startswith("drop_"): issue_name = param_type[5:] num_rows = cleanset_df[f"is_{issue_name}"].sum() default_values[param_type] = math.ceil(num_rows * param_value) + else: + default_values[param_type] = param_value return default_values diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 97e6e9ce..a83071c0 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -417,5 +417,5 @@ def autofix_dataset( cleanset_df = self.download_cleanlab_columns(cleanset_id) if params is None: params = get_autofix_defaults(cleanset_df, strategy) - print("Using autofix parameters:", params) + print("Using autofix values:", params) return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params) diff --git a/tests/test_autofix.py b/tests/test_autofix.py new file mode 100644 index 00000000..501b1250 --- /dev/null +++ b/tests/test_autofix.py @@ -0,0 +1,48 @@ +import pandas as pd +import pytest +from cleanlab_studio.internal.util import get_autofix_defaults + + +class TestAutofix: + @pytest.mark.parametrize( + "strategy, expected_results", + [ + ( + "optimized_training_data", + { + "drop_ambiguous": 0, + "drop_label_issue": 2, + "drop_near_duplicate": 2, + "drop_outlier": 3, + "relabel_confidence_threshold": 0.95, + }, + ), + ( + "drop_all_issues", + { + "drop_ambiguous": 10, + "drop_label_issue": 3, + "drop_near_duplicate": 6, + "drop_outlier": 6, + }, + ), + ( + 
"suggested_actions", + { + "drop_near_duplicate": 6, + "drop_outlier": 6, + "relabel_confidence_threshold": 0.0, + }, + ), + ], + ids=["optimized_training_data", "drop_all_issues", "suggested_actions"], + ) + def test_get_autofix_defaults(self, strategy, expected_results): + cleanlab_columns = pd.DataFrame() + cleanlab_columns["is_label_issue"] = [True] * 3 + [False] * 7 + cleanlab_columns["is_near_duplicate"] = [True] * 6 + [False] * 4 + cleanlab_columns["is_outlier"] = [True] * 6 + [False] * 4 + cleanlab_columns["is_ambiguous"] = [True] * 10 + + params = get_autofix_defaults(cleanlab_columns, strategy) + assert params == expected_results From 02294c876a14b3b040b5fa6bedde835666eb1a32 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 07:34:50 +0530 Subject: [PATCH 13/32] Improved formatting --- cleanlab_studio/internal/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 9241535c..19957c9d 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -203,7 +203,7 @@ def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend indices_to_drop = set() for param_name, top_num in parameters.items(): - if param_name.startswith('drop_'): + if param_name.startswith("drop_"): issue_name = param_name.replace("drop_", "") top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=False) indices_to_drop.update(top_percent_ids) From 1d644a0c63d75e1c0861ca361e315114c24a6afd Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 07:51:51 +0530 Subject: [PATCH 14/32] Added tests for updating label issue rows based on threshold --- tests/test_autofix.py | 82 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/tests/test_autofix.py b/tests/test_autofix.py index 501b1250..79b455e7 100644 --- a/tests/test_autofix.py +++ b/tests/test_autofix.py @@ -1,6 +1,6 @@ import pandas as pd import pytest -from cleanlab_studio.internal.util import get_autofix_defaults +from cleanlab_studio.internal.util import get_autofix_defaults, _update_label_based_on_confidence class TestAutofix: @@ -46,3 +46,83 @@ def test_get_autofix_defaults(self, strategy, expected_results): params = get_autofix_defaults(cleanlab_columns, strategy) assert params == expected_results + + @pytest.mark.parametrize( + "row, expected_updated_row", + [ + ( + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.6, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.6, + "label": "label_1", + "suggested_label": "label_1", + "is_issue": False, + }, + ), + ( + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.5, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.5, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + ), + ( + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.4, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + { + "is_label_issue": True, + "suggested_label_confidence_score": 0.4, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + ), + ( + { + "is_label_issue": False, + "suggested_label_confidence_score": 0.4, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, 
+ }, + { + "is_label_issue": False, + "suggested_label_confidence_score": 0.4, + "label": "label_0", + "suggested_label": "label_1", + "is_issue": True, + }, + ), + ], + ids=[ + "is a label issue with confidence score greater than threshold", + "is a label issue with confidence score equal to threshold", + "is a label issue with confidence score less than threshold", + "is not a label issue", + ], + ) + def test_update_label_based_on_confidence(self, row, expected_updated_row): + conf_threshold = 0.5 + updated_row = _update_label_based_on_confidence(row, conf_threshold) + assert updated_row == expected_updated_row From 3ff2507ed9cd9047e14472b8c55ed6957ee2e119 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 08:00:27 +0530 Subject: [PATCH 15/32] Fixed mypy issue --- cleanlab_studio/studio/studio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index a83071c0..5b3ce1da 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -2,7 +2,7 @@ Python API for Cleanlab Studio. """ import warnings -from typing import Any, List, Literal, Optional, Union +from typing import Any, List, Literal, Optional, Union, Dict import numpy as np import numpy.typing as npt @@ -391,7 +391,7 @@ def autofix_dataset( self, original_df: pd.DataFrame, cleanset_id: str, - params: dict = None, + params: Optional[Dict[str, Union[int, float]]] = None, strategy="optimized_training_data", ) -> pd.DataFrame: """ From 7235b4079747c69f5759e4e49665c658c1a6e322 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 08:09:48 +0530 Subject: [PATCH 16/32] Added test for checking right rows are dropped for non near duplicate issues --- tests/test_autofix.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_autofix.py b/tests/test_autofix.py index 79b455e7..1f6ccd06 100644 --- a/tests/test_autofix.py +++ b/tests/test_autofix.py @@ -1,6 +1,11 @@ import pandas as pd import pytest -from cleanlab_studio.internal.util import get_autofix_defaults, _update_label_based_on_confidence +from cleanlab_studio.internal.util import ( + get_autofix_defaults, + _update_label_based_on_confidence, + _get_top_fraction_ids, +) +import numpy as np class TestAutofix: @@ -126,3 +131,12 @@ def test_update_label_based_on_confidence(self, row, expected_updated_row): conf_threshold = 0.5 updated_row = _update_label_based_on_confidence(row, conf_threshold) assert updated_row == expected_updated_row + + def test_get_top_fraction_ids(self): + cleanlab_columns = pd.DataFrame() + + cleanlab_columns["cleanlab_row_ID"] = np.arange(10) + cleanlab_columns["is_dummy"] = [False] * 5 + [True] * 5 + cleanlab_columns["dummy_score"] = np.arange(10) * 0.1 + top_ids = _get_top_fraction_ids(cleanlab_columns, "dummy", 3) + assert set(top_ids) == set([5, 6, 7]) From 1b99d602558cdc03a7f0f27ba8456ab8d32d858b Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Sat, 2 Dec 2023 08:28:28 +0530 Subject: [PATCH 17/32] Added test for checking right rows are dropped for near duplicate issues --- tests/test_autofix.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_autofix.py b/tests/test_autofix.py index 1f6ccd06..b5a1ef6f 100644 --- a/tests/test_autofix.py +++ b/tests/test_autofix.py @@ -140,3 +140,14 @@ def test_get_top_fraction_ids(self): cleanlab_columns["dummy_score"] = np.arange(10) * 0.1 top_ids = _get_top_fraction_ids(cleanlab_columns, "dummy", 3) assert set(top_ids) == 
set([5, 6, 7]) + + def test_get_top_fraction_ids_near_duplicate(self): + cleanlab_columns = pd.DataFrame() + + cleanlab_columns["cleanlab_row_ID"] = np.arange(12) + cleanlab_columns["is_near_duplicate"] = [False] * 6 + [True] * 6 + cleanlab_columns["near_duplicate_score"] = np.arange(12) * 0.1 + cleanlab_columns["near_duplicate_cluster_id"] = [None] * 6 + [0, 0, 1, 1, 1, 1] + + top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5) + assert set(top_ids) == set([6, 8, 10]) From 330aa44e595f293f5077066ddfa30a64199c9ea3 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 18:43:28 +0530 Subject: [PATCH 18/32] Added get defaults method --- cleanlab_studio/internal/util.py | 85 +++++++++---------- cleanlab_studio/studio/studio.py | 17 ++-- ...{test_autofix.py => test_autofix_utils.py} | 11 ++- 3 files changed, 58 insertions(+), 55 deletions(-) rename tests/{test_autofix.py => test_autofix_utils.py} (95%) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 19957c9d..0f76b0e2 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -1,5 +1,5 @@ import pathlib -from typing import Any, Optional, TypeVar, Union, List +from typing import Any, Optional, TypeVar, Union, List, Dict import math import copy @@ -27,6 +27,27 @@ DatasetSourceType = TypeVar("DatasetSourceType", bound=dataset_source_types) # type: ignore +# Studio team port to backend +AUTOFIX_DEFAULTS = { + "optimized_training_data": { + "drop_ambiguous": 0.0, + "drop_label_issue": 0.5, + "drop_near_duplicate": 0.2, + "drop_outlier": 0.5, + "relabel_confidence_threshold": 0.95, + }, + "drop_all_issues": { + "drop_ambiguous": 1.0, + "drop_label_issue": 1.0, + "drop_near_duplicate": 1.0, + "drop_outlier": 1.0, + }, + "suggested_actions": { + "drop_near_duplicate": 1.0, + "drop_outlier": 1.0, + "relabel_confidence_threshold": 0.0, + }, +} def init_dataset_source( dataset_source: DatasetSourceType, dataset_name: Optional[str] = None @@ -66,51 +87,22 @@ def check_not_none(x: Any) -> bool: return not check_none(x) -def _get_autofix_default_thresholds(strategy: str) -> dict: # Studio team port to backend - """returns default percentage-wise params of autofix""" - - strategy_defaults = { - "optimized_training_data": { - "drop_ambiguous": 0.0, - "drop_label_issue": 0.5, - "drop_near_duplicate": 0.2, - "drop_outlier": 0.5, - "relabel_confidence_threshold": 0.95, - }, - "drop_all_issues": { - "drop_ambiguous": 1.0, - "drop_label_issue": 1.0, - "drop_near_duplicate": 1.0, - "drop_outlier": 1.0, - }, - "suggested_actions": { - "drop_near_duplicate": 1.0, - "drop_outlier": 1.0, - "relabel_confidence_threshold": 0.0, - }, - } - return strategy_defaults[strategy] - - -def get_autofix_defaults( - cleanset_df: pd.DataFrame, strategy -) -> dict: # Studio team port to backend - """ - Generate default values for autofix parameters based on the size of the cleaned dataset. 
- """ - default_thresholds = _get_autofix_default_thresholds(strategy) - default_values = {} +# Studio team port to backend +def get_autofix_defaults_for_strategy(strategy): + return AUTOFIX_DEFAULTS[strategy] - for param_type, param_value in default_thresholds.items(): +def get_param_values(cleanset_df, params, strategy): + thresholds = get_autofix_defaults_for_strategy(strategy) if params is None else params + param_values = {} + for param_type, param_value in thresholds.items(): # Convert drop fractions to number of rows and leave rest of the parameters as is if param_type.startswith("drop_"): issue_name = param_type[5:] num_rows = cleanset_df[f"is_{issue_name}"].sum() - default_values[param_type] = math.ceil(num_rows * param_value) + param_values[param_type] = math.ceil(num_rows * param_value) else: - default_values[param_type] = param_value - return default_values - + param_values[param_type] = param_value + return param_values def _get_top_fraction_ids( # Studio team port to backend cleanset_df: pd.DataFrame, issue_name: str, num_rows: int, asc=True @@ -201,12 +193,17 @@ def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend axis=1, ) + indices_to_drop = _get_indices_to_drop(merged_df, parameters) + + merged_df = merged_df.drop(indices_to_drop, axis=0).reset_index(drop=True) + return merged_df[original_columns] + + +def _get_indices_to_drop(merged_df, parameters): indices_to_drop = set() for param_name, top_num in parameters.items(): if param_name.startswith("drop_"): issue_name = param_name.replace("drop_", "") - top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=False) + top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=True) indices_to_drop.update(top_percent_ids) - - merged_df = merged_df.drop(list(indices_to_drop), axis=0).reset_index(drop=True) - return merged_df[original_columns] + return list(indices_to_drop) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 5b3ce1da..388530de 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -15,10 +15,8 @@ from cleanlab_studio.internal.types import FieldSchemaDict from cleanlab_studio.internal.util import ( apply_autofixed_cleanset_to_new_dataframe, - _get_autofix_default_thresholds, - check_none, - check_not_none, - get_autofix_defaults, + get_autofix_defaults_for_strategy, + get_param_values, init_dataset_source, ) @@ -415,7 +413,10 @@ def autofix_dataset( """ cleanset_df = self.download_cleanlab_columns(cleanset_id) - if params is None: - params = get_autofix_defaults(cleanset_df, strategy) - print("Using autofix values:", params) - return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params) + if params is not None and strategy is not None: + raise ValueError("Please provide only of params or strategy for autofix") + param_values = get_param_values(cleanset_df, params, strategy) + return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, param_values) + + def get_autofix_defaults(self, strategy="optimized_training_data"): + return get_autofix_defaults_for_strategy(strategy) diff --git a/tests/test_autofix.py b/tests/test_autofix_utils.py similarity index 95% rename from tests/test_autofix.py rename to tests/test_autofix_utils.py index b5a1ef6f..e5526719 100644 --- a/tests/test_autofix.py +++ b/tests/test_autofix_utils.py @@ -1,9 +1,10 @@ import pandas as pd import pytest from cleanlab_studio.internal.util import ( - get_autofix_defaults, + 
get_param_values, _update_label_based_on_confidence, _get_top_fraction_ids, + _get_indices_to_drop ) import numpy as np @@ -42,14 +43,14 @@ class TestAutofix: ], ids=["optimized_training_data", "drop_all_issues", "suggested_actions"], ) - def test_get_autofix_defaults(self, strategy, expected_results): + def test_get_param_values(self, strategy, expected_results): cleanlab_columns = pd.DataFrame() cleanlab_columns["is_label_issue"] = [True] * 3 + [False] * 7 cleanlab_columns["is_near_duplicate"] = [True] * 6 + [False] * 4 cleanlab_columns["is_outlier"] = [True] * 6 + [False] * 4 cleanlab_columns["is_ambiguous"] = [True] * 10 - params = get_autofix_defaults(cleanlab_columns, strategy) + params = get_param_values(cleanlab_columns, None, strategy) assert params == expected_results @pytest.mark.parametrize( @@ -151,3 +152,7 @@ def test_get_top_fraction_ids_near_duplicate(self): top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5) assert set(top_ids) == set([6, 8, 10]) + + + def test_get_indices_to_drop(self): + pass \ No newline at end of file From a19c88c34ccc78422014ccc91392834e2ab9a823 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 20:21:39 +0530 Subject: [PATCH 19/32] Return cleanset with original indices --- cleanlab_studio/internal/util.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 0f76b0e2..52e7e036 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -49,6 +49,7 @@ }, } + def init_dataset_source( dataset_source: DatasetSourceType, dataset_name: Optional[str] = None ) -> DatasetSource: @@ -91,6 +92,7 @@ def check_not_none(x: Any) -> bool: def get_autofix_defaults_for_strategy(strategy): return AUTOFIX_DEFAULTS[strategy] + def get_param_values(cleanset_df, params, strategy): thresholds = get_autofix_defaults_for_strategy(strategy) if params is None else params param_values = {} @@ -104,6 +106,7 @@ def get_param_values(cleanset_df, params, strategy): param_values[param_type] = param_value return param_values + def _get_top_fraction_ids( # Studio team port to backend cleanset_df: pd.DataFrame, issue_name: str, num_rows: int, asc=True ) -> List[str]: @@ -171,9 +174,9 @@ def _update_label_based_on_confidence(row, conf_threshold): # Studio team port pd.Series: The updated row. """ if row["is_label_issue"] and row["suggested_label_confidence_score"] > conf_threshold: - row[ - "is_issue" - ] = False # make sure this does not affect back end. We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled + # make sure this does not affect back end. 
We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled + row["is_issue"] = False + row["is_label_issue"] = False row["label"] = row["suggested_label"] return row @@ -195,7 +198,7 @@ def apply_autofixed_cleanset_to_new_dataframe( # Studio team port to backend indices_to_drop = _get_indices_to_drop(merged_df, parameters) - merged_df = merged_df.drop(indices_to_drop, axis=0).reset_index(drop=True) + merged_df = merged_df.drop(indices_to_drop, axis=0) return merged_df[original_columns] From 19143a3752aaf4f75cc67a2c17d5b78387c0eaba Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 20:36:22 +0530 Subject: [PATCH 20/32] Removed unimplemented test --- tests/test_autofix_utils.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index e5526719..de5e89da 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -151,8 +151,4 @@ def test_get_top_fraction_ids_near_duplicate(self): cleanlab_columns["near_duplicate_cluster_id"] = [None] * 6 + [0, 0, 1, 1, 1, 1] top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5) - assert set(top_ids) == set([6, 8, 10]) - - - def test_get_indices_to_drop(self): - pass \ No newline at end of file + assert set(top_ids) == set([6, 8, 10]) \ No newline at end of file From e5b97f51bd50ee9d35354c7964bcb5f64a37c967 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 20:40:59 +0530 Subject: [PATCH 21/32] removed unncessary merge change --- cleanlab_studio/studio/studio.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 70ea83ac..d02f180a 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -8,6 +8,8 @@ import numpy.typing as npt import pandas as pd +from . import inference +from . import trustworthy_language_model from cleanlab_studio.errors import CleansetError from cleanlab_studio.internal import clean_helpers, upload_helpers from cleanlab_studio.internal.api import api @@ -23,7 +25,6 @@ from cleanlab_studio.internal.settings import CleanlabSettings from cleanlab_studio.internal.types import FieldSchemaDict -from . 
import inference, trustworthy_language_model _snowflake_exists = api.snowflake_exists if _snowflake_exists: From 20a532c5e221edc49f2e036ec4c24abfb03b88c0 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 20:43:45 +0530 Subject: [PATCH 22/32] Fixed tests --- tests/test_autofix_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index de5e89da..9b7928c6 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -65,7 +65,7 @@ def test_get_param_values(self, strategy, expected_results): "is_issue": True, }, { - "is_label_issue": True, + "is_label_issue": False, "suggested_label_confidence_score": 0.6, "label": "label_1", "suggested_label": "label_1", From 3bbfc1ca4324ffcb07ef0c2b1cb355e0b855e9c2 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 20:50:53 +0530 Subject: [PATCH 23/32] Fixed mypy error --- cleanlab_studio/studio/studio.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index d02f180a..67e69ce9 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -144,19 +144,19 @@ def apply_corrections(self, cleanset_id: str, dataset: Any, keep_excluded: bool cl_cols = self.download_cleanlab_columns( cleanset_id, to_spark=False, include_project_details=True ) - corrected_ds: snowpark.DataFrame = apply_corrections_snowpark_df( + snowflake_corrected_ds: snowpark.DataFrame = apply_corrections_snowpark_df( dataset, cl_cols, id_col, label_col, keep_excluded ) - return corrected_ds + return snowflake_corrected_ds elif _pyspark_exists and isinstance(dataset, pyspark.sql.DataFrame): cl_cols = self.download_cleanlab_columns( cleanset_id, to_spark=True, include_project_details=True ) - corrected_ds: pyspark.sql.DataFrame = apply_corrections_spark_df( + pyspark_corrected_ds: pyspark.sql.DataFrame = apply_corrections_spark_df( dataset, cl_cols, id_col, label_col, keep_excluded ) - return corrected_ds + return pyspark_corrected_ds elif isinstance(dataset, pd.DataFrame): cl_cols = self.download_cleanlab_columns(cleanset_id, include_project_details=True) From b892e87dbeb9bc01e5ebb35e8d8bbf676fe2fad1 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 22:07:04 +0530 Subject: [PATCH 24/32] Added newline --- tests/test_autofix_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index 9b7928c6..8bee3736 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -151,4 +151,4 @@ def test_get_top_fraction_ids_near_duplicate(self): cleanlab_columns["near_duplicate_cluster_id"] = [None] * 6 + [0, 0, 1, 1, 1, 1] top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5) - assert set(top_ids) == set([6, 8, 10]) \ No newline at end of file + assert set(top_ids) == set([6, 8, 10]) From b54a0a7679994e53ba0fa574a9ff7234623bb2a9 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Tue, 5 Dec 2023 22:15:52 +0530 Subject: [PATCH 25/32] Fixed formatting --- tests/test_autofix_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index 8bee3736..e642c0ec 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -4,7 +4,7 @@ get_param_values, _update_label_based_on_confidence, _get_top_fraction_ids, - _get_indices_to_drop + _get_indices_to_drop, ) import 
numpy as np From f870e04c555f1a087eb51150ce6122f9827f80c0 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Wed, 6 Dec 2023 18:18:44 +0530 Subject: [PATCH 26/32] added tests for dropped indices --- cleanlab_studio/internal/util.py | 2 +- tests/test_autofix_utils.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 8da09467..71969cc8 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -340,7 +340,7 @@ def _get_indices_to_drop(merged_df, parameters): for param_name, top_num in parameters.items(): if param_name.startswith("drop_"): issue_name = param_name.replace("drop_", "") - top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=True) + top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=False) indices_to_drop.update(top_percent_ids) return list(indices_to_drop) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index e642c0ec..0c2bdd0a 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -152,3 +152,23 @@ def test_get_top_fraction_ids_near_duplicate(self): top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5) assert set(top_ids) == set([6, 8, 10]) + + def test_get_indices_to_drop(self): + cleanlab_columns = pd.DataFrame() + cleanlab_columns['cleanlab_row_ID'] = np.arange(10) + cleanlab_columns["is_issue1"] = [True] * 2 + [False] * 8 + cleanlab_columns["issue1_score"] = [1.0, 0.9] + [0] * 8 + cleanlab_columns["is_issue2"] = [False] * 2 + [True] * 4 + [False] * 4 + cleanlab_columns["issue2_score"] = [0] * 2 + [1.0, 0.9, 0.8, 0.7] + [0] * 4 + cleanlab_columns["is_issue3"] = [False] * 4 + [True] * 3 + [False] * 3 + cleanlab_columns["issue3_score"] = [0] * 4 + [1.0, 0.9, 0.8] + [0] * 3 + + params = { + "drop_issue1": 1, + "drop_issue2": 3, + "drop_issue3": 2, + } + expected_indices = [0, 2, 3, 4, 5] + + indices = _get_indices_to_drop(cleanlab_columns, params) + assert set(indices) == set(expected_indices) From eb106d13c94ec578a6ff02b364863b2baa0f7ebf Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Wed, 6 Dec 2023 18:26:29 +0530 Subject: [PATCH 27/32] Added docs for user facing method s --- cleanlab_studio/studio/studio.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 67e69ce9..732388ca 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -374,16 +374,18 @@ def autofix_dataset( Args: cleanset_id (str): ID of cleanset. params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and - number of rows to drop for each issue type. If not provided, default values will be used. + fraction of rows to drop for each issue type. If not provided, default values will be used. Example: { - 'drop_ambiguous': 9, - 'drop_label_issue': 92, - 'drop_near_duplicate': 1, - 'drop_outlier': 3, - 'drop_confidence_threshold': 0.95 + 'drop_ambiguous': 0.0, + 'drop_label_issue': 0.5, + 'drop_near_duplicate': 0.5, + 'drop_outlier': 0.2, + 'relabel_confidence_threshold': 0.95 } + strategy (str): Auto-fixing strategy to use, + Possible strategies: optimized_training_data, drop_all_issues, suggested_actions Returns: pd.DataFrame: A new dataframe after applying auto-fixes to the cleanset. 
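+
+            Usage (a minimal sketch; assumes `studio` is an initialized Studio client
+            and `original_df` is the dataset this cleanset was computed on):
+
+                fixed_df = studio.autofix_dataset(original_df, cleanset_id, strategy="drop_all_issues")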
@@ -395,5 +397,16 @@ def autofix_dataset(
         param_values = get_param_values(cleanset_df, params, strategy)
         return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, param_values)

-    def get_autofix_defaults(self, strategy="optimized_training_data"):
+    def get_autofix_defaults(self, strategy="optimized_training_data")-> Dict[str, float]:
+        """
+        This method returns the default params for the auto-fixed dataset.
+        Args:
+            strategy (str): Auto-fixing strategy
+                Possible strategies: optimized_training_data, drop_all_issues, suggested_actions
+
+        Returns:
+            dict[str, float]: parameter dictionary containing confidence threshold for auto-relabelling, and
+                fraction of rows to drop for each issue type.
+        """
         return get_autofix_defaults_for_strategy(strategy)
+

From a7acfa62cc93451c6556f4f99a9bb0eb3b3ff0e8 Mon Sep 17 00:00:00 2001
From: Sanjana Garg
Date: Wed, 6 Dec 2023 18:27:17 +0530
Subject: [PATCH 28/32] Black formatting

---
 cleanlab_studio/studio/studio.py | 3 +--
 tests/test_autofix_utils.py      | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py
index 732388ca..18086f24 100644
--- a/cleanlab_studio/studio/studio.py
+++ b/cleanlab_studio/studio/studio.py
@@ -397,7 +397,7 @@ def autofix_dataset(
         param_values = get_param_values(cleanset_df, params, strategy)
         return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, param_values)

-    def get_autofix_defaults(self, strategy="optimized_training_data")-> Dict[str, float]:
+    def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str, float]:
         """
         This method returns the default params for the auto-fixed dataset.
         Args:
@@ -409,4 +409,3 @@ def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str, f
                 fraction of rows to drop for each issue type.
""" return get_autofix_defaults_for_strategy(strategy) - diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index 0c2bdd0a..b6357b08 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -155,7 +155,7 @@ def test_get_top_fraction_ids_near_duplicate(self): def test_get_indices_to_drop(self): cleanlab_columns = pd.DataFrame() - cleanlab_columns['cleanlab_row_ID'] = np.arange(10) + cleanlab_columns["cleanlab_row_ID"] = np.arange(10) cleanlab_columns["is_issue1"] = [True] * 2 + [False] * 8 cleanlab_columns["issue1_score"] = [1.0, 0.9] + [0] * 8 cleanlab_columns["is_issue2"] = [False] * 2 + [True] * 4 + [False] * 4 From 692efe4d880f2bc29b2ccc5c9f268a47b4f10cfc Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 13 Dec 2023 22:19:45 +0530 Subject: [PATCH 29/32] merge main --- cleanlab_studio/studio/studio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 387db76c..b2806d69 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -144,10 +144,10 @@ def apply_corrections(self, cleanset_id: str, dataset: Any, keep_excluded: bool cl_cols = self.download_cleanlab_columns( cleanset_id, to_spark=False, include_project_details=True ) - snowflake_corrected_ds: snowpark.DataFrame = apply_corrections_snowpark_df( + corrected_ds: snowpark.DataFrame = apply_corrections_snowpark_df( dataset, cl_cols, id_col, label_col, keep_excluded ) - return snowflake_corrected_ds + return corrected_ds elif _pyspark_exists and isinstance(dataset, pyspark.sql.DataFrame): cl_cols = self.download_cleanlab_columns( From afbe4a9bf8fba1c9eee8963bef042a57ef202090 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 13 Dec 2023 22:32:05 +0530 Subject: [PATCH 30/32] add github change request --- cleanlab_studio/internal/util.py | 6 +++--- cleanlab_studio/studio/studio.py | 24 +++++++++++++++++------- tests/test_autofix_utils.py | 4 ++-- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py index 71969cc8..280171a1 100644 --- a/cleanlab_studio/internal/util.py +++ b/cleanlab_studio/internal/util.py @@ -222,12 +222,12 @@ def check_not_none(x: Any) -> bool: # Studio team port to backend -def get_autofix_defaults_for_strategy(strategy): +def _get_autofix_defaults_for_strategy(strategy): return AUTOFIX_DEFAULTS[strategy] -def get_param_values(cleanset_df, params, strategy): - thresholds = get_autofix_defaults_for_strategy(strategy) if params is None else params +def _get_param_values(cleanset_df, params, strategy): + thresholds = _get_autofix_defaults_for_strategy(strategy) if params is None else params param_values = {} for param_type, param_value in thresholds.items(): # Convert drop fractions to number of rows and leave rest of the parameters as is diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index b2806d69..1dade64b 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -19,8 +19,8 @@ apply_corrections_spark_df, apply_corrections_pd_df, apply_autofixed_cleanset_to_new_dataframe, - get_autofix_defaults_for_strategy, - get_param_values, + _get_autofix_defaults_for_strategy, + _get_param_values, ) from cleanlab_studio.internal.settings import CleanlabSettings from cleanlab_studio.internal.types import FieldSchemaDict @@ -370,13 +370,20 @@ def autofix_dataset( strategy="optimized_training_data", ) -> 
pd.DataFrame: """ - This method returns the auto-fixed dataset. + This method returns the auto-fixed dataset. It works for text or tabular dataset only. Args: cleanset_id (str): ID of cleanset. + original_df (pd.DataFrame): The original dataset in DataFrame format. params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and fraction of rows to drop for each issue type. If not provided, default values will be used. - - Example: + This dictionary includes the following options: + + * drop_ambiguous (float): Fraction of rows to drop when encountering ambiguous data. Default is 0.0 (no rows dropped). + * drop_label_issue (float): Fraction of rows to drop when facing label-related issues. Default is 0.5 (50% of rows dropped). + * drop_near_duplicate (float): Fraction of rows to drop for near-duplicate data. Default is 0.5 (50% of rows dropped). + * drop_outlier (float): Fraction of rows to drop for outlier data. Default is 0.2 (20% of rows dropped). + * relabel_confidence_threshold (float): Confidence threshold for auto-relabelling. Default is 0.95. + For example, the default values are: { 'drop_ambiguous': 0.0, 'drop_label_issue': 0.5, @@ -384,6 +391,9 @@ def autofix_dataset( 'drop_outlier': 0.2, 'relabel_confidence_threshold': 0.95 } + + Specify values in params to customize the behavior for specific scenarios. If params are provided, the values in params take precedence over default ones. + strategy (str): Auto-fixing strategy to use, Possible strategies: optimized_training_data, drop_all_issues, suggested_actions @@ -394,7 +404,7 @@ def autofix_dataset( cleanset_df = self.download_cleanlab_columns(cleanset_id) if params is not None and strategy is not None: raise ValueError("Please provide only of params or strategy for autofix") - param_values = get_param_values(cleanset_df, params, strategy) + param_values = _get_param_values(cleanset_df, params, strategy) return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, param_values) def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str, float]: @@ -408,4 +418,4 @@ def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str, dict[str, float]: parameter dictionary containing confidence threshold for auto-relabelling, and fraction of rows to drop for each issue type. 
""" - return get_autofix_defaults_for_strategy(strategy) + return _get_autofix_defaults_for_strategy(strategy) diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py index b6357b08..69fa04ba 100644 --- a/tests/test_autofix_utils.py +++ b/tests/test_autofix_utils.py @@ -1,7 +1,7 @@ import pandas as pd import pytest from cleanlab_studio.internal.util import ( - get_param_values, + _get_param_values, _update_label_based_on_confidence, _get_top_fraction_ids, _get_indices_to_drop, @@ -50,7 +50,7 @@ def test_get_param_values(self, strategy, expected_results): cleanlab_columns["is_outlier"] = [True] * 6 + [False] * 4 cleanlab_columns["is_ambiguous"] = [True] * 10 - params = get_param_values(cleanlab_columns, None, strategy) + params = _get_param_values(cleanlab_columns, None, strategy) assert params == expected_results @pytest.mark.parametrize( From 7b96faa57a4d9b512044dbdc8d7e1ebcbcab6ed4 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Mon, 18 Dec 2023 17:01:14 +0530 Subject: [PATCH 31/32] Update cleanlab_studio/studio/studio.py Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab_studio/studio/studio.py | 43 +++++++++++++------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 1dade64b..30db37a9 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -370,32 +370,25 @@ def autofix_dataset( strategy="optimized_training_data", ) -> pd.DataFrame: """ - This method returns the auto-fixed dataset. It works for text or tabular dataset only. + Improves a dataset by applying automatically-suggested corrections based on issues detected by Cleanlab. Args: - cleanset_id (str): ID of cleanset. - original_df (pd.DataFrame): The original dataset in DataFrame format. - params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and - fraction of rows to drop for each issue type. If not provided, default values will be used. - This dictionary includes the following options: - - * drop_ambiguous (float): Fraction of rows to drop when encountering ambiguous data. Default is 0.0 (no rows dropped). - * drop_label_issue (float): Fraction of rows to drop when facing label-related issues. Default is 0.5 (50% of rows dropped). - * drop_near_duplicate (float): Fraction of rows to drop for near-duplicate data. Default is 0.5 (50% of rows dropped). - * drop_outlier (float): Fraction of rows to drop for outlier data. Default is 0.2 (20% of rows dropped). - * relabel_confidence_threshold (float): Confidence threshold for auto-relabelling. Default is 0.95. - For example, the default values are: - { - 'drop_ambiguous': 0.0, - 'drop_label_issue': 0.5, - 'drop_near_duplicate': 0.5, - 'drop_outlier': 0.2, - 'relabel_confidence_threshold': 0.95 - } - - Specify values in params to customize the behavior for specific scenarios. If params are provided, the values in params take precedence over default ones. - - strategy (str): Auto-fixing strategy to use, - Possible strategies: optimized_training_data, drop_all_issues, suggested_actions + cleanset_id (str): ID of the cleanset from the Project for this Dataset. + original_df (pd.DataFrame): The original dataset (must be a DataFrame, so only text and tabular datasets are currently supported). 
+            params (dict, optional): Optional parameters to control how many data points from each type of detected data issue are auto-corrected or filtered (prioritizing the more severe instances of each issue). If not provided, default `params` values will be used.
+                The `params` dictionary includes the following options:
+
+                * drop_ambiguous (float): Fraction of the data points detected as ambiguous to exclude from the dataset.
+                * drop_label_issue (float): Fraction of the data points with label issues to exclude from the dataset.
+                * drop_near_duplicate (float): Fraction of the data points detected as near duplicates to exclude from the dataset.
+                * drop_outlier (float): Fraction of the data points detected as outliers to exclude from the dataset.
+                * relabel_confidence_threshold (float): Confidence threshold for the suggested label, data points with label issues that also exceed this threshold are re-labeled as the suggested label.
+
+            strategy (str): What strategy to use for auto-fixing the dataset out of the following possibilities: 
+                ['optimized_training_data', 'drop_all_issues', 'suggested_actions'].
+                Each of these possibilities corresponds to a default setting of the `params` dictionary, designed to be used in different scenarios.
+                If specified, the `params` argument will override this argument. Specify 'optimized_training_data' when your goal is to auto-fix training data to achieve the best ML performance on randomly split test data.
+                Specify 'drop_all_issues' to instead exclude all datapoints detected to have issues from the dataset.
+                Specify 'suggested_actions' to instead apply the suggested action to each data point that is displayed in the Cleanlab Studio Web Application (e.g. relabeling for label issues, dropping for outliers, etc).

         Returns:
             pd.DataFrame: A new dataframe after applying auto-fixes to the cleanset.

From b31674ce9540224264b60f77b0118b8b4661b75f Mon Sep 17 00:00:00 2001
From: Aditya Thyagarajan
Date: Mon, 18 Dec 2023 17:03:54 +0530
Subject: [PATCH 32/32] linting

---
 cleanlab_studio/studio/studio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py
index 30db37a9..5d13c769 100644
--- a/cleanlab_studio/studio/studio.py
+++ b/cleanlab_studio/studio/studio.py
@@ -383,7 +383,7 @@ def autofix_dataset(
             * drop_outlier (float): Fraction of the data points detected as outliers to exclude from the dataset.
             * relabel_confidence_threshold (float): Confidence threshold for the suggested label, data points with label issues that also exceed this threshold are re-labeled as the suggested label.

-            strategy (str): What strategy to use for auto-fixing the dataset out of the following possibilities: 
+            strategy (str): What strategy to use for auto-fixing the dataset out of the following possibilities:
                 ['optimized_training_data', 'drop_all_issues', 'suggested_actions'].
                 Each of these possibilities corresponds to a default setting of the `params` dictionary, designed to be used in different scenarios.
                 If specified, the `params` argument will override this argument. Specify 'optimized_training_data' when your goal is to auto-fix training data to achieve the best ML performance on randomly split test data.
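After patch 32, Studio.autofix_dataset and Studio.get_autofix_defaults are the user-facing entry points this series adds. A minimal usage sketch based only on the docstrings in the patches above: the API key, cleanset ID, and CSV path are placeholders, keyword arguments are used because the full method signature is not visible in these hunks, and `from cleanlab_studio import Studio` assumes the package's usual top-level export.

import pandas as pd
from cleanlab_studio import Studio  # assumed top-level export

studio = Studio("<YOUR_API_KEY>")          # placeholder API key
original_df = pd.read_csv("dataset.csv")   # placeholder text/tabular dataset

# Inspect the default params associated with a strategy before applying anything.
defaults = studio.get_autofix_defaults(strategy="optimized_training_data")
print(defaults)  # e.g. {'drop_ambiguous': 0.0, 'drop_label_issue': 0.5, ...}

# Apply a named strategy...
fixed_df = studio.autofix_dataset(
    cleanset_id="<CLEANSET_ID>",  # placeholder cleanset ID
    original_df=original_df,
    strategy="drop_all_issues",
)

# ...or pass explicit params instead. Patch 30 raises a ValueError when both
# params and strategy are non-None, so strategy is set to None here.
fixed_df = studio.autofix_dataset(
    cleanset_id="<CLEANSET_ID>",
    original_df=original_df,
    params={
        "drop_ambiguous": 0.0,
        "drop_label_issue": 0.5,
        "drop_near_duplicate": 0.5,
        "drop_outlier": 0.2,
        "relabel_confidence_threshold": 0.95,
    },
    strategy=None,
)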