Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improved autofix strategy #148

Open
wants to merge 34 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
4615637
make pull request
aditya1503 Nov 16, 2023
2a7cf91
cleaned skeleton code
aditya1503 Nov 17, 2023
e7a3d07
cleanup
aditya1503 Nov 17, 2023
72fc919
add type hinting
aditya1503 Nov 17, 2023
d67bbc3
address PR comments
aditya1503 Nov 18, 2023
fc4bf7c
Update cleanlab_studio/internal/util.py
aditya1503 Nov 20, 2023
9f00909
linting + doc change
aditya1503 Nov 20, 2023
d2a3432
set ambiguous to 0
aditya1503 Nov 22, 2023
6bcec4c
things to port to backend
aditya1503 Nov 22, 2023
cc52ce2
Updated code for different strategies
sanjanag Dec 1, 2023
62efa2d
Fixed apply method
sanjanag Dec 1, 2023
e5c4872
Added test for computing rows for exclusion
sanjanag Dec 2, 2023
02294c8
Improved formatting
sanjanag Dec 2, 2023
1d644a0
Added tests for updating label issue rows based on threshold
sanjanag Dec 2, 2023
3ff2507
Fixed mypy issue
sanjanag Dec 2, 2023
7235b40
Added test for checking right rows are dropped for non near duplicate…
sanjanag Dec 2, 2023
1b99d60
Added test for checking right rows are dropped for near duplicate issues
sanjanag Dec 2, 2023
330aa44
Added get defaults method
sanjanag Dec 5, 2023
a19c88c
Return cleanset with original indices
sanjanag Dec 5, 2023
69ccda6
Merge branch 'main' into improve_autofix
sanjanag Dec 5, 2023
19143a3
Removed unimplemented test
sanjanag Dec 5, 2023
e5b97f5
removed unnecessary merge change
sanjanag Dec 5, 2023
20a532c
Fixed tests
sanjanag Dec 5, 2023
3bbfc1c
Fixed mypy error
sanjanag Dec 5, 2023
b892e87
Added newline
sanjanag Dec 5, 2023
b54a0a7
Fixed formatting
sanjanag Dec 5, 2023
f870e04
added tests for dropped indices
sanjanag Dec 6, 2023
eb106d1
Added docs for user facing method
sanjanag Dec 6, 2023
a7acfa6
Black formatting
sanjanag Dec 6, 2023
1f0344d
Merge remote-tracking branch 'origin/main' into improve_autofix
aditya1503 Dec 13, 2023
692efe4
merge main
aditya1503 Dec 13, 2023
afbe4a9
add github change request
aditya1503 Dec 13, 2023
7b96faa
Update cleanlab_studio/studio/studio.py
aditya1503 Dec 18, 2023
b31674c
linting
aditya1503 Dec 18, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 149 additions & 2 deletions cleanlab_studio/internal/util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import pathlib
from typing import Any, Optional, TypeVar, Union
from typing import Any, Optional, TypeVar, Union, List
import math

import numpy as np
import copy

import pandas as pd

try:
Expand Down Expand Up @@ -63,3 +64,149 @@ def check_none(x: Any) -> bool:

def check_not_none(x: Any) -> bool:
    """Return True when *x* is not a null-like value (logical inverse of check_none)."""
    is_null = check_none(x)
    return not is_null


def _get_autofix_default_thresholds(strategy: str) -> dict: # Studio team port to backend
"""returns default percentage-wise params of autofix"""

strategy_defaults = {
"optimized_training_data": {
"drop_ambiguous": 0.0,
"drop_label_issue": 0.5,
"drop_near_duplicate": 0.2,
"drop_outlier": 0.5,
"relabel_confidence_threshold": 0.95,
},
"drop_all_issues": {
"drop_ambiguous": 1.0,
"drop_label_issue": 1.0,
"drop_near_duplicate": 1.0,
"drop_outlier": 1.0,
},
"suggested_actions": {
"drop_near_duplicate": 1.0,
"drop_outlier": 1.0,
"relabel_confidence_threshold": 0.0,
},
}
return strategy_defaults[strategy]


def get_autofix_defaults(
    cleanset_df: pd.DataFrame, strategy: str
) -> dict:  # Studio team port to backend
    """
    Generate default values for autofix parameters based on the size of the cleaned dataset.

    Args:
        cleanset_df (pd.DataFrame): Cleanset containing boolean ``is_<issue>``
            columns (e.g. ``is_label_issue``) for each issue type.
        strategy (str): Name of a default strategy understood by
            ``_get_autofix_default_thresholds``.

    Returns:
        dict: Parameter values where each ``drop_*`` fraction has been
        converted into an absolute number of rows (rounded up); all other
        parameters are passed through unchanged.
    """
    default_thresholds = _get_autofix_default_thresholds(strategy)
    default_values = {}

    for param_type, param_value in default_thresholds.items():
        if param_type.startswith("drop_"):
            # Convert the drop fraction into a row count for this issue type,
            # rounding up so any non-zero fraction drops at least one flagged row.
            issue_name = param_type[5:]
            num_rows = cleanset_df[f"is_{issue_name}"].sum()
            default_values[param_type] = math.ceil(num_rows * param_value)
        else:
            default_values[param_type] = param_value
    return default_values


def _get_top_fraction_ids( # Studio team port to backend
cleanset_df: pd.DataFrame, issue_name: str, num_rows: int, asc=True
) -> List[str]:
"""
This will only return the IDs of datapoints to drop for a given setting of the num_rows to drop during autofix.
Parameters:
- cleanset_df (pd.DataFrame): The input DataFrame containing the cleanset.
- name_col (str): The name of the column indicating the category for which the top rows should be extracted.
- num_rows (int): The number of rows to be extracted.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In autofix, we can simply multiply the fraction of issues that are the cleanset defaults by the number of datapoints to get this.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right when we spoke originally, we wanted this call to be similar to the Studio web interface call, hence I rewrote it this way, it was floating percentage before.
the function _get_autofix_defaults does the multiplication by number of datapoints

- asc (bool, optional): If True, the rows are sorted in ascending order based on the score column; if False, in descending order.
Default is True.

Returns:
- list: A list of row indices representing the top specified number of rows based on the specified score column.
"""
bool_column_name = f"is_{issue_name}"

# Construct a filter based on the 'label_issue' variable
filter_condition = cleanset_df[bool_column_name]

# Create a new DataFrame based on the filter
filtered_df = cleanset_df[filter_condition]
if issue_name == "near_duplicate":
# Group by the 'near_duplicate_cluster_ID' column
df_n = filtered_df.sort_values(by="near_duplicate_score").reset_index(drop=True)
sorted_df = df_n.head(num_rows)
grouped_df = sorted_df.groupby("near_duplicate_cluster_id")

# Initialize an empty list to store the aggregated indices
aggregated_indices = []

# Iterate over each group
for group_name, group_df in grouped_df:
# Sort the group DataFrame by the 'near_duplicate_score' column in ascending order
sorted_group_df = group_df.sort_values(
by=f"{issue_name}_score", ascending=asc
).reset_index(drop=True)

# Extract every other index and append to the aggregated indices list
selected_indices = sorted_group_df.loc[::2, "cleanlab_row_ID"]
aggregated_indices.extend(selected_indices)

return aggregated_indices
else:
# Construct the boolean column name with 'is_' prefix and 'label_issue_score' suffix
score_col_name = f"{issue_name}_score"

# Sort the filtered DataFrame by the constructed boolean column in descending order
sorted_df = filtered_df.sort_values(by=score_col_name, ascending=asc)

# Extract the top specified number of rows and return the 'cleanlab_row_ID' column
top_rows_ids = sorted_df["cleanlab_row_ID"].head(num_rows)

return top_rows_ids


def _update_label_based_on_confidence(row, conf_threshold): # Studio team port to backend
"""Update the label and is_issue based on confidence threshold if there is a label issue.

Args:
row (pd.Series): The row containing label information.
conf_threshold (float): The confidence threshold for updating the label.

Returns:
pd.Series: The updated row.
"""
if row["is_label_issue"] and row["suggested_label_confidence_score"] > conf_threshold:
row[
"is_issue"
] = False # make sure this does not affect back end. We are doing this to avoid dropping these datapoints in autofix later, they should be relabeled
row["label"] = row["suggested_label"]
return row


def apply_autofixed_cleanset_to_new_dataframe(  # Studio team port to backend
    original_df: pd.DataFrame, cleanset_df: pd.DataFrame, parameters: dict
) -> pd.DataFrame:
    """Apply a cleanset to update original dataset labels and remove top rows based on specified parameters.

    Args:
        original_df (pd.DataFrame): The original dataset (indexed 0..n-1 to match
            ``cleanlab_row_ID`` in the cleanset).
        cleanset_df (pd.DataFrame): Cleanlab columns for the dataset, including
            ``cleanlab_row_ID`` and per-issue flag/score columns.
        parameters (dict): ``drop_<issue>`` row counts and, optionally,
            ``relabel_confidence_threshold``.

    Returns:
        pd.DataFrame: A new dataframe restricted to the original columns, with
        confident relabels applied and issue rows dropped.
    """
    # A pandas copy is sufficient here (the original used copy.deepcopy);
    # the merge below produces a new frame and never mutates original_df.
    original_df_copy = original_df.copy()
    original_columns = original_df_copy.columns
    merged_df = pd.merge(original_df_copy, cleanset_df, left_index=True, right_on="cleanlab_row_ID")

    # Bug fix: strategies such as "drop_all_issues" produce no
    # "relabel_confidence_threshold" key; previously this raised KeyError.
    # Skip the relabelling pass entirely when the threshold is absent.
    conf_threshold = parameters.get("relabel_confidence_threshold")
    if conf_threshold is not None:
        merged_df = merged_df.apply(
            lambda row: _update_label_based_on_confidence(row, conf_threshold=conf_threshold),
            axis=1,
        )

    indices_to_drop = set()
    for param_name, top_num in parameters.items():
        if param_name.startswith("drop_"):
            issue_name = param_name.replace("drop_", "")
            # asc=False: drop the highest-scoring (worst) rows for this issue.
            top_percent_ids = _get_top_fraction_ids(merged_df, issue_name, top_num, asc=False)
            indices_to_drop.update(top_percent_ids)

    merged_df = merged_df.drop(list(indices_to_drop), axis=0).reset_index(drop=True)
    return merged_df[original_columns]
50 changes: 43 additions & 7 deletions cleanlab_studio/studio/studio.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,28 @@
"""
Python API for Cleanlab Studio.
"""
from typing import Any, List, Literal, Optional, Union
import warnings
from typing import Any, List, Literal, Optional, Union, Dict

import numpy as np
import numpy.typing as npt
import pandas as pd

from . import inference
from . import trustworthy_language_model
from cleanlab_studio.errors import CleansetError
from cleanlab_studio.internal import clean_helpers, upload_helpers
from cleanlab_studio.internal.api import api
from cleanlab_studio.internal.settings import CleanlabSettings
from cleanlab_studio.internal.types import FieldSchemaDict
from cleanlab_studio.internal.util import (
init_dataset_source,
apply_autofixed_cleanset_to_new_dataframe,
_get_autofix_default_thresholds,
check_none,
check_not_none,
get_autofix_defaults,
init_dataset_source,
)
from cleanlab_studio.internal.settings import CleanlabSettings
from cleanlab_studio.internal.types import FieldSchemaDict

from . import inference, trustworthy_language_model

_pyspark_exists = api.pyspark_exists
if _pyspark_exists:
Expand Down Expand Up @@ -131,7 +134,7 @@ def apply_corrections(self, cleanset_id: str, dataset: Any, keep_excluded: bool
label_column = api.get_label_column_of_project(self._api_key, project_id)
id_col = api.get_id_column(self._api_key, cleanset_id)
if _pyspark_exists and isinstance(dataset, pyspark.sql.DataFrame):
from pyspark.sql.functions import row_number, monotonically_increasing_id, when, col
from pyspark.sql.functions import col, monotonically_increasing_id, row_number, when
from pyspark.sql.window import Window

cl_cols = self.download_cleanlab_columns(
Expand Down Expand Up @@ -383,3 +386,36 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None)

except (TimeoutError, CleansetError):
return False

def autofix_dataset(
    self,
    original_df: pd.DataFrame,
    cleanset_id: str,
    params: Optional[Dict[str, Union[int, float]]] = None,
    strategy: str = "optimized_training_data",
) -> pd.DataFrame:
    """
    This method returns the auto-fixed dataset.

    The dataset must be a pandas DataFrame (text or tabular datasets only).

    Args:
        original_df (pd.DataFrame): The original dataset the cleanset was computed for.
        cleanset_id (str): ID of cleanset.
        params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and
            number of rows to drop for each issue type. If not provided, default values will be used.

            Example:
                {
                    'drop_ambiguous': 9,
                    'drop_label_issue': 92,
                    'drop_near_duplicate': 1,
                    'drop_outlier': 3,
                    'relabel_confidence_threshold': 0.95
                }
        strategy (str): Name of the default-parameter strategy passed straight through to
            the defaults helper: "optimized_training_data", "drop_all_issues", or
            "suggested_actions". Ignored when ``params`` is provided.

    Returns:
        pd.DataFrame: A new dataframe after applying auto-fixes to the cleanset.

    """
    cleanset_df = self.download_cleanlab_columns(cleanset_id)
    if params is None:
        params = get_autofix_defaults(cleanset_df, strategy)
    print("Using autofix values:", params)
    return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, params)
153 changes: 153 additions & 0 deletions tests/test_autofix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import pandas as pd
import pytest
from cleanlab_studio.internal.util import (
get_autofix_defaults,
_update_label_based_on_confidence,
_get_top_fraction_ids,
)
import numpy as np


class TestAutofix:
    """Unit tests for the autofix helpers in cleanlab_studio.internal.util."""

    @pytest.mark.parametrize(
        "strategy, expected_results",
        [
            (
                "optimized_training_data",
                {
                    "drop_ambiguous": 0,
                    "drop_label_issue": 2,
                    "drop_near_duplicate": 2,
                    "drop_outlier": 3,
                    "relabel_confidence_threshold": 0.95,
                },
            ),
            (
                "drop_all_issues",
                {
                    "drop_ambiguous": 10,
                    "drop_label_issue": 3,
                    "drop_near_duplicate": 6,
                    "drop_outlier": 6,
                },
            ),
            (
                "suggested_actions",
                {
                    "drop_near_duplicate": 6,
                    "drop_outlier": 6,
                    "relabel_confidence_threshold": 0.0,
                },
            ),
        ],
        ids=["optimized_training_data", "drop_all_issues", "suggested_actions"],
    )
    def test_get_autofix_defaults(self, strategy, expected_results):
        """Drop counts are derived from each strategy's fractions times the issue counts."""
        cleanlab_columns = pd.DataFrame()
        # Fixture: 3 label issues, 6 near duplicates, 6 outliers, 10 ambiguous rows.
        cleanlab_columns["is_label_issue"] = [True] * 3 + [False] * 7
        cleanlab_columns["is_near_duplicate"] = [True] * 6 + [False] * 4
        cleanlab_columns["is_outlier"] = [True] * 6 + [False] * 4
        cleanlab_columns["is_ambiguous"] = [True] * 10

        params = get_autofix_defaults(cleanlab_columns, strategy)
        assert params == expected_results

    @pytest.mark.parametrize(
        "row, expected_updated_row",
        [
            (
                # Confidence above threshold: relabel and clear is_issue.
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.6,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.6,
                    "label": "label_1",
                    "suggested_label": "label_1",
                    "is_issue": False,
                },
            ),
            (
                # Confidence exactly at threshold: row is left unchanged
                # (the comparison is strictly greater-than).
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.5,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.5,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
            ),
            (
                # Confidence below threshold: row is left unchanged.
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.4,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
                {
                    "is_label_issue": True,
                    "suggested_label_confidence_score": 0.4,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
            ),
            (
                # Not a label issue: row is left unchanged regardless of confidence.
                {
                    "is_label_issue": False,
                    "suggested_label_confidence_score": 0.4,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
                {
                    "is_label_issue": False,
                    "suggested_label_confidence_score": 0.4,
                    "label": "label_0",
                    "suggested_label": "label_1",
                    "is_issue": True,
                },
            ),
        ],
        ids=[
            "is a label issue with confidence score greater than threshold",
            "is a label issue with confidence score equal to threshold",
            "is a label issue with confidence score less than threshold",
            "is not a label issue",
        ],
    )
    def test_update_label_based_on_confidence(self, row, expected_updated_row):
        """Rows are relabeled only when flagged AND confidence strictly exceeds the threshold."""
        conf_threshold = 0.5
        updated_row = _update_label_based_on_confidence(row, conf_threshold)
        assert updated_row == expected_updated_row

    def test_get_top_fraction_ids(self):
        """For a generic issue, the lowest-scoring flagged rows (asc=True default) are returned."""
        cleanlab_columns = pd.DataFrame()

        cleanlab_columns["cleanlab_row_ID"] = np.arange(10)
        cleanlab_columns["is_dummy"] = [False] * 5 + [True] * 5
        cleanlab_columns["dummy_score"] = np.arange(10) * 0.1
        top_ids = _get_top_fraction_ids(cleanlab_columns, "dummy", 3)
        assert set(top_ids) == set([5, 6, 7])

    def test_get_top_fraction_ids_near_duplicate(self):
        """Near duplicates keep one representative per cluster (every other row is selected)."""
        cleanlab_columns = pd.DataFrame()

        cleanlab_columns["cleanlab_row_ID"] = np.arange(12)
        cleanlab_columns["is_near_duplicate"] = [False] * 6 + [True] * 6
        cleanlab_columns["near_duplicate_score"] = np.arange(12) * 0.1
        cleanlab_columns["near_duplicate_cluster_id"] = [None] * 6 + [0, 0, 1, 1, 1, 1]

        top_ids = _get_top_fraction_ids(cleanlab_columns, "near_duplicate", 5)
        assert set(top_ids) == set([6, 8, 10])
Loading