Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test contribution/robustness detector #1908

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
    "[python]": {
        "editor.defaultFormatter": "ms-python.black-formatter",
        "editor.formatOnSave": true
    },
    "isort.args": ["--profile", "black"]
}
9 changes: 8 additions & 1 deletion giskard/scanner/robustness/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
"""
from .base_detector import BaseCategorialPertubationDetector, BaseTextPerturbationDetector
from .ethical_bias_detector import EthicalBiasDetector
from .switch_detector import SwitchAllDetector
from .text_perturbation_detector import TextPerturbationDetector

# NOTE: every name listed here must be imported above, otherwise
# `from giskard.scanner.robustness import *` raises AttributeError.
__all__ = [
    "EthicalBiasDetector",
    "TextPerturbationDetector",
    "BaseTextPerturbationDetector",
    "BaseCategorialPertubationDetector",
    "SwitchAllDetector",
]
197 changes: 196 additions & 1 deletion giskard/scanner/robustness/base_detector.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, Sequence
from typing import Optional, Sequence, Union

from abc import abstractmethod

Expand All @@ -11,6 +11,7 @@
from ..issues import Issue, IssueLevel, Robustness
from ..logger import logger
from ..registry import Detector
from .feature_transformation import CategorialTransformation
from .text_transformations import TextTransformation


Expand Down Expand Up @@ -217,6 +218,200 @@ def _detect_issues(
return issues


class BaseCategorialPertubationDetector(Detector):
    """Base class for metamorphic detectors based on categorial feature.

    Subclasses provide the set of categorical transformations via
    :meth:`_get_default_transformations`; this class runs each transformation
    against every categorical feature and raises a Robustness issue whenever
    the model's predictions change too often on the perturbed samples.
    """

    _issue_group = Robustness
    # @TODO: Research the adapted value for the taxonomy.
    _taxonomy = None

    def __init__(
        self,
        transformations: Optional[Sequence[CategorialTransformation]] = None,
        threshold: Optional[float] = None,
        output_sensitivity: Optional[float] = None,
        num_samples: Optional[int] = None,
    ):
        """
        Create a new instance of the detector
        Parameters
        ----------
        transformations: Optional[Sequence[CategorialTransformation]]
            The categorial transformation used in the metamorphic test. If not provided, a default set of transformation will be used.
        threshold: Optional[float]
            The threshold for the fail rate, which is defined as the proportion of samples for which the model
            prediction has changed. If the fail rate is greater than the threshold, an issue is created.
            If not provided, a default threshold will be used.
        output_sensitivity: Optional[float]
            For regression models, the output sensitivity is the maximum relative change in the prediction that is
            considered acceptable. If the relative change is greater than the output sensitivity, an issue is created.
            This parameter is ignored for classification models. If not provided, a default output sensitivity will be
            used.
        num_samples: Optional[int]
            The maximum number of samples to use for the metamorphic testing. If not provided, a default number of
            samples will be used.
        """
        # None values mean "use the model-dependent default", resolved lazily
        # in _detect_issues via the _get_default_* helpers.
        self.transformations = transformations
        self.threshold = threshold
        self.num_samples = num_samples
        self.output_sensitivity = output_sensitivity

    def run(self, model: BaseModel, dataset: Dataset, features: Sequence[str]) -> Sequence[Issue]:
        """Run every transformation on every categorical feature and collect issues."""
        transformations = self.transformations or self._get_default_transformations(model, dataset)
        # Only analyze categorials features
        cat_features = [f for f in features if dataset.column_types[f] == "category"]
        logger.info(
            f"{self.__class__.__name__}: Running with transformations={[t.name for t in transformations]} "
            f"threshold={self.threshold} output_sensitivity={self.output_sensitivity} num_samples={self.num_samples}"
        )

        issues = []
        for transformation in transformations:
            issues.extend(self._detect_issues(model, dataset, transformation, cat_features))

        # Drop falsy entries so a helper may signal "no issue" with None.
        return [i for i in issues if i]

    @abstractmethod
    def _get_default_transformations(self, model: BaseModel, dataset: Dataset) -> Sequence[CategorialTransformation]:
        # Subclasses return the transformation classes (not instances) to apply;
        # _detect_issues instantiates one per feature.
        ...

    def _detect_issues(
        self,
        model: BaseModel,
        dataset: Dataset,
        transformation: CategorialTransformation,
        features: Sequence[Union[str, int]],
    ) -> Sequence[Issue]:
        """Apply `transformation` to each feature and compare model predictions
        before/after the perturbation, emitting an Issue per failing feature."""
        # Resolve unset parameters to model-dependent defaults.
        num_samples = self.num_samples if self.num_samples is not None else _get_default_num_samples(model)
        output_sensitivity = (
            self.output_sensitivity if self.output_sensitivity is not None else _get_default_output_sensitivity(model)
        )
        threshold = self.threshold if self.threshold is not None else _get_default_threshold(model)

        issues = []
        # @TODO: integrate this with Giskard metamorphic tests already present
        for feature in features:
            # NOTE(review): CategorialTransformation.__init__ takes `cat_column`,
            # not `column` — confirm this keyword matches the transformation's
            # constructor, otherwise this raises TypeError.
            transformation_fn = transformation(column=feature)
            transformed = dataset.transform(transformation_fn)

            # Keep only the rows the transformation actually modified.
            changed_idx = dataset.df.index[transformed.df[feature] != dataset.df[feature]]

            if changed_idx.empty:
                continue

            # Select a random subset of the changed records
            # (fixed seed so scan results are reproducible across runs).
            if len(changed_idx) > num_samples:
                rng = np.random.default_rng(747)
                changed_idx = changed_idx[rng.choice(len(changed_idx), num_samples, replace=False)]

            # validation=False: rows were already validated as part of `dataset`.
            original_data = Dataset(
                dataset.df.loc[changed_idx],
                target=dataset.target,
                column_types=dataset.column_types,
                validation=False,
            )
            perturbed_data = Dataset(
                transformed.df.loc[changed_idx],
                target=dataset.target,
                column_types=dataset.column_types,
                validation=False,
            )

            # Calculate predictions
            original_pred = model.predict(original_data)
            perturbed_pred = model.predict(perturbed_data)

            if model.is_classification:
                # A sample passes if the predicted label did not change.
                passed = original_pred.raw_prediction == perturbed_pred.raw_prediction
            elif model.is_regression:
                # A sample passes if the relative output change stays below the sensitivity.
                rel_delta = _relative_delta(perturbed_pred.raw_prediction, original_pred.raw_prediction)
                passed = np.abs(rel_delta) < output_sensitivity
            elif model.is_text_generation:
                # Text generation: compare outputs with BERTScore; `evaluate` is
                # an optional dependency, surfaced via LLMImportError if absent.
                try:
                    import evaluate
                except ImportError as err:
                    raise LLMImportError() from err

                scorer = evaluate.load("bertscore")
                score = scorer.compute(
                    predictions=perturbed_pred.prediction,
                    references=original_pred.prediction,
                    model_type="distilbert-base-multilingual-cased",
                    idf=True,
                )
                # High F1 similarity between outputs counts as a pass.
                passed = np.array(score["f1"]) > 1 - output_sensitivity
            else:
                raise NotImplementedError("Only classification, regression, or text generation models are supported.")
            pass_rate = passed.mean()
            fail_rate = 1 - pass_rate
            logger.info(
                f"{self.__class__.__name__}: Testing `{feature}` for perturbation `{transformation.name}`\tFail rate: {fail_rate:.3f}"
            )

            if fail_rate >= threshold:
                # Severity
                issue_level = IssueLevel.MAJOR if fail_rate >= 2 * threshold else IssueLevel.MEDIUM

                # Description
                # (placeholders are presumably interpolated by Issue from the
                # `meta` mapping below — confirm against Issue's implementation)
                desc = (
                    "When feature “{feature}” is perturbed with the transformation “{transformation_fn}”, "
                    "the model changes its prediction in {fail_rate_percent}% of the cases. "
                    "We expected the predictions not to be affected by this transformation."
                )

                failed_size = (~passed).sum()
                slice_size = len(passed)

                issue = Issue(
                    model,
                    dataset,
                    group=self._issue_group,
                    level=issue_level,
                    transformation_fn=transformation_fn,
                    description=desc,
                    features=[feature],
                    meta={
                        "feature": feature,
                        "domain": f"Feature `{feature}`",
                        "deviation": f"{failed_size}/{slice_size} tested samples ({round(fail_rate * 100, 2)}%) changed prediction after perturbation",
                        "failed_size": failed_size,
                        "slice_size": slice_size,
                        "fail_rate": fail_rate,
                        "fail_rate_percent": round(fail_rate * 100, 2),
                        "metric": "Fail rate",
                        "metric_value": fail_rate,
                        "threshold": threshold,
                        "output_sentitivity": output_sensitivity,
                        "perturbed_data_slice": perturbed_data,
                        "perturbed_data_slice_predictions": perturbed_pred,
                    },
                    importance=fail_rate,
                    tests=_generate_robustness_tests,
                    taxonomy=self._taxonomy,
                )

                # Add examples: side-by-side original vs. perturbed values and predictions.
                examples = original_data.df.loc[~passed, (feature,)].copy()
                examples[f"{transformation_fn.name}({feature})"] = perturbed_data.df.loc[~passed, feature]

                examples["Original prediction"] = original_pred.prediction[~passed]
                examples["Prediction after perturbation"] = perturbed_pred.prediction[~passed]

                if model.is_classification:
                    # Append the predicted-class probability to each label for readability.
                    examples["Original prediction"] = examples["Original prediction"].astype(str)
                    examples["Prediction after perturbation"] = examples["Prediction after perturbation"].astype(str)
                    ps_before = pd.Series(original_pred.probabilities[~passed], index=examples.index)
                    ps_after = pd.Series(perturbed_pred.probabilities[~passed], index=examples.index)
                    examples["Original prediction"] += ps_before.apply(lambda p: f" (p = {p:.2f})")
                    examples["Prediction after perturbation"] += ps_after.apply(lambda p: f" (p = {p:.2f})")

                issue.add_examples(examples)

                issues.append(issue)

        return issues


def _generate_robustness_tests(issue: Issue):
from ...testing.tests.metamorphic import test_metamorphic_invariance

Expand Down
51 changes: 51 additions & 0 deletions giskard/scanner/robustness/feature_transformation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from typing import List, Optional

import numpy as np
import pandas as pd

# This module lives in giskard/scanner/robustness/, so reaching the top-level
# `giskard.core` and `giskard.registry` packages needs three-dot relatives
# (two dots would resolve to giskard.scanner.core / giskard.scanner.registry,
# which do not contain these names) — matching text_transformations.py.
from ...core.core import DatasetProcessFunctionMeta
from ...registry.registry import get_object_uuid
from ...registry.transformation_function import TransformationFunction


class CategorialTransformation(TransformationFunction):
    """Base class for dataset-level transformations of a single categorical column.

    Subclasses set a class-level ``name`` and implement :meth:`make_perturbation`
    (or override :meth:`execute` entirely) to perturb the values of
    ``cat_column``.
    """

    # Human-readable transformation name, set by subclasses.
    name: str

    def __init__(self, cat_column=None, needs_dataset=False, *, column=None):
        """
        Parameters
        ----------
        cat_column:
            Name of the categorical column to perturb.
        needs_dataset: bool
            Forwarded to TransformationFunction.
        column:
            Backward-compatible keyword alias for ``cat_column`` — the scan
            detectors instantiate transformations with ``column=feature``.
        """
        super().__init__(None, row_level=False, cell_level=False, needs_dataset=needs_dataset)
        # Accept either keyword; `cat_column` takes precedence when both are given.
        self.cat_column = cat_column if cat_column is not None else column
        self.meta = DatasetProcessFunctionMeta(type="TRANSFORMATION")
        self.meta.uuid = get_object_uuid(self)
        self.meta.code = self.name
        self.meta.name = self.name
        self.meta.display_name = self.name
        self.meta.tags = ["pickle", "scan"]
        self.meta.doc = self.meta.default_doc("Automatically generated transformation function")

    def __str__(self):
        return self.name

    def execute(self, data: pd.DataFrame) -> pd.DataFrame:
        """Apply :meth:`make_perturbation` to every value of the target column."""
        feature_data = data[self.cat_column]
        # BUG FIX: was `self.column`, an attribute that is never set
        # (the constructor stores `cat_column`), raising AttributeError.
        data.loc[feature_data.index, self.cat_column] = feature_data.apply(self.make_perturbation)
        return data

    def make_perturbation(self, value):
        """Return the perturbed replacement for a single categorical `value`.

        BUG FIX: the method is invoked per-value through ``Series.apply``, so it
        must accept the value being perturbed (the original declared no
        parameter and would raise TypeError when called).
        """
        raise NotImplementedError()


class CategorialShuffle(CategorialTransformation):
    """Consistently remap the categories of a column.

    Every distinct value of the column is mapped to a (seeded) randomly chosen
    category, and all occurrences are replaced according to that mapping —
    e.g. every 'Labrador' becomes the same other breed.
    """

    name = "Shuffle categorial values"

    def __init__(self, cat_column, rng_seed=1729):
        # BUG FIX: was `super.__init__(cat_column)` (missing call parentheses),
        # which raises TypeError at construction time.
        super().__init__(cat_column)
        self.rng = np.random.default_rng(seed=rng_seed)

    def execute(self, data: pd.DataFrame) -> pd.DataFrame:
        feature_data = data[self.cat_column]
        # First-seen order (not set()) so the seeded RNG is reproducible.
        cat_values = list(dict.fromkeys(feature_data))
        # BUG FIX: the original drew random values into a local list and never
        # wrote anything back, returning `data` unchanged — so the detector
        # could never observe a perturbation. Build a category->category
        # permutation and apply it to the whole column instead.
        perm = self.rng.permutation(len(cat_values))
        mapping = {cat_values[i]: cat_values[j] for i, j in enumerate(perm)}
        data[self.cat_column] = feature_data.map(mapping)
        return data
32 changes: 32 additions & 0 deletions giskard/scanner/robustness/switch_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from typing import Sequence

from giskard.datasets.base import Dataset
from giskard.models.base.model import BaseModel
from giskard.scanner.robustness.feature_transformation import CategorialTransformation

from ..decorators import detector
from ..issues import Robustness
from .base_detector import BaseCategorialPertubationDetector


@detector(
    # BUG FIX: detector name was misspelled "swtich_all" (tags already use
    # "switch_all").
    name="switch_all",
    tags=["switch_all", "robustness", "classification", "regression"],
)
class SwitchAllDetector(BaseCategorialPertubationDetector):
    """Detects whether perturbing a single categorical column of the input data
    can perturb the model.

    By default, we simply perform a shuffle of the data.

    For example, given a breed category with potential values
    ['Labrador', 'Husky', 'Beagle', ...], the idea is to switch every
    'Labrador' value to some other breed, and so on.
    """

    _issue_group = Robustness
    # @TODO: find information related to the taxonomy
    _taxonomy = None

    def _get_default_transformations(self, model: BaseModel, dataset: Dataset) -> Sequence[CategorialTransformation]:
        # Imported locally to avoid a circular import at module load time.
        from .feature_transformation import CategorialShuffle

        # The base detector instantiates the transformation once per feature,
        # so the class itself (not an instance) is returned here.
        return [CategorialShuffle]
Loading