Skip to content

Commit

Permalink
Simplifying the ROC AUC scores; if we encounter issues then just give …
Browse files Browse the repository at this point in the history
…a warning/info and return 0s
  • Loading branch information
brifordwylie committed Oct 17, 2024
1 parent 220347f commit 39fc1ba
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 22 deletions.
36 changes: 14 additions & 22 deletions src/sageworks/core/artifacts/endpoint_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
precision_recall_fscore_support,
root_mean_squared_error,
)
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder

except ImportError as e:
# Initialize the logger
Expand Down Expand Up @@ -735,30 +735,22 @@ def classification_metrics(self, target_column: str, prediction_df: pd.DataFrame
zero_division=0,
)

# Identify the probability columns and convert them to a 2D array
# Identify the probability columns and keep them as a Pandas DataFrame
proba_columns = [f"{label}_proba" for label in class_labels]
y_score = prediction_df[proba_columns].to_numpy()
y_score = prediction_df[proba_columns]

# One-hot encode the true labels using all class labels (fit with class_labels)
lb = LabelBinarizer()
lb.fit(class_labels)
y_true = lb.transform(prediction_df[target_column])

# Initialize list for ROC AUC scores
roc_auc = []

# Calculate ROC AUC for each class, handling cases where only one class is present
for i, label in enumerate(class_labels):
y_true_class = y_true[:, i] # True labels for the current class
y_score_class = y_score[:, i] # Predicted probabilities for the current class

# Check if both positive and negative examples exist in y_true for the current class
if len(np.unique(y_true_class)) < 2: # Only one class present (all 0s or all 1s)
self.log.warning(f"Skipping ROC AUC calculation for class {label} (only one class present in y_true).")
roc_auc.append(0.0) # Assign 0.0 if only one class is present
else:
auc = roc_auc_score(y_true_class, y_score_class) # Calculate ROC AUC for this class
roc_auc.append(auc)
encoder = OneHotEncoder(categories=[class_labels], sparse_output=False)
y_true = encoder.fit_transform(prediction_df[[target_column]])

# Calculate ROC AUC for the multiclass case using 'ovr' (one-vs-rest) strategy
try:
roc_auc = roc_auc_score(y_true, y_score, multi_class="ovr", average="macro")
except ValueError as e:
present_classes = prediction_df[target_column].unique().tolist()
self.log.warning(f"ROC AUC calculation is missing classes. Predictions only have {present_classes}")
self.log.warning(f"{str(e)}")
roc_auc = 0.0

# Put the scores into a DataFrame
score_df = pd.DataFrame(
Expand Down
2 changes: 2 additions & 0 deletions tests/artifacts/endpoint_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def test_classification_inference_with_subset_of_labels():


def test_classification_roc_auc():

# Compute performance metrics for our test predictions
eval_data_df = fs_evaluation_data(class_endpoint)[:50]
pred_df = class_endpoint.inference(eval_data_df)

Expand Down

0 comments on commit 39fc1ba

Please sign in to comment.