Commit
experimenting with quartile regressor functionality
brifordwylie committed Jun 26, 2024
1 parent 2cae10f commit 7e72a7f
Showing 3 changed files with 269 additions and 173 deletions.
178 changes: 123 additions & 55 deletions experiments/quantile_regression_3.py
@@ -1,53 +1,34 @@
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

# Generate synthetic data with even spacing from -10 to 5 and sparse spacing from 5 to 10
x_even = np.linspace(-10, 5, 1600) # Evenly spaced from -10 to 5
x_sparse = 5 + (np.linspace(0, 1, 200) ** 2) * 5 # Increasingly sparse from 5 to 10
x = np.concatenate([x_even, x_sparse])
# SageWorks Imports
from sageworks.utils.test_data_generator import TestDataGenerator

# Ensure no values are exactly zero or negative in the input to the log function
epsilon = 1e-6 # Small value to avoid log(0)
x_adjusted = np.where(x >= 0, x + 1 + epsilon, -x + 1 + epsilon)
# Get synthetic data
data_generator = TestDataGenerator()
df = data_generator.confidence_data()

# Generate non-linear 'S' shape y values
np.random.seed(42)
# y = np.where(x >= 0, np.log10(x_adjusted), -np.log10(x_adjusted)) + np.random.normal(scale=0.02 * np.abs(x), size=x.shape)
y = np.where(x >= 0, np.log(x_adjusted) / np.log(100), -np.log(x_adjusted) / np.log(100)) + np.random.normal(
    scale=0.02 * np.abs(x), size=x.shape
)
feature_list = ["feature_1"]
feature = feature_list[0]
target = "target"
X_train = df[feature_list]
y_train = df[target]

# Add pairs of coincident points in X to test IQR functionality
for i in range(3):
    x_coincident = np.array([-0.5, -0.5, 0, 0, 0.5, 0.5])

    # Increasing deltas for the y values
    y_delta = 0.1 + 0.05 * i
    y_offsets = [-0.1, 0, 0.1]

    # Now create pairs of y values (-delta + offset and +delta + offset) for each x value
    y_coincident = np.concatenate([[-y_delta + y_offset, y_delta + y_offset] for y_offset in y_offsets])

    # Add the coincident points to the data
    x = np.concatenate([x, x_coincident])
    y = np.concatenate([y, y_coincident])
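# Editor's sketch (hypothetical walk-through, not in the original commit):
# for i=0, y_delta is 0.1, so the six appended y values are
# [-0.2, 0.0, -0.1, 0.1, 0.0, 0.2] -- three +/- pairs around each offset,
# stacked on the repeated x positions -0.5, 0, and 0.5.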

"""
x_coincident = np.array([-0.5, -0.5, 0, 0, 0.5, 0.5])
y_coincident = np.array([-.4, .1, -.25, .25, -.1, .4])
x = np.concatenate([x, x_coincident])
y = np.concatenate([y, y_coincident])
"""

# Reshape the data
X_train = x.reshape(-1, 1)
y_train = y

# Sort X for plotting (also sorts y)
sort_idx = np.argsort(X_train[:, 0])
X_train = X_train[sort_idx]
y_train = y_train[sort_idx]
# Get real data
if False:
    df = data_generator.aqsol_data()
    feature_list = data_generator.aqsol_features()
    # feature = "mollogp"
    # feature = "tpsa"
    # feature = "numhacceptors"
    # feature_list = ["mollogp", "tpsa", "numhacceptors"]
    feature = "mollogp"  # feature_list[0]
    target = data_generator.aqsol_target()
    X_train = df[feature_list]
    y_train = df[target]
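# Editor's note (not part of the original commit): the `if False:` guard keeps
# this real-data AQSol branch disabled; flip it to True to swap the synthetic
# confidence_data() set for the AQSol features and solubility target.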


# Function to train and predict using the prediction model
@@ -70,7 +51,7 @@ def train_quantile_models(X, y, quantiles, n_estimators=100):


# Calculate confidence based on the quantile predictions
def calculate_confidence(y, quantile_models, X):
def domain_specific_confidence(quantile_models, X, predictions):
    lower_05 = quantile_models[0.05].predict(X)
    lower_25 = quantile_models[0.25].predict(X)
    quant_50 = quantile_models[0.50].predict(X)
@@ -84,17 +65,75 @@ def calculate_confidence(y, quantile_models, X):
    # If the interval width is greater than target_sensitivity we have 0 confidence;
    # anything below that is a linear scale from 0 to 1
    confidence_interval = upper_95 - lower_05
    q_conf = np.clip((1 - confidence_interval) / (target_sensitivity * 4.0), 0, 1)
    q_conf = 1 - (confidence_interval / target_sensitivity)
    print(f"q_conf: {np.min(q_conf):.2f} {np.max(q_conf):.2f}")
    q_conf_clip = np.clip(q_conf, 0, 1)

    # Now let's look at the IQR distance for each observation
    epsilon_iqr = target_sensitivity * 0.5
    iqr = np.maximum(epsilon_iqr, np.abs(upper_75 - lower_25))
    iqr_distance = np.abs(y - quant_50) / iqr
    iqr_conf = np.clip(1 - iqr_distance, 0, 1)
    iqr_distance = np.abs(predictions - quant_50) / iqr
    print(f"iqr_distance: {np.min(iqr_distance):.2f} {np.max(iqr_distance):.2f}")
    iqr_conf = 1 - iqr_distance
    print(f"iqr_conf: {np.min(iqr_conf):.2f} {np.max(iqr_conf):.2f}")
    iqr_conf_clip = np.clip(iqr_conf, 0, 1)

    # Now combine the two confidence values
    confidence = (q_conf + iqr_conf) / 2
    return confidence
    confidence = (q_conf_clip + iqr_conf_clip) / 2
    return confidence, confidence_interval, iqr_distance
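# Worked example (editor's sketch, hypothetical numbers): with
# target_sensitivity = 0.25, a point whose 5%-95% band spans 0.20 gets
# q_conf = 1 - 0.20 / 0.25 = 0.2; if it also sits 0.05 from the median with
# iqr at the 0.125 floor, iqr_distance = 0.05 / 0.125 = 0.4 and iqr_conf = 0.6,
# so the combined confidence is (0.2 + 0.6) / 2 = 0.4.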


def domain_specific_confidence_2(quantile_models, X, predictions):
    lower_05 = quantile_models[0.05].predict(X)
    lower_25 = quantile_models[0.25].predict(X)
    median_pred = quantile_models[0.50].predict(X)
    upper_75 = quantile_models[0.75].predict(X)
    upper_95 = quantile_models[0.95].predict(X)

    # Target sensitivity
    target_sensitivity = 0.25

    # Ensure IQR is positive
    epsilon = 1e-6
    iqr = np.maximum(epsilon, np.abs(upper_75 - lower_25))

    # Calculate confidence scores
    iqr_conf = np.clip(1 - (iqr / target_sensitivity), 0, 1)
    confidence = iqr_conf
    return confidence, iqr_conf, median_pred
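# Worked example (editor's sketch, hypothetical numbers): this variant scores
# quantile spread alone -- a tight band with iqr = 0.05 gives
# iqr_conf = 1 - 0.05 / 0.25 = 0.8, while any iqr >= 0.25 clips to 0;
# unlike the first version, distance from the median no longer matters.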


# Fit the KNN model
def fit_knn_model(X, y, n_neighbors=5):
    knn = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn.fit(X, y)
    return knn


# Confidence method using the KNN model
def knn_confidence(knn, X_new, predictions=None, tolerance=0.1):
    # Get the neighbors' target values
    neighbors = knn.kneighbors(X_new, return_distance=False)
    neighbors_targets = np.array([knn._y[indices] for indices in neighbors])

    # Calculate the variance of the neighbors' target values
    variance = np.var(neighbors_targets, axis=1)

    # Confidence score can be inversely related to the variance
    confidence_scores = np.clip(1 - (variance / np.max(variance)), 0, 1)

    if predictions is not None:
        # Get KNN predictions for the new data
        knn_predictions = knn.predict(X_new)

        # Calculate the difference between provided predictions and KNN predictions
        prediction_diff = np.abs(predictions - knn_predictions)

        # Adjust confidence scores based on the prediction difference
        adjusted_confidence = np.clip(1 - (prediction_diff / tolerance), 0, 1)
        confidence_scores = np.minimum(confidence_scores, adjusted_confidence)

    return confidence_scores
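# Usage sketch (editor's addition, assuming a fitted KNN and 1-D targets):
#   knn = fit_knn_model(X_train, y_train)
#   scores = knn_confidence(knn, X_train, predictions=rmse_predictions)
# Low neighbor variance pushes scores toward 1; a prediction that disagrees
# with the local KNN estimate by >= tolerance is dragged to 0 by np.minimum.
# Note: knn._y is a private sklearn attribute and may change across versions.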


# Train models
@@ -106,18 +145,47 @@ def calculate_confidence(y, quantile_models, X):
rmse_predictions = prediction_model.predict(X_train)

# Calculate confidence for the array of predictions
confidence_values = calculate_confidence(y_train, quantile_models, X_train)
conf, ds_iqr_conf, ds_median = domain_specific_confidence_2(quantile_models, X_train, y_train)

# Now we're going to use the KNN model to calculate confidence
knn = fit_knn_model(X_train, y_train)
knn_conf = knn_confidence(knn, X_train, rmse_predictions)

# Total confidence is the average of the two confidence scores
total_confidence = (conf + knn_conf) / 2
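# e.g. (editor's illustration, hypothetical values): conf = 0.9 and
# knn_conf = 0.5 average to a total confidence of 0.7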

# Compute model metrics for RMSE
rmse = np.sqrt(np.mean((y_train - rmse_predictions) ** 2))
print(f"RMSE: {rmse} support: {len(X_train)}")

# Domain Specific Confidence Threshold
conf_thres = 0.5

# Now filter the data based on confidence and give RMSE for the filtered data
rmse_predictions_filtered = rmse_predictions[conf > conf_thres]
y_train_filtered = y_train[conf > conf_thres]
rmse_filtered = np.sqrt(np.mean((y_train_filtered - rmse_predictions_filtered) ** 2))
print(f"RMSE Filtered: {rmse_filtered} support: {len(rmse_predictions_filtered)}")


# Plot the results
plt.figure(figsize=(10, 6))
sc = plt.scatter(X_train, y_train, c=confidence_values, cmap="coolwarm_r", label="Data", alpha=0.5)
# sc = plt.scatter(X_train, y_train, cmap='coolwarm', label='Data', alpha=0.5)
actual_values = y_train
actual_values = df["feature_1"]
sc = plt.scatter(actual_values, y_train, c=knn_conf, cmap="coolwarm", label="Data", alpha=0.5)
plt.colorbar(sc, label="Confidence")

# Sort the actual values and the corresponding y-values for each quantile
sorted_indices = np.argsort(actual_values)
sorted_actual_values = actual_values[sorted_indices]
for q in quantiles:
    plt.plot(X_train, quantile_models[q].predict(X_train), label=f"Quantile {int(q * 100)}")
plt.plot(X_train, rmse_predictions, label="RMSE Prediction", color="black", linestyle="--")
plt.xlabel("X")
plt.ylabel("Y")
    sorted_y_values = quantile_models[q].predict(X_train)[sorted_indices]
    plt.plot(sorted_actual_values, sorted_y_values, label=f"Quantile {int(q * 100):02}")

# Plot the RMSE predictions
plt.plot(sorted_actual_values, rmse_predictions[sorted_indices], label="RMSE Prediction", color="black", linestyle="--")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Quantile Regression and Confidence with XGBoost")
plt.legend()
plt.show()