Commit
experimenting with quartile regressor functionality
brifordwylie committed Jun 26, 2024
1 parent 2cae10f commit 7e72a7f
Showing 3 changed files with 269 additions and 173 deletions.
178 changes: 123 additions & 55 deletions experiments/quantile_regression_3.py
@@ -1,53 +1,34 @@
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

# Generate synthetic data with even spacing from -10 to 5 and sparse spacing from 5 to 10
x_even = np.linspace(-10, 5, 1600) # Evenly spaced from -10 to 5
x_sparse = 5 + (np.linspace(0, 1, 200) ** 2) * 5 # Increasingly sparse from 5 to 10
x = np.concatenate([x_even, x_sparse])
# SageWorks Imports
from sageworks.utils.test_data_generator import TestDataGenerator

# Ensure no values are exactly zero or negative in the input to the log function
epsilon = 1e-6 # Small value to avoid log(0)
x_adjusted = np.where(x >= 0, x + 1 + epsilon, -x + 1 + epsilon)
# Get synthetic data
data_generator = TestDataGenerator()
df = data_generator.confidence_data()

# Generate non-linear 'S' shape y values
np.random.seed(42)
# y = np.where(x >= 0, np.log10(x_adjusted), -np.log10(x_adjusted)) + np.random.normal(scale=0.02 * np.abs(x), size=x.shape)
y = np.where(x >= 0, np.log(x_adjusted) / np.log(100), -np.log(x_adjusted) / np.log(100)) + np.random.normal(
    scale=0.02 * np.abs(x), size=x.shape
)
feature_list = ["feature_1"]
feature = feature_list[0]
target = "target"
X_train = df[feature_list]
y_train = df[target]

# Add pairs of coincident points in X to test IQR functionality
for i in range(3):
    x_coincident = np.array([-0.5, -0.5, 0, 0, 0.5, 0.5])

    # Increasing deltas for the y values
    y_delta = 0.1 + 0.05 * i
    y_offsets = [-0.1, 0, 0.1]

    # Now create pairs of y values (-delta + offset and +delta + offset) for each x value
    y_coincident = np.concatenate([[-y_delta + y_offset, y_delta + y_offset] for y_offset in y_offsets])

    # Add the coincident points to the data
    x = np.concatenate([x, x_coincident])
    y = np.concatenate([y, y_coincident])
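# Editor's sketch (hypothetical walk-through, not in the original commit):
# for i=0, y_delta is 0.1, so the six appended y values are
# [-0.2, 0.0, -0.1, 0.1, 0.0, 0.2] -- three +/- pairs around each offset,
# stacked on the repeated x positions -0.5, 0, and 0.5.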

"""
x_coincident = np.array([-0.5, -0.5, 0, 0, 0.5, 0.5])
y_coincident = np.array([-.4, .1, -.25, .25, -.1, .4])
x = np.concatenate([x, x_coincident])
y = np.concatenate([y, y_coincident])
"""

# Reshape the data
X_train = x.reshape(-1, 1)
y_train = y

# Sort X for plotting (also sorts y)
sort_idx = np.argsort(X_train[:, 0])
X_train = X_train[sort_idx]
y_train = y_train[sort_idx]
# Get real data
if False:
    df = data_generator.aqsol_data()
    feature_list = data_generator.aqsol_features()
    # feature = "mollogp"
    # feature = "tpsa"
    # feature = "numhacceptors"
    # feature_list = ["mollogp", "tpsa", "numhacceptors"]
    feature = "mollogp"  # feature_list[0]
    target = data_generator.aqsol_target()
    X_train = df[feature_list]
    y_train = df[target]
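# Editor's note (not part of the original commit): the `if False:` guard keeps
# this real-data AQSol branch disabled; flip it to True to swap the synthetic
# confidence_data() set for the AQSol features and solubility target.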


# Function to train and predict using the prediction model
@@ -70,7 +51,7 @@ def train_quantile_models(X, y, quantiles, n_estimators=100):


# Calculate confidence based on the quantile predictions
def calculate_confidence(y, quantile_models, X):
def domain_specific_confidence(quantile_models, X, predictions):
    lower_05 = quantile_models[0.05].predict(X)
    lower_25 = quantile_models[0.25].predict(X)
    quant_50 = quantile_models[0.50].predict(X)
@@ -84,17 +65,75 @@ def calculate_confidence(y, quantile_models, X):
    # If the interval width is greater than target_sensitivity we have 0 confidence;
    # anything below that is a linear scale from 0 to 1
    confidence_interval = upper_95 - lower_05
    q_conf = np.clip((1 - confidence_interval) / (target_sensitivity * 4.0), 0, 1)
    q_conf = 1 - (confidence_interval / target_sensitivity)
    print(f"q_conf: {np.min(q_conf):.2f} {np.max(q_conf):.2f}")
    q_conf_clip = np.clip(q_conf, 0, 1)

    # Now let's look at the IQR distance for each observation
    epsilon_iqr = target_sensitivity * 0.5
    iqr = np.maximum(epsilon_iqr, np.abs(upper_75 - lower_25))
    iqr_distance = np.abs(y - quant_50) / iqr
    iqr_conf = np.clip(1 - iqr_distance, 0, 1)
    iqr_distance = np.abs(predictions - quant_50) / iqr
    print(f"iqr_distance: {np.min(iqr_distance):.2f} {np.max(iqr_distance):.2f}")
    iqr_conf = 1 - iqr_distance
    print(f"iqr_conf: {np.min(iqr_conf):.2f} {np.max(iqr_conf):.2f}")
    iqr_conf_clip = np.clip(iqr_conf, 0, 1)

    # Now combine the two confidence values
    confidence = (q_conf + iqr_conf) / 2
    return confidence
    confidence = (q_conf_clip + iqr_conf_clip) / 2
    return confidence, confidence_interval, iqr_distance
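# Worked example (editor's sketch, hypothetical numbers): with
# target_sensitivity = 0.25, a point whose 5%-95% band spans 0.20 gets
# q_conf = 1 - 0.20 / 0.25 = 0.2; if it also sits 0.05 from the median with
# iqr at the 0.125 floor, iqr_distance = 0.05 / 0.125 = 0.4 and iqr_conf = 0.6,
# so the combined confidence is (0.2 + 0.6) / 2 = 0.4.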


def domain_specific_confidence_2(quantile_models, X, predictions):
    lower_05 = quantile_models[0.05].predict(X)
    lower_25 = quantile_models[0.25].predict(X)
    median_pred = quantile_models[0.50].predict(X)
    upper_75 = quantile_models[0.75].predict(X)
    upper_95 = quantile_models[0.95].predict(X)

    # Target sensitivity
    target_sensitivity = 0.25

    # Ensure IQR is positive
    epsilon = 1e-6
    iqr = np.maximum(epsilon, np.abs(upper_75 - lower_25))

    # Calculate confidence scores
    iqr_conf = np.clip(1 - (iqr / target_sensitivity), 0, 1)
    confidence = iqr_conf
    return confidence, iqr_conf, median_pred
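# Worked example (editor's sketch, hypothetical numbers): this variant scores
# quantile spread alone -- a tight band with iqr = 0.05 gives
# iqr_conf = 1 - 0.05 / 0.25 = 0.8, while any iqr >= 0.25 clips to 0;
# unlike the first version, distance from the median no longer matters.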


# Fit the KNN model
def fit_knn_model(X, y, n_neighbors=5):
    knn = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn.fit(X, y)
    return knn


# Confidence method using the KNN model
def knn_confidence(knn, X_new, predictions=None, tolerance=0.1):
    # Get the neighbors' target values
    neighbors = knn.kneighbors(X_new, return_distance=False)
    neighbors_targets = np.array([knn._y[indices] for indices in neighbors])

    # Calculate the variance of the neighbors' target values
    variance = np.var(neighbors_targets, axis=1)

    # Confidence score can be inversely related to the variance
    confidence_scores = np.clip(1 - (variance / np.max(variance)), 0, 1)

    if predictions is not None:
        # Get KNN predictions for the new data
        knn_predictions = knn.predict(X_new)

        # Calculate the difference between provided predictions and KNN predictions
        prediction_diff = np.abs(predictions - knn_predictions)

        # Adjust confidence scores based on the prediction difference
        adjusted_confidence = np.clip(1 - (prediction_diff / tolerance), 0, 1)
        confidence_scores = np.minimum(confidence_scores, adjusted_confidence)

    return confidence_scores
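# Usage sketch (editor's addition, assuming a fitted KNN and 1-D targets):
#   knn = fit_knn_model(X_train, y_train)
#   scores = knn_confidence(knn, X_train, predictions=rmse_predictions)
# Low neighbor variance pushes scores toward 1; a prediction that disagrees
# with the local KNN estimate by >= tolerance is dragged to 0 by np.minimum.
# Note: knn._y is a private sklearn attribute and may change across versions.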


# Train models
@@ -106,18 +145,47 @@ def calculate_confidence(y, quantile_models, X):
rmse_predictions = prediction_model.predict(X_train)

# Calculate confidence for the array of predictions
confidence_values = calculate_confidence(y_train, quantile_models, X_train)
conf, ds_iqr_conf, ds_median = domain_specific_confidence_2(quantile_models, X_train, y_train)

# Now we're going to use the KNN model to calculate confidence
knn = fit_knn_model(X_train, y_train)
knn_conf = knn_confidence(knn, X_train, rmse_predictions)

# Total confidence is the average of the two confidence scores
total_confidence = (conf + knn_conf) / 2
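# e.g. (editor's illustration, hypothetical values): conf = 0.9 and
# knn_conf = 0.5 average to a total confidence of 0.7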

# Compute model metrics for RMSE
rmse = np.sqrt(np.mean((y_train - rmse_predictions) ** 2))
print(f"RMSE: {rmse} support: {len(X_train)}")

# Domain Specific Confidence Threshold
conf_thres = 0.5

# Now filter the data based on confidence and give RMSE for the filtered data
rmse_predictions_filtered = rmse_predictions[conf > conf_thres]
y_train_filtered = y_train[conf > conf_thres]
rmse_filtered = np.sqrt(np.mean((y_train_filtered - rmse_predictions_filtered) ** 2))
print(f"RMSE Filtered: {rmse_filtered} support: {len(rmse_predictions_filtered)}")


# Plot the results
plt.figure(figsize=(10, 6))
sc = plt.scatter(X_train, y_train, c=confidence_values, cmap="coolwarm_r", label="Data", alpha=0.5)
# sc = plt.scatter(X_train, y_train, cmap='coolwarm', label='Data', alpha=0.5)
actual_values = y_train
actual_values = df["feature_1"]
sc = plt.scatter(actual_values, y_train, c=knn_conf, cmap="coolwarm", label="Data", alpha=0.5)
plt.colorbar(sc, label="Confidence")

# Sort the actual values and the corresponding y-values for each quantile
sorted_indices = np.argsort(actual_values)
sorted_actual_values = actual_values[sorted_indices]
for q in quantiles:
    plt.plot(X_train, quantile_models[q].predict(X_train), label=f"Quantile {int(q * 100)}")
plt.plot(X_train, rmse_predictions, label="RMSE Prediction", color="black", linestyle="--")
plt.xlabel("X")
plt.ylabel("Y")
    sorted_y_values = quantile_models[q].predict(X_train)[sorted_indices]
    plt.plot(sorted_actual_values, sorted_y_values, label=f"Quantile {int(q * 100):02}")

# Plot the RMSE predictions
plt.plot(sorted_actual_values, rmse_predictions[sorted_indices], label="RMSE Prediction", color="black", linestyle="--")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Quantile Regression and Confidence with XGBoost")
plt.legend()
plt.show()