Skip to content

Commit

Permalink
cleaning up the quantile regression scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
brifordwylie committed Jul 3, 2024
1 parent dd11cc9 commit 276981f
Show file tree
Hide file tree
Showing 3 changed files with 355 additions and 292 deletions.
166 changes: 140 additions & 26 deletions experiments/quantile_regression.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,147 @@
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

# Generate synthetic training data
np.random.seed(42)
X_train = np.random.rand(100, 2) * 1000 # Features: [size, number of bedrooms]
y_train = X_train[:, 0] * 300 + X_train[:, 1] * 50000 + np.random.randn(100) * 50000 # House prices
# SageWorks Imports
from sageworks.utils.test_data_generator import TestDataGenerator

# Train models for the 10th, 50th, and 90th percentiles
quantiles = [0.1, 0.50, 0.9]
models = {}
for q in quantiles:
params = {"objective": "reg:quantileerror", "eval_metric": "mae", "quantile_alpha": q}
model = xgb.XGBRegressor(**params)
model.fit(X_train, y_train)
models[q] = model
# Get synthetic data
data_generator = TestDataGenerator()
df = data_generator.confidence_data()

# Example test data
X_test = np.array([[2250, 4]]) # Features: [size, number of bedrooms]
feature_list = ["feature_1"]
feature = feature_list[0]
target = "target"
X_train = df[feature_list]
y_train = df[target]

# Predict the 10th, 50th, and 90th percentiles
preds = {}
for q in quantiles:
preds[q] = models[q].predict(X_test)

# Output predictions
preds_10 = preds[0.1] # Lower bound
preds_50 = preds[0.50] # Median
preds_90 = preds[0.9] # Upper bound
# Get real data
if False:
df = data_generator.aqsol_data()
feature_list = data_generator.aqsol_features()
# feature = "mollogp"
# feature = "tpsa"
# feature = "numhacceptors"
# feature_list = ["mollogp", "tpsa", "numhacceptors"]
feature = "mollogp" # feature_list[0]
target = data_generator.aqsol_target()
X_train = df[feature_list]
y_train = df[target]


# Function to train and predict using the prediction model
def train_prediction_model(X, y):
model = XGBRegressor(objective="reg:squarederror")
model.fit(X, y)
return model


# Function to train quantile models and store predictions
def train_quantile_models(X, y, quantiles, n_estimators=100):
quantile_models = {}
for quantile in quantiles:
model = XGBRegressor(
objective="reg:quantileerror", quantile_alpha=quantile, n_estimators=n_estimators, max_depth=1
)
model.fit(X, y)
quantile_models[quantile] = model
return quantile_models


def normalize(data):
data = np.array(data)
normalized_data = (data - data.min()) / (data.max() - data.min())
return normalized_data


# Calculate confidence based on the quantile predictions
def calculate_confidence(quantile_models, X, y):
lower_05 = quantile_models[0.05].predict(X)
lower_25 = quantile_models[0.25].predict(X)
quant_50 = quantile_models[0.50].predict(X)
upper_75 = quantile_models[0.75].predict(X)
upper_95 = quantile_models[0.95].predict(X)

# Domain specific logic for calculating confidence
# If the interval with is greater than target_sensitivity with have 0 confidence
# anything below that is a linear scale from 0 to 1
confidence_interval = upper_95 - lower_05
q_conf = 1 - confidence_interval
print(f"q_conf: {np.min(q_conf):.2f} {np.max(q_conf):.2f}")

# Normalize the confidence to be between 0 and 1
q_conf = normalize(q_conf)
print(f"q_conf: {np.min(q_conf):.2f} {np.max(q_conf):.2f}")

# Now lets look at the IQR distance for each observation
target_sensitivity = 1.0
epsilon_iqr = target_sensitivity * 0.5
iqr = np.maximum(epsilon_iqr, np.abs(upper_75 - lower_25))
iqr_distance = np.abs(y - quant_50) / iqr
iqr_conf = 1 - iqr_distance
print(f"iqr_conf: {np.min(iqr_conf):.2f} {np.max(iqr_conf):.2f}")

# Normalize the confidence to be between 0 and 1
iqr_conf = normalize(iqr_conf)
print(f"q_conf: {np.min(iqr_conf):.2f} {np.max(iqr_conf):.2f}")

# Now combine the two confidence values
confidence = (q_conf + iqr_conf) / 2
return confidence


# Train models
prediction_model = train_prediction_model(X_train, y_train)
quantiles = [0.05, 0.25, 0.50, 0.75, 0.95]
quantile_models = train_quantile_models(X_train, y_train, quantiles)

# Predictions
rmse_predictions = prediction_model.predict(X_train)

# Calculate confidence for the array of predictions
confidence_values = calculate_confidence(quantile_models, X_train, y_train)

# Compute model metrics for RMSE
rmse = np.sqrt(np.mean((y_train - rmse_predictions) ** 2))
print(f"RMSE: {rmse} support: {len(X_train)}")

# Confidence Threshold
conf_thres = 0.8

# Now filter the data based on confidence and give RMSE for the filtered data
rmse_predictions_filtered = rmse_predictions[confidence_values > conf_thres]
y_train_filtered = y_train[confidence_values > conf_thres]
rmse_filtered = np.sqrt(np.mean((y_train_filtered - rmse_predictions_filtered) ** 2))
print(f"RMSE Filtered: {rmse_filtered} support: {len(rmse_predictions_filtered)}")


# Plot the results
plt.figure(figsize=(10, 6))
x_values = df[feature]
# x_values = df["feature_1"]
# x_values = rmse_predictions

"""
# Sort both the x and y values (for plotting purposes)
sort_order = np.argsort(x_values)
x_values = x_values[sort_order]
y_train = y_train[sort_order]
"""
sc = plt.scatter(x_values, y_train, c=confidence_values, cmap="coolwarm", label="Data", alpha=0.5)
plt.colorbar(sc, label="Confidence")

# Sort x_values and the corresponding y-values for each quantile
sorted_indices = np.argsort(x_values)
sorted_x_values = x_values[sorted_indices]
for q in quantiles:
sorted_y_values = quantile_models[q].predict(X_train)[sorted_indices]
plt.plot(sorted_x_values, sorted_y_values, label=f"Quantile {int(q * 100):02}")

print(f"10th Percentile: ${preds_10[0]:,.0f}")
print(f"50th Percentile (Median): ${preds_50[0]:,.0f}")
print(f"90th Percentile: ${preds_90[0]:,.0f}")
# Plot the RMSE predictions
plt.plot(sorted_x_values, rmse_predictions[sorted_indices], label="RMSE Prediction", color="black", linestyle="--")
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Quantile Regression and Confidence with XGBoost")
plt.legend()
plt.show()
Loading

0 comments on commit 276981f

Please sign in to comment.