cleaning up the quantile regression scripts
1 parent dd11cc9 · commit 276981f
Showing 3 changed files with 355 additions and 292 deletions.
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

# SageWorks Imports
from sageworks.utils.test_data_generator import TestDataGenerator

# Get synthetic data
data_generator = TestDataGenerator()
df = data_generator.confidence_data()

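# (TestDataGenerator.confidence_data() is assumed to return a pandas DataFrame with
# the "feature_1" and "target" columns referenced just below.)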
feature_list = ["feature_1"]
feature = feature_list[0]
target = "target"
X_train = df[feature_list]
y_train = df[target]

# Get real data (flip False to True to use the AQSol dataset instead)
if False:
    df = data_generator.aqsol_data()
    feature_list = data_generator.aqsol_features()
    # feature = "tpsa"
    # feature = "numhacceptors"
    # feature_list = ["mollogp", "tpsa", "numhacceptors"]
    feature = "mollogp"  # feature_list[0]
    target = data_generator.aqsol_target()
    X_train = df[feature_list]
    y_train = df[target]


# Train the standard point-prediction model (squared error objective)
def train_prediction_model(X, y):
    model = XGBRegressor(objective="reg:squarederror")
    model.fit(X, y)
    return model


# Train one model per quantile and return them keyed by quantile
def train_quantile_models(X, y, quantiles, n_estimators=100):
    quantile_models = {}
    for quantile in quantiles:
        model = XGBRegressor(
            objective="reg:quantileerror", quantile_alpha=quantile, n_estimators=n_estimators, max_depth=1
        )
        model.fit(X, y)
        quantile_models[quantile] = model
    return quantile_models
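# Note: the "reg:quantileerror" objective (pinball loss) requires XGBoost >= 2.0.
# max_depth=1 keeps each per-quantile model deliberately simple, which can also
# help limit crossing between independently trained quantile curves.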


# Min-max scale an array to the range [0, 1]
def normalize(data):
    data = np.array(data)
    normalized_data = (data - data.min()) / (data.max() - data.min())
    return normalized_data


# Calculate confidence based on the quantile predictions
def calculate_confidence(quantile_models, X, y):
    lower_05 = quantile_models[0.05].predict(X)
    lower_25 = quantile_models[0.25].predict(X)
    quant_50 = quantile_models[0.50].predict(X)
    upper_75 = quantile_models[0.75].predict(X)
    upper_95 = quantile_models[0.95].predict(X)

    # Domain-specific logic for calculating confidence:
    # if the interval width is greater than target_sensitivity we have 0 confidence;
    # anything below that is a linear scale from 0 to 1
    confidence_interval = upper_95 - lower_05
    q_conf = 1 - confidence_interval
    print(f"q_conf: {np.min(q_conf):.2f} {np.max(q_conf):.2f}")

    # Normalize the confidence to be between 0 and 1
    q_conf = normalize(q_conf)
    print(f"q_conf: {np.min(q_conf):.2f} {np.max(q_conf):.2f}")

    # Now look at the IQR distance for each observation
    target_sensitivity = 1.0
    epsilon_iqr = target_sensitivity * 0.5
    iqr = np.maximum(epsilon_iqr, np.abs(upper_75 - lower_25))
    iqr_distance = np.abs(y - quant_50) / iqr
    iqr_conf = 1 - iqr_distance
    print(f"iqr_conf: {np.min(iqr_conf):.2f} {np.max(iqr_conf):.2f}")

    # Normalize the confidence to be between 0 and 1
    iqr_conf = normalize(iqr_conf)
    print(f"iqr_conf: {np.min(iqr_conf):.2f} {np.max(iqr_conf):.2f}")

    # Combine the two confidence values
    confidence = (q_conf + iqr_conf) / 2
    return confidence
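# (Both scores are min-max normalized across the batch being scored, so this
# "confidence" is relative to the current dataset, not an absolute calibrated value.)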


# Train models
prediction_model = train_prediction_model(X_train, y_train)
quantiles = [0.05, 0.25, 0.50, 0.75, 0.95]
quantile_models = train_quantile_models(X_train, y_train, quantiles)

# Predictions
rmse_predictions = prediction_model.predict(X_train)

# Calculate confidence for the array of predictions
confidence_values = calculate_confidence(quantile_models, X_train, y_train)

# Compute model metrics for RMSE
rmse = np.sqrt(np.mean((y_train - rmse_predictions) ** 2))
print(f"RMSE: {rmse} support: {len(X_train)}")

# Confidence Threshold
conf_thres = 0.8

# Now filter the data based on confidence and give RMSE for the filtered data
rmse_predictions_filtered = rmse_predictions[confidence_values > conf_thres]
y_train_filtered = y_train[confidence_values > conf_thres]
rmse_filtered = np.sqrt(np.mean((y_train_filtered - rmse_predictions_filtered) ** 2))
print(f"RMSE Filtered: {rmse_filtered} support: {len(rmse_predictions_filtered)}")


# Plot the results
plt.figure(figsize=(10, 6))
x_values = df[feature]
# x_values = df["feature_1"]
# x_values = rmse_predictions

""" | ||
# Sort both the x and y values (for plotting purposes) | ||
sort_order = np.argsort(x_values) | ||
x_values = x_values[sort_order] | ||
y_train = y_train[sort_order] | ||
""" | ||
sc = plt.scatter(x_values, y_train, c=confidence_values, cmap="coolwarm", label="Data", alpha=0.5) | ||
plt.colorbar(sc, label="Confidence") | ||
|
||
# Sort x_values and the corresponding y-values for each quantile
sorted_indices = np.argsort(x_values)
sorted_x_values = x_values[sorted_indices]
for q in quantiles:
    sorted_y_values = quantile_models[q].predict(X_train)[sorted_indices]
    plt.plot(sorted_x_values, sorted_y_values, label=f"Quantile {int(q * 100):02}")
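# (Sorting by x keeps plt.plot from drawing zig-zag segments between unsorted points.)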

# Plot the RMSE predictions
plt.plot(sorted_x_values, rmse_predictions[sorted_indices], label="RMSE Prediction", color="black", linestyle="--")
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Quantile Regression and Confidence with XGBoost")
plt.legend()
plt.show()
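
One natural extension, not part of this commit: a minimal sketch of scoring a held-out split with the same functions. It assumes scikit-learn is available and reuses df, feature_list, target, and conf_thres from above; note that calculate_confidence needs the true labels, so it only applies where y is known.

# --- Sketch: confidence-filtered RMSE on a held-out split (assumed extension) ---
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(df[feature_list], df[target], test_size=0.2, random_state=42)
point_model = train_prediction_model(X_tr, y_tr)
q_models = train_quantile_models(X_tr, y_tr, [0.05, 0.25, 0.50, 0.75, 0.95])

test_preds = point_model.predict(X_te)
test_conf = calculate_confidence(q_models, X_te, y_te)
mask = test_conf > conf_thres

rmse_all = np.sqrt(np.mean((y_te.values - test_preds) ** 2))
rmse_conf = np.sqrt(np.mean((y_te.values[mask] - test_preds[mask]) ** 2))
print(f"Held-out RMSE: {rmse_all:.3f} support: {len(y_te)}")
print(f"Held-out RMSE (confidence > {conf_thres}): {rmse_conf:.3f} support: {mask.sum()}")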