You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I am new to AutoML. While exploring the Lending Club dataset, I found that model.predict(test) gives unexpected results.
# Import necessary packages
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Initialize the H2O connection before any frames are created.
h2o.init()

# Location of the Lending Club CSV. If possible, download the file from
# this S3 link and point `path` at the local copy instead.
path = "http://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/lending-club/LoanStats3a.csv"

# Parser type overrides, keyed by column name. The "string" columns are
# munged further down before being converted to numerics or factors.
types = dict(
    int_rate="string",
    revol_util="string",
    emp_length="string",
    earliest_cr_line="string",
    issue_d="string",
    last_credit_pull_d="factor",
    verification_status="string",
)

# Task 1: Import the file with the overrides above and summarize the frame.
data = h2o.import_file(path=path, col_types=types)
data.describe()
# Task 2: Look at the levels in the response column, "loan_status".
# Hint: Use the .table() function on the response column.
data["loan_status"].table()

# Task 3: Drop all loans that are still in progress and therefore cannot
# yet be deemed good or bad loans.
ongoing_statuses = [
    "Current",
    "In Grace Period",
    "Late (16-30 days)",
    "Late (31-120 days)",
]
still_ongoing = data["loan_status"].isin(ongoing_statuses)
data = data[~still_ongoing, :]
data.show()

# Task 4: Bin the response into good/bad loans only. "bad_loan" is a binary
# column, flagged for any of the statuses below and stored as a factor.
bad_statuses = [
    "Charged Off",
    "Default",
    "Does not meet the credit policy. Status:Charged Off",
]
data["bad_loan"] = data["loan_status"].isin(bad_statuses).asfactor()
data["bad_loan"]
# Task 5: Clean the string columns before converting them to numeric.
# Columns that need munging: "int_rate", "revol_util", "emp_length".

# The two percentage columns get the same treatment: strip the "%" sign,
# trim surrounding whitespace, then convert to a numeric column.
for pct_col in ("int_rate", "revol_util"):
    data[pct_col] = data[pct_col].gsub(pattern="%", replacement="").trim().asnumeric()
    data[pct_col].show()

# emp_length needs a few more steps.
# 1. Remove the " year"/" years" suffix and turn "n/a" into "".
emp_length = data["emp_length"].gsub(pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")
# 2. Trim any remaining surrounding spaces.
emp_length = emp_length.trim()
# 3. Map "< 1" to 0.5 and "10+" to 10 (mind the space inside "< 1").
emp_length = emp_length.gsub(pattern="< 1", replacement="0.5")
emp_length = emp_length.gsub(pattern="10\\+", replacement="10")
# 4. Store the cleaned values back as a numeric column.
data["emp_length"] = emp_length.asnumeric()
data["emp_length"].show()
# Task 6: Extract month and year from earliest_cr_line into two new columns,
# earliest_cr_month and earliest_cr_year.
# Split on "-" once and reuse the pieces: column 0 is taken as the month and
# column 1 as the year. Only the split-off year piece is converted to numeric;
# converting the unsplit earliest_cr_line string would not yield a year.
earliest_cr_parts = data["earliest_cr_line"].strsplit(pattern="-")
data["earliest_cr_month"] = earliest_cr_parts[0]
data["earliest_cr_year"] = earliest_cr_parts[1].asnumeric()
data["earliest_cr_year"].show()

# Task 7: Extract month and year from issue_d into issue_d_month and
# issue_d_year, using the same split-once approach as Task 6.
issue_d_parts = data["issue_d"].strsplit(pattern="-")
data["issue_d_month"] = issue_d_parts[0]
data["issue_d_year"] = issue_d_parts[1].asnumeric()
data["issue_d_year"].show()

# Task 8: Create a new column called credit_length: the number of years
# between the earliest credit line and the loan issue date.
data["credit_length"] = data["issue_d_year"] - data["earliest_cr_year"]
data["credit_length"].show()
# Task 9: Collapse verification_status to two levels, i.e. "verified" and
# "not verified", using the sub function.
# Order matters: the longer pattern must be replaced before its prefix.
for verified_pattern in ("VERIFIED - income source", "VERIFIED - income"):
    data["verification_status"] = data["verification_status"].sub(
        pattern=verified_pattern,
        replacement="verified",
    )
data["verification_status"] = data["verification_status"].asfactor()
# Seed shared by the random split and the model so that repeated runs give
# the same train/test rows and the same predictions.
SEED = 1234

# Task 10: Do a test-train split (80-20)
# One uniform random number per row; rows at or below 0.80 go to train.
s = data["int_rate"].runif(seed=SEED)
train = data[s <= 0.80]
test = data[s > 0.80]

# Task 11: Define your response and predictor variables
y = "bad_loan"
x = [
    "loan_amnt", "credit_length", "revol_util",
    "home_ownership", "annual_inc", "purpose", "addr_state", "dti",
    "delinq_2yrs", "total_acc", "verification_status", "term",
]

# Task 12: Set parameters for GBM model
# NOTE(review): with only 10 trees at learn_rate=0.05 the model moves very
# little from the overall bad-loan rate, so p1 is expected to stay low for
# every row when bad loans are the minority class. Raise ntrees and/or
# learn_rate to get more spread-out probabilities.
model = H2OGradientBoostingEstimator(
    model_id="BadLoanModel",
    score_each_iteration=True,
    ntrees=10,
    learn_rate=0.05,
    seed=SEED,
)

# Task 13: Build your model
model.train(x=x, y=y, training_frame=train, validation_frame=test)

# Task 14: View your model results
model

# Score the held-out rows. For a binary response the result holds the
# predicted label plus the class probabilities p0 and p1.
# NOTE(review): the predicted label is presumably chosen with the model's
# max-F1 threshold rather than 0.5, so a row can be labelled 1 even though
# p1 < p0 -- confirm against the H2O documentation.
predict = model.predict(test)
predict.head()
Results :
As per my understanding
p0 is the probability (between 0 and 1) that class 0 is chosen.
p1 is the probability (between 0 and 1) that class 1 is chosen.
I am new to AutoML. While exploring the Lending Club dataset, I found that
model.predict(test)
gives unexpected results.
Results:
As per my understanding
The results appear very strange: why is p1 always the lower of the two probabilities here?
The text was updated successfully, but these errors were encountered: