-
Notifications
You must be signed in to change notification settings - Fork 0
/
MLJ.jl
114 lines (84 loc) · 4.44 KB
/
MLJ.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Source: JuliaCon 2020 | MLJ: A Machine Learning Toolbox for Julia :
# https://www.youtube.com/watch?v=qSWbCn170HU&t=27s
using CSV
using DataFrames
using MLJ

# Read "horse.csv" (looked up relative to the current working directory) into
# a data frame.
file = CSV.File("horse.csv");
horse = DataFrames.DataFrame(file);

# Apply the scientific types suggested by `autotype`, then convert every
# Count column to Continuous.
coerce!(horse, autotype(horse));
coerce!(horse, Count => Continuous);

# Explicit per-column coercions, written as `column name => scientific type`.
coerce!(horse,
        :surgery => Multiclass,
        :age => Multiclass,
        :mucous_membranes => Multiclass,
        :capillary_refill_time => Multiclass,
        :outcome => Multiclass,
        :cp_data => Multiclass);

# Split the table into the target `y` (the :outcome column) and the features
# `X` (only the columns whose element scitype is Continuous).
# The column is read straight from the data frame so that no package beyond
# the three loaded above has to be in scope for the predicate to work.
y, X = unpack(horse,
              ==(:outcome),
              name -> elscitype(horse[!, name]) == Continuous);

# 70/30 shuffled split of the row indices. The seed is fixed so that the split,
# and every metric computed from it below, is reproducible between runs.
train, test = partition(eachindex(y), 0.7, shuffle=true, rng=123);
# `@load` brings the model type into scope. Naming the package matters when
# models with the same name are provided by more than one package.
lc = @load LogisticClassifier pkg=MLJLinearModels verbosity=0;

# Instantiate the model: lambda is set to 100, every other hyperparameter
# keeps its default value.
model = lc(lambda=100);

# Bind the model to the full data set, then train on the training rows only.
mach = machine(model, X, y);
fit!(mach; rows=train);

# Predict on the held-out rows and score the most likely class of each
# prediction against the true labels.
yhat = predict(mach; rows=test);
misclassification_rate(broadcast(mode, yhat), y[test])
# Tuning strategy: random search with a fixed seed for reproducibility.
tuning = RandomSearch(rng=123)

# Search space: `lambda` between 1 and 150, on a log scale.
r = range(model, :lambda, lower=1, upper=150, scale=:log);

# Wrap the model in a self-tuning model. The wrapper describes how to tune:
# which range to search, how to resample, which measure to optimise, which
# strategy to use and how many candidate models (`n`) to evaluate.
# The strategy built above is passed in here; previously it was constructed
# but a 10-point grid was used instead, which `n=30` does not fit.
tuned_model = TunedModel(model=model,
                         range=r,
                         resampling=CV(nfolds=6),
                         measure=cross_entropy,
                         tuning=tuning,
                         n=30)

self_tuning_mach = machine(tuned_model, X, y)

# The optimisation happens here, using the strategy configured above. Once
# the best model is found it is retrained on all rows given to `fit!`.
# Only the training rows are supplied, so the test rows below stay unseen and
# the score is comparable with the untuned model's score.
fit!(self_tuning_mach, rows=train)

# Out-of-sample misclassification rate of the tuned model.
ythat = predict(self_tuning_mach, rows=test);
misclassification_rate(mode.(ythat), y[test])
# Evaluate the machine's model over the hyperparameter range `r`, measuring
# cross entropy on a 70/30 holdout split.
curve = learning_curve(mach,
                       range=r,
                       resampling=Holdout(fraction_train=0.7), # (default)
                       measure=cross_entropy)

using Plots

# Keep the plot handle: the labelling calls below need it (it was previously
# referenced without ever being assigned).
plt = plot(curve.parameter_values, curve.measurements)

# Label the x-axis with the hyperparameter that was actually varied, rather
# than a hard-coded name that does not match the range.
xlabel!(plt, curve.parameter_name)
ylabel!(plt, "cross entropy on holdout set")
using MLJ

# Synthetic regression problem: three uniform random features and a target
# that depends linearly on the first two, plus a little noise.
# The data get their own names so that they do not overwrite the `X` and `y`
# that other statements in this file pair with `model`, `train` and `test`.
X_reg = (x1=rand(100), x2=rand(100), x3=rand(100));
y_reg = 2 .* X_reg.x1 .- X_reg.x2 .+ 0.05 .* rand(100);

# Load the model type and instantiate it with default hyperparameters.
dtr = @load DecisionTreeRegressor pkg=DecisionTree;
tree_model = dtr();

# Set the range of the hyperparameter to tune.
r = range(tree_model, :min_purity_increase, lower=0.001, upper=1.0, scale=:log);

# TunedModel wraps the model; its constructor arguments describe what kind of
# tuning to do (here a 10-point grid, scored by RMS under 3-fold CV).
# The resulting model is a self-tuning model.
tuned_model = TunedModel(model=tree_model,
                         resampling=CV(nfolds=3),
                         tuning=Grid(resolution=10),
                         range=r,
                         measure=rms);

self_tuning_mach = machine(tuned_model, X_reg, y_reg)

# The optimisation happens here, using the strategy configured above. Once
# the best model is found it is retrained on all available data and that
# result is stored in the machine.
fit!(self_tuning_mach)

# The model with the optimal hyperparameter; predictions from
# `self_tuning_mach` use this model.
fitted_params(self_tuning_mach).best_model
# NOTE(review): this section assumes `model`, `X`, `y`, `train` and `test`
# still refer to the classifier and the horse data prepared at the top of the
# file - confirm nothing in between has reassigned them.

# After instantiating a model we construct a machine: a model bound to data.
# It is convenient to give it the complete data set and select rows later.
mach = machine(model, X, y) # for supervised

# Train the model on the training rows; the learned parameters are stored in
# the machine. If you change a hyperparameter (model.<name> = value), call
# fit! again to retrain.
fit!(mach, rows=train, verbosity=2)

# Inspect the learned parameters as a named tuple. This has to come after
# fit!, since an untrained machine has nothing to report.
fitted_params(mach)

# Given the trained machine and either new data or row indices, `predict`
# returns probabilistic predictions by default; `predict_mode` returns the
# class with the highest probability instead.
yhat = predict(mach, rows=test)

# Measure the performance of the model on the held-out rows.
cross_entropy(yhat, y[test]) |> mean
misclassification_rate(mode.(yhat), y[test])

# List the measures that are applicable to this target.
measures(matching(y))

# `@load` returns the model *type*: instantiate it first, then set
# hyperparameters as plain fields of the instance. A separate name is used so
# that `model` keeps referring to the model wrapped by `mach`.
# NOTE(review): MLJFlux is assumed to be the intended provider - confirm.
nnc = @load NeuralNetworkClassifier pkg=MLJFlux verbosity=0
nn_model = nnc()
nn_model.epochs = 12

# With a model bound to data, estimate its performance by resampling.
evaluate!(mach, resampling=Holdout(fraction_train=0.7), measures=[cross_entropy])
evaluate!(mach, resampling=CV(nfolds=6), measures=[cross_entropy])

# `range` takes the model, the name of a hyperparameter, its lower and upper
# bounds, and a scale.