diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 31959ec62..a103cea1e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,7 +49,7 @@ jobs: env: JULIA_NUM_THREADS: 2 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v3 with: file: lcov.info docs: diff --git a/Project.toml b/Project.toml index ae84eb3b7..181d50af4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJBase" uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d" authors = ["Anthony D. Blaom "] -version = "1.2.1" +version = "1.3" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" diff --git a/src/MLJBase.jl b/src/MLJBase.jl index bde58cc4e..0d58635df 100644 --- a/src/MLJBase.jl +++ b/src/MLJBase.jl @@ -291,8 +291,8 @@ export machines, sources, Stack, export TransformedTargetModel # resampling.jl: -export ResamplingStrategy, Holdout, CV, StratifiedCV, TimeSeriesCV, - evaluate!, Resampler, PerformanceEvaluation +export ResamplingStrategy, InSample, Holdout, CV, StratifiedCV, TimeSeriesCV, + evaluate!, Resampler, PerformanceEvaluation, CompactPerformanceEvaluation # `MLJType` and the abstract `Model` subtypes are exported from within # src/composition/abstract_types.jl diff --git a/src/composition/learning_networks/nodes.jl b/src/composition/learning_networks/nodes.jl index 5ede32aaa..07bd3ae05 100644 --- a/src/composition/learning_networks/nodes.jl +++ b/src/composition/learning_networks/nodes.jl @@ -27,9 +27,9 @@ See also [`node`](@ref), [`Source`](@ref), [`origins`](@ref), [`sources`](@ref), [`fit!`](@ref). """ -struct Node{T<:Union{Machine, Nothing}} <: AbstractNode +struct Node{T<:Union{Machine, Nothing},Oper} <: AbstractNode - operation # eg, `predict` or a static operation, such as `exp` + operation::Oper # eg, `predict` or a static operation, such as `exp` machine::T # is `nothing` for static operations # nodes called to get args for `operation(model, ...) ` or @@ -43,9 +43,11 @@ struct Node{T<:Union{Machine, Nothing}} <: AbstractNode # order consistent with extended graph, excluding self nodes::Vector{AbstractNode} - function Node(operation, - machine::T, - args::AbstractNode...) where T<:Union{Machine, Nothing} + function Node( + operation::Oper, + machine::T, + args::AbstractNode..., + ) where {T<:Union{Machine, Nothing}, Oper} # check the number of arguments: # if machine === nothing && isempty(args) @@ -70,7 +72,7 @@ struct Node{T<:Union{Machine, Nothing}} <: AbstractNode vcat(nodes_, (nodes(n) for n in machine.args)...) |> unique end - return new{T}(operation, machine, args, origins_, nodes_) + return new{T,Oper}(operation, machine, args, origins_, nodes_) end end @@ -407,14 +409,14 @@ of nodes, sources and other arguments. 
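The `Node` struct in the hunk above now carries the operation's type as a second parameter, `Oper`, alongside the machine type. A minimal sketch of how this surfaces for a static node, using only API already shown in this file (the final `isa` check is an expectation based on the new `new{T,Oper}` call, not captured output):

```julia
using MLJBase

X = source(π)
W = node(sin, X)   # equivalent to `@node sin(X)`; no machine, so the machine type is `Nothing`

W()                                 # computes sin(π)
W isa Node{Nothing, typeof(sin)}    # expected: true under the new parameterization
```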
### Examples -``` -X = source(π) -W = @node sin(X) +```julia-repl +julia> X = source(π) +julia> W = @node sin(X) julia> W() 0 -X = source(1:10) -Y = @node selectrows(X, 3:4) +julia> X = source(1:10) +julia> Y = @node selectrows(X, 3:4) julia> Y() 3:4 @@ -423,10 +425,10 @@ julia> Y(["one", "two", "three", "four"]) "three" "four" -X1 = source(4) -X2 = source(5) -add(a, b, c) = a + b + c -N = @node add(X1, 1, X2) +julia> X1 = source(4) +julia> X2 = source(5) +julia> add(a, b, c) = a + b + c +julia> N = @node add(X1, 1, X2) julia> N() 10 diff --git a/src/composition/learning_networks/signatures.jl b/src/composition/learning_networks/signatures.jl index d49aace9d..7ffadb7c5 100644 --- a/src/composition/learning_networks/signatures.jl +++ b/src/composition/learning_networks/signatures.jl @@ -8,10 +8,10 @@ **Private method.** -Return a dictionary of machines, keyed on model, for the all machines in the completed -learning network for which `node` is the greatest lower bound. Only machines bound to -symbolic models are included. Values are always vectors, even if they contain only a -single machine. +Return a dictionary of machines, keyed on model, for the all machines in the +completed learning network for which `node` is the greatest lower bound. Only +machines bound to symbolic models are included. Values are always vectors, +even if they contain only a single machine. """ function machines_given_model(node::AbstractNode) @@ -35,14 +35,14 @@ attempt_scalarize(v) = length(v) == 1 ? v[1] : v **Private method.** -Given a dictionary of machine vectors, keyed on model names (symbols), broadcast `f` over -each vector, and make the result, in the returned named tuple, the value associated with -the corresponding model name as key. +Given a dictionary of machine vectors, keyed on model names (symbols), broadcast +`f` over each vector, and make the result, in the returned named tuple, the +value associated with the corresponding model name as key. Singleton vector values are scalarized, unless `scalarize = false`. -If a value in the computed named tuple is `nothing`, or a vector of `nothing`s, then the -entry is dropped from the tuple, unless `drop_nothings=false`. +If a value in the computed named tuple is `nothing`, or a vector of `nothing`s, +then the entry is dropped from the tuple, unless `drop_nothings=false`. """ function tuple_keyed_on_model(f, machines_given_model; scalarize=true, drop_nothings=true) diff --git a/src/composition/models/stacking.jl b/src/composition/models/stacking.jl index ec872c167..9f6b4121f 100644 --- a/src/composition/models/stacking.jl +++ b/src/composition/models/stacking.jl @@ -337,12 +337,12 @@ internal_stack_report( ) = NamedTuple{}() """ -internal_stack_report( - m::Stack, - verbosity::Int, - y::AbstractNode, - folds_evaluations::Vararg{AbstractNode}, -) + internal_stack_report( + m::Stack, + verbosity::Int, + y::AbstractNode, + folds_evaluations::Vararg{AbstractNode}, + ) When measure/measures is provided, the folds_evaluation will have been filled by `store_for_evaluation`. 
This function is not doing any heavy work (not constructing nodes @@ -518,7 +518,7 @@ function oos_set(m::Stack{modelnames}, Xs::Source, ys::Source, tt_pairs) where m end ####################################### -################# Prefit ################# +################# Prefit ############## ####################################### function prefit(m::Stack{modelnames}, verbosity::Int, X, y) where modelnames @@ -564,8 +564,7 @@ const DOC_STACK = Stack(; metalearner=nothing, name1=model1, name2=model2, ..., keyword_options...) Implements the two-layer generalized stack algorithm introduced by -[Wolpert -(1992)](https://www.sciencedirect.com/science/article/abs/pii/S0893608005800231) +[Wolpert (1992)](https://www.sciencedirect.com/science/article/abs/pii/S0893608005800231) and generalized by [Van der Laan et al (2007)](https://biostats.bepress.com/ucbbiostat/paper222/). Returns an instance of type `ProbabilisticStack` or `DeterministicStack`, diff --git a/src/composition/models/transformed_target_model.jl b/src/composition/models/transformed_target_model.jl index 259cff97a..9304b63ca 100644 --- a/src/composition/models/transformed_target_model.jl +++ b/src/composition/models/transformed_target_model.jl @@ -61,7 +61,7 @@ const ERR_MODEL_UNSPECIFIED = ArgumentError( "Expecting atomic model as argument. None specified. " ) const ERR_TRANSFORMER_UNSPECIFIED = ArgumentError( -"You must specify `transformer=...`. ." + "You must specify `transformer=...`. ." ) const ERR_TOO_MANY_ARGUMENTS = ArgumentError( "At most one non-keyword argument, a model, allowed. " @@ -123,7 +123,7 @@ y -> mode.(y))`. A model that normalizes the target before applying ridge regression, with predictions returned on the original scale: -``` +```julia @load RidgeRegressor pkg=MLJLinearModels model = RidgeRegressor() tmodel = TransformedTargetModel(model, transformer=Standardizer()) @@ -132,7 +132,7 @@ tmodel = TransformedTargetModel(model, transformer=Standardizer()) A model that applies a static `log` transformation to the data, again returning predictions to the original scale: -``` +```julia tmodel2 = TransformedTargetModel(model, transformer=y->log.(y), inverse=z->exp.(y)) ``` diff --git a/src/data/data.jl b/src/data/data.jl index 32477a759..d32428073 100644 --- a/src/data/data.jl +++ b/src/data/data.jl @@ -104,23 +104,28 @@ corresponding `fractions` of `length(nrows(X))`, where valid fractions are floats between 0 and 1 whose sum is less than one. The last fraction is not provided, as it is inferred from the preceding ones. -For "synchronized" partitioning of multiple objects, use the -`multi=true` option described below. +For synchronized partitioning of multiple objects, use the +`multi=true` option. 
- julia> partition(1:1000, 0.8) - ([1,...,800], [801,...,1000]) +```julia-repl +julia> partition(1:1000, 0.8) +([1,...,800], [801,...,1000]) - julia> partition(1:1000, 0.2, 0.7) - ([1,...,200], [201,...,900], [901,...,1000]) +julia> partition(1:1000, 0.2, 0.7) +([1,...,200], [201,...,900], [901,...,1000]) - julia> partition(reshape(1:10, 5, 2), 0.2, 0.4) - ([1 6], [2 7; 3 8], [4 9; 5 10]) +julia> partition(reshape(1:10, 5, 2), 0.2, 0.4) +([1 6], [2 7; 3 8], [4 9; 5 10]) - X, y = make_blobs() # a table and vector - Xtrain, Xtest = partition(X, 0.8, stratify=y) +julia> X, y = make_blobs() # a table and vector +julia> Xtrain, Xtest = partition(X, 0.8, stratify=y) +``` - (Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true) +Here's an example of synchronized partitioning of multiple objects: +```julia-repl +julia> (Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true) +``` ## Keywords @@ -209,7 +214,7 @@ Returns a tuple of tables/vectors with length one greater than the number of supplied predicates, with the last component including all previously unselected columns. -``` +```julia-repl julia> table = DataFrame(x=[1,2], y=['a', 'b'], z=[10.0, 20.0], w=["A", "B"]) 2×4 DataFrame Row │ x y z w @@ -218,7 +223,7 @@ julia> table = DataFrame(x=[1,2], y=['a', 'b'], z=[10.0, 20.0], w=["A", "B"]) 1 │ 1 a 10.0 A 2 │ 2 b 20.0 B -Z, XY, W = unpack(table, ==(:z), !=(:w)) +julia> Z, XY, W = unpack(table, ==(:z), !=(:w)); julia> Z 2-element Vector{Float64}: 10.0 @@ -300,9 +305,11 @@ The method is curried, so that `restrict(folds, i)` is the operator on data defined by `restrict(folds, i)(X) = restrict(X, folds, i)`. ### Example - - folds = ([1, 2], [3, 4, 5], [6,]) - restrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x3, :x4, :x5] +# +```julia +folds = ([1, 2], [3, 4, 5], [6,]) +restrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x3, :x4, :x5] +``` See also [`corestrict`](@ref) @@ -322,7 +329,9 @@ all elements of `folds`. Here `folds` is a vector or tuple of integer vectors, typically representing row indices or a vector, matrix or table. - complement(([1,2], [3,], [4, 5]), 2) # [1 ,2, 4, 5] +```julia +complement(([1,2], [3,], [4, 5]), 2) # [1 ,2, 4, 5] +``` """ complement(f, i) = reduce(vcat, collect(f)[Not(i)]) @@ -345,8 +354,10 @@ on data defined by `corestrict(folds, i)(X) = corestrict(X, folds, i)`. ### Example - folds = ([1, 2], [3, 4, 5], [6,]) - corestrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x1, :x2, :x6] +```julia +folds = ([1, 2], [3, 4, 5], [6,]) +corestrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x1, :x2, :x6] +``` """ corestrict(f::NTuple{N}, i) where N = FoldComplementRestrictor{i,N}(f) diff --git a/src/data/datasets.jl b/src/data/datasets.jl index 9e84b75c2..ba4d88db3 100644 --- a/src/data/datasets.jl +++ b/src/data/datasets.jl @@ -158,7 +158,7 @@ const COERCE_SUNSPOTS = ( (:sunspot_number=>Continuous),) """ -load_dataset(fpath, coercions) + load_dataset(fpath, coercions) Load one of standard dataset like Boston etc assuming the file is a comma separated file with a header. diff --git a/src/data/datasets_synthetic.jl b/src/data/datasets_synthetic.jl index 58a984c4f..d1a8830ef 100644 --- a/src/data/datasets_synthetic.jl +++ b/src/data/datasets_synthetic.jl @@ -18,9 +18,6 @@ const EXTRA_CLASSIFICATION = Internal function to finalize the `make_*` functions. 
""" -x = [1 2 3 ; 4 5 6] -x -length(size(collect(1:3))) # ( function finalize_Xy(X, y, shuffle, as_table, eltype, rng; clf::Bool=true) # Shuffle the rows if required if shuffle @@ -78,7 +75,7 @@ By default, a table `X` with `p` columns (features) and `n` rows ### Example -``` +```julia X, y = make_blobs(100, 3; centers=2, cluster_std=[1.0, 3.0]) ``` @@ -95,8 +92,7 @@ function make_blobs(n::Integer=100, # check arguments make sense if n < 1 || p < 1 - throw(ArgumentError( - "Expected `n` and `p` to be at least 1.")) + throw(ArgumentError("Expected `n` and `p` to be at least 1.")) end if center_box.first >= center_box.second throw(ArgumentError( @@ -181,7 +177,7 @@ $(EXTRA_KW_MAKE*EXTRA_CLASSIFICATION) ### Example -``` +```julia X, y = make_circles(100; noise=0.5, factor=0.3) ``` @@ -196,12 +192,10 @@ function make_circles(n::Integer=100; # check arguments make sense if n < 1 - throw(ArgumentError( - "Expected `n` to be at least 1.")) + throw(ArgumentError("Expected `n` to be at least 1.")) end if noise < 0 - throw(ArgumentError( - "Noise argument cannot be negative.")) + throw(ArgumentError("Noise argument cannot be negative.")) end if !(0 < factor < 1) throw(ArgumentError( @@ -224,12 +218,12 @@ function make_circles(n::Integer=100; X .+= noise .* randn(rng, n, 2) end - return finalize_Xy(X, y, shuffle, as_table, eltype, rng) + return finalize_Xy(X, y, shuffle, as_table, eltype, rng) end """ - make_moons(n::Int=100; kwargs...) + make_moons(n::Int=100; kwargs...) Generates labeled two-dimensional points lying close to two interleaved semi-circles, for use with classification and clustering @@ -257,7 +251,7 @@ membership to the left or right semi-circle. ### Example -``` +```julia X, y = make_moons(100; noise=0.5) ``` @@ -273,12 +267,10 @@ function make_moons(n::Int=150; # check arguments make sense if n < 1 - throw(ArgumentError( - "Expected `n` to be at least 1.")) + throw(ArgumentError("Expected `n` to be at least 1.")) end if noise < 0 - throw(ArgumentError( - "Noise argument cannot be negative.")) + throw(ArgumentError("Noise argument cannot be negative.")) end rng = init_rng(rng) @@ -324,8 +316,7 @@ end Make portion `s` of vector `θ` exactly 0. 
""" -sparsify!(rng, θ, s) = - (θ .*= (rand(rng, length(θ)) .< s)) +sparsify!(rng, θ, s) = (θ .*= (rand(rng, length(θ)) .< s)) """Add outliers to portion s of vector.""" outlify!(rng, y, s) = @@ -338,19 +329,18 @@ const SIGMOID_32 = log(Float32(1)/eps(Float32) - Float32(1)) sigmoid(x) Return the sigmoid computed in a numerically stable way: - ``σ(x) = 1/(1+exp(-x))`` """ function sigmoid(x::Float64) - x > SIGMOID_64 && return one(x) - x < -SIGMOID_64 && return zero(x) - return one(x) / (one(x) + exp(-x)) + x > SIGMOID_64 && return one(x) + x < -SIGMOID_64 && return zero(x) + return one(x) / (one(x) + exp(-x)) end function sigmoid(x::Float32) - x > SIGMOID_32 && return one(x) - x < -SIGMOID_32 && return zero(x) - return one(x) / (one(x) + exp(-x)) + x > SIGMOID_32 && return one(x) + x < -SIGMOID_32 && return zero(x) + return one(x) / (one(x) + exp(-x)) end sigmoid(x) = sigmoid(float(x)) @@ -392,7 +382,7 @@ $EXTRA_KW_MAKE ### Example -``` +```julia X, y = make_regression(100, 5; noise=0.5, sparse=0.2, outliers=0.1) ``` @@ -411,24 +401,19 @@ function make_regression(n::Int=100, # check arguments make sense if n < 1 || p < 1 - throw(ArgumentError( - "Expected `n` and `p` to be at least 1.")) + throw(ArgumentError("Expected `n` and `p` to be at least 1.")) end if n_targets < 1 - throw(ArgumentError( - "Expected `n_targets` to be at least 1.")) + throw(ArgumentError("Expected `n_targets` to be at least 1.")) end if !(0 <= sparse < 1) - throw(ArgumentError( - "Sparsity argument must be in [0, 1).")) + throw(ArgumentError("Sparsity argument must be in [0, 1).")) end if noise < 0 - throw(ArgumentError( - "Noise argument cannot be negative.")) + throw(ArgumentError("Noise argument cannot be negative.")) end if !(0 <= outliers <= 1) - throw(ArgumentError( - "Outliers argument must be in [0, 1].")) + throw(ArgumentError("Outliers argument must be in [0, 1].")) end rng = init_rng(rng) diff --git a/src/hyperparam/one_dimensional_range_methods.jl b/src/hyperparam/one_dimensional_range_methods.jl index 5afc670c9..dc82d03a5 100644 --- a/src/hyperparam/one_dimensional_range_methods.jl +++ b/src/hyperparam/one_dimensional_range_methods.jl @@ -296,9 +296,11 @@ Construct an object `s` which can be used to generate random samples from a `ParamRange` object `r` (a one-dimensional range) using one of the following calls: - rand(s) # for one sample - rand(s, n) # for n samples - rand(rng, s [, n]) # to specify an RNG +```julia +rand(s) # for one sample +rand(s, n) # for n samples +rand(rng, s [, n]) # to specify an RNG +``` The argument `probs` can be any probability vector with the same length as `r.values`. The second `sampler` method above calls the @@ -329,30 +331,32 @@ in the special case `r.scale` is a callable object `f`. 
In that case, ### Examples - r = range(Char, :letter, values=collect("abc")) - s = sampler(r, [0.1, 0.2, 0.7]) - samples = rand(s, 1000); - StatsBase.countmap(samples) - Dict{Char,Int64} with 3 entries: - 'a' => 107 - 'b' => 205 - 'c' => 688 - - r = range(Int, :k, lower=2, upper=6) # numeric but discrete - s = sampler(r, Normal) - samples = rand(s, 1000); - UnicodePlots.histogram(samples) - ┌ ┐ - [2.0, 2.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 119 - [2.5, 3.0) ┤ 0 - [3.0, 3.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 296 - [3.5, 4.0) ┤ 0 - [4.0, 4.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 275 - [4.5, 5.0) ┤ 0 - [5.0, 5.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 221 - [5.5, 6.0) ┤ 0 - [6.0, 6.5) ┤▇▇▇▇▇▇▇▇▇▇▇ 89 - └ ┘ +```julia-repl +julia> r = range(Char, :letter, values=collect("abc")) +julia> s = sampler(r, [0.1, 0.2, 0.7]) +julia> samples = rand(s, 1000); +julia> StatsBase.countmap(samples) +Dict{Char,Int64} with 3 entries: + 'a' => 107 + 'b' => 205 + 'c' => 688 + +julia> r = range(Int, :k, lower=2, upper=6) # numeric but discrete +julia> s = sampler(r, Normal) +julia> samples = rand(s, 1000); +julia> UnicodePlots.histogram(samples) + ┌ ┐ +[2.0, 2.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 119 +[2.5, 3.0) ┤ 0 +[3.0, 3.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 296 +[3.5, 4.0) ┤ 0 +[4.0, 4.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 275 +[4.5, 5.0) ┤ 0 +[5.0, 5.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 221 +[5.5, 6.0) ┤ 0 +[6.0, 6.5) ┤▇▇▇▇▇▇▇▇▇▇▇ 89 + └ ┘ +``` """ Distributions.sampler(r::NumericRange{T}, diff --git a/src/machines.jl b/src/machines.jl index 7a6e8bdca..1a3f53886 100644 --- a/src/machines.jl +++ b/src/machines.jl @@ -14,7 +14,7 @@ The effect of the `scitype_check_level` option in calls of the form `machine(model, data, scitype_check_level=...)` is summarized below: `scitype_check_level` | Inspect scitypes? | If `Unknown` in scitypes | If other scitype mismatch | -|:-------------------:|:-----------------:|:------------------------:|:-------------------------:| +|:--------------------|:-----------------:|:------------------------:|:-------------------------:| 0 | × | | | 1 (value at startup) | ✓ | | warning | 2 | ✓ | warning | warning | @@ -47,12 +47,14 @@ caches_data_by_default(m) = caches_data_by_default(typeof(m)) caches_data_by_default(::Type) = true caches_data_by_default(::Type{<:Symbol}) = false -mutable struct Machine{M,C} <: MLJType +mutable struct Machine{M,OM,C} <: MLJType model::M - old_model # for remembering the model used in last call to `fit!` + old_model::OM # for remembering the model used in last call to `fit!` + + # the next two refer to objects returned by `MLJModlelInterface.fit(::M, ...)`. fitresult - cache + cache # relevant to `MLJModelInterface.update`, not to be confused with type param `C` # training arguments (`Node`s or user-specified data wrapped in # `Source`s): @@ -77,8 +79,11 @@ mutable struct Machine{M,C} <: MLJType function Machine( model::M, args::AbstractNode...; cache=caches_data_by_default(model), - ) where M - mach = new{M,cache}(model) + ) where M + # In the case of symbolic model, machine cannot know the type of model to be fit + # at time of construction: + OM = M == Symbol ? 
Any : M + mach = new{M,OM,cache}(model) # (this `cache` is not the *field* `cache`) mach.frozen = false mach.state = 0 mach.args = args @@ -89,6 +94,8 @@ mutable struct Machine{M,C} <: MLJType end +caches_data(::Machine{<:Any, <:Any, C}) where C = C + """ age(mach::Machine) @@ -113,9 +120,9 @@ any upstream dependencies in a learning network): ```julia replace(mach, :args => (), :data => (), :data_resampled_data => (), :cache => nothing) - +``` """ -function Base.replace(mach::Machine{<:Any,C}, field_value_pairs::Pair...) where C +function Base.replace(mach::Machine{<:Any,<:Any,C}, field_value_pairs::Pair...) where C # determined new `model` and `args` and build replacement dictionary: newfield_given_old = Dict(field_value_pairs) # to be extended fields_to_be_replaced = keys(newfield_given_old) @@ -199,8 +206,7 @@ const WARN_UNKNOWN_SCITYPE = "Some data contains `Unknown` scitypes, which might lead to model-data mismatches. " err_length_mismatch(model) = DimensionMismatch( - "Differing number of observations "* - "in input and target. ") + "Differing number of observations in input and target. ") function check(model::Model, scitype_check_level, args...) @@ -436,8 +442,8 @@ machines(::Source) = Machine[] ## DISPLAY -_cache_status(::Machine{<:Any,true}) = "caches model-specific representations of data" -_cache_status(::Machine{<:Any,false}) = "does not cache data" +_cache_status(::Machine{<:Any,<:Any,true}) = "caches model-specific representations of data" +_cache_status(::Machine{<:Any,<:Any,false}) = "does not cache data" function Base.show(io::IO, mach::Machine) model = mach.model @@ -502,8 +508,8 @@ end # for getting model specific representation of the row-restricted # training data from a machine, according to the value of the machine # type parameter `C` (`true` or `false`): -_resampled_data(mach::Machine{<:Any,true}, model, rows) = mach.resampled_data -function _resampled_data(mach::Machine{<:Any,false}, model, rows) +_resampled_data(mach::Machine{<:Any,<:Any,true}, model, rows) = mach.resampled_data +function _resampled_data(mach::Machine{<:Any,<:Any,false}, model, rows) raw_args = map(N -> N(), mach.args) data = MMI.reformat(model, raw_args...) return selectrows(model, rows, data...) @@ -518,6 +524,10 @@ err_no_real_model(mach) = ErrorException( """ ) +err_missing_model(model) = ErrorException( + "Specified `composite` model does not have `:$(model)` as a field." +) + """ last_model(mach::Machine) @@ -605,7 +615,7 @@ more on these lower-level training methods. """ function fit_only!( - mach::Machine{<:Any,cache_data}; + mach::Machine{<:Any,<:Any,cache_data}; rows=nothing, verbosity=1, force=false, @@ -628,7 +638,8 @@ function fit_only!( # `getproperty(composite, mach.model)`: model = if mach.model isa Symbol isnothing(composite) && throw(err_no_real_model(mach)) - mach.model in propertynames(composite) + mach.model in propertynames(composite) || + throw(err_missing_model(model)) getproperty(composite, mach.model) else mach.model @@ -670,7 +681,7 @@ function fit_only!( force == true || # condition (ii) upstream_has_changed || # condition (iii) condition_iv || # condition (iv) - modeltype_changed # conditions (vi) or (vii) + modeltype_changed # conditions (vi) or (vii) isdefined(mach, :report) || (mach.report = LittleDict{Symbol,Any}()) @@ -795,12 +806,12 @@ type's field names as keys. The corresponding value is the fitted parameters for machine in the underlying learning network bound to that model. (If multiple machines share the same model, then the value is a vector.) 
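As an aside on the new `Machine{M,OM,C}` parameterization and the `caches_data` helper defined above, a rough sketch assuming those definitions; `ConstantClassifier` and `make_blobs` are borrowed from examples elsewhere in this diff, and `caches_data` is unexported, so it is qualified here:

```julia
using MLJBase, MLJModels

X, y = make_blobs()                                       # toy table and vector
mach  = machine(ConstantClassifier(), X, y)               # caches data by default
mach2 = machine(ConstantClassifier(), X, y, cache=false)

MLJBase.caches_data(mach)     # true  -- read off the `C` type parameter
MLJBase.caches_data(mach2)    # false

# For a non-symbolic model the new `OM` parameter is just the model type:
mach isa Machine{ConstantClassifier, ConstantClassifier, true}   # expected: true
```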
-```julia -using MLJ -@load LogisticClassifier pkg=MLJLinearModels -X, y = @load_crabs; -pipe = Standardizer() |> LogisticClassifier() -mach = machine(pipe, X, y) |> fit! +```julia-repl +julia> using MLJ +julia> @load LogisticClassifier pkg=MLJLinearModels +julia> X, y = @load_crabs; +julia> pipe = Standardizer() |> LogisticClassifier(); +julia> mach = machine(pipe, X, y) |> fit!; julia> fitted_params(mach).logistic_classifier (classes = CategoricalArrays.CategoricalValue{String,UInt32}["B", "O"], @@ -833,12 +844,12 @@ type's field names as keys. The corresponding value is the report for the machin underlying learning network bound to that model. (If multiple machines share the same model, then the value is a vector.) -```julia -using MLJ -@load LinearBinaryClassifier pkg=GLM -X, y = @load_crabs; -pipe = Standardizer() |> LinearBinaryClassifier() -mach = machine(pipe, X, y) |> fit! +```julia-repl +julia> using MLJ +julia> @load LinearBinaryClassifier pkg=GLM +julia> X, y = @load_crabs; +julia> pipe = Standardizer() |> LinearBinaryClassifier(); +julia> mach = machine(pipe, X, y) |> fit!; julia> report(mach).linear_binary_classifier (deviance = 3.8893386087844543e-7, @@ -945,29 +956,29 @@ A machine returned by `serializable` is characterized by the property `mach.state == -1`. ### Example using [JLSO](https://invenia.github.io/JLSO.jl/stable/) - - using MLJ - using JLSO - Tree = @load DecisionTreeClassifier - tree = Tree() - X, y = @load_iris - mach = fit!(machine(tree, X, y)) - - # This machine can now be serialized - smach = serializable(mach) - JLSO.save("machine.jlso", :machine => smach) - - # Deserialize and restore learned parameters to useable form: - loaded_mach = JLSO.load("machine.jlso")[:machine] - restore!(loaded_mach) - - predict(loaded_mach, X) - predict(mach, X) - +```julia +using MLJ +using JLSO +Tree = @load DecisionTreeClassifier +tree = Tree() +X, y = @load_iris +mach = fit!(machine(tree, X, y)) + +# This machine can now be serialized +smach = serializable(mach) +JLSO.save("machine.jlso", :machine => smach) + +# Deserialize and restore learned parameters to useable form: +loaded_mach = JLSO.load("machine.jlso")[:machine] +restore!(loaded_mach) + +predict(loaded_mach, X) +predict(mach, X) +``` See also [`restore!`](@ref), [`MLJBase.save`](@ref). """ -function serializable(mach::Machine{<:Any, C}, model=mach.model; verbosity=1) where C +function serializable(mach::Machine{<:Any,<:Any,C}, model=mach.model; verbosity=1) where C isdefined(mach, :fitresult) || throw(ERR_SERIALIZING_UNTRAINED) mach.state == -1 && return mach @@ -1039,21 +1050,23 @@ the example below. ### Example - using MLJ - Tree = @load DecisionTreeClassifier - X, y = @load_iris - mach = fit!(machine(Tree(), X, y)) - - MLJ.save("tree.jls", mach) - mach_predict_only = machine("tree.jls") - predict(mach_predict_only, X) - - # using a buffer: - io = IOBuffer() - MLJ.save(io, mach) - seekstart(io) - predict_only_mach = machine(io) - predict(predict_only_mach, X) +```julia +using MLJ +Tree = @load DecisionTreeClassifier +X, y = @load_iris +mach = fit!(machine(Tree(), X, y)) + +MLJ.save("tree.jls", mach) +mach_predict_only = machine("tree.jls") +predict(mach_predict_only, X) + +# using a buffer: +io = IOBuffer() +MLJ.save(io, mach) +seekstart(io) +predict_only_mach = machine(io) +predict(predict_only_mach, X) +``` !!! warning "Only load files from trusted sources" Maliciously constructed JLS files, like pickles, and most other @@ -1066,8 +1079,7 @@ the example below. 
See also [`serializable`](@ref), [`machine`](@ref). """ -function save(file::Union{String,IO}, - mach::Machine) +function save(file::Union{String,IO}, mach::Machine) isdefined(mach, :fitresult) || error("Cannot save an untrained machine. ") diff --git a/src/operations.jl b/src/operations.jl index 9fab39992..d42689f2b 100644 --- a/src/operations.jl +++ b/src/operations.jl @@ -74,12 +74,12 @@ for operation in OPERATIONS operation == :inverse_transform && continue ex = quote - function $(operation)(mach::Machine{<:Model,false}; rows=:) + function $(operation)(mach::Machine{<:Model,<:Any,false}; rows=:) # catch deserialized machine with no data: isempty(mach.args) && throw(err_serialized($operation)) return ($operation)(mach, mach.args[1](rows=rows)) end - function $(operation)(mach::Machine{<:Model,true}; rows=:) + function $(operation)(mach::Machine{<:Model,<:Any,true}; rows=:) # catch deserialized machine with no data: isempty(mach.args) && throw(err_serialized($operation)) model = last_model(mach) @@ -92,8 +92,10 @@ for operation in OPERATIONS end # special case of Static models (no training arguments): - $operation(mach::Machine{<:Static,true}; rows=:) = throw(ERR_ROWS_NOT_ALLOWED) - $operation(mach::Machine{<:Static,false}; rows=:) = throw(ERR_ROWS_NOT_ALLOWED) + $operation(mach::Machine{<:Static,<:Any,true}; rows=:) = + throw(ERR_ROWS_NOT_ALLOWED) + $operation(mach::Machine{<:Static,<:Any,false}; rows=:) = + throw(ERR_ROWS_NOT_ALLOWED) end eval(ex) diff --git a/src/resampling.jl b/src/resampling.jl index a4afc2fa4..250e3ca0c 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -15,8 +15,7 @@ const PREDICT_OPERATIONS_STRING = begin end const PROG_METER_DT = 0.1 const ERR_WEIGHTS_LENGTH = - DimensionMismatch("`weights` and target "* - "have different lengths. ") + DimensionMismatch("`weights` and target have different lengths. ") const ERR_WEIGHTS_DICT = ArgumentError("`class_weights` must be a "* "dictionary with `Real` values. ") @@ -110,18 +109,62 @@ function shuffle_and_rng(shuffle, rng) return shuffle, rng end +# ---------------------------------------------------------------- +# InSample + +""" + in_sample = InSample() + +Instantiate an `InSample` resampling strategy, for use in `evaluate!`, `evaluate` and in +tuning. In this strategy the train and test sets are the same, and consist of all +observations specified by the `rows` keyword argument. If `rows` is not specified, all +supplied rows are used. + +# Example + +```julia +using MLJBase, MLJModels + +X, y = make_blobs() # a table and a vector +model = ConstantClassifier() +train, test = partition(eachindex(y), 0.7) # train:test = 70:30 +``` + +Compute in-sample (training) loss: + +```julia +evaluate(model, X, y, resampling=InSample(), rows=train, measure=brier_loss) +``` + +Compute the out-of-sample loss: + +```julia +evaluate(model, X, y, resampling=[(train, test),], measure=brier_loss) +``` + +Or equivalently: + +```julia +evaluate(model, X, y, resampling=Holdout(fraction_train=0.7), measure=brier_loss) +``` + +""" +struct InSample <: ResamplingStrategy end + +train_test_pairs(::InSample, rows) = [(rows, rows),] + # ---------------------------------------------------------------- # Holdout """ - holdout = Holdout(; fraction_train=0.7, - shuffle=nothing, - rng=nothing) + holdout = Holdout(; fraction_train=0.7, shuffle=nothing, rng=nothing) -Holdout resampling strategy, for use in `evaluate!`, `evaluate` and in +Instantiate a `Holdout` resampling strategy, for use in `evaluate!`, `evaluate` and in tuning. 
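To contrast the new `InSample` strategy with `Holdout` at the level of the generated train/test pairs, a small sketch (the comments indicate expected values; exact container types may differ):

```julia
using MLJBase

rows = 1:5

MLJBase.train_test_pairs(InSample(), rows)
# [(1:5, 1:5)]  -- train and test coincide

MLJBase.train_test_pairs(Holdout(fraction_train=0.6), rows)
# [([1, 2, 3], [4, 5])]  -- a disjoint split, as before
```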
- train_test_pairs(holdout, rows) +```julia +train_test_pairs(holdout, rows) +``` Returns the pair `[(train, test)]`, where `train` and `test` are vectors such that `rows=vcat(train, test)` and @@ -156,7 +199,7 @@ Holdout(; fraction_train::Float64=0.7, shuffle=nothing, rng=nothing) = function train_test_pairs(holdout::Holdout, rows) train, test = partition(rows, holdout.fraction_train, - shuffle=holdout.shuffle, rng=holdout.rng) + shuffle=holdout.shuffle, rng=holdout.rng) return [(train, test),] end @@ -170,7 +213,9 @@ end Cross-validation resampling strategy, for use in `evaluate!`, `evaluate` and tuning. - train_test_pairs(cv, rows) +```julia +train_test_pairs(cv, rows) +``` Returns an `nfolds`-length iterator of `(train, test)` pairs of vectors (row indices), where each `train` and `test` is a sub-vector @@ -253,7 +298,9 @@ Cross-validation resampling strategy, for use in `evaluate!`, `evaluate` and tuning, when observations are chronological and not expected to be independent. - train_test_pairs(tscv, rows) +```julia +train_test_pairs(tscv, rows) +``` Returns an `nfolds`-length iterator of `(train, test)` pairs of vectors (row indices), where each `train` and `test` is a sub-vector @@ -348,8 +395,9 @@ Stratified cross-validation resampling strategy, for use in `evaluate!`, `evaluate` and in tuning. Applies only to classification problems (`OrderedFactor` or `Multiclass` targets). - train_test_pairs(stratified_cv, rows, y) - +```julia +train_test_pairs(stratified_cv, rows, y) +``` Returns an `nfolds`-length iterator of `(train, test)` pairs of vectors (row indices) where each `train` and `test` is a sub-vector of `rows`. The `test` vectors are mutually exclusive and exhaust @@ -465,12 +513,18 @@ end # ================================================================ ## EVALUATION RESULT TYPE +abstract type AbstractPerformanceEvaluation <: MLJType end + """ - PerformanceEvaluation + PerformanceEvaluation <: AbstractPerformanceEvaluation Type of object returned by [`evaluate`](@ref) (for models plus data) or [`evaluate!`](@ref) (for machines). Such objects encode estimates of the performance -(generalization error) of a supervised model or outlier detection model. +(generalization error) of a supervised model or outlier detection model, and store other +information ancillary to the computation. + +If [`evaluate`](@ref) or [`evaluate!`](@ref) is called with the `compact=true` option, +then a [`CompactPerformanceEvaluation`](@ref) object is returned instead. When `evaluate`/`evaluate!` is called, a number of train/test pairs ("folds") of row indices are generated, according to the options provided, which are discussed in the @@ -479,7 +533,7 @@ pairs are recorded in the `train_test_rows` field of the `PerformanceEvaluation` and the corresponding estimates, aggregated over all train/test pairs, are recorded in `measurement`, a vector with one entry for each measure (metric) recorded in `measure`. -When displayed, a `PerformanceEvalution` object includes a value under the heading +When displayed, a `PerformanceEvaluation` object includes a value under the heading `1.96*SE`, derived from the standard error of the `per_fold` entries. This value is suitable for constructing a formal 95% confidence interval for the given `measurement`. Such intervals should be interpreted with caution. See, for example, Bates @@ -526,10 +580,13 @@ These fields are part of the public API of the `PerformanceEvaluation` struct. 
and `test` are vectors of row (observation) indices for training and evaluation respectively. -- `resampling`: the resampling strategy used to generate the train/test pairs. +- `resampling`: the user-specified resampling strategy to generate the train/test pairs + (or literal train/test pairs if that was directly specified). - `repeats`: the number of times the resampling strategy was repeated. +See also [`CompactPerformanceEvaluation`](@ref). + """ struct PerformanceEvaluation{M, Measure, @@ -539,7 +596,7 @@ struct PerformanceEvaluation{M, PerObservation, FittedParamsPerFold, ReportPerFold, - R} <: MLJType + R} <: AbstractPerformanceEvaluation model::M measure::Measure measurement::Measurement @@ -553,6 +610,47 @@ struct PerformanceEvaluation{M, repeats::Int end +""" + CompactPerformanceEvaluation <: AbstractPerformanceEvaluation + +Type of object returned by [`evaluate`](@ref) (for models plus data) or +[`evaluate!`](@ref) (for machines) when called with the option `compact = true`. Such +objects have the same structure as the [`PerformanceEvaluation`](@ref) objects returned by +default, except that the following fields are omitted to save memory: +`fitted_params_per_fold`, `report_per_fold`, `train_test_rows`. + +For more on the remaining fields, see [`PerformanceEvaluation`](@ref). + +""" +struct CompactPerformanceEvaluation{M, + Measure, + Measurement, + Operation, + PerFold, + PerObservation, + R} <: AbstractPerformanceEvaluation + model::M + measure::Measure + measurement::Measurement + operation::Operation + per_fold::PerFold + per_observation::PerObservation + resampling::R + repeats::Int +end + +compactify(e::CompactPerformanceEvaluation) = e +compactify(e::PerformanceEvaluation) = CompactPerformanceEvaluation( + e.model, + e.measure, + e.measurement, + e.operation, + e.per_fold, + e. per_observation, + e.resampling, + e.repeats, +) + # pretty printing: round3(x) = x round3(x::AbstractFloat) = round(x, sigdigits=3) @@ -562,7 +660,7 @@ const SE_FACTOR = 1.96 # For a 95% confidence interval. _standard_error(v::AbstractVector{<:Real}) = SE_FACTOR*std(v) / sqrt(length(v) - 1) _standard_error(v) = "N/A" -function _standard_errors(e::PerformanceEvaluation) +function _standard_errors(e::AbstractPerformanceEvaluation) measure = e.measure length(e.per_fold[1]) == 1 && return [nothing] std_errors = map(_standard_error, e.per_fold) @@ -573,42 +671,81 @@ end _repr_(f::Function) = repr(f) _repr_(x) = repr("text/plain", x) -function Base.show(io::IO, ::MIME"text/plain", e::PerformanceEvaluation) +# helper for row labels: _label(1) ="A", _label(2) = "B", _label(27) = "BA", etc +const alphabet = Char.(65:90) +_label(i) = map(digits(i - 1, base=26)) do d alphabet[d + 1] end |> join |> reverse + +function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) _measure = [_repr_(m) for m in e.measure] _measurement = round3.(e.measurement) _per_fold = [round3.(v) for v in e.per_fold] _sterr = round3.(_standard_errors(e)) + row_labels = _label.(eachindex(e.measure)) + + # Define header and data for main table - # Only show the standard error if the number of folds is higher than 1. - show_sterr = any(!isnothing, _sterr) - data = show_sterr ? - hcat(_measure, e.operation, _measurement, _sterr, _per_fold) : - hcat(_measure, e.operation, _measurement, _per_fold) - header = show_sterr ? 
- ["measure", "operation", "measurement", "1.96*SE", "per_fold"] : - ["measure", "operation", "measurement", "per_fold"] + data = hcat(_measure, e.operation, _measurement) + header = ["measure", "operation", "measurement"] + if length(row_labels) > 1 + data = hcat(row_labels, data) + header =["", header...] + end - println(io, "PerformanceEvaluation object "* + if e isa PerformanceEvaluation + println(io, "PerformanceEvaluation object "* + "with these fields:") + println(io, " model, measure, operation,\n"* + " measurement, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats") + else + println(io, "CompactPerformanceEvaluation object "* "with these fields:") - println(io, " model, measure, operation, measurement, per_fold,\n"* - " per_observation, fitted_params_per_fold,\n"* - " report_per_fold, train_test_rows, resampling, repeats") + println(io, " model, measure, operation,\n"* + " measurement, per_fold, per_observation,\n"* + " train_test_rows, resampling, repeats") + end + println(io, "Extract:") show_color = MLJBase.SHOW_COLOR[] color_off() - PrettyTables.pretty_table(io, - data; - header, - header_crayon=PrettyTables.Crayon(bold=false), - alignment=:l, - linebreaks=true) + PrettyTables.pretty_table( + io, + data; + header, + header_crayon=PrettyTables.Crayon(bold=false), + alignment=:l, + linebreaks=true, + ) + + # Show the per-fold table if needed: + + if length(first(e.per_fold)) > 1 + show_sterr = any(!isnothing, _sterr) + data2 = hcat(_per_fold, _sterr) + header2 = ["per_fold", "1.96*SE"] + if length(row_labels) > 1 + data2 = hcat(row_labels, data2) + header2 =["", header2...] + end + PrettyTables.pretty_table( + io, + data2; + header=header2, + header_crayon=PrettyTables.Crayon(bold=false), + alignment=:l, + linebreaks=true, + ) + end show_color ? color_on() : color_off() end -function Base.show(io::IO, e::PerformanceEvaluation) - summary = Tuple(round3.(e.measurement)) - print(io, "PerformanceEvaluation$summary") -end +_summary(e) = Tuple(round3.(e.measurement)) +Base.show(io::IO, e::PerformanceEvaluation) = + print(io, "PerformanceEvaluation$(_summary(e))") +Base.show(io::IO, e::CompactPerformanceEvaluation) = + print(io, "CompactPerformanceEvaluation$(_summary(e))") + # =============================================================== ## EVALUATION METHODS @@ -877,8 +1014,10 @@ Available resampling strategies are $RESAMPLING_STRATEGIES_LIST. If `resampling` instance of one of these, then a vector of tuples of the form `(train_rows, test_rows)` is expected. For example, setting - resampling = [((1:100), (101:200)), - ((101:200), (1:100))] +```julia +resampling = [((1:100), (101:200)), + ((101:200), (1:100))] +``` gives two-fold cross-validation using the first 200 rows of data. @@ -931,7 +1070,11 @@ Although `evaluate!` is mutating, `mach.model` and `mach.args` are not mutated. - `logger` - a logger object (see [`MLJBase.log_evaluation`](@ref)) -See also [`evaluate`](@ref), [`PerformanceEvaluation`](@ref) +- `compact=false` - if `true`, the returned evaluation object excludes these fields: + `fitted_params_per_fold`, `report_per_fold`, `train_test_rows`. + +See also [`evaluate`](@ref), [`PerformanceEvaluation`](@ref), +[`CompactPerformanceEvaluation`](@ref). 
""" function evaluate!( @@ -951,6 +1094,7 @@ function evaluate!( per_observation=true, verbosity=1, logger=nothing, + compact=false, ) # this method just checks validity of options, preprocess the @@ -1017,15 +1161,15 @@ function evaluate!( per_observation, logger, resampling, + compact, ) end """ evaluate(model, data...; cache=true, options...) -Equivalent to `evaluate!(machine(model, data..., cache=cache); -options...)`. See the machine version `evaluate!` for the complete -list of options. +Equivalent to `evaluate!(machine(model, data..., cache=cache); options...)`. +See the machine version `evaluate!` for the complete list of options. Returns a [`PerformanceEvaluation`](@ref) object. @@ -1106,7 +1250,7 @@ end @static if VERSION >= v"1.3.0-DEV.573" # determines if an instantiated machine caches data: -_caches_data(::Machine{M, C}) where {M, C} = C +_caches_data(::Machine{<:Any,<:Any,C}) where C = C function _evaluate!(func, mach, accel::CPUThreads, nfolds, verbosity) @@ -1198,6 +1342,7 @@ function evaluate!( per_observation_flag, logger, user_resampling, + compact, ) # Note: `user_resampling` keyword argument is the user-defined resampling strategy, @@ -1352,7 +1497,8 @@ function evaluate!( ) log_evaluation(logger, evaluation) - evaluation + compact && return compactify(evaluation) + return evaluation end # ---------------------------------------------------------------- @@ -1399,6 +1545,7 @@ end check_measure=true, per_observation=true, logger=nothing, + compact=false, ) Resampling model wrapper, used internally by the `fit` method of `TunedModel` instances @@ -1442,6 +1589,7 @@ mutable struct Resampler{S, L} <: Model cache::Bool per_observation::Bool logger::L + compact::Bool end # Some traits are markded as `missing` because we cannot determine @@ -1485,6 +1633,7 @@ function Resampler( cache=true, per_observation=true, logger=nothing, + compact=false, ) resampler = Resampler( model, @@ -1499,6 +1648,7 @@ function Resampler( cache, per_observation, logger, + compact, ) message = MLJModelInterface.clean!(resampler) isempty(message) || @warn message @@ -1532,6 +1682,10 @@ function MLJModelInterface.fit(resampler::Resampler, verbosity::Int, args...) _acceleration = _process_accel_settings(resampler.acceleration) + # the value of `compact` below is always `false`, because we need + # `e.train_test_rows` in `update`. (If `resampler.compact=true`, then + # `evaluate(resampler, ...)` returns the compactified version of the current + # `PerformanceEvaluation` object.) e = evaluate!( mach, resampler.resampling, @@ -1547,6 +1701,7 @@ function MLJModelInterface.fit(resampler::Resampler, verbosity::Int, args...) resampler.per_observation, resampler.logger, resampler.resampling, + false, # compact ) fitresult = (machine = mach, evaluation = e) @@ -1620,6 +1775,7 @@ function MLJModelInterface.update( resampler.per_observation, resampler.logger, resampler.resampling, + false # we use `compact=false`; see comment in `fit` above ) report = (evaluation = e, ) fitresult = (machine=mach2, evaluation=e) @@ -1643,7 +1799,8 @@ StatisticalTraits.load_path(::Type{<:Resampler}) = "MLJBase.Resampler" fitted_params(::Resampler, fitresult) = fitresult -evaluate(resampler::Resampler, fitresult) = fitresult.evaluation +evaluate(resampler::Resampler, fitresult) = resampler.compact ? 
+ compactify(fitresult.evaluation) : fitresult.evaluation function evaluate(machine::Machine{<:Resampler}) if isdefined(machine, :fitresult) diff --git a/src/show.jl b/src/show.jl index a1ebb9456..9a9616af2 100644 --- a/src/show.jl +++ b/src/show.jl @@ -27,7 +27,9 @@ Private method (used in testing). Equivalent to `const x = value` but registers the binding thus: - MLJBase.HANDLE_GIVEN_ID[objectid(value)] = :x +```julia +MLJBase.HANDLE_GIVEN_ID[objectid(value)] = :x +``` Registered objects get displayed using the variable name to which it was bound in calls to `show(x)`, etc. @@ -320,19 +322,21 @@ _show(stream::IO, ::Nothing) = println(stream, "nothing") """ _recursive_show(stream, object, current_depth, depth) +**Private method.** + Generate a table of the properties of the `MLJType` object, dislaying each property value by calling the method `_show` on it. The behaviour of `_show(stream, f)` is as follows: 1. If `f` is itself a `MLJType` object, then its short form is shown -and `_recursive_show` generates as separate table for each of its -properties (and so on, up to a depth of argument `depth`). + and `_recursive_show` generates as separate table for each of its + properties (and so on, up to a depth of argument `depth`). 2. Otherwise `f` is displayed as "(omitted T)" where `T = typeof(f)`, -unless `istoobig(f)` is false (the `istoobig` fall-back for arbitrary -types being `true`). In the latter case, the long (ie, -MIME"plain/text") form of `f` is shown. To override this behaviour, -overload the `_show` method for the type in question. + unless `istoobig(f)` is false (the `istoobig` fall-back for arbitrary + types being `true`). In the latter case, the long (ie, + MIME"plain/text") form of `f` is shown. To override this behaviour, + overload the `_show` method for the type in question. """ function _recursive_show(stream::IO, object::MLJType, current_depth, depth) diff --git a/src/sources.jl b/src/sources.jl index d2fd75242..083e269e1 100644 --- a/src/sources.jl +++ b/src/sources.jl @@ -41,9 +41,11 @@ expected. The calling behaviour of a `Source` object is this: - Xs() = X - Xs(rows=r) = selectrows(X, r) # eg, X[r,:] for a DataFrame - Xs(Xnew) = Xnew +```julia +Xs() = X +Xs(rows=r) = selectrows(X, r) # eg, X[r,:] for a DataFrame +Xs(Xnew) = Xnew +``` See also: [`MLJBase.prefit`](@ref), [`sources`](@ref), [`origins`](@ref), [`node`](@ref). diff --git a/src/utilities.jl b/src/utilities.jl index 969fce4ce..3dcf31a68 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -14,7 +14,7 @@ View a nested named tuple `t` as a tree and return, as a tuple, the values at the leaves, in the order they appear in the original tuple. ```julia-repl -julia> t = (X = (x = 1, y = 2), Y = 3) +julia> t = (X = (x = 1, y = 2), Y = 3); julia> flat_values(t) (1, 2, 3) ``` @@ -51,6 +51,7 @@ end For prepending symbols in expressions like `:(y.w)` and `:(x1.x2.x3)`. +```julia-repl julia> prepend(:x, :y) :(x.y) @@ -59,6 +60,7 @@ julia> prepend(:x, :(y.z)) julia> prepend(:w, ans) :(w.x.y.z) +``` If the second argument is `nothing`, then `nothing` is returned. 
@@ -74,10 +76,11 @@ prepend(s::Symbol, ex::Expr) = Expr(:(.), prepend(s, ex.args[1]), ex.args[2]) Call getproperty recursively on `object` to extract the value of some nested property, as in the following example: - julia> object = (X = (x = 1, y = 2), Y = 3) - julia> recursive_getproperty(object, :(X.y)) - 2 - +```julia-repl +julia> object = (X = (x = 1, y = 2), Y = 3); +julia> recursive_getproperty(object, :(X.y)) +2 +``` """ recursive_getproperty(obj, property::Symbol) = getproperty(obj, property) function recursive_getproperty(obj, ex::Expr) @@ -105,7 +108,7 @@ end Set a nested property of an `object` to `value`, as in the following example: -``` +```julia-repl julia> mutable struct Foo X Y @@ -150,7 +153,7 @@ have the same number of rows. end """ -_permute_rows(obj, perm) + _permute_rows(obj, perm) Internal function to return a vector or matrix with permuted rows given the permutation `perm`. @@ -182,7 +185,7 @@ function shuffle_rows( end """ -init_rng(rng) + init_rng(rng) Create an `AbstractRNG` from `rng`. If `rng` is a non-negative `Integer`, it returns a `MersenneTwister` random number generator seeded with `rng`; If `rng` is @@ -249,8 +252,10 @@ end Return a "sequence" string from the first `n` elements generated by `itr`. - julia> MLJBase.sequence_string(1:10, 4) - "1, 2, 3, 4, ..." +```julia-repl +julia> MLJBase.sequence_string(1:10, 4) +"1, 2, 3, 4, ..." +``` **Private method.** @@ -293,7 +298,7 @@ column cycle fastest, those in the last clolumn slowest. ### Example -```julia +```julia-repl julia> iterators = ([1, 2], ["a","b"], ["x", "y", "z"]); julia> MLJTuning.unwind(iterators...) 12×3 Array{Any,2}: @@ -340,15 +345,15 @@ end Split an `AbstractRange` into `n` subranges of approximately equal length. ### Example -```julia +```julia-repl julia> collect(chunks(1:5, 2)) 2-element Array{UnitRange{Int64},1}: 1:3 4:5 +``` **Private method** -``` """ function chunks(c::AbstractRange, n::Integer) n < 1 && throw(ArgumentError("cannot split range into $n subranges")) @@ -410,8 +415,8 @@ If `only` is specified, then the operation is restricted to those `M` for which `M isa only`. In all other cases the symbolic name is generated using `substitute` as the base symbol. -``` -existing_names = [] +```julia-repl +julia> existing_names = []; julia> generate_name!(Vector{Int}, existing_names) :vector @@ -470,14 +475,14 @@ generate_name!(model, existing_names; kwargs...) = *Private method.* -Tries to infer the per-observation scitype from the scitype of `S`, when `S` is known to -be the scitype of some container with multiple observations; here we view the scitype for -one row of a table to be the scitype of the row converted to a vector. Return `Unknown` if -unable to draw reliable inferrence. +Tries to infer the per-observation scitype from the scitype of `S`, when `S` is +known to be the scitype of some container with multiple observations; here we +view the scitype for one row of a table to be the scitype of the row converted +to a vector. Return `Unknown` if unable to draw reliable inferrence. -The observation scitype for a table is here understood as the scitype of a row converted -to a vector. +The observation scitype for a table is here understood as the scitype of a row +converted to a vector. """ observation(::Type) = Unknown @@ -501,13 +506,13 @@ end *Private method.* -If `y` is an `AbstractArray`, return the scitype of `y[:, :, ..., :, 1]`. 
If `y` is a -table, return the scitype of the first row, converted to a vector, unless this row has -`missing` elements, in which case return `Unknown`. +If `y` is an `AbstractArray`, return the scitype of `y[:, :, ..., :, 1]`. If `y` +is a table, return the scitype of the first row, converted to a vector, unless +this row has `missing` elements, in which case return `Unknown`. In all other cases, `Unknown`. -``` +```julia-repl julia> guess_observation_scitype([missing, 1, 2, 3]) Union{Missing, Count} @@ -536,12 +541,12 @@ end *Private method* -Try to infer a lowest upper bound on the scitype of target observations acceptable to -`model`, by inspecting `target_scitype(model)`. Return `Unknown` if unable to draw reliable -inferrence. +Try to infer a lowest upper bound on the scitype of target observations +acceptable to `model`, by inspecting `target_scitype(model)`. Return `Unknown` +if unable to draw reliable inferrence. -The observation scitype for a table is here understood as the scitype of a row converted -to a vector. +The observation scitype for a table is here understood as the scitype of a row +converted to a vector. """ guess_model_target_observation_scitype(model) = observation(target_scitype(model)) diff --git a/test/hyperparam/one_dimensional_range_methods.jl b/test/hyperparam/one_dimensional_range_methods.jl index 386cadf4f..2ffd8101c 100644 --- a/test/hyperparam/one_dimensional_range_methods.jl +++ b/test/hyperparam/one_dimensional_range_methods.jl @@ -233,31 +233,31 @@ end @testset "NominalSampler" begin r = range(Char, :(model.dummy), values=collect("cab")) + N = 10000 - @testset "probability vector specified" begin - s = MLJBase.sampler(r, [0.1, 0.2, 0.7]) - rng = StableRNG(600) - dict = Dist.countmap(rand(rng,s, 1000)) - c, a, b = map(x -> dict[x], collect("cab")) - @test a == 201 && b == 714 && c == 85 + # to compute half-width of 95% confidence intervals, for counts of a Bernoulli process + # with probability `p`, sampled `N` times: + halfwidth(p, N) = 1.96*sqrt(p*(1 - p))*sqrt(N) - rng = StableRNG(89); - dict = Dist.countmap(rand(rng, s, 1000)) - c, a, b = map(x -> dict[x], collect("cab")) - @test a == 173 && b == 733 && c == 94 + @testset "probability vector specified" begin + p = Dict('c'=>0.1, 'a'=>0.2, 'b'=>0.7) + rng = StableRNG(660) + s = MLJBase.sampler(r, [p[class] for class in "cab"]) + counts = Dist.countmap(rand(rng,s, N)) + for class in "abc" + μ = p[class]*N + @test abs(counts[class] - μ) < halfwidth(p[class], N) + end end @testset "probability vector unspecified (uniform)" begin s = MLJBase.sampler(r) - rng = StableRNG(55) - dict = Dist.countmap(rand(rng,s, 1000)) - c, a, b = map(x -> dict[x], collect("cab")) - @test a == 361 && b == 335 && c == 304 - - rng = StableRNG(550) - dict = Dist.countmap(rand(rng, s, 1000)) - c, a, b = map(x -> dict[x], collect("cab")) - @test a == 332 && b == 356 && c == 312 + rng = StableRNG(660) + counts = Dist.countmap(rand(rng,s, N)) + for class in "abc" + μ = N/3 + @test abs(counts[class] - μ) < halfwidth(1/3, N) + end end end diff --git a/test/machines.jl b/test/machines.jl index c78aa06dd..95080b375 100644 --- a/test/machines.jl +++ b/test/machines.jl @@ -68,7 +68,12 @@ end predict(t, selectrows(X,test)); @test rms(predict(t, selectrows(X, test)), y[test]) < std(y) + # cache type parameter + mach = machine(ConstantRegressor(), X, y, cache=false) + @test !MLJBase.caches_data(mach) mach = machine(ConstantRegressor(), X, y) + @test MLJBase.caches_data(mach) + @test_logs (:info, r"Training") fit!(mach) yhat = 
predict_mean(mach, X); @@ -272,7 +277,6 @@ end X = ones(2, 3) mach = @test_logs machine(Scale(2)) - @test mach isa Machine{Scale, false} transform(mach, X) # triggers training of `mach`, ie is mutating @test report(mach) in [nothing, NamedTuple()] @test isnothing(fitted_params(mach)) diff --git a/test/resampling.jl b/test/resampling.jl index d27af319c..19b351656 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -364,6 +364,22 @@ end end end +@testset "insample" begin + rows = rand(Int, 100) + @test MLJBase.train_test_pairs(InSample(), rows) == [(rows, rows),] + + X, y = make_regression(20) + model = Models.DeterministicConstantRegressor() + + # all rows: + e = evaluate(model, X, y, resampling=InSample(), measure=rms) + @test e.measurement[1] ≈ std(y, corrected=false) + + # subsample of rows: + e = evaluate(model, X, y, resampling=InSample(), measure=rms, rows=1:7) + @test e.measurement[1] ≈ std(y[1:7], corrected=false) +end + @testset_accelerated "holdout" accel begin x1 = ones(4) x2 = ones(4) @@ -891,4 +907,29 @@ end fit!(mach) end +@testset "compact evaluation objects" begin + model = ConstantClassifier() + X, y = make_blobs(10) + e = evaluate(model, X, y) + ec = evaluate(model, X, y, compact=true) + @test MLJBase.compactify(ec) == ec == MLJBase.compactify(e) + @test e isa PerformanceEvaluation + @test ec isa CompactPerformanceEvaluation + @test startswith(sprint(show, MIME("text/plain"), e), "PerformanceEvaluation") + @test startswith(sprint(show, MIME("text/plain"), ec), "CompactPerformanceEvaluation") + @test e.measurement[1] == ec.measurement[1] + + # smoke tests: + mach = machine(model, X, y) + for e in [ + evaluate!(mach, measures=[brier_loss, accuracy]), + evaluate!(mach, measures=[brier_loss, accuracy], compact=true), + evaluate!(mach, resampling=Holdout(), measures=[brier_loss, accuracy]), + evaluate!(mach, resampling=Holdout(), measures=[brier_loss, accuracy], compact=true), + ] + @test contains(sprint(show, MIME("text/plain"), e), "predict") + @test contains(sprint(show, e), "PerformanceEvaluation(") + end +end + true