diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 375c7cf7..17d29ade 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -2,36 +2,47 @@
     "name": "noj",
     "build": {
         "dockerfile": "Dockerfile"
-
-    },
-
-
-    "features": {
-        "ghcr.io/devcontainers-contrib/features/clojure-asdf:2": {},
+
+    },
+
+
+    "features": {
+        "ghcr.io/devcontainers-contrib/features/clojure-asdf:2": {},
         "ghcr.io/rocker-org/devcontainer-features/quarto-cli:1": {},
         "ghcr.io/devcontainers-contrib/features/bash-command:1":
-            {"command": "apt-get update && apt-get install -y rlwrap && apt-get install -y libxtst-dev"},
+            {"command": "apt-get update && apt-get install -y rlwrap && apt-get install -y libxtst-dev"},
         "ghcr.io/devcontainers-contrib/features/poetry:2" : {},
         "ghcr.io/rocker-org/devcontainer-features/r-apt:0": {}
-
-
-},
+
+
+    },
     "customizations": {
         "vscode": {
             "settings": {
-            },
+                "markdown-everywhere.customized-rules": [
+                    "defaultRules",
+                    {
+                        "name": "double semicolon",
+                        "whileRegExp": ";;",
+                        "example": ";; *hello",
+                        "languages": [
+                            { "name": "clojure", "source": "source.clj" }
+                        ]
+                    }
+                ]},
             "extensions": [
                 "betterthantomorrow.calva",
-                "streetsidesoftware.code-spell-checker"
+                "streetsidesoftware.code-spell-checker",
+                "zhaouv.vscode-markdown-everywhere"
             ]
         }
     },
-
+
    "remoteUser": "vscode",
    "updateContentCommand": {"install python packages": "mkdir -p .venv && poetry install --sync" ,
-        "add link to python executable": "sudo ln -s /usr/bin/python3 /usr/local/bin/python"
-    },
-    "onCreateCommand": "clojure -Sthreads 1 -A:model-integration-tests:test:dev -P"
-
+        "add link to python executable": "sudo ln -s /usr/bin/python3 /usr/local/bin/python"
+},
+"onCreateCommand": "clojure -Sthreads 1 -A:model-integration-tests:test:dev -P"
+
 }
diff --git a/notebooks/chapters.edn b/notebooks/chapters.edn
index be10b5f0..4b372220 100644
--- a/notebooks/chapters.edn
+++ b/notebooks/chapters.edn
@@ -10,8 +10,8 @@
              :chapters ["statistics_intro"
                         "linear_regression_intro"]}
             {:part "Machine Learning"
-             :chapters ["metamorph"
-                        "ml_basic"
+             :chapters ["ml_basic"
+                        "metamorph"
                         "prepare_for_ml"
                         "automl"
                         "interactions_ols"]}
diff --git a/notebooks/noj_book/automl.clj b/notebooks/noj_book/automl.clj
index 389bd7ef..e9af9e04 100644
--- a/notebooks/noj_book/automl.clj
+++ b/notebooks/noj_book/automl.clj
@@ -1,6 +1,5 @@
 ;; # AutoML using metamorph pipelines
-
 ;; In this tutorial we see how to use `metamorph.ml` to perform automatic machine learning.
 ;; With AutoML we mean to try lots of different models and hyper parameters and rely on automatic
 ;; validation to pick the best performing model automatically.
@@ -21,7 +20,7 @@
    [tech.v3.dataset.modelling :as ds-mod]))

 ;; ## The metamorph pipeline abstraction
-;; When doing automl, it is very useful to be able to manage
+;; When using automl, it is very useful to be able to manage all
 ;; the steps of a machine learning pipeline (including data
 ;; transformations and modeling) as a unified function that can be
 ;; freely moved around.
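+;;
+;; As a first intuition (a minimal sketch in plain Clojure, not the actual
+;; metamorph implementation): such a unified pipeline is just a function from
+;; a context map to a context map, so it composes and can be passed around
+;; like any other value.
+(comment
+  (def unified-pipeline
+    (comp
+     (fn [ctx] (assoc ctx :step-2 :done))                    ; a step adding state
+     (fn [ctx] (update ctx :metamorph/data #(mapv inc %))))) ; a data-transforming step
+  (unified-pipeline {:metamorph/data [1 2 3]})
+  ;; => {:metamorph/data [2 3 4], :step-2 :done}
+  )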
@@ -47,6 +46,7 @@
 ;; functions of the pipeline can (but don't need to) do
 ;; different things depend on the `mode`
 ;;
+;; ### metamorph.ml/model
 ;; Specifically we have a function called `metamorph.ml/model` which
 ;; will do `train` in mode
 ;; `:fit` and `predict` in mode `:transform`
@@ -64,6 +64,7 @@
 ;;
 (def titanic ml-basic/numeric-titanic-data)

+;; ### Split the data
 ;; so lets create splits of the data first:
@@ -71,7 +72,7 @@
 (def train-ds (:train splits))
 (def test-ds (:test splits))

-
+;; ### Create pipeline
 ;; In its foundation a metamorph pipeline is a sequential composition of
 ;; functions,
@@ -96,6 +97,8 @@ my-pipeline
 ;;
 ;; But this map cannot be "arbitrary", it need to adhere to the `metamorph` conventions.
 ;;
+;; ### Run pipeline = train model
+;;
 ;; The following `trains` a model, because the `ml/model`
 ;; function does this when called with `:mode` `:fit`.
 ;; And it is the only operation in the pipeline, so the pipeline does one
@@ -112,11 +115,12 @@ ctx-after-train
 ;;
 (vals ctx-after-train)

-;; The `model` function has closed over the id, so is knows "his id", so in the `transform`
-;; mode it can get the data created at `:fit`. So the `model` function can "send" data to itself
-;; from `:fit` to `:transform`, the `trained model`.
+;; The `model` function has closed over the id, so it knows "its id"; in
+;; `transform` mode it can get the data created at `:fit`. So the `model`
+;; function can "send" data to itself, the trained model,
+;; from `:fit` to `:transform`.
 ;;
-;; So this will do the `predict` on new data
+;; So this will do the `predict` on new data:
 (def ctx-after-predict
   (my-pipeline (assoc ctx-after-train
@@ -217,9 +221,9 @@ ctx-after-train
 (mm/pipeline ops-2)
 (mm/pipeline ops-3)

-;; All three can be called as function taking a dataset iwrapped in a ctx
-;; map.
+;; All three can be called as functions taking a dataset wrapped in a ctx map.

-;; Pipeline as data is as well supported
+;; Pipelines as data are supported as well:
 (def op-spec [[ml/model {:model-type :metamorph.ml/dummy-classifier}]])
 ;;
 (mm/->pipeline op-spec)
@@ -278,8 +282,8 @@ ctx-after-train
 ;; The AutoML support in metamorph.ml consists now in the possibility
 ;; to create an arbitrary number of different pipelines
 ;; and have them run against arbitrary test/train data splits
-;; and it automatically chooses the best model evaluated by by a
-;; certain metric.
+;; and it automatically chooses the best model evaluated by a
+;; user-provided metric function.

 ;; helper for later
 (defn make-results-ds [evaluation-results]
@@ -498,7 +502,8 @@ logistic-regression-specs
 ;; of the pipeline. It should be faster to do data transformations only once,
 ;; before the metamorph pipeline starts.
 ;;
-;; Nevertheless is some scenarios it is very useful to create a full transformation pipeline
-;; as a metamorph pipeline. This would for example allow to perform very different transformation steps per model
-;; and still only have a single seq of pipeline functions to manage,
+;; Nevertheless, in some scenarios it is very useful to create a full
+;; transformation pipeline as a metamorph pipeline.
+;; This would, for example, allow performing very different transformation steps per
+;; model and still only have a single seq of pipeline functions to manage,
 ;; therefore having fully self contained pipelines.
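+;;
+;; A sketch of such a self-contained pipeline (illustrative only; it assumes
+;; `tablecloth.api` is required as `tc`, and uses `drop-missing` as a stand-in
+;; for any per-model data transformation):
+(comment
+  (def self-contained-pipeline
+    (mm/pipeline
+     (mm/lift tc/drop-missing)
+     {:metamorph/id :model}
+     (ml/model {:model-type :metamorph.ml/dummy-classifier}))))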
diff --git a/notebooks/noj_book/interactions_ols.clj b/notebooks/noj_book/interactions_ols.clj
index 06ed4527..c8bddfa8 100644
--- a/notebooks/noj_book/interactions_ols.clj
+++ b/notebooks/noj_book/interactions_ols.clj
@@ -21,24 +21,23 @@
 (def md (comp kindly/hide-code kind/md))

-(md "This examples shows how to do interactions in linear regression with `metamorph.ml`.")
+;; This example shows how to do interactions in linear regression with `metamorph.ml`.

-(md "Taking ideas from: [Interaction Effect in Multiple Regression: Essentials](http://www.sthda.com/english/articles/40-regression-analysis/164-interaction-effect-in-multiple-regression-essentials/) by Alboukadel Kassambara")
+;; Taking ideas from: [Interaction Effect in Multiple Regression: Essentials](http://www.sthda.com/english/articles/40-regression-analysis/164-interaction-effect-in-multiple-regression-essentials/) by Alboukadel Kassambara

-(md "First we load the data:")
+;; First we load the data:
 (def marketing (tc/dataset "https://github.com/scicloj/datarium-CSV/raw/main/data/marketing.csv.gz"
                            {:key-fn keyword}))

-(md "and do some preprocessing to set up the regression:")
+;; and do some preprocessing to set up the regression:
+
 (def preprocessed-data
   (-> marketing
       (tc/drop-columns [:newspaper])
       (modelling/set-inference-target :sales)))

-
-(md "## Additive model")
-(md "First we build an additive model, which model equation is
-$$sales = b0 + b1 * youtube + b2 * facebook$$")
+;; ## Additive model
+;; First we build an additive model, whose model equation is
+;; $$sales = b0 + b1 * youtube + b2 * facebook$$
 (def linear-model-config {:model-type :fastmath/ols})
@@ -47,7 +46,8 @@
      {:metamorph/id :model}
      (ml/model linear-model-config)))

-(md "We evaluate it, ")
+;; We evaluate it,
+
 (def evaluations
   (ml/evaluate-pipelines
    [additive-pipeline]
@@ -59,29 +59,31 @@ $$sales = b0 + b1 * youtube + b2 * facebook$$
    {:other-metrices [{:name :r2
                       :metric-fn fmstats/r2-determination}]}))

-(md "and print the resulting model:
-(note that the `:sales` term means the intercept `b0`)")
-(md "(note that )")
+;; and print the resulting model:
+;; (note that the `:sales` term means the intercept `b0`)
+
 (-> evaluations flatten first :fit-ctx :model ml/tidy)

-(md "We have the following metrics:")
-(md "$RMSE$")
+;; We have the following metrics:
+
+;; $RMSE$:
+
 (-> evaluations flatten first :test-transform :metric)
 (kindly/check = 1.772159024927988)

-(md "$R^2$")
+;; $R^2$:
+
 (-> evaluations flatten first :test-transform :other-metrices first :metric)
 (kindly/check = 0.9094193687523886)

+;; ## Interaction effects

-(md "## Interaction effects")
-(md "Now we add interaction effects to it, resulting in this model equation:
-$$sales = b0 + b1 * youtube + b2 * facebook + b3 * (youtube * facebook)$$")
+;; Now we add interaction effects, resulting in this model equation:
+;; $$sales = b0 + b1 * youtube + b2 * facebook + b3 * (youtube * facebook)$$
+;; We add a new column with the interaction term:
 (def pipe-interaction
   (mm/pipeline
    (tcpipe/add-column :youtube*facebook (fn [ds] (tcc/* (ds :youtube) (ds :facebook))))
    {:metamorph/id :model}
    (ml/model linear-model-config)))

+;; Again we evaluate the model,
+
-(md "Again we evaluate the model,")
 (def evaluations
   (ml/evaluate-pipelines
    [pipe-interaction]
@@ -94,25 +96,26 @@ $$sales = b0 + b1 * youtube + b2 * facebook + b3 * (youtube * facebook)$$
                       :metric-fn fmstats/r2-determination}]}))

-(md "and print it and the performance metrics:")
+
+;; and print it and the performance metrics:
 (-> evaluations flatten first :fit-ctx :model ml/tidy)

-(md "As the multiplcation of `youtube*facebook` is as well statistically relevant, it
-suggests that there is indeed an interaction between these 2 predictor variables youtube and facebook.")
+;; As the multiplication of `youtube*facebook` is statistically significant as well, it
+;; suggests that there is indeed an interaction between these two predictor variables, youtube and facebook.
+
+;; $RMSE$:

-(md "$RMSE$")
 (-> evaluations flatten first :test-transform :metric)
 (kindly/check = 0.933077510748531)

+;; $R^2$:

-(md "$R^2$")
 (-> evaluations flatten first :test-transform :other-metrices first :metric)
 (kindly/check = 0.9747551116991899)

-(md "$RMSE$ and $R^2$ of the intercation model are sligtly better.
-These results suggest that the model with the interaction term is better than the model that contains only main effects.
-So, for this specific data, we should go for the model with the interaction model.
-")
+;; $RMSE$ and $R^2$ of the interaction model are slightly better.
+;; These results suggest that the model with the interaction term is better than the model that contains only main effects.
+;; So, for this specific data, we should go for the model with the interaction term.

 ;; ## use design matrix
diff --git a/notebooks/noj_book/metamorph.clj b/notebooks/noj_book/metamorph.clj
index f06b40e1..a96eabf6 100644
--- a/notebooks/noj_book/metamorph.clj
+++ b/notebooks/noj_book/metamorph.clj
@@ -6,11 +6,11 @@
 ;; # Machine learning pipelines

 ;; ## Clojure Core Pipelines
-;; Clojure has built-in support for data processing pipelines—a series of functions where the output
+;; Clojure has built-in support for data processing pipelines: a series of functions where the output
 ;; of one step is the input to the next. In core Clojure, these are supported by the so-called
 ;; **threading macro**.

-;; ### Example: Using the Threading Macro
+;; ## Using the Threading Macro
 (require '[clojure.string :as str])

 (-> "hello"
@@ -29,10 +29,10 @@
 ;; We can achieve the same result using **function composition** with `comp`. Note that when using
 ;; `comp`, the order of functions is reversed compared to the threading macro.

+
 (def upper-reverse-first
   (comp first str/reverse str/upper-case))

-(upper-reverse-first "hello")

 ;; This defines a function `upper-reverse-first` that:
@@ -167,9 +167,9 @@
 (metamorph-pipeline-3-b {:metamorph/data "hello"})

- ; ### Pass additional state
+;; ### Pass additional state

-;; We can pass a main data object and any state through the pipeline.
+;; We can pass a main data object and any state through the pipeline:
 (def metamorph-pipeline-4
   (mm/pipeline
@@ -225,6 +225,7 @@
 ;; This can take two values, `:fit` and `:transform`, representing the concept of running the pipeline to
 ;; learn something from the data (train or fit the pipeline/model)
 ;; and apply what was learned on new data (predict or transform).
+
 ;; The learned information can be stored in the context map, becoming available in later runs.

 ;; This passing of state only makes sense if the state is written to the map in one pass
@@ -261,8 +262,8 @@
 (metamorph-pipeline-6
  {:metamorph/data "hello"
   :metamorph/mode :fit}))

-;; This will print `:state "5"` in the terminal, showing that the state from the `:fit` phase is used during the
-;; `:transform` phase.
+;; This will print `:state "5"` in the terminal, showing that the state from the
+;; `:fit` phase is used during the `:transform` phase.
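+;;
+;; Note how the `:transform` call below merges `fitted-ctx` back in, so the
+;; state stored under the step's id during `:fit` is available again: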
 (metamorph-pipeline-6
  (merge fitted-ctx
         {:metamorph/data "world"
          :metamorph/mode :transform}))
@@ -272,7 +273,7 @@

 ;; #### Lifting to create pipeline functions
 ;; As we have seen , most pipeline functions will behave exactly the same
-;; in `:fit` and `:transform`, so they neither need state.
+;; in `:fit` and `:transform`, so they don't need to remember or pass state.
 ;;
 ;; Example:
@@ -287,6 +288,9 @@
    (mm/lift str/reverse)
    (mm/lift first)))

+;; Lifting creates a variant of the original function which acts on the
+;; data in the context under the key `:metamorph/data`.
+
 (metamorph-pipeline-7
  {:metamorph/data "hello"})

@@ -294,12 +298,13 @@
 ;; As we have seen so far, the data object at key `:metamorph/data`
 ;; can be anything, so far we have used a `String`.
 ;;
-;; In machine learning pipelines we use a `tech.v3.dataset` instead,
+;; In machine learning pipelines we usually use a `tech.v3.dataset` instead,
 ;; and the pipeline step functions transform mainly the dataset or train
-;; a model.
+;; a model from the dataset.
 ;;
-;; The **state** is often the result of a **model** function. It is calculated in `:fit`
-;; on training data and applied in `:transform` on other data to make a prediction.
+;; The **state** is often the result of a **model** function. The model is
+;; calculated in `:fit` on training data and applied in `:transform` on other
+;; data to make a prediction.

 ;; All the rest stays the same.
diff --git a/notebooks/noj_book/ml_basic.clj b/notebooks/noj_book/ml_basic.clj
index df54358f..8903af5e 100644
--- a/notebooks/noj_book/ml_basic.clj
+++ b/notebooks/noj_book/ml_basic.clj
@@ -28,8 +28,8 @@
       (into [:table [:tr [:th "Library" ] [:th "Clojure Wrapper"]]]))

-;; These libraries do not have any functions for the models they contain.
-;; Instead of funtcions per model, `metamorph.ml` has the concept of each model having a
+;; These libraries do not have specific Clojure functions for the models they contain.
+;; Instead of functions per model, `metamorph.ml` has the concept of each model having a
 ;; unique `key`, the `:model-type` , which needs to be given when calling
 ;;`metamorph.ml/train`.
 ;;
@@ -47,10 +47,10 @@
 ;; namely `:scicloj.ml.tribuo/classification` and `:scicloj.ml.tribuo/regression`.
-;; The model as such is encoded in the same way as the Triuo Java libraries does this,
-;; namely as a map of all Tribuo components in place, of which one is the model,
-;; the so called "Trainer", is always needed and has a certin `:type`, the model class.
+;; The model as such is encoded in the same way as the Tribuo Java library does,
+;; namely as a map of all Tribuo components, of which one, the so-called
+;; "Trainer", is always needed and has a certain `:type`, the model class.
 ;;
 ;; The reference documentation therefore lists all "Trainer"s and their name incl. parameters.
-;; It lists as well all other "Configurable"s which could be refered to in a component map.
+;; It also lists all other "Configurable"s which can be referred to in a component map.

 ;; ## Setup
@@ -200,7 +200,7 @@ cat-maps
   [1.0 1.0 0.0 1.0]
   [0.0 3.0 0.0 0.0]])

-;; Split data into train and test set
+;; ## Split data into train and test set
 ;;
 ;; Now we split the data into train and test. We use
 ;; a `:holdout` strategy, so will get a single split in training and test data.
@@ -234,6 +234,8 @@ split
 ;; We can calculate accuracy by using a metric after having converted
 ;; the numerical data back to original (important!).
 ;; We should never compare mapped columns directly.
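+;;
+;; (The pitfall, as a hypothetical sketch: categorical mappings fitted on
+;; different datasets need not assign the same integer to the same label, so we
+;; reverse the mapping and compare the original labels. The reverse-mapped
+;; labels can be inspected with plain Clojure, e.g.:)
+(comment
+  (-> (ds-cat/reverse-map-categorical-xforms dummy-prediction)
+      :survived
+      frequencies))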
+
+;; We get an accuracy of:
 (loss/classification-accuracy
  (:survived (ds-cat/reverse-map-categorical-xforms (:test split)))
  (:survived (ds-cat/reverse-map-categorical-xforms dummy-prediction)))
@@ -241,7 +243,7 @@ split
 (kindly/check = 0.6026936026936027)

 ;; ## Logistic regression
-;; Next model to use is Logistic Regression:
+;; The next model to use is Logistic Regression.
 (require '[scicloj.ml.tribuo])
@@ -255,7 +257,7 @@ split
 (def lreg-prediction
   (ml/predict (:test split) lreg-model))

-
+;; with an accuracy of:
 (loss/classification-accuracy
  (:survived (ds-cat/reverse-map-categorical-xforms (:test split)))
  (:survived (ds-cat/reverse-map-categorical-xforms lreg-prediction)))
@@ -301,10 +303,10 @@ split
 (kindly/check = 0.7878787878787878)

-;; best so far, 78 %.
+;; The best accuracy so far: 78%.

 ;; ## Next steps
-;; We could now go further and trying to improve the features / the model type
+;; We could now go further and try to improve the features / the model type
 ;; in order to find the best performing model for the data we have.
-;; All models types have a range of configurations,
+;; All model types have a range of configurations,
 ;; so-called hyper-parameters. They can have as well influence on the
diff --git a/notebooks/noj_book/prepare_for_ml.clj b/notebooks/noj_book/prepare_for_ml.clj
index b2340fa8..31929c60 100644
--- a/notebooks/noj_book/prepare_for_ml.clj
+++ b/notebooks/noj_book/prepare_for_ml.clj
@@ -70,7 +70,7 @@ categorical-ds
 ;; a column name as input.
 ;;
 ;; We use them to calculate a mapping from string/keyword to a
-;; numerical space (0 ... x) like this
+;; numerical space (0 ... x) like this:
 (ds-cat/fit-categorical-map categorical-ds :x)
@@ -296,7 +296,7 @@ one-hot-ds
 ;; we need to mark explicitly which columns are `features` and which are
 ;; `targets` in order to be able to use the dataset later for
 ;; machine learning in `metamorph.ml`
-;;
+
 ;; As normally only one or a few columns are inference targets,
 ;; we can simply mark those and the other columns are regarded as features.

@@ -306,11 +306,9 @@ one-hot-ds
       (ds-mod/set-inference-target :y)))
 ;; (works as well with a seq)

-
 ;; This is marked as well in the column metadata.
 (-> modelled-ds :y meta)

-
 ;; There are several functions to get information on features and
 ;; inference targets:
@@ -357,7 +355,7 @@ ds-ready-for-train
 ;;
 ;; Side remark:
 ;; If needed, data could as well be easily transformed into a tensor.
-;; Most models do this internally anyway (often to primitive arrays)
+;; Most models do this internally anyway (often to primitive arrays).
 (require 'tech.v3.dataset.tensor)

 (def ds-tensor
@@ -365,5 +363,5 @@ ds-ready-for-train
 ds-tensor

-;; or we can do so, if needed, but this looses the notation of features /
-;; inference target
+;; or we can do so, if needed, but this loses the notion of features /
+;; inference target.
 (tech.v3.tensor/->jvm ds-tensor)
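+
+;; (A small closing sketch: the result of `->jvm` is a plain JVM structure, so
+;; it can be inspected with core functions; this shape check is hypothetical
+;; but uses only the values defined above.)
+(comment
+  (let [rows (tech.v3.tensor/->jvm ds-tensor)]
+    ;; number of rows and columns of the converted tensor
+    [(count rows) (count (first rows))]))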