Sun Dec 18 23:13:15 CET 2022
;; REPL-only helper calls. They are wrapped in `comment`, so none of them is
;; evaluated when this file is loaded; evaluate them one by one from the editor.
;; NOTE(review): `note` is presumably an alias for the notespace API; its
;; require is not visible in this file — confirm.
(comment
(note/init-with-browser)
(note/eval-this-notespace)
(note/reread-this-notespace)
;; writes the rendered notebook to the given path
(note/render-static-html "docs/tune-titanic.html")
(note/init))
nil
This is the Clojure version of https://www.moritzkoerber.com/posts/preprocessing-hyperparameters/
(require '[scicloj.ml.dataset :as ds]
'[scicloj.ml.core :as ml]
'[scicloj.ml.metamorph :as mm]
'[camel-snake-kebab.core :as csk]
'[scicloj.metamorph.ml.evaluation-handler :as eval-hn]
'[tech.v3.datatype.functional :as dtfunc])
(def categorical-features
  "Columns that get their missing values replaced by \"missing\" and are then
  one-hot encoded when `data` is built."
  [:pclass
   :sex
   :embarked])

(def numeric-features
  "Columns whose missing values are imputed, and which are optionally
  std-scaled, inside the pipelines."
  [:age
   :parch
   :fare])
(defn map->vec
  "Turns the map `m` into a flat vector of alternating keys and values,
  e.g. `{:value f}` becomes `[:value f]`, so it can be spliced into a call
  with `apply`.

  Only one level is flattened: a value that is itself a collection
  (e.g. `{:value [1 2]}`) is kept intact as a single element. The previous
  `flatten`-based version recursively flattened such values and returned a
  lazy seq instead of a vector.

  Returns `[]` for an empty or nil map."
  [m]
  ;; `cat` concatenates each [k v] map entry exactly one level deep.
  (into [] cat m))
Preprocessing pipelines, including feature engineering
;; The working dataset: the Titanic training CSV reduced to the feature columns
;; plus :survived, with missing categorical values replaced by "missing" and
;; the categorical columns one-hot encoded.
(def data
  (let [wanted-columns (concat categorical-features numeric-features [:survived])
        ;; column names are converted with csk/->kebab-case-keyword on load
        raw-ds (ds/dataset "data/titanic/train.csv"
                           {:key-fn csk/->kebab-case-keyword})
        selected (ds/select-columns raw-ds wanted-columns)
        filled (ds/replace-missing selected categorical-features :value "missing")]
    (ds/categorical->one-hot filled categorical-features)))
(defn replace-missing
  "Pipeline step factory. Returns a function of the pipeline context that
  applies `mm/replace-missing` to `numeric-features`.

  The entries of `(:replace-missing-options options)` are spliced in as the
  remaining positional arguments via `map->vec`, so `{:value f}` results in
  `(mm/replace-missing numeric-features :value f)`."
  [options]
  (fn [ctx]
    (let [strategy-args (map->vec (:replace-missing-options options))
          impute-step (apply mm/replace-missing numeric-features strategy-args)]
      (impute-step ctx))))
(defn maybe-std-scale
  "Pipeline step factory. Returns a function of the pipeline context that runs
  `mm/std-scale` over `numeric-features` when `[:scaling-options :scale?]` in
  `options` is truthy, and otherwise returns the context untouched."
  [options]
  (fn [ctx]
    (let [scale? (get-in options [:scaling-options :scale?])]
      (if-not scale?
        ctx
        (let [scale-step (mm/std-scale numeric-features {})]
          (scale-step ctx))))))
(defn assoc-pipe-opts
  "Pipeline step factory. Returns a function that stores `options` in the
  pipeline context under `:pipe-options`, so the options a pipeline was built
  with can be read back from the context later."
  [options]
  #(assoc % :pipe-options options))
(defn make-decl-pipeline
  "Builds the declarative pipeline (a vector of step declarations) for one
  combination of hyperparameters.

  - `model-type`: keyword put under `:model-type` in the model step options.
  - `options`: map of options; the whole map is handed to the
    namespace-local steps, and its `:model-options` entry is used for the
    `:mm/model` step.

  The `::` keywords resolve against the current namespace, i.e. they refer to
  the step factories defined in this file."
  [model-type options]
  (let [model-step-options (assoc (:model-options options) :model-type model-type)]
    [[::assoc-pipe-opts options]
     [::replace-missing options]
     [:mm/categorical->number [:survived] {} :int64]
     [::maybe-std-scale options]
     [:mm/set-inference-target :survived]
     ;; gives the following model step the id :model
     {:metamorph/id :model}
     [:mm/model model-step-options]]))
;; Declarative logistic-regression pipelines: one per option map produced by
;; `ml/sobol-gridsearch` over scaling, imputation function and model options.
(def logistic-regression-pipelines
  (let [search-space {:scaling-options {:scale? (ml/categorical [true false])}
                      :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}
                      :model-options {:lambda (ml/categorical [0.1 0.2 0.5 0.7 1])
                                      :tolerance (ml/categorical [0.1 0.01 0.001 0.0001])}}
        options->decl (partial make-decl-pipeline :smile.classification/logistic-regression)]
    (map options->decl (ml/sobol-gridsearch search-space))))
;; Declarative random-forest pipelines: one per option map produced by
;; `ml/sobol-gridsearch` over scaling, imputation function and model options.
(def random-forrest-pipelines
  (let [search-space {:scaling-options {:scale? (ml/categorical [true false])}
                      :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}
                      :model-options {:trees (ml/categorical [5 50 100 250])
                                      :max-depth (ml/categorical [5 8 10])}}
        options->decl (partial make-decl-pipeline :smile.classification/random-forest)]
    (map options->decl (ml/sobol-gridsearch search-space))))
;; All pipeline declarations that take part in the evaluation.
;; NOTE(review): only the random-forest pipelines are included here;
;; `logistic-regression-pipelines` is defined above but not concatenated in —
;; confirm whether that is intentional.
(def all-pipelines
  (concat random-forrest-pipelines))

;; The same pipelines as executable pipeline functions, realised eagerly
;; into a vector.
(def pipe-fns
  (into [] (map ml/->pipeline) all-pipelines))
Simple split
;; Holdout split of `data`, requested with a :ratio of 0.8.
(def splits
  (ds/split->seq data
                 :holdout
                 {:ratio 0.8}))

;; Training part of the first split.
(def train-ds
  (let [[first-split] splits]
    (first-split :train)))

;; Test part of the first split, kept aside as holdout data.
(def holdout-ds
  (let [[first-split] splits]
    (first-split :test)))
Tune hyperparameters by evaluating all pipelines/models
;; Mutable accumulator, initially an empty vector.
;; Was `[atom []]`, which is a plain vector holding the `atom` function and an
;; empty vector rather than an atom.
;; NOTE(review): nothing visible in this file reads or updates `files` —
;; confirm it is still needed.
(def files (atom []))
;; Evaluates every pipeline declaration with k-fold cross-validation on the
;; training split, scoring with classification accuracy. The two :return-best-*
;; options are set to true; the result is read below via `first first`.
(def best-evaluation
  (ml/evaluate-pipelines
   all-pipelines
   ;; The third argument of `split->seq` is an options map; the fold count is
   ;; given as {:k 5} (was a bare `5`), matching the call in `final-model-by-cv`.
   (ds/split->seq train-ds :kfold {:k 5})
   ml/classification-accuracy
   :accuracy
   {;; :attach-fn-sources {:ns (find-ns 'scicloj.ml.tune-titanic)
    ;;                     :pipe-fns-clj-file "src/scicloj/ml/tune_titanic.clj"}
    :return-best-crossvalidation-only true
    :return-best-pipeline-only true}))
;; Metric stored under [:train-transform :metric] of the best evaluation result.
(def best-accuracy
  (get-in (ffirst best-evaluation) [:train-transform :metric]))

;; Options the best pipeline was built with, as stored in the fitted context
;; by the `assoc-pipe-opts` step.
(def best-options
  (get-in (ffirst best-evaluation) [:fit-ctx :pipe-options]))

;; Pipeline function of the best evaluation result.
(def best-pipe-fn
  (:pipe-fn (ffirst best-evaluation)))
best-pipe-fn
#object[scicloj.metamorph.core$pipeline$local_pipeline__44848 0x33c73b03 "scicloj.metamorph.core$pipeline$local_pipeline__44848@33c73b03"]
;; Declarative form of the best pipeline.
(def best-pipe-decl
  (:pipe-decl (ffirst best-evaluation)))
best accuracy found on train data: 0.8734622144112478
best accuracy found on test data: 0.8461538461538461
best options (found on train data):
best-options
{:scaling-options {:scale? false},
:replace-missing-options
{:value
#object[tech.v3.datatype.functional$mean 0x7e25d9d1 "tech.v3.datatype.functional$mean@7e25d9d1"]},
:model-options {:trees 250, :max-depth 10}}
best pipeline (found on train data)
best-pipe-decl
[[:scicloj.ml.tune-titanic/assoc-pipe-opts
{:scaling-options {:scale? false},
:replace-missing-options
{:value
#object[tech.v3.datatype.functional$mean 0x7e25d9d1 "tech.v3.datatype.functional$mean@7e25d9d1"]},
:model-options {:trees 250, :max-depth 10}}]
[:scicloj.ml.tune-titanic/replace-missing
{:scaling-options {:scale? false},
:replace-missing-options
{:value
#object[tech.v3.datatype.functional$mean 0x7e25d9d1 "tech.v3.datatype.functional$mean@7e25d9d1"]},
:model-options {:trees 250, :max-depth 10}}]
[:mm/categorical->number [:survived] {} :int64]
[:scicloj.ml.tune-titanic/maybe-std-scale
{:scaling-options {:scale? false},
:replace-missing-options
{:value
#object[tech.v3.datatype.functional$mean 0x7e25d9d1 "tech.v3.datatype.functional$mean@7e25d9d1"]},
:model-options {:trees 250, :max-depth 10}}]
[:mm/set-inference-target :survived]
#:metamorph{:id :model}
[:mm/model
{:trees 250,
:max-depth 10,
:model-type :smile.classification/random-forest}]]
pipe sources information
;; Source information for the functions used by the best pipeline.
(let [source-info (ml/get-nice-source-info best-pipe-decl
                                           (find-ns 'scicloj.ml.tune-titanic)
                                           (:file (meta #'data)))]
  ;; keep only the first 20 classpath entries to keep the output short
  (update source-info :classpath (fn [entries] (take 20 entries))))
{:fn-sources
#:mm{categorical->number
{:source-str
"(defn categorical->number\n \"Convert columns into a discrete , numeric representation\n See tech.v3.dataset.categorical/fit-categorical-map.\"\n ([filter-fn-or-ds]\n (tech.v3.dataset.metamorph/categorical->number filter-fn-or-ds))\n ([filter-fn-or-ds table-args]\n (tech.v3.dataset.metamorph/categorical->number filter-fn-or-ds table-args))\n ([filter-fn-or-ds table-args result-datatype]\n (tech.v3.dataset.metamorph/categorical->number filter-fn-or-ds table-args result-datatype)))",
:source-form
(defn
categorical->number
"Convert columns into a discrete , numeric representation\n See tech.v3.dataset.categorical/fit-categorical-map."
([filter-fn-or-ds]
(tech.v3.dataset.metamorph/categorical->number
filter-fn-or-ds))
([filter-fn-or-ds table-args]
(tech.v3.dataset.metamorph/categorical->number
filter-fn-or-ds
table-args))
([filter-fn-or-ds table-args result-datatype]
(tech.v3.dataset.metamorph/categorical->number
filter-fn-or-ds
table-args
result-datatype)))},
model
{:source-str
"(defn model\n \"Executes a machine learning model in train/predict (depending on :mode)\n from the `metamorph.ml` model registry.\n\n The model is passed between both invocation via the shared context ctx in a\n key (a step indentifier) which is passed in key `:metamorph/id` and guarantied to be unique for each\n pipeline step.\n\n The function writes and reads into this common context key.\n\n Options:\n - `:model-type` - Keyword for the model to use\n\n Further options get passed to `train` functions and are model specific.\n\n See here for an overview for the models build into scicloj.ml:\n\n\n https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html\n\n Other libraries might contribute other models,\n which are documented as part of the library.\n\n\n metamorph | .\n -------------------------------------|----------------------------------------------------------------------------\n Behaviour in mode :fit | Calls `scicloj.metamorph.ml/train` using data in `:metamorph/data` and `options`and stores trained model in ctx under key in `:metamorph/id`\n Behaviour in mode :transform | Reads trained model from ctx and calls `scicloj.metamorph.ml/predict` with the model in $id and data in `:metamorph/data`\n Reads keys from ctx | In mode `:transform` : Reads trained model to use for prediction from key in `:metamorph/id`.\n Writes keys to ctx | In mode `:fit` : Stores trained model in key $id and writes feature-ds and target-ds before prediction into ctx at `:scicloj.metamorph.ml/feature-ds` /`:scicloj.metamorph.ml/target-ds`\n\n\n\n\n See as well:\n\n * `scicloj.metamorph.ml/train`\n * `scicloj.metamorph.ml/predict`\n\n \"\n ([options]\n (scicloj.metamorph.ml/model options)))",
:source-form
(defn
model
"Executes a machine learning model in train/predict (depending on :mode)\n from the `metamorph.ml` model registry.\n\n The model is passed between both invocation via the shared context ctx in a\n key (a step indentifier) which is passed in key `:metamorph/id` and guarantied to be unique for each\n pipeline step.\n\n The function writes and reads into this common context key.\n\n Options:\n - `:model-type` - Keyword for the model to use\n\n Further options get passed to `train` functions and are model specific.\n\n See here for an overview for the models build into scicloj.ml:\n\n\n https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html\n\n Other libraries might contribute other models,\n which are documented as part of the library.\n\n\n metamorph | .\n -------------------------------------|----------------------------------------------------------------------------\n Behaviour in mode :fit | Calls `scicloj.metamorph.ml/train` using data in `:metamorph/data` and `options`and stores trained model in ctx under key in `:metamorph/id`\n Behaviour in mode :transform | Reads trained model from ctx and calls `scicloj.metamorph.ml/predict` with the model in $id and data in `:metamorph/data`\n Reads keys from ctx | In mode `:transform` : Reads trained model to use for prediction from key in `:metamorph/id`.\n Writes keys to ctx | In mode `:fit` : Stores trained model in key $id and writes feature-ds and target-ds before prediction into ctx at `:scicloj.metamorph.ml/feature-ds` /`:scicloj.metamorph.ml/target-ds`\n\n\n\n\n See as well:\n\n * `scicloj.metamorph.ml/train`\n * `scicloj.metamorph.ml/predict`\n\n "
([options] (scicloj.metamorph.ml/model options)))},
set-inference-target
{:source-str
"(defn set-inference-target\n \"Set the inference target on the column. This sets the :column-type member\n of the column metadata to :inference-target?.\"\n ([target-name-or-target-name-seq]\n (tech.v3.dataset.metamorph/set-inference-target target-name-or-target-name-seq)))",
:source-form
(defn
set-inference-target
"Set the inference target on the column. This sets the :column-type member\n of the column metadata to :inference-target?."
([target-name-or-target-name-seq]
(tech.v3.dataset.metamorph/set-inference-target
target-name-or-target-name-seq)))}},
:classpath
("src"
"resources"
"/home/carsten/.m2/repository/aerial/hanami/aerial.hanami/0.12.9/aerial.hanami-0.12.9.jar"
"/home/carsten/.gitlibs/libs/applied-science/waqi/faefe5dfd1b161ff70089924591ac2d699527811/resources"
"/home/carsten/.gitlibs/libs/applied-science/waqi/faefe5dfd1b161ff70089924591ac2d699527811/src"
"/home/carsten/.m2/repository/ch/qos/logback/logback-classic/1.4.4/logback-classic-1.4.4.jar"
"/home/carsten/.m2/repository/clj-python/libpython-clj/2.020/libpython-clj-2.020.jar"
"/home/carsten/.m2/repository/com/fasterxml/jackson/core/jackson-annotations/2.13.2/jackson-annotations-2.13.2.jar"
"/home/carsten/.m2/repository/com/fasterxml/jackson/core/jackson-core/2.13.2/jackson-core-2.13.2.jar"
"/home/carsten/.m2/repository/com/fasterxml/jackson/core/jackson-databind/2.13.2/jackson-databind-2.13.2.jar"
"/home/carsten/.m2/repository/com/github/luben/zstd-jni/1.5.1-1/zstd-jni-1.5.1-1.jar"
"/home/carsten/.m2/repository/dk/simongray/datalinguist/0.1.163/datalinguist-0.1.163.jar"
"/home/carsten/.m2/repository/generateme/fastmath/2.1.6/fastmath-2.1.6.jar"
"/home/carsten/.gitlibs/libs/io.github.nextjournal/clerk/a6bfc832a182ef3068d60a318985681ddb913595/src"
"/home/carsten/.gitlibs/libs/io.github.nextjournal/clerk/a6bfc832a182ef3068d60a318985681ddb913595/resources"
"/home/carsten/.gitlibs/libs/io.github.nextjournal/clerk/a6bfc832a182ef3068d60a318985681ddb913595/bb"
"/home/carsten/.gitlibs/libs/net.clojars.behrica/cluster_eval/ca34283a67bf18c8025955865fb567bd6e2e9a9a/src"
"/home/carsten/.gitlibs/libs/net.clojars.behrica/cluster_eval/ca34283a67bf18c8025955865fb567bd6e2e9a9a/resources"
"/home/carsten/.gitlibs/libs/net.clojars.behrica/cluster_eval/ca34283a67bf18c8025955865fb567bd6e2e9a9a/target/classes"
"/home/carsten/.m2/repository/org/apache/arrow/arrow-vector/6.0.0/arrow-vector-6.0.0.jar")}
;; Predicted :survived column for the holdout data: runs the best pipeline in
;; :transform mode on top of its fitted context, then reverses the categorical
;; mapping on the resulting dataset.
(def predicted-survival-hold-out
  (let [fitted-ctx (:fit-ctx (ffirst best-evaluation))
        transform-ctx (merge fitted-ctx
                             {:metamorph/data holdout-ds
                              :metamorph/mode :transform})
        prediction-ds (:metamorph/data (best-pipe-fn transform-ctx))]
    (:survived (ds/reverse-map-categorical-xforms prediction-ds))))
Classification accuracy on holdout data:
;; Accuracy of the holdout predictions against the actual :survived column.
(let [actual-survival (holdout-ds :survived)]
  (ml/classification-accuracy predicted-survival-hold-out actual-survival))
0.7988826815642458
Confusion matrix on holdout data
;; Confusion map of the holdout predictions against the actual :survived
;; column, converted to a dataset.
;; NOTE(review): the `^kind/dataset` metadata presumably tells notespace how to
;; render the value — confirm.
^kind/dataset
(->
(ml/confusion-map predicted-survival-hold-out
(holdout-ds :survived))
(ml/confusion-map->ds))
_unnamed [3 3]:
:column-name | 0 | 1 |
---|---|---|
column-name | 0 | 1 |
0 | 0.9159 | 0.08411 |
1 | 0.3750 | 0.6250 |
Smile model object:
;; Thaws the model stored under :model in the best fitted context.
(ml/thaw-model
 (get-in (ffirst best-evaluation) [:fit-ctx :model]))
#object[smile.classification.RandomForest 0x732fb0b7 "smile.classification.RandomForest@732fb0b7"]
Feature importance:
;; Calls `.importance` on the thawed best model and shows the result as a seq.
(let [best-model (ml/thaw-model
                  (get-in (ffirst best-evaluation) [:fit-ctx :model]))]
  (seq (.importance best-model)))
(91.72936078911897
6.804519064404864
117.00237798409798
5.096989319475346
10.150868122594787
4.240011278391342
22.305792702068686
33.949740317735156
1.0232068587001921
0.9722581226876843
2.552121205979241
0.0)
(require '[scicloj.ml.nested-cv :as nested-cv])
;; Nested cross-validation over all pipelines, fully realised with `doall`.
;; NOTE(review): the trailing 10 and 5 are presumably the outer and inner fold
;; counts — confirm against `nested-cv/nested-cv`.
(def nested-cv-result
  (-> (nested-cv/nested-cv data
                           all-pipelines
                           ml/classification-accuracy
                           :accuracy
                           10
                           5)
      doall))
nested cv best models metrics
;; The :metric of each nested cross-validation result.
(map (fn [cv-result] (:metric cv-result))
     nested-cv-result)
(0.8666666666666667
0.8666666666666667
0.8666666666666667
0.8444444444444444
0.8777777777777778
0.888888888888889
0.8666666666666667
0.8555555555555556
0.8333333333333333
0.9135802469135802)
;; Evaluates all pipelines with 5-fold cross-validation on the full `data` and
;; keeps the pipeline function and fitted context of the first result.
(def final-model-by-cv
  (let [folds (ds/split->seq data :kfold {:k 5})
        best-result (ffirst (ml/evaluate-pipelines all-pipelines
                                                   folds
                                                   ml/classification-accuracy
                                                   :accuracy))]
    {:best-pipe-fn (:pipe-fn best-result)
     :fit-ctx (:fit-ctx best-result)}))
;; Fitted context obtained by running the selected pipeline function in :fit
;; mode on the complete `data`.
(def final-model
  (let [selected-pipe-fn (:best-pipe-fn final-model-by-cv)]
    (selected-pipe-fn {:metamorph/data data
                       :metamorph/mode :fit})))
Final best model
;; Thaws the model stored under :model in the final fitted context.
(-> final-model
    :model
    ml/thaw-model)
#object[smile.classification.RandomForest 0x5eb7a939 "smile.classification.RandomForest@5eb7a939"]
trained with the best hyperparameters
;; Options stored in the final fitted context by the `assoc-pipe-opts` step.
(:pipe-options final-model)
{:scaling-options {:scale? true},
:replace-missing-options
{:value
#object[tech.v3.datatype.functional$median 0x499d3e41 "tech.v3.datatype.functional$median@499d3e41"]},
:model-options {:trees 50, :max-depth 10}}