(notespace)

Sun Dec 18 23:13:15 CET 2022


;; REPL-only notespace helpers. Because they sit inside `comment`, none of
;; these forms is evaluated when the file is loaded; evaluate them by hand.
(comment
  (note/init-with-browser)
  (note/eval-this-notespace)
  (note/reread-this-notespace)
  (note/render-static-html "docs/tune-titanic.html")
  (note/init))
nil

(require  '[scicloj.ml.dataset :as ds]
          '[scicloj.ml.core :as ml]
          '[scicloj.ml.metamorph :as mm]
          '[camel-snake-kebab.core :as csk]
          '[scicloj.metamorph.ml.evaluation-handler :as eval-hn]
          '[tech.v3.datatype.functional :as dtfunc])

;; Columns handled as categorical: when `data` is built their missing values
;; are replaced with "missing" and the columns are one-hot encoded.
(def  categorical-features  [:pclass :sex :embarked])

;; Numeric columns: the pipeline steps below apply missing-value replacement
;; and (optionally) standard scaling to exactly these columns.
(def  numeric-features [:age :parch :fare])

(defn map->vec
  "Splices the entries of map `m` into a vector of alternating keys and
  values, e.g. {:a 1 :b 2} => [:a 1 :b 2], suitable for `apply`-ing as
  trailing key/value arguments.

  Only the key/value pairs themselves are spliced: a key or value that is
  itself sequential (e.g. {:cols [:a :b]}) is kept intact instead of being
  recursively flattened. A nil or empty map yields []."
  [m]
  (into [] cat m))

Preprocessing pipelines, including feature engineering

;; Titanic training data: loaded with kebab-case keyword column names, reduced
;; to the feature columns plus :survived, with categorical gaps filled by
;; "missing" and the categorical columns one-hot encoded.
(def data
  (let [raw-ds       (ds/dataset "data/titanic/train.csv"
                                 {:key-fn csk/->kebab-case-keyword})
        kept-columns (concat categorical-features numeric-features [:survived])
        selected     (ds/select-columns raw-ds kept-columns)
        filled       (ds/replace-missing selected categorical-features :value "missing")]
    (ds/categorical->one-hot filled categorical-features)))

(defn replace-missing
  "Returns a pipeline step (a function of ctx). When invoked, the step builds
  `mm/replace-missing` over `numeric-features`, passing the entries of
  `(:replace-missing-options options)` as trailing key/value arguments, and
  applies the resulting step to ctx."
  [options]
  (fn [ctx]
    (let [strategy-args (map->vec (:replace-missing-options options))
          fill-step     (apply mm/replace-missing numeric-features strategy-args)]
      (fill-step ctx))))

(defn maybe-std-scale
  "Returns a pipeline step (a function of ctx). If
  `(-> options :scaling-options :scale?)` is truthy the step applies
  `mm/std-scale` to `numeric-features`; otherwise it returns ctx unchanged."
  [options]
  (fn [ctx]
    (let [scale? (get-in options [:scaling-options :scale?])]
      (if-not scale?
        ctx
        (let [scale-step (mm/std-scale numeric-features {})]
          (scale-step ctx))))))

(defn assoc-pipe-opts
  "Returns a pipeline step that stores `options` in the context under
  `:pipe-options`, leaving every other context key untouched."
  [options]
  #(assoc % :pipe-options options))

(defn make-decl-pipeline
  "Builds the declarative pipeline (a vector of step declarations) for
  `model-type`. `options` is handed to the custom steps of this namespace,
  and its `:model-options` entry, with `:model-type` merged in, becomes the
  argument of the final `:mm/model` step, which is tagged with
  `{:metamorph/id :model}`."
  [model-type options]
  (let [model-step-opts (merge (:model-options options)
                               {:model-type model-type})]
    [[::assoc-pipe-opts options]
     [::replace-missing options]
     [:mm/categorical->number [:survived] {} :int64]
     [::maybe-std-scale options]
     [:mm/set-inference-target :survived]
     {:metamorph/id :model}
     [:mm/model model-step-opts]]))

;; One pipeline declaration per point that `ml/sobol-gridsearch` produces from
;; the search space below, each using the logistic-regression model type.
(def logistic-regression-pipelines
  (let [search-space {:scaling-options {:scale? (ml/categorical [true false])}
                      :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}
                      :model-options {:lambda (ml/categorical [0.1 0.2 0.5 0.7 1])
                                      :tolerance (ml/categorical [0.1 0.01 0.001 0.0001])}}]
    (map (partial make-decl-pipeline :smile.classification/logistic-regression)
         (ml/sobol-gridsearch search-space))))

;; One pipeline declaration per point that `ml/sobol-gridsearch` produces from
;; the search space below, each using the random-forest model type.
(def random-forrest-pipelines
  (let [search-space {:scaling-options {:scale? (ml/categorical [true false])}
                      :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}
                      :model-options {:trees (ml/categorical [5 50 100 250])
                                      :max-depth (ml/categorical [5 8 10])}}]
    (map (partial make-decl-pipeline :smile.classification/random-forest)
         (ml/sobol-gridsearch search-space))))

;; Pipeline declarations that get evaluated below. Only the random-forest grid
;; is included; `logistic-regression-pipelines` is defined in this file but is
;; not part of this collection.
(def all-pipelines
  (apply concat [random-forrest-pipelines]))

;; Every pipeline declaration turned into a pipeline function, as a vector.
(def pipe-fns
  (into [] (map ml/->pipeline) all-pipelines))

Simple split

;; Holdout split of `data` with ratio 0.8. Only the first split is used.
;; NOTE(review): no :seed is passed, so the split presumably differs between
;; runs — confirm if reproducibility matters.
(def splits
  (let [holdout-opts {:ratio 0.8}]
    (ds/split->seq data :holdout holdout-opts)))

(def train-ds (-> splits first :train))

(def holdout-ds (-> splits first :test))

Tune hyperparameters by evaluating all pipelines/models

;; Mutable, initially empty vector held in an atom.
;; NOTE(review): not referenced anywhere else in the visible part of this
;; file — confirm whether it is still needed.
(def files (atom []))

;; Evaluates every pipeline declaration in `all-pipelines` on 5-fold
;; cross-validation splits of the training data, scored with
;; `ml/classification-accuracy` (passed together with :accuracy). The options
;; restrict the result to the best cross-validation run of the best pipeline.
(def best-evaluation
  (ml/evaluate-pipelines
   all-pipelines
   ;; The split options must be a map. A bare number is not read as :k and
   ;; only appears to work because the default number of folds is used.
   (ds/split->seq train-ds :kfold {:k 5})
   ml/classification-accuracy
   :accuracy
   {;; :attach-fn-sources {:ns (find-ns 'scicloj.ml.tune-titanic)
    ;;                         :pipe-fns-clj-file "src/scicloj/ml/tune_titanic.clj"}
    :return-best-crossvalidation-only true
    :return-best-pipeline-only true}))

;; All three values are read from the first result of the first pipeline in
;; `best-evaluation`.
(def best-accuracy
  (get-in (ffirst best-evaluation) [:train-transform :metric]))

(def best-options
  (get-in (ffirst best-evaluation) [:fit-ctx :pipe-options]))

(def best-pipe-fn
  (:pipe-fn (ffirst best-evaluation)))

best-pipe-fn
#object[scicloj.metamorph.core$pipeline$local_pipeline__44848 0x33c73b03 "scicloj.metamorph.core$pipeline$local_pipeline__44848@33c73b03"]

;; Declarative form of the best pipeline, read from the first result of the
;; first pipeline in `best-evaluation`.
(def best-pipe-decl
  (:pipe-decl (ffirst best-evaluation)))

All information on best found pipeline

best accuracy found on train data: 0.8734622144112478

best accuracy found on test data: 0.8461538461538461

best options (found on train data):

best-options
{:scaling-options {:scale? false},
 :replace-missing-options
 {:value
  #object[tech.v3.datatype.functional$mean 0x7e25d9d1 "tech.v3.datatype.functional$mean@7e25d9d1"]},
 :model-options {:trees 250, :max-depth 10}}

best pipeline (found on train data)

best-pipe-decl
[[:scicloj.ml.tune-titanic/assoc-pipe-opts
  {:scaling-options {:scale? false},
   :replace-missing-options
   {:value
    #object[tech.v3.datatype.functional$mean 0x7e25d9d1 "tech.v3.datatype.functional$mean@7e25d9d1"]},
   :model-options {:trees 250, :max-depth 10}}]
 [:scicloj.ml.tune-titanic/replace-missing
  {:scaling-options {:scale? false},
   :replace-missing-options
   {:value
    #object[tech.v3.datatype.functional$mean 0x7e25d9d1 "tech.v3.datatype.functional$mean@7e25d9d1"]},
   :model-options {:trees 250, :max-depth 10}}]
 [:mm/categorical->number [:survived] {} :int64]
 [:scicloj.ml.tune-titanic/maybe-std-scale
  {:scaling-options {:scale? false},
   :replace-missing-options
   {:value
    #object[tech.v3.datatype.functional$mean 0x7e25d9d1 "tech.v3.datatype.functional$mean@7e25d9d1"]},
   :model-options {:trees 250, :max-depth 10}}]
 [:mm/set-inference-target :survived]
 #:metamorph{:id :model}
 [:mm/model
  {:trees 250,
   :max-depth 10,
   :model-type :smile.classification/random-forest}]]

pipe sources information

(let [source-info (ml/get-nice-source-info best-pipe-decl
                                           (find-ns 'scicloj.ml.tune-titanic)
                                           (:file (meta #'data)))]
  ;; Show only the first 20 classpath entries.
  (update source-info :classpath (partial take 20)))
{:fn-sources
 #:mm{categorical->number
      {:source-str
       "(defn categorical->number\n  \"Convert columns into a discrete , numeric representation\n  See tech.v3.dataset.categorical/fit-categorical-map.\"\n  ([filter-fn-or-ds]\n  (tech.v3.dataset.metamorph/categorical->number filter-fn-or-ds))\n  ([filter-fn-or-ds table-args]\n  (tech.v3.dataset.metamorph/categorical->number filter-fn-or-ds table-args))\n  ([filter-fn-or-ds table-args result-datatype]\n  (tech.v3.dataset.metamorph/categorical->number filter-fn-or-ds table-args result-datatype)))",
       :source-form
       (defn
        categorical->number
        "Convert columns into a discrete , numeric representation\n  See tech.v3.dataset.categorical/fit-categorical-map."
        ([filter-fn-or-ds]
         (tech.v3.dataset.metamorph/categorical->number
          filter-fn-or-ds))
        ([filter-fn-or-ds table-args]
         (tech.v3.dataset.metamorph/categorical->number
          filter-fn-or-ds
          table-args))
        ([filter-fn-or-ds table-args result-datatype]
         (tech.v3.dataset.metamorph/categorical->number
          filter-fn-or-ds
          table-args
          result-datatype)))},
      model
      {:source-str
       "(defn model\n  \"Executes a machine learning model in train/predict (depending on :mode)\n  from the `metamorph.ml` model registry.\n\n  The model is passed between both invocation via the shared context ctx in a\n  key (a step indentifier) which is passed in key `:metamorph/id` and guarantied to be unique for each\n  pipeline step.\n\n  The function writes and reads into this common context key.\n\n  Options:\n  - `:model-type` - Keyword for the model to use\n\n  Further options get passed to `train` functions and are model specific.\n\n  See here for an overview for the models build into scicloj.ml:\n\n\n  https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html\n\n  Other libraries might contribute other models,\n  which are documented as part of the library.\n\n\n  metamorph                            | .\n  -------------------------------------|----------------------------------------------------------------------------\n  Behaviour in mode :fit               | Calls `scicloj.metamorph.ml/train` using data in `:metamorph/data` and `options`and stores trained model in ctx under key in `:metamorph/id`\n  Behaviour in mode :transform         | Reads trained model from ctx and calls `scicloj.metamorph.ml/predict` with the model in $id and data in `:metamorph/data`\n  Reads keys from ctx                  | In mode `:transform` : Reads trained model to use for prediction from key in `:metamorph/id`.\n  Writes keys to ctx                   | In mode `:fit` : Stores trained model in key $id and writes feature-ds and target-ds before prediction into ctx at `:scicloj.metamorph.ml/feature-ds` /`:scicloj.metamorph.ml/target-ds`\n\n\n\n\n  See as well:\n\n  * `scicloj.metamorph.ml/train`\n  * `scicloj.metamorph.ml/predict`\n\n  \"\n  ([options]\n  (scicloj.metamorph.ml/model options)))",
       :source-form
       (defn
        model
        "Executes a machine learning model in train/predict (depending on :mode)\n  from the `metamorph.ml` model registry.\n\n  The model is passed between both invocation via the shared context ctx in a\n  key (a step indentifier) which is passed in key `:metamorph/id` and guarantied to be unique for each\n  pipeline step.\n\n  The function writes and reads into this common context key.\n\n  Options:\n  - `:model-type` - Keyword for the model to use\n\n  Further options get passed to `train` functions and are model specific.\n\n  See here for an overview for the models build into scicloj.ml:\n\n\n  https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html\n\n  Other libraries might contribute other models,\n  which are documented as part of the library.\n\n\n  metamorph                            | .\n  -------------------------------------|----------------------------------------------------------------------------\n  Behaviour in mode :fit               | Calls `scicloj.metamorph.ml/train` using data in `:metamorph/data` and `options`and stores trained model in ctx under key in `:metamorph/id`\n  Behaviour in mode :transform         | Reads trained model from ctx and calls `scicloj.metamorph.ml/predict` with the model in $id and data in `:metamorph/data`\n  Reads keys from ctx                  | In mode `:transform` : Reads trained model to use for prediction from key in `:metamorph/id`.\n  Writes keys to ctx                   | In mode `:fit` : Stores trained model in key $id and writes feature-ds and target-ds before prediction into ctx at `:scicloj.metamorph.ml/feature-ds` /`:scicloj.metamorph.ml/target-ds`\n\n\n\n\n  See as well:\n\n  * `scicloj.metamorph.ml/train`\n  * `scicloj.metamorph.ml/predict`\n\n  "
        ([options] (scicloj.metamorph.ml/model options)))},
      set-inference-target
      {:source-str
       "(defn set-inference-target\n  \"Set the inference target on the column.  This sets the :column-type member\n  of the column metadata to :inference-target?.\"\n  ([target-name-or-target-name-seq]\n  (tech.v3.dataset.metamorph/set-inference-target target-name-or-target-name-seq)))",
       :source-form
       (defn
        set-inference-target
        "Set the inference target on the column.  This sets the :column-type member\n  of the column metadata to :inference-target?."
        ([target-name-or-target-name-seq]
         (tech.v3.dataset.metamorph/set-inference-target
          target-name-or-target-name-seq)))}},
 :classpath
 ("src"
  "resources"
  "/home/carsten/.m2/repository/aerial/hanami/aerial.hanami/0.12.9/aerial.hanami-0.12.9.jar"
  "/home/carsten/.gitlibs/libs/applied-science/waqi/faefe5dfd1b161ff70089924591ac2d699527811/resources"
  "/home/carsten/.gitlibs/libs/applied-science/waqi/faefe5dfd1b161ff70089924591ac2d699527811/src"
  "/home/carsten/.m2/repository/ch/qos/logback/logback-classic/1.4.4/logback-classic-1.4.4.jar"
  "/home/carsten/.m2/repository/clj-python/libpython-clj/2.020/libpython-clj-2.020.jar"
  "/home/carsten/.m2/repository/com/fasterxml/jackson/core/jackson-annotations/2.13.2/jackson-annotations-2.13.2.jar"
  "/home/carsten/.m2/repository/com/fasterxml/jackson/core/jackson-core/2.13.2/jackson-core-2.13.2.jar"
  "/home/carsten/.m2/repository/com/fasterxml/jackson/core/jackson-databind/2.13.2/jackson-databind-2.13.2.jar"
  "/home/carsten/.m2/repository/com/github/luben/zstd-jni/1.5.1-1/zstd-jni-1.5.1-1.jar"
  "/home/carsten/.m2/repository/dk/simongray/datalinguist/0.1.163/datalinguist-0.1.163.jar"
  "/home/carsten/.m2/repository/generateme/fastmath/2.1.6/fastmath-2.1.6.jar"
  "/home/carsten/.gitlibs/libs/io.github.nextjournal/clerk/a6bfc832a182ef3068d60a318985681ddb913595/src"
  "/home/carsten/.gitlibs/libs/io.github.nextjournal/clerk/a6bfc832a182ef3068d60a318985681ddb913595/resources"
  "/home/carsten/.gitlibs/libs/io.github.nextjournal/clerk/a6bfc832a182ef3068d60a318985681ddb913595/bb"
  "/home/carsten/.gitlibs/libs/net.clojars.behrica/cluster_eval/ca34283a67bf18c8025955865fb567bd6e2e9a9a/src"
  "/home/carsten/.gitlibs/libs/net.clojars.behrica/cluster_eval/ca34283a67bf18c8025955865fb567bd6e2e9a9a/resources"
  "/home/carsten/.gitlibs/libs/net.clojars.behrica/cluster_eval/ca34283a67bf18c8025955865fb567bd6e2e9a9a/target/classes"
  "/home/carsten/.m2/repository/org/apache/arrow/arrow-vector/6.0.0/arrow-vector-6.0.0.jar")}

;; :survived column obtained by running the best pipeline in :transform mode
;; on the holdout data (reusing the fitted context) and reversing the
;; categorical mapping on the resulting dataset.
(def predicted-survival-hold-out
  (let [fitted-ctx    (:fit-ctx (ffirst best-evaluation))
        transform-ctx (merge fitted-ctx
                             {:metamorph/data holdout-ds
                              :metamorph/mode :transform})
        predicted-ds  (:metamorph/data (best-pipe-fn transform-ctx))]
    (:survived (ds/reverse-map-categorical-xforms predicted-ds))))

Classification accuracy on holdout data:

(let [actual-survival (holdout-ds :survived)]
  (ml/classification-accuracy predicted-survival-hold-out actual-survival))
0.7988826815642458

Confusion matrix on holdout data

^kind/dataset
(ml/confusion-map->ds
 (ml/confusion-map predicted-survival-hold-out
                   (holdout-ds :survived)))

_unnamed [3 3]:

| :column-name |      0 |       1 |
|--------------|--------|---------|
|  column-name |      0 |       1 |
|            0 | 0.9159 | 0.08411 |
|            1 | 0.3750 |  0.6250 |

Smile model object:

(-> best-evaluation ffirst :fit-ctx :model ml/thaw-model)
#object[smile.classification.RandomForest 0x732fb0b7 "smile.classification.RandomForest@732fb0b7"]

Feature importance:

(let [thawed-model (ml/thaw-model
                    (get-in (ffirst best-evaluation) [:fit-ctx :model]))]
  (seq (.importance thawed-model)))
(91.72936078911897
 6.804519064404864
 117.00237798409798
 5.096989319475346
 10.150868122594787
 4.240011278391342
 22.305792702068686
 33.949740317735156
 1.0232068587001921
 0.9722581226876843
 2.552121205979241
 0.0)

nested cross validation

(require '[scicloj.ml.nested-cv :as nested-cv])

;; Fully realised result of nested cross-validation over `all-pipelines`.
;; NOTE(review): 10 and 5 are presumably the outer and inner fold counts —
;; confirm against `nested-cv/nested-cv`.
(def nested-cv-result
 (->> (nested-cv/nested-cv data all-pipelines
                           ml/classification-accuracy
                           :accuracy 10 5)
      doall))

nested cv best models metrics

(for [fold-result nested-cv-result]
  (:metric fold-result))
(0.8666666666666667
 0.8666666666666667
 0.8666666666666667
 0.8444444444444444
 0.8777777777777778
 0.888888888888889
 0.8666666666666667
 0.8555555555555556
 0.8333333333333333
 0.9135802469135802)

;; Evaluates all pipelines on a 5-fold split of the full `data` and keeps the
;; pipeline function and fitted context of the first result of the first
;; pipeline.
(def final-model-by-cv
 (let [folds (ds/split->seq data :kfold {:k 5})
       {:keys [fit-ctx pipe-fn]} (ffirst
                                  (ml/evaluate-pipelines
                                   all-pipelines
                                   folds
                                   ml/classification-accuracy
                                   :accuracy))]
   {:best-pipe-fn pipe-fn
    :fit-ctx fit-ctx}))

;; Context returned by running the selected pipeline in :fit mode on the
;; complete `data`.
(def final-model
  (let [fit-best-pipeline (:best-pipe-fn final-model-by-cv)]
    (fit-best-pipeline {:metamorph/data data
                        :metamorph/mode :fit})))

Final best model

(-> final-model :model ml/thaw-model)
#object[smile.classification.RandomForest 0x5eb7a939 "smile.classification.RandomForest@5eb7a939"]

trained with the best hyperparameters

(:pipe-options final-model)
{:scaling-options {:scale? true},
 :replace-missing-options
 {:value
  #object[tech.v3.datatype.functional$median 0x499d3e41 "tech.v3.datatype.functional$median@499d3e41"]},
 :model-options {:trees 50, :max-depth 10}}