(ns tidytext
  (:require
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.clay.v2.api :as clay]
   [scicloj.metamorph.ml.text :as text]
   [tech.v3.dataset.reductions :as ds-reductions]
   [tablecloth.api :as tc]
   [clojure.set :as set]
   [scicloj.tableplot.v1.hanami :as hanami]
   [aerial.hanami.templates :as ht]
   [scicloj.ml.smile.nlp :as nlp]
   [scicloj.kindly.v4.kind :as kind]))
;; REPL helpers for driving Clay by hand. Nothing inside `comment` is
;; evaluated when the namespace is loaded.
(comment
  ;; Stop / start Clay.
  (clay/stop!)
  (clay/start!)

  ;; Render this notebook and show the result.
  (clay/make! {:show        true
               :source-path "notebooks/tidytext.clj"}))
;; Single-column dataset (`:token`) holding the `:default` stopword list
;; resolved by `nlp/resolve-stopwords`.
(def stopword
  (let [default-stopwords (nlp/resolve-stopwords :default)]
    (tc/dataset {:token default-stopwords})))
(defn parse-f
  "Reads file `f` line by line and returns a vector with one map per line.

  Each map has the keys:
    :text  - the line itself
    :type  - path component 2 of `f` (0-based), as a string
    :label - path component 3 of `f`, as a string
    :id    - path component 4 of `f`, as a string

  `f` must be a java.io.File whose path has at least five components;
  otherwise `nth` throws IndexOutOfBoundsException."
  [f]
  (let [path-components (->> (.toPath f) (.iterator) iterator-seq)
        doc-type        (str (nth path-components 2))
        label           (str (nth path-components 3))
        id              (str (nth path-components 4))]
    ;; `with-open` closes the reader when we are done; previously it was
    ;; never closed, leaking one file handle per parsed file.
    ;; `mapv` is eager, so every line is read before the reader is closed.
    (with-open [rdr (io/reader f)]
      (mapv (fn [text]
              (hash-map :label label
                        :text  text
                        :id    id
                        :type  doc-type))
            (line-seq rdr)))))
;; One row per line of every regular file found under
;; "bigdata/20news-by-date" (files named ".keep" are skipped); each file is
;; turned into row maps by `parse-f`.
(def raw-text
  (let [news-files (->> (io/file "bigdata/20news-by-date")
                        file-seq
                        (filter (fn [f] (.isFile f)))
                        (remove (fn [f] (= ".keep" (.getName f)))))]
    (tc/dataset (mapcat parse-f news-files))))
;; Fail fast when no rows were loaded into `raw-text`.
(assert (pos? (tc/row-count raw-text)) "raw-text is empty")
;; Bare `nil` form; it evaluates to nil and has no other effect.
nil
;; `raw-text` with unwanted lines removed and a running `:line` number added.
;;
;; Rows are dropped when
;;   * `:text` is nil or empty, or
;;   * `:text` does not start with a run of non-`>` characters followed by
;;     a letter or a digit (e.g. lines that begin with `>`, or lines that
;;     contain no letter or digit at all).
;;
;; Two details of the regex matter:
;;   * Clojure regex literals need no double escaping, so the digit class
;;     is `\d`; the former `\\d` matched a literal backslash or the
;;     letter `d` instead of digits.
;;   * `re-find` is used because the pattern only describes the beginning
;;     of the line; `re-matches` demanded that the WHOLE line match, which
;;     threw away every line ending in punctuation or a digit.
(def raw-text-cleaned
  (-> raw-text
      (tc/drop-rows (fn [{:keys [text]}]
                      (empty? text)))
      (tc/drop-rows (fn [{:keys [text]}]
                      (nil? (re-find #"^[^>]+[A-Za-z\d]" text))))
      (tc/add-column :line (range))))
;; Shape of the raw dataset, followed by the literal value recorded in the
;; notebook.
(-> raw-text tc/shape)
[834781 4]
;; First rows of the raw dataset, tagged as a table.
^kind/table
(-> raw-text (tc/head))
;; Rendered output of the table above:
;;
;; type label id text
;; 20news-bydate-test comp.sys.mac.hardware 52034 From: nevai@mps.ohio-state.edu (Paul Nevai)
;; 20news-bydate-test comp.sys.mac.hardware 52034 Subject: Monitors - should they be kept on 24 hours a day???
;; 20news-bydate-test comp.sys.mac.hardware 52034 Organization: Department of Mathematics, The Ohio State University
;; 20news-bydate-test comp.sys.mac.hardware 52034 Lines: 9
;; 20news-bydate-test comp.sys.mac.hardware 52034 Distribution: world
;; Number of rows of `raw-text` per `:label`, stored in column `:messages`.
(def category-counts
  (ds-reductions/group-by-column-agg
   :label
   {:messages (ds-reductions/row-count)}
   raw-text))
;; Bar chart of rows per label built from the plain Hanami template
;; `ht/bar-chart`: labels on the Y axis (nominal), counts on the X axis.
(hanami/plot category-counts
             ht/bar-chart
             {:X     :messages
              :Y     :label
              :YTYPE :nominal})
;; The same chart expressed with the `hanami/bar-chart` template and its
;; `:=x` / `:=y` substitution keys.
(hanami/plot category-counts
             hanami/bar-chart
             {:=x :messages
              :=y :label})
(defn tokenize-fn
  "Splits string `s` on runs of non-word characters and returns a vector
  of the lower-cased tokens.

  Returns `[]` for nil or empty input. Empty-string tokens are discarded:
  `str/split` yields a leading \"\" whenever `s` starts with a non-word
  character (e.g. \"> quoted\"), and such a token is not a word."
  [s]
  (if (empty? s)
    []
    (into []
          (comp (remove empty?)
                (map str/lower-case))
          (str/split s #"\W+"))))
;; Result of `text/->tidy-text` over the cleaned lines. Later forms read
;; its `:datasets` and `:token-lookup-table` entries.
;;
;; Arguments, in order:
;;   * the cleaned dataset,
;;   * a function returning the dataset's `:text` column,
;;   * a function mapping a line to the pair `[line nil]`,
;;   * the tokenizer,
;;   * `:int32` datatype options.
(def tidy-result
  (text/->tidy-text
   raw-text-cleaned
   #(:text %)
   #(vector % nil)
   tokenize-fn
   :datatype-token-pos :int32
   :datatype-document :int32
   :datatype-token-idx :int32))
;; First dataset in the `:datasets` entry of `tidy-result`.
(def usenet-words
  (first (:datasets tidy-result)))
;; Two-column dataset built from the `:token-lookup-table` of
;; `tidy-result`: its keys become `:token`, its values `:token-idx`.
(def token-word-table
  (let [lookup (:token-lookup-table tidy-result)]
    (tc/dataset
     {:token     (keys lookup)
      :token-idx (vals lookup)})))
;; Stopwords left-joined with `token-word-table` on `:token`; the
;; duplicated `:right.token` column produced by the join is removed.
(def stopword-table
  (tc/drop-columns
   (tc/left-join stopword token-word-table :token)
   [:right.token]))
;; `usenet-words` without the rows whose `:token-idx` appears in
;; `stopword-table` (anti-join on `:token-idx`).
(def cleaned-usenet-words
  (tc/anti-join usenet-words stopword-table :token-idx))
;; Most frequent tokens after stopword removal.
;;
;; Steps: count rows per `:token-idx`, keep the 100 largest counts, attach
;; the token text, drop unwanted tokens, then show the 20 largest counts.
(let [token-counts (->> cleaned-usenet-words
                        (ds-reductions/group-by-column-agg
                         :token-idx
                         {:n (ds-reductions/row-count)}))
      top-100      (-> token-counts
                       (tc/order-by :n :desc)
                       (tc/head 100))
      with-text    (tc/left-join top-100 token-word-table [:token-idx])]
  (-> with-text
      ;; Rows with a nil or empty token.
      (tc/drop-rows #(empty? (:token %)))
      ;; `re-matches` has to match the entire token, so this pattern only
      ;; hits one-character tokens consisting of a lower-case letter or an
      ;; apostrophe.
      (tc/drop-rows #(some? (re-matches #"[a-z']$" (:token %))))
      ;; Sort again after the join before taking the final 20 rows.
      (tc/order-by :n :desc)
      (tc/head 20)))

;; Rendered output of the form above:
;;
;; left-outer-join [20 4]:
;;
;; :token-idx :n :right.token-idx :token
;; 1 15188 1 organization
;; 57 10579 57 subject
;; 15 10248 15 edu
;; 17 7961 17 posting
;; 18 7737 18 host
;; 16 7604 16 nntp
;; 8 7293 8 university
;; 552 4985 552 com
;; 365 4909 365 1
;; 9 4338 9 distribution
;; 82 4163 82 2
;; 258 4038 258 don
;; 1175 4017 1175 people
;; 279 3937 279 like
;; 318 3715 318 just
;; 597 3389 597 3
;; 25 3214 25 know
;; 259 3075 259 think
;; 429 2929 429 use
;; 679 2879 679 time

;; Shape and first rows of the tokenized dataset. The var defined in this
;; namespace is `usenet-words`; the symbol `usenet_words` used here before
;; does not exist and would fail to resolve.
(tc/shape usenet-words)
^kind/table
(tc/head usenet-words)

;; Ten highest tf-idf rows for the "sci.space" label, with the token text
;; attached.
(let [space-lines  (-> raw-text-cleaned
                       (tc/select-rows #(= "sci.space" (:label %)))
                       (tc/select-columns [:id :line]))
      ;; Attach the tokens of each line (`:line` on the left matches
      ;; `:document` on the right), then drop rows with missing values.
      space-tokens (-> space-lines
                       (tc/left-join cleaned-usenet-words
                                     {:left :line :right :document})
                       (tc/drop-missing))
      ;; Replace the per-line `:document` with the `:id` column, parsed
      ;; to integers.
      by-message   (-> space-tokens
                       (tc/drop-columns [:line :document])
                       (tc/rename-columns {:id :document})
                       (tc/add-column
                        :document
                        (fn [ds] (map Integer/parseInt (:document ds)))))
      top-10       (-> by-message
                       (text/->tfidf)
                       (tc/order-by :tfidf :desc)
                       (tc/head 10))]
  (-> top-10
      (tc/left-join token-word-table :token-idx)
      ;; Sort again after the join.
      (tc/order-by :tfidf :desc)))

;; Rendered output of the form above:
;;
;; left-outer-join [10 7]:
;;
;; :token-idx :document :tfidf :tf :token-count :right.token-idx :token
;; 23260 61057 1.49693847 0.50000000 1 23260 lesc
;; 9062 61183 0.71625966 0.33333334 1 9062 army
;; 9062 60855 0.71625966 0.33333334 1 9062 army
;; 20098 62317 0.65082812 0.33333334 1 20098 gps
;; 19076 60876 0.65082812 0.33333334 1 19076 debris
;; 7573 61072 0.63823187 0.33333334 1 7573 msfc
;; 44132 61093 0.61591631 0.33333334 1 44132 moonbase
;; 43966 61515 0.59877539 0.20000000 1 43966 brinkmeyer
;; 80993 61048 0.59795421 0.25000000 1 80993 spacewalk
;; 8422 61093 0.58780932 0.33333334 1 8422 race
;; source: notebooks/tidytext.clj