(ns tidytext
  (:require
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.clay.v2.api :as clay]
   [scicloj.metamorph.ml.text :as text]
   [tech.v3.dataset.reductions :as ds-reductions]
   [tablecloth.api :as tc]
   [clojure.set :as set]
   [scicloj.tableplot.v1.hanami :as hanami]
   [aerial.hanami.templates :as ht]
   [scicloj.ml.smile.nlp :as nlp]
   [scicloj.kindly.v4.kind :as kind]))
(comment
  ;; Manual REPL helpers: nothing inside `comment` is evaluated on load.
  ;; Stop and then start Clay.
  (clay/stop!)
  (clay/start!)
  ;; Build this notebook from its source path.
  ;; NOTE(review): `:show true` presumably opens the rendered result - confirm.
  (clay/make! {:show true
               :source-path "notebooks/tidytext.clj"}))
(def stopword
  "Single-column dataset whose `:token` column holds the result of
  `(nlp/resolve-stopwords :default)`."
  (-> {:token (nlp/resolve-stopwords :default)}
      (tc/dataset)))
(defn parse-f
  "Reads file `f` line by line and returns a vector with one map per line,
  each having the keys `:label`, `:text`, `:id` and `:type`.

  `:type`, `:label` and `:id` are the string forms of the path components of
  `f` at zero-based positions 2, 3 and 4, so `f` needs at least five path
  components; a shorter path throws `IndexOutOfBoundsException`. `:text` is
  the line itself.

  All lines are realized inside `with-open`, so the reader is closed before
  the function returns rather than being left open for every file parsed."
  [f]
  (let [components (iterator-seq (.iterator (.toPath f)))
        doc-type   (str (nth components 2))
        label      (str (nth components 3))
        id         (str (nth components 4))]
    (with-open [rdr (io/reader f)]
      ;; `mapv` is eager: a lazy `map` would escape `with-open` and then read
      ;; from a closed reader. `hash-map` keeps the same map type as before.
      (mapv #(hash-map :label label
                       :text %
                       :id id
                       :type doc-type)
            (line-seq rdr)))))
(def raw-text
  "Dataset with one row per line of every regular file found under
  `bigdata/20news-by-date` (files named `.keep` are skipped); the rows are
  the maps produced by `parse-f`."
  (->> (io/file "bigdata/20news-by-date")
       (file-seq)
       (filter (fn [f] (.isFile f)))
       (remove (fn [f] (= ".keep" (.getName f))))
       (mapcat parse-f)
       (tc/dataset)))
(assert (pos? (tc/row-count raw-text)) "raw-text is empty")
nil
(def raw-text-cleaned
  "`raw-text` without empty lines and without lines that fail the content
  filter below, plus a `:line` column numbering the remaining rows from 0.

  A line passes the content filter when it does not start with `>` and
  contains a letter or digit after its first character."
  (-> raw-text
      (tc/drop-rows (fn [{:keys [text]}]
                      (empty? text)))
      ;; `re-find`, not `re-matches`: the pattern is a prefix test. With
      ;; `re-matches` the whole line had to match, which discarded every line
      ;; containing `>` anywhere or ending in punctuation.
      ;; `\d` is the digit class; the former `\\d` inside the regex literal
      ;; meant "a backslash or the letter d", so digits never matched.
      (tc/drop-rows (fn [{:keys [text]}]
                      (nil? (re-find #"^[^>]+[A-Za-z\d]" text))))
      (tc/add-column :line (range))))
(-> raw-text
    ;; Dimensions of the unfiltered dataset.
    (tc/shape))
[834781 4]
^kind/table
(tc/head raw-text)
type label id text
20news-bydate-train talk.politics.misc 178588 From: ckincy@cs.umr.edu (Charles Kincy)
20news-bydate-train talk.politics.misc 178588 Subject: Re: Top Ten Signs That It's the Age of Aquarius on Pennsylvania Avenue
20news-bydate-train talk.politics.misc 178588 Nntp-Posting-Host: next4.cs.umr.edu
20news-bydate-train talk.politics.misc 178588 Organization: University of Missouri - Rolla, Rolla, MO
20news-bydate-train talk.politics.misc 178588 Lines: 79
(def category-counts
  "Row counts of `raw-text` grouped by `:label`; the count is stored in the
  `:messages` column."
  (ds-reductions/group-by-column-agg
   :label
   {:messages (ds-reductions/row-count)}
   raw-text))
(hanami/plot
 category-counts
 ;; Hanami `ht/bar-chart` template: `:label` on Y (declared nominal),
 ;; `:messages` on X.
 ht/bar-chart
 {:X :messages
  :Y :label
  :YTYPE :nominal})
(hanami/plot
 category-counts
 ;; Same columns, plotted with the tableplot `hanami/bar-chart` template and
 ;; its `:=x` / `:=y` substitution keys.
 hanami/bar-chart
 {:=x :messages
  :=y :label})
(defn tokenize-fn
  "Splits string `s` into lower-cased word tokens.

  A token is a maximal run of regex word characters. Returns an empty vector
  when `s` is nil, empty, or contains no word characters.

  Tokens are extracted with `re-seq` instead of splitting on runs of non-word
  characters: splitting produced a spurious empty-string first token whenever
  `s` began with a non-word character (for example leading whitespace)."
  [s]
  (if (empty? s)
    []
    (into [] (map str/lower-case) (re-seq #"\w+" s))))
(def tidy-result
  "Result of `text/->tidy-text` applied to `raw-text-cleaned`: the text is
  taken from the `:text` column, each line is paired with nil, and tokens are
  produced by `tokenize-fn`. The token position, document and token index
  datatypes are all requested as `:int32`. Later forms read the `:datasets`
  and `:token-lookup-table` entries of this value."
  (text/->tidy-text
   raw-text-cleaned
   #(:text %)
   #(vector % nil)
   tokenize-fn
   :datatype-document :int32
   :datatype-token-idx :int32
   :datatype-token-pos :int32))
(def usenet-words
  "First dataset in the `:datasets` entry of `tidy-result`."
  (first (:datasets tidy-result)))
(def token-word-table
  "Two-column dataset built from the `:token-lookup-table` of `tidy-result`:
  its keys go in `:token` and its values in `:token-idx`."
  (let [lookup (:token-lookup-table tidy-result)]
    (tc/dataset {:token (keys lookup)
                 :token-idx (vals lookup)})))
(def stopword-table
  "`stopword` left-joined with `token-word-table` on `:token`, with the
  `:right.token` column removed afterwards."
  (tc/drop-columns
   (tc/left-join stopword token-word-table :token)
   [:right.token]))
(def cleaned-usenet-words
  "Anti-join of `usenet-words` against `stopword-table` on `:token-idx`."
  (tc/anti-join usenet-words stopword-table :token-idx))
(let [token-counts (ds-reductions/group-by-column-agg
                    :token-idx
                    {:n (ds-reductions/row-count)}
                    cleaned-usenet-words)
      ;; Rows to discard: empty tokens, and tokens fully matched by the
      ;; pattern below.
      ;; NOTE(review): `re-matches` needs a whole-string match, so this only
      ;; discards one-character tokens (a-z or an apostrophe). If the intent
      ;; was "keep tokens ending in a letter", `re-find` plus an inverted test
      ;; would be needed - confirm before changing.
      unwanted? (fn [{:keys [token]}]
                  (or (empty? token)
                      (some? (re-matches #"[a-z']$" token))))]
  (-> token-counts
      (tc/order-by :n :desc)
      ;; Only the 100 most frequent token ids are joined back to their text.
      (tc/head 100)
      (tc/left-join token-word-table [:token-idx])
      (tc/drop-rows unwanted?)
      (tc/order-by :n :desc)
      (tc/head 20)))

left-outer-join [20 4]:

:token-idx :n :right.token-idx :token
23 15188 23 organization
1 10579 1 subject
22 10248 22 edu
17 7961 17 posting
18 7737 18 host
16 7604 16 nntp
24 7293 24 university
751 4985 751 com
1215 4909 1215 1
550 4338 550 distribution
153 4163 153 2
351 4038 351 don
29 4017 29 people
486 3937 486 like
139 3715 139 just
2259 3389 2259 3
512 3214 512 know
87 3075 87 think
131 2929 131 use
515 2879 515 time

(tc/shape usenet-words) ^kind/table (tc/head usenet-words)

(-> raw-text-cleaned
    (tc/select-rows (fn [row] (= "sci.space" (:label row))))
    (tc/select-columns [:id :line])
    ;; Attach the non-stopword tokens of each line (line number = :document).
    (tc/left-join cleaned-usenet-words {:left :line  :right :document})
    (tc/drop-missing)
    ;; From here on the message id, not the line number, is the document.
    (tc/drop-columns [:line :document])
    (tc/rename-columns {:id :document})
    ;; Parse the id strings into integers. The type-hinted wrapper replaces a
    ;; bare `Integer/parseInt` function value, which only compiles on
    ;; Clojure 1.12+ and picks the overload reflectively on each call.
    (tc/add-column :document
                   (fn [ds]
                     (map (fn [^String id] (Integer/parseInt id))
                          (:document ds))))
    (text/->tfidf)
    (tc/order-by :tfidf :desc)
    (tc/head 10)
    ;; Bring back the token text for the ten highest-scoring rows.
    (tc/left-join token-word-table :token-idx)
    (tc/order-by :tfidf :desc))

left-outer-join [10 7]:

:token-idx :document :tfidf :tf :token-count :right.token-idx :token
10066 61057 1.49693847 0.50000000 1 10066 lesc
1490 60855 0.71625966 0.33333334 1 1490 army
1490 61183 0.71625966 0.33333334 1 1490 army
9054 62317 0.65082812 0.33333334 1 9054 gps
8354 60876 0.65082812 0.33333334 1 8354 debris
8550 61072 0.63823187 0.33333334 1 8550 msfc
9997 61093 0.61591631 0.33333334 1 9997 moonbase
68433 61515 0.59877539 0.20000000 1 68433 brinkmeyer
9803 61048 0.59795421 0.25000000 1 9803 spacewalk
2611 61093 0.58780932 0.33333334 1 2611 race
source: notebooks/tidytext.clj