tidytext – metamorph.ml topics

(ns tidytext
  (:require
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.clay.v2.api :as clay]
   [scicloj.metamorph.ml.text :as text]
   [tech.v3.dataset.reductions :as ds-reductions]
   [tablecloth.api :as tc]
   [clojure.set :as set]
   [scicloj.tableplot.v1.hanami :as hanami]
   [aerial.hanami.templates :as ht]
   [scicloj.ml.smile.nlp :as nlp]
   [scicloj.kindly.v4.kind :as kind]))

;; REPL-only helpers: forms inside `comment` are never evaluated when the
;; namespace is loaded; evaluate them by hand from the editor.
(comment
  (clay/stop!)
  (clay/start!)
  ;; Renders this notebook's source file with Clay.
  ;; NOTE(review): `:show true` presumably opens the rendered page in the
  ;; browser -- confirm against the Clay API docs.
  (clay/make! {:source-path "notebooks/tidytext.clj"
               :show true}))

;; One-column dataset (`:token`) built from whatever
;; `nlp/resolve-stopwords` returns for the `:default` key.
(def stopword
  (let [words (nlp/resolve-stopwords :default)]
    (tc/dataset {:token words})))

(defn parse-f
  "Reads file `f` line by line and returns a vector with one map per line.

  Each map has the keys:
    :text  - the line itself
    :type  - path component 2 of `f`
    :label - path component 3 of `f`
    :id    - path component 4 of `f`

  The component indices are zero-based positions in `(.toPath f)`, so `f`
  must have at least five path components (otherwise `nth` throws).

  The reader is closed before the function returns; the result is therefore
  fully realized (a vector) instead of a lazy seq over an open reader."
  [f]
  (let [components (mapv str (iterator-seq (.iterator (.toPath f))))
        ;; named `split` rather than `type` to avoid shadowing clojure.core/type
        split      (nth components 2)
        label      (nth components 3)
        id         (nth components 4)]
    (with-open [rdr (io/reader f)]
      ;; mapv is eager: every line is consumed while the reader is still open.
      (mapv (fn [line]
              {:label label
               :text  line
               :id    id
               :type  split})
            (line-seq rdr)))))

;; Dataset with one row per line of every regular file found (recursively)
;; under "bigdata/20news-by-date", skipping files named ".keep".
;; Each file is turned into row maps by `parse-f`.
(def raw-text
  (let [corpus-files (->> (io/file "bigdata/20news-by-date")
                          file-seq
                          (filter #(.isFile %))
                          (remove #(= ".keep" (.getName %))))]
    (tc/dataset (mapcat parse-f corpus-files))))

;; Fail early when the corpus directory is missing or empty.
(assert (pos? (tc/row-count raw-text)) "raw-text is empty")

nil

;; `raw-text` with unwanted lines removed and a running `:line` number added.
;;
;; Rows are dropped when
;;   * `:text` is empty, or
;;   * the line does not start with one or more non-`>` characters followed
;;     by an ASCII letter or digit.
;;
;; Two details matter for the second filter:
;;   * `re-find` is used so the pattern only has to match a prefix of the
;;     line. `re-matches` would require the WHOLE line to match, which throws
;;     away every line that merely ends in punctuation.
;;   * Inside a Clojure regex literal a digit is written `\d`. `\\d` is an
;;     escaped backslash followed by the letter `d`, so digits never matched.
(def raw-text-cleaned
  (-> raw-text
      (tc/drop-rows (fn [{:keys [text]}]
                      (empty? text)))
      (tc/drop-rows (fn [{:keys [text]}]
                      (nil? (re-find #"^[^>]+[A-Za-z\d]" text))))
      ;; `:line` is assigned after filtering, so it numbers the kept rows.
      (tc/add-column :line (range))))

;; Shape of the unfiltered dataset, as reported by `tc/shape`.
(tc/shape raw-text)

[834781 4]

;; First rows of the unfiltered dataset.
;; NOTE(review): the `kind/table` metadata presumably asks the notebook
;; renderer to show the value as a table -- confirm against Kindly docs.
^kind/table
(tc/head
 raw-text)

type	label	id	text
20news-bydate-train	talk.politics.misc	178588	From: ckincy@cs.umr.edu (Charles Kincy)
20news-bydate-train	talk.politics.misc	178588	Subject: Re: Top Ten Signs That It's the Age of Aquarius on Pennsylvania Avenue
20news-bydate-train	talk.politics.misc	178588	Nntp-Posting-Host: next4.cs.umr.edu
20news-bydate-train	talk.politics.misc	178588	Organization: University of Missouri - Rolla, Rolla, MO
20news-bydate-train	talk.politics.misc	178588	Lines: 79

;; Row count of `raw-text` per `:label`, stored in column `:messages`.
;; `raw-text` has one row per line of text (see `parse-f`), so the count is
;; a number of lines.
(def category-counts
  (let [aggregations {:messages (ds-reductions/row-count)}]
    (ds-reductions/group-by-column-agg :label aggregations raw-text)))

;; Bar chart of `category-counts` built from the plain Hanami template:
;; `:messages` on X, `:label` (nominal) on Y.
(hanami/plot category-counts
             ht/bar-chart
             {:X     :messages
              :Y     :label
              :YTYPE :nominal})

;; Same chart using the tableplot bar-chart template and its `:=x` / `:=y`
;; substitution keys.
(hanami/plot category-counts
             hanami/bar-chart
             {:=x :messages
              :=y :label})

(defn tokenize-fn
  "Splits `s` on runs of non-word characters (regex `\\W+`) and lower-cases
  every piece. Returns an empty vector for nil or empty input, otherwise a
  lazy seq of strings.

  A string that starts with a non-word character yields a leading empty
  string token, because that is how the underlying split behaves."
  [s]
  (if (seq s)
    (map str/lower-case (str/split s #"\W+"))
    []))

;; Result of tokenizing `raw-text-cleaned` with `text/->tidy-text`.
;; Arguments, in order: the dataset, a fn extracting the `:text` column,
;; a fn turning one line into a `[line nil]` pair, and the tokenizer.
;; The trailing options request :int32 storage for token position, document
;; and token index.
;; Other definitions in this file read `:datasets` and `:token-lookup-table`
;; from the result.
(def tidy-result
  (text/->tidy-text
   raw-text-cleaned
   (fn [ds] (:text ds))
   (fn [text-line] [text-line nil])
   tokenize-fn
   :datatype-token-pos :int32
   :datatype-document :int32
   :datatype-token-idx :int32))

;; The first dataset in the `:datasets` entry of `tidy-result`.
(def usenet-words
  (first (:datasets tidy-result)))

;; Two-column dataset built from the `:token-lookup-table` map of
;; `tidy-result`: its keys become `:token`, its values `:token-idx`.
(def token-word-table
  (let [lookup (:token-lookup-table tidy-result)]
    (tc/dataset {:token     (keys lookup)
                 :token-idx (vals lookup)})))

;; Stopwords left-joined with `token-word-table` on `:token`, so each
;; stopword row gains a `:token-idx` column. The duplicated join key
;; `:right.token` is removed afterwards.
(def stopword-table
  (let [joined (tc/left-join stopword token-word-table :token)]
    (tc/drop-columns joined [:right.token])))

;; `usenet-words` anti-joined against `stopword-table` on `:token-idx`,
;; i.e. without the rows whose token index belongs to a stopword.
(def cleaned-usenet-words
  (tc/anti-join usenet-words stopword-table :token-idx))

;; Most frequent tokens after stopword removal.
;; Steps: count rows per `:token-idx`, keep the 100 largest counts, attach the
;; token strings, drop unwanted tokens, then show the top 20.
(let [counts   (ds-reductions/group-by-column-agg
                :token-idx
                {:n (ds-reductions/row-count)}
                cleaned-usenet-words)
      top-100  (-> counts
                   (tc/order-by :n :desc)
                   (tc/head 100))
      labelled (tc/left-join top-100 token-word-table [:token-idx])]
  (-> labelled
      (tc/drop-rows
       (fn [{:keys [token]}]
         (or (empty? token)
             ;; `re-matches` needs the whole token to match, so this pattern
             ;; only hits one-character tokens: a lowercase letter or `'`.
             ;; NOTE(review): if the intent was "keep only tokens ending in a
             ;; letter", this needs `re-find` and the opposite row selection.
             (some? (re-matches #"[a-z']$" token)))))
      (tc/order-by :n :desc)
      (tc/head 20)))

left-outer-join [20 4]:

:token-idx	:n	:right.token-idx	:token
23	15188	23	organization
1	10579	1	subject
22	10248	22	edu
17	7961	17	posting
18	7737	18	host
16	7604	16	nntp
24	7293	24	university
751	4985	751	com
1215	4909	1215	1
550	4338	550	distribution
153	4163	153	2
351	4038	351	don
29	4017	29	people
486	3937	486	like
139	3715	139	just
2259	3389	2259	3
512	3214	512	know
87	3075	87	think
131	2929	131	use
515	2879	515	time

;; Shape and first rows of the tokenized dataset. The var is named
;; `usenet-words`; `usenet_words` is not defined in this namespace.
(tc/shape usenet-words)

^kind/table
(tc/head usenet-words)

;; Ten highest tf-idf entries within the "sci.space" label.
;;
;; Steps:
;;   1. keep the `:id` / `:line` columns of the sci.space rows,
;;   2. attach the tokens of each line (`:line` matches `:document` in
;;      `cleaned-usenet-words`) and drop rows with missing values,
;;   3. replace the per-line `:document` with the message `:id`, parsed to an
;;      integer, so that a "document" is a whole message,
;;   4. compute tf-idf, keep the ten highest scores and attach token strings.
(-> raw-text-cleaned
    (tc/select-rows (fn [row] (= "sci.space" (:label row))))
    (tc/select-columns [:id :line])
    (tc/left-join cleaned-usenet-words {:left :line :right :document})
    (tc/drop-missing)
    (tc/drop-columns [:line :document])
    (tc/rename-columns {:id :document})
    ;; An explicit, type-hinted fn instead of the bare `Integer/parseInt`
    ;; method value: the bare form only compiles on Clojure 1.12+ and there
    ;; it picks the overload reflectively.
    (tc/add-column :document
                   (fn [ds]
                     (map (fn [^String id] (Integer/parseInt id))
                          (:document ds))))
    (text/->tfidf)
    (tc/order-by :tfidf :desc)
    (tc/head 10)
    (tc/left-join token-word-table :token-idx)
    ;; The join does not promise to keep the order, so sort again.
    (tc/order-by :tfidf :desc))

left-outer-join [10 7]:

:token-idx	:document	:tfidf	:tf	:token-count	:right.token-idx	:token
10066	61057	1.49693847	0.50000000	1	10066	lesc
1490	60855	0.71625966	0.33333334	1	1490	army
1490	61183	0.71625966	0.33333334	1	1490	army
9054	62317	0.65082812	0.33333334	1	9054	gps
8354	60876	0.65082812	0.33333334	1	8354	debris
8550	61072	0.63823187	0.33333334	1	8550	msfc
9997	61093	0.61591631	0.33333334	1	9997	moonbase
68433	61515	0.59877539	0.20000000	1	68433	brinkmeyer
9803	61048	0.59795421	0.25000000	1	9803	spacewalk
2611	61093	0.58780932	0.33333334	1	2611	race

source: notebooks/tidytext.clj