(ns tidytext
  (:require
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.clay.v2.api :as clay]
   [scicloj.metamorph.ml.text :as text]
   [tech.v3.dataset.reductions :as ds-reductions]
   [tablecloth.api :as tc]
   [clojure.set :as set]
   [scicloj.tableplot.v1.hanami :as hanami]
   [aerial.hanami.templates :as ht]
   [scicloj.ml.smile.nlp :as nlp]
   [scicloj.kindly.v4.kind :as kind]))

;; REPL helpers for (re)rendering this notebook with Clay; never evaluated on load.
(comment
  (clay/stop!)
  (clay/start!)
  (clay/make! {:source-path "notebooks/tidytext.clj"
               :show true}))

;; One-column dataset (:token) built from the default stopword list.
(def stopword
  (tc/dataset {:token
               (nlp/resolve-stopwords :default)}))

(defn parse-f
  "Reads file `f` and returns a vector with one map per line:
  {:label .. :text .. :id .. :type ..}.

  :type, :label and :id are taken from path components 2, 3 and 4 of `f`
  (0-based), so `f` must be a relative path shaped like
  bigdata/20news-by-date/<type>/<label>/<id>, which is how `raw-text`
  below enumerates the corpus.

  The file is read eagerly inside `with-open`, so the reader is closed
  before returning instead of being left open once per corpus file."
  [f]
  (let [path-components (->> (.toPath f) (.iterator) iterator-seq)
        type (str (nth path-components 2))
        label (str (nth path-components 3))
        id (str (nth path-components 4))]
    (with-open [rdr (io/reader f)]
      ;; mapv realizes every line before with-open closes the reader.
      ;; hash-map (not a map literal) keeps the key order, and therefore the
      ;; dataset column order, the same as before.
      (mapv (fn [text]
              (hash-map :label label
                        :text text
                        :id id
                        :type type))
            (line-seq rdr)))))

;; One row per line of every regular file under bigdata/20news-by-date,
;; skipping ".keep" placeholder files.
(def raw-text
  (tc/dataset
   (flatten
    (->>
     (file-seq (io/file "bigdata/20news-by-date"))
     (filter #(.isFile %))
     (remove #(= ".keep" (.getName %)))
     (map parse-f)))))

(assert (pos? (tc/row-count raw-text)) "raw-text is empty")
nil

;; raw-text without empty lines and without lines rejected by the regex below,
;; plus a running :line index.
(def raw-text-cleaned
  (->
   raw-text
   (tc/drop-rows (fn [{:keys [text]}]
                   (empty? text)))
   ;; Regex literals need no double escaping: `\d` is the digit class, whereas
   ;; `\\d` would mean "a literal backslash or the letter d".
   ;; NOTE(review): re-matches requires the WHOLE line to match, i.e. no '>'
   ;; anywhere and a letter/digit as the last character. If "starts with
   ;; non-'>' text containing a letter/digit" was intended, use re-find — confirm.
   (tc/drop-rows (fn [{:keys [text]}] (nil? (re-matches #"^[^>]+[A-Za-z\d]" text))))
   (tc/add-column :line (range))))

(tc/shape raw-text)
[834781 4]
^kind/table
(tc/head
 raw-text)
| type | label | id | text |
|---|---|---|---|
| 20news-bydate-test | comp.sys.mac.hardware | 52034 | From: nevai@mps.ohio-state.edu (Paul Nevai) |
| 20news-bydate-test | comp.sys.mac.hardware | 52034 | Subject: Monitors - should they be kept on 24 hours a day??? |
| 20news-bydate-test | comp.sys.mac.hardware | 52034 | Organization: Department of Mathematics, The Ohio State University |
| 20news-bydate-test | comp.sys.mac.hardware | 52034 | Lines: 9 |
| 20news-bydate-test | comp.sys.mac.hardware | 52034 | Distribution: world |
;; Row count per :label. raw-text holds one row per line, so :messages is a
;; count of lines, not of distinct message ids.
(def category-counts
  (->> raw-text
       (ds-reductions/group-by-column-agg
        :label
        {:messages (ds-reductions/row-count)})))

;; Bar chart of category-counts using the plain Hanami template.
(->
 category-counts
 (hanami/plot
  ht/bar-chart {:Y :label
                :X :messages
                :YTYPE :nominal}))

;; Same chart using the tableplot bar-chart template and its := keys.
(->
 category-counts
 (hanami/plot
  hanami/bar-chart
  {:=y :label
   :=x :messages}))

(defn tokenize-fn
  "Splits `s` on runs of non-word characters and lower-cases each piece.

  Returns a (possibly empty) sequence of non-empty strings; nil or \"\"
  yields [].

  `str/split` produces a leading \"\" when `s` starts with a non-word
  character (e.g. an indented line). Those empty strings are removed here so
  they never become tokens."
  [s]
  (if (empty? s)
    []
    (->> (str/split s #"\W+")
         (remove empty?)
         (map str/lower-case))))

;; Tidy-text conversion of raw-text-cleaned: each row's :text is one document,
;; tokenized with tokenize-fn; index columns are stored as int32.
(def tidy-result
  (->
   (text/->tidy-text
    raw-text-cleaned
    (fn [df] (:text df))
    (fn [line] [line nil])
    tokenize-fn
    :datatype-token-pos :int32
    :datatype-document :int32
    :datatype-token-idx :int32)))

;; First dataset of the tidy-text result.
(def usenet-words (-> tidy-result :datasets first))

;; Lookup dataset pairing each :token with its :token-idx.
(def token-word-table
  (tc/dataset
   {:token (->> tidy-result :token-lookup-table keys)
    :token-idx (->> tidy-result :token-lookup-table vals)}))

;; Stopwords joined to their :token-idx, with the duplicated right-hand key
;; column dropped.
(def stopword-table
  (-> stopword
      (tc/left-join token-word-table :token)
      (tc/drop-columns [:right.token])))

;; usenet-words minus every row whose :token-idx appears in stopword-table.
(def cleaned-usenet-words
  (-> usenet-words
      (tc/anti-join stopword-table :token-idx)))

;; Most frequent non-stopword tokens: count rows per :token-idx, take the top
;; 100, attach the token text, filter, and show the top 20.
(->
 (ds-reductions/group-by-column-agg
  :token-idx
  {:n (ds-reductions/row-count)}
  cleaned-usenet-words)
 (tc/order-by :n :desc)
 (tc/head 100)
 (tc/left-join token-word-table [:token-idx])
 (tc/drop-rows (fn [{:keys [token]}]
                 (empty? token)))
 ;; re-matches must match the entire token, so this drops only one-character
 ;; tokens that are a lowercase letter or an apostrophe.
 ;; NOTE(review): if "keep tokens ending in a letter" was intended, this
 ;; should be (nil? (re-find ...)) — confirm.
 (tc/drop-rows (fn [{:keys [token]}]
                 (some? (re-matches #"[a-z']$" token))))
 ;; Re-sort after the join before taking the final top 20.
 (tc/order-by :n :desc)
 (tc/head 20))
left-outer-join [20 4]:
| :token-idx | :n | :right.token-idx | :token |
|---|---|---|---|
| 1 | 15188 | 1 | organization |
| 57 | 10579 | 57 | subject |
| 15 | 10248 | 15 | edu |
| 17 | 7961 | 17 | posting |
| 18 | 7737 | 18 | host |
| 16 | 7604 | 16 | nntp |
| 8 | 7293 | 8 | university |
| 552 | 4985 | 552 | com |
| 365 | 4909 | 365 | 1 |
| 9 | 4338 | 9 | distribution |
| 82 | 4163 | 82 | 2 |
| 258 | 4038 | 258 | don |
| 1175 | 4017 | 1175 | people |
| 279 | 3937 | 279 | like |
| 318 | 3715 | 318 | just |
| 597 | 3389 | 597 | 3 |
| 25 | 3214 | 25 | know |
| 259 | 3075 | 259 | think |
| 429 | 2929 | 429 | use |
| 679 | 2879 | 679 | time |
(tc/shape usenet-words) ^kind/table (tc/head usenet-words)
;; Highest tf-idf terms in the sci.space group, treating each message id as
;; one document.
(-> raw-text-cleaned
    (tc/select-rows (fn [row] (= "sci.space" (:label row))))
    (tc/select-columns [:id :line])
    ;; Attach tokens to each line: :line here is matched against :document in
    ;; cleaned-usenet-words.
    (tc/left-join cleaned-usenet-words {:left :line :right :document})
    (tc/drop-missing)
    (tc/drop-columns [:line :document])
    ;; From here on, the message id plays the role of :document.
    (tc/rename-columns {:id :document})
    ;; Explicit, type-hinted fn instead of passing Integer/parseInt as a value:
    ;; a bare static method is only usable as a function on Clojure 1.12+, and
    ;; is resolved reflectively there.
    (tc/add-column :document
                   (fn [ds]
                     (map (fn [^String id] (Integer/parseInt id))
                          (:document ds))))
    (text/->tfidf)
    (tc/order-by :tfidf :desc)
    (tc/head 10)
    (tc/left-join token-word-table :token-idx)
    ;; Re-sort after the join so the result is still ordered by :tfidf.
    (tc/order-by :tfidf :desc))
left-outer-join [10 7]:
| :token-idx | :document | :tfidf | :tf | :token-count | :right.token-idx | :token |
|---|---|---|---|---|---|---|
| 23260 | 61057 | 1.49693847 | 0.50000000 | 1 | 23260 | lesc |
| 9062 | 61183 | 0.71625966 | 0.33333334 | 1 | 9062 | army |
| 9062 | 60855 | 0.71625966 | 0.33333334 | 1 | 9062 | army |
| 20098 | 62317 | 0.65082812 | 0.33333334 | 1 | 20098 | gps |
| 19076 | 60876 | 0.65082812 | 0.33333334 | 1 | 19076 | debris |
| 7573 | 61072 | 0.63823187 | 0.33333334 | 1 | 7573 | msfc |
| 44132 | 61093 | 0.61591631 | 0.33333334 | 1 | 44132 | moonbase |
| 43966 | 61515 | 0.59877539 | 0.20000000 | 1 | 43966 | brinkmeyer |
| 80993 | 61048 | 0.59795421 | 0.25000000 | 1 | 80993 | spacewalk |
| 8422 | 61093 | 0.58780932 | 0.33333334 | 1 | 8422 | race |
source: notebooks/tidytext.clj