;; Namespace for the tidy-text notebook: text mining of the 20 newsgroups
;; corpus with tablecloth datasets and metamorph.ml text utilities.
(ns tidytext
  (:require
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.clay.v2.api :as clay]
   [scicloj.metamorph.ml.text :as text]
   [tech.v3.dataset.reductions :as ds-reductions]
   [tablecloth.api :as tc]
   [clojure.set :as set]
   [scicloj.tableplot.v1.hanami :as hanami]
   [aerial.hanami.templates :as ht]
   [scicloj.ml.smile.nlp :as nlp]
   [scicloj.kindly.v4.kind :as kind]))
;; Development helpers for rendering this notebook with Clay.
;; Wrapped in `comment` so nothing runs when the namespace is loaded.
(comment
  (clay/stop!)
  (clay/start!)
  (clay/make! {:source-path "notebooks/tidytext.clj"
               :show true}))
;; Single-column dataset (:token) holding the stopwords that
;; `nlp/resolve-stopwords` returns for the :default key.
(def stopword
  (tc/dataset {:token
               (nlp/resolve-stopwords :default)}))
(defn parse-f
  "Reads file `f` line by line and returns a seq with one map per line.

  Each map has the keys :type, :label, :id and :text. The first three are
  taken from the path components of `f` at positions 2, 3 and 4
  (zero-based, as produced by `.toPath`); :text is the line itself.
  The path must therefore have at least five components."
  [f]
  (let [path-components (->> (.toPath f) (.iterator) iterator-seq)
        type  (str (nth path-components 2))
        label (str (nth path-components 3))
        id    (str (nth path-components 4))
        ;; Realize all lines eagerly inside `with-open` so the reader is
        ;; closed before returning; a bare `(line-seq (io/reader f))`
        ;; never closes the underlying file handle.
        texts (with-open [rdr (io/reader f)]
                (vec (line-seq rdr)))]
    (map (fn [text]
           {:label label
            :text  text
            :id    id
            :type  type})
         texts)))
;; Dataset with one row per line of every regular file found under
;; "bigdata/20news-by-date" (files named ".keep" are skipped).
;; `parse-f` yields a seq of row maps per file; `flatten` merges those
;; per-file seqs into one seq of maps (maps themselves are not flattened).
(def raw-text
  (tc/dataset
   (flatten
    (->>
     (file-seq (io/file "bigdata/20news-by-date"))
     (filter #(.isFile %))
     (remove #(= ".keep" (.getName %)))
     (map parse-f)))))
;; Fail fast when no rows were loaded (e.g. the data directory is missing).
(assert (pos? (tc/row-count raw-text)) "raw-text is empty")
nil
;; raw-text with unwanted lines removed and a running :line index added.
;; Dropped rows: lines with empty text, and lines that do not fully match
;; the regex below (`re-matches` requires the whole string to match).
;; NOTE(review): inside a Clojure regex literal, `\\d` within a character
;; class is a literal backslash plus the letter `d`, not the digit class.
;; Confirm whether `\d` was intended; the pattern is kept unchanged here.
(def raw-text-cleaned
  (->
   raw-text
   (tc/drop-rows (fn [{:keys [text]}] (empty? text)))
   (tc/drop-rows (fn [{:keys [text]}] (nil? (re-matches #"^[^>]+[A-Za-z\\d]" text))))
   (tc/add-column :line (range))))
;; Report the shape of the raw dataset via tc/shape.
(tc/shape raw-text)
[834781 4]
;; Preview the first rows of raw-text; the kind/table metadata asks the
;; notebook renderer to display the result as a table.
^kind/table
(tc/head raw-text)
type | label | id | text |
---|---|---|---|
20news-bydate-train | talk.politics.misc | 178588 | From: ckincy@cs.umr.edu (Charles Kincy) |
20news-bydate-train | talk.politics.misc | 178588 | Subject: Re: Top Ten Signs That It's the Age of Aquarius on Pennsylvania Avenue |
20news-bydate-train | talk.politics.misc | 178588 | Nntp-Posting-Host: next4.cs.umr.edu |
20news-bydate-train | talk.politics.misc | 178588 | Organization: University of Missouri - Rolla, Rolla, MO |
20news-bydate-train | talk.politics.misc | 178588 | Lines: 79 |
;; Number of rows (lines) per newsgroup :label, stored in column :messages.
;; `->>` passes raw-text as the last argument of group-by-column-agg.
(def category-counts
  (->> raw-text
       (ds-reductions/group-by-column-agg
        :label
        {:messages (ds-reductions/row-count)})))
;; Bar chart of category-counts using the aerial.hanami bar-chart
;; template: :messages on X, :label (nominal) on Y.
(->
 category-counts
 (hanami/plot
  ht/bar-chart
  {:X :messages
   :Y :label
   :YTYPE :nominal}))
;; Bar chart of category-counts using the tableplot bar-chart template.
;; NOTE(review): axis assignment mirrors the ht/bar-chart plot of the same
;; data (:messages on x, :label on y) — confirm against the rendered chart.
(->
 category-counts
 (hanami/plot
  hanami/bar-chart
  {:=x :messages
   :=y :label}))
(defn tokenize-fn
  "Splits string `s` on runs of non-word characters (regex \\W+) and
  lower-cases every piece. Returns [] for nil or empty input.

  Because the split is done by `str/split`, input that starts with a
  non-word character yields an empty string as its first token."
  [s]
  (if (empty? s)
    []
    (map
     str/lower-case
     (str/split s #"\W+"))))
;; Result of text/->tidy-text over raw-text-cleaned. Arguments, in order:
;; a fn returning the :text column of the dataset, a fn mapping each line
;; to the pair [line nil], the tokenizer, and :int32 datatype options.
;; Later forms in this file read the keys :datasets and
;; :token-lookup-table from this result.
(def tidy-result
  (text/->tidy-text
   raw-text-cleaned
   (fn [df] (:text df))
   (fn [line] [line nil])
   tokenize-fn
   :datatype-token-pos :int32
   :datatype-document :int32
   :datatype-token-idx :int32))
;; First dataset found under :datasets in tidy-result.
(def usenet-words (-> tidy-result :datasets first))
;; Two-column dataset built from the :token-lookup-table map of
;; tidy-result: its keys become :token and its values become :token-idx.
(def token-word-table
  (tc/dataset
   {:token     (->> tidy-result :token-lookup-table keys)
    :token-idx (->> tidy-result :token-lookup-table vals)}))
;; Stopwords left-joined with token-word-table on :token, so each stopword
;; row carries the matching :token-idx; the duplicated :right.token column
;; produced by the join is dropped.
(def stopword-table
  (-> stopword
      (tc/left-join token-word-table :token)
      (tc/drop-columns [:right.token])))
;; usenet-words without the rows whose :token-idx appears in
;; stopword-table (anti-join on :token-idx).
(def cleaned-usenet-words
  (-> usenet-words
      (tc/anti-join stopword-table :token-idx)))
;; Most frequent tokens after stopword removal.
;; Steps: count rows per :token-idx, keep the 100 largest counts, attach
;; the token strings, drop empty tokens and tokens that are exactly one
;; character from [a-z'] (`re-matches` must match the whole token), then
;; show the 20 largest counts.
(->
 (ds-reductions/group-by-column-agg
  :token-idx
  {:n (ds-reductions/row-count)}
  cleaned-usenet-words)
 (tc/order-by :n :desc)
 (tc/head 100)
 (tc/left-join token-word-table [:token-idx])
 (tc/drop-rows (fn [{:keys [token]}] (empty? token)))
 (tc/drop-rows (fn [{:keys [token]}] (some? (re-matches #"[a-z']$" token))))
 (tc/order-by :n :desc)
 (tc/head 20))
left-outer-join [20 4]:
:token-idx | :n | :right.token-idx | :token |
---|---|---|---|
23 | 15188 | 23 | organization |
1 | 10579 | 1 | subject |
22 | 10248 | 22 | edu |
17 | 7961 | 17 | posting |
18 | 7737 | 18 | host |
16 | 7604 | 16 | nntp |
24 | 7293 | 24 | university |
751 | 4985 | 751 | com |
1215 | 4909 | 1215 | 1 |
550 | 4338 | 550 | distribution |
153 | 4163 | 153 | 2 |
351 | 4038 | 351 | don |
29 | 4017 | 29 | people |
486 | 3937 | 486 | like |
139 | 3715 | 139 | just |
2259 | 3389 | 2259 | 3 |
512 | 3214 | 512 | know |
87 | 3075 | 87 | think |
131 | 2929 | 131 | use |
515 | 2879 | 515 | time |
;; Shape and first rows of the tokenized dataset. The var defined in this
;; file is `usenet-words` (hyphen); `usenet_words` is not defined anywhere.
(tc/shape usenet-words)

^kind/table
(tc/head usenet-words)
;; tf-idf of tokens within the "sci.space" newsgroup.
;; Steps: keep sci.space rows and their :id/:line columns, join the
;; cleaned tokens by matching :line against the tokens' :document column
;; (presumably the line index assigned by ->tidy-text — confirm), drop
;; rows without a match, then replace :document by the message :id parsed
;; as an integer so tf-idf is computed per message. Finally show the 10
;; highest-scoring rows together with their token strings.
(-> raw-text-cleaned
    (tc/select-rows (fn [row] (= "sci.space" (:label row))))
    (tc/select-columns [:id :line])
    (tc/left-join cleaned-usenet-words {:left :line :right :document})
    (tc/drop-missing)
    (tc/drop-columns [:line :document])
    (tc/rename-columns {:id :document})
    ;; An explicit fn literal is used instead of the bare method value
    ;; `Integer/parseInt`, which only compiles on Clojure 1.12+.
    (tc/add-column :document (fn [ds]
                               (map #(Integer/parseInt ^String %) (:document ds))))
    (text/->tfidf)
    (tc/order-by :tfidf :desc)
    (tc/head 10)
    (tc/left-join token-word-table :token-idx)
    (tc/order-by :tfidf :desc))
left-outer-join [10 7]:
:token-idx | :document | :tfidf | :tf | :token-count | :right.token-idx | :token |
---|---|---|---|---|---|---|
10066 | 61057 | 1.49693847 | 0.50000000 | 1 | 10066 | lesc |
1490 | 60855 | 0.71625966 | 0.33333334 | 1 | 1490 | army |
1490 | 61183 | 0.71625966 | 0.33333334 | 1 | 1490 | army |
9054 | 62317 | 0.65082812 | 0.33333334 | 1 | 9054 | gps |
8354 | 60876 | 0.65082812 | 0.33333334 | 1 | 8354 | debris |
8550 | 61072 | 0.63823187 | 0.33333334 | 1 | 8550 | msfc |
9997 | 61093 | 0.61591631 | 0.33333334 | 1 | 9997 | moonbase |
68433 | 61515 | 0.59877539 | 0.20000000 | 1 | 68433 | brinkmeyer |
9803 | 61048 | 0.59795421 | 0.25000000 | 1 | 9803 | spacewalk |
2611 | 61093 | 0.58780932 | 0.33333334 | 1 | 2611 | race |
source: notebooks/tidytext.clj