(ns tidytext
  "Notebook that loads newsgroup posts from `bigdata/20news-by-date`,
  tokenizes them and computes word counts and tf-idf.

  Each alias below is the one the rest of the file uses for that library."
  (:require
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.clay.v2.api :as clay]
   [scicloj.metamorph.ml.text :as text]
   [tech.v3.dataset.reductions :as ds-reductions]
   [tablecloth.api :as tc]
   [clojure.set :as set]
   [scicloj.tableplot.v1.hanami :as hanami]
   [aerial.hanami.templates :as ht]
   [scicloj.ml.smile.nlp :as nlp]
   [scicloj.kindly.v4.kind :as kind]))
;; REPL helpers for rendering this notebook with Clay. Wrapped in `comment`
;; so nothing here runs when the namespace is loaded.
(comment
  (clay/stop!)
  (clay/start!)
  (clay/make! {:source-path "notebooks/tidytext.clj"
               :show true}))
(def stopword
  "Dataset with a single `:token` column holding the result of
  `(nlp/resolve-stopwords :default)`."
  (tc/dataset {:token (nlp/resolve-stopwords :default)}))
(defn parse-f
  "Reads file `f` line by line and returns a seq with one map per line.

  Each map has the keys:
    :text  - the line itself
    :type  - path component at index 2 of `f`, as a string
    :label - path component at index 3 of `f`, as a string
    :id    - path component at index 4 of `f`, as a string

  `f` must be something with a `.toPath` method (e.g. a java.io.File) whose
  path has at least five components; `nth` throws otherwise."
  [f]
  (let [path-components (->> (.toPath f) (.iterator) iterator-seq)
        doc-type (str (nth path-components 2))
        label    (str (nth path-components 3))
        id       (str (nth path-components 4))
        ;; Realize all lines while the reader is open so the file handle is
        ;; released right away instead of lingering until GC.
        texts    (with-open [rdr (io/reader f)]
                   (vec (line-seq rdr)))]
    (map (fn [text]
           (hash-map :label label
                     :text text
                     :id id
                     :type doc-type))
         texts)))
(def raw-text
  "Dataset built from every regular file under `bigdata/20news-by-date`
  (files named `.keep` are skipped), one row per line as produced by
  `parse-f`."
  (->> (file-seq (io/file "bigdata/20news-by-date"))
       (filter #(.isFile %))
       (remove #(= ".keep" (.getName %)))
       ;; mapcat concatenates the per-file row seqs into one flat seq of maps
       (mapcat parse-f)
       (tc/dataset)))
;; Fail early when no rows were loaded into raw-text.
(assert (pos? (tc/row-count raw-text)) "raw-text is empty")
nil
(def raw-text-cleaned
  "`raw-text` with empty lines removed, then with every line removed that
  does not fully match the regex below, plus a `:line` column taken from
  `(range)`."
  (-> raw-text
      (tc/drop-rows (fn [{:keys [text]}] (empty? text)))
      ;; `re-matches` needs the whole line to match. Inside the character
      ;; class, `\\d` is an escaped backslash followed by a literal `d`.
      ;; NOTE(review): `\d` (any digit) may have been intended; left as-is
      ;; so the results shown in this notebook do not change.
      (tc/drop-rows (fn [{:keys [text]}]
                      (nil? (re-matches #"^[^>]+[A-Za-z\\d]" text))))
      (tc/add-column :line (range))))
;; Show the shape of the raw-text dataset.
(tc/shape raw-text)
[834781 4]
;; Show the head of raw-text; the form is tagged with kind/table metadata.
^kind/table
(tc/head raw-text)
type | label | id | text |
---|---|---|---|
20news-bydate-test | comp.sys.mac.hardware | 52034 | From: nevai@mps.ohio-state.edu (Paul Nevai) |
20news-bydate-test | comp.sys.mac.hardware | 52034 | Subject: Monitors - should they be kept on 24 hours a day??? |
20news-bydate-test | comp.sys.mac.hardware | 52034 | Organization: Department of Mathematics, The Ohio State University |
20news-bydate-test | comp.sys.mac.hardware | 52034 | Lines: 9 |
20news-bydate-test | comp.sys.mac.hardware | 52034 | Distribution: world |
(def category-counts
  "Row count of `raw-text` per `:label`, stored in a `:messages` column."
  (->> raw-text
       ;; ->> passes raw-text as the last argument
       (ds-reductions/group-by-column-agg
        :label
        {:messages (ds-reductions/row-count)})))
;; Bar chart of category-counts using the aerial.hanami `ht/bar-chart`
;; template: :messages on X, :label (nominal) on Y.
(-> category-counts
    (hanami/plot ht/bar-chart
                 {:X :messages
                  :Y :label
                  :YTYPE :nominal}))
;; Same chart via the tableplot `hanami/bar-chart` template and its
;; `:=x` / `:=y` keys.
(-> category-counts
    (hanami/plot hanami/bar-chart
                 {:=x :messages
                  :=y :label}))
(defn tokenize-fn
  "Splits string `s` on runs of non-word characters (`\\W+`) and lower-cases
  every piece. Returns `[]` when `s` is empty or nil, otherwise a lazy seq
  of strings.

  A string that starts with a non-word character yields an empty first
  token, because the split keeps the leading empty piece."
  [s]
  (if (empty? s)
    []
    (map str/lower-case
         (str/split s #"\W+"))))
(def tidy-result
  "Result of `text/->tidy-text` over `raw-text-cleaned`.

  Arguments, in order: the source dataset; a fn returning its `:text`
  column; a fn mapping each line to `[line nil]`; and `tokenize-fn`. The
  three `:datatype-*` options are set to `:int32`. Later forms read the
  `:datasets` and `:token-lookup-table` keys of this value."
  (text/->tidy-text
   raw-text-cleaned
   (fn [df] (:text df))
   (fn [line] [line nil])
   tokenize-fn
   :datatype-token-pos :int32
   :datatype-document :int32
   :datatype-token-idx :int32))
(def usenet-words
  "First element of the `:datasets` entry of `tidy-result`."
  (-> tidy-result :datasets first))
(def token-word-table
  "Two-column dataset built from `tidy-result`'s `:token-lookup-table`:
  `:token` holds its keys and `:token-idx` its values."
  (tc/dataset
   {:token     (->> tidy-result :token-lookup-table keys)
    :token-idx (->> tidy-result :token-lookup-table vals)}))
(def stopword-table
  "`stopword` left-joined with `token-word-table` on `:token`, with the
  `:right.token` column dropped afterwards."
  (-> stopword
      (tc/left-join token-word-table :token)
      (tc/drop-columns [:right.token])))
(def cleaned-usenet-words
  "`usenet-words` anti-joined with `stopword-table` on `:token-idx`, i.e.
  without the rows whose `:token-idx` appears in `stopword-table`."
  (-> usenet-words
      (tc/anti-join stopword-table :token-idx)))
;; Count rows of cleaned-usenet-words per :token-idx, keep the 100 largest
;; counts, attach the token text, filter, and show the top 20 by count.
(-> (ds-reductions/group-by-column-agg
     :token-idx
     {:n (ds-reductions/row-count)}
     cleaned-usenet-words)
    (tc/order-by :n :desc)
    (tc/head 100)
    (tc/left-join token-word-table [:token-idx])
    (tc/drop-rows (fn [{:keys [token]}] (empty? token)))
    ;; `re-matches` must match the whole token, so this pattern only drops
    ;; tokens that are a single character from a-z or an apostrophe.
    (tc/drop-rows (fn [{:keys [token]}]
                    (some? (re-matches #"[a-z']$" token))))
    (tc/order-by :n :desc)
    (tc/head 20))
left-outer-join [20 4]:
:token-idx | :n | :right.token-idx | :token |
---|---|---|---|
1 | 15188 | 1 | organization |
57 | 10579 | 57 | subject |
15 | 10248 | 15 | edu |
17 | 7961 | 17 | posting |
18 | 7737 | 18 | host |
16 | 7604 | 16 | nntp |
8 | 7293 | 8 | university |
552 | 4985 | 552 | com |
365 | 4909 | 365 | 1 |
9 | 4338 | 9 | distribution |
82 | 4163 | 82 | 2 |
258 | 4038 | 258 | don |
1175 | 4017 | 1175 | people |
279 | 3937 | 279 | like |
318 | 3715 | 318 | just |
597 | 3389 | 597 | 3 |
25 | 3214 | 25 | know |
259 | 3075 | 259 | think |
429 | 2929 | 429 | use |
679 | 2879 | 679 | time |
(tc/shape usenet-words) ^kind/table (tc/head usenet-words)
;; tf-idf for the rows labelled "sci.space": join each kept line with its
;; tokens, use the integer-parsed :id as the :document column, run
;; text/->tfidf, and show the 10 highest :tfidf rows with their token text.
(-> raw-text-cleaned
    (tc/select-rows (fn [row] (= "sci.space" (:label row))))
    (tc/select-columns [:id :line])
    (tc/left-join cleaned-usenet-words {:left :line :right :document})
    (tc/drop-missing)
    (tc/drop-columns [:line :document])
    (tc/rename-columns {:id :document})
    ;; Wrap the static method in a fn: a bare `Integer/parseInt` used as a
    ;; value only compiles on Clojure 1.12+.
    (tc/add-column :document
                   (fn [ds]
                     (map (fn [id] (Integer/parseInt id)) (:document ds))))
    (text/->tfidf)
    (tc/order-by :tfidf :desc)
    (tc/head 10)
    (tc/left-join token-word-table :token-idx)
    (tc/order-by :tfidf :desc))
left-outer-join [10 7]:
:token-idx | :document | :tfidf | :tf | :token-count | :right.token-idx | :token |
---|---|---|---|---|---|---|
23260 | 61057 | 1.49693847 | 0.50000000 | 1 | 23260 | lesc |
9062 | 61183 | 0.71625966 | 0.33333334 | 1 | 9062 | army |
9062 | 60855 | 0.71625966 | 0.33333334 | 1 | 9062 | army |
20098 | 62317 | 0.65082812 | 0.33333334 | 1 | 20098 | gps |
19076 | 60876 | 0.65082812 | 0.33333334 | 1 | 19076 | debris |
7573 | 61072 | 0.63823187 | 0.33333334 | 1 | 7573 | msfc |
44132 | 61093 | 0.61591631 | 0.33333334 | 1 | 44132 | moonbase |
43966 | 61515 | 0.59877539 | 0.20000000 | 1 | 43966 | brinkmeyer |
80993 | 61048 | 0.59795421 | 0.25000000 | 1 | 80993 | spacewalk |
8422 | 61093 | 0.58780932 | 0.33333334 | 1 | 8422 | race |
source: notebooks/tidytext.clj