(ns austen
(:require
[clojure.java.io :as io]
[clojure.string :as str]
[scicloj.metamorph.ml.text :as text]
[tablecloth.api :as tc]))

;; The Jane Austen books are available as plain-text files from the GITenberg
;; project on GitHub.
;; Dataset with one row per novel: `:title` and the full `:text` of the book.
;;
;; Every link must point at the *raw* text file. A GitHub `/blob/` link returns
;; the HTML page that wraps the file, which fills the corpus with markup tokens
;; (e.g. "quot", "div", "link") instead of words from the novel.
(def austen-books
  (->> {"Lady Susan"            "https://raw.githubusercontent.com/GITenberg/Lady-Susan_946/refs/heads/master/946.txt"
        "Sense and Sensibility" "https://raw.githubusercontent.com/GITenberg/Sense-and-Sensibility_161/refs/heads/master/161.txt"
        "Emma"                  "https://raw.githubusercontent.com/GITenberg/Emma_158/refs/heads/master/158.txt"
        "Northanger Abbey"      "https://raw.githubusercontent.com/GITenberg/Northanger-Abbey_121/refs/heads/master/121.txt"
        "Persuasion"            "https://raw.githubusercontent.com/GITenberg/Persuasion_105/refs/heads/master/105.txt"
        "Mansfield Park"        "https://raw.githubusercontent.com/GITenberg/Mansfield-Park_141/refs/heads/master/141.txt"
        "Pride and Prejudice"   "https://raw.githubusercontent.com/GITenberg/Pride-and-Prejudice_1342/refs/heads/master/1342.txt"}
       (map (fn [[title link]]
              {:title title
               ;; `slurp` accepts the URL string directly and closes the
               ;; stream it opens.
               :text  (slurp link)}))
       (tc/dataset)))

;; Tokenized form of the corpus.
;;
;; Arguments passed to `text/->tidy-text`, in order:
;;   1. the source dataset;
;;   2. a fn producing the sequence of texts to process -- here one string per
;;      row of `:text`, i.e. one whole book per item (so, presumably, one
;;      document per book -- confirm against the library docs);
;;   3. a fn splitting an item into `[text metadata]`; no metadata is used;
;;   4. the tokenizer: split on runs of non-word characters, then lower-case.
;; The `:datatype-*` options request :int32 storage for the generated columns.
(def tidy-austen
  (text/->tidy-text
   austen-books
   (fn [df] (map str (:text df)))
   (fn [line] [line nil])
   (fn [text] (map str/lower-case (str/split text #"\W+")))
   :datatype-token-pos :int32
   :datatype-token-idx :int32
   :datatype-document :int32))

;; Lookup dataset pairing each token string with its index, built from the
;; `:token-lookup-table` map of `tidy-austen`. `keys` and `vals` are taken from
;; the same map, so the two columns line up row by row.
(def token-table
  (let [lookup (:token-lookup-table tidy-austen)]
    (tc/dataset {:token     (keys lookup)
                 :token-idx (vals lookup)})))

;; TF-IDF scores computed from the first dataset in `(:datasets tidy-austen)`.
(def austen-tfidf
  (-> tidy-austen
      :datasets
      first
      text/->tfidf))

;; The TF-IDF dataset split into a sequence of datasets, one per `:document`.
(def groups
  (tc/group-by austen-tfidf :document {:result-type :as-seq}))

(defn- top-tfidf-tokens
  "Returns a dataset with the `n` highest-scoring rows of `df` (by `:tfidf`),
  showing the token text and its score, highest score first.

  Token text is attached by joining on `:token-idx` against `token-table`.
  The result is sorted again after the join, as the original pipeline did
  (presumably because the join is not guaranteed to keep row order)."
  [df n]
  (-> df
      (tc/order-by :tfidf :desc)
      (tc/head n)
      (tc/left-join token-table :token-idx)
      (tc/select-columns [:token :tfidf])
      (tc/order-by :tfidf :desc)))

;; Ten most characteristic tokens of each book.
(map #(top-tfidf-tokens % 10) groups)

;; Output recorded from an earlier run -- one table per document, appearing in
;; the same order as the books are listed in `austen-books`.
;;
;; NOTE: this output predates the corrected "Pride and Prejudice" link. The
;; seventh table is dominated by HTML markup tokens ("r", "0", "quot", "75",
;; "div", "25", "link") because that run downloaded a web page rather than the
;; plain-text novel. Re-run the notebook to refresh the tables.
;;
;; (
;; left-outer-join [10 2]:
;; | :token | :tfidf |
;; |---|---|
;; | vernon | 0.00351366 |
;; | frederica | 0.00268316 |
;; | reginald | 0.00258733 |
;; | susan | 0.00209755 |
;; | courcy | 0.00166100 |
;; | churchhill | 0.00146935 |
;; | mainwaring | 0.00137352 |
;; | de | 0.00073715 |
;; | langford | 0.00060690 |
;; | johnson | 0.00048680 |
;; left-outer-join [10 2]:
;; | :token | :tfidf |
;; |---|---|
;; | elinor | 0.00466954 |
;; | marianne | 0.00385833 |
;; | dashwood | 0.00171785 |
;; | jennings | 0.00160196 |
;; | willoughby | 0.00147244 |
;; | brandon | 0.00098163 |
;; | ferrars | 0.00088619 |
;; | lucy | 0.00081629 |
;; | middleton | 0.00069532 |
;; | barton | 0.00060670 |
;; left-outer-join [10 2]:
;; | :token | :tfidf |
;; |---|---|
;; | emma | 0.00286596 |
;; | weston | 0.00225143 |
;; | knightley | 0.00199046 |
;; | elton | 0.00197511 |
;; | woodhouse | 0.00160670 |
;; | fairfax | 0.00123317 |
;; | churchill | 0.00114618 |
;; | harriet | 0.00112738 |
;; | hartfield | 0.00081870 |
;; | bates | 0.00075730 |
;; left-outer-join [10 2]:
;; | :token | :tfidf |
;; |---|---|
;; | tilney | 0.00229288 |
;; | morland | 0.00153550 |
;; | thorpe | 0.00152513 |
;; | catherine | 0.00145307 |
;; | allen | 0.00145250 |
;; | isabella | 0.00104198 |
;; | eleanor | 0.00084038 |
;; | northanger | 0.00043575 |
;; | fullerton | 0.00029050 |
;; | henry | 0.00022245 |
;; left-outer-join [10 2]:
;; | :token | :tfidf |
;; |---|---|
;; | elliot | 0.00279565 |
;; | wentworth | 0.00210883 |
;; | russell | 0.00143168 |
;; | walter | 0.00136397 |
;; | musgrove | 0.00125756 |
;; | anne | 0.00083132 |
;; | uppercross | 0.00074486 |
;; | harville | 0.00072551 |
;; | henrietta | 0.00071584 |
;; | kellynch | 0.00070617 |
;; left-outer-join [10 2]:
;; | :token | :tfidf |
;; |---|---|
;; | crawford | 0.00310524 |
;; | edmund | 0.00210951 |
;; | fanny | 0.00206726 |
;; | bertram | 0.00139094 |
;; | rushworth | 0.00111891 |
;; | thomas | 0.00108713 |
;; | norris | 0.00107272 |
;; | mansfield | 0.00097007 |
;; | julia | 0.00068777 |
;; | yates | 0.00033875 |
;; left-outer-join [10 2]:
;; | :token | :tfidf |
;; |---|---|
;; | r | 0.02878681 |
;; | 0 | 0.00886625 |
;; | quot | 0.00691303 |
;; | 75 | 0.00472686 |
;; | darcy | 0.00205323 |
;; | bennet | 0.00159039 |
;; | div | 0.00154608 |
;; | bingley | 0.00150669 |
;; | 25 | 0.00150291 |
;; | link | 0.00132943 |
;; )
;; source: notebooks/austen.clj