(ns austen
  "Notebook: per-book TF-IDF of the tokens in Jane Austen's novels."
  (:require
   [clojure.java.io :as io]                ; io/reader, used to open each book URL
   [clojure.string :as str]                ; str/split and str/lower-case, used by the tokenizer
   [scicloj.metamorph.ml.text :as text]    ; text/->tidy-text and text/->tfidf
   [tablecloth.api :as tc]))               ; dataset construction, grouping, joins
;; The Jane Austen books are available as plain-text files in the GITenberg
;; repositories on GitHub; the links used are listed below.
(def austen-books
  "Dataset with one row per novel: `:title` is the book title and `:text` is
  the full text downloaded from the corresponding GITenberg link.

  Evaluating this var performs one HTTP download per book."
  (->>
   {"Lady Susan" "https://raw.githubusercontent.com/GITenberg/Lady-Susan_946/refs/heads/master/946.txt"
    "Sense and Sensibility" "https://raw.githubusercontent.com/GITenberg/Sense-and-Sensibility_161/refs/heads/master/161.txt"
    "Emma" "https://raw.githubusercontent.com/GITenberg/Emma_158/refs/heads/master/158.txt"
    "Northanger Abbey" "https://raw.githubusercontent.com/GITenberg/Northanger-Abbey_121/refs/heads/master/121.txt"
    "Persuasion" "https://raw.githubusercontent.com/GITenberg/Persuasion_105/refs/heads/master/105.txt"
    "Mansfield Park" "https://github.com/GITenberg/Mansfield-Park_141/raw/refs/heads/master/141.txt"
    ;; This must be a raw-content URL. The previous `/blob/` link returned
    ;; GitHub's HTML viewer page, so markup tokens (div, react, testid, ...)
    ;; came out as this book's top TF-IDF terms instead of words from the novel.
    "Pride and Prejudice" "https://raw.githubusercontent.com/GITenberg/Pride-and-Prejudice_1342/refs/heads/master/1342.txt"}
   (map (fn [[title link]]
          {:title title
           :text  (slurp (io/reader link))}))
   (tc/dataset)))
(def tidy-austen
  "Tidy-text representation of `austen-books`, as returned by
  `text/->tidy-text`. The rest of this file reads two keys from it:
  `:datasets` and `:token-lookup-table`.

  Arguments passed to `->tidy-text`, in order:
  - `austen-books` as the source;
  - a function that extracts the sequence of texts from the source (here each
    book's whole `:text` is one string, so one entry per book);
  - a function that turns one such string into a pair `[text nil]`
    (NOTE(review): the second slot is presumably per-line metadata, unused
    here; confirm against the `->tidy-text` documentation);
  - the tokenizer: split on runs of non-word characters, then lower-case;
  - options requesting `:int32` for the token-position, token-index and
    document columns."
  (text/->tidy-text
   austen-books
   (fn [df] (map str (-> df :text)))
   (fn [line] [line nil])
   #(map str/lower-case (str/split % #"\W+"))
   :datatype-token-pos :int32
   :datatype-token-idx :int32
   :datatype-document :int32))
(def token-table
  "Two-column dataset built from `tidy-austen`'s `:token-lookup-table`:
  `:token` holds the table's keys and `:token-idx` the matching values.
  Used to translate `:token-idx` back into readable tokens via a join."
  (tc/dataset
   {:token     (-> tidy-austen :token-lookup-table keys)
    :token-idx (-> tidy-austen :token-lookup-table vals)}))
(def austen-tfidf
  "Result of `text/->tfidf` applied to the first dataset in
  `(:datasets tidy-austen)`. Downstream code relies on it having
  `:document`, `:token-idx` and `:tfidf` columns."
  (-> tidy-austen
      :datasets
      first
      text/->tfidf))
(def groups
  "`austen-tfidf` split by its `:document` column. `:result-type :as-seq`
  asks tablecloth for a plain sequence of per-document datasets, so the
  result can be processed with `map`."
  (-> austen-tfidf
      (tc/group-by :document {:result-type :as-seq})))
;; For every per-document dataset in `groups`, show the ten tokens with the
;; highest TF-IDF score as a `:token` / `:tfidf` table.
(map (fn [df]
       (-> df
           (tc/order-by :tfidf :desc)             ; highest scores first
           (tc/head 10)                           ; keep the top ten rows
           (tc/left-join token-table :token-idx)  ; resolve :token-idx to the token string
           (tc/select-columns [:token :tfidf])
           ;; Sort again after the join.
           ;; NOTE(review): presumably the join does not guarantee row order; confirm.
           (tc/order-by :tfidf :desc)))
     groups)
(
left-outer-join [10 2]:
:token | :tfidf |
---|---|
vernon | 0.00351366 |
frederica | 0.00268316 |
reginald | 0.00258733 |
susan | 0.00209755 |
courcy | 0.00166100 |
churchhill | 0.00146935 |
mainwaring | 0.00137352 |
de | 0.00073715 |
langford | 0.00060690 |
johnson | 0.00048680 |
left-outer-join [10 2]:
:token | :tfidf |
---|---|
elinor | 0.00466954 |
marianne | 0.00385833 |
dashwood | 0.00171785 |
jennings | 0.00160196 |
willoughby | 0.00147244 |
brandon | 0.00098163 |
ferrars | 0.00088619 |
lucy | 0.00081629 |
middleton | 0.00069532 |
barton | 0.00060670 |
left-outer-join [10 2]:
:token | :tfidf |
---|---|
emma | 0.00286596 |
weston | 0.00225143 |
knightley | 0.00199046 |
elton | 0.00197511 |
woodhouse | 0.00160670 |
fairfax | 0.00123317 |
churchill | 0.00114618 |
harriet | 0.00112738 |
hartfield | 0.00081870 |
bates | 0.00075730 |
left-outer-join [10 2]:
:token | :tfidf |
---|---|
tilney | 0.00229288 |
morland | 0.00153550 |
thorpe | 0.00152513 |
catherine | 0.00145307 |
allen | 0.00145250 |
isabella | 0.00104198 |
eleanor | 0.00084038 |
northanger | 0.00043575 |
fullerton | 0.00029050 |
henry | 0.00022245 |
left-outer-join [10 2]:
:token | :tfidf |
---|---|
elliot | 0.00279565 |
wentworth | 0.00210883 |
russell | 0.00143168 |
walter | 0.00136397 |
musgrove | 0.00125756 |
anne | 0.00083132 |
uppercross | 0.00074486 |
harville | 0.00072551 |
henrietta | 0.00071584 |
kellynch | 0.00070617 |
left-outer-join [10 2]:
:token | :tfidf |
---|---|
crawford | 0.00310524 |
edmund | 0.00210951 |
fanny | 0.00206726 |
bertram | 0.00139094 |
rushworth | 0.00111891 |
thomas | 0.00108713 |
norris | 0.00107272 |
mansfield | 0.00097007 |
julia | 0.00068777 |
yates | 0.00033875 |
left-outer-join [10 2]:
:token | :tfidf |
---|---|
div | 0.03281766 |
r | 0.02027056 |
react | 0.01745236 |
code | 0.01405134 |
0 | 0.00737664 |
quot | 0.00583826 |
text | 0.00474962 |
id | 0.00402507 |
padding | 0.00370611 |
testid | 0.00361251 |
)
source: notebooks/austen.clj