(ns austen
  (:require
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.metamorph.ml.text :as text]
   [tablecloth.api :as tc]))

The Jane Austen books are available as plain-text files from the GITenberg repositories on GitHub; the URL for each book is listed below.

(def austen-books
  "Dataset with one row per book: `:title` holds the book title and `:text`
  holds the complete text downloaded from the book's URL.

  Evaluating this var performs one HTTP download per book."
  (->>
   ;; A vector of [title url] pairs keeps the row order explicit, independent
   ;; of how many books are listed.
   ;; Every URL must point at the raw text file; a GitHub `/blob/` URL returns
   ;; the HTML viewer page instead of the book text.
   [["Lady Susan" "https://raw.githubusercontent.com/GITenberg/Lady-Susan_946/refs/heads/master/946.txt"]
    ["Sense and Sensibility" "https://raw.githubusercontent.com/GITenberg/Sense-and-Sensibility_161/refs/heads/master/161.txt"]
    ["Emma" "https://raw.githubusercontent.com/GITenberg/Emma_158/refs/heads/master/158.txt"]
    ["Northanger Abbey" "https://raw.githubusercontent.com/GITenberg/Northanger-Abbey_121/refs/heads/master/121.txt"]
    ["Persuasion" "https://raw.githubusercontent.com/GITenberg/Persuasion_105/refs/heads/master/105.txt"]
    ["Mansfield Park" "https://github.com/GITenberg/Mansfield-Park_141/raw/refs/heads/master/141.txt"]
    ["Pride and Prejudice" "https://raw.githubusercontent.com/GITenberg/Pride-and-Prejudice_1342/refs/heads/master/1342.txt"]]
   (map (fn [[title link]]
          ;; `slurp` opens the URL itself and closes the stream when done.
          {:title title
           :text  (slurp link)}))
   (tc/dataset)))
(def tidy-austen
  "Result of running `text/->tidy-text` over `austen-books`.

  The callbacks passed to it, in order:
  - take the values of the dataset's `:text` column, each coerced to a string;
  - pair every such string with `nil`;
  - split a string on runs of non-word characters and lower-case each piece.

  The three `:datatype-*` options are all set to `:int32`.
  Other forms in this file read `:token-lookup-table` and `:datasets` from
  the result."
  (let [book-texts   (fn [books] (map str (:text books)))
        with-no-meta (fn [text] [text nil])
        tokenize     (fn [text]
                       (map str/lower-case (str/split text #"\W+")))]
    (text/->tidy-text austen-books
                      book-texts
                      with-no-meta
                      tokenize
                      :datatype-token-pos :int32
                      :datatype-token-idx :int32
                      :datatype-document :int32)))
(def token-table
  "Two-column dataset built from the `:token-lookup-table` of `tidy-austen`:
  `:token` holds the table's keys and `:token-idx` holds its values."
  (let [lookup (:token-lookup-table tidy-austen)]
    (tc/dataset {:token     (keys lookup)
                 :token-idx (vals lookup)})))
(def austen-tfidf
  "Output of `text/->tfidf` applied to the first entry of the `:datasets`
  held by `tidy-austen`."
  (text/->tfidf (first (:datasets tidy-austen))))
(def groups
  "`austen-tfidf` grouped by its `:document` column, requested as a sequence
  of per-group datasets (`:result-type :as-seq`)."
  (tc/group-by austen-tfidf :document {:result-type :as-seq}))
(for [document-rows groups]
  ; For each per-document dataset: keep the ten rows with the highest :tfidf,
  ; attach the token text via :token-idx, and show only :token and :tfidf.
  (let [top-ten  (-> document-rows
                     (tc/order-by :tfidf :desc)
                     (tc/head 10))
        labelled (tc/left-join top-ten token-table :token-idx)]
    (-> labelled
        (tc/select-columns [:token :tfidf])
        ; Sorted again after the join -- presumably because the join is not
        ; relied on to keep the earlier row order.
        (tc/order-by :tfidf :desc))))

(

left-outer-join [10 2]:

:token :tfidf
vernon 0.00351366
frederica 0.00268316
reginald 0.00258733
susan 0.00209755
courcy 0.00166100
churchhill 0.00146935
mainwaring 0.00137352
de 0.00073715
langford 0.00060690
johnson 0.00048680

left-outer-join [10 2]:

:token :tfidf
elinor 0.00466954
marianne 0.00385833
dashwood 0.00171785
jennings 0.00160196
willoughby 0.00147244
brandon 0.00098163
ferrars 0.00088619
lucy 0.00081629
middleton 0.00069532
barton 0.00060670

left-outer-join [10 2]:

:token :tfidf
emma 0.00286596
weston 0.00225143
knightley 0.00199046
elton 0.00197511
woodhouse 0.00160670
fairfax 0.00123317
churchill 0.00114618
harriet 0.00112738
hartfield 0.00081870
bates 0.00075730

left-outer-join [10 2]:

:token :tfidf
tilney 0.00229288
morland 0.00153550
thorpe 0.00152513
catherine 0.00145307
allen 0.00145250
isabella 0.00104198
eleanor 0.00084038
northanger 0.00043575
fullerton 0.00029050
henry 0.00022245

left-outer-join [10 2]:

:token :tfidf
elliot 0.00279565
wentworth 0.00210883
russell 0.00143168
walter 0.00136397
musgrove 0.00125756
anne 0.00083132
uppercross 0.00074486
harville 0.00072551
henrietta 0.00071584
kellynch 0.00070617

left-outer-join [10 2]:

:token :tfidf
crawford 0.00310524
edmund 0.00210951
fanny 0.00206726
bertram 0.00139094
rushworth 0.00111891
thomas 0.00108713
norris 0.00107272
mansfield 0.00097007
julia 0.00068777
yates 0.00033875

left-outer-join [10 2]:

:token :tfidf
div 0.03281766
r 0.02027056
react 0.01745236
code 0.01405134
0 0.00737664
quot 0.00583826
text 0.00474962
id 0.00402507
padding 0.00370611
testid 0.00361251

)

source: notebooks/austen.clj