gen_from_epub.py
import nltk
import argparse
import numpy as np
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("-o", "--output", type=str, default='./decks', help='Where do you want me to save the decks I generate?')
parser.add_argument("--start_chapter", type=str, default=2, type=int, help='Which chapter should I start from?')
parser.add_argument("--end_chapter", type=str, default=48, type=int, help='Which chapter should I finish at?')
## unzipped epub location
BOOKNAME = lambda x : f"books/book_chap_{x}.htm"
args = parser.parse_args()
def tf(t, d):
    '''
    Fetch the term frequency of term t in document d

    Parameters
    ----------
    t : str
        The word to look up
    d : dict
        The term frequency dictionary for one document
    '''
    return d[t]
def idf(t, D):
    '''
    Fetch the inverse document frequency of term t in the set of documents D

    Parameters
    ----------
    t : str
        The word to look up
    D : list
        A list of frequency dictionaries, one for each document
    '''
    num_docs = len(D)
    num_docs_with_t = len([d for d in D if d.get(t) is not None])
    return np.log(num_docs / num_docs_with_t)
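
## Illustrative example (the toy dictionaries below are not part of the original script):
## idf() rewards terms that occur in few documents, which is what pushes common words
## down the TF-IDF ranking computed later on.
## >>> D = [{'dragon': 0.01, 'the': 0.05}, {'the': 0.04}]
## >>> idf('dragon', D)   # log(2/1) ≈ 0.693
## >>> idf('the', D)      # log(2/2) = 0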
## compile the frequency tables for all documents
docs = []
for i in range(args.start_chapter, args.end_chapter):
    j = '{:03d}'.format(i)
    fname = BOOKNAME(j)
    with open(fname, 'r') as html_doc:
        chap = BeautifulSoup(html_doc, 'html.parser')
    # paragraphs may be tagged MsoNormal or calibre11 depending on the epub conversion
    try:
        chap_text = " ".join([p.string.lower() for p in chap.find_all(class_='MsoNormal')])
    except AttributeError:
        try:
            chap_text = " ".join([p.text.lower() for p in chap.find_all(class_='calibre11')])
        except Exception:
            print(f"error in chapter {j}")
            continue
    tokenizer = RegexpTokenizer(r'\w+')
    toks = tokenizer.tokenize(chap_text)
    total_terms = len(toks)  # count tokens, not characters, for the TF denominator
    freq = nltk.FreqDist(toks)
    tf_t = {k: f / total_terms for (k, f) in freq.items()}  # TF(t) for every term in the chapter
    docs.append(tf_t)
## for each document, calculate TF-IDF and return indicative words
## (docs is indexed from 0, not by chapter number, so enumerate over it directly)
for i, doc in enumerate(docs):
    tf_idf = {}
    for term in doc:
        tf_idf[term] = tf(term, doc) * idf(term, docs)
    most_important = sorted(tf_idf.items(), key=lambda x: x[1], reverse=True)
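    ## The script as published stops after ranking the terms; it never uses
    ## `most_important` or the --output directory. A minimal, hypothetical sketch
    ## of one possible next step is to print the top terms per chapter; the
    ## chapter label and the cutoff of 20 terms are illustrative assumptions.
    chapter = args.start_chapter + i
    print(f"chapter {chapter:03d}: " + ", ".join(term for term, _ in most_important[:20]))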