#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Grant Drake <[email protected]>, 2017 additions by David Forrester <[email protected]>'
__docformat__ = 'restructuredtext en'

import re, os, shutil

from six import text_type as unicode

from calibre import prints
from calibre.ebooks.oeb.iterator import EbookIterator
from calibre.utils.html2text import html2text
from calibre.utils.ipc.simple_worker import fork_job, WorkerError

from calibre_plugins.count_pages.nltk_lite.textanalyzer import TextAnalyzer
from calibre_plugins.count_pages.nltk_lite.obi2.obi2 import Obi2
RE_HTML_BODY = re.compile(u'<body[^>]*>(.*)</body>', re.UNICODE | re.DOTALL | re.IGNORECASE)
RE_STRIP_MARKUP = re.compile(u'<[^>]+>', re.UNICODE)

def get_pdf_page_count(book_path):
    '''
    Optimisation to read the actual page count for PDFs from the PDF itself.
    '''
    from calibre.ptempfile import TemporaryDirectory
    with TemporaryDirectory('_pages_pdf') as pdfpath:
        pdf_copy = os.path.join(pdfpath, 'src.pdf')
        shutil.copyfile(book_path, pdf_copy)
        try:
            res = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
                           (pdfpath, False))
        except WorkerError as e:
            prints(e.orig_tb)
            raise RuntimeError('Failed to run pdfinfo')
        # Try to delete the extra copy straight away rather than waiting
        # for the temporary directory to be cleaned up.
        try:
            os.remove(pdf_copy)
        except OSError:
            pass
        info = res['result']
        if not info:
            raise ValueError('Could not read info dict from PDF')
        if 'Pages' in info:
            return int(info['Pages'])
    # No 'Pages' entry in the info dict - callers must handle None.
    return None
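
# A minimal usage sketch (assumes 'mybook.pdf' is a hypothetical path to a
# real PDF on disk):
#
#     pages = get_pdf_page_count('mybook.pdf')
#     if pages is not None:
#         print('PDF reports %d pages' % pages)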

def get_page_count(iterator, book_path, page_algorithm, custom_chars_per_page=0):
    '''
    Given an iterator for the ePub (if already opened/converted), estimate a page count.
    '''
    if iterator is None:
        iterator = _open_epub_file(book_path)
    count = 0
    if page_algorithm == 0:
        # Accurate: paragraph/line counting similar to the mobi APNX algorithm
        count = _get_page_count_accurate(iterator)
    elif page_algorithm == 1:
        # calibre: the ebook viewer's own page count
        count = _get_page_count_calibre(iterator)
    elif page_algorithm == 2:
        # Adobe: compressed size of each spine item divided into 1024-byte pages
        count = _get_page_count_adobe(iterator, book_path)
    elif page_algorithm == 3:
        # Custom: user-supplied average characters per page
        count = _get_page_count_custom(iterator, custom_chars_per_page)
    print('\tPage count:', count)
    return iterator, count
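
# A minimal usage sketch (assumes 'mybook.epub' is a hypothetical path to a
# real EPUB; algorithm ids are as commented above). The returned iterator is
# passed back in so the book is only opened once across multiple calls:
#
#     it, pages = get_page_count(None, 'mybook.epub', page_algorithm=0)
#     it, words = get_word_count(it, 'mybook.epub', icu_wordcount=True)
#     it.__exit__()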

def get_word_count(iterator, book_path, icu_wordcount):
    '''
    Given an iterator for the ePub (if already opened/converted), estimate a word count.
    '''
    from calibre.utils.localization import get_lang
    if iterator is None:
        iterator = _open_epub_file(book_path)
    lang = iterator.opf.language
    lang = get_lang() if not lang else lang
    count = _get_epub_standard_word_count(iterator, lang, icu_wordcount)
    print('\tWord count:', count)
    return iterator, count

def _open_epub_file(book_path):
    '''
    Given a path to an ePub file, open it with calibre's EbookIterator
    and return the iterator for callers to read content through.
    '''
    iterator = EbookIterator(book_path)
    iterator.__enter__(only_input_plugin=True, run_char_count=True,
                       read_anchor_map=False)
    return iterator

def _get_page_count_adobe(iterator, book_path):
    '''
    This algorithm uses the proper Adobe count. We look at the compressed size
    in the zip of every file in the spine, and apply the 1024 bytes per page
    calculation to each of them.
    '''
    import math
    from calibre.utils.zipfile import ZipFile
    pages = 0
    with ZipFile(book_path, 'r') as zf:
        base = iterator.base
        # Map each spine item's extracted path to its compressed size in the zip
        csizes = {os.path.abspath(os.path.join(base, ci.filename)): ci.compress_size
                  for ci in zf.infolist()}
        for path in iterator.spine:
            sz = csizes.get(os.path.abspath(path))
            if sz is not None:
                pages += int(math.ceil(sz / 1024.0))
    return pages
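
# Worked example of the Adobe calculation: a spine file whose compressed size
# is 10,500 bytes contributes ceil(10500 / 1024.0) = 11 pages; the per-file
# ceilings are then summed across the whole spine.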

def _get_page_count_calibre(iterator):
    '''
    This algorithm uses the ebook viewer page count.
    '''
    return sum(iterator.pages)

def _get_page_count_accurate(iterator):
    '''
    The accurate algorithm attempts to apply a similar algorithm to the one
    used for the mobi "accurate" count in apnx.py.
    '''
    epub_html = _read_epub_contents(iterator)
    # Decide whether to split on <p> or <div> tags
    num_divs = len(epub_html.split('<div'))
    num_paras = len(epub_html.split('<p'))
    split_char = 'p' if num_paras > num_divs else 'd'
    # States
    in_tag = False
    in_p = False
    check_p = False
    closing = False
    p_char_count = 0
    # Get positions of every line. A line is either a paragraph starting
    # or every 70 characters in a paragraph.
    lines = []
    pos = -1
    # We want this to be as fast as possible so we are going to do one pass
    # across the text. re and string functions would parse the text each
    # time they are called.
    #
    # We can use .lower() here because we are not modifying the text. In this
    # case the case doesn't matter, just the absolute character and its
    # position within the stream.
    for c in epub_html.lower():
        pos += 1
        # Check if we are starting or stopping a p tag.
        if check_p:
            if c == '/':
                closing = True
                continue
            elif c == split_char:
                if closing:
                    in_p = False
                else:
                    in_p = True
                    lines.append(pos - 2)
            check_p = False
            closing = False
            continue
        if c == '<':
            in_tag = True
            check_p = True
            continue
        elif c == '>':
            in_tag = False
            check_p = False
            continue
        if in_p and not in_tag:
            p_char_count += 1
            if p_char_count == 70:
                lines.append(pos)
                p_char_count = 0
    # Using 31 lines instead of the 32 used by APNX to get similar numbers
    count = int(len(lines) / 31)
    # We could still have a really weird document and massively understate.
    # As a backstop, count the characters using the "fast count" algorithm
    # and use that number instead if it is larger.
    fast_count = int(len(epub_html) / 2400) + 1
    print('\tEstimated accurate page count')
    print('\t Lines:', len(lines), ' Divs:', num_divs, ' Paras:', num_paras)
    print('\t Accurate count:', count, ' Fast count:', fast_count)
    return max([count, fast_count])
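
# Worked example of the backstop arithmetic: a book whose extracted HTML is
# 240,000 characters long gives fast_count = int(240000 / 2400) + 1 = 101
# pages; if the line-based count comes out lower (say, for a document with
# almost no <p>/<div> markup), the fast count wins via max().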

def _get_page_count_custom(iterator, custom_chars_per_page):
    '''
    This algorithm uses a custom algorithm with a user-supplied
    average page character count.
    '''
    # Guard against a zero/unset setting, which would divide by zero below
    if custom_chars_per_page <= 0:
        raise ValueError('custom_chars_per_page must be a positive number')
    count = 0
    for path in iterator.spine:
        with open(path, 'rb') as f:
            html = f.read().decode('utf-8', 'replace')
        html = unicode(_extract_body_text(html)).strip()
        count = count + int(len(html) / custom_chars_per_page) + 1
    return count
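
# Worked example: a single spine file whose body text is 120,000 characters,
# with a user setting of 1,500 characters per page, contributes
# int(120000 / 1500) + 1 = 81 pages; the +1 rounds each file up to at least
# one page.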

def _get_epub_standard_word_count(iterator, lang='en', icu_wordcount=False):
    '''
    This algorithm counts individual words instead of pages.
    '''
    book_text = _read_epub_contents(iterator, strip_html=True)
    wordcount = None
    if icu_wordcount:
        try:
            from calibre.spell.break_iterator import count_words
            print('\tWord count using icu_wordcount - trying to count_words')
            wordcount = count_words(book_text, lang)
            print('\tWord count - used count_words:', wordcount)
        except Exception:
            try:
                # count_words is new and not everyone will have it as of 08/01/2016.
                print('\tWord count using icu_wordcount - trying to import split_into_words_and_positions')
                from calibre.spell.break_iterator import split_into_words_and_positions
                print('\tWord count - trying split_into_words_and_positions:')
                wordcount = len(split_into_words_and_positions(book_text, lang))
                print('\tWord count - used split_into_words_and_positions:', wordcount)
            except Exception:
                pass
    if not wordcount:
        # If not using the icu word count, or it failed, use the old method.
        from calibre.utils.wordcount import get_wordcount_obj
        print('\tWord count using older method - trying get_wordcount_obj')
        wordcount = get_wordcount_obj(book_text)
        wordcount = wordcount.words
    return wordcount
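
# The fallback chain above is: ICU count_words -> ICU
# split_into_words_and_positions -> calibre's regex-based get_wordcount_obj,
# so a word count is always produced even on older calibre installs.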

def _read_epub_contents(iterator, strip_html=False):
    '''
    Given an iterator for an ePub file, read the contents into a giant block of text.
    '''
    book_files = []
    for path in iterator.spine:
        with open(path, 'rb') as f:
            html = f.read().decode('utf-8', 'replace')
        if strip_html:
            html = unicode(_extract_body_text(html)).strip()
        #print('FOUND HTML:', html)
        book_files.append(html)
    return ' '.join(book_files)

def _extract_body_text(data):
    '''
    Get the body text of this html content with any html tags stripped.
    '''
    body = RE_HTML_BODY.findall(data)
    if body:
        # Ensure every full stop is followed by a space so sentences do not
        # run together once the markup between them is removed.
        return RE_STRIP_MARKUP.sub('', body[0]).replace('.', '. ')
    return ''

# ---------------------------------------------------------
# CBR/CBZ Page Count Functions
# ---------------------------------------------------------

COMIC_PAGE_EXTENSIONS = ['.jpeg', '.jpg', '.gif', '.png']

def get_cbr_page_count(book_path):
    '''
    Count the distinct image pages in a CBR (rar) comic archive.
    '''
    from calibre.utils.unrar import names
    pages = set()
    with open(book_path, 'rb') as rf:
        file_names = list(names(rf))
    for name in file_names:
        if '__MACOSX' in name:
            continue
        ext = os.path.splitext(name)[1].lower()
        if ext in COMIC_PAGE_EXTENSIONS:
            pages.add(name)
    return len(pages)

def get_cbz_page_count(book_path):
    '''
    Count the image pages in a CBZ (zip) comic archive.
    '''
    from calibre.utils.zipfile import ZipFile
    page_count = 0
    with ZipFile(book_path, 'r') as zf:
        for name in zf.namelist():
            if '__MACOSX' in name:
                continue
            ext = os.path.splitext(name)[1].lower()
            if ext in COMIC_PAGE_EXTENSIONS:
                page_count += 1
    return page_count
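
# A minimal usage sketch (assumes 'comic.cbz' and 'comic.cbr' are hypothetical
# paths to real comic archives):
#
#     print('CBZ pages:', get_cbz_page_count('comic.cbz'))
#     print('CBR pages:', get_cbr_page_count('comic.cbr'))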
# ---------------------------------------------------------
# Readability Statistics Functions
# ---------------------------------------------------------

def get_text_analysis(iterator, book_path, nltk_pickle):
    '''
    Given an iterator for the epub (if already opened/converted), perform text
    analysis using NLTK to produce a dictionary of analysed statistics for
    attributes like words, sentences, syllables etc. that we can then use to
    perform the various official readability computations.
    '''
    if iterator is None:
        iterator = _open_epub_file(book_path)
    # _read_epub_contents returns the whole book as one string. Ignoring any
    # html content files of less than 500 characters (to stop cover pages etc.
    # skewing the results) is disabled, as it would need the per-file contents:
    #epub_html = [h for h in epub_html if len(h) > 500]
    text = _read_epub_contents(iterator, strip_html=True).strip()
    # TODO: Do not analyse the WHOLE book - just a portion should be sufficient???
    t = TextAnalyzer(nltk_pickle)
    text_analysis = t.analyzeText(text)
    return iterator, text_analysis

def get_flesch_reading_ease(text_analysis, lang=None):
    if lang and lang == 'deu':
        # German (Amstad) variant: 180 - ASL - (58.5 * ASW)
        print('\tFlesch Reading Ease: language=%s' % lang)
        score = (180 - text_analysis['averageWordsPerSentence']
                 - (58.5 * (text_analysis['syllableCount'] / text_analysis['wordCount'])))
    else:
        # Standard formula: 206.835 - (1.015 * ASL) - (84.6 * ASW), where ASL
        # is the average sentence length and ASW the average syllables per word
        score = (206.835 - (1.015 * text_analysis['averageWordsPerSentence'])
                 - (84.6 * (text_analysis['syllableCount'] / text_analysis['wordCount'])))
    print('\tFlesch Reading Ease:', score)
    return score
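
# Worked example of the standard formula: with an average sentence length of
# 15 words and 1.5 syllables per word, the score is
# 206.835 - (1.015 * 15) - (84.6 * 1.5) = 64.71, i.e. plain English.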

def get_flesch_kincaid_grade_level(text_analysis):
    # (0.39 * ASL) + (11.8 * ASW) - 15.59, expressed as a US school grade level
    score = (0.39 * text_analysis['averageWordsPerSentence']
             + 11.8 * (text_analysis['syllableCount'] / text_analysis['wordCount']) - 15.59)
    print('\tFlesch-Kincaid Grade:', score)
    return score
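
# Worked example: the same text (ASL 15, ASW 1.5) scores
# (0.39 * 15) + (11.8 * 1.5) - 15.59 = 7.96, roughly an 8th-grade level.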

def get_gunning_fog_index(text_analysis):
    # 0.4 * (ASL + percentage of complex words), where complex words are
    # typically those of three or more syllables
    score = (0.4 * (text_analysis['averageWordsPerSentence']
             + (100 * (text_analysis['complexwordCount'] / text_analysis['wordCount']))))
    print('\tGunning Fog:', score)
    return score
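
# Worked example: an ASL of 15 words with 10% complex words gives
# 0.4 * (15 + 10) = 10.0 on the fog index.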

def get_obi2_level(iterator, obi2_resources, lang=None):
    if lang and lang == 'jpn':
        epub_html = _read_epub_contents(iterator)
        epub_html = re.sub(r'<rt>([ぁ-んァ-ン]+)<\/rt>', '', epub_html)  # Remove furigana
        epub_text = html2text(epub_html)  # Remove html tags
        epub_text = re.sub(r'\!\[Unnamed image\]\([A-Za-z\/0-9\.]+\)', '', epub_text)  # Remove image references
        epub_text = re.sub(r'\(part[0-9\.A-Za-z#-]+\)', '', epub_text)  # Remove file references
        o = Obi2(obi2_resources)
        return o.analyze_text(epub_text)
    # Obi2 readability is only defined for Japanese text
    return None

# calibre-debug -e statistics.py
if __name__ == '__main__':
    def test_nltk(book_path):
        pickle_path = os.path.join(os.getcwd(), 'nltk_lite/english.pickle')
        with open(pickle_path, 'rb') as f:
            p = f.read()
        it, ta = get_text_analysis(None, book_path, p)
        get_flesch_reading_ease(ta)
        get_flesch_kincaid_grade_level(ta)
        get_gunning_fog_index(ta)
        it.__exit__()

    #test_nltk(r'C:\Dev\Tools\eclipse\workspace\_Misc\Test\TestDoc.rtf')
    print(get_cbz_page_count(r'C:\Dev\Tools\eclipse\workspace\_Misc\misery-depot.zip'))
    print(get_cbr_page_count(r'C:\Dev\Tools\eclipse\workspace\_Misc\misery-depot.cbr'))