This repository has been archived by the owner on May 21, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
annotate.py
49 lines (35 loc) · 1.49 KB
/
annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import PyPDF2
import pdf, matcher, sigil
from collections import Counter
import os.path
import argparse
def annotate(input_filename, output_filename, pages=None):
    """Copy a schematic PDF, adding text for every sigil matched on its pages.

    For each selected page, the page's line operations are fetched from the
    reader, matched against the sigil dictionary loaded from
    ``scheming.json``, and the resulting ``(string, position, scale)``
    triples are handed to the reader's ``add_text`` together with a dummy
    font. Every page held by the reader (not only the processed ones) is
    then written to ``output_filename``.

    Args:
        input_filename: Path of the PDF to read.
        output_filename: Path of the PDF to write.
        pages: Iterable of page indices as accepted by the reader's
            ``get_line_ops`` / ``add_text``. ``None`` (the default)
            processes every page of the input.

    Note:
        ``scheming.json`` is opened by its bare name, so it is resolved
        against the current working directory.
    """
    # Context managers guarantee all three files are closed (and the output
    # flushed) even if matching or writing raises part-way through.
    with open(input_filename, 'rb') as in_file:
        rdr = pdf.SchematicReader(in_file)
        with open('scheming.json') as sigil_file:
            sigdict = sigil.SigilDict.from_json(sigil_file)
            font_name = rdr.add_dummy_font()

            if pages is None:
                pages = range(len(rdr.pages))

            for page_no in pages:
                line_ops = rdr.get_line_ops(page_no)
                matches = matcher.match_sigils(sigdict, line_ops)
                # Unpack each match so add_text always gets plain 3-tuples.
                rdr.add_text(page_no, font_name,
                             [(s, pos, scale) for (s, pos, scale) in matches])

            wtr = PyPDF2.PdfFileWriter()
            for p in rdr.pages:
                wtr.addPage(p)

            # NOTE(review): the input stays open while writing on the
            # assumption that page content may still be read from it at
            # write time -- confirm against the reader implementation.
            with open(output_filename, 'wb') as out_file:
                wtr.write(out_file)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, fill in the derived
    # defaults, then run annotate() on the requested pages.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--pages', '-p', nargs='?',
        help="comma-separated list of pages to process [default: all]")
    parser.add_argument(
        'input', nargs='?', default='P1318-005a.pdf',
        help='path to input PDF [default: P1318-005a.pdf]')
    parser.add_argument(
        'output', nargs='?', default=None,
        help='path to output PDF [default: <input>_searchable.pdf]')
    args = parser.parse_args()

    if args.pages is not None:
        # Build a concrete list instead of a lazy map object: a malformed
        # page number is rejected here, before any file is opened, and the
        # result can be iterated more than once.
        args.pages = [int(page) for page in args.pages.split(',')]

    if args.output is None:
        # Default output name: the input path with "_searchable" inserted
        # before the extension.
        root, ext = os.path.splitext(args.input)
        args.output = '{}_searchable{}'.format(root, ext)

    annotate(args.input, args.output, pages=args.pages)