-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdfname.py
executable file
·136 lines (112 loc) · 3.81 KB
/
pdfname.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
from PyPDF2 import PdfReader
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET
from typing import NamedTuple
from os import path, rename
from sys import argv
import string
class ManuscriptInfo(NamedTuple):
    """Bibliographic metadata of one manuscript."""

    # Author names, in order.
    authors: list[str]
    # Publication year.
    year: int
    # Manuscript title.
    title: str
# Extract identifying search string from PDF.
def extract_search_string(fn: str) -> str:
    """Return the first non-blank line of text on the PDF's first page.

    That line is assumed to be the manuscript title and is used as the
    search string for the metadata lookup.

    Args:
        fn: Path of the PDF file to read.

    Returns:
        The first line of the extracted text that is not empty or
        whitespace-only, or "" when the first page yields no such line.
    """
    # Load PDF.
    reader = PdfReader(fn)

    # Only the first page is inspected. Fall back to "" so that a page
    # without extractable text does not break the line scan below.
    text = reader.pages[0].extract_text() or ""

    # Scan line by line instead of slicing up to text.find("\n"): find()
    # returns -1 when there is no newline, and slicing with -1 would cut
    # off the last character of the title.
    for line in text.split("\n"):
        if line.strip():
            return line
    return ""
# Find manuscript metadata by search string.
def find_info(search_string: str) -> ManuscriptInfo | None:
    """Look up manuscript metadata on dblp for a free-text search string.

    The search string is split into words, words containing non-ASCII
    characters are dropped, and hyphens are treated as word separators.
    The remaining words are sent to the dblp publication search API and
    the first hit of the XML response is assumed to be the right one.

    Args:
        search_string: Free text identifying the manuscript, typically
            its title.

    Returns:
        The metadata of the first hit, or None when there is nothing to
        search for, no hit, or the hit lacks authors, title or a numeric
        year.
    """
    # Convert search string to a list of query words.
    words: list[str] = []
    for token in search_string.split():
        if ascii_encodable(token):
            words.extend(part for part in token.split("-") if part)
    if not words:
        return None

    # Join with plain spaces and let quote_plus() turn them into "+".
    # Joining with "+" first would make quote_plus() escape the separators
    # as literal "%2B" characters.
    qry = urllib.parse.quote_plus(" ".join(words))

    # Query dblp; the context manager closes the HTTP response.
    with urllib.request.urlopen(f"https://dblp.org/search/publ/api?q={qry}") as response:
        contents = response.read()

    # Parse results. Assume first result is correct.
    root = ET.fromstring(contents)
    hits = root.find("hits")
    if hits is None or len(hits) == 0:
        return None
    info = hits[0].find("info")
    if info is None:
        return None

    authors_el = info.find("authors")
    title_el = info.find("title")
    year_el = info.find("year")
    if authors_el is None or title_el is None or year_el is None:
        return None

    # An element that is present but empty has text None; treat that the
    # same as a missing element instead of failing later on.
    if not title_el.text or not year_el.text:
        return None
    try:
        year = int(year_el.text)
    except ValueError:
        return None

    authors = [author.text for author in authors_el if author.text]
    return ManuscriptInfo(authors, year, title_el.text)
# Make file title from manuscript metadata.
def make_file_title(info: ManuscriptInfo) -> str:
    """Build a compact file title (without extension) from metadata.

    The title is made of, in order: the capitalized first letter of the
    last name of each of the first three authors, a "+" if there are more
    than three authors, the year, and the first two words of the
    manuscript title, each stripped to ASCII letters/digits and
    capitalized.

    Args:
        info: Metadata of the manuscript.

    Returns:
        The file title, e.g. "T1936OnComputable" for a single author whose
        last name starts with "T", year 1936 and a title starting with
        "On computable".
    """
    # flln ~ first letter of last name.
    def flln(author: str) -> str:
        # Remove trailing author index.
        name_parts = author.rstrip(" 0123456789").split()
        # An empty or digits-only author string has no last name to take a
        # letter from; contribute nothing instead of raising IndexError.
        if not name_parts:
            return ""
        return name_parts[-1][0].capitalize()

    # First letter of last name of up to first 3 authors.
    parts = [flln(author) for author in info.authors[:3]]

    # Append "+" if more than 3 authors.
    if len(info.authors) > 3:
        parts.append("+")

    # Append year.
    parts.append(str(info.year))

    # Append first two words of title. split() without an argument skips
    # the empty tokens that repeated spaces would otherwise produce.
    for word in info.title.split()[:2]:
        parts.append(filter_alphanumeric(word).capitalize())

    return ''.join(parts)
# Rename manuscript file based on content.
def name_pdf(fn: str, search_string: str | None = None):
    """Rename a manuscript PDF based on its looked-up metadata.

    The new name is the generated file title plus ".pdf", placed in the
    same directory as the original file.

    Args:
        fn: Path of the PDF file to rename.
        search_string: Text used to look up the metadata. When None, it is
            extracted from the PDF itself.

    Raises:
        SystemExit: With status 1 when no metadata is found, or when a
            different file already exists at the target name.
    """
    if search_string is None:
        search_string = extract_search_string(fn)

    info = find_info(search_string)
    if info is None:
        print("Could not find metadata for PDF. Try supplying a search string.")
        # SystemExit is what exit() raises, but it does not depend on the
        # helper that the site module injects for interactive use.
        raise SystemExit(1)

    title = make_file_title(info)
    base_path = path.dirname(fn)
    new_fn = path.join(base_path, title + ".pdf")

    # os.rename() silently replaces an existing file on Unix, so refuse to
    # clobber a different file that already carries the target name.
    if path.exists(new_fn) and not path.samefile(fn, new_fn):
        print(f'Cannot rename "{fn}": "{new_fn}" already exists.')
        raise SystemExit(1)

    print(f'Renaming "{fn}" to "{new_fn}"...')
    rename(fn, new_fn)
    print("Done.")
### BEGIN ASCII UTILS ###
def ascii_encodable(s: str) -> bool:
    """Tell whether *s* consists solely of ASCII characters.

    The empty string counts as encodable.
    """
    # ASCII covers exactly the code points 0..127.
    return all(ord(ch) < 128 for ch in s)
def filter_alphanumeric(s: str) -> str:
    """Return *s* with everything but ASCII letters and digits removed.

    The relative order of the kept characters is unchanged.
    """
    allowed = string.ascii_letters + string.digits
    return ''.join(ch for ch in s if ch in allowed)
### END ASCII UTILS ###
if __name__ == '__main__':
    # Command-line entry point: argv[1] is the PDF file to rename and
    # argv[2], if given, is the search string for the metadata lookup.
    if len(argv) < 2:
        print("Usage: [file_name] [search_string:optional]")
        print("Please provide a PDF file name as the first argument.")
        print("Optionally, provide a search string for finding the metadata.")
        # Raise SystemExit directly rather than calling exit(): exit() is
        # a helper added by the site module for interactive sessions and
        # is not available when site is disabled.
        raise SystemExit(1)

    fn = argv[1]
    search_string = argv[2] if len(argv) >= 3 else None
    name_pdf(fn, search_string)