-
Notifications
You must be signed in to change notification settings - Fork 0
/
BookINDEX_find&replaceKEYWORDS.py
108 lines (87 loc) · 3.89 KB
/
BookINDEX_find&replaceKEYWORDS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Script for creating a book index
# get INDEX words from CSV file, find page numbers in PDF, replace synonymous words and combine page numbers
# workflow documentation: https://insulae.hypotheses.org/307
# import packages
import csv
import pandas as pd
from pandas import DataFrame
import numpy as np
import pdftotext
import os
from collections import defaultdict
# location of the mapping file (original keywords and the final index word each maps to)
# sample file: https://github.com/MonikaBarget/DigitalHistory/blob/master/BRILL_INDEX_12lines.csv
CSV_FILE = 'C:\\#######\\BRILL_keywords.csv'

# load the semicolon-separated mapping table; bytes that are not valid UTF-8 are dropped
with open(CSV_FILE, encoding="utf-8", errors="ignore") as f:
    data = pd.read_csv(f, sep=";")

# the INDEX words to search for are the entries of the WORD column
index_words = words = data['WORD'].values
print(len(words))

# pages containing only bibliographies and endnotes are excluded from the search
excluded_pages = [
    *range(14, 18),
    35,
    *range(51, 54),
    *range(71, 74),
    *range(89, 94),
    111,
    114,
    *range(158, 164),
    *range(188, 198),
    216,
    218,
    *range(236, 240),
    *range(257, 261),
    *range(284, 289),
    *range(308, 315),
]
# open PDF file
def extract_information(filename):
    """Build the book index for one PDF and write it to the output files.

    Every keyword in the module-level ``index_words`` is searched for page by
    page; pages listed in the module-level ``excluded_pages`` are skipped but
    still counted, so reported numbers are 1-based positions in the PDF.
    Each keyword is then replaced by its final index word, taken from the
    ``MAP TO`` column of the module-level ``data`` table, and the pages of
    all keywords that map to the same index word are combined.

    The index (page numbers de-duplicated and sorted) is written as the
    ``str()`` of a dictionary to ``BRILL_index.txt`` and as one
    ``word, pages`` row per index word to ``BRILL_index.csv``.  Both files
    are overwritten on every call.

    Args:
        filename: Path of the PDF file to scan.

    Returns:
        None.
    """
    skipped = set(excluded_pages)  # set membership instead of a list scan per page

    with open(filename, 'rb') as f:
        pdf = pdftotext.PDF(f)
        print("The document has", len(pdf), "pages.")
        # Collect the text of every relevant page once, while the file is
        # still open, instead of iterating the PDF again for each keyword.
        pages = [
            (number, text)
            for number, text in enumerate(pdf, start=1)
            if number not in skipped
        ]

    frame = DataFrame(data)  # built once; it does not change between keywords
    page_dict = {}
    for word in index_words:
        if pd.isna(word):
            # An empty WORD cell cannot be searched for in the page text.
            print("SKIPPING EMPTY KEYWORD")
            continue
        print("TRYING TO FIND", word, ":")
        found = _pages_containing(word, pages)
        final_word = _final_index_word(frame, word)
        # Merge (never overwrite) so synonyms mapped to one index word share
        # a single entry.
        page_dict.setdefault(final_word, []).extend(found)
        print(page_dict)

    # Merging synonyms can repeat page numbers: de-duplicate and sort.
    index = {key: sorted(set(numbers)) for key, numbers in page_dict.items()}

    # write dictionary of lists to new .TXT file
    with open('C:\\####\\BRILL_index.txt', 'w', encoding="utf-8") as outfile:
        outfile.write(str(index))

    # write each dictionary entry to one row in new .CSV file;
    # newline="" stops the csv module from emitting blank rows on Windows
    with open('C:\\####\\BRILL_index.csv', 'w', encoding="utf-8", newline="") as x:
        writer = csv.writer(x)
        for key, value in index.items():
            writer.writerow([key, value])


def _pages_containing(word, pages):
    """Return the numbers of the pages whose text contains *word*."""
    # No separate test for the genitive (word + "'s") is needed: a page that
    # contains the genitive necessarily contains *word* itself, so the extra
    # test could only add the same page number a second time.
    return [number for number, text in pages if word in text]


def _final_index_word(frame, word):
    """Return the final index word *word* maps to, or *word* if unmapped."""
    try:
        matches = frame.index[frame["WORD"] == word]
        if len(matches) == 0:
            return word
        # If the keyword is listed more than once, the first row wins.
        position = matches[0]
        print("POSITION IN DATAFRAME:", position)
        mapped = frame.at[position, 'MAP TO']
    except KeyError:
        # The table has no "WORD" / "MAP TO" column: keep the original word.
        return word
    if pd.isna(mapped):
        # Empty "MAP TO" cell: keep the original word rather than a NaN key.
        return word
    print("INDEX WORD:", mapped)
    return mapped
# iterate through all PDF files in directory
if __name__ == '__main__':
    path = 'C:\\####\\BRILL_IN-FILE'
    # Sorted so the processing order is the same on every run;
    # os.listdir returns entries in arbitrary order.
    for p in sorted(os.listdir(path)):
        filename = os.path.join(path, p)
        # Sub-directories and stray non-PDF files cannot be parsed as PDFs,
        # so skip them instead of crashing.
        if not (os.path.isfile(filename) and p.lower().endswith('.pdf')):
            print("Skipping non-PDF entry:", filename)
            continue
        extract_information(filename)
    print("Done")