-
Notifications
You must be signed in to change notification settings - Fork 0
/
PDF.py
93 lines (75 loc) · 2.59 KB
/
PDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import random
from os import listdir
from re import finditer

import requests
from pypdf import PdfReader
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
def find_all_indexes(word, text):
    """Return the start offset of every match of ``word`` in ``text``.

    ``word`` is passed to ``re.finditer`` unchanged, so it is interpreted
    as a regular-expression pattern rather than a literal string. Matches
    are non-overlapping and reported in left-to-right order.

    Args:
        word: Pattern to search for.
        text: String to search in.

    Returns:
        A list of integer offsets; empty when nothing matches.
    """
    return [hit.start() for hit in finditer(word, text)]
def get_abstract(text):
    """Extract the Slovenian abstracts in ``text`` that mention floods.

    An abstract is the text between a ``POVZETEK`` heading and the next
    ``ABSTRACT`` heading that follows it. Only abstracts containing the
    substring ``poplav`` (case-sensitive; the stem of the Slovenian word
    for flood) are kept.

    Markers are paired by position, not by ordinal: a stray ``ABSTRACT``
    before the first ``POVZETEK``, or an unequal number of the two
    markers, no longer shifts every later pair out of alignment.

    Args:
        text: Full extracted text of one journal issue.

    Returns:
        A list of abstract strings with trailing whitespace stripped, in
        document order. Empty when no matching abstract is found.
    """
    start_marker = 'POVZETEK'
    end_marker = 'ABSTRACT'
    keyword = 'poplav'

    abstracts = []
    position = 0
    while True:
        start = text.find(start_marker, position)
        if start == -1:
            break
        end = text.find(end_marker, start + len(start_marker))
        if end == -1:
            # A start marker with no closing marker: nothing left to pair.
            break
        # If several start markers precede this end marker, use the last
        # one so the span is the tightest POVZETEK ... ABSTRACT pair.
        start = text.rfind(start_marker, start, end)
        abstract = text[start + len(start_marker):end].rstrip()
        if keyword in abstract:
            abstracts.append(abstract)
        position = end + len(end_marker)
    return abstracts
def create_txt(abstracts, prefix):
    """Write each abstract to its own UTF-8 text file under ``output/``.

    Files are named ``output/{prefix}_{rand}_{index}.txt``. ``rand`` is
    drawn once per call and ``index`` is the abstract's position in
    ``abstracts``. The index keeps abstracts from the same call from
    overwriting one another, which a shared random number alone would
    not. Two separate calls can still draw the same ``rand`` and reuse
    names.

    Args:
        abstracts: Iterable of abstract strings to save.
        prefix: Leading part of every generated filename.
    """
    # Create the target directory so a fresh checkout does not fail.
    os.makedirs('output', exist_ok=True)
    rand = random.randint(0, 10000)
    for index, abstract in enumerate(abstracts):
        filename = f'output/{prefix}_{rand}_{index}.txt'
        print(f'\tsaving abstract to {filename}')
        # Binary mode writes the UTF-8 bytes with no newline translation.
        with open(filename, 'wb') as f:
            f.write(abstract.encode('utf-8'))
def get_inputs():
    """Return the paths of the PDF files in ``slovenski_vodar/``.

    Only entries whose name ends in ``.pdf`` (case-insensitive) are
    returned, so stray files in the directory are not handed to the PDF
    reader. Paths are sorted by filename so runs are reproducible;
    ``listdir`` itself returns entries in arbitrary order.

    Returns:
        A list of ``slovenski_vodar/<name>`` path strings.

    Raises:
        FileNotFoundError: If the ``slovenski_vodar`` directory is missing.
    """
    return [
        f'slovenski_vodar/{name}'
        for name in sorted(listdir('slovenski_vodar'))
        if name.lower().endswith('.pdf')
    ]
def get_driver():
    """Start a Firefox WebDriver session and return it.

    The geckodriver executable path is obtained from
    ``GeckoDriverManager().install()``. The returned driver has an
    implicit wait of 3 seconds configured.

    Returns:
        The configured ``Firefox`` driver instance.
    """
    gecko_path = GeckoDriverManager().install()
    service = Service(executable_path=gecko_path)
    browser = Firefox(service=service)
    browser.implicitly_wait(3)
    return browser
def download(url, local_path, timeout=30):
    """Download ``url`` and save the response body to ``local_path``.

    Args:
        url: Address of the resource to fetch.
        local_path: Destination file; overwritten if it already exists.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout an unresponsive server would block forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than save an HTML error page as the target file.
    response.raise_for_status()
    with open(local_path, 'wb') as f:
        f.write(response.content)
def scrape():
    """Download every file linked from the Slovenski vodar listing page.

    Opens the listing page in Firefox, reads the ``href`` of the ``<a>``
    inside each element with class ``filename``, and saves each target
    into ``slovenski_vodar/`` under the last path segment of its URL.

    The browser is used only to collect the links and is always closed,
    even when the page fails to load. The downloads go through
    ``download`` and do not need the browser.
    """
    sv_url = "http://www.drustvo-vodarjev.si/slovenski-vodar/"
    driver = get_driver()
    try:
        driver.get(sv_url)
        # Read every href while the session is alive; the WebElements
        # become unusable once the driver has quit.
        urls = [
            link.find_element(By.TAG_NAME, 'a').get_property('href')
            for link in driver.find_elements(By.CLASS_NAME, 'filename')
        ]
    finally:
        # Always close the browser so no Firefox process is left behind.
        driver.quit()

    # Create the download directory so the first run does not fail.
    os.makedirs('slovenski_vodar', exist_ok=True)
    for i, url in enumerate(urls):
        filename = url.split('/')[-1]
        print(f'i={i} Trying to save {url} to filename {filename}')
        download(url, f'slovenski_vodar/{filename}')
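# One-off step: uncomment to download the journal PDFs into slovenski_vodar/
# before running the extraction below.
#scrape()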
def main():
    """Extract flood-related abstracts from every journal PDF on disk.

    For each path returned by ``get_inputs``, the text of all pages is
    concatenated (one newline after each page), passed to
    ``get_abstract``, and the resulting abstracts are saved by
    ``create_txt`` with the ``SV`` filename prefix.
    """
    for journal in get_inputs():
        reader = PdfReader(journal)
        # A single join avoids re-copying the growing string per page.
        text = "".join(page.extract_text() + "\n" for page in reader.pages)
        print(f'extracting abstracts from {journal}')
        abstracts = get_abstract(text)
        create_txt(abstracts, "SV")


if __name__ == '__main__':
    main()