-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
62 lines (48 loc) · 2.1 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import json
import os
import logging
from extractors import pypdf2_extractor, pdfminer_extractor, pymupdf_extractor, pdfplumber_extractor
from helpers import utils
def get_extractor(library_name):
    """Return the text-extraction callable registered for ``library_name``.

    Args:
        library_name: Key identifying the extraction backend; the recognised
            keys are 'pypdf2', 'pdfminer', 'pymupdf' and 'pdfplumber'.

    Returns:
        The ``extract_text`` attribute of the matching extractor module, or
        ``None`` when ``library_name`` is not one of the recognised keys.
    """
    # Lookup is an exact (case-sensitive) match on the key.
    registry = {
        'pypdf2': pypdf2_extractor.extract_text,
        'pdfminer': pdfminer_extractor.extract_text,
        'pymupdf': pymupdf_extractor.extract_text,
        'pdfplumber': pdfplumber_extractor.extract_text,
    }
    if library_name in registry:
        return registry[library_name]
    return None
def process_file(file_path, libraries, output_dir):
    """Extract text from one file with each requested library and save it.

    For every name in ``libraries`` the extractor returned by
    ``get_extractor`` is called with ``file_path``; its result is handed to
    ``utils.save_text_to_file`` together with the target path
    ``<output_dir>/<file stem>_<library>.txt``. A failure in one library is
    logged and does not prevent the remaining libraries from running.

    Args:
        file_path: Path of the input file to extract text from.
        libraries: Iterable of library names understood by ``get_extractor``.
        output_dir: Directory in which the per-library text files are placed.

    Returns:
        None.
    """
    # splitext drops only the final extension, so "report.v2.pdf" keeps the
    # stem "report.v2" and cannot collide with "report.v1.pdf".
    stem = os.path.splitext(os.path.basename(file_path))[0]

    for library in libraries:
        extractor = get_extractor(library)
        if extractor is None:
            # Surface misconfigured library names instead of skipping silently.
            logging.warning(
                "Unknown extraction library %r; skipping it for %s",
                library,
                file_path,
            )
            continue

        try:
            extracted_text = extractor(file_path)
            output_file = os.path.join(output_dir, f"{stem}_{library}.txt")
            utils.save_text_to_file(extracted_text, output_file)
        except Exception as e:
            # Best-effort batch processing: record the failure (with
            # traceback) and carry on with the next library.
            logging.exception(
                "Error processing %s with %s: %s", file_path, library, e
            )
def create_output_dir(base_dir, input_path, is_single_file):
    """Create (if needed) and return the output directory for an input path.

    The directory is ``<base_dir>/<name>``, where ``<name>`` is the last
    component of ``input_path``; for a single file the final extension is
    removed from that component.

    Args:
        base_dir: Directory under which the output directory is created.
        input_path: Path of the input file or input directory.
        is_single_file: True when ``input_path`` names one file rather than
            a directory.

    Returns:
        The path of the output directory, which exists when this returns.

    Raises:
        OSError: If the directory cannot be created.
    """
    # normpath removes trailing separators, so "pdfs/" yields "pdfs" rather
    # than an empty basename that would collapse the result to base_dir.
    name = os.path.basename(os.path.normpath(input_path))
    if is_single_file:
        # splitext strips only the final extension, so "report.v2.pdf" maps
        # to "report.v2" instead of being truncated at the first dot.
        name = os.path.splitext(name)[0]

    output_dir = os.path.join(base_dir, name)
    os.makedirs(output_dir, exist_ok=True)
    return output_dir
def main(config_path="json/config.json"):
    """Run text extraction as described by a JSON configuration file.

    The configuration is a JSON object with these keys:
        "input_path": a single ``.pdf`` file or a directory of them (required).
        "libraries": names of the extraction libraries to run (required).
        "output_path": base output directory (default "./output").
        "log_level": logging level (default "INFO").

    Args:
        config_path: Location of the JSON configuration file. Defaults to
            "json/config.json", resolved against the current directory.

    Returns:
        None.

    Raises:
        OSError: If the configuration file cannot be opened.
        json.JSONDecodeError: If the configuration is not valid JSON.
        KeyError: If "input_path" or "libraries" is missing.
    """
    with open(config_path, encoding="utf-8") as config_file:
        config = json.load(config_file)

    log_level = config.get("log_level", "INFO")
    if isinstance(log_level, str):
        # logging only recognises upper-case level names; accept "info" too.
        log_level = log_level.upper()
    logging.basicConfig(level=log_level)

    input_path = config["input_path"]
    libraries = config["libraries"]
    base_output_dir = config.get("output_path", "./output")

    # Resolve the list of PDFs first so that an invalid input path does not
    # leave an empty output directory behind.
    if os.path.isfile(input_path) and input_path.lower().endswith('.pdf'):
        is_single_file = True
        pdf_paths = [input_path]
    elif os.path.isdir(input_path):
        is_single_file = False
        # Sorted for a reproducible processing order; extension match is
        # case-insensitive so "SCAN.PDF" is picked up as well.
        pdf_paths = [
            os.path.join(input_path, filename)
            for filename in sorted(os.listdir(input_path))
            if filename.lower().endswith('.pdf')
        ]
    else:
        logging.error("Invalid input path")
        return

    output_dir = create_output_dir(base_output_dir, input_path, is_single_file)
    for pdf_path in pdf_paths:
        process_file(pdf_path, libraries, output_dir)
# Run the extraction pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()