-
Notifications
You must be signed in to change notification settings - Fork 0
/
filtering_pdfs.py
96 lines (79 loc) · 3.75 KB
/
filtering_pdfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import re

import fitz  # PyMuPDF
import pytesseract
# Input and output directories: source PDFs are listed from input_dir and the
# filtered copies are saved under output_dir.
input_dir = "data/pdfs"
output_dir = "data/modified_pdfs"
# Create the output directory at module load; exist_ok=True makes this a
# no-op when the directory is already there.
os.makedirs(output_dir, exist_ok=True)
def page_has_image(page):
    """Return True when *page* reports at least one image, else False.

    The image listing is obtained from ``page.get_images(full=True)``; only
    the number of entries matters here, not their contents.
    """
    image_count = len(page.get_images(full=True))
    return image_count != 0
# Lower-case time vocabulary, matched case-insensitively as substrings of a
# figure caption (so "year" also catches "yearly", "time" catches "timeline").
_TIME_KEYWORDS = (
    "year", "years", "quarter", "quarters", "month", "months",
    "week", "weeks", "weekday", "weekend",
    "day", "days", "date", "dates",
    "hour", "hours", "minute", "minutes", "second", "seconds",
    "time", "timestamp", "timezone",
    "annual", "quarterly", "monthly", "weekly", "daily", "hourly",
    "decade", "century", "millennium",
    "season", "spring", "summer", "autumn", "fall", "winter",
    "fiscal", "calendar",
    "period", "interval", "duration", "span",
    "epoch", "era",
    "noon", "midnight",
    "fortnight", "semester", "trimester",
)

# Capitalised tokens (time-zone/meridiem markers and month names). These are
# matched case-sensitively and as whole words: lower-casing them would turn
# "AM", "May" or "Mar" into substrings of ordinary words ("example", "may",
# "margin") and make the filter match almost every caption.
_TIME_PROPER_NOUNS = (
    "UTC", "GMT", "AM", "PM",
    # Names of months
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
    # Abbreviated month names
    "Jan", "Feb", "Mar", "Apr", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)
_TIME_PROPER_NOUN_RE = re.compile(r"\b(?:" + "|".join(_TIME_PROPER_NOUNS) + r")\b")


def figure_has_time_series_keywords(page_text):
    """Find the first figure caption on a page that mentions a time keyword.

    A caption is any line of *page_text* that, once surrounding whitespace is
    removed, starts with "figure" (case-insensitive).

    Args:
        page_text: Plain text of one page, with lines separated by "\\n".

    Returns:
        ``(True, caption)`` for the first matching caption (stripped), or
        ``(False, "")`` when no caption on the page matches.
    """
    for line in page_text.split("\n"):
        # Strip before testing the prefix so indented captions are not missed.
        desc = line.strip()
        lowered = desc.lower()
        if not lowered.startswith("figure"):
            continue
        if any(keyword in lowered for keyword in _TIME_KEYWORDS):
            return True, desc
        # The capitalised keywords must be searched in the original-case text;
        # comparing them against the lower-cased caption can never succeed.
        if _TIME_PROPER_NOUN_RE.search(desc):
            return True, desc
    return False, ""
def _trim_pdf(input_path, output_path, filename):
    """Write the qualifying pages of one PDF to *output_path*, if any exist.

    A page qualifies when it contains at least one image and one of its
    "Figure..." lines mentions a time keyword. Exceptions raised while opening
    or saving propagate to the caller.
    """
    # Context managers close both documents even if a page fails midway;
    # previously an exception skipped the close() calls and leaked the handles.
    with fitz.open(input_path) as doc, fitz.open() as new_doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            # Cheap image check first: skip text extraction for image-less pages.
            if not page_has_image(page):
                continue
            has_keywords, description = figure_has_time_series_keywords(
                page.get_text("text")
            )
            if not has_keywords:
                continue
            new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
            print(f"File: {filename}, Page: {page_num + 1}, Keywords in Description: {description}")

        # Only write an output file when at least one page was kept.
        if len(new_doc) > 0:
            new_doc.save(output_path)
            print(f"Trimmed and saved: {output_path}")
        else:
            print(f"No relevant content found in: {input_path}")


def process_pdfs():
    """Filter every PDF in ``input_dir`` into ``output_dir``.

    For each file whose name ends in ".pdf" (compared case-insensitively),
    keep only the pages that contain an image and a figure caption with a
    time keyword, and save them under the same filename in ``output_dir``.
    Files already present in ``output_dir`` are skipped. Failures are printed
    per file and do not stop the remaining files from being processed.
    """
    for filename in os.listdir(input_dir):
        if not filename.lower().endswith(".pdf"):
            continue

        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)

        # Never overwrite a previous result; this makes reruns resumable.
        if os.path.exists(output_path):
            print(f"File already exists in output folder: {output_path}")
            continue

        try:
            _trim_pdf(input_path, output_path, filename)
        except fitz.EmptyFileError:
            print(f"Skipping empty file: {input_path}")
        except Exception as e:
            # Deliberate best-effort boundary: report and move on to the next file.
            print(f"Error processing {input_path}: {str(e)}")

    print("All PDFs processed and saved in modified_pdfs.")
# Run the process. These statements sit at module level with no __main__
# guard, so they execute whenever this file is run or imported.
process_pdfs()
print("All PDFs processed.")