-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_pdf.py
61 lines (53 loc) · 2.04 KB
/
import_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import requests
# Download the PDF to `pdf_path` unless a file already exists at that path.
# NOTE(review): `pdf_path` is not assigned anywhere above this point in the
# file, so unless the executing namespace already provides it, the NameError
# branch below is what runs -- confirm how this script is meant to be invoked.
try:
    if not os.path.exists(pdf_path):
        print("[INFO] File doesn't exist, downloading...")
        # Ask the user for the URL of the PDF.
        url = input("Give url for the PDF")
        # The local filename of the downloaded file.
        filename = pdf_path
        # A timeout keeps a stalled server from hanging the script forever.
        response = requests.get(url, timeout=30)
        # Only a 200 response is treated as a successful download.
        if response.status_code == 200:
            # Binary mode: the response body is written to disk unchanged.
            with open(filename, "wb") as file:
                file.write(response.content)
            print(f"[INFO] The file has been downloaded and saved as {filename}")
        else:
            print(f"[INFO] Failed to download the file. Status code: {response.status_code}")
    else:
        print(f"File {pdf_path} exists.")
except NameError as err:
    # Report the actual message (which name is missing), not the exception class.
    print(f"NameError: {err}")
except requests.RequestException as err:
    # Connection problems, invalid URLs and timeouts are reported, not raised.
    print(f"[INFO] Failed to download the file. Error: {err}")
if __name__ != '__main__':
    print("Not being executed on main.py")
print('done')
# Now open it
import pypdfium2 as fitz
from tqdm.auto import tqdm  # pip install tqdm; provides the progress bar
def text_formatter(text: str) -> str:
    """Perform minor formatting on text.

    Newlines become single spaces, carriage returns and forward slashes are
    removed, and leading/trailing whitespace is stripped from the result.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    cleaned_text = text.replace("\n", " ")
    cleaned_text = cleaned_text.replace("\r", "")
    cleaned_text = cleaned_text.replace("/", "")
    # Potentially more text processing here. The better formatted the text
    # passed to an LLM, the better the responses will be.
    # Strip last: removing "\r" and "/" can expose whitespace at either end
    # that an earlier strip would have missed.
    return cleaned_text.strip()
def open_and_read_pdf(pdf_path: str, page_offset: int = 41) -> list[dict]:
    """Extract the text of every page of a PDF together with simple statistics.

    Args:
        pdf_path: Path of the PDF file to open.
        page_offset: Value subtracted from the zero-based page index to produce
            the reported ``page_number``. The default of 41 keeps the shift
            that used to be hard-coded here (presumably the number of
            front-matter pages of one specific document -- confirm before
            reusing with another PDF). Pass 0 to report raw zero-based indices.

    Returns:
        One dict per page, in document order, with the keys ``page_number``,
        ``page_char_count``, ``page_word_count``, ``page_sentence_count_raw``,
        ``page_token_count`` and ``text``.
    """
    doc = fitz.PdfDocument(pdf_path)
    pages_and_text = []
    try:
        # enumerate() has no length, so pass the page count explicitly to let
        # the progress bar show a percentage.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text_page = page.get_textpage()
            try:
                text = text_page.get_text_bounded(
                    left=None, bottom=None, right=None, top=None, errors='ignore'
                )
            finally:
                # Release the handles as soon as the text has been copied out;
                # the text page is closed before the page it belongs to.
                text_page.close()
                page.close()
            text = text_formatter(text=text)
            pages_and_text.append({
                "page_number": page_number - page_offset,
                "page_char_count": len(text),
                # split() without an argument ignores repeated spaces and
                # yields 0 words for an empty page.
                "page_word_count": len(text.split()),
                # Raw heuristic: counts the pieces separated by ". ".
                "page_sentence_count_raw": len(text.split(". ")),
                "page_token_count": len(text) / 4,  # 1 token = ~4 characters
                "text": text,
            })
    finally:
        doc.close()
    return pages_and_text