forked from miron/NeonCore
-
Notifications
You must be signed in to change notification settings - Fork 0
/
token_sentence.py
33 lines (32 loc) · 1.43 KB
/
token_sentence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import re
import nltk
from PyPDF2 import PdfReader
nltk.download('punkt')
# Open the PDF file in read-only mode
with open('RTG-CPR-EasyModev1.1.pdf', 'rb') as file:
# Create a PDF object
pdf = PdfReader(file)
# Get the number of pages in the PDF
num_pages = len(pdf.pages)
# Create an empty list to store the sentences
sentences = []
# Iterate over the pages and extract the text
for i in range(num_pages):
page = pdf.getPage(i)
text = page.extractText()
# Remove leading and trailing white space and newline characters
text = text.strip()
# Replace newline characters with a single space
text = text.replace('\n', ' ')
# Replace hyphens and spaces following them with an empty string
text = re.sub(r'(\s+-\s+)|(-\s+)|(\s+-)', '', text)
# Tokenize the text into sentences
page_sentences = nltk.sent_tokenize(text)
# Split the sentences into a list of words
page_sentences = [sentence.split() for sentence in page_sentences]
# Remove punctuation using regular expressions
page_sentences = [[re.sub(r'[^\w\s\’\']', '', word) for word in sentence] for sentence in page_sentences]
# Join the words into sentences while inserting a single space character between each word
page_sentences = [' '.join(sentence) for sentence in page_sentences]
sentences.extend(page_sentences)
print(sentences)