dataset.py
from pathlib import Path
from typing import List, Tuple
import re

import nltk
import pdfplumber
import requests
import torch
from bs4 import BeautifulSoup
from torch.utils.data import Dataset
from transformers import RagTokenizer


def filter_sentence(sentence: str) -> bool:
    """Returns True if a sentence is worth keeping, False if it looks like noise."""
    patterns = [
        r"https?://\S+",  # URLs
        r"ISBN\s*\d+",  # ISBNs
        r"OCLC\s*\d+",  # OCLC numbers
    ]
    for pattern in patterns:
        if re.search(pattern, sentence):
            return False
    # Drop very short fragments and leftover citation templates.
    if len(sentence) < 5 or sentence.strip().startswith("{{cite web}}"):
        return False
    # Mostly-symbolic lines (tables, page furniture) have few alphanumeric characters.
    alphanumeric_ratio = sum(c.isalnum() for c in sentence) / len(sentence)
    min_alphanumeric_ratio = 0.5
    if alphanumeric_ratio < min_alphanumeric_ratio:
        return False
    # Lines that are mostly digits are usually reference numbers or page furniture.
    numeric_ratio = sum(c.isdigit() for c in sentence) / len(sentence)
    max_numeric_ratio = 0.6
    if numeric_ratio > max_numeric_ratio:
        return False
    return True
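
# A quick illustration of the filter (not part of the original module; the example
# strings below are made up): sentences containing URLs are rejected, ordinary prose
# passes.
# >>> filter_sentence("See https://example.com for details")
# False
# >>> filter_sentence("The bridge was completed in 1932 after years of planning.")
# True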


def pdf_to_sentence_chunks(pdf_path: Path) -> List[str]:
    """Parses a PDF file into overlapping chunks of filtered sentences."""
    nltk.download("punkt")  # sentence tokenizer model, downloaded once and cached
    text_chunks = []
    removed = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text().replace("\n", " ")
            potential_sentences = nltk.sent_tokenize(text)
            for potential_sentence in potential_sentences:
                for sentence in potential_sentence.split(". "):
                    if filter_sentence(sentence):
                        text_chunks.append(sentence)
                    else:
                        removed.append(sentence)
    # Join consecutive sentences into overlapping windows so each chunk carries context.
    moving_window_chunks = []
    window_size = 3
    for i in range(len(text_chunks) - window_size):
        moving_window_chunks.append(" ".join(text_chunks[i : i + window_size]))
    return moving_window_chunks
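
# Illustrative note (not in the original file): with window_size = 3 and a stride of 1,
# ["s1", "s2", "s3", "s4", "s5"] yields ["s1 s2 s3", "s2 s3 s4"]; the loop stops
# window_size short of the end, so the final window "s3 s4 s5" is not produced.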


# Initial setup, using above functions
class PDFDataset(Dataset):
    """Torch Dataset of PDF sentence chunks, tokenized for a RAG model."""

    def __init__(self, pdf_paths: List[Path], pretrained_model_name: str):
        self.tokenizer = RagTokenizer.from_pretrained(pretrained_model_name)
        sentences = []
        for path in pdf_paths:
            sentences.extend(pdf_to_sentence_chunks(path))
        self.sentences = sentences

    def __len__(self) -> int:
        return len(self.sentences)

    def __getitem__(self, idx: int) -> Tuple[str, torch.Tensor]:
        sentence = self.sentences[idx]
        # Note: the tokenizer returns a BatchEncoding (input_ids, attention_mask),
        # not a dense embedding, despite the variable name.
        embedding = self.tokenizer(
            sentence,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=128,
        )
        return sentence, embedding
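
# Usage sketch (not part of the original file; the PDF path and model name are
# hypothetical placeholders): each item pairs a chunk with its tokenized form.
#
#     dataset = PDFDataset([Path("docs/report.pdf")], "facebook/rag-token-base")
#     sentence, encoding = dataset[0]
#     print(sentence)
#     print(encoding["input_ids"].shape)  # (1, 128) given max_length=128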


# Used for Doc2Vec
class PDFTextDataset(Dataset):
    """Torch Dataset yielding raw PDF sentence chunks (no tokenization)."""

    def __init__(self, pdf_paths: List[Path]):
        sentences = []
        for path in pdf_paths:
            sentences.extend(pdf_to_sentence_chunks(path))
        self.sentences = sentences

    def __len__(self) -> int:
        return len(self.sentences)

    def __getitem__(self, idx: int) -> str:
        sentence = self.sentences[idx]
        return sentence


# Improved Wiki setup
class WikiDataset(Dataset):
    """Torch Dataset of overlapping sentence chunks scraped from Wikipedia pages."""

    def __init__(self, wiki_page_names: List[str]):
        text = []
        for page_title in wiki_page_names:
            url = f"https://en.wikipedia.org/wiki/{page_title}"
            response = requests.get(url)
            soup = BeautifulSoup(response.content, "html.parser")
            content_div = soup.find("div", id="mw-content-text")
            extracted_text = ""
            for element in content_div.descendants:
                if isinstance(element, str):
                    # Stop once the References section is reached.
                    if "References" in element and "[edit]" in element:
                        break
                    # Skip inline CSS emitted by the MediaWiki parser.
                    if ".mw-parser-output" in element:
                        continue
                    extracted_text += element.strip() + " "
                elif element.name == "li":
                    extracted_text += "\n- " + element.get_text(
                        separator=" ", strip=True
                    )
            text.extend(extracted_text.strip().split(". "))
        # Same overlapping-window chunking as pdf_to_sentence_chunks.
        moving_window_chunks = []
        window_size = 3
        for i in range(len(text) - window_size):
            moving_window_chunks.append(" ".join(text[i : i + window_size]))
        self.sentences = moving_window_chunks

    def __len__(self) -> int:
        return len(self.sentences)

    def __getitem__(self, idx: int) -> str:
        sentence = self.sentences[idx]
        return sentence
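

# A minimal, hedged usage example (not part of the original module). The page title
# below is arbitrary; any valid English Wikipedia title would work. Wrapping the
# dataset in a DataLoader batches the raw string chunks for downstream use
# (e.g. embedding or building a retrieval index).
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    wiki_dataset = WikiDataset(["Python_(programming_language)"])
    loader = DataLoader(wiki_dataset, batch_size=4, shuffle=False)
    for batch in loader:
        # Each batch is a list of up to 4 overlapping sentence chunks.
        print(batch)
        break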