-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
177 lines (149 loc) · 5.67 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import arxiv
import pymupdf
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import re
from structures import ResearchPaper, ResearchAnalysis
log_directory = 'logs'
if not os.path.exists(log_directory):
os.makedirs(log_directory)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
log_file = os.path.join(log_directory, "utils.log")
handler = logging.FileHandler(log_file)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
# Searchs for papers and returns them in a list
def search_relevent_arxiv(query: str, max_results: int = 5):
"""Searches for papers on arXiv"""
try:
# Configure the client
client = arxiv.Client(
page_size = max_results,
delay_seconds = 3.0,
num_retries = 3,
)
# Configure the search
search = arxiv.Search(
query = query,
max_results = max_results,
sort_by = arxiv.SortCriterion.Relevance,
)
# Returns a list of results (since we can call .download_pdf on them)
return list(client.results(search))
except Exception as e:
print(f"Error searching for papers: {e}")
return []
# Searchs for papers and returns them in a list
def search_new_arxiv(query: str, max_results: int = 5):
"""Searches for papers on arXiv"""
try:
# Configure the client
client = arxiv.Client(
page_size = max_results,
delay_seconds = 3.0,
num_retries = 3,
)
# Configure the search
search = arxiv.Search(
query = query,
max_results = max_results,
sort_by = arxiv.SortCriterion.SubmittedDate,
)
# Returns a list of results (since we can call .download_pdf on them)
return list(client.results(search))
except Exception as e:
print(f"Error searching for papers: {e}")
return []
def get_pdf(result, path):
return result.download_pdf(dirpath = path)
def download_papers(results, search_num = None, max_workers = 10):
papers_dir = f"./papers/"
# Creates a papers dir
if not os.path.exists(papers_dir):
os.makedirs(papers_dir)
research_papers = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_result = {executor.submit(get_pdf, result, papers_dir): result for result in results}
for future in as_completed(future_to_result):
try:
pdf_path = future.result()
logger.info(f"Downloading {future_to_result[future].title} to {pdf_path} ....")
paper = ResearchPaper(
title=future_to_result[future].title,
authors=future_to_result[future].authors,
abstract=future_to_result[future].summary,
url=future_to_result[future].entry_id,
pdf_path=pdf_path,
content=pdf_to_text(pdf_path)
)
research_papers.append(paper)
except Exception as exc:
logger.info(f"{future_to_result[future].entry_id} generated an exception: {exc}")
return research_papers
def pdf_to_text(pdf_path):
"""Converts a PDF to text"""
# Open the PDF file
pdf = pymupdf.open(pdf_path)
text = ""
# Iterate through each page and add the text to the string
for page in pdf:
text += page.get_text()
# Return the text
return text
def split_into_sections(text: str) -> List[str]:
"""Split text into sections based on common research paper headers"""
section_headers = [
"abstract", "introduction", "background", "related work",
"methodology", "method", "approach", "implementation",
"results", "evaluation", "discussion", "conclusion",
"future work", "references"
]
# Add common section number patterns
numbered_patterns = [
r'\d+\.\s+\w+', # "1. Introduction"
r'[IVX]+\.\s+\w+', # "IV. Results"
]
current_section = ""
sections = []
lines = text.split('\n')
for line in lines:
line_lower = line.lower().strip()
# Check if line is a section header
is_header = (
line_lower in section_headers or
any(re.match(pattern, line, re.IGNORECASE) for pattern in numbered_patterns)
)
if is_header:
if current_section:
sections.append(current_section)
current_section = line + "\n"
else:
current_section += line + "\n"
if current_section:
sections.append(current_section)
return sections
def create_chunks_with_sections(text: str, chunk_size: int = 4000) -> List[Tuple[str, str]]:
"""Creates chunks while preserving section context"""
sections = split_into_sections(text)
chunks = []
current_chunk = ""
current_section = ""
for section in sections:
# If adding this section would exceed chunk size
if len(current_chunk) + len(section) > chunk_size:
if current_chunk:
chunks.append((current_section, current_chunk))
current_chunk = section
current_section = section.split('\n')[0] # First line is header
else:
current_chunk += section
if not current_section:
current_section = section.split('\n')[0]
if current_chunk:
chunks.append((current_section, current_chunk))
return chunks