UI Modification: Added Expandable Text
Merge branch 'dev' of github.com:akashe/arxiv_hunter into dev
subratamondal1 committed Feb 21, 2024
2 parents 327aa78 + 77e6897 commit 7c5da19
Showing 1 changed file with 105 additions and 101 deletions.
206 changes: 105 additions & 101 deletions src/logics/data_extraction.py
"""Data Extraction from arxiv api"""

import os.path
import json
import pdb

from tqdm import tqdm
import argparse
from datetime import datetime
from typing import Dict, List
import multiprocessing

import requests
import feedparser
from tqdm import tqdm
import fitz # this is pymupdf
import pandas as pd
from datetime import datetime, timedelta
import fitz # this is pymupdf
from typing import Dict, List, Tuple

STANDARD_SEARCH_QUERY: str = (
"cat:cs.CV OR cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR cat:cs.NE OR cat:stat.ML OR cat:cs.IR"
)
STANDARD_SEARCH_QUERY = f"cat:cs.CV OR cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR cat:cs.NE OR cat:stat.ML OR cat:cs.IR"


class ArxivParser:
"""Extract & Parse data from the Arxiv API"""
def download_data(local_entries, split_no, save_path, days):
print("Downloading")
downloaded_data: Dict[str, Dict[str, str]] = {}
# Loop through the entries
for entry in tqdm(local_entries):
try:
published_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
current_date = datetime.now()
date_diff = (current_date - published_date).days

# Check if the date difference is less than or equal to the days parameter
if date_diff <= days:
id = entry.id
title = entry.title
link = entry.link
summary = entry.summary

# Get the pdf link by replacing the "abs" with "pdf" in the link
pdf_link = link.replace("abs", "pdf")
# Get the pdf content by sending a GET request to the pdf link and opening it with fitz
pdf_content = requests.get(pdf_link).content
pdf_file = fitz.open(stream=pdf_content, filetype="pdf")
# Extract the text from the pdf file
pdf_text = ""
for page in pdf_file:
pdf_text += page.get_text()
# Store the extracted data in the dictionary with the id as the key
downloaded_data[id] = {
"title": title,
"published_date": published_date,
"pdf_link": pdf_link,
"summary": summary,
"pdf_text": pdf_text
}
except Exception as e:
print("Failed for an entry")
print(e)
continue
# Convert the extracted data into a pandas dataframe
save_location = os.path.join(save_path, f"split_{split_no}.pkl")
pd.DataFrame.from_dict(downloaded_data, orient="index").to_pickle(save_location)
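
# Note: download_data sits at module level, which lets multiprocessing pickle it as a
# Process target on spawn-based platforms (e.g. the Windows/macOS defaults); each worker
# writes its own split_<i>.pkl into the data directory.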

base_url = "http://export.arxiv.org/api/query"

def __init__(self, data_path="../data/"):
self.extracted_data: List[Dict[str, str]] = (
[]
)
class ArxivParser:
base_url = "http://export.arxiv.org/api/query"
def __init__(self, data_path = "data/"):
self.extracted_data: pd.DataFrame = pd.DataFrame()

if not os.path.exists(data_path):
os.makedirs(data_path)
self.data_path = data_path

    def get_results(
        self,
        max_results: int = 5,
        days: int = 60,
        search_query: str = STANDARD_SEARCH_QUERY,
        num_threads: int = 8,
    ) -> pd.DataFrame:
        """Get results from the Arxiv API and download the matching papers in parallel"""
        # Construct the url with the query parameters
        params = {
            "search_query": search_query,
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        url = self.base_url + "?" + requests.compat.urlencode(params)
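        # With the defaults above, the encoded request is roughly of the form:
        #   http://export.arxiv.org/api/query?search_query=cat%3Acs.CV+OR+cat%3Acs.AI+...&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending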


        # Send a GET request to the api endpoint and parse the returned Atom feed
        response = requests.get(url, timeout=15)
        entries = feedparser.parse(response.text).entries

        # Split the entries into num_threads roughly equal chunks, one per worker process
        splits = []
        for i in range(num_threads):
            splits.append(entries[i * (len(entries) // num_threads):(i + 1) * (len(entries) // num_threads)])

        # Put any leftover entries into the last split
        if len(entries) % num_threads != 0:
            mod = len(entries) % num_threads
            splits[-1].extend(entries[-mod:])
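        # For example, 10 entries with num_threads=4 give chunks of size 2, 2, 2 and 2,
        # and the 2 leftover entries are appended to the last split (sizes 2, 2, 2, 4).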


        # Launch one download process per split; each process pickles its own results
        process_list = []
        for i, split in enumerate(splits):
            p = multiprocessing.Process(target=download_data, args=[split, i, self.data_path, days])
            p.start()
            process_list.append(p)

        # Wait for all the download processes to finish
        for process in process_list:
            process.join()

        # Combine all the downloaded splits into a single dataframe
        dfs = []
        for i in range(len(splits)):
            save_path = os.path.join(self.data_path, f"split_{i}.pkl")
            df = pd.read_pickle(save_path)
            dfs.append(df)

        return pd.concat(dfs, ignore_index=True)

    def store_data(self, save_file_name: str = "master_data.pkl",
                   max_results: int = 10,
                   days: int = 60) -> None:
        """Store the extracted data as a pickled dataframe"""
        # Call get_results and keep the resulting dataframe on the instance
        self.extracted_data = self.get_results(max_results, days)

        assert len(self.extracted_data) > 0, "Got no results with the search query"

        # Feature engineer two new columns with the text lengths
        self.extracted_data["summary_length"] = self.extracted_data["summary"].apply(len)
        self.extracted_data["pdf_text_length"] = self.extracted_data["pdf_text"].apply(len)

        save_location = os.path.join(self.data_path, save_file_name)
        self.extracted_data.to_pickle(save_location)
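        # The pickled master dataframe has the columns title, published_date, pdf_link,
        # summary, pdf_text, summary_length and pdf_text_length; the arXiv entry id is
        # dropped because get_results concatenates the splits with ignore_index=True.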

    def get_stored_data(self) -> pd.DataFrame:
        """Return the self.extracted_data attribute"""
        assert len(self.extracted_data) != 0, "Please store data first"
        return self.extracted_data


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="A recommender system based on tf-idf and cosine similarity"
    )

    # Add arguments for the query
    parser.add_argument("-m", "--max_results", type=int, help="Maximum results to store", nargs="?", default=50)
    parser.add_argument("-d", "--days", type=int, help="Store only for these many past days", nargs="?", default=50)

    # Parse the arguments
    args = parser.parse_args()

    max_results = args.max_results
    days = args.days

    # Initialize the arxiv parser
    arxiv_parser = ArxivParser()

    # Store the past data
    arxiv_parser.store_data(max_results=max_results, days=days)
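
A minimal usage sketch (assuming the script is run from the repository root, so the default
data_path="data/" and the master_data.pkl file resolve there):

    python src/logics/data_extraction.py -m 20 -d 30

    # afterwards, load the stored dataframe
    import pandas as pd
    df = pd.read_pickle("data/master_data.pkl")
    print(df[["title", "summary_length", "pdf_text_length"]].head())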
