UI Modification: Added Expandable Text
Merge branch 'dev' of github.com:akashe/arxiv_hunter into dev
subratamondal1 committed Feb 21, 2024
2 parents 327aa78 + 77e6897 commit 7c5da19
Showing 1 changed file with 105 additions and 101 deletions.
206 changes: 105 additions & 101 deletions src/logics/data_extraction.py
"""Data Extraction from arxiv api"""

import os.path
import json
import pdb

from tqdm import tqdm
import argparse
from datetime import datetime
from typing import Dict, List
import multiprocessing

import requests
import feedparser
from tqdm import tqdm
import fitz # this is pymupdf
import pandas as pd
from datetime import datetime, timedelta
import fitz # this is pymupdf
from typing import Dict, List, Tuple

STANDARD_SEARCH_QUERY: str = (
"cat:cs.CV OR cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR cat:cs.NE OR cat:stat.ML OR cat:cs.IR"
)
STANDARD_SEARCH_QUERY = f"cat:cs.CV OR cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR cat:cs.NE OR cat:stat.ML OR cat:cs.IR"


class ArxivParser:
"""Extract & Parse data from the Arxiv API"""
def download_data(local_entries, split_no, save_path, days):
print("Downloading")
downloaded_data: Dict[str, Dict[str, str]] = {}
# Loop through the entries
for entry in tqdm(local_entries):
try:
published_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
current_date = datetime.now()
date_diff = (current_date - published_date).days

# Check if the date difference is less than or equal to the days parameter
if date_diff <= days:
id = entry.id
title = entry.title
link = entry.link
summary = entry.summary

# Get the pdf link by replacing the "abs" with "pdf" in the link
pdf_link = link.replace("abs", "pdf")
# Get the pdf content by sending a GET request to the pdf link and opening it with fitz
pdf_content = requests.get(pdf_link).content
pdf_file = fitz.open(stream=pdf_content, filetype="pdf")
# Extract the text from the pdf file
pdf_text = ""
for page in pdf_file:
pdf_text += page.get_text()
# Store the extracted data in the dictionary with the id as the key
downloaded_data[id] = {
"title": title,
"published_date": published_date,
"pdf_link": pdf_link,
"summary": summary,
"pdf_text": pdf_text
}
except Exception as e:
print("Failed for an entry")
print(e)
continue
# Convert the extracted data into a pandas dataframe
save_location = os.path.join(save_path, f"split_{split_no}.pkl")
pd.DataFrame.from_dict(downloaded_data, orient="index").to_pickle(save_location)
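
# Note: download_data sits at module level, which lets multiprocessing pickle it as a
# Process target on spawn-based platforms (e.g. the Windows/macOS defaults); each worker
# writes its own split_<i>.pkl into the data directory.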

base_url = "http://export.arxiv.org/api/query"

def __init__(self, data_path="../data/"):
self.extracted_data: List[Dict[str, str]] = (
[]
)
class ArxivParser:
base_url = "http://export.arxiv.org/api/query"
def __init__(self, data_path = "data/"):
self.extracted_data: pd.DataFrame = pd.DataFrame()

if not os.path.exists(data_path):
os.makedirs(data_path)
self.data_path = data_path

    def get_results(
        self,
        max_results: int = 5,
        days: int = 60,
        search_query: str = STANDARD_SEARCH_QUERY,
        num_threads: int = 8,
    ) -> pd.DataFrame:
        """Get results from the Arxiv API and download the matching papers in parallel"""
        # Construct the url with the query parameters
        params = {
            "search_query": search_query,
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        url = self.base_url + "?" + requests.compat.urlencode(params)
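        # With the defaults above, the encoded request is roughly of the form:
        #   http://export.arxiv.org/api/query?search_query=cat%3Acs.CV+OR+cat%3Acs.AI+...&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending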


        # Send a GET request to the api endpoint and parse the returned Atom feed
        response = requests.get(url, timeout=15)
        entries = feedparser.parse(response.text).entries

        # Split the entries into num_threads roughly equal chunks, one per worker process
        splits = []
        for i in range(num_threads):
            splits.append(entries[i * (len(entries) // num_threads):(i + 1) * (len(entries) // num_threads)])

        # Put any leftover entries into the last split
        if len(entries) % num_threads != 0:
            mod = len(entries) % num_threads
            splits[-1].extend(entries[-mod:])
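        # For example, 10 entries with num_threads=4 give chunks of size 2, 2, 2 and 2,
        # and the 2 leftover entries are appended to the last split (sizes 2, 2, 2, 4).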


        # Launch one download process per split; each process pickles its own results
        process_list = []
        for i, split in enumerate(splits):
            p = multiprocessing.Process(target=download_data, args=[split, i, self.data_path, days])
            p.start()
            process_list.append(p)

        # Wait for all the download processes to finish
        for process in process_list:
            process.join()

        # Combine all the downloaded splits into a single dataframe
        dfs = []
        for i in range(len(splits)):
            save_path = os.path.join(self.data_path, f"split_{i}.pkl")
            df = pd.read_pickle(save_path)
            dfs.append(df)

        return pd.concat(dfs, ignore_index=True)

    def store_data(self, save_file_name: str = "master_data.pkl",
                   max_results: int = 10,
                   days: int = 60) -> None:
        """Store the extracted data as a pickled dataframe"""
        # Call get_results and keep the resulting dataframe on the instance
        self.extracted_data = self.get_results(max_results, days)

        assert len(self.extracted_data) > 0, "Got no results with the search query"

        # Feature engineer two new columns with the text lengths
        self.extracted_data["summary_length"] = self.extracted_data["summary"].apply(len)
        self.extracted_data["pdf_text_length"] = self.extracted_data["pdf_text"].apply(len)

        save_location = os.path.join(self.data_path, save_file_name)
        self.extracted_data.to_pickle(save_location)
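        # The pickled master dataframe has the columns title, published_date, pdf_link,
        # summary, pdf_text, summary_length and pdf_text_length; the arXiv entry id is
        # dropped because get_results concatenates the splits with ignore_index=True.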

    def get_stored_data(self) -> pd.DataFrame:
        """Return the self.extracted_data attribute"""
        assert len(self.extracted_data) != 0, "Please store data first"
        return self.extracted_data


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="A recommender system based on tf-idf and cosine similarity"
    )

    # Add arguments for the query
    parser.add_argument("-m", "--max_results", type=int, help="Maximum results to store", nargs="?", default=50)
    parser.add_argument("-d", "--days", type=int, help="Store only for these many past days", nargs="?", default=50)

    # Parse the arguments
    args = parser.parse_args()

    max_results = args.max_results
    days = args.days

    # Initialize the arxiv parser
    arxiv_parser = ArxivParser()

    # Store the past data
    arxiv_parser.store_data(max_results=max_results, days=days)
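
A minimal usage sketch (assuming the script is run from the repository root, so the default
data_path="data/" and the master_data.pkl file resolve there):

    python src/logics/data_extraction.py -m 20 -d 30

    # afterwards, load the stored dataframe
    import pandas as pd
    df = pd.read_pickle("data/master_data.pkl")
    print(df[["title", "summary_length", "pdf_text_length"]].head())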
