Commit 293561a

Defined Schemas - Data Format for API Requests/Response

subratamondal1 committed Feb 22, 2024
1 parent 6612a0f commit 293561a

Showing 9 changed files with 205 additions and 108 deletions.
12 changes: 6 additions & 6 deletions src/app/database.py
@@ -1,4 +1,5 @@
 """Database: Manages connection and sessions for SQLite database."""
+
 import sqlalchemy as _sql
 
 import sqlalchemy.ext.declarative as _declarative
@@ -9,16 +10,15 @@
 
 # engine for connecting to the database
 ENGINE: _sql.Engine = _sql.create_engine(
-    url=SQLALCHEMY_DATABASE_URL,
-    connect_args={"check_same_thread":False}
+    url=SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
 )
 
 # session maker for opening database sessions
 SESSION_LOCAL = _orm.sessionmaker(
-    bind=ENGINE, # bind the session maker to the engine
-    autoflush=False, # disable automatic flushing for performance optimization
-    autocommit=False # prevent automatic commits for better control
+    bind=ENGINE,  # bind the session maker to the engine
+    autoflush=False,  # disable automatic flushing for performance optimization
+    autocommit=False,  # prevent automatic commits for better control
 )
 
 # base class for your database models
-BASE = _declarative.declarative_base()
+BASE = _declarative.declarative_base()
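
Note (editor's context, not part of this commit): SESSION_LOCAL is typically consumed through a request-scoped dependency. A minimal sketch, assuming the module is importable as src.app.database; the get_db helper is hypothetical, not shown anywhere in this diff:

    # Hypothetical helper, assuming the import path below resolves.
    from src.app.database import SESSION_LOCAL

    def get_db():
        """Yield one session per request and always close it afterwards."""
        db = SESSION_LOCAL()
        try:
            yield db
        finally:
            db.close()

FastAPI would inject this with Depends(get_db), tying the session's lifetime to a single request.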
36 changes: 19 additions & 17 deletions src/app/main.py
@@ -1,4 +1,5 @@
 """Entry Point for the FastAPI App"""
+
 import os
 import json
 from pathlib import Path
@@ -13,11 +14,14 @@
 from src.logics import arxiv_search
 from src.app import schemas
 from src.logics.arxiv_recommender import LearnTransformVocabulary
+
 # import sys
 # sys.modules["__main__"].LearnTransformVocabulary = LearnTransformVocabulary
 
 
-recommender = Recommender(vocabulary_path="data/transformed_data.pkl", vectorizer_path="data/vectorizer.pkl")
+recommender = Recommender(
+    vocabulary_path="data/transformed_data.pkl", vectorizer_path="data/vectorizer.pkl"
+)
 search = arxiv_search.ArxivSearcher()
 
 BASE_PATH = Path(__file__).resolve().parent
@@ -26,21 +30,23 @@
 app = FastAPI()
 
 # TEMPLATES = Jinja2Templates(directory=str(BASE_PATH / "../templates"))
-TEMPLATES = Jinja2Templates(directory=BASE_PATH/"../templates")
-app.mount("/static", StaticFiles(directory=BASE_PATH/"../static"), name="static")
+TEMPLATES = Jinja2Templates(directory=BASE_PATH / "../templates")
+app.mount("/static", StaticFiles(directory=BASE_PATH / "../static"), name="static")
 
 
 @app.get(path="/")
 def homepage(request: Request):
     return TEMPLATES.TemplateResponse(
-        name = "index.html",
-        context = {"request": request, "name": "Subrata Mondal"}
+        name="index.html", context={"request": request, "name": "Subrata Mondal"}
     )
 
 
 # Define a route for searching documents
 @app.get("/search", response_model=List[schemas.SearchResult])
-def search_arxiv_papers(request:Request, query: str = Query(default="LLM, Attention, GPT", min_length=3, max_length=64)):
+def search_arxiv_papers(
+    request: Request,
+    query: str = Query(default="LLM, Attention, GPT", min_length=3, max_length=64),
+):
     """Search through the Arxiv API"""
     # Validate the input and perform the search
     try:
@@ -55,12 +61,7 @@ def search_arxiv_papers
         if results:
             results = json.loads(results)
             return TEMPLATES.TemplateResponse(
-                name="search.html",
-                context={
-                    "request":request,
-                    "results":results
-                }
-
+                name="search.html", context={"request": request, "results": results}
             )
         # return responses.JSONResponse(content=results, status_code=status.HTTP_200_OK)
         raise HTTPException(
@@ -70,11 +71,11 @@ def search_arxiv_papers
 
 # Define a route for getting recommendations
 @app.get("/recommend", response_model=List[schemas.Recommendation])
-def get_recommendations(request:Request, query: str = Query(default="LLM, Attention, GPT")):
+def get_recommendations(
+    request: Request, query: str = Query(default="LLM, Attention, GPT")
+):
     """Arxiv Research Paper Recommendation"""
-    vocabulary = LearnTransformVocabulary(
-        json_data = "../../data/master_data.json"
-    )
+    vocabulary = LearnTransformVocabulary(json_data="../../data/master_data.json")
    # Validate the input and generate recommendations
     try:
         # Perform recommendation
@@ -92,5 +93,6 @@ def get_recommendations
             content=recommendations, status_code=status.HTTP_200_OK
         )
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     pass
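
Note (editor's context, not part of this commit): a quick way to exercise these routes is FastAPI's TestClient. A minimal sketch, assuming the app is importable as src.app.main, that httpx is installed, and that the pickled recommender files exist (importing main loads them at module level):

    # Hypothetical smoke test; module path and installed extras are assumptions.
    from fastapi.testclient import TestClient

    from src.app.main import app

    client = TestClient(app)
    response = client.get("/search", params={"query": "LLM, Attention, GPT"})
    print(response.status_code)  # 200 when the arXiv API call succeeds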
7 changes: 6 additions & 1 deletion src/app/models.py
@@ -1,11 +1,13 @@
 """Database Models: Represent SQL tables and relationships."""
+
 import datetime as _dt
 
 import sqlalchemy as _sql
 import sqlalchemy.orm as _orm
 
 import database as _database
 
+
 class User(_database.BASE):
     """Represents a user in the database.
@@ -17,6 +19,7 @@ class User(_database.BASE):
     Has a one-to-many relationship with the Post model, meaning a user can create many posts.
     """
+
     __tablename__ = "users"
     id = _sql.Column(_sql.Integer, primary_key=True, index=True)
     email = _sql.Column(_sql.String, unique=True, index=True)
@@ -26,6 +29,7 @@ class User(_database.BASE):
     # relationship with 'Post' model
     posts = _orm.relationship("Post", back_populates="owner")
 
+
 class Post(_database.BASE):
     """Represents a post created by a user.
@@ -39,6 +43,7 @@ class Post(_database.BASE):
     Belongs to the User model, meaning a post is created by a specific user.
     """
+
     __tablename__ = "posts"
     id = _sql.Column(_sql.Integer, primary_key=True, index=True)
     title = _sql.Column(_sql.String, index=True)
@@ -48,4 +53,4 @@ class Post(_database.BASE):
     date_last_updated = _sql.Column(_sql.DateTime, default=_dt.datetime.utcnow)
 
     # relationship with User model
-    owner = _orm.relationship("User", back_populates="posts")
+    owner = _orm.relationship("User", back_populates="posts")
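
Note (editor's context, not part of this commit): the diff never shows where these tables are actually created; with declarative models that is usually a one-off create_all call. A sketch, assuming the flat import style models.py itself uses (import database as _database):

    # Hypothetical one-off setup script; the commit does not show this step.
    import database as _database
    import models  # noqa: F401  # importing registers User/Post on BASE's metadata

    _database.BASE.metadata.create_all(bind=_database.ENGINE)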
71 changes: 59 additions & 12 deletions src/app/schemas.py
@@ -1,19 +1,66 @@
-"""Schemas using Pydantic for Request and Response Model"""
+"""Schemas: Define data representations for API requests/responses.
+This separation improves data validation and control over what data
+is exposed through the API.
+"""
 
-from typing import List, Dict, Any
-from pydantic import BaseModel, HttpUrl
+import datetime as _dt
+import pydantic as _pydantic
 
+class _PostBase(_pydantic.BaseModel):
+    """Base schema for 'Post' data, including title and content."""
+    title: str
+    content: str
 
-# Define a response model for the search results
-class SearchResult(BaseModel):
-    """Response Model for the Search Result"""
+class CreatePost(_PostBase):
+    """Schema for creating a new Post.
+    Requires title and content, used for data validation and creation.
+    """
 
-    title: str
-    pdf_link: HttpUrl
+class Post(_PostBase):
+    """Schema for representing a Post including additional details.
+    Inherits from '_PostBase' and adds fields like ID, owner ID,
+    creation and update dates.
+    """
+    id: int
+    owner_id: int
+    date_created: _dt.datetime
+    date_last_updated: _dt.datetime
 
+    class Config:
+        orm_mode = True
 
-# Define a response model for the recommendations
-class Recommendation(BaseModel):
-    """Response Model for the Recommendation"""
+class _UserBase(_pydantic.BaseModel):
+    """Base schema for User data, including email."""
+    email: str
 
+class UserCreate(_UserBase):
+    """Schema for creating a new User.
+    Requires email and password, used for data validation and creation.
+    """
+    password: str
 
-    items: List[Dict[str, Any]]
+class User(_UserBase):
+    """Schema for representing a User including additional details and posts.
+    Inherits from _UserBase and adds fields like ID, active status, and a list of posts.
+    """
+    id: int
+    is_active: bool
+    posts: list[Post] = []
+
+    class Config:
+        orm_mode = True
+
+class _RecommendedPaperBase(_pydantic.BaseModel):
+    """Base schema for RecommendedPaper data, including basic paper details."""
+    id: str
+    title: str
+    published_date: _dt.datetime
+    pdf_link: _pydantic.HttpUrl
+    summary: str
+    pdf_text: str
+
+class RecommendedPaper(_RecommendedPaperBase):
+    """Schema for representing a RecommendedPaper with nested results.
+    Inherits from _RecommendedPaperBase and can include a list of nested results (other papers).
+    """
+    results: list[_RecommendedPaperBase]
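
Note (editor's context, not part of this commit): orm_mode = True is what lets these Pydantic v1 schemas be built directly from the SQLAlchemy rows defined in models.py. A minimal sketch, assuming a session from SESSION_LOCAL and the import paths below:

    # Hypothetical usage; session wiring is assumed, not shown in the commit.
    from src.app import models, schemas
    from src.app.database import SESSION_LOCAL

    db = SESSION_LOCAL()
    row = db.query(models.User).first()    # a SQLAlchemy model instance
    if row is not None:
        user = schemas.User.from_orm(row)  # enabled by Config.orm_mode = True
        print(user.json())                 # nested posts serialize as well
    db.close()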
55 changes: 32 additions & 23 deletions src/logics/arxiv_extractor.py
@@ -1,5 +1,6 @@
 """This module contains the ArxivParser class that extracts and parses
 data from the arxiv API and stores it in a JSON file."""
+
 from abc import ABC, abstractmethod
 import os
 from pathlib import Path
@@ -10,8 +11,11 @@
 import feedparser
 import fitz
 from tqdm import tqdm
+
 BASE_PATH = Path(__file__).resolve().parent
 print(f"BASE_PATH: {BASE_PATH}")
+
+
 class ArxivExtractorBase(ABC):
     """Class that extracts data from Arxiv-Api and Loads Data in Json"""
 
@@ -23,47 +27,51 @@ def extract_data(self):
     def store_data(self):
         """Store data as Json File"""
 
+
 class ArxivExtractor(ArxivExtractorBase):
-    STANDARD_SEARCH_QUERY: str = ("cat:cs.CV OR cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR cat:cs.NE OR cat:stat.ML OR cat:cs.IR")
+    STANDARD_SEARCH_QUERY: str = (
+        "cat:cs.CV OR cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR cat:cs.NE OR cat:stat.ML OR cat:cs.IR"
+    )
     BASE_URL = "http://export.arxiv.org/api/query"
 
-    def __init__(self, days:int, max_results:int=5, data_path:str="data/") -> None:
-        self.days=days
-        self.max_results=max_results
+    def __init__(
+        self, days: int, max_results: int = 5, data_path: str = "data/"
+    ) -> None:
+        self.days = days
+        self.max_results = max_results
         if not os.path.exists(data_path):
             os.makedirs(data_path)
         self.data_path = data_path
 
         params = {
-                "search_query": self.STANDARD_SEARCH_QUERY,
-                "start": 0,
-                "max_results": max_results,
-                "sortBy": "submittedDate",
-                "sortOrder": "descending",
-                }
+            "search_query": self.STANDARD_SEARCH_QUERY,
+            "start": 0,
+            "max_results": max_results,
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+        }
         self.URL = self.BASE_URL + "?" + requests.compat.urlencode(params)
 
     def extract_data(self):
         response = requests.get(self.URL, timeout=15)
         entries = feedparser.parse(response.text).entries
-        downloaded_data: List[Dict[str, str]] = ([])
+        downloaded_data: List[Dict[str, str]] = []
 
         for entry in tqdm(entries):
            published_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
             current_date = datetime.now()
             date_diff = (current_date - published_date).days
 
-
             if date_diff <= self.days:
                 link = entry.link
                 pdf_link = link.replace("abs", "pdf")
                 pdf_content = requests.get(pdf_link, timeout=15).content
                 pdf_file = fitz.open(stream=pdf_content, filetype="pdf")
 
                 pdf_text = ""
                 for page in pdf_file:
                     pdf_text += page.get_text()
 
                 downloaded_data.append(
                     {
                         "id": entry.id,
@@ -76,20 +84,21 @@ def extract_data(self):
                 )
         print(f"DOWNLOADED DATA: {len(downloaded_data)}")
         return downloaded_data
 
     def store_data(self):
         extracted_data = self.extract_data()
 
         assert len(extracted_data) > 0, "Got no results with the search query"
 
         save_location = os.path.join(self.data_path, "master_data.json")
         with open(save_location, "w", encoding="utf-8") as f:
             json.dump(extracted_data, f, indent=4)
 
 
 if __name__ == "__main__":
-    import pandas as pd
-    print ("Current working directory:")
-    print (os.getcwd())
-    arxiv_extractor = ArxivExtractor(days=60,max_results=5, data_path="data/")
+    print("Current working directory:")
+    print(os.getcwd())
+    arxiv_extractor = ArxivExtractor(days=60, max_results=5, data_path="data/")
     arxiv_extractor.store_data()
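
Note (editor's context, not part of the diff): requests.compat.urlencode is simply urllib.parse.urlencode, so the query URL built once in __init__ looks like the output below. Illustrative sketch with a shortened category list:

    # Illustrative only; search_query shortened to keep the output readable.
    import requests

    params = {
        "search_query": "cat:cs.CV OR cat:cs.AI",
        "start": 0,
        "max_results": 5,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    print("http://export.arxiv.org/api/query" + "?" + requests.compat.urlencode(params))
    # http://export.arxiv.org/api/query?search_query=cat%3Acs.CV+OR+cat%3Acs.AI&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending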
