Commit 293561a

Defined Schemas - Data Format for API Requests/Response

subratamondal1 committed Feb 22, 2024
1 parent 6612a0f commit 293561a

Showing 9 changed files with 205 additions and 108 deletions.
12 changes: 6 additions & 6 deletions src/app/database.py
@@ -1,4 +1,5 @@
 """Database: Manages connection and sessions for SQLite database."""
+
 import sqlalchemy as _sql
 
 import sqlalchemy.ext.declarative as _declarative
@@ -9,16 +10,15 @@
 
 # engine for connecting to the database
 ENGINE: _sql.Engine = _sql.create_engine(
-    url=SQLALCHEMY_DATABASE_URL,
-    connect_args={"check_same_thread":False}
+    url=SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
 )
 
 # session maker for opening database sessions
 SESSION_LOCAL = _orm.sessionmaker(
-    bind=ENGINE, # bind the session maker to the engine
-    autoflush=False, # disable automatic flushing for performance optimization
-    autocommit=False # prevent automatic commits for better control
+    bind=ENGINE,  # bind the session maker to the engine
+    autoflush=False,  # disable automatic flushing for performance optimization
+    autocommit=False,  # prevent automatic commits for better control
 )
 
 # base class for your database models
-BASE = _declarative.declarative_base()
+BASE = _declarative.declarative_base()
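
Note (editor's context, not part of this commit): SESSION_LOCAL is typically consumed through a request-scoped dependency. A minimal sketch, assuming the module is importable as src.app.database; the get_db helper is hypothetical, not shown anywhere in this diff:

    # Hypothetical helper, assuming the import path below resolves.
    from src.app.database import SESSION_LOCAL

    def get_db():
        """Yield one session per request and always close it afterwards."""
        db = SESSION_LOCAL()
        try:
            yield db
        finally:
            db.close()

FastAPI would inject this with Depends(get_db), tying the session's lifetime to a single request.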
36 changes: 19 additions & 17 deletions src/app/main.py
@@ -1,4 +1,5 @@
 """Entry Point for the FastAPI App"""
+
 import os
 import json
 from pathlib import Path
@@ -13,11 +14,14 @@
 from src.logics import arxiv_search
 from src.app import schemas
 from src.logics.arxiv_recommender import LearnTransformVocabulary
+
 # import sys
 # sys.modules["__main__"].LearnTransformVocabulary = LearnTransformVocabulary
 
 
-recommender = Recommender(vocabulary_path="data/transformed_data.pkl", vectorizer_path="data/vectorizer.pkl")
+recommender = Recommender(
+    vocabulary_path="data/transformed_data.pkl", vectorizer_path="data/vectorizer.pkl"
+)
 search = arxiv_search.ArxivSearcher()
 
 BASE_PATH = Path(__file__).resolve().parent
@@ -26,21 +30,23 @@
 app = FastAPI()
 
 # TEMPLATES = Jinja2Templates(directory=str(BASE_PATH / "../templates"))
-TEMPLATES = Jinja2Templates(directory=BASE_PATH/"../templates")
-app.mount("/static", StaticFiles(directory=BASE_PATH/"../static"), name="static")
+TEMPLATES = Jinja2Templates(directory=BASE_PATH / "../templates")
+app.mount("/static", StaticFiles(directory=BASE_PATH / "../static"), name="static")
 
 
 @app.get(path="/")
 def homepage(request: Request):
     return TEMPLATES.TemplateResponse(
-        name = "index.html",
-        context = {"request": request, "name": "Subrata Mondal"}
+        name="index.html", context={"request": request, "name": "Subrata Mondal"}
     )
 
 
 # Define a route for searching documents
 @app.get("/search", response_model=List[schemas.SearchResult])
-def search_arxiv_papers(request:Request, query: str = Query(default="LLM, Attention, GPT", min_length=3, max_length=64)):
+def search_arxiv_papers(
+    request: Request,
+    query: str = Query(default="LLM, Attention, GPT", min_length=3, max_length=64),
+):
     """Search through the Arxiv API"""
     # Validate the input and perform the search
     try:
@@ -55,12 +61,7 @@ def search_arxiv_papers
         if results:
             results = json.loads(results)
             return TEMPLATES.TemplateResponse(
-                name="search.html",
-                context={
-                    "request":request,
-                    "results":results
-                }
-
+                name="search.html", context={"request": request, "results": results}
             )
         # return responses.JSONResponse(content=results, status_code=status.HTTP_200_OK)
         raise HTTPException(
@@ -70,11 +71,11 @@ def search_arxiv_papers
 
 # Define a route for getting recommendations
 @app.get("/recommend", response_model=List[schemas.Recommendation])
-def get_recommendations(request:Request, query: str = Query(default="LLM, Attention, GPT")):
+def get_recommendations(
+    request: Request, query: str = Query(default="LLM, Attention, GPT")
+):
     """Arxiv Research Paper Recommendation"""
-    vocabulary = LearnTransformVocabulary(
-        json_data = "../../data/master_data.json"
-    )
+    vocabulary = LearnTransformVocabulary(json_data="../../data/master_data.json")
    # Validate the input and generate recommendations
     try:
         # Perform recommendation
@@ -92,5 +93,6 @@ def get_recommendations
             content=recommendations, status_code=status.HTTP_200_OK
         )
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     pass
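
Note (editor's context, not part of this commit): a quick way to exercise these routes is FastAPI's TestClient. A minimal sketch, assuming the app is importable as src.app.main, that httpx is installed, and that the pickled recommender files exist (importing main loads them at module level):

    # Hypothetical smoke test; module path and installed extras are assumptions.
    from fastapi.testclient import TestClient

    from src.app.main import app

    client = TestClient(app)
    response = client.get("/search", params={"query": "LLM, Attention, GPT"})
    print(response.status_code)  # 200 when the arXiv API call succeeds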
7 changes: 6 additions & 1 deletion src/app/models.py
@@ -1,11 +1,13 @@
 """Database Models: Represent SQL tables and relationships."""
+
 import datetime as _dt
 
 import sqlalchemy as _sql
 import sqlalchemy.orm as _orm
 
 import database as _database
 
+
 class User(_database.BASE):
     """Represents a user in the database.
@@ -17,6 +19,7 @@ class User(_database.BASE):
     Has a one-to-many relationship with the Post model, meaning a user can create many posts.
     """
+
     __tablename__ = "users"
     id = _sql.Column(_sql.Integer, primary_key=True, index=True)
     email = _sql.Column(_sql.String, unique=True, index=True)
@@ -26,6 +29,7 @@ class User(_database.BASE):
     # relationship with 'Post' model
     posts = _orm.relationship("Post", back_populates="owner")
 
+
 class Post(_database.BASE):
     """Represents a post created by a user.
@@ -39,6 +43,7 @@ class Post(_database.BASE):
     Belongs to the User model, meaning a post is created by a specific user.
     """
+
     __tablename__ = "posts"
     id = _sql.Column(_sql.Integer, primary_key=True, index=True)
     title = _sql.Column(_sql.String, index=True)
@@ -48,4 +53,4 @@ class Post(_database.BASE):
     date_last_updated = _sql.Column(_sql.DateTime, default=_dt.datetime.utcnow)
 
     # relationship with User model
-    owner = _orm.relationship("User", back_populates="posts")
+    owner = _orm.relationship("User", back_populates="posts")
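
Note (editor's context, not part of this commit): the diff never shows where these tables are actually created; with declarative models that is usually a one-off create_all call. A sketch, assuming the flat import style models.py itself uses (import database as _database):

    # Hypothetical one-off setup script; the commit does not show this step.
    import database as _database
    import models  # noqa: F401  # importing registers User/Post on BASE's metadata

    _database.BASE.metadata.create_all(bind=_database.ENGINE)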
71 changes: 59 additions & 12 deletions src/app/schemas.py
@@ -1,19 +1,66 @@
-"""Schemas using Pydantic for Request and Response Model"""
+"""Schemas: Define data representations for API requests/responses.
+This separation improves data validation and control over what data
+is exposed through the API.
+"""
 
-from typing import List, Dict, Any
-from pydantic import BaseModel, HttpUrl
+import datetime as _dt
+import pydantic as _pydantic
 
+class _PostBase(_pydantic.BaseModel):
+    """Base schema for 'Post' data, including title and content."""
+    title: str
+    content: str
 
-# Define a response model for the search results
-class SearchResult(BaseModel):
-    """Response Model for the Search Result"""
+class CreatePost(_PostBase):
+    """Schema for creating a new Post.
+    Requires title and content, used for data validation and creation.
+    """
 
-    title: str
-    pdf_link: HttpUrl
+class Post(_PostBase):
+    """Schema for representing a Post including additional details.
+    Inherits from '_PostBase' and adds fields like ID, owner ID,
+    creation and update dates.
+    """
+    id: int
+    owner_id: int
+    date_created: _dt.datetime
+    date_last_updated: _dt.datetime
 
+    class Config:
+        orm_mode = True
 
-# Define a response model for the recommendations
-class Recommendation(BaseModel):
-    """Response Model for the Recommendation"""
+class _UserBase(_pydantic.BaseModel):
+    """Base schema for User data, including email."""
+    email: str
 
+class UserCreate(_UserBase):
+    """Schema for creating a new User.
+    Requires email and password, used for data validation and creation.
+    """
+    password: str
 
-    items: List[Dict[str, Any]]
+class User(_UserBase):
+    """Schema for representing a User including additional details and posts.
+    Inherits from _UserBase and adds fields like ID, active status, and a list of posts.
+    """
+    id: int
+    is_active: bool
+    posts: list[Post] = []
+
+    class Config:
+        orm_mode = True
+
+class _RecommendedPaperBase(_pydantic.BaseModel):
+    """Base schema for RecommendedPaper data, including basic paper details."""
+    id: str
+    title: str
+    published_date: _dt.datetime
+    pdf_link: _pydantic.HttpUrl
+    summary: str
+    pdf_text: str
+
+class RecommendedPaper(_RecommendedPaperBase):
+    """Schema for representing a RecommendedPaper with nested results.
+    Inherits from _RecommendedPaperBase and can include a list of nested results (other papers).
+    """
+    results: list[_RecommendedPaperBase]
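
Note (editor's context, not part of this commit): orm_mode = True is what lets these Pydantic v1 schemas be built directly from the SQLAlchemy rows defined in models.py. A minimal sketch, assuming a session from SESSION_LOCAL and the import paths below:

    # Hypothetical usage; session wiring is assumed, not shown in the commit.
    from src.app import models, schemas
    from src.app.database import SESSION_LOCAL

    db = SESSION_LOCAL()
    row = db.query(models.User).first()    # a SQLAlchemy model instance
    if row is not None:
        user = schemas.User.from_orm(row)  # enabled by Config.orm_mode = True
        print(user.json())                 # nested posts serialize as well
    db.close()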
55 changes: 32 additions & 23 deletions src/logics/arxiv_extractor.py
@@ -1,5 +1,6 @@
 """This module contains the ArxivParser class that extracts and parses
 data from the arxiv API and stores it in a JSON file."""
+
 from abc import ABC, abstractmethod
 import os
 from pathlib import Path
@@ -10,8 +11,11 @@
 import feedparser
 import fitz
 from tqdm import tqdm
+
 BASE_PATH = Path(__file__).resolve().parent
 print(f"BASE_PATH: {BASE_PATH}")
+
+
 class ArxivExtractorBase(ABC):
     """Class that extracts data from Arxiv-Api and Loads Data in Json"""
 
@@ -23,47 +27,51 @@ def extract_data(self):
     def store_data(self):
         """Store data as Json File"""
 
+
 class ArxivExtractor(ArxivExtractorBase):
-    STANDARD_SEARCH_QUERY: str = ("cat:cs.CV OR cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR cat:cs.NE OR cat:stat.ML OR cat:cs.IR")
+    STANDARD_SEARCH_QUERY: str = (
+        "cat:cs.CV OR cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR cat:cs.NE OR cat:stat.ML OR cat:cs.IR"
+    )
     BASE_URL = "http://export.arxiv.org/api/query"
 
-    def __init__(self, days:int, max_results:int=5, data_path:str="data/") -> None:
-        self.days=days
-        self.max_results=max_results
+    def __init__(
+        self, days: int, max_results: int = 5, data_path: str = "data/"
+    ) -> None:
+        self.days = days
+        self.max_results = max_results
         if not os.path.exists(data_path):
             os.makedirs(data_path)
         self.data_path = data_path
 
         params = {
-                "search_query": self.STANDARD_SEARCH_QUERY,
-                "start": 0,
-                "max_results": max_results,
-                "sortBy": "submittedDate",
-                "sortOrder": "descending",
-                }
+            "search_query": self.STANDARD_SEARCH_QUERY,
+            "start": 0,
+            "max_results": max_results,
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+        }
         self.URL = self.BASE_URL + "?" + requests.compat.urlencode(params)
 
     def extract_data(self):
         response = requests.get(self.URL, timeout=15)
         entries = feedparser.parse(response.text).entries
-        downloaded_data: List[Dict[str, str]] = ([])
+        downloaded_data: List[Dict[str, str]] = []
 
         for entry in tqdm(entries):
            published_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
             current_date = datetime.now()
             date_diff = (current_date - published_date).days
 
-
             if date_diff <= self.days:
                 link = entry.link
                 pdf_link = link.replace("abs", "pdf")
                 pdf_content = requests.get(pdf_link, timeout=15).content
                 pdf_file = fitz.open(stream=pdf_content, filetype="pdf")
 
                 pdf_text = ""
                 for page in pdf_file:
                     pdf_text += page.get_text()
 
                 downloaded_data.append(
                     {
                         "id": entry.id,
@@ -76,20 +84,21 @@ def extract_data(self):
                 )
         print(f"DOWNLOADED DATA: {len(downloaded_data)}")
         return downloaded_data
 
     def store_data(self):
         extracted_data = self.extract_data()
 
         assert len(extracted_data) > 0, "Got no results with the search query"
 
         save_location = os.path.join(self.data_path, "master_data.json")
         with open(save_location, "w", encoding="utf-8") as f:
             json.dump(extracted_data, f, indent=4)
 
 
 if __name__ == "__main__":
-    import pandas as pd
-    print ("Current working directory:")
-    print (os.getcwd())
-    arxiv_extractor = ArxivExtractor(days=60,max_results=5, data_path="data/")
+    print("Current working directory:")
+    print(os.getcwd())
+    arxiv_extractor = ArxivExtractor(days=60, max_results=5, data_path="data/")
     arxiv_extractor.store_data()
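
Note (editor's context, not part of the diff): requests.compat.urlencode is simply urllib.parse.urlencode, so the query URL built once in __init__ looks like the output below. Illustrative sketch with a shortened category list:

    # Illustrative only; search_query shortened to keep the output readable.
    import requests

    params = {
        "search_query": "cat:cs.CV OR cat:cs.AI",
        "start": 0,
        "max_results": 5,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    print("http://export.arxiv.org/api/query" + "?" + requests.compat.urlencode(params))
    # http://export.arxiv.org/api/query?search_query=cat%3Acs.CV+OR+cat%3Acs.AI&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending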
