Skip to content

Commit

Permalink
refactor: split event population and vector db population
Browse files Browse the repository at this point in the history
  • Loading branch information
marcus-ny committed Sep 24, 2024
1 parent 4aad4eb commit be9f93c
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 9 deletions.
27 changes: 27 additions & 0 deletions backend/src/scrapers/guardian/get_analyses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from src.common.database import engine
from sqlalchemy.orm import Session
from sqlalchemy import select
from src.events.models import Analysis


def get_analyses(limit: int = 5):
    """Fetch up to ``limit`` Analysis rows and return them as plain dicts.

    Args:
        limit: Maximum number of rows to select. Defaults to 5, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        A list with one dict per selected row, each holding the keys
        ``id``, ``event_id``, ``category_id`` and ``content``.
    """
    with Session(engine) as session:
        # NOTE: the query has no ORDER BY, so which rows are returned is
        # left to the database.
        rows = session.scalars(select(Analysis).limit(limit))

        # Copy the column values out while the session is still open so the
        # returned dicts do not depend on it afterwards.
        return [
            {
                "id": analysis.id,
                "event_id": analysis.event_id,
                "category_id": analysis.category_id,
                "content": analysis.content,
            }
            for analysis in rows
        ]


if __name__ == "__main__":
    # Quick manual check: report how many analyses were fetched.
    fetched = get_analyses()
    print(len(fetched))
10 changes: 1 addition & 9 deletions backend/src/scripts/populate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,9 @@
from src.embeddings.vector_store import store_documents
from src.events.process import EventLLM

# NOTE: this is for the purpose of populating the database with
# the LM generated events and analyses


# Populate the db with events from lm_events_output.json
def populate():
# TODO: Query articles from DB
# NOTE: articles are expected to be in json with all the necessary fields

with open("backend/lm_events_output.json", "r") as f:
events = json.load(f)
for event in events:
Expand All @@ -25,9 +20,6 @@ def populate():
)
add_event_to_db(event_obj)

# TODO: Embed analyses and push to pinecone
store_documents(events)


if __name__ == "__main__":
populate()

0 comments on commit be9f93c

Please sign in to comment.