-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcron.py
107 lines (78 loc) · 3.87 KB
/
cron.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""Main File for Performing the Cron Job"""
import schedule
import time
import os
import pandas as pd
from datetime import datetime, timedelta
# from ..logics.data_extraction import ArxivParser
from src.logics.data_extraction import ArxivParser
from src.logics.arxiv_recommender import update_vocab_with_new_data
# TODO — first-install workflow:
#   - Download the initial data
#   - Run the cron jobs from the next day onward
#   - Create the vocabulary for the recommender
# TODO: Check that everything (installation and running) works via poetry;
#   known numpy installation issue: https://github.com/python-poetry/poetry/issues/3894
# TODO: Support per-user preferences
# TODO: Synchronise data reloading in the FastAPI endpoint with the updates
#   this file makes to the master data and today's data
# NOTE: Calculating TF-IDF for ~10K papers takes noticeable time
def get_time() -> str:
    """Return the current local time as a formatted string.

    Format: 'HH:MM:SS (DD/MM/YY)' — the time part uses the locale's
    time representation ('%X').

    Example: '19:18:15 (03/03/24)'
    """
    now = time.localtime()
    return time.strftime("%X (%d/%m/%y)", now)
def remove_old_papers(data_path, save_path, window_size) -> None:
    """Drop papers older than the retention window and persist the result.

    Reads the pickled DataFrame at ``data_path``, removes every row whose
    ``published_date`` falls on or before ``today - (window_size + 1)`` days,
    converts ``published_date`` back to 'YYYY-MM-DD' strings, and writes the
    trimmed frame to ``save_path``.

    Args:
        data_path: Path of the pickled DataFrame to read.
        save_path: Path to write the trimmed pickle to (may equal data_path).
        window_size: Retention window in days.
    """
    df: pd.DataFrame = pd.read_pickle(data_path)
    df["published_date"] = pd.to_datetime(arg=df["published_date"])

    # Papers published on or before this date are discarded.  Building the
    # Timestamp directly from the date avoids the old date->string->datetime
    # round-trip; both normalise to midnight of the cutoff day.
    cutoff_date = pd.Timestamp(datetime.today().date() - timedelta(days=window_size + 1))

    # Keep rows NOT at/before the cutoff (this also keeps NaT dates, matching
    # the previous behaviour).  .copy() prevents a SettingWithCopyWarning on
    # the column assignment below.
    df = df[~(df["published_date"] <= cutoff_date)].copy()

    # Store dates back as plain 'YYYY-MM-DD' strings, the on-disk format.
    df["published_date"] = df["published_date"].dt.strftime(date_format="%Y-%m-%d")
    df.to_pickle(save_path)
def add_new_papers(latest_papers_path, master_data_path) -> None:
    """Append the latest papers to the master data pickle file.

    If ``master_data_path`` does not exist yet (first run), the latest
    papers become the master data as-is.

    Args:
        latest_papers_path: Pickle file holding the newly downloaded papers.
        master_data_path: Pickle file holding the accumulated master data.
    """
    latest_papers = pd.read_pickle(latest_papers_path)
    if os.path.exists(path=master_data_path):
        master_data = pd.read_pickle(master_data_path)
        # ignore_index gives the combined frame a clean 0..n-1 index.
        master_data = pd.concat(objs=[master_data, latest_papers], ignore_index=True)
    else:
        # First run: bootstrap the master data from the latest batch.
        master_data = latest_papers
    master_data.to_pickle(master_data_path)
# Retention window passed to remove_old_papers as window_size (a day count).
# NOTE(review): the name suggests a paper count, but the value is used as
# days — confirm the intended unit.
TOTAL_PAPERS_WINDOW_SIZE = 1900
def download_todays_paper_task() -> None:
    """Cron job body: download the latest arXiv papers into 'todays_data.pkl'.

    Pulls up to 10,000 results covering the last 2 days via ArxivParser,
    then logs a timestamped progress message.
    """
    arxiv_parser = ArxivParser()
    arxiv_parser.store_data(save_file_name="todays_data.pkl", max_results=10000, days=2)
    print("Doing task...", get_time())
def remove_old_papers_add_new_papers() -> None:
    """Cron job body: merge today's papers into the master data, trim papers
    outside the retention window, and refresh the recommender vocabulary."""
    print("Doing task...", get_time())
    master_path = "data/master_data.pkl"
    add_new_papers(latest_papers_path="data/todays_data.pkl",
                   master_data_path=master_path)
    remove_old_papers(data_path=master_path,
                      save_path=master_path,
                      window_size=TOTAL_PAPERS_WINDOW_SIZE)
    update_vocab_with_new_data(master_path)
if __name__=="__main__":
    # First-install bootstrap: fetch a backlog of recent papers, merge them
    # into the master data, trim old entries, and rebuild the recommender
    # vocabulary.  The schedule-based daily cron below is currently disabled.
    # schedule.every().day.at(time_str="23:58:45").do(job_func=download_todays_paper_task)
    # schedule.every().day.at(time_str="23:59:59").do(job_func=remove_old_papers_add_new_papers)
    # # schedule.every(interval=5).seconds.do(job_func=task)
    #
    # while True:
    #     # performs the task
    #     schedule.run_pending()
    parser = ArxivParser()
    # Download up to 10,000 papers covering the last 16 days.
    parser.store_data(save_file_name="todays_data.pkl", max_results=10000, days=16)
    add_new_papers(latest_papers_path="data/todays_data.pkl", master_data_path="data/master_data.pkl")
    # NOTE(review): window_size=1 removes everything published more than
    # 2 days ago, even though 16 days of papers were just downloaded —
    # confirm this is intended (vs. TOTAL_PAPERS_WINDOW_SIZE).
    remove_old_papers(data_path="data/master_data.pkl", save_path="data/master_data.pkl", window_size=1)
    update_vocab_with_new_data("data/master_data.pkl")
    # download_todays_paper_task()
    # remove_old_papers_add_new_papers()