Skip to content

Commit

Permalink
easy debug
Browse files Browse the repository at this point in the history
  • Loading branch information
TideDra committed Nov 29, 2024
1 parent 1853fa5 commit a742121
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 25 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Test workflow: runs main.py with the --debug flag.
# It declares only the workflow_dispatch trigger (no schedule or push events).
name: Test workflow
on:
  workflow_dispatch:

jobs:
  calculate-and-send:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository so main.py is available to the later steps.
      - name: Checkout
        uses: actions/checkout@v4

      # Install uv at a pinned version.
      - name: Setup uv
        uses: astral-sh/setup-uv@v3
        with:
          version: '0.5.4'

      # Every setting below is taken from a repository secret and exposed to
      # the script as an environment variable.
      # NOTE(review): main.py visibly reads SENDER, RECEIVER and SENDER_PASSWORD
      # as argparse defaults; the remaining variables are presumably consumed
      # the same way -- confirm against the full argument parser.
      - name: Run script
        env:
          ZOTERO_ID: ${{ secrets.ZOTERO_ID }}
          ZOTERO_KEY: ${{ secrets.ZOTERO_KEY }}
          ARXIV_QUERY: ${{ secrets.ARXIV_QUERY }}
          SMTP_SERVER: ${{ secrets.SMTP_SERVER }}
          SMTP_PORT: ${{ secrets.SMTP_PORT }}
          SENDER: ${{ secrets.SENDER }}
          RECEIVER: ${{ secrets.RECEIVER }}
          SENDER_PASSWORD: ${{ secrets.SENDER_PASSWORD }}
          MAX_PAPER_NUM: ${{ secrets.MAX_PAPER_NUM }}
        # --debug switches main.py to debug mode.
        run: |
          uv run main.py --debug
74 changes: 49 additions & 25 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from tldr import get_paper_tldr
from llama_cpp import Llama
from tqdm import tqdm
from loguru import logger

def get_zotero_corpus(id:str,key:str) -> list[dict]:
zot = zotero.Zotero(id, 'user', key)
Expand Down Expand Up @@ -51,28 +52,47 @@ def get_paper_code_url(paper:arxiv.Result) -> str:
return None
return repo_list['results'][0]['url']

def get_arxiv_paper(query:str, start:datetime.datetime, end:datetime.datetime, debug:bool=False) -> list[arxiv.Result]:
    """Fetch arXiv papers matching *query*.

    In normal mode only papers whose ``published`` timestamp lies in the
    half-open window ``[start, end)`` are kept, and iteration stops at the
    first paper published before ``start``.  In debug mode the date window is
    ignored and the first 5 results are returned, so the rest of the pipeline
    can be exercised even on days without new submissions.

    Every returned paper gets two extra attributes:

    * ``arxiv_id`` -- the short id with any trailing ``vN`` version removed.
    * ``code_url`` -- whatever ``get_paper_code_url`` returns for the paper.

    Args:
        query: arXiv search query string.
        start: Inclusive lower bound of the publication window.
        end: Exclusive upper bound of the publication window.
        debug: When true, skip the date filter and return 5 papers.

    Returns:
        The list of matching ``arxiv.Result`` objects.

    Raises:
        Exception: The last error raised while querying, once all attempts
            have failed.
    """
    max_attempts = 5
    retry_delay = 180  # seconds to wait between failed attempts
    debug_limit = 5

    client = arxiv.Client()
    search = arxiv.Search(query=query, sort_by=arxiv.SortCriterion.SubmittedDate)
    if debug:
        logger.debug("Retrieve 5 arxiv papers regardless of the date.")

    for attempt in range(1, max_attempts + 1):
        # Start from scratch on every attempt so that a failure halfway
        # through the result stream cannot leave duplicates behind.
        papers = []
        try:
            for paper in client.results(search):
                if not debug:
                    published_date = paper.published
                    if published_date < start:
                        # The search is sorted by submission date, so the
                        # loop stops at the first paper older than the window.
                        # NOTE(review): relies on newest-first ordering -- confirm.
                        break
                    if published_date >= end:
                        # Too recent for the window; keep scanning.
                        continue
                paper.arxiv_id = re.sub(r'v\d+$', '', paper.get_short_id())
                paper.code_url = get_paper_code_url(paper)
                papers.append(paper)
                if debug and len(papers) == debug_limit:
                    break
            return papers
        except Exception as e:
            if attempt == max_attempts:
                # Out of retries: propagate immediately instead of sleeping
                # for another three minutes first.
                raise
            logger.warning(f'Got error: {e}. Try again...')
            sleep(retry_delay)
    return papers

def send_email(sender:str, receiver:str, password:str,smtp_server:str,smtp_port:int, html:str,):
Expand Down Expand Up @@ -103,25 +123,30 @@ def _format_addr(s):
parser.add_argument('--sender', type=str, help='Sender email address',default=os.environ.get('SENDER'))
parser.add_argument('--receiver', type=str, help='Receiver email address',default=os.environ.get('RECEIVER'))
parser.add_argument('--password', type=str, help='Sender email password',default=os.environ.get('SENDER_PASSWORD'))
parser.add_argument('--debug', action='store_true', help='Debug mode')
args = parser.parse_args()
assert args.zotero_id is not None
assert args.zotero_key is not None
assert args.arxiv_query is not None
if args.debug:
logger.debug("Debug mode is on.")
today = datetime.datetime.now(tz=datetime.timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
yesterday = today - datetime.timedelta(days=1)
print("Retrieving Zotero corpus...")
logger.info("Retrieving Zotero corpus...")
corpus = get_zotero_corpus(args.zotero_id, args.zotero_key)
print("Retrieving Arxiv papers...")
papers = get_arxiv_paper(args.arxiv_query, yesterday, today)
logger.info(f"Retrieved {len(corpus)} papers from Zotero.")
logger.info("Retrieving Arxiv papers...")
papers = get_arxiv_paper(args.arxiv_query, yesterday, today, args.debug)
if len(papers) == 0:
print("No new papers found.")
logger.info("No new papers found. Yesterday maybe a holiday and no one submit their work :). If this is not the case, please check the ARXIV_QUERY.")
logger.info("No email will be sent. Enjoy a relaxing day!")
exit(0)
print("Reranking papers...")
logger.info("Reranking papers...")
papers = rerank_paper(papers, corpus)
if args.max_paper_num != -1:
papers = papers[:args.max_paper_num]

print("Generating TLDRs...")
logger.info("Generating TLDRs...")
llm = Llama.from_pretrained(
repo_id="Qwen/Qwen2.5-3B-Instruct-GGUF",
filename="qwen2.5-3b-instruct-q4_k_m.gguf",
Expand All @@ -133,7 +158,6 @@ def _format_addr(s):
p.tldr = get_paper_tldr(p, llm)

html = render_email(papers)
print("Sending email...")
logger.info("Sending email...")
send_email(args.sender, args.receiver, args.password, args.smtp_server, args.smtp_port, html)
with open('email.html', 'w') as f:
f.write(html)
logger.success("Email sent successfully! If you don't receive the email, please check the configuration and the junk box.")
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ requires-python = ">=3.11"
dependencies = [
"arxiv>=2.1.3",
"llama-cpp-python>=0.3.2",
"loguru>=0.7.2",
"pyzotero>=1.5.25",
"scikit-learn>=1.5.2",
"sentence-transformers>=3.3.1",
Expand Down
24 changes: 24 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit a742121

Please sign in to comment.