-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
90 lines (74 loc) · 2.77 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import bs4
import requests
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
def load_posted_news(username: str) -> list[str]:
    """
    Description:
        Read back the news entries already recorded as posted for a user.
    Args:
        username (str): The user whose history file is looked up.
    Returns:
        list: One entry per line of data/posted_news/<username>.txt, or an
            empty list when that file does not exist.
    """
    history_path = f"data/posted_news/{username}.txt"
    try:
        history_file = open(history_path, "r")
    except FileNotFoundError:
        # No history file: nothing has been recorded for this user.
        return []
    with history_file:
        recorded = history_file.read()
    return recorded.splitlines()
def save_posted_news(username: str, posted_news: str) -> None:
    """
    Description:
        Append one posted news entry to the user's history file, creating
        the data/posted_news directory first when it is missing.
    Args:
        username (str): The username of the user; selects the file
            data/posted_news/<username>.txt.
        posted_news (str): The posted news content, written as one line
            followed by a newline.
    """
    # exist_ok=True already makes this a no-op when the directory exists, so
    # a separate os.path.exists() check is unnecessary; it only opened a
    # window between the check and the creation.
    os.makedirs("data/posted_news", exist_ok=True)
    # Append mode keeps earlier entries and creates the file on first use.
    with open(f"data/posted_news/{username}.txt", "a") as f:
        f.write(f"{posted_news}\n")
# Seconds to wait for the listing page; without a timeout requests.get can
# block indefinitely on a stalled connection.
_LISTING_TIMEOUT_SECONDS = 30


def _extract_unposted_headlines(soup, already_posted):
    """Collect title/link dicts from the featured cards, skipping posted titles."""
    headlines = []
    for card in soup.find_all('div', {'class': 'cell blocks small-12 medium-3 large-3'}):
        heading = card.find('h3')
        anchor = heading.find('a') if heading is not None else None
        # A card without a linked heading cannot be identified or opened,
        # so it is skipped instead of raising AttributeError/KeyError.
        if anchor is None or not anchor.get('href'):
            continue
        title = anchor.text
        if title in already_posted:
            continue
        headlines.append({
            "title": title,
            "link": anchor['href']
        })
    return headlines


def _clean_article_text(paragraph_texts, max_chars=1000):
    """Join paragraph texts into one alphanumeric, single-spaced, truncated string."""
    joined = ' '.join(paragraph_texts)
    # Keep only letters, digits and whitespace (drops punctuation/symbols).
    kept = ''.join(ch for ch in joined if ch.isalnum() or ch.isspace())
    # Collapse every whitespace run (including newlines) into one space, so
    # the character budget below is not spent on padding.
    return ' '.join(kept.split())[:max_chars]


def fetch_news(username: str) -> list[dict]:
    """
    Description:
        Fetch the featured news cards from artificialintelligence-news.com
        that are not yet recorded as posted for the user, then open each
        article in headless Firefox and attach its cleaned text.
    Args:
        username (str): The username of the user; used to load the titles
            already posted so they can be skipped.
    Returns:
        list: One dict per unposted item with the keys "title", "link" and
            "content". "content" holds at most 1000 characters of
            alphanumeric, single-spaced text and is an empty string when
            the article element is not found on the page. An empty list is
            returned when there is nothing new.
    """
    url = "https://www.artificialintelligence-news.com/"
    page = requests.get(url, timeout=_LISTING_TIMEOUT_SECONDS)
    soup = bs4.BeautifulSoup(page.content, 'html.parser')
    # A set gives constant-time membership tests for the posted titles.
    news = _extract_unposted_headlines(soup, set(load_posted_news(username)))
    if not news:
        # Nothing new: avoid starting a browser at all.
        return []
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options, service=Service("./driver/geckodriver"))
    try:
        for news_item in news:
            driver.get(news_item['link'])
            article_soup = bs4.BeautifulSoup(driver.page_source, 'html.parser')
            article = article_soup.find('article', {'data-title': news_item['title']})
            # The [1:-5] slice drops the first and the last five paragraphs;
            # presumably site boilerplate around the body - TODO confirm
            # against the current page layout.
            paragraphs = article.find_all('p')[1:-5] if article is not None else []
            news_item['content'] = _clean_article_text(p.text for p in paragraphs)
    finally:
        # Always shut the browser down, even when a page fails to load;
        # otherwise every call leaks a Firefox/geckodriver process.
        driver.quit()
    return news