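"""Homepage news scrapers for CNBC and CBS News.

Each Scraper subclass fetches a publisher's homepage with urllib, parses it
with BeautifulSoup, and turns up to 10 not-yet-saved links per run into
article.Article objects. scrape_articles() drives the scrapers, deletes
articles left over from a previous day, skips runs less than 6 hours apart,
and persists everything through the `article` module, which owns all database
access.
"""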
from bs4 import BeautifulSoup
from urllib import request
import article
from datetime import datetime, timezone


class Scraper:
    def __init__(self, url) -> None:
        self.url = url
        self.publisher = self.__class__.__name__

    @staticmethod
    def _generate_soup(url):
        source = request.urlopen(url).read()
        soup = BeautifulSoup(source, 'lxml')
        return soup

    def get_links(self):
        raise NotImplementedError()

    def parse_link(self, link) -> article.Article:
        raise NotImplementedError()

    def run(self, saved_links):
        links = self.get_links()
        links = [link for link in links if link not in saved_links]
        links = links[:10] if len(links) > 10 else links
        articles = []
        for link in links:
            try:
                print("Parsing link: ", link)
                articles.append(self.parse_link(link))
            except Exception as e:
                print("Could not parse link: ", link)
                print(e)
        return articles


class CNBC(Scraper):
    def get_links(self):
        homepage = Scraper._generate_soup(self.url)
        thumbnail_tags = homepage.find('div', attrs={'id': 'homepage-riverPlus'}).find_all(
            'div', attrs={'class': 'RiverHeadline-headline RiverHeadline-hasThumbnail'})
        links = [thumbnail.find('a').attrs['href'] for thumbnail in thumbnail_tags]
        links = {link for link in links if link != '/pro/'}
        return links

    def parse_link(self, link):
        soup = Scraper._generate_soup(link)
        title = soup.title.get_text()
        article_p_tags = soup.find('div', attrs={'class': 'ArticleBody-articleBody'}).find_all('p')
        body = ''.join([tag.get_text() for tag in article_p_tags])
        return article.Article(title, body, self.publisher, link)


class CBS(Scraper):
    def get_links(self):
        homepage = Scraper._generate_soup(self.url)
        a_tags = homepage.find_all('a')
        links = {tag.attrs['href'] for tag in a_tags if "https://www.cbsnews.com/news" in tag.attrs['href']}
        return links

    def parse_link(self, link):
        soup = Scraper._generate_soup(link)
        content_body = soup.find('section', attrs={'class': 'content__body'})
        contributed_by_ap_tag = content_body.find('em')
        app_upsell_tag = content_body.find('p', attrs={'class': 'item__dek'})
        if contributed_by_ap_tag is not None:
            contributed_by_ap_tag.decompose()
        if app_upsell_tag is not None:
            app_upsell_tag.decompose()
        body = ''.join([tag.get_text() for tag in content_body.find_all('p')])
        title = soup.find('h1', attrs={'class': 'content__title'}).get_text()
        return article.Article(title, body, self.publisher, link)

def scrape_articles():
    # No db queries are called from the Scraper class or its subclasses as a design choice
    scrapers = [CNBC("https://www.cnbc.com/"), CBS("https://www.cbsnews.com")]
    current_datetime = datetime.now(timezone.utc)
    elapsed_time, last_insert_date = article.get_last_insert_et_and_date(current_datetime)
    if last_insert_date is None or last_insert_date < current_datetime.date():
        # If articles were scraped on a previous day, delete all old articles
        article.delete_all()
    elif elapsed_time is not None and elapsed_time < 6:
        # Scrape no more than once every 6 hours
        return
    articles = []
    for scraper in scrapers:
        saved_links = article.get_links_by_publisher(scraper.publisher)
        articles.extend(scraper.run(saved_links))
    article.save(articles)
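# NOTE: the `article` module is not shown in this file. A minimal sketch of the
# interface scrape_articles() assumes it provides, inferred from the calls
# above (the real signatures may differ):
#
#   class Article: ...                            # built from (title, body, publisher, link)
#   def get_last_insert_et_and_date(now): ...     # -> (hours since last insert, date of last insert)
#   def get_links_by_publisher(publisher): ...    # -> links already saved for that publisher
#   def delete_all(): ...                         # remove all previously saved articles
#   def save(articles): ...                       # persist a list of Article objects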
'''
MULTIPROCESS IMPLEMENTATION
Not using it for the time being because it does not provide a significant performance boost

with concurrent.futures.ProcessPoolExecutor() as executor:
    results = []
    articles = []
    for scraper in scrapers:
        saved_links = article.get_links_by_publisher(scraper.publisher)
        future = executor.submit(scraper.run, saved_links)
        results.append(future)
    for future in concurrent.futures.as_completed(results):
        articles.extend(future.result())
    article.save(articles)
'''
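# Hypothetical entry point (not part of the original file): running the module
# directly would perform a single scrape cycle, assuming the `article` module's
# database helpers are configured.
if __name__ == "__main__":
    scrape_articles()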