scrapnews.py
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
from extractarticle import ExtractArticle
import re
import logging
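# Note (assumption, not verifiable from this file alone): extractarticle is a
# local module in this repo. The code below only relies on ExtractArticle
# exposing extractArticle(link), which returns a readability-style object with
# .title and .summary attributes, or None when extraction fails.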
# hosts whose results should never be scraped (dots escaped so '.' is literal)
IGNORE_LIST = r'wikipedia\.org|books\.google|linkedin|monster|jobstreet\.com|wisdomjobs|naukari|sec\.gov|sec\.report|dl\.bourse|fintel\.io|ftp\.cs\.princeton\.edu'
# link substrings that indicate financial filings rather than news articles
IGNORE_LINKS_LIST = r'Annual|Financial-Report'
# desktop user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# mobile user-agent
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"
# CSS class Google assigns to some news-result blocks (subject to change; currently unused)
GOOGLE_CLASS = 'gG0TJc'
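# Illustrative sketch (not part of the original module): how the two ignore
# patterns are meant to behave. Hosts are matched against the parsed netloc,
# filing-style links against the full URL. Example values are hypothetical.
#
#   >>> re.compile(IGNORE_LIST).search(urlparse('https://en.wikipedia.org/wiki/X').netloc.lower()) is not None
#   True
#   >>> re.compile(IGNORE_LINKS_LIST).search('https://example.com/Annual-Report-2019') is not None
#   True
#   >>> re.compile(IGNORE_LIST).search(urlparse('https://www.reuters.com/article/x').netloc.lower()) is None
#   True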
class ScrapNews:
    def __init__(self):
        # instance state (previously class-level attributes, which would have
        # been shared across all ScrapNews instances)
        self.links = []
        self.errorcount = 0
        self.counter = 100000  # running id for yielded rows
        self.ignorelist = re.compile(IGNORE_LIST)
        self.ignorelink = re.compile(IGNORE_LINKS_LIST)
        self.logger = logging.getLogger(__name__)
    def scrapEntity(self, id, pageno, keyword):
        """Run a Google News search for `keyword` and yield one row per
        extracted article: [counter, keyword, link, title, summary]."""
        # tbm=nws restricts the search to Google News; the sxsrf/ved tokens
        # were captured from a live browser session and may expire
        query = ("{0}&sxsrf=ALeKk03SaWrEuh1vZDxLQXk1bJUIwMSzIA:1594109560695"
                 "&source=lnms&tbm=nws&sa=X&ved=2ahUKEwjaluTw2LrqAhXb8XMBHempA28Q_AUoAXoECBQQAw"
                 "&biw=1280&bih=578&start={1}").format(keyword, str(pageno))
        URL = f"https://google.com/search?q={query}"
        print("running query=" + URL)
        headers = {"user-agent": USER_AGENT}
        resp = requests.get(URL, headers=headers, timeout=3)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.content, features="lxml")
            # 'dbsr' is the class Google currently gives each news-result block;
            # trusted sites could get custom parsing here, the rest go through
            # the generic extraction library
            for g in soup.find_all('div', class_='dbsr'):
                anchors = g.find_all('a')
                if not anchors:
                    continue
                link = anchors[0]['href']
                parsed_uri = urlparse(link)
                # skip duplicates, PDFs, ignored hosts, and ignored link patterns
                if not (link in self.links
                        or parsed_uri.path.lower().endswith('.pdf')
                        or self.ignorelist.search(parsed_uri.netloc.lower())
                        or link.lower().endswith('pdf')
                        or self.ignorelink.search(link)):
                    extractarticle = ExtractArticle()
                    print('--------Extracting Link: ' + link)
                    try:
                        article = extractarticle.extractArticle(link)
                        if article is None:
                            self.errorcount += 1
                            self.logger.error("Error processing article for page %s and link %s", pageno, link)
                            continue
                    except (requests.ConnectionError, requests.ConnectTimeout, requests.ReadTimeout):
                        self.errorcount += 1
                        self.logger.error("Connection error while processing page %s and link %s", pageno, link)
                    else:
                        extractedSummary = article.summary
                        extractedTitle = article.title
                        # preserve paragraph breaks before stripping tags: add a
                        # newline after every </p> except the last, and turn
                        # <br> into a newline
                        extractedSummary = extractedSummary.replace('</p>', '</p>\n', extractedSummary.count('</p>') - 1).replace('<br>', '\n')
                        summarytext = BeautifulSoup(extractedSummary, features="lxml").get_text()
                        self.links.append(link)
                        self.counter += 1
                        yield [self.counter, keyword.strip('"'), link, extractedTitle, summarytext]
        else:
            self.errorcount += 1
            self.logger.error("Non-200 response (%s) for page %s and URL %s",
                              resp.status_code, pageno, URL)
    def scrapEntitykeywordList(self, id, pageno, keywords):
        # fan out one scrapEntity run per keyword
        for key in keywords:
            yield from self.scrapEntity(id, pageno, key)
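# Minimal usage sketch (not in the original file): drives the scraper for one
# keyword list and prints each extracted row. The keyword value is hypothetical,
# and this assumes Google's result markup still uses the 'dbsr' class and that
# the extractarticle module is importable.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scraper = ScrapNews()
    # pageno doubles as Google's `start` offset: 0 = first results page
    for row in scraper.scrapEntitykeywordList(id=1, pageno=0, keywords=['"Acme Corp" fraud']):
        counter, keyword, link, title, summary = row
        print(counter, title, link)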