scrap.py
from bs4 import BeautifulSoup
from readability import Document
import urllib
import requests
from urllib.parse import urlparse
from extractarticle import ExtractArticle
import re
import logging
from requests.exceptions import Timeout
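# Regex alternations for result URLs to skip: IGNORE_LIST is matched against the
# result's hostname, IGNORE_LINKS_LIST against patterns anywhere in the link itself.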
IGNORE_LIST = r'wikipedia.org|books.google|linkedin|monster|jobstreet.com|wisdomjobs|naukari|sec.gov|sec.report|dl.bourse|fintel.io|ftp.cs.princeton.edu'
IGNORE_LINKS_LIST = r'Annual|Financial-Report'
# desktop user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# mobile user-agent
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"
class Scrap:
    # compiled skip patterns and logger are shared across instances
    ignorelist = re.compile(IGNORE_LIST)
    ignorelink = re.compile(IGNORE_LINKS_LIST)
    logger = logging.getLogger(__name__)

    def __init__(self):
        # per-instance state so separate Scrap objects do not share results
        self.links = []
        self.link = ''
        self.errorcount = 0
    def scrapEntity(self, id, entity, keyword):
        # Build a Google search query of the form: keyword+"entity"
        query = keyword + '+"' + entity + '"'
        query = query.replace(' ', '+')
        URL = f"https://google.com/search?q={query}"
        print("running query=" + URL)
        headers = {"user-agent": USER_AGENT}
        resp = requests.get(URL, headers=headers, timeout=3)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.content, features="lxml")
            # Check for trusted news sites and implement custom parsing logic;
            # for the rest, use the extraction library to pull the text.
            for g in soup.find_all('div', class_='r'):
                anchors = g.find_all('a')
                if anchors:
                    link = anchors[0]['href']
                    parsed_uri = urlparse(link)
                    # Skip already-seen links, PDFs and anything on the ignore
                    # lists (TODO: optimize handling of ignored URLs).
                    if not (link in self.links
                            or parsed_uri.path.lower().endswith('.pdf')
                            or self.ignorelist.search(parsed_uri.netloc.lower())
                            or link.lower().endswith('pdf')
                            or self.ignorelink.search(link)):
                        extractarticle = ExtractArticle()
                        print('--------Extracting Link: ' + link)
                        try:
                            article = extractarticle.extractArticle(link)
                            if article is None:
                                self.errorcount += 1
                                self.logger.error("Error Processing Article for entity %s and link %s", entity, link)
                                continue
                        except (requests.ConnectionError, requests.ConnectTimeout, requests.ReadTimeout):
                            self.errorcount += 1
                            self.logger.error("Connection Error While processing %s and link %s", entity, link)
                        else:
                            extractedSummary = article.summary
                            extractedTitle = article.title
                            # reduce the readability HTML summary to plain text
                            plaintext = get_plain_text(BeautifulSoup(extractedSummary, features="lxml"))
                            summarytext = BeautifulSoup(plaintext, features="lxml").get_text()
                            self.links.append(link)
                            yield [id, entity, keyword.strip('"'), link, extractedTitle, summarytext]
        else:
            self.errorcount += 1
            self.logger.error("Non 200 response Error Processing %s and link %s", entity, URL)
            return
    def scrapEntitykeywordList(self, id, entity, keywords):
        # run scrapEntity once per keyword and flatten the results
        for key in keywords:
            yield from self.scrapEntity(id, entity, key)
def replace_with_newlines(element):
    # Concatenate the element's text, turning <br> tags into newlines.
    text = ''
    for elem in element.descendants:
        if isinstance(elem, str):
            text += elem.strip()
        elif elem.name == 'br':
            text += '\n'
    return text


def get_plain_text(soup):
    # Collect the text of every <p> inside <body> as one plain-text string.
    plain_text = ''
    body = soup.find("body")
    if body is None:
        return plain_text
    for line in body.find_all('p'):
        plain_text += replace_with_newlines(line)
    return plain_text
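

# --- Hypothetical usage sketch (not part of the original module) ---
# A minimal illustration of how the generator API above could be driven,
# assuming the local `extractarticle` module is importable and Google returns
# the `div.r` result markup the parser expects. The entity and keyword values
# below are made up for demonstration only.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scraper = Scrap()
    sample_keywords = ['annual revenue', 'recent acquisition']  # hypothetical
    for row in scraper.scrapEntitykeywordList(1, "Example Corp", sample_keywords):
        # each row is [id, entity, keyword, link, title, summary]
        print(row[:4])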