refactor request_html, add BadStatusCode exception
fsadannn committed Oct 16, 2019
1 parent 2a24be9 commit c52a38c
Showing 4 changed files with 66 additions and 89 deletions.
39 changes: 4 additions & 35 deletions Crawler/Cubadebate.py
@@ -19,7 +19,7 @@
 import logging
 from datetime import datetime
 
-logger = logging.getLogger('scrapper')
+logger = logging.getLogger(__name__)
 
 sps = re.compile(' +')
 comm = re.compile('comment')
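
Side note on the logger change above: logging.getLogger(__name__) names the logger after the importing module, so output and levels can be tuned per crawler instead of through one shared 'scrapper' logger. A minimal sketch, assuming dotted module paths that depend on how the Crawler package is actually imported:

import logging

logging.basicConfig(level=logging.INFO)
# Assumed module paths; adjust to the real package layout.
logging.getLogger('Crawler.Cubadebate').setLevel(logging.DEBUG)
logging.getLogger('Crawler.Granma').setLevel(logging.WARNING)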
@@ -28,47 +28,18 @@ class CubaDebate(ScrapBase):
 
     def __init__(self,url,proxy=None):
         super().__init__(url,proxy)
-        self.__html_text = None
+        self._html_text = None
 
     def _Source(self):
         return "CubaDebate"
 
-    def _request_html(self, url, proxy):
-        # logger.debug('_request_html {}, {}'.format(url, proxy))
-        try:
-            response = requests.get(url, proxies=proxy, timeout=10)
-        except Exception as e:
-            # logger.debug(e)
-            if isinstance(e, LocationParseError):
-                try:
-                    response = requests.get(url, proxies=proxy['http'], timeout=10)
-                except Exception as e:
-                    if isinstance(e, LocationParseError):
-                        logger.debug(e)
-                        raise ProxyConfigError(e.args[0])
-                    logger.debug(e)
-                    raise UnreachebleURL(e.args[0])
-            else:
-                logger.debug(e)
-                raise UnreachebleURL(e.args[0])
-        # logger.debug(response)
-        response.encoding = 'utf-8'
-        if response.status_code != 200:
-            logger.debug("bad response status")
-            raise Exception("received code = %d" % response.status_code)
-        return response.text
-
     def _Scrap(self, url, proxy):
         """
         Search for the div with class note_content and delete footnotes so that
         only the desired news text remains
         """
-        # logger.debug('_Scrap params {}, {}'.format(url,proxy))
-        if self.__html_text is None:
-            self.__html_text = self._request_html(url, proxy)
-        # logger.debug(html_text)
-
-        soup = BeautifulSoup(self.__html_text, 'lxml')
+        soup = BeautifulSoup(self._html_text, 'lxml')
         img = None
         ans = soup.find("div", {"class": "note_content"})
         img = ans.find("img")
@@ -109,9 +80,7 @@ def _extract_comments(self, url: str, proxy):
         Returns a list of dictionaries containing the text
         of the comments and the date they were posted.
         """
-        if self.__html_text is None:
-            self.__html_text = self._request_html(url, proxy)
-        soup = BeautifulSoup(self.__html_text, 'lxml')
+        soup = BeautifulSoup(self._html_text, 'lxml')
 
         # find the comments section
         comments_section = soup.find('section', id='comments')
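
The rename from self.__html_text to self._html_text is what lets the cache move into the base class: double-underscore attributes are name-mangled per defining class, so a subclass reading self.__html_text would look up a different attribute than the one ScrapBase set. A small illustration of the pitfall (Base and Child are hypothetical names, not from this repository):

class Base:
    def __init__(self):
        self.__cache = None   # stored as self._Base__cache

class Child(Base):
    def read(self):
        return self.__cache   # looks up self._Child__cache

Child().read()  # AttributeError: 'Child' object has no attribute '_Child__cache'

With a single underscore there is no mangling, so both the base class and its subclasses see the same self._html_text.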
41 changes: 5 additions & 36 deletions Crawler/Granma.py
@@ -29,47 +29,18 @@ class Granma(ScrapBase):
 
     def __init__(self,url,proxy=None):
         super().__init__(url,proxy)
-        self.__html_text = None
+        self._html_text = None
 
     def _Source(self):
         return "Granma"
 
-    def _request_html(self, url, proxy):
-        # logger.debug('_request_html {}, {}'.format(url, proxy))
-        try:
-            response = requests.get(url, proxies=proxy, timeout=10)
-        except Exception as e:
-            # logger.debug(e)
-            if isinstance(e, LocationParseError):
-                try:
-                    response = requests.get(url, proxies=proxy['http'], timeout=10)
-                except Exception as e:
-                    if isinstance(e, LocationParseError):
-                        logger.debug(e)
-                        raise ProxyConfigError(e.args[0])
-                    logger.debug(e)
-                    raise UnreachebleURL(e.args[0])
-            else:
-                logger.debug(e)
-                raise UnreachebleURL(e.args[0])
-        # logger.debug(response)
-        response.encoding = 'utf-8'
-        if response.status_code != 200:
-            logger.debug("bad response status")
-            raise Exception("received code = %d" % response.status_code)
-        return response.text
-
     def _Scrap(self, url, proxy):
         """
         Search for the div with class note_content and delete footnotes so that
         only the desired news text remains
         """
-        # logger.debug('_Scrap params {}, {}'.format(url,proxy))
-        if self.__html_text is None:
-            self.__html_text = self._request_html(url, proxy)
-        # logger.debug(html_text)
-
-        soup = BeautifulSoup(self.__html_text, 'lxml')
+        soup = BeautifulSoup(self._html_text, 'lxml')
         img = None
         ans = soup.find("article")
         photo = ans.find("div", {"class": "g-story-media-container"})
@@ -98,9 +69,7 @@ def _Comment(self, url, proxy):
         return self._extract_comments(url, proxy)
 
     def _extract_comments(self, url: str, proxy):
-        if self.__html_text is None:
-            self.__html_text = self._request_html(url, proxy)
-        soup = BeautifulSoup(self.__html_text, 'lxml')
+        soup = BeautifulSoup(self._html_text, 'lxml')
         pg = soup.find('ul', {'class': 'pagination'})
         if not pg:
             return self._extract_comments_page(soup)
@@ -110,8 +79,8 @@ def _extract_comments(self, url: str, proxy):
             if i.attrs.get('class'):
                 continue
             url = i.find('a')['href']
-            self.__html_text = self._request_html(url, proxy)
-            soup = BeautifulSoup(self.__html_text, 'lxml')
+            self._html_text = self._request_html(url, proxy)
+            soup = BeautifulSoup(self._html_text, 'lxml')
             comments.extend(self._extract_comments_page(soup))
         return comments

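With fetching and caching now handled by ScrapBase, adding a crawler reduces to parsing logic. A rough sketch of the new subclass shape, assuming a hypothetical NewSite source, an assumed import path, and that _Scrap returns a dict with a 'text' key (as the data property's self._text['text'] access suggests):

from bs4 import BeautifulSoup
from Crawler.ScrapBase import ScrapBase  # assumed import path

class NewSite(ScrapBase):
    def _Source(self):
        return "NewSite"

    def _Scrap(self, url, proxy):
        # ScrapBase.Scrap has already filled self._html_text via _request_html
        soup = BeautifulSoup(self._html_text, 'lxml')
        body = soup.find('article')
        return {'text': body.get_text() if body else ''}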
73 changes: 56 additions & 17 deletions Crawler/ScrapBase.py
@@ -1,27 +1,67 @@
 import re
+from urllib3.exceptions import LocationParseError
 import logging
+import requests
 
 
 logger = logging.getLogger(__name__)
 corchet = re.compile(r'\[[^)]*\]')
 parents = re.compile(r'\([^)]*\)')
 
 class UnreachebleURL(Exception):
     pass
 
 
 class ProxyConfigError(Exception):
     pass
 
+class BadStatusCode(Exception):
+    pass
+
 class ScrapBase:
     def __init__(self, url, proxy=None):
-        self.__comment = None
-        self.__text = None
-        self.__url = url
-        self.__proxy = proxy
+        self._comment = None
+        self._text = None
+        self._url = url
+        self._proxy = proxy
+        self._html_text = None
 
+    def _request_html(self, url, proxy):
+        # logger.debug('_request_html {}, {}'.format(url, proxy))
+        try:
+            response = requests.get(url, proxies=proxy, timeout=10)
+        except Exception as e:
+            # logger.debug(e)
+            if isinstance(e, LocationParseError):
+                try:
+                    response = requests.get(url, proxies=proxy['http'], timeout=10)
+                except Exception as e:
+                    if isinstance(e, LocationParseError):
+                        logger.debug(e)
+                        raise ProxyConfigError(e.args[0])
+                    logger.debug(e)
+                    raise UnreachebleURL(e.args[0])
+            else:
+                logger.debug(e)
+                raise UnreachebleURL(e.args[0])
+        # logger.debug(response)
+        response.encoding = 'utf-8'
+        if response.status_code != 200:
+            logger.debug("bad response status")
+            raise BadStatusCode("received code = %d" % response.status_code)
+        return response.text
+
     def Scrap(self, url, proxy=None):
-        self.__text = self._Scrap(url, proxy)
+        # logger.debug('_Scrap params {}, {}'.format(url, proxy))
+        if self._html_text is None:
+            self._html_text = self._request_html(url, proxy)
+        # logger.debug(html_text)
+        self._text = self._Scrap(url, proxy)
 
     def Comment(self, url, proxy=None):
-        self.__comment = self._Comment(url, proxy)
+        if self._html_text is None:
+            self._html_text = self._request_html(url, proxy)
+        self._comment = self._Comment(url, proxy)
 
     def _Scrap(self, url, proxy=None):
         raise NotImplementedError
@@ -39,26 +79,25 @@ def _Source(self):
 
     @property
     def comment(self):
-        if self.__comment:
-            return self.__comment
+        if self._comment:
+            return self._comment
 
-        self.__comment = self._Comment(self.__url, self.__proxy)
+        self.Comment(self._url, self._proxy)
 
-        return self.__comment
+        return self._comment
 
     @property
     def data(self):
-        if self.__text:
-            return self.__text
+        if self._text:
+            return self._text
 
-        self.__text = self._Scrap(self.__url, self.__proxy)
-        text = self.__text['text']
+        self.Scrap(self._url, self._proxy)
+        text = self._text['text']
         text = parents.sub('', text)
         text = corchet.sub('', text)
-        self.__text['text'] = text
-
-        return self.__text
+        self._text['text'] = text
 
+        return self._text
 
     @staticmethod
     def can_crawl(url: str)->bool:
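
Because _request_html now raises the dedicated BadStatusCode instead of a bare Exception, callers can distinguish HTTP failures from proxy and connectivity problems. A hedged usage sketch; the import paths are assumptions based on the file layout shown above, and the URL is a placeholder:

from Crawler.ScrapBase import BadStatusCode, ProxyConfigError, UnreachebleURL
from Crawler.Cubadebate import CubaDebate

scraper = CubaDebate('http://www.cubadebate.cu/noticias/...')  # placeholder URL
try:
    article = scraper.data
except BadStatusCode as e:
    print('non-200 response:', e)        # e.g. "received code = 404"
except ProxyConfigError as e:
    print('proxy misconfiguration:', e)
except UnreachebleURL as e:
    print('could not reach the URL:', e)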
2 changes: 1 addition & 1 deletion test.py
@@ -32,7 +32,7 @@
 #print(text)
 #print(comment)
 try:
-    c.request('http://www.cubadebate.cu/noticias/2019/10/14/cuba-avanza-en-la-habilitacion-de-la-tecnologia-4g-lte/#.XaTBdfcpCV4')
+    c.request('http://www.granma.cu/cuba/2019-10-12/universidad-de-la-habana-eleva-su-prestigio-en-la-region-12-10-2019-00-10-34')
     text = c.data
     print(text)
     comment = c.comment
