Commit
Showing 3 changed files with 156 additions and 125 deletions.
@@ -0,0 +1,31 @@
'''
Created on 4 Mar 2017
@author: Robert Putt
'''

# -*- coding: utf-8 -*-
from distutils.core import setup
from setuptools import find_packages
import os

base_name = 'this_is_scraper'

# allow setup.py to be run from any path
os.chdir(os.path.normpath(os.path.join(os.path.abspath(__file__), os.pardir)))

setup(
    name=base_name,
    version='0.1',
    author=u'Robert Putt',
    include_package_data=True,
    packages=find_packages(),  # include all packages under this directory
    description='to update',
    long_description="",
    zip_safe=False,

    # Adds dependencies
    install_requires=['bs4',
                      'sqlalchemy',
                      'pymysql']
)
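The scraper module in the two hunks below imports init_db, get_db_session and Articles from this_is_scraper.db, which this commit does not touch. A minimal sketch of what that module might look like, assuming a MySQL backend reached through pymysql as suggested by install_requires above; the connection URL, table name and column types are guesses inferred from how the scraper uses the model, not code from the repository:

import datetime

from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()
# 'mysql+pymysql' matches the pymysql dependency; credentials are placeholders.
engine = create_engine('mysql+pymysql://user:password@localhost/this_is_scraper')


class Articles(Base):
    __tablename__ = 'articles'            # assumed table name

    id = Column(Integer, primary_key=True)
    article_link = Column(String(512), unique=True)
    article_title = Column(String(512))
    article_content = Column(Text)
    article_dt = Column(DateTime)
    article_status = Column(String(16))   # 'pending', 'complete' or 'failed'


def init_db():
    # Create the articles table if it does not already exist.
    Base.metadata.create_all(engine)


def get_db_session():
    # Return a new session bound to the engine above.
    return sessionmaker(bind=engine)()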
@@ -1,125 +0,0 @@
'''
Scraper for ThisIsPlymouth.co.uk news stories.
'''
import re
import sys
import logging
import requests
import datetime
from bs4 import BeautifulSoup
from this_is_scraper.db import init_db
from this_is_scraper.db import get_db_session
from this_is_scraper.db import Articles


logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                    level=logging.DEBUG,
                    stream=sys.stdout)


def remove_duplicates(l):
    return list(set(l))


def has_numbers(input_str):
    return any(char.isdigit() for char in input_str)


def list_stories():
    article_links = []
    html_doc = requests.get('http://www.plymouthherald.co.uk/news/')
    soup = BeautifulSoup(html_doc.text, 'html.parser')
    url_stub = "^http://www.plymouthherald.co.uk/news/"
    for link in soup.findAll('a', attrs={'href': re.compile(url_stub)}):
        href = link.get('href')
        if href != 'http://www.plymouthherald.co.uk/news/':
            # remove links to comments and any invalid articles, valid
            # articles have numbers in their URLs, interesting huh?
            if ('#comments-section' not in href) and (has_numbers(href)):
                article_links.append(href)

    article_links = remove_duplicates(article_links)
    return article_links


def get_article_by_link(db_sess, link):
    articles = []
    result = db_sess.query(Articles). \
        filter(Articles.article_link == link).all()
    for article in result:
        articles.append(article)
    return articles


def insert_link(db_sess, link):
    new_article = Articles(article_link=link,
                           article_dt=datetime.datetime.now(),
                           article_status='pending')
    db_sess.add(new_article)
    db_sess.commit()


def insert_new_links(db_sess, links):
    add_count = 0
    for link in links:
        logging.debug("Processing %s." % link)
        curr_linked_articles = get_article_by_link(db_sess, link)
        if len(curr_linked_articles) == 0:
            logging.debug("%s doesn't exist in DB, adding." % link)
            insert_link(db_sess, link)
            add_count += 1
        else:
            logging.debug("%s already exists in DB, skipping." % link)
    return add_count


def get_pending_links(db_sess):
    pending_articles = db_sess.query(Articles). \
        filter(Articles.article_status == 'pending').all()
    return pending_articles


def extract_content(link):
    html_doc = requests.get(link)
    html_doc = html_doc.text
    soup = BeautifulSoup(html_doc, 'html.parser')
    title = soup.title.string
    title = title.replace(' - Plymouth Herald', '')
    article_div = soup.findAll("div", {"class": "article-body"})
    for div in article_div:
        paragraphs = div.findAll("p")
        ret_text = ""
        for paragraph in paragraphs:
            ret_text += "%s \r\n\r\n" % paragraph.get_text().strip()
    return title, ret_text.encode('utf-8')


def process_pending_articles(db_sess):
    pending_articles = get_pending_links(db_sess)
    for article in pending_articles:
        logging.debug("Processing %s." % article.article_link)
        try:
            title, content = extract_content(article.article_link)
            article.article_content = content
            article.article_title = title
            article.article_status = 'complete'
        except:
            article.article_status = 'failed'
        db_sess.commit()


def main():
    logging.info("Checking database...")
    init_db()
    db_sess = get_db_session()
    links = list_stories()
    logging.info("Found %s articles on news front page." % len(links))
    logging.info("Inserting new articles to DB.")
    added = insert_new_links(db_sess, links)
    logging.info("%s new articles added to DB." % added)
    logging.info("Fetching pending articles.")
    process_pending_articles(db_sess)


if __name__ == '__main__':
    main()
@@ -0,0 +1,125 @@
'''
Scraper for ThisIsPlymouth.co.uk news stories.
'''
import re
import sys
import logging
import requests
import datetime
from bs4 import BeautifulSoup
from this_is_scraper.db import init_db
from this_is_scraper.db import get_db_session
from this_is_scraper.db import Articles


logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                    level=logging.DEBUG,
                    stream=sys.stdout)


def remove_duplicates(l):
    return list(set(l))


def has_numbers(input_str):
    return any(char.isdigit() for char in input_str)


def list_stories():
    article_links = []
    html_doc = requests.get('http://www.plymouthherald.co.uk/news/')
    soup = BeautifulSoup(html_doc.text, 'html.parser')
    url_stub = "^http://www.plymouthherald.co.uk/news/"
    for link in soup.findAll('a', attrs={'href': re.compile(url_stub)}):
        href = link.get('href')
        if href != 'http://www.plymouthherald.co.uk/news/':
            # remove links to comments and any invalid articles, valid
            # articles have numbers in their URLs, interesting huh?
            if ('#comments-section' not in href) and (has_numbers(href)):
                article_links.append(href)

    article_links = remove_duplicates(article_links)
    return article_links


def get_article_by_link(db_sess, link):
    articles = []
    result = db_sess.query(Articles). \
        filter(Articles.article_link == link).all()
    for article in result:
        articles.append(article)
    return articles


def insert_link(db_sess, link):
    new_article = Articles(article_link=link,
                           article_dt=datetime.datetime.now(),
                           article_status='pending')
    db_sess.add(new_article)
    db_sess.commit()


def insert_new_links(db_sess, links):
    add_count = 0
    for link in links:
        logging.debug("Processing %s." % link)
        curr_linked_articles = get_article_by_link(db_sess, link)
        if len(curr_linked_articles) == 0:
            logging.debug("%s doesn't exist in DB, adding." % link)
            insert_link(db_sess, link)
            add_count += 1
        else:
            logging.debug("%s already exists in DB, skipping." % link)
    return add_count


def get_pending_links(db_sess):
    pending_articles = db_sess.query(Articles). \
        filter(Articles.article_status == 'pending').all()
    return pending_articles


def extract_content(link):
    html_doc = requests.get(link)
    html_doc = html_doc.text
    soup = BeautifulSoup(html_doc, 'html.parser')
    title = soup.title.string
    title = title.replace(' - Plymouth Herald', '')
    article_div = soup.findAll("div", {"class": "article-body"})
    # Accumulate text across every article-body div; initialising ret_text
    # before the loop also keeps the return valid when no such div is found.
    ret_text = ""
    for div in article_div:
        paragraphs = div.findAll("p")
        for paragraph in paragraphs:
            ret_text += "%s \r\n\r\n" % paragraph.get_text().strip()
    return title, ret_text.encode('utf-8')


def process_pending_articles(db_sess):
    pending_articles = get_pending_links(db_sess)
    for article in pending_articles:
        logging.debug("Processing %s." % article.article_link)
        try:
            title, content = extract_content(article.article_link)
            article.article_content = content
            article.article_title = title
            article.article_status = 'complete'
        except Exception:
            # A failed fetch or parse marks this article as failed rather
            # than aborting the whole run.
            article.article_status = 'failed'
        db_sess.commit()


def main():
    logging.info("Checking database...")
    init_db()
    db_sess = get_db_session()
    links = list_stories()
    logging.info("Found %s articles on news front page." % len(links))
    logging.info("Inserting new articles to DB.")
    added = insert_new_links(db_sess, links)
    logging.info("%s new articles added to DB." % added)
    logging.info("Fetching pending articles.")
    process_pending_articles(db_sess)


if __name__ == '__main__':
    main()
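For reference, a small self-contained sketch of the parsing approach used in extract_content, run against a canned HTML fragment instead of a live request; the markup below is illustrative only, not taken from the real site:

from bs4 import BeautifulSoup

# Stand-in for the HTML that requests.get(link).text would return.
html_doc = """
<html>
  <head><title>Road closed after flooding - Plymouth Herald</title></head>
  <body>
    <div class="article-body">
      <p>The road was closed on Monday morning.</p>
      <p>Diversions are in place.</p>
    </div>
  </body>
</html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')
title = soup.title.string.replace(' - Plymouth Herald', '')
ret_text = ""
for div in soup.findAll("div", {"class": "article-body"}):
    for paragraph in div.findAll("p"):
        ret_text += "%s \r\n\r\n" % paragraph.get_text().strip()

print(title)     # -> Road closed after flooding
print(ret_text)  # -> both paragraphs, each followed by a blank line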