This repository has been archived by the owner on Mar 4, 2021. It is now read-only.
Commit 23df293 (1 parent: 70f2c95)
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 13 changed files with 611 additions and 1 deletion.
@@ -0,0 +1,3 @@
.serverless/
.venv/
node_modules/
@@ -1,2 +1,60 @@
# serverless-broken-link-checker

A serverless (AWS Lambda) broken link checker for checking 403/404/500s
on websites.

This is a Python serverless-based project that creates a Lambda on AWS
running as a daily cron job. The goal is to scrape a website and check
that all links are valid (i.e. no 403s, 404s, 500s or 501s).

It scrapes your website using the `scrapy` Python library. The crawler
follows all internal links on your website. External URLs are checked,
but not followed. After the crawler has finished, it sends an email
report using Mailgun's REST API.

It shouldn't be too hard to convert this to use another cloud provider.
## Requirements

- AWS account (see the [serverless quick start](https://serverless.com/framework/docs/getting-started/))
- Mailgun account
## Prerequisites

1. Node.js (tested with v6.11.4)
2. [Serverless](https://serverless.com) (tested with 1.24.1)
3. Python 3 (tested with 3.6.2)
## Installation

```bash
npm install
serverless plugin install -n serverless-python-requirements
```
## Build

You will need to export the required settings and secrets before deploying.

```bash
export MAILGUN_API_KEY=key-xxxx MAILGUN_DOMAIN_NAME=example.com EMAIL=me@example.com URL=https://example.com
serverless deploy
```
## Usage

The function is scheduled to run every 24 hours, but you can also run it manually with:

```bash
serverless invoke -f cron
```
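
For quick local testing you can also call the handler function directly from Python. A minimal sketch, assuming the handler module added in this commit is saved as `handler.py` (the file name is not shown in this diff) and the environment variables from the Build step are exported:

```python
# Minimal local invocation sketch; the module name `handler` is an assumption.
from handler import run

run({}, None)  # the event and context arguments are not used by the handler
```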

## Configuration

The following environment variables are exposed. You must set them
before you run `serverless deploy`.

- **MAILGUN_API_KEY**: Your Mailgun API key
- **MAILGUN_DOMAIN_NAME**: Your Mailgun domain name
- **EMAIL**: The email address you want to send the report to
- **URL**: The URL you want to check (in the format `https://example.com/`)
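
If you want to fail fast on a missing value, a small pre-deploy sanity check can help. This is a hypothetical snippet, not part of this commit:

```python
import os

# Hypothetical pre-deploy check: abort if any required variable is not exported.
required = ["MAILGUN_API_KEY", "MAILGUN_DOMAIN_NAME", "EMAIL", "URL"]
missing = [name for name in required if not os.environ.get(name)]
if missing:
    raise SystemExit("Missing environment variables: " + ", ".join(missing))
```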
@@ -0,0 +1,49 @@
import imp
import json
import os
import sys

# Work around the missing sqlite3 extension in the AWS Lambda Python runtime
# by registering stub modules before importing scrapy.
sys.modules["sqlite"] = imp.new_module("sqlite")
sys.modules["sqlite3.dbapi2"] = imp.new_module("sqlite.dbapi2")
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy import signals
import requests

MAILGUN_DOMAIN_NAME = os.environ['MAILGUN_DOMAIN_NAME']
MAILGUN_API_KEY = os.environ['MAILGUN_API_KEY']
EMAIL = os.environ["EMAIL"]  # Emails will be sent to this address


def run(event, context):
    items = []

    def add_item(item):
        items.append(item)

    # Create and run the crawler
    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler('broken_link_spider')
    crawler.signals.connect(add_item, signals.item_passed)  # Intercept the results
    process.crawl(crawler)
    process.start()  # Blocks until the crawl has finished

    # Convert results to JSON and send the email report
    json_string = json.dumps([dict(ob) for ob in items])  # scrapy Items convert cleanly to dicts
    print("Found broken links:", json_string)
    send_simple_message(EMAIL, json_string)


def send_simple_message(to, content):
    url = 'https://api.mailgun.net/v3/{}/messages'.format(MAILGUN_DOMAIN_NAME)
    auth = ('api', MAILGUN_API_KEY)
    data = {
        'from': 'Link Checker <noreply@{}>'.format(MAILGUN_DOMAIN_NAME),
        'to': to,
        'subject': 'Results of Broken Links',
        'text': content,
    }
    print(data)
    response = requests.post(url, auth=auth, data=data)
    print(response.content)
    response.raise_for_status()
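
To verify the Mailgun credentials independently of a full crawl, the send function above can be exercised on its own. A sketch, assuming the module is saved as `handler.py` and the same environment variables are exported:

```python
import os

from handler import send_simple_message  # module name is an assumption

# Sends a short test email through Mailgun using the configured domain and key.
send_simple_message(os.environ["EMAIL"], "Test message from the link checker")
```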
Empty file.
@@ -0,0 +1,4 @@
BOT_NAME = 'linkchecker'
SPIDER_MODULES = ['linkchecker.spiders']
NEWSPIDER_MODULE = 'linkchecker.spiders'
ROBOTSTXT_OBEY = False
Empty file.
@@ -0,0 +1,54 @@
import os

import scrapy
from scrapy.item import Item, Field

CHECK_URL = os.environ["URL"]


class BrokenItem(Item):
    url = Field()
    referer = Field()
    status = Field()


class BrokenLinksSpider(scrapy.Spider):
    name = 'broken_link_spider'
    start_urls = [CHECK_URL, ]
    # Let these error responses reach the callbacks instead of being filtered out.
    handle_httpstatus_list = [400, 403, 404, 500, 501]

    def parse(self, response):
        if self.error_condition(response):
            yield self.to_item(response)

        # Find links to follow
        for link in response.css('a::attr(href)'):
            url = link.extract()

            # Skip large and/or invalid links.
            invalid_urls = [".mov", ".m4a", "mailto"]
            if any(substring in url for substring in invalid_urls):
                continue

            # If a local link, follow it. Else only check the response.
            if CHECK_URL in url:
                yield scrapy.Request(response.urljoin(url), callback=self.parse)
            else:
                yield scrapy.Request(response.urljoin(url), callback=self.parse_external)

    def parse_external(self, response):
        if self.error_condition(response):
            yield self.to_item(response)

    def error_condition(self, response):
        # Report any of the statuses we deliberately allow through above.
        return response.status in self.handle_httpstatus_list

    def to_item(self, response):
        item = BrokenItem()
        item['url'] = response.url
        referer = response.request.headers.get('Referer')
        item['referer'] = referer.decode("utf-8") if referer else None
        item['status'] = response.status
        return item
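
For local debugging outside Lambda, the spider can also be driven directly with scrapy's `CrawlerProcess`. A minimal sketch; it assumes the `URL` environment variable is set, since the spider reads it at import time:

```python
from scrapy.crawler import CrawlerProcess

# Run the spider locally (BrokenLinksSpider imported from the module above)
# and let scrapy log any BrokenItem results it yields.
process = CrawlerProcess({"LOG_LEVEL": "INFO"})
process.crawl(BrokenLinksSpider)
process.start()  # blocks until the crawl finishes
```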