Initial commit
philwinder committed Nov 30, 2017
1 parent 70f2c95 commit 23df293
Showing 13 changed files with 611 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
.serverless/
.venv/
node_modules/
60 changes: 59 additions & 1 deletion README.md
@@ -1,2 +1,60 @@
# serverless-broken-link-checker
A serverless (AWS Lambda) broken link checker for checking 403/404/500s on websites

A serverless (AWS Lambda) broken link checker for checking 403/404/500s
on websites.

This is a Python project built on the Serverless Framework that creates
an AWS Lambda function running as a daily cron job. The goal is to
scrape a website and check that all links are valid (i.e. no 403s,
404s, 500s or 501s).

It scrapes your website using the `scrapy` Python library. The crawler
follows all internal links on your website; external URLs are checked
but not followed. When the crawl has finished, the results are emailed
via Mailgun's REST API.

It shouldn't be too hard to convert this to use another cloud provider.

## Requirements

- AWS account (see [serverless quick start](https://serverless.com/framework/docs/getting-started/))
- Mailgun account

## Prerequisites

1. Node.js (tested with v6.11.4)
2. [Serverless](https://serverless.com) (tested with 1.24.1)
3. Python3 (tested with 3.6.2)

## Installation

```bash
npm install
serverless plugin install -n serverless-python-requirements
```

## Build

You will need to export the required settings and secrets.

```bash
export MAILGUN_API_KEY=key-xxxx MAILGUN_DOMAIN_NAME=example.com [email protected] URL=https://example.com
serverless deploy
```

## Usage

The function is scheduled to run every 24 hours, but you can also run it manually with:

```bash
serverless invoke -f cron
```

## Configuration

The following environment variables are used. You must set them
before running `serverless deploy`.

- **MAILGUN_API_KEY**: Your Mailgun API key
- **MAILGUN_DOMAIN_NAME**: Your Mailgun domain name
- **EMAIL**: The email address the report is sent to
- **URL**: The URL you want to check (in the format `https://example.com/`)
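
As an illustration only (not part of this commit): the spider treats a
link as internal when the value of `URL` is a substring of the link
(`if CHECK_URL in url` in the spider below), which is why the scheme
and trailing slash in the example format matter. A minimal sketch with
placeholder links:

```python
# Hypothetical illustration of how the URL setting is used by the spider:
# a link is internal (and followed) only if URL is a substring of it.
CHECK_URL = "https://example.com/"  # value of the URL environment variable

links = [
    "https://example.com/about",   # internal: followed and checked
    "https://other.org/page",      # external: status checked, not followed
    "mailto:[email protected]",      # skipped entirely
]

for link in links:
    if any(s in link for s in (".mov", ".m4a", "mailto")):
        print(link, "-> skipped")
    elif CHECK_URL in link:
        print(link, "-> followed")
    else:
        print(link, "-> status checked only")
```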
49 changes: 49 additions & 0 deletions handler.py
@@ -0,0 +1,49 @@
import imp
import json
import os
import sys

sys.modules["sqlite"] = imp.new_module("sqlite")
sys.modules["sqlite3.dbapi2"] = imp.new_module("sqlite.dbapi2")
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy import signals
import requests

MAILGUN_DOMAIN_NAME = os.environ['MAILGUN_DOMAIN_NAME']
MAILGUN_API_KEY = os.environ['MAILGUN_API_KEY']
EMAIL = os.environ["EMAIL"] # Emails will be sent to this address


def run(event, context):
    items = []

    def add_item(item):
        items.append(item)

    # Create and run the crawler, scrapy stuff
    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler('broken_link_spider')
    crawler.signals.connect(add_item, signals.item_passed)  # Intercept the results
    process.crawl(crawler)
    process.start()

    # Convert results to json and send email
    json_string = json.dumps([ob.__dict__ for ob in items])
    print("Found broken links:", json_string)
    send_simple_message(EMAIL, json_string)


def send_simple_message(to, content):
    url = 'https://api.mailgun.net/v3/{}/messages'.format(MAILGUN_DOMAIN_NAME)
    auth = ('api', MAILGUN_API_KEY)
    data = {
        'from': 'Link Checker <noreply@{}>'.format(MAILGUN_DOMAIN_NAME),
        'to': to,
        'subject': 'Results of Broken Links',
        'text': content,
    }
    print(data)
    response = requests.post(url, auth=auth, data=data)
    print(response.content)
    response.raise_for_status()
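
A quick way to exercise the handler outside Lambda is to set the
environment variables and call `run` directly. This is a hedged sketch
(not part of the commit): all values are placeholders, and setting
`SCRAPY_SETTINGS_MODULE` explicitly lets `get_project_settings` resolve
the project settings regardless of whether a `scrapy.cfg` is present.
The final Mailgun call will fail unless real credentials are supplied.

```python
# Hypothetical local smoke test for handler.run (not part of this commit).
# The handler reads its configuration at import time, so the environment
# must be prepared before `handler` is imported. All values are placeholders.
import os

os.environ.setdefault("MAILGUN_API_KEY", "key-xxxx")          # placeholder
os.environ.setdefault("MAILGUN_DOMAIN_NAME", "example.com")   # placeholder
os.environ.setdefault("EMAIL", "[email protected]")
os.environ.setdefault("URL", "https://example.com/")
os.environ.setdefault("SCRAPY_SETTINGS_MODULE", "linkchecker.settings")

import handler  # imported after the environment is prepared

# Lambda would pass a real event and context; this handler ignores both.
handler.run(event=None, context=None)
```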
Empty file added linkchecker/__init__.py
Empty file.
4 changes: 4 additions & 0 deletions linkchecker/settings.py
@@ -0,0 +1,4 @@
BOT_NAME = 'linkchecker'
SPIDER_MODULES = ['linkchecker.spiders']
NEWSPIDER_MODULE = 'linkchecker.spiders'
ROBOTSTXT_OBEY = False
Empty file added linkchecker/spiders/__init__.py
Empty file.
54 changes: 54 additions & 0 deletions linkchecker/spiders/broken_link_spider.py
@@ -0,0 +1,54 @@
import os

import scrapy
from scrapy.item import Item, Field

CHECK_URL = os.environ["URL"]


class BrokenItem(Item):
    url = Field()
    referer = Field()
    status = Field()


class BrokenLinksSpider(scrapy.Spider):
    name = 'broken_link_spider'
    start_urls = [CHECK_URL, ]
    handle_httpstatus_list = [400, 403, 404, 500, 501]

    def parse(self, response):
        if self.error_condition(response):
            yield self.to_item(response)

        # Find links to follow
        for link in response.css('a::attr(href)'):
            url = link.extract()

            # Skip large and/or invalid links.
            invalid_urls = [".mov", ".m4a", "mailto"]
            if any(substring in url for substring in invalid_urls):
                continue

            # If a local link, follow it. Otherwise only check the response.
            if CHECK_URL in url:
                yield scrapy.Request(response.urljoin(url), callback=self.parse)
            else:
                yield scrapy.Request(response.urljoin(url), callback=self.parse_external)

    def parse_external(self, response):
        if self.error_condition(response):
            yield self.to_item(response)

    def error_condition(self, response):
        # Statuses reported as broken; matches handle_httpstatus_list above.
        error_states = [400, 403, 404, 500, 501]
        if response.status in error_states:
            return True
        return False

    def to_item(self, response):
        item = BrokenItem()
        item['url'] = response.url
        referer = response.request.headers.get('Referer')  # None for the start URL
        item['referer'] = referer.decode("utf-8") if referer else ""
        item['status'] = response.status
        return item
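
To check the crawl itself without Lambda or Mailgun, the spider can
also be run standalone through Scrapy's API. A minimal sketch (not part
of this commit), assuming it is run from the repository root with a
placeholder target URL:

```python
# Hypothetical standalone crawl of BrokenLinksSpider (not part of this commit).
import os

# CHECK_URL is read when the spider module is imported, so set URL first.
os.environ.setdefault("URL", "https://example.com/")  # placeholder target

from scrapy import signals
from scrapy.crawler import CrawlerProcess

from linkchecker.spiders.broken_link_spider import BrokenLinksSpider

broken = []


def record(item):
    broken.append(dict(item))  # collect each BrokenItem the spider yields


process = CrawlerProcess()
crawler = process.create_crawler(BrokenLinksSpider)
crawler.signals.connect(record, signals.item_scraped)
process.crawl(crawler)
process.start()  # blocks until the crawl finishes

for entry in broken:
    print(entry["status"], entry["url"], "linked from", entry["referer"])
```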
