From 6b49ea22fe39206baf04a3cf3973e1ce317360dc Mon Sep 17 00:00:00 2001
From: Endy Iskandar Imam
Date: Thu, 13 Feb 2020 09:53:57 -0500
Subject: [PATCH 1/2] Create lint-and-test.yml

---
 .github/workflows/lint-and-test.yml | 33 +++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 .github/workflows/lint-and-test.yml

diff --git a/.github/workflows/lint-and-test.yml b/.github/workflows/lint-and-test.yml
new file mode 100644
index 0000000..420d2d2
--- /dev/null
+++ b/.github/workflows/lint-and-test.yml
@@ -0,0 +1,33 @@
+name: Lint and Test
+
+on:
+  push:
+    branches:
+      - update-requirements
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up Python 3.7
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install -r dev_requirements.txt
+    - name: Lint with flake8
+      run: |
+        # fail the build if flake8 reports any problems (E501, W503 and E203 are ignored)
+        flake8 . --count --ignore E501,W503,E203 --show-source --statistics
+    - name: Lint with Black
+      run: |
+        black --check .
+    - name: Test with django
+      run: |
+        python manage.py test

From b885aea69249893ba53e7ce90ce666f6bf4de5fe Mon Sep 17 00:00:00 2001
From: Elliot <5495776+bitoffdev@users.noreply.github.com>
Date: Mon, 24 Feb 2020 18:53:21 -0500
Subject: [PATCH 2/2] Refactor Scraper (#121)

- Convert remaining absolute imports to relative in scraper
- Unify configuration and logging for the scraper
- Add django management command to create scraper user
- Update both READMEs to reflect scraper changes
---
 README.md                           | 14 ++---
 .../commands/create_scraper_user.py | 26 ++++++++
 dev_requirements.txt                |  1 +
 requirements.txt                    |  1 +
 scrapers/README.rst                 | 42 ++++++++-----
 scrapers/__init__.py                |  0
 scrapers/agency_api_service.py      | 25 +++++---
 scrapers/lighthouse.py              | 10 +--
 scrapers/process_agency_info.py     | 30 ++++-----
 scrapers/scrape_handler.py          | 23 +++----
 scrapers/scrapers/social_scraper.py | 29 ++++-----
 scrapers/settings.py                | 62 +++++++++++++++++++
 12 files changed, 175 insertions(+), 88 deletions(-)
 create mode 100644 apps/civic_pulse/management/commands/create_scraper_user.py
 create mode 100644 scrapers/__init__.py
 create mode 100644 scrapers/settings.py

diff --git a/README.md b/README.md
index 8ae9aa7..52545b5 100644
--- a/README.md
+++ b/README.md
@@ -110,20 +110,14 @@ Note: The scrapers live in an independent environment not neccessarily in the sa
   # enter the password when prompted. It can be any password that you wish to use.
   # It is used for login to the admin website.
   ```
-- Start up the webserver so we can create a user for the scraper.
+- Start up the webserver
 ```bash
 python3 manage.py runserver
 ```
-- Visit localhost:8000/admin and follow the UI to add a new user named "scraper", set the password to whatever you would like but make note of it.
-
-- In a new terminal tab, create a token for the scraper user using the following command
-```bash
-python3 manage.py drf_create_token scraper
-```
-Finally, the database is ready to go! We are now ready to run the server:
 - Navigate in your browser to `http://127.0.0.1:8000/admin`. Log in with the new
 admin user you just created. Click on Agencys and you should see a list of
-agencies.
+agencies created with the ``fill_agency_objects`` command.
+
+To set up the scraper, read [the scraper README](scrapers/README.rst).
 ## Code formatting
 GovLens enforces code style using [Black](https://github.com/psf/black) and pep8 rules using [Flake8](http://flake8.pycqa.org/en/latest/).
diff --git a/apps/civic_pulse/management/commands/create_scraper_user.py b/apps/civic_pulse/management/commands/create_scraper_user.py
new file mode 100644
index 0000000..f6b8884
--- /dev/null
+++ b/apps/civic_pulse/management/commands/create_scraper_user.py
@@ -0,0 +1,26 @@
+"""Idempotent management command to create the scraper user with a DRF token
+"""
+from django.core.management.base import BaseCommand
+from django.contrib.auth.models import User
+from rest_framework.authtoken.models import Token
+
+SCRAPER_USERNAME = "scraper"
+
+
+class Command(BaseCommand):
+    help = "Get or create a scraper user with a Django REST Framework token"
+
+    def add_arguments(self, parser):
+        pass
+
+    def handle(self, *args, **options):
+        user, created = User.objects.get_or_create(username=SCRAPER_USERNAME)
+        user.save()
+
+        if created:
+            self.stdout.write(f"Created new user with username {SCRAPER_USERNAME}")
+        else:
+            self.stdout.write(f"User {SCRAPER_USERNAME} already exists.")
+
+        token, created = Token.objects.get_or_create(user=user)
+        self.stdout.write(f"The token for the user {SCRAPER_USERNAME} is {token}")
diff --git a/dev_requirements.txt b/dev_requirements.txt
index 4e92b9d..930b21a 100644
--- a/dev_requirements.txt
+++ b/dev_requirements.txt
@@ -1,2 +1,3 @@
 black
 flake8
+coloredlogs==10.0
diff --git a/requirements.txt b/requirements.txt
index faca283..3060d4a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ simplejson==3.16.0
 sqlparse==0.3.0
 urllib3==1.24.2
 apscheduler==3.6.0
+python-dotenv==0.11.0
diff --git a/scrapers/README.rst b/scrapers/README.rst
index 5187286..c310fb4 100644
--- a/scrapers/README.rst
+++ b/scrapers/README.rst
@@ -27,28 +27,38 @@ Directory Structure
    ├── security_scraper.py - scrapes for HTTPS & privacy policy
    └── social_scraper.py - scrapes for phone number, email, address, social media
 
-Requirements
-============
+Quick Start
+===========
+
+Configuration
+~~~~~~~~~~~~~
+
+There are a few required environment variables. The easiest way to set them in development is to create a file called ``.env`` in the root directory of this repository (don't commit this file). It should contain the following text::
+
+    GOVLENS_API_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+    GOVLENS_API_ENDPOINT=http://127.0.0.1:8000/api/agencies/
+    GOOGLE_API_KEY=XXXXXXXXXXXXXXXXXXXXXXXX
+
+To get the ``GOOGLE_API_KEY``, you need to visit the following page: https://developers.google.com/speed/docs/insights/v5/get-started
+
+To get the ``GOVLENS_API_TOKEN``, run ``python3 manage.py create_scraper_user``. Copy the token from the command output and paste it into the ``.env`` file.
+
+Execution
+~~~~~~~~~
 
-Google Lighthouse API Key
-~~~~~~~~~~~~~~~~~~~~~~~~~
-Get the API key for accessing lighthouse from here: https://developers.google.com/speed/docs/insights/v5/get-started (click on the button get key)
+Once you have created the ``.env`` file as mentioned above, run the scraper::
 
-Put that key in GOOGLE_API_KEY environment variable.
+    # run the following from the root directory of the repository
+    python3 -m scrapers.scrape_handler
 
-Running the Scrapers
-====================
-``scrape_handler.py`` is the entry point for scraping.
-When we run from our local machine, we get the list of agencies and start scraping them.
-But when deployed to AWS, the scraper is invoked by the schedule and ``scrape_handler.scrape_data()`` is the method hooked up to the lambda.
+Design
+======
 
-Local
-~~~~~
-If running from local, the following command should run the scraper::
+The scraper is intended to be used both locally and on AWS Lambda.
 
-    python scraper.py
+The ``scrapers`` directory in the root of this repository is the top-level Python package for this project. This means that any absolute imports should begin with ``scrapers.MODULE_NAME_HERE``.
 
-Make sure to set the environment variable to your local endpoint.
+``scrapers/scrape_handler.py`` is the entry point for scraping. On AWS Lambda, the method ``scrape_handler.scrape_data()`` is imported and called directly.
 
 AWS Lambda
 ~~~~~~~~~~
diff --git a/scrapers/__init__.py b/scrapers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapers/agency_api_service.py b/scrapers/agency_api_service.py
index 62d3710..b1a6c9d 100644
--- a/scrapers/agency_api_service.py
+++ b/scrapers/agency_api_service.py
@@ -1,25 +1,30 @@
-import os
+import logging
 
 import requests
 
+from . import settings
+
+logger = logging.getLogger(__name__)
+
 
 class AgencyApiService:
     def __init__(self):
-        # If environment variable is set, we use the corresponding api(usually local). otherwise govlens api
-        if os.environ.get("govlens_api", None) is None:
-            self.base_url = (
-                "http://govlens.us-east-2.elasticbeanstalk.com/api/agencies/"
-            )
-        else:
-            self.base_url = os.environ["govlens_api"]
+        self.base_url = settings.GOVLENS_API_ENDPOINT
 
     def get_all_agencies(self):
        try:
            all_agency_list = self._get(self.base_url)
            return all_agency_list
        except Exception as ex:
-            print(f"Error while retrieving all the agency information: {str(ex)}")
+            logger.error("Error while retrieving all the agency information: %s", ex)
 
     def _get(self, url):
-        response = requests.get(url, headers={"Content-type": "application/json"})
+        response = requests.get(
+            url,
+            headers={
+                "Content-type": "application/json",
+                "Authorization": "Token {}".format(settings.GOVLENS_API_TOKEN),
+            },
+        )
+        response.raise_for_status()
         return response.json()
diff --git a/scrapers/lighthouse.py b/scrapers/lighthouse.py
index 8f594e3..29a1917 100644
--- a/scrapers/lighthouse.py
+++ b/scrapers/lighthouse.py
@@ -1,7 +1,7 @@
-from scrapers.base_api_client import ApiClient
+from .scrapers.base_api_client import ApiClient
+from . import settings
 
-GOOGLE_API_KEY = ""  # os.environ['GOOGLE_API_KEY']
 PAGE_INSIGHTS_ENDPOINT = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
 MOBILE_FRIENDLY_ENDPOINT = "https://searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run"  # from what i have tested, very hard to automate
@@ -15,7 +15,7 @@ class PageInsightsClient(ApiClient):
-    def __init__(self, api_uri=PAGE_INSIGHTS_ENDPOINT, api_key=GOOGLE_API_KEY):
+    def __init__(self, api_uri=PAGE_INSIGHTS_ENDPOINT, api_key=settings.GOOGLE_API_KEY):
         ApiClient.__init__(self, api_uri, api_key)
 
     def get_page_insights(self, url, category):
@@ -24,7 +24,9 @@ def get_page_insights(self, url, category):
 
 class GoogleMobileFriendlyClient(ApiClient):
-    def __init__(self, api_uri=MOBILE_FRIENDLY_ENDPOINT, api_key=GOOGLE_API_KEY):
+    def __init__(
+        self, api_uri=MOBILE_FRIENDLY_ENDPOINT, api_key=settings.GOOGLE_API_KEY
+    ):
         self.urls = []
         self.results = []
         ApiClient.__init__(self, api_uri, api_key)
diff --git a/scrapers/process_agency_info.py b/scrapers/process_agency_info.py
index 1395a6b..1bc3c9d 100644
--- a/scrapers/process_agency_info.py
+++ b/scrapers/process_agency_info.py
@@ -1,10 +1,12 @@
-import os
 import requests
 import logging
-from scrapers.social_scraper import SocialScraper
-from scrapers.security_scraper import SecurityScraper
-from scrapers.accessibility_scraper import AccessibilityScraper
-from agency_dataaccessor import AgencyDataAccessor
+from .scrapers.social_scraper import SocialScraper
+from .scrapers.security_scraper import SecurityScraper
+from .scrapers.accessibility_scraper import AccessibilityScraper
+from .agency_dataaccessor import AgencyDataAccessor
+from . import settings
+
+logger = logging.getLogger(__name__)
 
 
 class AgencyInfo:
@@ -24,15 +26,12 @@ def process_agency_info(self):
             # HTTP Get on agency url
             agency_url = self.agency.get("website", None)
             if agency_url is None or agency_url == "":
-                print(
-                    f"Website url is not available for {self.agency['id']}, name: {self.agency['name']}"
-                )
-                logging.error(
+                logger.error(
                     f"Website url is not available for {self.agency['id']}, name: {self.agency['name']}"
                 )
                 self.agency_dataaccessor.update_agency_info(self.agency)
                 return
-            print(f"Scraping the website {agency_url}")
+            logger.info(f"Scraping the website {agency_url}")
             page = requests.get(agency_url, timeout=30)
             # Initialize scrapers
             socialScraper = SocialScraper(page, agency_url)
@@ -45,7 +44,7 @@ def process_agency_info(self):
             # Figure out the google_api_key and then fix the below buckets
             for bucket in self.buckets:
                 if bucket == "security_and_privacy":
-                    if os.environ.get("GOOGLE_API_KEY", None) is not None:
+                    if settings.GOOGLE_API_KEY:
                         profile_info[
                             bucket
                         ] = securityScraper.get_security_privacy_info()
@@ -56,7 +55,7 @@ def process_agency_info(self):
                         social_media_info, contact_info
                     )
                 elif bucket == "website_accessibility":
-                    if os.environ.get("GOOGLE_API_KEY", None) is not None:
+                    if settings.GOOGLE_API_KEY:
                         profile_info[
                             bucket
                         ] = accessibilityScraper.get_website_accessibility_info()
@@ -71,9 +70,6 @@ def process_agency_info(self):
             self.agency_dataaccessor.enrich_agency_info_with_scrape_info(agency_details)
             return agency_details
         except Exception as ex:
-            logging.error(
-                f"An error occurred while processing the agency information: {str(ex)}"
-            )
-            print(
-                f"An error occurred while processing the agency information: {str(ex)}"
+            logger.error(
+                "An error occurred while processing the agency information: %s", ex
             )
diff --git a/scrapers/scrape_handler.py b/scrapers/scrape_handler.py
index 69168a8..67e255a 100644
--- a/scrapers/scrape_handler.py
+++ b/scrapers/scrape_handler.py
@@ -1,14 +1,19 @@
-import os
 import logging
 
 from .process_agency_info import AgencyInfo
 from .agency_api_service import AgencyApiService
+from . import settings
+
+settings.setup_logging()
+
+logger = logging.getLogger(__name__)
+
 
 # method invoked by lambda
 def scrape_data(event, context=None):
     agencies = event["agencies"]
     if event.get("agencies", None) is None or len(agencies) <= 0:
-        print("No Agency information was passed to scrape")
+        logger.warning("No Agency information was passed to scrape")
         return
 
     for agency in agencies:
@@ -16,20 +21,10 @@ def scrape_data(event, context=None):
         agency_instance.process_agency_info()
 
 
-# if running from local, we get the list of agencies and scrape one by one.
 if __name__ == "__main__":
-    # If running from local, set the environment variable to your local
-    logging.basicConfig(
-        filename="Scraper_Errors.log",
-        level=logging.ERROR,
-        format="%(asctime)s %(message)s",
-    )
-    os.environ[
-        "govlens_api"
-    ] = "http://govlens.us-east-2.elasticbeanstalk.com/api/agencies/"
-    os.environ["GOOGLE_API_KEY"] = ""
+
     agency_api_service = AgencyApiService()
     agencies = agency_api_service.get_all_agencies()
     event = {"agencies": agencies}
     scrape_data(event)
-    print("SCRAPED")
+    logger.info("Finished scraping")
diff --git a/scrapers/scrapers/social_scraper.py b/scrapers/scrapers/social_scraper.py
index 82bb26c..8226201 100644
--- a/scrapers/scrapers/social_scraper.py
+++ b/scrapers/scrapers/social_scraper.py
@@ -4,6 +4,8 @@
 import re
 import logging
 
+logger = logging.getLogger(__name__)
+
 
 class SocialScraper(BaseScraper):
@@ -39,34 +41,29 @@ def scrape_info(self):
                 elif any(link in tag["href"] for link in social_media_criteria):
                     social_media_links.append(tag["href"])
         except Exception as ex:
-            print(
-                f"An error occurred while trying to extract the social media information: {str(ex)}"
-            )
             logging.error(
-                f"An error occurred while trying to extract the social media information: {str(ex)}"
+                "An error occurred while trying to extract the social media information: %s",
+                ex,
             )
 
         if contact_us_link:
             if "http" in contact_us_link["href"]:
-                print(
+                logger.info(
                     f"making an extra call to get the contact info: {contact_us_link['href']}"
                 )
                 contact_us_page = requests.get(contact_us_link["href"])
             else:
-                print(
+                logger.info(
                     f"making an extra call to get the contact info: {self.url+contact_us_link['href']}"
                 )
                 contact_us_page = requests.get(self.url + contact_us_link["href"])
             contact_us_soup = BeautifulSoup(contact_us_page.content, "html.parser")
             contact_info = self.get_contact_info(contact_us_soup)
         else:
-            print("not making an extra call to get the contact info")
+            logger.info("not making an extra call to get the contact info")
             contact_info = self.get_contact_info(soup)
        except Exception as ex:
-            print(
-                f"An error occurred while processing the social media information: {str(ex)}"
-            )
             logging.error(
-                f"An error occurred while processing the social media information: {str(ex)}"
+                "An error occurred while processing the social media information: %s", ex
             )
 
         return social_media_links, contact_info
@@ -106,15 +103,13 @@ def get_contact_info(self, soup):
                 "address": list(set(address))[0] if address else [],
             }
         else:
-            print("Contact Information not available")
+            logger.warning("Contact Information not available")
             all_contact_info = {"email": [], "phone_number": [], "address": []}
             return all_contact_info
-        except Exception:
-            print(
-                "An error occurred while extracting the contact information for the firm {self.url}: {str(ex)}"
-            )
+        except Exception as ex:
             logging.error(
-                "An error occurred while extracting the contact information for the firm {self.url}: {str(ex)}"
+                "An error occurred while extracting the contact information for the firm %s: %s",
+                self.url, ex,
             )
             return None
diff --git a/scrapers/settings.py b/scrapers/settings.py
new file mode 100644
index 0000000..2e0c9e3
--- /dev/null
+++ b/scrapers/settings.py
@@ -0,0 +1,62 @@
+"""Unified configuration for GovLens scraper
+
+Goals of this file:
+ - Express the entire configuration of the scraper in one place
+ - Quickly fail in the event of an improper configuration
+"""
+import logging
+import os
+import sys
+
+
+logger = logging.getLogger(__name__)
+
+
+def setup_logging():
+    """configure logging
+
+    call this as soon as possible
+    """
+    try:
+        import coloredlogs
+
+        coloredlogs.install(level=logging.INFO)
+    except Exception:
+        logger.warning("Could not import coloredlogs")
+        # fall back to basicConfig
+        logging.basicConfig(level=logging.INFO)
+
+
+# setup the logger
+setup_logging()
+
+# attempt to load variables from a file called '.env'
+try:
+    from dotenv import load_dotenv
+
+    load_dotenv()
+except ImportError:
+    logger.warning(
+        "dotenv could not be imported. Variables will only be loaded from the environment."
+    )
+
+GOVLENS_API_TOKEN = os.environ.get("GOVLENS_API_TOKEN")
+GOVLENS_API_ENDPOINT = os.environ.get("GOVLENS_API_ENDPOINT")
+GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
+
+if not GOVLENS_API_ENDPOINT:
+    logger.error(
+        "The environment variable GOVLENS_API_ENDPOINT was not provided. Exiting..."
+    )
+    sys.exit(1)
+
+if not GOVLENS_API_TOKEN:
+    logger.error(
+        "The environment variable GOVLENS_API_TOKEN was not provided. Exiting..."
+    )
+    sys.exit(1)
+
+if not GOOGLE_API_KEY:
+    logger.warning(
+        "The environment variable GOOGLE_API_KEY was not provided; no lighthouse data will be collected."
+    )
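
For anyone trying the refactored scraper outside of AWS Lambda, the following is a minimal usage sketch (not part of the patch) that mirrors the ``__main__`` block of ``scrapers/scrape_handler.py``. It assumes the ``.env`` file described in ``scrapers/README.rst`` exists, the Django API behind ``GOVLENS_API_ENDPOINT`` is running, and it is executed from the repository root so that ``scrapers`` is importable as a package:

```python
# Minimal sketch, assuming the .env file from scrapers/README.rst is present and
# `python3 manage.py runserver` is serving the API that GOVLENS_API_ENDPOINT points at.
from scrapers.agency_api_service import AgencyApiService
from scrapers.scrape_handler import scrape_data

# Fetch every agency from the GovLens API, then scrape each one in turn.
agencies = AgencyApiService().get_all_agencies()
scrape_data({"agencies": agencies})
```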