From 6b49ea22fe39206baf04a3cf3973e1ce317360dc Mon Sep 17 00:00:00 2001
From: Endy Iskandar Imam
Date: Thu, 13 Feb 2020 09:53:57 -0500
Subject: [PATCH 1/2] Create lint-and-test.yml

---
 .github/workflows/lint-and-test.yml | 33 +++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 .github/workflows/lint-and-test.yml

diff --git a/.github/workflows/lint-and-test.yml b/.github/workflows/lint-and-test.yml
new file mode 100644
index 0000000..420d2d2
--- /dev/null
+++ b/.github/workflows/lint-and-test.yml
@@ -0,0 +1,33 @@
+name: Lint and Test
+
+on:
+  push:
+    branches:
+      - update-requirements
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up Python 3.7
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install -r dev_requirements.txt
+    - name: Lint with flake8
+      run: |
+        # fail the build if flake8 reports any problems (E501, W503 and E203 are ignored)
+        flake8 . --count --ignore E501,W503,E203 --show-source --statistics
+    - name: Lint with Black
+      run: |
+        black --check .
+    - name: Test with django
+      run: |
+        python manage.py test

From b885aea69249893ba53e7ce90ce666f6bf4de5fe Mon Sep 17 00:00:00 2001
From: Elliot <5495776+bitoffdev@users.noreply.github.com>
Date: Mon, 24 Feb 2020 18:53:21 -0500
Subject: [PATCH 2/2] Refactor Scraper (#121)

- Convert remaining absolute imports to relative in scraper
- Unify configuration and logging for the scraper
- Add django management command to create scraper user
- Update both READMEs to reflect scraper changes
---
 README.md                           | 14 ++---
 .../commands/create_scraper_user.py | 26 ++++++++
 dev_requirements.txt                |  1 +
 requirements.txt                    |  1 +
 scrapers/README.rst                 | 42 ++++++++-----
 scrapers/__init__.py                |  0
 scrapers/agency_api_service.py      | 25 +++++---
 scrapers/lighthouse.py              | 10 +--
 scrapers/process_agency_info.py     | 30 ++++-----
 scrapers/scrape_handler.py          | 23 +++----
 scrapers/scrapers/social_scraper.py | 29 ++++-----
 scrapers/settings.py                | 62 +++++++++++++++++++
 12 files changed, 175 insertions(+), 88 deletions(-)
 create mode 100644 apps/civic_pulse/management/commands/create_scraper_user.py
 create mode 100644 scrapers/__init__.py
 create mode 100644 scrapers/settings.py

diff --git a/README.md b/README.md
index 8ae9aa7..52545b5 100644
--- a/README.md
+++ b/README.md
@@ -110,20 +110,14 @@ Note: The scrapers live in an independent environment not neccessarily in the sa
   # enter the password when prompted. It can be any password that you wish to use.
   # It is used for login to the admin website.
   ```
-- Start up the webserver so we can create a user for the scraper.
+- Start up the webserver
 ```bash
 python3 manage.py runserver
 ```
-- Visit localhost:8000/admin and follow the UI to add a new user named "scraper", set the password to whatever you would like but make note of it.
-
-- In a new terminal tab, create a token for the scraper user using the following command
-```bash
-python3 manage.py drf_create_token scraper
-```
-Finally, the database is ready to go! We are now ready to run the server:
 - Navigate in your browser to `http://127.0.0.1:8000/admin`. Log in with the new
 admin user you just created. Click on Agencys and you should see a list of
-agencies.
+agencies created with the ``fill_agency_objects`` command.
+
+To set up the scraper, read [the scraper README](scrapers/README.rst).
 ## Code formatting
 GovLens enforces code style using [Black](https://github.com/psf/black) and pep8 rules using [Flake8](http://flake8.pycqa.org/en/latest/).
diff --git a/apps/civic_pulse/management/commands/create_scraper_user.py b/apps/civic_pulse/management/commands/create_scraper_user.py
new file mode 100644
index 0000000..f6b8884
--- /dev/null
+++ b/apps/civic_pulse/management/commands/create_scraper_user.py
@@ -0,0 +1,26 @@
+"""Idempotent management command to create the scraper user with a DRF token
+"""
+from django.core.management.base import BaseCommand
+from django.contrib.auth.models import User
+from rest_framework.authtoken.models import Token
+
+SCRAPER_USERNAME = "scraper"
+
+
+class Command(BaseCommand):
+    help = "Get or create a scraper user with a Django REST Framework token"
+
+    def add_arguments(self, parser):
+        pass
+
+    def handle(self, *args, **options):
+        user, created = User.objects.get_or_create(username=SCRAPER_USERNAME)
+        user.save()
+
+        if created:
+            self.stdout.write(f"Created new user with username {SCRAPER_USERNAME}")
+        else:
+            self.stdout.write(f"User {SCRAPER_USERNAME} already exists.")
+
+        token, created = Token.objects.get_or_create(user=user)
+        self.stdout.write(f"The token for the user {SCRAPER_USERNAME} is {token}")
diff --git a/dev_requirements.txt b/dev_requirements.txt
index 4e92b9d..930b21a 100644
--- a/dev_requirements.txt
+++ b/dev_requirements.txt
@@ -1,2 +1,3 @@
 black
 flake8
+coloredlogs==10.0
diff --git a/requirements.txt b/requirements.txt
index faca283..3060d4a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ simplejson==3.16.0
 sqlparse==0.3.0
 urllib3==1.24.2
 apscheduler==3.6.0
+python-dotenv==0.11.0
diff --git a/scrapers/README.rst b/scrapers/README.rst
index 5187286..c310fb4 100644
--- a/scrapers/README.rst
+++ b/scrapers/README.rst
@@ -27,28 +27,38 @@ Directory Structure
    ├── security_scraper.py - scrapes for HTTPS & privacy policy
    └── social_scraper.py - scrapes for phone number, email, address, social media
 
-Requirements
-============
+Quick Start
+===========
+
+Configuration
+~~~~~~~~~~~~~
+
+There are a few required environment variables. The easiest way to set them in development is to create a file called ``.env`` in the root directory of this repository (don't commit this file). It should contain the following text::
+
+    GOVLENS_API_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+    GOVLENS_API_ENDPOINT=http://127.0.0.1:8000/api/agencies/
+    GOOGLE_API_KEY=XXXXXXXXXXXXXXXXXXXXXXXX
+
+To get the ``GOOGLE_API_KEY``, you need to visit the following page: https://developers.google.com/speed/docs/insights/v5/get-started
+
+To get the ``GOVLENS_API_TOKEN``, run ``python3 manage.py create_scraper_user``. Copy the token from the command output and paste it into the ``.env`` file.
+
+Execution
+~~~~~~~~~
 
-Google Lighthouse API Key
-~~~~~~~~~~~~~~~~~~~~~~~~~
-Get the API key for accessing lighthouse from here: https://developers.google.com/speed/docs/insights/v5/get-started (click on the button get key)
+Once you have created the ``.env`` file as mentioned above, run the scraper::
 
-Put that key in GOOGLE_API_KEY environment variable.
+    # run the following from the root directory of the repository
+    python3 -m scrapers.scrape_handler
 
-Running the Scrapers
-====================
-``scrape_handler.py`` is the entry point for scraping.
-When we run from our local machine, we get the list of agencies and start scraping them.
-But when deployed to AWS, the scraper is invoked by the schedule and ``scrape_handler.scrape_data()`` is the method hooked up to the lambda.
+Design
+======
 
-Local
-~~~~~
-If running from local, the following command should run the scraper::
+The scraper is intended to be used both locally and on AWS Lambda.
 
-    python scraper.py
+The ``scrapers`` directory in the root of this repository is the top-level Python package for this project. This means that any absolute imports should begin with ``scrapers.MODULE_NAME_HERE``.
 
-Make sure to set the environment variable to your local endpoint.
+``scrapers/scrape_handler.py`` is the entry point for scraping. On AWS Lambda, the method ``scrape_handler.scrape_data()`` is imported and called directly.
 
 AWS Lambda
 ~~~~~~~~~~
diff --git a/scrapers/__init__.py b/scrapers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapers/agency_api_service.py b/scrapers/agency_api_service.py
index 62d3710..b1a6c9d 100644
--- a/scrapers/agency_api_service.py
+++ b/scrapers/agency_api_service.py
@@ -1,25 +1,30 @@
-import os
+import logging
 
 import requests
 
+from . import settings
+
+logger = logging.getLogger(__name__)
+
 
 class AgencyApiService:
     def __init__(self):
-        # If environment variable is set, we use the corresponding api(usually local). otherwise govlens api
-        if os.environ.get("govlens_api", None) is None:
-            self.base_url = (
-                "http://govlens.us-east-2.elasticbeanstalk.com/api/agencies/"
-            )
-        else:
-            self.base_url = os.environ["govlens_api"]
+        self.base_url = settings.GOVLENS_API_ENDPOINT
 
     def get_all_agencies(self):
        try:
            all_agency_list = self._get(self.base_url)
            return all_agency_list
        except Exception as ex:
-            print(f"Error while retrieving all the agency information: {str(ex)}")
+            logger.error("Error while retrieving all the agency information: %s", ex)
 
     def _get(self, url):
-        response = requests.get(url, headers={"Content-type": "application/json"})
+        response = requests.get(
+            url,
+            headers={
+                "Content-type": "application/json",
+                "Authorization": "Token {}".format(settings.GOVLENS_API_TOKEN),
+            },
+        )
+        response.raise_for_status()
         return response.json()
diff --git a/scrapers/lighthouse.py b/scrapers/lighthouse.py
index 8f594e3..29a1917 100644
--- a/scrapers/lighthouse.py
+++ b/scrapers/lighthouse.py
@@ -1,7 +1,7 @@
-from scrapers.base_api_client import ApiClient
+from .scrapers.base_api_client import ApiClient
+from . import settings
 
-GOOGLE_API_KEY = ""  # os.environ['GOOGLE_API_KEY']
 PAGE_INSIGHTS_ENDPOINT = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
 MOBILE_FRIENDLY_ENDPOINT = "https://searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run"  # from what i have tested, very hard to automate
@@ -15,7 +15,7 @@ class PageInsightsClient(ApiClient):
-    def __init__(self, api_uri=PAGE_INSIGHTS_ENDPOINT, api_key=GOOGLE_API_KEY):
+    def __init__(self, api_uri=PAGE_INSIGHTS_ENDPOINT, api_key=settings.GOOGLE_API_KEY):
         ApiClient.__init__(self, api_uri, api_key)
 
     def get_page_insights(self, url, category):
@@ -24,7 +24,9 @@ def get_page_insights(self, url, category):
 
 class GoogleMobileFriendlyClient(ApiClient):
-    def __init__(self, api_uri=MOBILE_FRIENDLY_ENDPOINT, api_key=GOOGLE_API_KEY):
+    def __init__(
+        self, api_uri=MOBILE_FRIENDLY_ENDPOINT, api_key=settings.GOOGLE_API_KEY
+    ):
         self.urls = []
         self.results = []
         ApiClient.__init__(self, api_uri, api_key)
diff --git a/scrapers/process_agency_info.py b/scrapers/process_agency_info.py
index 1395a6b..1bc3c9d 100644
--- a/scrapers/process_agency_info.py
+++ b/scrapers/process_agency_info.py
@@ -1,10 +1,12 @@
-import os
 import requests
 import logging
-from scrapers.social_scraper import SocialScraper
-from scrapers.security_scraper import SecurityScraper
-from scrapers.accessibility_scraper import AccessibilityScraper
-from agency_dataaccessor import AgencyDataAccessor
+from .scrapers.social_scraper import SocialScraper
+from .scrapers.security_scraper import SecurityScraper
+from .scrapers.accessibility_scraper import AccessibilityScraper
+from .agency_dataaccessor import AgencyDataAccessor
+from . import settings
+
+logger = logging.getLogger(__name__)
 
 
 class AgencyInfo:
@@ -24,15 +26,12 @@ def process_agency_info(self):
             # HTTP Get on agency url
             agency_url = self.agency.get("website", None)
             if agency_url is None or agency_url == "":
-                print(
-                    f"Website url is not available for {self.agency['id']}, name: {self.agency['name']}"
-                )
-                logging.error(
+                logger.error(
                     f"Website url is not available for {self.agency['id']}, name: {self.agency['name']}"
                 )
                 self.agency_dataaccessor.update_agency_info(self.agency)
                 return
-            print(f"Scraping the website {agency_url}")
+            logger.info(f"Scraping the website {agency_url}")
             page = requests.get(agency_url, timeout=30)
             # Initialize scrapers
             socialScraper = SocialScraper(page, agency_url)
@@ -45,7 +44,7 @@ def process_agency_info(self):
             # Figure out the google_api_key and then fix the below buckets
             for bucket in self.buckets:
                 if bucket == "security_and_privacy":
-                    if os.environ.get("GOOGLE_API_KEY", None) is not None:
+                    if settings.GOOGLE_API_KEY:
                         profile_info[
                             bucket
                         ] = securityScraper.get_security_privacy_info()
@@ -56,7 +55,7 @@ def process_agency_info(self):
                         social_media_info, contact_info
                     )
                 elif bucket == "website_accessibility":
-                    if os.environ.get("GOOGLE_API_KEY", None) is not None:
+                    if settings.GOOGLE_API_KEY:
                         profile_info[
                             bucket
                         ] = accessibilityScraper.get_website_accessibility_info()
@@ -71,9 +70,6 @@ def process_agency_info(self):
             self.agency_dataaccessor.enrich_agency_info_with_scrape_info(agency_details)
             return agency_details
         except Exception as ex:
-            logging.error(
-                f"An error occurred while processing the agency information: {str(ex)}"
-            )
-            print(
-                f"An error occurred while processing the agency information: {str(ex)}"
+            logger.error(
+                "An error occurred while processing the agency information: %s", ex
             )
diff --git a/scrapers/scrape_handler.py b/scrapers/scrape_handler.py
index 69168a8..67e255a 100644
--- a/scrapers/scrape_handler.py
+++ b/scrapers/scrape_handler.py
@@ -1,14 +1,19 @@
-import os
 import logging
 
 from .process_agency_info import AgencyInfo
 from .agency_api_service import AgencyApiService
+from . import settings
+
+settings.setup_logging()
+
+logger = logging.getLogger(__name__)
+
 
 # method invoked by lambda
 def scrape_data(event, context=None):
     agencies = event["agencies"]
     if event.get("agencies", None) is None or len(agencies) <= 0:
-        print("No Agency information was passed to scrape")
+        logger.warning("No Agency information was passed to scrape")
         return
 
     for agency in agencies:
@@ -16,20 +21,10 @@ def scrape_data(event, context=None):
         agency_instance.process_agency_info()
 
 
-# if running from local, we get the list of agencies and scrape one by one.
 if __name__ == "__main__":
-    # If running from local, set the environment variable to your local
-    logging.basicConfig(
-        filename="Scraper_Errors.log",
-        level=logging.ERROR,
-        format="%(asctime)s %(message)s",
-    )
-    os.environ[
-        "govlens_api"
-    ] = "http://govlens.us-east-2.elasticbeanstalk.com/api/agencies/"
-    os.environ["GOOGLE_API_KEY"] = ""
+
     agency_api_service = AgencyApiService()
     agencies = agency_api_service.get_all_agencies()
     event = {"agencies": agencies}
     scrape_data(event)
-    print("SCRAPED")
+    logger.info("Finished scraping")
diff --git a/scrapers/scrapers/social_scraper.py b/scrapers/scrapers/social_scraper.py
index 82bb26c..8226201 100644
--- a/scrapers/scrapers/social_scraper.py
+++ b/scrapers/scrapers/social_scraper.py
@@ -4,6 +4,8 @@
 import re
 import logging
 
+logger = logging.getLogger(__name__)
+
 
 class SocialScraper(BaseScraper):
@@ -39,34 +41,29 @@ def scrape_info(self):
                 elif any(link in tag["href"] for link in social_media_criteria):
                     social_media_links.append(tag["href"])
         except Exception as ex:
-            print(
-                f"An error occurred while trying to extract the social media information: {str(ex)}"
-            )
             logging.error(
-                f"An error occurred while trying to extract the social media information: {str(ex)}"
+                "An error occurred while trying to extract the social media information: %s",
+                ex,
             )
 
         if contact_us_link:
             if "http" in contact_us_link["href"]:
-                print(
+                logger.info(
                     f"making an extra call to get the contact info: {contact_us_link['href']}"
                 )
                 contact_us_page = requests.get(contact_us_link["href"])
             else:
-                print(
+                logger.info(
                     f"making an extra call to get the contact info: {self.url+contact_us_link['href']}"
                 )
                 contact_us_page = requests.get(self.url + contact_us_link["href"])
             contact_us_soup = BeautifulSoup(contact_us_page.content, "html.parser")
             contact_info = self.get_contact_info(contact_us_soup)
         else:
-            print("not making an extra call to get the contact info")
+            logger.info("not making an extra call to get the contact info")
             contact_info = self.get_contact_info(soup)
        except Exception as ex:
-            print(
-                f"An error occurred while processing the social media information: {str(ex)}"
-            )
             logging.error(
-                f"An error occurred while processing the social media information: {str(ex)}"
+                "An error occurred while processing the social media information: %s", ex
             )
 
         return social_media_links, contact_info
@@ -106,15 +103,13 @@ def get_contact_info(self, soup):
                 "address": list(set(address))[0] if address else [],
             }
         else:
-            print("Contact Information not available")
+            logger.warning("Contact Information not available")
             all_contact_info = {"email": [], "phone_number": [], "address": []}
             return all_contact_info
-        except Exception:
-            print(
-                "An error occurred while extracting the contact information for the firm {self.url}: {str(ex)}"
-            )
+        except Exception as ex:
             logging.error(
-                "An error occurred while extracting the contact information for the firm {self.url}: {str(ex)}"
+                "An error occurred while extracting the contact information for the firm %s: %s",
+                self.url, ex,
             )
             return None
diff --git a/scrapers/settings.py b/scrapers/settings.py
new file mode 100644
index 0000000..2e0c9e3
--- /dev/null
+++ b/scrapers/settings.py
@@ -0,0 +1,62 @@
+"""Unified configuration for GovLens scraper
+
+Goals of this file:
+ - Express the entire configuration of the scraper in one place
+ - Quickly fail in the event of an improper configuration
+"""
+import logging
+import os
+import sys
+
+
+logger = logging.getLogger(__name__)
+
+
+def setup_logging():
+    """configure logging
+
+    call this as soon as possible
+    """
+    try:
+        import coloredlogs
+
+        coloredlogs.install(level=logging.INFO)
+    except Exception:
+        logger.warning("Could not import coloredlogs")
+        # fall back to basicConfig
+        logging.basicConfig(level=logging.INFO)
+
+
+# setup the logger
+setup_logging()
+
+# attempt to load variables from a file called '.env'
+try:
+    from dotenv import load_dotenv
+
+    load_dotenv()
+except ImportError:
+    logger.warning(
+        "dotenv could not be imported. Variables will only be loaded from the environment."
+    )
+
+GOVLENS_API_TOKEN = os.environ.get("GOVLENS_API_TOKEN")
+GOVLENS_API_ENDPOINT = os.environ.get("GOVLENS_API_ENDPOINT")
+GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
+
+if not GOVLENS_API_ENDPOINT:
+    logger.error(
+        "The environment variable GOVLENS_API_ENDPOINT was not provided. Exiting..."
+    )
+    sys.exit(1)
+
+if not GOVLENS_API_TOKEN:
+    logger.error(
+        "The environment variable GOVLENS_API_TOKEN was not provided. Exiting..."
+    )
+    sys.exit(1)
+
+if not GOOGLE_API_KEY:
+    logger.warning(
+        "The environment variable GOOGLE_API_KEY was not provided; no lighthouse data will be collected."
+    )
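
For anyone trying the refactored scraper outside of AWS Lambda, the following is a minimal usage sketch (not part of the patch) that mirrors the ``__main__`` block of ``scrapers/scrape_handler.py``. It assumes the ``.env`` file described in ``scrapers/README.rst`` exists, the Django API behind ``GOVLENS_API_ENDPOINT`` is running, and it is executed from the repository root so that ``scrapers`` is importable as a package:

```python
# Minimal sketch, assuming the .env file from scrapers/README.rst is present and
# `python3 manage.py runserver` is serving the API that GOVLENS_API_ENDPOINT points at.
from scrapers.agency_api_service import AgencyApiService
from scrapers.scrape_handler import scrape_data

# Fetch every agency from the GovLens API, then scrape each one in turn.
agencies = AgencyApiService().get_all_agencies()
scrape_data({"agencies": agencies})
```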