diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..1458e64
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,48 @@
+name: CI
+
+on:
+  push:
+  release:
+    types: [ published ]
+
+jobs:
+  checks:
+    name: "Run Tests"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9'
+      - name: Set up Pre-commit
+        uses: pre-commit/action@v2.0.0
+  release:
+    name: "Release to PyPI"
+    runs-on: ubuntu-latest
+    needs: checks
+    if: "github.event_name == 'release' && startsWith(github.event.release.tag_name, 'v')"
+    steps:
+
+      - uses: actions/checkout@v2
+      - name: "Set up Python"
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9'
+      - name: "Build package"
+        run: |
+          python setup.py build sdist
+      - name: "TEST Upload to PyPI"
+        uses: pypa/gh-action-pypi-publish@release/v1
+        if: github.event.release.prerelease
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_TEST_API_TOKEN }}
+          repository_url: https://test.pypi.org/legacy/
+
+      - name: "Upload to PyPI"
+        uses: pypa/gh-action-pypi-publish@release/v1
+        if: "!github.event.release.prerelease"
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index fab6c2b..643f1f8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,6 +5,7 @@ repos:
       - id: flake8
         types:
           - python
+        args: ["--max-line-length=88"]
       - id: trailing-whitespace
      - id: end-of-file-fixer
       - id: check-yaml
diff --git a/README.md b/README.md
index a029218..c26cab5 100644
--- a/README.md
+++ b/README.md
@@ -1,51 +1,54 @@
-# Feed Checker
+# GTFS Aggregator Checker
 
 This repo is to verify that a given list of feeds is listed in feed
 aggregators. Currently it checks transit.land and transitfeeds.com to verify
 that feeds are listed in an aggregator.
+## Installation
 
-## Requirements
+```
+pip install gtfs-aggregator-checker
+```
 
-* `.env` - Acquire an [api key from transitland][1] and save it to a `.env` file
-  like `TRANSITLAND_API_KEY=SECRET`. Alternatively you can prefix commands with
-  the api key like `TRANSITLAND_API_KEY=SECRET python feed_checker.py [...]`.
+## Configure
 
-* `agencies.yml` - This file can have any structure as the feed checker just
-  looks for any urls (strings starting with `'http://'`), but the intended usage
-  is a [Cal-ITP agencies.yml file][2]. (to run the program without an
-  `agencies.yml` file, see the "Options" section below)
+The following env variables can be set in a `.env` file, set in the environment,
+or inline like `TRANSITLAND_API_KEY=SECRET python -m gtfs_aggregator_checker`.
 
-## Getting Started
+* `TRANSITLAND_API_KEY` An [api key from transitland][1].
 
-To install requirments and check urls run the following. The first time you run
-this it will take a while since the cache is empty.
+* `GTFS_CACHE_DIR` Folder to save cached files to. Defaults to
+`~/.cache/gtfs-aggregator-checker`
 
-``` bash
-pip install -r requirements.txt
-python feed_checker.py
-```
+## Getting Started
 
-The final line of stdout will tell how many urls were in `agencies.yml` and how
-many of those were matched in a feed. Above that it will list the domains for
-each url (in alphabetical order) as well group paths based on if the path was
-matched (in both `agencies.yml` and aggregator), missing (in `agencies.yml` but
-not aggregator) or unused (in aggregator but not in `agencies.yml`). An ideal
-outcome would mean the missing column is empty for all domains.
+## CLI Usage
 
+`python -m gtfs_aggregator_checker [YAML_FILE] [OPTIONS]`
 
-## CLI Usage
+`python -m gtfs_aggregator_checker` or `python -m gtfs_aggregator_checker
+/path/to/yml` will search a [Cal-ITP agencies.yml file][2] for any urls and see
+if they are present in any of the feed aggregators. Alternatively you can use
+`--csv-file` or `--url` instead of an `agencies.yml` file.
 
-`python feed_checker.py` or `python feed_checker.py /path/to/yml` will search a
-[Cal-ITP agencies.yml file][2] for any urls and see if they are present in any
-of the feed aggregators.
+The final line of stdout will tell how many urls were in `agencies.yml` and how
+many of those were matched in a feed.
 
 ### Options
 
-* `python feed_checker.py --help` print the help
-* `--csv-file agencies.csv` load a csv instead of a Cal-ITP agencies yaml file (one url per line)
-* `--url http://example.com` Check a single url instead of a Cal-ITP agencies yaml file
-* `--verbose` Print a table of all results (organized by domain)
+* `python -m gtfs_aggregator_checker --help` print the help
+* `--csv-file agencies.csv` load a csv instead of a Cal-ITP agencies yaml file
+  (one url per line)
+* `--url http://example.com` Check a single url instead of a Cal-ITP agencies
+  yaml file
 * `--output /path/to/file.json` Save the results as a json file
 
 [1]: https://www.transit.land/documentation/index#signing-up-for-an-api-key
 [2]: https://github.com/cal-itp/data-infra/blob/main/airflow/data/agencies.yml
+
+## Development
+
+Clone this repo and `pip install -e /path/to/feed-checker` to develop locally.
+
+By default, downloaded files (raw html files, api requests) will be saved to
+`~/.cache/gtfs-aggregator-checker`. This greatly reduces the time
+required to run the script. Delete this folder to reset the cache.
diff --git a/cache.py b/cache.py
deleted file mode 100644
index 867a130..0000000
--- a/cache.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import json
-import os
-import urllib.error
-import urllib.request
-
-from utils import url_split
-
-
-def mkdir(path, root="."):
-    """
-    Makes directory (and parents) if it does not exist
-    """
-    current = root
-    for folder in path.split("/"):
-        current = os.path.join(current, folder)
-        if not os.path.exists(current):
-            os.mkdir(current)
-    return current
-
-
-def get_cached(key, func, directory=".cache"):
-    directory = mkdir(directory)
-    path = os.path.join(directory, key)
-    if not os.path.exists(path):
-        content = func()
-        with open(path, "w") as f:
-            f.write(content)
-        print("wrote cached file", path)
-    with open(path, "r") as f:
-        return f.read()
-
-
-def curl_cached(url, key=None):
-    domain, path = url_split(url)
-    if key is None:
-        key = path.replace("/", "__")
-    if len(key) > 255:
-        key = key[:255]  # max wilename length is 255
-
-    def get():
-        req = urllib.request.Request(url)
-        r = urllib.request.urlopen(req)
-        return r.read().decode()
-
-    return get_cached(key, get, os.path.join(".cache", domain))
-
-
-class JsonCache(dict):
-    """
-    A dictionary that is stored to the file system.
-    """
-
-    def __init__(self, name, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._path = os.path.join(".cache", name + ".json")
-        if os.path.exists(self._path):
-            with open(self._path, "r") as f:
-                self.update(json.loads(f.read()))
-
-    def __setitem__(self, *args):
-        super().__setitem__(*args)
-        self._save()
-
-    def _save(self):
-        with open(self._path, "w") as f:
-            f.write(json.dumps(self, indent=2))
-
-
-class JsonCacheSetter(JsonCache):
-    """
-    A cached dictionary which takes a function instead of a dictionary value.
-    If the key is not set, the function is evaluated.
-    Useful when the takes a long time to compute (eg fetching a url).
-    """
-
-    def __setitem__(self, key, func):
-        if key not in self:
-            super().__setitem(key, func())
diff --git a/feed_checker.py b/gtfs_aggregator_checker/__init__.py
similarity index 84%
rename from feed_checker.py
rename to gtfs_aggregator_checker/__init__.py
index 5957c97..4cb1e0d 100644
--- a/feed_checker.py
+++ b/gtfs_aggregator_checker/__init__.py
@@ -1,15 +1,15 @@
 from collections import OrderedDict
 import json
-import typer
 import urllib.error
 import urllib.parse
 import urllib.request
 
 import yaml
 
-from transitland import get_transitland_urls
-from transitfeeds import get_transitfeeds_urls
+from .transitland import get_transitland_urls
+from .transitfeeds import get_transitfeeds_urls
 
+__version__ = "1.0.0"
 
 SECRET_PARAMS = ["api_key", "token", "apiKey", "key"]
 
@@ -26,13 +26,7 @@ def clean_url(url):
     return urllib.parse.urlunparse(url)
 
 
-def main(
-    yml_file=typer.Argument("agencies.yml", help="A yml file containing urls"),
-    csv_file=typer.Option(None, help="A csv file (one url per line)"),
-    url=typer.Option(None, help="URL to check instead of a file",),
-    output=typer.Option(None, help="Path to a file to save output to."),
-    verbose: bool = typer.Option(False, help="Print a result table to stdout"),
-):
+def check_feeds(yml_file=None, csv_file=None, url=None, output=None):
     results = {}
 
     if url:
@@ -96,7 +90,7 @@ def main(
         if "present" not in statuses:
             missing.append(url)
 
-    if missing and verbose:
+    if missing:
         print(f"Unable to find {len(missing)}/{len(results)} urls:")
         for url in missing:
             print(url)
@@ -108,7 +102,3 @@ def main(
         with open(output, "w") as f:
             f.write(json.dumps(results, indent=4))
         print(f"Results saved to {output}")
-
-
-if __name__ == "__main__":
-    typer.run(main)
diff --git a/gtfs_aggregator_checker/__main__.py b/gtfs_aggregator_checker/__main__.py
new file mode 100644
index 0000000..f4a8b4f
--- /dev/null
+++ b/gtfs_aggregator_checker/__main__.py
@@ -0,0 +1,15 @@
+import typer
+
+from . import check_feeds
+
+
+def main(
+    yml_file=typer.Argument("agencies.yml", help="A yml file containing urls"),
+    csv_file=typer.Option(None, help="A csv file (one url per line)"),
+    url=typer.Option(None, help="URL to check instead of a file",),
+    output=typer.Option(None, help="Path to a file to save output to."),
+):
+    check_feeds(yml_file=yml_file, csv_file=csv_file, url=url, output=output)
+
+
+typer.run(main)
diff --git a/gtfs_aggregator_checker/cache.py b/gtfs_aggregator_checker/cache.py
new file mode 100644
index 0000000..f24dd57
--- /dev/null
+++ b/gtfs_aggregator_checker/cache.py
@@ -0,0 +1,44 @@
+import os
+from pathlib import Path
+import urllib.error
+import urllib.request
+
+from .utils import url_split
+
+
+def get_cache_dir():
+    if "GTFS_CACHE_DIR" in os.environ:
+        path = Path(os.environ["GTFS_CACHE_DIR"])
+    else:
+        path = Path.home() / ".cache/gtfs-aggregator-checker"
+    path.mkdir(exist_ok=True, parents=True)
+    return path
+
+
+def get_cached(key, func, directory=None):
+    if not directory:
+        directory = get_cache_dir()
+    path = directory / key
+    if not path.exists():
+        content = func()
+        with open(path, "w") as f:
+            f.write(content)
+    with open(path, "r") as f:
+        return f.read()
+
+
+def curl_cached(url, key=None):
+    domain, path = url_split(url)
+    if key is None:
+        key = path.replace("/", "__")
+    if len(key) > 255:
+        key = key[:255]  # max filename length is 255
+
+    def get():
+        req = urllib.request.Request(url)
+        r = urllib.request.urlopen(req)
+        return r.read().decode()
+
+    path = get_cache_dir() / domain
+    path.mkdir(exist_ok=True, parents=True)
+    return get_cached(key, get, directory=path)
diff --git a/config.py b/gtfs_aggregator_checker/config.py
similarity index 100%
rename from config.py
rename to gtfs_aggregator_checker/config.py
diff --git a/transitfeeds.py b/gtfs_aggregator_checker/transitfeeds.py
similarity index 98%
rename from transitfeeds.py
rename to gtfs_aggregator_checker/transitfeeds.py
index 75798a2..199fda5 100644
--- a/transitfeeds.py
+++ b/gtfs_aggregator_checker/transitfeeds.py
@@ -1,7 +1,7 @@
 from bs4 import BeautifulSoup
 from urllib.error import HTTPError
 
-from cache import curl_cached
+from .cache import curl_cached
 
 LOCATION = "67-california-usa"
 ROOT = "https://transitfeeds.com"
diff --git a/transitland.py b/gtfs_aggregator_checker/transitland.py
similarity index 94%
rename from transitland.py
rename to gtfs_aggregator_checker/transitland.py
index 05baade..f8538a3 100644
--- a/transitland.py
+++ b/gtfs_aggregator_checker/transitland.py
@@ -1,7 +1,7 @@
 import json
 
-from config import env
-from cache import curl_cached
+from .config import env
+from .cache import curl_cached
 
 API_KEY = env["TRANSITLAND_API_KEY"]
 BASE_URL = f"https://transit.land/api/v2/rest/feeds?apikey={API_KEY}"
diff --git a/utils.py b/gtfs_aggregator_checker/utils.py
similarity index 100%
rename from utils.py
rename to gtfs_aggregator_checker/utils.py
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e554a58
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+import re
+from setuptools import setup, find_namespace_packages
+
+_version_re = re.compile(r"__version__\s+=\s+(.*)")
+
+with open("gtfs_aggregator_checker/__init__.py", "r") as f:
+    version = _version_re.search(f.read()).group(1).strip("'\"")
+
+with open("README.md", "r") as f:
+    long_description = f.read()
+
+setup(
+    name="gtfs_aggregator_checker",
+    version=version,
+    packages=find_namespace_packages(),
+    install_requires=[
+        "beautifulsoup4",
+        "python-dotenv",
+        "PyYAML",
+        "requests",
+        "typer",
+    ],
+    description="Tool for checking if transit urls are on aggregator websites",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author="",
+    author_email="",
+    url="https://github.com/cal-itp/gtfs-aggregator-checker",
+)