diff --git a/.gitignore b/.gitignore
index 65e9d1d..8ef116c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,11 +129,18 @@ dmypy.json
 .pyre/
 
 # Ignore
-.vscode/
+
 censo/bin/
 censo/include/
 censo/share/
 
 # Data
 notebooks/*.csv
-
+*.zip
+cache
+
+# Azure stuff
+.vscode
+__azurite*
+__blob*
+local.settings.json
\ No newline at end of file
diff --git a/utils/.dockerignore b/utils/.dockerignore
new file mode 100644
index 0000000..85377aa
--- /dev/null
+++ b/utils/.dockerignore
@@ -0,0 +1,3 @@
+__pycache*
+.*cache
+.venv
\ No newline at end of file
diff --git a/utils/.env.template b/utils/.env.template
new file mode 100644
index 0000000..03f43af
--- /dev/null
+++ b/utils/.env.template
@@ -0,0 +1,36 @@
+# Environment variables
+
+### IMPORTANT: when done editing this file, rename it to ".env"
+### (without the ".template" ending)
+
+## General settings
+
+### log verbosity level - choose between 'error', 'warning', 'info', 'debug'
+FETCHPORTALS_LOG_LEVEL="debug"
+
+### set the source of portal URLs and geo IDs - only 'census' is currently accepted
+FETCHPORTALS_SOURCE="census"
+
+### control whether to only ping the portals' status or to fetch their source code
+FETCHPORTALS_MODE="ping"
+
+### set the maximum number of tries and the time to wait for each request (seconds)
+FETCHPORTALS_MAX_RETRIES=3
+FETCHPORTALS_TIMEOUT=10.0
+
+### control the callback used to process and/or save the retrieved data
+FETCHPORTALS_CALLBACK="kaggle"
+
+### control what to do when the destination file already exists - must be one of
+### 'replace', 'append' or 'skip'
+FETCHPORTALS_EXISTING="replace"
+
+### set a local directory where retrieved files may persist
+FETCHPORTALS_LOCALDIR="./data"
+
+## Kaggle settings
+
+KAGGLE_USERNAME="exampleuser"
+KAGGLE_KEY="12345678abcdefgh"
+KAGGLE_DATASET="bcbernardo/censusqd2020"
+KAGGLE_FILE="portals-availability.csv"
\ No newline at end of file
diff --git a/utils/.funcignore b/utils/.funcignore
new file mode 100644
index 0000000..010071a
--- /dev/null
+++ b/utils/.funcignore
@@ -0,0 +1,5 @@
+.env
+.venv
+local.settings.json
+__azurite*
+__pycache__
\ No newline at end of file
diff --git a/utils/README.md b/utils/README.md
new file mode 100644
index 0000000..5bf9183
--- /dev/null
+++ b/utils/README.md
@@ -0,0 +1,134 @@
+# Querido Diário Census utilities
+
+This sub-repository contains routines and functions written by the community
+to process, analyze and store the results of the Querido Diário Census.
+
+Currently, the only utility available is the `fetch_portals` package, which
+contacts every web address registered in the Census to check which portals
+are online and/or to retrieve their source code.
+
+Contributions in the form of new packages and utilities for processing the
+Census data are welcome. See the project's
+[CONTRIBUTING.md](../CONTRIBUTING.md) for details on how to help with the
+Census' many tasks, as well as the
+[Adding a new utility](#adding-a-new-utility) section below for specific
+instructions on creating a new pre-processing routine.
+
+If you have questions or want an overview of the Census' next steps, feel
+free to visit the project's
+[issues](https://github.com/okfn-brasil/censo-querido-diario/issues) or reach
+out on [Discord](https://discord.gg/M6ep5VED).
+
+## Installation and usage
+
+### With Docker (recommended)
+
+The simplest way to run the utilities is with Docker. You need the Docker
+Community Edition installed; find the right version for your system
+[here][Docker CE].
+You will also need [git] installed to download the repository.
+
+To install the utilities, open a command-line terminal and run:
+
+```bash
+$ git clone https://github.com/okfn-brasil/censo-querido-diario.git
+$ cd censo-querido-diario/utils
+```
+
+In a file explorer, find the directory where you downloaded the repository
+and open the `censo-querido-diario/utils/.env.template` file. Adjust the
+settings in it according to the data you want to collect (especially the
+ones starting with `KAGGLE_*`, if you plan to export to Kaggle). Save the
+modified file and rename it to `.env` (dropping the `.template` suffix).
+
+To start checking the portals, go back to the terminal and run:
+
+```bash
+$ docker-compose up
+```
+
+[Docker CE]: https://hub.docker.com/search?offering=community&type=edition
+[git]: https://git-scm.com/
+
+### As a Python package
+
+The utilities in this sub-repository can also be installed as standalone
+Python packages. For that, you need a compatible Python version (3.7 or
+later) installed on your machine.
+
+To install from the repository, run in a command-line terminal:
+
+```bash
+$ git clone https://github.com/okfn-brasil/censo-querido-diario.git
+$ cd censo-querido-diario/utils
+$ python -m venv .venv
+$ source .venv/bin/activate  # in PowerShell: .venv\Scripts\Activate.ps1
+(.venv) $ python -m pip install .
+```
+
+For the installation to work and for the `fetch-portals` command to be
+usable from the command line, you must first export some environment
+variables that control how the program behaves.
+
+To do so, edit the `.env.template` file in the `censo-querido-diario/utils`
+directory, changing the settings as needed. **Important:** to run the
+current version of the portal fetcher you must, at a minimum, change the
+environment variables starting with `KAGGLE_*`. You need write permission on
+the dataset used to store the results.
+
+When you are done editing, save the `.env.template` file and rename it to
+just `.env`.
+
+With the utility installed as a package and the corresponding virtual
+environment activated, simply run the `fetch-portals` command. It will send
+requests to every official gazette portal mapped by the Census and save the
+results to the Kaggle dataset configured in the `.env` file.
+
+```bash
+(.venv) $ fetch-portals
+```
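The same entry point can also be invoked from Python, which is how the Azure Function added later in this changeset reuses it. A minimal sketch, assuming the `.env` file (or the equivalent environment variables) is already in place:

```python
from fetch_portals.main import main

# ping every portal mapped by the Census and print the results to stdout
# as JSON, instead of uploading them to Kaggle
main(mode="ping", callback=None)
```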
+## Adding a new utility
+
+To develop a Python package that consumes and processes Querido Diário
+Census data, [fork](https://github.com/okfn-brasil/censo-querido-diario/fork)
+the repository to your own account and add your scripts in a subdirectory of
+the `censo-querido-diario/utils/src` folder.
+
+For the directory and module names, use only lowercase letters and
+underscores (\_). Also place an empty `__init__.py` file in the new
+directory, and add the dependencies you use to the package list under the
+`install_requires` item in
+[`censo-querido-diario/utils/setup.cfg`](./setup.cfg).
+
+If you want the utility to be runnable through Docker, create a Dockerfile
+named after your utility (e.g. `fetch_portals.Dockerfile`) in
+`censo-querido-diario/utils`, containing the container build instructions
+(see the [Dockerfile reference]). Then add an entry to the
+`docker-compose.yml` file located in the same directory (see the [Docker
+Compose reference] for details).
+
+[Dockerfile reference]: https://docs.docker.com/engine/reference/builder/
+[Docker Compose reference]: https://docs.docker.com/compose/compose-file/
diff --git a/utils/azure_ping_portals/__init__.py b/utils/azure_ping_portals/__init__.py
new file mode 100644
index 0000000..4507381
--- /dev/null
+++ b/utils/azure_ping_portals/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2020 Open Knowledge Brasil
+
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+
+"""Periodically check the availability of Official Gazettes portals."""
+
+import logging
+
+import azure.functions as func
+
+from fetch_portals.main import main as fetch
+
+
+def main(timer: func.TimerRequest):
+    """Ping Querido Diario Census portals to check their availability."""
+    logging.info(f"Starting function (past due: {timer.past_due})")
+    fetch(mode="ping", existing="append", callback="kaggle")
+    logging.info("Finished checking portals from Census.")
diff --git a/utils/azure_ping_portals/function.json b/utils/azure_ping_portals/function.json
new file mode 100644
index 0000000..abb1e13
--- /dev/null
+++ b/utils/azure_ping_portals/function.json
@@ -0,0 +1,11 @@
+{
+  "scriptFile": "__init__.py",
+  "bindings": [
+    {
+      "name": "timer",
+      "type": "timerTrigger",
+      "direction": "in",
+      "schedule": "0 0 */3 * * *"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/utils/docker-compose.yml b/utils/docker-compose.yml
new file mode 100644
index 0000000..a96e8e0
--- /dev/null
+++ b/utils/docker-compose.yml
@@ -0,0 +1,11 @@
+version: '3.8'
+services:
+  fetch-portals:
+    build:
+      context: .
+      dockerfile: fetch_portals.Dockerfile
+    volumes:
+      - ./cache:/usr/src/data
+    env_file: .env
+    environment:
+      - FETCHPORTALS_LOCALDIR=/usr/src/data
\ No newline at end of file
diff --git a/utils/fetch_portals.Dockerfile b/utils/fetch_portals.Dockerfile
new file mode 100644
index 0000000..ecd8683
--- /dev/null
+++ b/utils/fetch_portals.Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.8.5-slim
+
+# Setup env
+ENV LANG C.UTF-8
+ENV LC_ALL C.UTF-8
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONFAULTHANDLER 1
+
+RUN mkdir /usr/src/app
+WORKDIR /usr/src/app
+
+COPY . .
+
+RUN python -m pip install .
+
+# Run the executable
+ENTRYPOINT ["fetch-portals"]
diff --git a/utils/host.json b/utils/host.json
new file mode 100644
index 0000000..f0abc88
--- /dev/null
+++ b/utils/host.json
@@ -0,0 +1,5 @@
+{
+  "functionTimeout": "00:10:00",
+  "version": "2.0",
+  "watchDirectories": [ "src" ]
+}
\ No newline at end of file
diff --git a/utils/pyproject.toml b/utils/pyproject.toml
new file mode 100644
index 0000000..b5a3c46
--- /dev/null
+++ b/utils/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"
\ No newline at end of file
diff --git a/utils/requirements-dev.txt b/utils/requirements-dev.txt
new file mode 100644
index 0000000..bf1a8b0
--- /dev/null
+++ b/utils/requirements-dev.txt
@@ -0,0 +1 @@
+.[dev]
\ No newline at end of file
diff --git a/utils/requirements.txt b/utils/requirements.txt
new file mode 100644
index 0000000..945c9b4
--- /dev/null
+++ b/utils/requirements.txt
@@ -0,0 +1 @@
+.
\ No newline at end of file
diff --git a/utils/setup.cfg b/utils/setup.cfg
new file mode 100644
index 0000000..db231a8
--- /dev/null
+++ b/utils/setup.cfg
@@ -0,0 +1,41 @@
+[metadata]
+name = censusqdutils
+version = 0.1.0
+url = https://github.com/okfn-brasil/censo-querido-diario
+author = Open Knowledge Brasil
+author_email = contato@serenata.ai
+classifiers =
+    Programming Language :: Python :: 3
+    License :: OSI Approved :: MIT License
+    Operating System :: OS Independent
+description = Utils for processing Querido Diario Census data.
+long_description = file: README.md
+long_description_content_type = text/markdown
+license = MIT
+
+[options]
+python_requires = >=3.7
+package_dir =
+    =src
+packages = find:
+install_requires =
+    aiohttp >= 3.7
+    kaggle >= 1.5
+    pandas >= 1.2
+    python-dotenv >= 0.15
+
+[options.extras_require]
+dev =
+    black == 20.8b1
+    flake8 >= 3.8.4
+    isort >= 5.7.0
+    mypy >= 0.800
+    pandas-stubs >= 1.0.4.4
+    pytest >= 6.2.2
+
+[options.packages.find]
+where = src
+
+[options.entry_points]
+console_scripts =
+    fetch-portals = fetch_portals.main:main
diff --git a/utils/src/fetch_portals/__init__.py b/utils/src/fetch_portals/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/src/fetch_portals/callbacks.py b/utils/src/fetch_portals/callbacks.py
new file mode 100644
index 0000000..2ca02c2
--- /dev/null
+++ b/utils/src/fetch_portals/callbacks.py
@@ -0,0 +1,196 @@
+# Copyright 2020 Open Knowledge Brasil
+
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+
+"""Callback functions to save data fetched from official gazettes portals.
+
+This module contains callback functions to process and/or save contents and
+monitor the service availability of portals containing the official gazettes
+for the 5,570 Brazilian municipalities.
+"""
+
+import json
+import logging
+import os
+from dataclasses import is_dataclass
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Any, Iterable, Literal, Optional, Union
+
+import pandas as pd
+
+from .models import ExistingBehavior, PathLike, PortalCapture
+
+
+def _autogen_version_notes(
+    dest_file: str, operation: Literal["create", "append", "update"]
+) -> str:
+    """Generate a default version note message.
+
+    Parameters:
+        dest_file: Destination file being written (or appended to).
+        operation: Whether the file is being ``create``'d, ``append``'ed to
+            or completely ``update``'d (replaced).
+    """
+
+    logging.warning(
+        "Version notes not provided; a default message will be generated."
+    )
+
+    if operation == "create":
+        version_notes = "Create " + dest_file
+    elif operation == "append":
+        version_notes = "Add records to " + dest_file
+    elif operation == "update":
+        version_notes = "Update " + dest_file
+
+    return version_notes
+
+
+def to_kaggle(
+    data: Union[Iterable[Union[dict, PortalCapture]], pd.DataFrame],
+    dataset: str,
+    dest_file: str,
+    existing_behavior: ExistingBehavior = "replace",
+    version_notes: Optional[str] = None,
+    local_dir: Optional[PathLike] = None,
+    delete_old_versions: bool = False,
+) -> "DatasetNewVersionResponse":  # type: ignore # noqa: F821
+    """Write data to a destination dataset file in Kaggle.
+
+    Parameters:
+        data: Data to be uploaded to the dataset. Can be a pandas
+            `DataFrame`_ instance, or an iterable of dataclass objects or
+            dictionaries.
+        dataset: Kaggle dataset id, in the format ``owner/dataset-name``.
+        dest_file: How to name the destination file in the dataset context.
+        existing_behavior: What to do if the file already exists in the
+            dataset.
+        version_notes: A message describing what changes will be made to the
+            dataset (optional; a default message is generated if none is
+            given).
+        local_dir: A local directory where the dataset files will persist
+            (optional; defaults to None).
+        delete_old_versions: Whether to delete previous versions of the
+            dataset that exist in Kaggle.
+
+    Returns:
+        A `DatasetNewVersionResponse`_ instance with the new dataset version.
+
+    .. _DataFrame: https://pandas.pydata.org/pandas-docs/stable/reference/
+        frame.html
+    .. _DatasetNewVersionResponse: https://github.com/Kaggle/kaggle-api/blob/
+        89eb72dd811492c500839f65332f669cd839d2bc/kaggle/models/
+        kaggle_models_extended.py#L150
+    """
+
+    from kaggle.api.kaggle_api_extended import KaggleApi  # type: ignore
+    from kaggle.models.kaggle_models_extended import Metadata  # type: ignore
+
+    # check that the data object's type is supported
+    if all(is_dataclass(record) for record in data) or all(
+        isinstance(record, dict) for record in data
+    ):
+        data = pd.DataFrame(data)
+    elif isinstance(data, pd.DataFrame):
+        pass
+    else:
+        raise TypeError(
+            "`data` parameter must be a list of dataclass instances or "
+            + f"dictionaries, or a pandas DataFrame, not {type(data).__name__}."
+        )
+
+    logging.info(
+        f"Uploading {len(data.index)} records to '{dest_file}' file in "
+        + f"Kaggle's '{dataset}' dataset."
+    )
+
+    # authenticate to the Kaggle API
+    logging.debug("Authenticating to Kaggle API...")
+    api = KaggleApi()
+    api.authenticate()
+
+    # make sure the dataset exists
+    logging.debug("Searching dataset...")
+    try:
+        dataset_owner, dataset_name = dataset.split("/")
+        matching_datasets = api.dataset_list(
+            search=dataset_name, user=dataset_owner
+        )
+        assert dataset in [dataset.ref for dataset in matching_datasets]
+    except AssertionError:
+        # TODO: create dataset if it doesn't exist
+        raise ValueError("The dataset does not exist.")
+
+    # use the provided local (persistent) directory, or create a temporary one
+    if not local_dir:
+        tmpdir = TemporaryDirectory()
+        data_dir: Any = tmpdir.name
+        logging.debug(f"Created temporary directory: {data_dir}")
+    else:
+        data_dir = local_dir
+
+    # get dataset metadata
+    metafile = Path(data_dir, "datapackage.json")
+    if not os.path.isfile(metafile):
+        metadata_response = api.process_response(
+            api.metadata_get_with_http_info(dataset_owner, dataset_name)
+        )
+        metadata = Metadata(metadata_response)
+        with open(metafile, "w") as f:
+            json.dump(metadata, f, indent=2, default=lambda o: o.__dict__)
+
+    # download existing files
+    # TODO: skip downloading unchanged files if they already exist locally
+    # TODO: start downloading asynchronously while data is gathered
+    api.dataset_download_files(dataset, path=data_dir, unzip=True)
+
+    # write the data file as CSV
+    operation: Literal["create", "append", "update"]
+    if os.path.isfile(Path(data_dir, dest_file)):
+        if existing_behavior == "replace":
+            operation = "update"
+            data.to_csv(Path(data_dir, dest_file), mode="w")
+        elif existing_behavior == "append":
+            operation = "append"
+            data.to_csv(Path(data_dir, dest_file), mode="a")
+        elif existing_behavior == "skip":
+            logging.error(f"File '{dest_file}' already exists. Skipped.")
+            raise FileExistsError(
+                "File already exists and behavior is set to `skip`."
+            )
+        else:
+            raise ValueError(
+                "`existing_behavior` argument must be one of "
+                + f"`replace`, `append` or `skip` ('{existing_behavior}' "
+                + "provided)."
+            )
+    else:
+        operation = "create"
+        data.to_csv(Path(data_dir, dest_file))
+
+    # # update metadata file
+    # try:
+    #     api.dataset_metadata_update(dataset, data_dir)
+    # except KeyError:
+    #     # BUG: KaggleApi's dataset_metadata_update() method references an
+    #     # inexistent key when checking for errors. Just ignore it.
+    #     pass
+
+    # create a version notes message, if the user hasn't provided one
+    if not version_notes:
+        version_notes = _autogen_version_notes(dest_file, operation)
+
+    # upload data to Kaggle
+    new_version = api.dataset_create_version(
+        data_dir, version_notes, delete_old_versions=delete_old_versions
+    )
+
+    # clear the temporary directory
+    if not local_dir:
+        tmpdir.cleanup()
+
+    return new_version
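For reference, a minimal sketch of how `to_kaggle` is meant to be called with plain dictionary records. The dataset id and file name below are hypothetical placeholders, and an existing Kaggle dataset you can write to (plus valid `KAGGLE_USERNAME`/`KAGGLE_KEY` credentials) is assumed:

```python
from fetch_portals.callbacks import to_kaggle

records = [
    {"ibge_code": "2600807", "status": 200, "message": "OK"},
    {"ibge_code": "4200754", "status": 200, "message": "OK"},
]

# appends the records to "pings.csv" in the (hypothetical) dataset
# "myuser/example" and publishes a new dataset version
to_kaggle(
    records,
    dataset="myuser/example",
    dest_file="pings.csv",
    existing_behavior="append",
)
```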
diff --git a/utils/src/fetch_portals/fetchers.py b/utils/src/fetch_portals/fetchers.py
new file mode 100644
index 0000000..5eb9621
--- /dev/null
+++ b/utils/src/fetch_portals/fetchers.py
@@ -0,0 +1,100 @@
+# Copyright 2020 Open Knowledge Brasil
+
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+
+"""Functions to fetch official gazette portals' statuses and contents.
+
+This module contains functions developed as a part of the `Censo Querido
+Diário`_ effort, in order to periodically fetch the contents and monitor the
+service availability of portals containing the official gazettes for the
+5,570 Brazilian municipalities.
+
+The Censo Querido Diário is a collaborative effort to push forward the
+disclosure of public information embodied in official publications.
+Contributions to this initiative are more than welcome. Check our
+`contribution guidelines`_ (in Portuguese) to learn the various ways you can
+support the project.
+
+.. _Censo Querido Diário:
+    https://censo.ok.org.br/sobre/
+
+.. _contribution guidelines:
+    https://github.com/okfn-brasil/censo-querido-diario/blob/main/CONTRIBUTING.MD
+"""
+
+import asyncio
+import logging
+from typing import cast, List
+
+from .models import AcceptedHttpMethod, FetchMode, PortalList
+
+
+async def _gather_responses(
+    portals: PortalList,
+    mode: FetchMode = "ping",
+    max_retries: int = 3,
+    timeout: float = 10.0,
+):
+    """Orchestrate asynchronous requests to official gazettes portals.
+
+    Parameters:
+        portals: A `PortalList` instance to be fetched.
+        mode: How to fetch the portals. ``mode="ping"`` fetches only the
+            portals' status codes and request metadata. ``mode="source"``
+            also captures the portals' source code.
+    """
+
+    logging.info("Preparing fetch tasks...")
+
+    portals = PortalList(portals)
+
+    if mode == "ping":
+        http_method = "HEAD"
+    elif mode == "source":
+        http_method = "GET"
+
+    task_list: List = list()
+
+    for subset in portals.by_domain():
+        task: asyncio.Task = asyncio.create_task(
+            subset.fetch_all(
+                method=cast(AcceptedHttpMethod, http_method),
+                max_retries=max_retries,
+                timeout=timeout,
+            )
+        )
+        task_list.append(task)
+
+    return await asyncio.gather(*task_list)
+
+
+def fetch_portals(
+    portals: PortalList,
+    mode: FetchMode = "ping",
+    max_retries: int = 3,
+    timeout: float = 10.0,
+):
+    """Orchestrate asynchronous requests to official gazettes portals.
+
+    Parameters:
+        portals: A `PortalList` instance to be fetched.
+        mode: How to fetch the portals. ``mode="ping"`` fetches only the
+            portals' status codes and request metadata. ``mode="source"``
+            also captures the portals' source code.
+    """
+
+    results = list()
+
+    task_list = asyncio.run(
+        _gather_responses(portals, mode, max_retries, timeout)
+    )
+
+    for task_results in task_list:
+        for result in task_results:
+            results.append(result)
+
+    return results
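`fetch_portals` is the synchronous wrapper most callers should use: it groups the portals by domain, fetches each group concurrently and flattens the results. A sketch with a single made-up portal (the URL is hypothetical):

```python
from yarl import URL

from fetch_portals.fetchers import fetch_portals
from fetch_portals.models import IbgeCode, Portal, PortalList

portals = PortalList(
    [Portal(ibge_code=IbgeCode(2600807), url=URL("http://example.com/gazette"))]
)

# one HEAD request per unique URL, with a single attempt and a short timeout
captures = fetch_portals(portals, mode="ping", max_retries=1, timeout=5.0)
print([capture.to_dict() for capture in captures])
```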
source.") + + # fetch them + captures: List[PortalCapture] = fetch_portals(portals=portals, mode=mode) + + # save captured data + if callback == "kaggle": # save to a Kaggle dataset file + try: + assert "KAGGLE_USERNAME" in os.environ + assert "KAGGLE_KEY" in os.environ + except AssertionError: + logging.error("Kaggle credentials not found in environment.") + raise RuntimeError + dest_dataset: str = os.environ["KAGGLE_DATASET"] + dest_file: str = os.environ["KAGGLE_FILE"] + to_kaggle( + captures, + dataset=dest_dataset, + dest_file=dest_file, + existing_behavior=existing, + local_dir=local_dir, + ) + + # print to stdout (default) + elif not callback: + results_json: str = json.dumps( + [capture.to_dict() for capture in captures], + indent=4, + sort_keys=True, + separators=(",", ": "), + ).replace("\\n", "\n") + sys.stdout.write(results_json) + + # unimplemented callback + else: + raise ValueError(f"'{callback}' is not a valid callback.") + + +if __name__ == "__main__": + main() diff --git a/utils/src/fetch_portals/models.py b/utils/src/fetch_portals/models.py new file mode 100644 index 0000000..296cf3a --- /dev/null +++ b/utils/src/fetch_portals/models.py @@ -0,0 +1,238 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Representations of concepts used by other utilities. + +This module contains reusable types and classes that model both portals where +Brazilian official gazettes are published and their attributes. +""" + +import itertools +import logging +import os +from asyncio.exceptions import TimeoutError +from collections import UserList +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from enum import Enum +from typing import Any, List, Literal, NewType, Optional, Set, Union + +import aiohttp +from aiohttp import ClientConnectorCertificateError, ClientError, ClientTimeout +from yarl import URL + +AcceptedCallback = Literal["kaggle"] +AcceptedSource = Literal["census"] +ExistingBehavior = Literal["append", "replace", "skip"] +FetchMode = Literal["ping", "source"] +AcceptedHttpMethod = Literal["GET", "HEAD"] +IbgeCode = NewType("IbgeCode", int) # TODO: make it a UserString +LogLevel = Literal["error", "warning", "info", "debug"] +PathLike = Union[str, bytes, "os.PathLike[Any]"] + + +class GovernmentBranch(Enum): + """An enumeration of government branches in Brazil.""" + + EXECUTIVE = 1 # only the Executive branch is currently supported + # LEGISLATIVE = 2 + # JUDICIAL = 3 + # ESSENTIAL_JUSTICE = 4 + + +class GovernmentLevel(Enum): + """An enumeration of government levels in Brazil.""" + + # FEDERAL = 1 + # STATE = 2 # includes Federal District + MUNICIPALITY = 3 # only Municipalities are currently supported + + +@dataclass +class Portal: + """Representation of a portal that publishes local-level official gazettes. 
+ """ + + ibge_code: IbgeCode + url: URL + branch: GovernmentBranch = GovernmentBranch.EXECUTIVE + level: GovernmentLevel = GovernmentLevel.MUNICIPALITY + + +@dataclass +class PortalCapture: + """Capture of an official gazette publication portal at a point in time.""" + + ibge_code: IbgeCode + request_time: datetime + waiting_time: timedelta + attempts: int + initial_url: URL + final_url: Optional[URL] + method: Literal["GET", "POST"] + ssl_valid: bool + status: int + message: str + level: GovernmentLevel = GovernmentLevel.MUNICIPALITY + branch: GovernmentBranch = GovernmentBranch.EXECUTIVE + + def to_dict(self): + """Converts a PortalCapture into a dictionary.""" + return { + "ibge_code": str(self.ibge_code), + "request_time": self.request_time.isoformat(), + "waiting_time": self.waiting_time.total_seconds(), + "attempts": self.attempts, + "initial_url": str(self.initial_url), + "final_url": str(self.final_url or ""), + "method": self.method, + "ssl_valid": int(self.ssl_valid), + "status": self.status, + "message": self.message, + "level": self.level, + "branch": self.branch, + } + + +class PortalList(UserList): + """A list of official portals.""" + + def by_domain(self) -> List["PortalList"]: + """Separate a list of portals by their domains. + + This function creates a list populated with sets of unique portals that + have all the same domain in their URLs. + + Parameters: + portals: An iterable of `Portal` instances + + Returns: + A list of `PortalList`s, one for each domain in the original + instance. + """ + + logging.debug("Separating portals according to their domains...") + + # collect all unique domains + domains = set(portal.url.host for portal in self.data) + + # iterate over domains and check which portals belong to them + separated = list() + for domain in domains: + portals_in_domain = PortalList( + portal for portal in self.data if portal.url.host == domain + ) + + separated.append(portals_in_domain) # add to separated list + + return separated + + async def fetch_all( + self, + method: Literal["GET", "HEAD"] = "HEAD", + timeout: float = 10.0, + max_retries: int = 3, + ) -> List[PortalCapture]: + + logging.info(f"Fetching {len(self.data)} portals ('{method}')...") + + # create an empty list of responses data and metadata + responses: List[dict] = list() + + # remove url duplicates + unique_urls: Set[URL] = set(portal.url for portal in self.data) + + client_timeout = ClientTimeout(total=timeout) + + async with aiohttp.ClientSession( + timeout=client_timeout, trust_env=True + ) as client: + + # iterate over portal URLs + for url in unique_urls: + + # configure request + ssl_valid: bool = True # start assuming so + + # try fetching page + attempt: int = 1 + while attempt <= max_retries: + try: + logging.info( + f"Sending request to <{url}> " + + f"({attempt}/{max_retries})..." 
+ ) + request_time: datetime = datetime.now(timezone.utc) + + async with client.request( + method, url=str(url), ssl=ssl_valid + ) as response: + time_elapsed: timedelta = ( + datetime.now(timezone.utc) - request_time + ) + final_url: Optional[URL] = response.url + response_status: int = response.status + if method == "GET": + # TODO: get charsets defined in tags + message: Any = str(await response.text()) + else: + message = response.reason + if not response.ok and attempt <= max_retries: + attempt += 1 + continue + + # Invalid SSL certificate; try again without verifying + except ClientConnectorCertificateError: + ssl_valid = False + if attempt < max_retries: + continue + + # some other error; try again + except ( + ClientError, + TimeoutError, + UnicodeDecodeError, + ) as err: + time_elapsed = ( + datetime.now(timezone.utc) - request_time + ) + message = repr(err) + final_url = None + response_status = 999 + if attempt < max_retries: + attempt += 1 + continue + + # record answer if it is OK or exceeded max tries + logging.info(f"<{url}>: {message} ({response_status})") + responses.append( + { + "initial_url": url, + "final_url": final_url, + "method": method, + "attempts": attempt, + "request_time": request_time, + "waiting_time": time_elapsed, + "ssl_valid": ssl_valid, + "status": response_status, + "message": message, + } + ) + break + + # associate unique urls to portals + captures: List[PortalCapture] = list() + for portal, capture in itertools.product(self.data, responses): + if portal.url == capture["initial_url"]: + captures.append( + PortalCapture( + ibge_code=portal.ibge_code, + level=portal.level.value, + branch=portal.branch.value, + **capture, + ) + ) + return captures diff --git a/utils/src/fetch_portals/sources.py b/utils/src/fetch_portals/sources.py new file mode 100644 index 0000000..ad211a0 --- /dev/null +++ b/utils/src/fetch_portals/sources.py @@ -0,0 +1,72 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Functions to interact with sources of official gazettes portals. + +This module contains functions developed as a part of the `Censo Querido +Diário`_ effort, in order to periodically fetch the contents and monitor the +service availability of portals containing the official gazettes for the 5.526 +brazilian municipalities. + +The Censo Querido Diário is a collaborative effort to push forward the +disclosure of public information embodied in official publications. +Contributions to this initiative are more than welcome. Check our +`contribution guidelines`_ (in portuguese) to learn the various ways you can +support the project. + +.. _Censo Querido Diário: + https://censo.ok.org.br/sobre/ + +.. _contribution guidelines: + https://github.com/okfn-brasil/censo-querido-diario/blob/main/CONTRIBUTING.MD +""" + +import logging +from typing import List + +import numpy as np +import pandas as pd +from yarl import URL + +from .models import IbgeCode, Portal, PortalList + + +def get_portals_from_census() -> PortalList: + """Get a list of official gazettes portals from Querido Diario Census data. + + Returns: + A list of `Portal`_ objects, containing the official Id for the city + and the portal URL. 
+ """ + + logging.info("Getting census data...") + + # download census full data + url: str = "https://censo.ok.org.br/get-data/" + df_census: pd.DataFrame = pd.read_csv(url) + + # filter and process relevant data (cities geocodes and portal URLs) + logging.debug("Processing portals information...") + portals: List[Portal] = ( + pd.wide_to_long( # type: ignore + df_census, "fonte", i="IBGE7", j="fonte_num", sep="_" + ) + .reset_index() + .dropna() + .apply( + # FIXME: avoid "None" strings in url column + lambda mun: Portal( + ibge_code=IbgeCode(mun["IBGE7"]), url=URL(mun["fonte"]) + ) + if mun.fonte != "None" + else np.nan, + axis=1, + ) + .dropna() + .to_list() + ) + + return PortalList(portals) diff --git a/utils/src/fetch_portals/test/__init__.py b/utils/src/fetch_portals/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/src/fetch_portals/test/conftest.py b/utils/src/fetch_portals/test/conftest.py new file mode 100644 index 0000000..76f2524 --- /dev/null +++ b/utils/src/fetch_portals/test/conftest.py @@ -0,0 +1,73 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Reusable Pytest fixtures for testing fetch_portals package.""" + +import json +import os +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Generator + +import pytest +from dotenv import load_dotenv + + +@pytest.fixture(scope="session") +def kaggle_api() -> "KaggleApi": # type: ignore # noqa: F821 + """Initialize and authenticate connection to Kaggle API.""" + # get set kaggle credentials as environment variables + script_path = Path(os.path.abspath(__file__)) + load_dotenv(os.path.join(script_path.parents[3], ".env")) + + # initialize api + from kaggle.api.kaggle_api_extended import KaggleApi # type: ignore + + api = KaggleApi() + api.authenticate() + + return api + + +@pytest.fixture(scope="session") +def mock_kaggle_dataset(kaggle_api) -> Generator[str, None, None]: + """Creates a Kaggle dataset for testing purposes. + + Note: + There is currently no method for programatically removing a Kaggle + dataset. Therefore, the user must manually delete the created dataset, + located at ``https://kaggle.com/myuser/example`` (where ``myuser`` is + the name of the Kaggle user provided through the ``KAGGLE_USER`` + environment variable). + + Yields: + ID of the created dataset, in the format ``myuser/example``. 
+ """ + kaggle_user = os.environ["KAGGLE_USERNAME"] + mock_data = """ + "fruit_name","fruit_color","fruit_number" + apple,red,6 + banana,yellow,12 + plum,purple,5 + """ + try: + tmpdir = TemporaryDirectory() + metadata = { + "title": "Example Dataset", + "id": kaggle_user + "/example", + "licenses": [{"name": "CC0-1.0"}], + } + with open( + os.path.join(tmpdir.name, "datapackage.json"), "w" + ) as meta_file: + meta_json = json.dumps(metadata) + meta_file.write(meta_json) + with open(os.path.join(tmpdir.name, "example_fruits.csv"), "w") as f: + f.write(mock_data) + kaggle_api.dataset_create_new(tmpdir.name) + yield metadata["id"] # type: ignore + finally: + tmpdir.cleanup() diff --git a/utils/src/fetch_portals/test/test_callbacks.py b/utils/src/fetch_portals/test/test_callbacks.py new file mode 100644 index 0000000..0a3c45d --- /dev/null +++ b/utils/src/fetch_portals/test/test_callbacks.py @@ -0,0 +1,86 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Tests callback functions. + +This module contains test cases for checking whether the callback functions +defined in the `callbacks.py`_ file are working as expected. +""" + +from datetime import datetime, timedelta, timezone + +import pytest + +from ..callbacks import _autogen_version_notes, to_kaggle +from ..models import PortalCapture + + +@pytest.fixture +def mock_captures(): + """Creates a list of fake records to process and/or save.""" + captures = [ + PortalCapture( + ibge_code=2600807, + initial_url=( + "http://netuse.inf.br/altinho_pm/portaltransparencia/" + + "index.php?link=6" + ), + final_url=( + "http://netuse.inf.br/altinho_pm/portaltransparencia/" + + "index.php?link=6" + ), + method="HEAD", + attempts=1, + request_time=datetime.now(timezone.utc), + waiting_time=timedelta(seconds=1.1), + ssl_valid=True, + status=200, + message="OK", + ), + PortalCapture( + ibge_code=4200754, + initial_url=( + "https://diariomunicipal.sc.gov.br/site/" + + "?r=site/index&q=cod_entidade%3A13" + ), + final_url=( + "https://diariomunicipal.sc.gov.br/site/" + + "?r=site/index&q=cod_entidade%3A13" + ), + method="HEAD", + attempts=1, + request_time=datetime.now(timezone.utc), + waiting_time=timedelta(seconds=0.92), + ssl_valid=True, + status=200, + message="OK", + ), + ] + + return captures + + +def test_autogen_version_notes(): + """Tests generating a default version note message.""" + expected_notes = { + "create": "Create example.csv", + "append": "Add records to example.csv", + "update": "Update example.csv", + } + for operation, expected_note in expected_notes.items(): + version_note = _autogen_version_notes("example.csv", operation) + assert version_note == expected_note + + +def test_to_kaggle(mock_captures, mock_kaggle_dataset, kaggle_api): + """Tests saving some records to Kaggle""" + to_kaggle( + mock_captures, + dataset=mock_kaggle_dataset, + dest_file="example.csv", + ) + file_list = kaggle_api.dataset_list_files(mock_kaggle_dataset) + assert "example.csv" in [str(datafile) for datafile in file_list.files] diff --git a/utils/src/fetch_portals/test/test_fetchers.py b/utils/src/fetch_portals/test/test_fetchers.py new file mode 100644 index 0000000..9defbc6 --- /dev/null +++ b/utils/src/fetch_portals/test/test_fetchers.py @@ -0,0 +1,127 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# 
diff --git a/utils/src/fetch_portals/test/test_fetchers.py b/utils/src/fetch_portals/test/test_fetchers.py
new file mode 100644
index 0000000..9defbc6
--- /dev/null
+++ b/utils/src/fetch_portals/test/test_fetchers.py
@@ -0,0 +1,127 @@
+# Copyright 2020 Open Knowledge Brasil
+
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+
+import asyncio
+from typing import List, Set
+
+import pytest
+from yarl import URL
+
+from ..fetchers import fetch_portals
+from ..models import IbgeCode, Portal, PortalCapture, PortalList
+
+
+@pytest.fixture
+def example_portals() -> PortalList:
+    """Create a `PortalList`_ instance with a few official gazette portals."""
+
+    # Altinho (PE)
+    portal1: Portal = Portal(
+        ibge_code=IbgeCode(2600807),
+        url=URL(
+            "http://netuse.inf.br/altinho_pm/portaltransparencia/index.php?"
+            + "link=6"
+        ),
+    )
+    portal2: Portal = Portal(
+        ibge_code=IbgeCode(2600807),
+        url=URL("http://www.diariomunicipal.com.br/amupe/"),
+    )
+
+    # Alto Bela Vista (SC)
+    portal3: Portal = Portal(
+        ibge_code=IbgeCode(4200754),
+        url=URL(
+            "https://diariomunicipal.sc.gov.br/site/"
+            + "?r=site/index&q=cod_entidade%3A13"
+        ),
+    )
+
+    # Anchieta (SC)
+    portal4: Portal = Portal(
+        ibge_code=IbgeCode(4200804),
+        url=URL(
+            "https://diariomunicipal.sc.gov.br/site/"
+            + "?r=site/index&q=cod_entidade%3A14"
+        ),
+    )
+
+    # Angelim (PE)
+    portal5: Portal = Portal(
+        ibge_code=IbgeCode(2601003),
+        url=URL("http://www.diariomunicipal.com.br/amupe/pesquisar"),
+    )
+    portal6 = Portal(
+        ibge_code=IbgeCode(2601003),
+        url=URL(
+            "http://174.142.65.52:16444/transparencia/angelim/prefeitura/"
+            + "legislacaomunicipal.faces"
+        ),
+    )
+    portal7 = Portal(
+        ibge_code=IbgeCode(2601003),
+        url=URL(
+            "http://174.142.65.52:16444/transparencia/angelim/prefeitura/"
+            + "outrosatos.faces"
+        ),
+    )
+
+    return PortalList(
+        [portal1, portal2, portal3, portal4, portal5, portal6, portal7]
+    )
+
+
+def test_split_by_domain(example_portals) -> None:
+    """Tests splitting a `PortalList`_ into instances with a unique domain
+    each.
+    """
+    splitted: List[PortalList] = example_portals.by_domain()
+    for subset in splitted:
+        domains: Set[str] = set(portal.url.host for portal in subset)
+        assert len(domains) == 1
+
+
+def test_head_subsets(example_portals) -> None:
+    """Tests pinging subsets of a `PortalList`_ with unique domains."""
+    subsets: List[PortalList] = example_portals.by_domain()
+    for subset in subsets:
+        subset = PortalList(subset)
+        captures: List[PortalCapture] = asyncio.run(
+            subset.fetch_all(method="HEAD", timeout=30)
+        )
+        assert len(captures) == len(subset)
+        for capture in captures:
+            assert isinstance(capture, PortalCapture)
+
+
+def test_get_subsets(example_portals) -> None:
+    """Tests capturing subsets of a `PortalList`_ with unique domains."""
+    subsets: List[PortalList] = example_portals.by_domain()
+    for subset in subsets:
+        subset = PortalList(subset)
+        captures: List[PortalCapture] = asyncio.run(
+            subset.fetch_all(method="GET", timeout=30)
+        )
+        assert len(captures) == len(subset)
+        for capture in captures:
+            assert isinstance(capture, PortalCapture)
+
+
+def test_orchestrate_pinging(example_portals) -> None:
+    """Tests asynchronously pinging multiple portals."""
+    captures: List[PortalCapture] = fetch_portals(example_portals, mode="ping")
+    assert len(captures) == len(example_portals)
+    for capture in captures:
+        assert isinstance(capture, PortalCapture)
+
+
+def test_orchestrate_sourcing(example_portals) -> None:
+    """Tests asynchronously getting the source code of multiple portals."""
+    captures: List[PortalCapture] = fetch_portals(
+        example_portals, mode="source"
+    )
+    assert len(captures) == len(example_portals)
+    for capture in captures:
+        assert isinstance(capture, PortalCapture)
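Note that these test modules hit live portals and, further down, the real Kaggle API, so they can be slow and may require credentials. A sketch for running just one module from the `utils` directory, assuming the dev extras are installed:

```python
# run only the fetcher tests; requires network access but no Kaggle account
import pytest

pytest.main(["src/fetch_portals/test/test_fetchers.py", "-q"])
```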
diff --git a/utils/src/fetch_portals/test/test_main.py b/utils/src/fetch_portals/test/test_main.py
new file mode 100644
index 0000000..b3467d4
--- /dev/null
+++ b/utils/src/fetch_portals/test/test_main.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Open Knowledge Brasil
+
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+
+"""Tests for the main entry point.
+
+This module contains test cases for checking whether the program entry point
+defined in the `main.py`_ file is working as expected.
+"""
+
+import os
+from tempfile import TemporaryDirectory
+
+import pandas as pd
+
+from ..main import main
+
+
+def test_ping(capsys):
+    """Tests pinging all portals in the Querido Diario Census."""
+    main(mode="ping", callback=None)
+    out, err = capsys.readouterr()
+    assert '"ibge_code": "2600807"' in out
+
+
+def test_source(capsys):
+    """Tests getting source codes for all portals in the Querido Diario
+    Census."""
+    main(mode="source", callback=None)
+    out, err = capsys.readouterr()
+    assert '"ibge_code": "2600807"' in out
+    # the captured source code should contain HTML markup
+    assert "<html" in out
+
+
+def test_ping_to_kaggle(mock_kaggle_dataset, kaggle_api):
+    """Tests saving pings to all portals in the QD Census to Kaggle."""
+    # copy the original Kaggle dataset config (it should not be modified)
+    previous_kaggle_dataset = os.getenv("KAGGLE_DATASET")
+    previous_kaggle_file = os.getenv("KAGGLE_FILE")
+
+    # upload data to the mock dataset
+    try:
+        os.environ["KAGGLE_DATASET"] = mock_kaggle_dataset
+        os.environ["KAGGLE_FILE"] = "test-ping.csv"
+        main(mode="ping", callback="kaggle", existing="append")
+        with TemporaryDirectory() as tmpdir:
+            kaggle_api.dataset_download_file(
+                mock_kaggle_dataset, os.environ["KAGGLE_FILE"], tmpdir
+            )
+            df = pd.read_csv(os.path.join(tmpdir, os.environ["KAGGLE_FILE"]))
+        for col in [
+            "ibge_code",
+            "request_time",
+            "waiting_time",
+            "attempts",
+            "initial_url",
+            "final_url",
+            "method",
+            "ssl_valid",
+            "status",
+            "message",
+            "level",
+            "branch",
+        ]:
+            assert col in df.columns
+        assert len(df.index) > 3
+        assert "200" in df["status"].unique()
+        assert "OK" in df["message"].unique()
+
+    # reset the Kaggle dataset config to the original one
+    finally:
+        if previous_kaggle_dataset:
+            os.environ["KAGGLE_DATASET"] = previous_kaggle_dataset
+        else:
+            del os.environ["KAGGLE_DATASET"]
+        if previous_kaggle_file:
+            os.environ["KAGGLE_FILE"] = previous_kaggle_file
+        else:
+            del os.environ["KAGGLE_FILE"]
+
+
+def test_source_to_kaggle(mock_kaggle_dataset, kaggle_api):
+    """Tests saving source codes for all portals in the QD Census to Kaggle."""
+    # copy the original Kaggle dataset config (it should not be modified)
+    previous_kaggle_dataset = os.getenv("KAGGLE_DATASET")
+    previous_kaggle_file = os.getenv("KAGGLE_FILE")
+
+    # upload data to the mock dataset
+    try:
+        os.environ["KAGGLE_DATASET"] = mock_kaggle_dataset
+        os.environ["KAGGLE_FILE"] = "test-source.csv"
+        main(mode="source", callback="kaggle")
+        with TemporaryDirectory() as tmpdir:
+            kaggle_api.dataset_download_file(
+                mock_kaggle_dataset, os.environ["KAGGLE_FILE"], tmpdir
+            )
+            try:
+                df = pd.read_csv(
+                    os.path.join(tmpdir, os.environ["KAGGLE_FILE"])
+                )
+            except FileNotFoundError:
+                df = pd.read_csv(
+                    os.path.join(tmpdir, os.environ["KAGGLE_FILE"] + ".zip")
+                )
+        for col in [
+            "ibge_code",
+            "request_time",
+            "waiting_time",
+            "attempts",
+            "initial_url",
+            "final_url",
+            "method",
+            "ssl_valid",
+            "status",
+            "message",
+            "level",
+            "branch",
+        ]:
+            assert col in df.columns
+        assert len(df.index) > 3
+        # the captured source code should contain HTML markup
+        assert df["message"].apply(lambda msg: "<html" in str(msg)).any()
+    # reset the Kaggle dataset config to the original one
+    finally:
+        if previous_kaggle_dataset:
+            os.environ["KAGGLE_DATASET"] = previous_kaggle_dataset
+        else:
+            del os.environ["KAGGLE_DATASET"]
+        if previous_kaggle_file:
+            os.environ["KAGGLE_FILE"] = previous_kaggle_file
+        else:
+            del os.environ["KAGGLE_FILE"]
diff --git a/utils/src/fetch_portals/test/test_sources.py b/utils/src/fetch_portals/test/test_sources.py
new file mode 100644
index 0000000..1743b7b
--- /dev/null
+++ b/utils/src/fetch_portals/test/test_sources.py
@@ -0,0 +1,20 @@
+# Copyright 2020 Open Knowledge Brasil
+
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+
+
+from ..models import Portal, PortalList
+from ..sources import get_portals_from_census
+
+
+def test_get_portals_from_census() -> None:
+    """Test getting a list of official gazettes portals from the QD census."""
+    portals = get_portals_from_census()
+    assert len(portals) >= 326  # there are at least 326 mapped portals
+    assert isinstance(portals, PortalList)
+    for portal in portals:
+        assert isinstance(portal, Portal)
+        # assert len(portal.ibge_code) == 7
+        assert len(str(portal.url.host or "")) > 5