From d5ca708a117f659065e5fded3e952833d0aabf41 Mon Sep 17 00:00:00 2001 From: bcbernardo Date: Mon, 25 Jan 2021 07:58:43 -0300 Subject: [PATCH 01/19] Add utils skeleton --- utils/.env.template | 23 +++++++++++++++++++++++ utils/requirements-dev.txt | 9 +++++++++ utils/requirements.txt | 4 ++++ utils/src/__init__.py | 0 utils/src/fetch_portals/__init__.py | 0 utils/src/fetch_portals/test/__init__.py | 0 6 files changed, 36 insertions(+) create mode 100644 utils/.env.template create mode 100644 utils/requirements-dev.txt create mode 100644 utils/requirements.txt create mode 100644 utils/src/__init__.py create mode 100644 utils/src/fetch_portals/__init__.py create mode 100644 utils/src/fetch_portals/test/__init__.py diff --git a/utils/.env.template b/utils/.env.template new file mode 100644 index 0000000..568bdbc --- /dev/null +++ b/utils/.env.template @@ -0,0 +1,23 @@ +# Environment variables + +### IMPORTANT: when done editing this file, rename it to ".env" +### (without the ".template" ending) + +## General settings + +### log verbosity level - choose between 'error', 'warn', 'info', 'debug' +PORTALFETCH_LOG_LEVEL = "debug" + +### control whether to only ping portal status, or to fetch its source code +PORTALFETCH_MODE = "ping" + +### control callback to process and/or save the retrived data +PORTALFETCH_CALLBACK = "kaggle" + + +## Kaggle settings + +KAGGLE_USER = "exampleuser" +KAGGLE_KEY = "12345678abcdefgh" +KAGGLE_DATASET = "bcbernardo/censusqd2020" +KAGGLE_FILE = "portals-availability.csv" \ No newline at end of file diff --git a/utils/requirements-dev.txt b/utils/requirements-dev.txt new file mode 100644 index 0000000..738944e --- /dev/null +++ b/utils/requirements-dev.txt @@ -0,0 +1,9 @@ +kaggle +pandas +python-dotenv +requests +mypy +pytest +flake8 +isort +black \ No newline at end of file diff --git a/utils/requirements.txt b/utils/requirements.txt new file mode 100644 index 0000000..822b9bf --- /dev/null +++ b/utils/requirements.txt @@ -0,0 +1,4 @@ +kaggle 
+pandas +python-dotenv +requests \ No newline at end of file diff --git a/utils/src/__init__.py b/utils/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/src/fetch_portals/__init__.py b/utils/src/fetch_portals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/src/fetch_portals/test/__init__.py b/utils/src/fetch_portals/test/__init__.py new file mode 100644 index 0000000..e69de29 From 66c54d863ed6d8716283a84269711bb6387bfa83 Mon Sep 17 00:00:00 2001 From: bcbernardo Date: Mon, 25 Jan 2021 07:59:52 -0300 Subject: [PATCH 02/19] Get portals using Querido Diario Census as source --- utils/src/fetch_portals/sources.py | 69 ++++++++++++++++++++ utils/src/fetch_portals/test/test_sources.py | 21 ++++++ 2 files changed, 90 insertions(+) create mode 100644 utils/src/fetch_portals/sources.py create mode 100644 utils/src/fetch_portals/test/test_sources.py diff --git a/utils/src/fetch_portals/sources.py b/utils/src/fetch_portals/sources.py new file mode 100644 index 0000000..621755d --- /dev/null +++ b/utils/src/fetch_portals/sources.py @@ -0,0 +1,69 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Functions to interact with sources of official gazettes portals. + +This module contains functions developed as a part of the `Censo Querido +Diário`_ effort, in order to periodically fetch the contents and monitor the +service availability of portals containing the official gazettes for the 5.526 +brazilian municipalities. + +The Censo Querido Diário is a collaborative effort to push forward the +disclosure of public information embodied in official publications. +Contributions to this initiative are more than welcome. Check our +`contribution guidelines`_ (in portuguese) to learn the various ways you can +support the project. + +.. 
_Censo Querido Diário: + https://censo.ok.org.br/sobre/ + +.. _contribution guidelines: + https://github.com/okfn-brasil/censo-querido-diario/blob/main/CONTRIBUTING.MD +""" + +import logging +from typing import List + +import numpy as np +import pandas as pd + +from ..models import HttpUrl, IbgeCode, Portal, PortalList + + +def get_portals_from_census() -> PortalList: + """Get a list of official gazettes portals from Querido Diario Census data. + + Returns: + A list of `Portal`_ objects, containing the official Id for the city + and the portal URL. + """ + + logging.info("Getting census data...") + + # download census full data + url: str = "https://censo.ok.org.br/get-data/" + df_census: pd.DataFrame = pd.read_csv(url) + + # filter and process relevant data (cities geocodes and portal URLs) + logging.debug("Processing portals information...") + portals: List[Portal] = ( + pd.wide_to_long(df_census, "fonte", i="IBGE7", j="fonte_num", sep="_") + .reset_index() + .dropna() + .apply( + lambda mun: Portal( + ibge_code=IbgeCode(mun.IBGE7), url=HttpUrl(mun.fonte) + # FIXME: avoid "None" strings in url column + ) if mun.fonte != "None" else np.nan, + axis=1, + ) + .dropna() + .to_list() + ) + + portals = PortalList(portals) + + return portals diff --git a/utils/src/fetch_portals/test/test_sources.py b/utils/src/fetch_portals/test/test_sources.py new file mode 100644 index 0000000..86cab24 --- /dev/null +++ b/utils/src/fetch_portals/test/test_sources.py @@ -0,0 +1,21 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + + +from ..sources import get_portals_from_census +from ...models import Portal, PortalList + + +def test_get_portals_from_census() -> None: + """Test getting a list of official gazettes portals from the QD census. 
+ """ + portals = get_portals_from_census() + assert len(portals) >= 326 # there are at least 324 mapped portals + assert isinstance(portals, PortalList) + for portal in portals: + assert isinstance(portal, Portal) + # assert len(portal.ibge_code) == 7 + assert len(portal.url.domain) > 5 From ea7ad3c5ca5a50ce5b45b0eeb7acca288afff8b9 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Fri, 5 Feb 2021 17:16:34 -0300 Subject: [PATCH 03/19] Add portal fetchers --- utils/requirements-dev.txt | 17 ++- utils/requirements.txt | 7 +- utils/src/fetch_portals/fetchers.py | 97 ++++++++++++ utils/src/fetch_portals/sources.py | 139 +++++++++--------- utils/src/fetch_portals/test/test_fetchers.py | 132 +++++++++++++++++ utils/src/fetch_portals/test/test_sources.py | 42 +++--- 6 files changed, 333 insertions(+), 101 deletions(-) create mode 100644 utils/src/fetch_portals/fetchers.py create mode 100644 utils/src/fetch_portals/test/test_fetchers.py diff --git a/utils/requirements-dev.txt b/utils/requirements-dev.txt index 738944e..2a260e7 100644 --- a/utils/requirements-dev.txt +++ b/utils/requirements-dev.txt @@ -1,9 +1,10 @@ -kaggle -pandas -python-dotenv -requests -mypy -pytest -flake8 -isort +aiohttp +kaggle +pandas +python-dotenv +requests +mypy +pytest +flake8 +isort black \ No newline at end of file diff --git a/utils/requirements.txt b/utils/requirements.txt index 822b9bf..a88da40 100644 --- a/utils/requirements.txt +++ b/utils/requirements.txt @@ -1,4 +1,5 @@ -kaggle -pandas -python-dotenv +aiohttp +kaggle +pandas +python-dotenv requests \ No newline at end of file diff --git a/utils/src/fetch_portals/fetchers.py b/utils/src/fetch_portals/fetchers.py new file mode 100644 index 0000000..f74ec37 --- /dev/null +++ b/utils/src/fetch_portals/fetchers.py @@ -0,0 +1,97 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. 
+ +"""Functions to fetch official gazette portals statuses and contents. + +This module contains functions developed as a part of the `Censo Querido +Diário`_ effort, in order to periodically fetch the contents and monitor the +service availability of portals containing the official gazettes for the 5.526 +brazilian municipalities. + +The Censo Querido Diário is a collaborative effort to push forward the +disclosure of public information embodied in official publications. +Contributions to this initiative are more than welcome. Check our +`contribution guidelines`_ (in portuguese) to learn the various ways you can +support the project. + +.. _Censo Querido Diário: + https://censo.ok.org.br/sobre/ + +.. _contribution guidelines: + https://github.com/okfn-brasil/censo-querido-diario/blob/main/CONTRIBUTING.MD +""" + +import asyncio +import logging +# from itertools import chain +from typing import List + +from ..models import FetchMode, PortalList + + +async def _gather_responses( + portals: PortalList, + mode: FetchMode = "ping", + max_retries: int = 3, + timeout: float = 10.0, +): + """Orchestrates asynchronous requests to official gazettes portals. + + Parameters: + portals: A `PortalList` instance to be fetched. + mode: How to fetch the portals. ``mode="ping"`` fetches only the + portals' status codes and request metadate. ``mode="capture"`` also + captures the portals' source code. 
+ """ + + logging.info("Preparing fetch tasks...") + + portals = PortalList(portals) + + http_method: str + if mode == "ping": + http_method = "HEAD" + elif mode == "source": + http_method = "GET" + + task_list: List = list() + + for subset in portals.by_domain(): + task: asyncio.Task = asyncio.create_task(subset.fetch_all( + method=http_method, + max_retries=max_retries, + timeout=timeout, + )) + task_list.append(task) + + return await asyncio.gather(*task_list) + + +def fetch_portals( + portals: PortalList, + mode: FetchMode = "ping", + max_retries: int = 3, + timeout: float = 10.0, +): + """Orchestrates asynchronous requests to official gazettes portals. + + Parameters: + portals: A `PortalList` instance to be fetched. + mode: How to fetch the portals. ``mode="ping"`` fetches only the + portals' status codes and request metadate. ``mode="capture"`` also + captures the portals' source code. + """ + + results = list() + + task_list = asyncio.run( + _gather_responses(portals, mode, max_retries, timeout)) + + for task_results in task_list: + for result in task_results: + results.append(result) + + return results diff --git a/utils/src/fetch_portals/sources.py b/utils/src/fetch_portals/sources.py index 621755d..bd7c97c 100644 --- a/utils/src/fetch_portals/sources.py +++ b/utils/src/fetch_portals/sources.py @@ -1,69 +1,70 @@ -# Copyright 2020 Open Knowledge Brasil - -# Use of this source code is governed by an MIT-style -# license that can be found in the LICENSE file or at -# https://opensource.org/licenses/MIT. - -"""Functions to interact with sources of official gazettes portals. - -This module contains functions developed as a part of the `Censo Querido -Diário`_ effort, in order to periodically fetch the contents and monitor the -service availability of portals containing the official gazettes for the 5.526 -brazilian municipalities. 
- -The Censo Querido Diário is a collaborative effort to push forward the -disclosure of public information embodied in official publications. -Contributions to this initiative are more than welcome. Check our -`contribution guidelines`_ (in portuguese) to learn the various ways you can -support the project. - -.. _Censo Querido Diário: - https://censo.ok.org.br/sobre/ - -.. _contribution guidelines: - https://github.com/okfn-brasil/censo-querido-diario/blob/main/CONTRIBUTING.MD -""" - -import logging -from typing import List - -import numpy as np -import pandas as pd - -from ..models import HttpUrl, IbgeCode, Portal, PortalList - - -def get_portals_from_census() -> PortalList: - """Get a list of official gazettes portals from Querido Diario Census data. - - Returns: - A list of `Portal`_ objects, containing the official Id for the city - and the portal URL. - """ - - logging.info("Getting census data...") - - # download census full data - url: str = "https://censo.ok.org.br/get-data/" - df_census: pd.DataFrame = pd.read_csv(url) - - # filter and process relevant data (cities geocodes and portal URLs) - logging.debug("Processing portals information...") - portals: List[Portal] = ( - pd.wide_to_long(df_census, "fonte", i="IBGE7", j="fonte_num", sep="_") - .reset_index() - .dropna() - .apply( - lambda mun: Portal( - ibge_code=IbgeCode(mun.IBGE7), url=HttpUrl(mun.fonte) - # FIXME: avoid "None" strings in url column - ) if mun.fonte != "None" else np.nan, - axis=1, - ) - .dropna() - .to_list() - ) - - portals = PortalList(portals) - - return portals +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Functions to interact with sources of official gazettes portals. 
+ +This module contains functions developed as a part of the `Censo Querido +Diário`_ effort, in order to periodically fetch the contents and monitor the +service availability of portals containing the official gazettes for the 5.526 +brazilian municipalities. + +The Censo Querido Diário is a collaborative effort to push forward the +disclosure of public information embodied in official publications. +Contributions to this initiative are more than welcome. Check our +`contribution guidelines`_ (in portuguese) to learn the various ways you can +support the project. + +.. _Censo Querido Diário: + https://censo.ok.org.br/sobre/ + +.. _contribution guidelines: + https://github.com/okfn-brasil/censo-querido-diario/blob/main/CONTRIBUTING.MD +""" + +import logging +from typing import List + +import numpy as np +import pandas as pd +from yarl import URL + +from ..models import IbgeCode, Portal, PortalList + + +def get_portals_from_census() -> PortalList: + """Get a list of official gazettes portals from Querido Diario Census data. + + Returns: + A list of `Portal`_ objects, containing the official Id for the city + and the portal URL. 
+ """ + + logging.info("Getting census data...") + + # download census full data + url: str = "https://censo.ok.org.br/get-data/" + df_census: pd.DataFrame = pd.read_csv(url) + + # filter and process relevant data (cities geocodes and portal URLs) + logging.debug("Processing portals information...") + portals: List[Portal] = ( + pd.wide_to_long(df_census, "fonte", i="IBGE7", j="fonte_num", sep="_") + .reset_index() + .dropna() + .apply( + # FIXME: avoid "None" strings in url column + lambda mun: Portal( + ibge_code=IbgeCode(mun["IBGE7"]), url=URL(mun["fonte"]) + ) if mun.fonte != "None" else np.nan, + axis=1, + ) + .dropna() + .to_list() + ) + + portals = PortalList(portals) + + return portals diff --git a/utils/src/fetch_portals/test/test_fetchers.py b/utils/src/fetch_portals/test/test_fetchers.py new file mode 100644 index 0000000..31ba45d --- /dev/null +++ b/utils/src/fetch_portals/test/test_fetchers.py @@ -0,0 +1,132 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +import asyncio +from typing import List, Set + +import pytest +from httpx import URL + +from ...models import Portal, PortalCapture, PortalList +from ..fetchers import fetch_portals + + +@pytest.fixture +def example_portals() -> PortalList: + """Create a `PortalList`_ instance with a few official gazette portals. + """ + + # Altinho (PE) + portal1: Portal = Portal( + ibge_code=2600807, + url=URL( + "http://netuse.inf.br/altinho_pm/portaltransparencia/index.php?" 
+ + "link=6" + ) + ) + portal2: Portal = Portal( + ibge_code=2600807, + url=URL("http://www.diariomunicipal.com.br/amupe/") + ) + + # Alto Bela Vista (SC) + portal3: Portal = Portal( + ibge_code=4200754, + url=URL( + "https://diariomunicipal.sc.gov.br/site/" + + "?r=site/index&q=cod_entidade%3A13" + ) + ) + + # Anchieta (SC) + portal4: Portal = Portal( + ibge_code=4200804, + url=URL( + "https://diariomunicipal.sc.gov.br/site/" + + "?r=site/index&q=cod_entidade%3A14" + ) + ) + + # Angelim (PE) + portal5: Portal = Portal( + ibge_code=2601003, + url=URL("http://www.diariomunicipal.com.br/amupe/pesquisar") + ) + portal6 = Portal( + ibge_code=2601003, + url=URL( + "http://174.142.65.52:16444/transparencia/angelim/prefeitura/" + + "legislacaomunicipal.faces" + ) + ) + portal7 = Portal( + ibge_code=2601003, + url=URL( + "http://174.142.65.52:16444/transparencia/angelim/prefeitura/" + + "outrosatos.faces" + ) + ) + + return PortalList( + [portal1, portal2, portal3, portal4, portal5, portal6, portal7] + ) + + +def test_split_by_domain(example_portals) -> None: + """Tests spliting `PortalList`_s into instances with an unique domain each. + """ + splitted: PortalList = example_portals.by_domain() + for subset in splitted: + domains: Set[str] = set(portal.url.host for portal in subset) + assert len(domains) == 1 + + +def test_head_subsets(example_portals) -> None: + """Tests pinging subsets of a `PortalList`_ with unique domains. + """ + subsets: PortalList = example_portals.by_domain() + for subset in subsets: + subset = PortalList(subset) + captures: List[PortalCapture] = asyncio.run( + subset.fetch_all(method="HEAD", timeout=30) + ) + assert len(captures) == len(subset) + for capture in captures: + assert isinstance(capture, PortalCapture) + + +def test_get_subsets(example_portals) -> None: + """Tests capturing subsets of a `PortalList`_ with unique domains. 
+ """ + subsets: PortalList = example_portals.by_domain() + for subset in subsets: + subset = PortalList(subset) + captures: List[PortalCapture] = asyncio.run( + subset.fetch_all(method="GET", timeout=30) + ) + assert len(captures) == len(subset) + for capture in captures: + assert isinstance(capture, PortalCapture) + + +def test_orchestrate_pinging(example_portals) -> None: + """Tests asynchronously pinging multiple portals.""" + captures: List[PortalCapture] = fetch_portals( + example_portals, mode="ping" + ) + assert len(captures) == len(example_portals) + for capture in captures: + assert isinstance(capture, PortalCapture) + + +def test_orchestrate_sourcing(example_portals) -> None: + """Tests asynchronously getting source code for multiple portals.""" + captures: List[PortalCapture] = fetch_portals( + example_portals, mode="source" + ) + assert len(captures) == len(example_portals) + for capture in captures: + assert isinstance(capture, PortalCapture) diff --git a/utils/src/fetch_portals/test/test_sources.py b/utils/src/fetch_portals/test/test_sources.py index 86cab24..a9c8776 100644 --- a/utils/src/fetch_portals/test/test_sources.py +++ b/utils/src/fetch_portals/test/test_sources.py @@ -1,21 +1,21 @@ -# Copyright 2020 Open Knowledge Brasil - -# Use of this source code is governed by an MIT-style -# license that can be found in the LICENSE file or at -# https://opensource.org/licenses/MIT. - - -from ..sources import get_portals_from_census -from ...models import Portal, PortalList - - -def test_get_portals_from_census() -> None: - """Test getting a list of official gazettes portals from the QD census. 
- """ - portals = get_portals_from_census() - assert len(portals) >= 326 # there are at least 324 mapped portals - assert isinstance(portals, PortalList) - for portal in portals: - assert isinstance(portal, Portal) - # assert len(portal.ibge_code) == 7 - assert len(portal.url.domain) > 5 +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + + +from ..sources import get_portals_from_census +from ...models import Portal, PortalList + + +def test_get_portals_from_census() -> None: + """Test getting a list of official gazettes portals from the QD census. + """ + portals = get_portals_from_census() + assert len(portals) >= 326 # there are at least 324 mapped portals + assert isinstance(portals, PortalList) + for portal in portals: + assert isinstance(portal, Portal) + # assert len(portal.ibge_code) == 7 + assert len(portal.url.host) > 5 From 87226ecf0a39b57873fb41a844e80bd53bb24be2 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Sun, 7 Feb 2021 15:23:43 -0300 Subject: [PATCH 04/19] Add kaggle-related callbacks. 
--- utils/.env.template | 44 ++-- utils/requirements-dev.txt | 1 + utils/src/fetch_portals/callbacks.py | 195 ++++++++++++++++ .../src/fetch_portals/test/test_callbacks.py | 150 ++++++++++++ utils/src/models.py | 213 ++++++++++++++++++ 5 files changed, 581 insertions(+), 22 deletions(-) create mode 100644 utils/src/fetch_portals/callbacks.py create mode 100644 utils/src/fetch_portals/test/test_callbacks.py create mode 100644 utils/src/models.py diff --git a/utils/.env.template b/utils/.env.template index 568bdbc..9b11837 100644 --- a/utils/.env.template +++ b/utils/.env.template @@ -1,23 +1,23 @@ -# Environment variables - -### IMPORTANT: when done editing this file, rename it to ".env" -### (without the ".template" ending) - -## General settings - -### log verbosity level - choose between 'error', 'warn', 'info', 'debug' -PORTALFETCH_LOG_LEVEL = "debug" - -### control whether to only ping portal status, or to fetch its source code -PORTALFETCH_MODE = "ping" - -### control callback to process and/or save the retrived data -PORTALFETCH_CALLBACK = "kaggle" - - -## Kaggle settings - -KAGGLE_USER = "exampleuser" -KAGGLE_KEY = "12345678abcdefgh" -KAGGLE_DATASET = "bcbernardo/censusqd2020" +# Environment variables + +### IMPORTANT: when done editing this file, rename it to ".env" +### (without the ".template" ending) + +## General settings + +### log verbosity level - choose between 'error', 'warn', 'info', 'debug' +PORTALFETCH_LOG_LEVEL = "debug" + +### control whether to only ping portal status, or to fetch its source code +PORTALFETCH_MODE = "ping" + +### control callback to process and/or save the retrived data +PORTALFETCH_CALLBACK = "kaggle" + + +## Kaggle settings + +KAGGLE_USERNAME = "exampleuser" +KAGGLE_KEY = "12345678abcdefgh" +KAGGLE_DATASET = "bcbernardo/censusqd2020" KAGGLE_FILE = "portals-availability.csv" \ No newline at end of file diff --git a/utils/requirements-dev.txt b/utils/requirements-dev.txt index 2a260e7..cf9a93a 100644 --- 
a/utils/requirements-dev.txt +++ b/utils/requirements-dev.txt @@ -1,6 +1,7 @@ aiohttp kaggle pandas +pandas-stubs python-dotenv requests mypy diff --git a/utils/src/fetch_portals/callbacks.py b/utils/src/fetch_portals/callbacks.py new file mode 100644 index 0000000..d64a5fa --- /dev/null +++ b/utils/src/fetch_portals/callbacks.py @@ -0,0 +1,195 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Callback functions to save data fetched from official gazettes portals. + +This module contains callback functions to process and/or save contents and +monitor the service availability of portals containing the official gazettes +for the 5.570 brazilian municipalities. +""" + +import json +import logging +import os +from dataclasses import is_dataclass +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any, Iterable, Literal, Optional, Union + +import pandas as pd + +from ..models import PathLike, PortalCapture + + +def _autogen_version_notes( + dest_file: str, operation: Literal["create", "append", "update"] +) -> str: + """Generate a default version note message. + + Parameters: + dest_file: Destination file being written (or appended to). + operation: Whether the file is being ``create``'d, ``append`'ed to or + completely ``update``'d (replaced). + """ + + logging.warning( + "Version notes not provided; a default message will be generated." 
+ ) + + if operation == "create": + version_notes = "Create " + dest_file + elif operation == "append": + version_notes = "Add records to " + dest_file + elif operation == "update": + version_notes = "Update " + dest_file + + return version_notes + + +def to_kaggle( + data: Union[Iterable[Union[dict, PortalCapture]], pd.DataFrame], + dataset: str, + dest_file: str, + existing_behavior: Literal["replace", "append", "skip"] = "replace", + version_notes: Optional[str] = None, + local_dir: Optional[PathLike] = None, + delete_old_versions: bool = False, +) -> "DatasetNewVersionResponse": # type: ignore # noqa: F821 + """Write data to a destination dataset file in Kaggle. + + Parameters: + data: Data to be uploaded to the dataset. Can be a pandas `DataFrame`_ + instance, or an iterable of dataclass objects or dictionaries. + dataset: Kaggle dataset id, in the format ``/``. + dest_file: How to name the destination file in the dataset context. + existing_behavior: What to do if the file already exists in the + dataset. + version_notes: A message describing what changes will be made to the + dataset (optional; a default message will be generated if none was + given). + local_dir: A local directory where the dataset files will persist + (optional; defaults to None). + delete_old_versions: Whether to delete previous versions of the + dataset that exist in Kaggle. + + Returns + A `DatasetNewVersionResponse`_ instance with the new dataset version. + + .. _DataFrame: https://pandas.pydata.org/pandas-docs/stable/reference/ + frame.html + .. 
_DatasetNewVersionResponse: https://github.com/Kaggle/kaggle-api/blob/ + 89eb72dd811492c500839f65332f669cd839d2bc/kaggle/models/ + kaggle_models_extended.py#L150 + """ + + from kaggle.api.kaggle_api_extended import KaggleApi # type: ignore + from kaggle.models.kaggle_models_extended import Metadata # type: ignore + + # check data object type is supported + if all(is_dataclass(record) for record in data) or all( + isinstance(record, dict) for record in data + ): + data = pd.DataFrame(data) + elif isinstance(data, pd.DataFrame): + pass + else: + raise TypeError( + "`data` parameter must be a list os Dataclass instances, a " + + f"dictionary or a pandas DataFrame, not {type(data).__name__}." + ) + + logging.info( + f"Uploading {len(data.index)} records to '{dest_file}' file in " + + f"Kaggle's '{dataset}' dataset." + ) + + # authenticate Kaggle API + logging.debug("Authenticating to Kaggle API...") + api = KaggleApi() + api.authenticate() + + # make sure dataset exists + logging.debug("Searching dataset...") + try: + dataset_owner, dataset_name = dataset.split("/") + matching_datasets = api.dataset_list( + search=dataset_name, user=dataset_owner + ) + assert dataset in [dataset.ref for dataset in matching_datasets] + except AssertionError: + # TODO: create dataset if it doesn't exist + raise ValueError("The dataset does not exist.") + + # use the provided local (persistent) directory, or create a temporary one + if not local_dir: + tmpdir = TemporaryDirectory() + data_dir: Any = tmpdir.name + logging.debug(f"Creatd temporary directory: {data_dir}") + else: + data_dir = local_dir + + # get dataset metadata + metafile = Path(data_dir, "datapackage.json") + if not os.path.isfile(metafile): + metadata_response = api.process_response( + api.metadata_get_with_http_info(dataset_owner, dataset_name) + ) + metadata = Metadata(metadata_response) + with open(metafile, "w") as f: + json.dump(metadata, f, indent=2, default=lambda o: o.__dict__) + + # download existing files + # 
TODO: skip downloading unchanged files if they already exist locally + # TODO: start downloading asynchronously while data is gathered + api.dataset_download_files(dataset, path=data_dir, unzip=True) + + # write data file as CSV + operation: Literal["create", "append", "update"] + if os.path.isfile(Path(data_dir, dest_file)): + if existing_behavior == "replace": + operation = "update" + data.to_csv(Path(data_dir, dest_file), mode="w") + elif existing_behavior == "append": + operation = "append" + data.to_csv(Path(data_dir, dest_file), mode="a") + elif existing_behavior == "skip": + logging.error(f"File '{dest_file}' already exists. Skiped.") + raise FileExistsError( + "File already exists and behavior is set to `skip`." + ) + else: + raise ValueError( + "`existing_behavior` argument must be one of " + + f"`replace`, `append` or `skip` ('{existing_behavior}' " + + "provided)." + ) + else: + operation = "create" + data.to_csv(Path(data_dir, dest_file)) + + # update metadata file + try: + api.dataset_metadata_update(dataset, data_dir) + except KeyError: + # BUG: KaggleApi's dataset_metadata_update() method references an + # inexistent key for checking for errors. Just ignore it. 
+ pass + + # create a version notes message, if user hasn't provided one + if not version_notes: + version_notes = _autogen_version_notes(dest_file, operation) + + # upload data to Kaggle + new_version = api.dataset_create_version( + data_dir, version_notes, delete_old_versions=delete_old_versions + ) + + # clear temporary directory + if not local_dir: + tmpdir.cleanup() + + return new_version diff --git a/utils/src/fetch_portals/test/test_callbacks.py b/utils/src/fetch_portals/test/test_callbacks.py new file mode 100644 index 0000000..e9aae3d --- /dev/null +++ b/utils/src/fetch_portals/test/test_callbacks.py @@ -0,0 +1,150 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Tests callback functions. + +This module contains test cases for checking whether the callback functions +defined in the `callbacks.py`_ file are working as expected. +""" + + +import json +import os +from datetime import datetime, timedelta, timezone +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Generator + +import pytest +from dotenv import load_dotenv + +from ...models import PortalCapture +from ..callbacks import _autogen_version_notes, to_kaggle + + +@pytest.fixture +def mock_captures(): + """Creates a list of fake records to process and/or save.""" + captures = [ + PortalCapture( + ibge_code=2600807, + initial_url=( + "http://netuse.inf.br/altinho_pm/portaltransparencia/" + + "index.php?link=6" + ), + final_url=( + "http://netuse.inf.br/altinho_pm/portaltransparencia/" + + "index.php?link=6" + ), + method="HEAD", + attempts=1, + request_time=datetime.now(timezone.utc), + waiting_time=timedelta(seconds=1.1), + ssl_valid=True, + status=200, + message="OK", + ), + PortalCapture( + ibge_code=4200754, + initial_url=( + "https://diariomunicipal.sc.gov.br/site/" + + "?r=site/index&q=cod_entidade%3A13" + ), + 
final_url=( + "https://diariomunicipal.sc.gov.br/site/" + + "?r=site/index&q=cod_entidade%3A13" + ), + method="HEAD", + attempts=1, + request_time=datetime.now(timezone.utc), + waiting_time=timedelta(seconds=0.92), + ssl_valid=True, + status=200, + message="OK", + ), + ] + + return captures + + +@pytest.fixture(scope="module") +def kaggle_api() -> "KaggleApi": # type: ignore # noqa: F821 + """Initialize and authenticate connection to Kaggle API.""" + # get set kaggle credentials as environment variables + script_path = Path(os.path.abspath(__file__)) + load_dotenv(os.path.join(script_path.parents[3], ".env")) + + # initialize api + from kaggle.api.kaggle_api_extended import KaggleApi # type: ignore + + api = KaggleApi() + api.authenticate() + + return api + + +@pytest.fixture(scope="module") +def mock_kaggle_dataset(kaggle_api) -> Generator[str, None, None]: + """Creates a Kaggle dataset for testing purposes. + + Note: + There is currently no method for programatically removing a Kaggle + dataset. Therefore, the user must manually delete the created dataset, + located at ``https://kaggle.com/myuser/example`` (where ``myuser`` is + the name of the Kaggle user provided through the ``KAGGLE_USER`` + environment variable). + + Yields: + ID of the created dataset, in the format ``myuser/example``. 
+ """ + kaggle_user = os.environ["KAGGLE_USERNAME"] + mock_data = """ + "fruit_name","fruit_color","fruit_number" + apple,red,6 + banana,yellow,12 + plum,purple,5 + """ + try: + tmpdir = TemporaryDirectory() + metadata = { + "title": "Example Dataset", + "id": kaggle_user + "/example", + "licenses": [{"name": "CC0-1.0"}], + } + with open( + os.path.join(tmpdir.name, "datapackage.json"), "w" + ) as meta_file: + meta_json = json.dumps(metadata) + meta_file.write(meta_json) + with open(os.path.join(tmpdir.name, "example_fruits.csv"), "w") as f: + f.write(mock_data) + kaggle_api.dataset_create_new(tmpdir.name) + yield metadata["id"] # type: ignore + finally: + tmpdir.cleanup() + + +def test_autogen_version_notes(): + """Tests generating a default version note message.""" + expected_notes = { + "create": "Create example.csv", + "append": "Add records to example.csv", + "update": "Update example.csv", + } + for operation, expected_note in expected_notes.items(): + version_note = _autogen_version_notes("example.csv", operation) + assert version_note == expected_note + + +def test_to_kaggle(mock_captures, mock_kaggle_dataset, kaggle_api): + """Tests saving some records to Kaggle""" + to_kaggle( + mock_captures, + dataset=mock_kaggle_dataset, + dest_file="example.csv", + ) + file_list = kaggle_api.dataset_list_files(mock_kaggle_dataset) + assert "example.csv" in [str(datafile) for datafile in file_list.files] diff --git a/utils/src/models.py b/utils/src/models.py new file mode 100644 index 0000000..3a68822 --- /dev/null +++ b/utils/src/models.py @@ -0,0 +1,213 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Representations of concepts used by other utilities. + +This module contains reusable types and classes that model both portals where +Brazilian official gazettes are published and their attributes. 
+""" + +import itertools +import logging +import os +from asyncio.exceptions import TimeoutError +from collections import UserList +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from enum import Enum +from typing import Any, List, Literal, NewType, Optional, Set, Union + +import aiohttp +from aiohttp import ClientConnectorCertificateError, ClientError, ClientTimeout +from yarl import URL + +FetchMode = Literal["ping", "source"] +IbgeCode = NewType("IbgeCode", int) # TODO: make it a UserString +LogLevel = Literal["error", "warn", "info", "debug"] +PathLike = Union[str, bytes, os.PathLike] + + +class GovernmentBranch(Enum): + """An enumeration of government branches in Brazil.""" + + EXECUTIVE = 1 # only the Executive branch is currently supported + # LEGISLATIVE = 2 + # JUDICIAL = 3 + # ESSENTIAL_JUSTICE = 4 + + +class GovernmentLevel(Enum): + """An enumeration of government levels in Brazil.""" + + # FEDERAL = 1 + # STATE = 2 # includes Federal District + MUNICIPALITY = 3 # only Municipalities are currently supported + + +@dataclass +class Portal: + """Representation of a portal that publishes local-level official gazettes.""" + + ibge_code: IbgeCode + url: URL + branch: GovernmentBranch = GovernmentBranch.EXECUTIVE + level: GovernmentLevel = GovernmentLevel.MUNICIPALITY + + +@dataclass +class PortalCapture: + """Capture of an official gazette publication portal at a point in time.""" + + ibge_code: IbgeCode + request_time: datetime + waiting_time: timedelta + attempts: int + initial_url: URL + final_url: Optional[URL] + method: Literal["GET", "POST"] + ssl_valid: bool + status: int + message: str + level: str = GovernmentLevel.MUNICIPALITY.name + branch: str = GovernmentBranch.EXECUTIVE.name + + +class PortalList(UserList): + """A list of official portals.""" + + def by_domain(self) -> List["PortalList"]: + """Separate a list of portals by their domains. 
+ + This function creates a list populated with sets of unique portals that + have all the same domain in their URLs. + + Parameters: + portals: An iterable of `Portal` instances + + Returns: + A list of `PortalList`s, one for each domain in the original + instance. + """ + + logging.debug("Separating portals according to their domains...") + + # collect all unique domains + domains = set(portal.url.host for portal in self.data) + + # iterate over domains and check which portals belong to them + separated = list() + for domain in domains: + portals_in_domain = PortalList( + portal for portal in self.data if portal.url.host == domain + ) + + separated.append(portals_in_domain) # add to separated list + + return separated + + async def fetch_all( + self, + method: Literal["GET", "HEAD"] = "HEAD", + timeout: float = 10.0, + max_retries: int = 3, + ) -> List[PortalCapture]: + + logging.info(f"Fetching {len(self.data)} portals ('{method}')...") + + # create an empty list of responses data and metadata + responses: List[dict] = list() + + # remove url duplicates + unique_urls: Set[URL] = set(portal.url for portal in self.data) + + client_timeout = ClientTimeout(total=timeout) + + async with aiohttp.ClientSession( + timeout=client_timeout, trust_env=True + ) as client: + + # iterate over portal URLs + for url in unique_urls: + + # configure request + ssl_valid: bool = True # start assuming so + + # try fetching page + attempt: int = 1 + while attempt <= max_retries: + try: + logging.info( + f"Sending request to <{url}> " + + f"({attempt}/{max_retries})..." 
+ ) + request_time: datetime = datetime.now(timezone.utc) + + async with client.request( + method, url=str(url), verify_ssl=ssl_valid + ) as response: + time_elapsed: timedelta = ( + datetime.now(timezone.utc) - request_time + ) + final_url: Optional[URL] = response.url + response_status: int = response.status + if method == "GET": + message: Any = str(await response.text()) + else: + message = response.reason + if not response.ok and attempt <= max_retries: + attempt += 1 + continue + + # Invalid SSL certificate; try again without verifying + except ClientConnectorCertificateError: + ssl_valid = False + if attempt < max_retries: + continue + + # some other error; try again + except (ClientError, TimeoutError) as err: + time_elapsed = ( + datetime.now(timezone.utc) - request_time + ) + message = err.__str__ + final_url = None + response_status = 999 + if attempt < max_retries: + attempt += 1 + continue + + # record answer if it is OK or exceeded max tries + logging.info(response_status) + responses.append( + { + "initial_url": url, + "final_url": final_url, + "method": method, + "attempts": attempt, + "request_time": request_time, + "waiting_time": time_elapsed, + "ssl_valid": ssl_valid, + "status": response_status, + "message": message, + } + ) + break + + # associate unique urls to portals + captures: List[PortalCapture] = list() + for portal, capture in itertools.product(self.data, responses): + if portal.url == capture["initial_url"]: + print(portal.url) + captures.append( + PortalCapture( + ibge_code=portal.ibge_code, + level=portal.level.value, + branch=portal.branch.value, + **capture, + ) + ) + print(len(captures)) + return captures From 978163fe03e5f191665c37a5c51219e2a65dfb74 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Sat, 13 Feb 2021 18:11:30 -0300 Subject: [PATCH 05/19] Add main logic --- utils/.env.template | 19 ++- utils/src/fetch_portals/callbacks.py | 19 +-- utils/src/fetch_portals/fetchers.py | 15 +- 
utils/src/fetch_portals/main.py | 103 +++++++++++++ utils/src/fetch_portals/test/conftest.py | 73 ++++++++++ .../src/fetch_portals/test/test_callbacks.py | 64 --------- utils/src/fetch_portals/test/test_fetchers.py | 48 +++---- utils/src/fetch_portals/test/test_main.py | 135 ++++++++++++++++++ utils/src/models.py | 44 ++++-- 9 files changed, 401 insertions(+), 119 deletions(-) create mode 100644 utils/src/fetch_portals/main.py create mode 100644 utils/src/fetch_portals/test/conftest.py create mode 100644 utils/src/fetch_portals/test/test_main.py diff --git a/utils/.env.template b/utils/.env.template index 9b11837..a97b341 100644 --- a/utils/.env.template +++ b/utils/.env.template @@ -6,14 +6,27 @@ ## General settings ### log verbosity level - choose between 'error', 'warn', 'info', 'debug' -PORTALFETCH_LOG_LEVEL = "debug" +FETCHPORTALS_LOG_LEVEL = "debug" + +### set source of portal URLs and geo IDs - only 'census' is currently accepted +FETCHPORTALS_SOURCE = "census" ### control whether to only ping portal status, or to fetch its source code -PORTALFETCH_MODE = "ping" +FETCHPORTALS_MODE = "ping" + +### set maximum number of tries and time waiting +FETCHPORTALS_MAX_RETRIES = 3 +FETCHPORTALS_TIMEOUT = 10.0 ### control callback to process and/or save the retrived data -PORTALFETCH_CALLBACK = "kaggle" +FETCHPORTALS_CALLBACK = "kaggle" + +### control what to do when destination file already exists - must be one of +### 'replace', 'append' or 'skip' +FETCHPORTALS_EXISTING = "replace" +### set a local directory where retrieved files may persist +FETCHPORTALS_LOCALDIR = "./data" ## Kaggle settings diff --git a/utils/src/fetch_portals/callbacks.py b/utils/src/fetch_portals/callbacks.py index d64a5fa..bd7256f 100644 --- a/utils/src/fetch_portals/callbacks.py +++ b/utils/src/fetch_portals/callbacks.py @@ -21,7 +21,7 @@ import pandas as pd -from ..models import PathLike, PortalCapture +from ..models import ExistingBehavior, PathLike, PortalCapture def _autogen_version_notes( 
@@ -53,7 +53,7 @@ def to_kaggle( data: Union[Iterable[Union[dict, PortalCapture]], pd.DataFrame], dataset: str, dest_file: str, - existing_behavior: Literal["replace", "append", "skip"] = "replace", + existing_behavior: ExistingBehavior = "replace", version_notes: Optional[str] = None, local_dir: Optional[PathLike] = None, delete_old_versions: bool = False, @@ -171,13 +171,14 @@ def to_kaggle( operation = "create" data.to_csv(Path(data_dir, dest_file)) - # update metadata file - try: - api.dataset_metadata_update(dataset, data_dir) - except KeyError: - # BUG: KaggleApi's dataset_metadata_update() method references an - # inexistent key for checking for errors. Just ignore it. - pass + # # update metadata file + # BUG + # try: + # api.dataset_metadata_update(dataset, data_dir) + # except KeyError: + # # BUG: KaggleApi's dataset_metadata_update() method references an + # # inexistent key for checking for errors. Just ignore it. + # pass # create a version notes message, if user hasn't provided one if not version_notes: diff --git a/utils/src/fetch_portals/fetchers.py b/utils/src/fetch_portals/fetchers.py index f74ec37..a521058 100644 --- a/utils/src/fetch_portals/fetchers.py +++ b/utils/src/fetch_portals/fetchers.py @@ -60,11 +60,13 @@ async def _gather_responses( task_list: List = list() for subset in portals.by_domain(): - task: asyncio.Task = asyncio.create_task(subset.fetch_all( - method=http_method, - max_retries=max_retries, - timeout=timeout, - )) + task: asyncio.Task = asyncio.create_task( + subset.fetch_all( + method=http_method, + max_retries=max_retries, + timeout=timeout, + ) + ) task_list.append(task) return await asyncio.gather(*task_list) @@ -88,7 +90,8 @@ def fetch_portals( results = list() task_list = asyncio.run( - _gather_responses(portals, mode, max_retries, timeout)) + _gather_responses(portals, mode, max_retries, timeout) + ) for task_results in task_list: for result in task_results: diff --git a/utils/src/fetch_portals/main.py 
b/utils/src/fetch_portals/main.py new file mode 100644 index 0000000..8a42cbe --- /dev/null +++ b/utils/src/fetch_portals/main.py @@ -0,0 +1,103 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Main logic to capture data from official gazettes portals. + +This module contains main logic to capture official gazette portals' status and +source code for Brazilian municipalities. developed as a part of the `Censo +Querido Diário`_ effort, in order to periodically fetch the contents and +monitor the service availability of portals containing the official gazettes +for the 5.526 brazilian municipalities. + +The Censo Querido Diário is a collaborative effort to push forward the +disclosure of public information embodied in official publications. +Contributions to this initiative are more than welcome. Check our `contribution +guidelines`_ (in portuguese) to learn the various ways you can support the +project. + +.. _Censo Querido Diário: https://censo.ok.org.br/sobre/ + +.. 
_contribution guidelines: + https://github.com/okfn-brasil/censo-querido-diario/blob/main/CONTRIBUTING.MD +""" + +import json +import logging +import os +import sys +from pathlib import Path +from typing import List, Optional + +from dotenv import load_dotenv + +from ..models import (AcceptedCallback, AcceptedSource, ExistingBehavior, + FetchMode, LogLevel, PathLike, PortalCapture) +from .callbacks import to_kaggle +from .fetchers import fetch_portals +from .sources import get_portals_from_census + +# get configurations from environment variables, or use defaults +script_path: Path = Path(os.path.abspath(__file__)) +load_dotenv(os.path.join(script_path.parents[2], ".env")) + + +def main( + source: AcceptedSource = os.getenv("FETCHPORTALS_SOURCE", "census"), + mode: FetchMode = os.getenv("FETCHPORTALS_MODE", "ping"), + callback: Optional[AcceptedCallback] = os.getenv("FETCHPORTALS_CALLBACK"), + local_dir: Optional[PathLike] = os.getenv("FETCHPORTALS_LOCALDIR"), + existing: ExistingBehavior = os.getenv("FETCHPORTALS_EXISTING", "replace"), + max_retries: int = os.getenv("FETCHPORTALS_MAX_RETRIES", 3), + timeout: float = os.getenv("FETCHPORTALS_TIMEOUT", 10.0), + log_level: LogLevel = os.getenv("FETCHPORTALS_LOG_LEVEL", "warn"), +) -> None: + # init logs + logging.basicConfig(format="%(asctime)s %(message)s", level=log_level) + + # get a list of portals + if source == "census": + portals = get_portals_from_census() + else: + raise ValueError(f"'{source}' is not a valid source.") + + # fetch them + captures: List[PortalCapture] = fetch_portals(portals=portals, mode=mode) + + # save captured data + if callback == "kaggle": # save to a Kaggle dataset file + try: + assert "KAGGLE_USERNAME" in os.environ + assert "KAGGLE_KEY" in os.environ + except AssertionError: + logging.error("Kaggle credentials not found in environment.") + raise RuntimeError + dest_dataset: str = os.environ["KAGGLE_DATASET"] + dest_file: str = os.environ["KAGGLE_FILE"] + to_kaggle( + captures, + 
dataset=dest_dataset, + dest_file=dest_file, + existing_behavior=existing, + local_dir=local_dir, + ) + + # print to stdout (default) + elif not callback: + results_json: str = json.dumps( + [capture.to_dict() for capture in captures], + indent=4, + sort_keys=True, + separators=(",", ": "), + ).replace("\\n", "\n") + sys.stdout.write(results_json) + + # unimplemented callback + else: + raise ValueError(f"'{callback}' is not a valid callback.") + + +if __name__ == "__main__": + main() diff --git a/utils/src/fetch_portals/test/conftest.py b/utils/src/fetch_portals/test/conftest.py new file mode 100644 index 0000000..76f2524 --- /dev/null +++ b/utils/src/fetch_portals/test/conftest.py @@ -0,0 +1,73 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Reusable Pytest fixtures for testing fetch_portals package.""" + +import json +import os +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Generator + +import pytest +from dotenv import load_dotenv + + +@pytest.fixture(scope="session") +def kaggle_api() -> "KaggleApi": # type: ignore # noqa: F821 + """Initialize and authenticate connection to Kaggle API.""" + # get set kaggle credentials as environment variables + script_path = Path(os.path.abspath(__file__)) + load_dotenv(os.path.join(script_path.parents[3], ".env")) + + # initialize api + from kaggle.api.kaggle_api_extended import KaggleApi # type: ignore + + api = KaggleApi() + api.authenticate() + + return api + + +@pytest.fixture(scope="session") +def mock_kaggle_dataset(kaggle_api) -> Generator[str, None, None]: + """Creates a Kaggle dataset for testing purposes. + + Note: + There is currently no method for programatically removing a Kaggle + dataset. 
Therefore, the user must manually delete the created dataset, + located at ``https://kaggle.com/myuser/example`` (where ``myuser`` is + the name of the Kaggle user provided through the ``KAGGLE_USER`` + environment variable). + + Yields: + ID of the created dataset, in the format ``myuser/example``. + """ + kaggle_user = os.environ["KAGGLE_USERNAME"] + mock_data = """ + "fruit_name","fruit_color","fruit_number" + apple,red,6 + banana,yellow,12 + plum,purple,5 + """ + try: + tmpdir = TemporaryDirectory() + metadata = { + "title": "Example Dataset", + "id": kaggle_user + "/example", + "licenses": [{"name": "CC0-1.0"}], + } + with open( + os.path.join(tmpdir.name, "datapackage.json"), "w" + ) as meta_file: + meta_json = json.dumps(metadata) + meta_file.write(meta_json) + with open(os.path.join(tmpdir.name, "example_fruits.csv"), "w") as f: + f.write(mock_data) + kaggle_api.dataset_create_new(tmpdir.name) + yield metadata["id"] # type: ignore + finally: + tmpdir.cleanup() diff --git a/utils/src/fetch_portals/test/test_callbacks.py b/utils/src/fetch_portals/test/test_callbacks.py index e9aae3d..e0580f6 100644 --- a/utils/src/fetch_portals/test/test_callbacks.py +++ b/utils/src/fetch_portals/test/test_callbacks.py @@ -10,16 +10,9 @@ defined in the `callbacks.py`_ file are working as expected. 
""" - -import json -import os from datetime import datetime, timedelta, timezone -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Generator import pytest -from dotenv import load_dotenv from ...models import PortalCapture from ..callbacks import _autogen_version_notes, to_kaggle @@ -70,63 +63,6 @@ def mock_captures(): return captures -@pytest.fixture(scope="module") -def kaggle_api() -> "KaggleApi": # type: ignore # noqa: F821 - """Initialize and authenticate connection to Kaggle API.""" - # get set kaggle credentials as environment variables - script_path = Path(os.path.abspath(__file__)) - load_dotenv(os.path.join(script_path.parents[3], ".env")) - - # initialize api - from kaggle.api.kaggle_api_extended import KaggleApi # type: ignore - - api = KaggleApi() - api.authenticate() - - return api - - -@pytest.fixture(scope="module") -def mock_kaggle_dataset(kaggle_api) -> Generator[str, None, None]: - """Creates a Kaggle dataset for testing purposes. - - Note: - There is currently no method for programatically removing a Kaggle - dataset. Therefore, the user must manually delete the created dataset, - located at ``https://kaggle.com/myuser/example`` (where ``myuser`` is - the name of the Kaggle user provided through the ``KAGGLE_USER`` - environment variable). - - Yields: - ID of the created dataset, in the format ``myuser/example``. 
- """ - kaggle_user = os.environ["KAGGLE_USERNAME"] - mock_data = """ - "fruit_name","fruit_color","fruit_number" - apple,red,6 - banana,yellow,12 - plum,purple,5 - """ - try: - tmpdir = TemporaryDirectory() - metadata = { - "title": "Example Dataset", - "id": kaggle_user + "/example", - "licenses": [{"name": "CC0-1.0"}], - } - with open( - os.path.join(tmpdir.name, "datapackage.json"), "w" - ) as meta_file: - meta_json = json.dumps(metadata) - meta_file.write(meta_json) - with open(os.path.join(tmpdir.name, "example_fruits.csv"), "w") as f: - f.write(mock_data) - kaggle_api.dataset_create_new(tmpdir.name) - yield metadata["id"] # type: ignore - finally: - tmpdir.cleanup() - - def test_autogen_version_notes(): """Tests generating a default version note message.""" expected_notes = { diff --git a/utils/src/fetch_portals/test/test_fetchers.py b/utils/src/fetch_portals/test/test_fetchers.py index 31ba45d..ddcd2ba 100644 --- a/utils/src/fetch_portals/test/test_fetchers.py +++ b/utils/src/fetch_portals/test/test_fetchers.py @@ -16,58 +16,56 @@ @pytest.fixture def example_portals() -> PortalList: - """Create a `PortalList`_ instance with a few official gazette portals. - """ + """Create a `PortalList`_ instance with a few official gazette portals.""" # Altinho (PE) portal1: Portal = Portal( ibge_code=2600807, url=URL( - "http://netuse.inf.br/altinho_pm/portaltransparencia/index.php?" + - "link=6" - ) + "http://netuse.inf.br/altinho_pm/portaltransparencia/index.php?" 
+ + "link=6" + ), ) portal2: Portal = Portal( - ibge_code=2600807, - url=URL("http://www.diariomunicipal.com.br/amupe/") + ibge_code=2600807, url=URL("http://www.diariomunicipal.com.br/amupe/") ) # Alto Bela Vista (SC) portal3: Portal = Portal( ibge_code=4200754, url=URL( - "https://diariomunicipal.sc.gov.br/site/" + - "?r=site/index&q=cod_entidade%3A13" - ) + "https://diariomunicipal.sc.gov.br/site/" + + "?r=site/index&q=cod_entidade%3A13" + ), ) # Anchieta (SC) portal4: Portal = Portal( ibge_code=4200804, url=URL( - "https://diariomunicipal.sc.gov.br/site/" + - "?r=site/index&q=cod_entidade%3A14" - ) + "https://diariomunicipal.sc.gov.br/site/" + + "?r=site/index&q=cod_entidade%3A14" + ), ) # Angelim (PE) portal5: Portal = Portal( ibge_code=2601003, - url=URL("http://www.diariomunicipal.com.br/amupe/pesquisar") + url=URL("http://www.diariomunicipal.com.br/amupe/pesquisar"), ) portal6 = Portal( ibge_code=2601003, url=URL( - "http://174.142.65.52:16444/transparencia/angelim/prefeitura/" + - "legislacaomunicipal.faces" - ) + "http://174.142.65.52:16444/transparencia/angelim/prefeitura/" + + "legislacaomunicipal.faces" + ), ) portal7 = Portal( ibge_code=2601003, url=URL( - "http://174.142.65.52:16444/transparencia/angelim/prefeitura/" + - "outrosatos.faces" - ) + "http://174.142.65.52:16444/transparencia/angelim/prefeitura/" + + "outrosatos.faces" + ), ) return PortalList( @@ -85,8 +83,7 @@ def test_split_by_domain(example_portals) -> None: def test_head_subsets(example_portals) -> None: - """Tests pinging subsets of a `PortalList`_ with unique domains. - """ + """Tests pinging subsets of a `PortalList`_ with unique domains.""" subsets: PortalList = example_portals.by_domain() for subset in subsets: subset = PortalList(subset) @@ -99,8 +96,7 @@ def test_head_subsets(example_portals) -> None: def test_get_subsets(example_portals) -> None: - """Tests capturing subsets of a `PortalList`_ with unique domains. 
- """ + """Tests capturing subsets of a `PortalList`_ with unique domains.""" subsets: PortalList = example_portals.by_domain() for subset in subsets: subset = PortalList(subset) @@ -114,9 +110,7 @@ def test_get_subsets(example_portals) -> None: def test_orchestrate_pinging(example_portals) -> None: """Tests asynchronously pinging multiple portals.""" - captures: List[PortalCapture] = fetch_portals( - example_portals, mode="ping" - ) + captures: List[PortalCapture] = fetch_portals(example_portals, mode="ping") assert len(captures) == len(example_portals) for capture in captures: assert isinstance(capture, PortalCapture) diff --git a/utils/src/fetch_portals/test/test_main.py b/utils/src/fetch_portals/test/test_main.py new file mode 100644 index 0000000..b3467d4 --- /dev/null +++ b/utils/src/fetch_portals/test/test_main.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Tests callback functions. + +This module contains test cases for checking whether the callback functions +defined in the `callbacks.py`_ file are working as expected. 
+""" + +import os +from tempfile import TemporaryDirectory + +import pandas as pd + +from ..main import main + + +def test_ping(capsys): + """Tests pinging all portals in Querido Diario Census.""" + main(mode="ping", callback=None) + out, err = capsys.readouterr() + assert '"ibge_code": "2600807"' in out + + +def test_source(capsys): + """Tests getting source codes for all portals in Querido Diario Census.""" + main(mode="source", callback=None) + out, err = capsys.readouterr() + assert '"ibge_code": "2600807"' in out + assert "" in out + + +def test_ping_to_kaggle(mock_kaggle_dataset, kaggle_api): + """Tests saving pings to all portals in QD Census to Kaggle.""" + # copy original kaggle dataset config (that should not to be modified) + previous_kaggle_dataset = os.getenv("KAGGLE_DATASET") + previous_kaggle_file = os.getenv("KAGGLE_FILE") + + # upload data to mock dataset + try: + os.environ["KAGGLE_DATASET"] = mock_kaggle_dataset + os.environ["KAGGLE_FILE"] = "test-ping.csv" + main(mode="ping", callback="kaggle", existing="append") + with TemporaryDirectory() as tmpdir: + kaggle_api.dataset_download_file( + mock_kaggle_dataset, os.environ["KAGGLE_FILE"], tmpdir + ) + df = pd.read_csv(os.path.join(tmpdir, os.environ["KAGGLE_FILE"])) + for col in [ + "ibge_code", + "request_time", + "waiting_time", + "attempts", + "initial_url", + "final_url", + "method", + "ssl_valid", + "status", + "message", + "level", + "branch", + ]: + assert col in df.columns + assert len(df.index) > 3 + assert "200" in df["status"].unique() + assert "OK" in df["message"].unique() + + # reset kaggle dataset config to the original one + finally: + if previous_kaggle_dataset: + os.environ["KAGGLE_DATASET"] = previous_kaggle_dataset + else: + del os.environ["KAGGLE_DATASET"] + if previous_kaggle_file: + os.environ["KAGGLE_FILE"] = previous_kaggle_file + else: + del os.environ["KAGGLE_FILE"] + + +def test_source_to_kaggle(mock_kaggle_dataset, kaggle_api): + """Tests saving source codes for all 
portals in QD Census to Kaggle.""" + # copy original kaggle dataset config (that should not to be modified) + previous_kaggle_dataset = os.getenv("KAGGLE_DATASET") + previous_kaggle_file = os.getenv("KAGGLE_FILE") + + # upload data to mock dataset + try: + os.environ["KAGGLE_DATASET"] = mock_kaggle_dataset + os.environ["KAGGLE_FILE"] = "test-source.csv" + main(mode="source", callback="kaggle") + with TemporaryDirectory() as tmpdir: + kaggle_api.dataset_download_file( + mock_kaggle_dataset, os.environ["KAGGLE_FILE"], tmpdir + ) + try: + df = pd.read_csv( + os.path.join(tmpdir, os.environ["KAGGLE_FILE"]) + ) + except FileNotFoundError: + df = pd.read_csv( + os.path.join(tmpdir, os.environ["KAGGLE_FILE"] + ".zip") + ) + for col in [ + "ibge_code", + "request_time", + "waiting_time", + "attempts", + "initial_url", + "final_url", + "method", + "ssl_valid", + "status", + "message", + "level", + "branch", + ]: + assert col in df.columns + assert len(df.index) > 3 + assert df["message"].apply(lambda msg: "" in str(msg)).any() + + # reset kaggle dataset config to the original one + finally: + if previous_kaggle_dataset: + os.environ["KAGGLE_DATASET"] = previous_kaggle_dataset + else: + del os.environ["KAGGLE_DATASET"] + if previous_kaggle_file: + os.environ["KAGGLE_FILE"] = previous_kaggle_file + else: + del os.environ["KAGGLE_FILE"] diff --git a/utils/src/models.py b/utils/src/models.py index 3a68822..68cc16e 100644 --- a/utils/src/models.py +++ b/utils/src/models.py @@ -24,10 +24,13 @@ from aiohttp import ClientConnectorCertificateError, ClientError, ClientTimeout from yarl import URL +AcceptedCallback = Literal["kaggle"] +AcceptedSource = Literal["census"] +ExistingBehavior = Literal["append", "replace", "skip"] FetchMode = Literal["ping", "source"] IbgeCode = NewType("IbgeCode", int) # TODO: make it a UserString LogLevel = Literal["error", "warn", "info", "debug"] -PathLike = Union[str, bytes, os.PathLike] +PathLike = Union[str, bytes, "os.PathLike[Any]"] class 
GovernmentBranch(Enum): @@ -49,7 +52,8 @@ class GovernmentLevel(Enum): @dataclass class Portal: - """Representation of a portal that publishes local-level official gazettes.""" + """Representation of a portal that publishes local-level official gazettes. + """ ibge_code: IbgeCode url: URL @@ -71,8 +75,25 @@ class PortalCapture: ssl_valid: bool status: int message: str - level: str = GovernmentLevel.MUNICIPALITY.name - branch: str = GovernmentBranch.EXECUTIVE.name + level: GovernmentLevel = GovernmentLevel.MUNICIPALITY + branch: GovernmentBranch = GovernmentBranch.EXECUTIVE + + def to_dict(self): + """Converts a PortalCapture into a dictionary.""" + return { + "ibge_code": str(self.ibge_code), + "request_time": self.request_time.isoformat(), + "waiting_time": self.waiting_time.total_seconds(), + "attempts": self.attempts, + "initial_url": str(self.initial_url), + "final_url": str(self.final_url or ""), + "method": self.method, + "ssl_valid": int(self.ssl_valid), + "status": self.status, + "message": self.message, + "level": self.level, + "branch": self.branch, + } class PortalList(UserList): @@ -146,7 +167,7 @@ async def fetch_all( request_time: datetime = datetime.now(timezone.utc) async with client.request( - method, url=str(url), verify_ssl=ssl_valid + method, url=str(url), ssl=ssl_valid ) as response: time_elapsed: timedelta = ( datetime.now(timezone.utc) - request_time @@ -154,6 +175,7 @@ async def fetch_all( final_url: Optional[URL] = response.url response_status: int = response.status if method == "GET": + # TODO: get charsets defined in tags message: Any = str(await response.text()) else: message = response.reason @@ -168,11 +190,15 @@ async def fetch_all( continue # some other error; try again - except (ClientError, TimeoutError) as err: + except ( + ClientError, + TimeoutError, + UnicodeDecodeError, + ) as err: time_elapsed = ( datetime.now(timezone.utc) - request_time ) - message = err.__str__ + message = repr(err) final_url = None response_status = 999 
if attempt < max_retries: @@ -180,7 +206,7 @@ async def fetch_all( continue # record answer if it is OK or exceeded max tries - logging.info(response_status) + logging.info(str(response_status)) responses.append( { "initial_url": url, @@ -200,7 +226,6 @@ async def fetch_all( captures: List[PortalCapture] = list() for portal, capture in itertools.product(self.data, responses): if portal.url == capture["initial_url"]: - print(portal.url) captures.append( PortalCapture( ibge_code=portal.ibge_code, @@ -209,5 +234,4 @@ async def fetch_all( **capture, ) ) - print(len(captures)) return captures From 77b876b5925ec3999fde12c8da69732c9a19d118 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Sat, 13 Feb 2021 18:11:55 -0300 Subject: [PATCH 06/19] Fix style --- utils/src/fetch_portals/sources.py | 4 +++- utils/src/fetch_portals/test/test_sources.py | 5 ++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/utils/src/fetch_portals/sources.py b/utils/src/fetch_portals/sources.py index bd7c97c..65241c9 100644 --- a/utils/src/fetch_portals/sources.py +++ b/utils/src/fetch_portals/sources.py @@ -58,7 +58,9 @@ def get_portals_from_census() -> PortalList: # FIXME: avoid "None" strings in url column lambda mun: Portal( ibge_code=IbgeCode(mun["IBGE7"]), url=URL(mun["fonte"]) - ) if mun.fonte != "None" else np.nan, + ) + if mun.fonte != "None" + else np.nan, axis=1, ) .dropna() diff --git a/utils/src/fetch_portals/test/test_sources.py b/utils/src/fetch_portals/test/test_sources.py index a9c8776..097d7c4 100644 --- a/utils/src/fetch_portals/test/test_sources.py +++ b/utils/src/fetch_portals/test/test_sources.py @@ -5,13 +5,12 @@ # https://opensource.org/licenses/MIT. -from ..sources import get_portals_from_census from ...models import Portal, PortalList +from ..sources import get_portals_from_census def test_get_portals_from_census() -> None: - """Test getting a list of official gazettes portals from the QD census. 
- """ + """Test getting a list of official gazettes portals from the QD census.""" portals = get_portals_from_census() assert len(portals) >= 326 # there are at least 324 mapped portals assert isinstance(portals, PortalList) From f6c6c4dd9a1161853b0f8f9bd220c73caa5bc69f Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Sat, 13 Feb 2021 18:13:39 -0300 Subject: [PATCH 07/19] Ignore zip files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 65e9d1d..abcfb33 100644 --- a/.gitignore +++ b/.gitignore @@ -136,4 +136,4 @@ censo/share/ # Data notebooks/*.csv - +*.zip From d9a8c08624ab0438c140156d3e213f2bb55d5300 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Sat, 13 Feb 2021 19:04:34 -0300 Subject: [PATCH 08/19] Move models.py module --- utils/src/__init__.py | 0 utils/src/fetch_portals/callbacks.py | 2 +- utils/src/fetch_portals/fetchers.py | 2 +- utils/src/{ => fetch_portals}/models.py | 0 utils/src/fetch_portals/sources.py | 2 +- utils/src/fetch_portals/test/test_callbacks.py | 2 +- utils/src/fetch_portals/test/test_fetchers.py | 2 +- utils/src/fetch_portals/test/test_sources.py | 2 +- 8 files changed, 6 insertions(+), 6 deletions(-) delete mode 100644 utils/src/__init__.py rename utils/src/{ => fetch_portals}/models.py (100%) diff --git a/utils/src/__init__.py b/utils/src/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/utils/src/fetch_portals/callbacks.py b/utils/src/fetch_portals/callbacks.py index bd7256f..2ca02c2 100644 --- a/utils/src/fetch_portals/callbacks.py +++ b/utils/src/fetch_portals/callbacks.py @@ -21,7 +21,7 @@ import pandas as pd -from ..models import ExistingBehavior, PathLike, PortalCapture +from .models import ExistingBehavior, PathLike, PortalCapture def _autogen_version_notes( diff --git a/utils/src/fetch_portals/fetchers.py b/utils/src/fetch_portals/fetchers.py index a521058..dd3ce1d 100644 --- a/utils/src/fetch_portals/fetchers.py +++ 
b/utils/src/fetch_portals/fetchers.py @@ -29,7 +29,7 @@ # from itertools import chain from typing import List -from ..models import FetchMode, PortalList +from .models import FetchMode, PortalList async def _gather_responses( diff --git a/utils/src/models.py b/utils/src/fetch_portals/models.py similarity index 100% rename from utils/src/models.py rename to utils/src/fetch_portals/models.py diff --git a/utils/src/fetch_portals/sources.py b/utils/src/fetch_portals/sources.py index 65241c9..ac318c1 100644 --- a/utils/src/fetch_portals/sources.py +++ b/utils/src/fetch_portals/sources.py @@ -31,7 +31,7 @@ import pandas as pd from yarl import URL -from ..models import IbgeCode, Portal, PortalList +from .models import IbgeCode, Portal, PortalList def get_portals_from_census() -> PortalList: diff --git a/utils/src/fetch_portals/test/test_callbacks.py b/utils/src/fetch_portals/test/test_callbacks.py index e0580f6..0a3c45d 100644 --- a/utils/src/fetch_portals/test/test_callbacks.py +++ b/utils/src/fetch_portals/test/test_callbacks.py @@ -14,8 +14,8 @@ import pytest -from ...models import PortalCapture from ..callbacks import _autogen_version_notes, to_kaggle +from ..models import PortalCapture @pytest.fixture diff --git a/utils/src/fetch_portals/test/test_fetchers.py b/utils/src/fetch_portals/test/test_fetchers.py index ddcd2ba..8908ee0 100644 --- a/utils/src/fetch_portals/test/test_fetchers.py +++ b/utils/src/fetch_portals/test/test_fetchers.py @@ -10,8 +10,8 @@ import pytest from httpx import URL -from ...models import Portal, PortalCapture, PortalList from ..fetchers import fetch_portals +from ..models import Portal, PortalCapture, PortalList @pytest.fixture diff --git a/utils/src/fetch_portals/test/test_sources.py b/utils/src/fetch_portals/test/test_sources.py index 097d7c4..4592e52 100644 --- a/utils/src/fetch_portals/test/test_sources.py +++ b/utils/src/fetch_portals/test/test_sources.py @@ -5,7 +5,7 @@ # https://opensource.org/licenses/MIT. 
-from ...models import Portal, PortalList +from ..models import Portal, PortalList from ..sources import get_portals_from_census From 1d710c34b8f99a3ff6037946f9b4bd52b025cbb6 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Sat, 13 Feb 2021 19:06:13 -0300 Subject: [PATCH 09/19] Fix style --- utils/src/fetch_portals/fetchers.py | 1 + utils/src/fetch_portals/main.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/utils/src/fetch_portals/fetchers.py b/utils/src/fetch_portals/fetchers.py index dd3ce1d..d1de8d6 100644 --- a/utils/src/fetch_portals/fetchers.py +++ b/utils/src/fetch_portals/fetchers.py @@ -26,6 +26,7 @@ import asyncio import logging + # from itertools import chain from typing import List diff --git a/utils/src/fetch_portals/main.py b/utils/src/fetch_portals/main.py index 8a42cbe..e27dfef 100644 --- a/utils/src/fetch_portals/main.py +++ b/utils/src/fetch_portals/main.py @@ -33,10 +33,17 @@ from dotenv import load_dotenv -from ..models import (AcceptedCallback, AcceptedSource, ExistingBehavior, - FetchMode, LogLevel, PathLike, PortalCapture) from .callbacks import to_kaggle from .fetchers import fetch_portals +from .models import ( + AcceptedCallback, + AcceptedSource, + ExistingBehavior, + FetchMode, + LogLevel, + PathLike, + PortalCapture, +) from .sources import get_portals_from_census # get configurations from environment variables, or use defaults From 1f5db6368bac086d766e3e3fbde47b8642a83e4f Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Sat, 13 Feb 2021 20:29:46 -0300 Subject: [PATCH 10/19] Package for distribution --- utils/pyproject.toml | 6 ++++++ utils/requirements-dev.txt | 12 +---------- utils/requirements.txt | 6 +----- utils/setup.cfg | 41 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 16 deletions(-) create mode 100644 utils/pyproject.toml create mode 100644 utils/setup.cfg diff --git a/utils/pyproject.toml b/utils/pyproject.toml new file mode 100644 index 
0000000..b5a3c46 --- /dev/null +++ b/utils/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/utils/requirements-dev.txt b/utils/requirements-dev.txt index cf9a93a..bf1a8b0 100644 --- a/utils/requirements-dev.txt +++ b/utils/requirements-dev.txt @@ -1,11 +1 @@ -aiohttp -kaggle -pandas -pandas-stubs -python-dotenv -requests -mypy -pytest -flake8 -isort -black \ No newline at end of file +.[dev] \ No newline at end of file diff --git a/utils/requirements.txt b/utils/requirements.txt index a88da40..945c9b4 100644 --- a/utils/requirements.txt +++ b/utils/requirements.txt @@ -1,5 +1 @@ -aiohttp -kaggle -pandas -python-dotenv -requests \ No newline at end of file +. \ No newline at end of file diff --git a/utils/setup.cfg b/utils/setup.cfg new file mode 100644 index 0000000..db231a8 --- /dev/null +++ b/utils/setup.cfg @@ -0,0 +1,41 @@ +[metadata] +name = censusqdutils +version = 0.1.0 +url = https://github.com/okfn-brasil/censo-querido-diario +author = Open Knowledge Brasil +author_email = contato@serenata.ai +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent +description = Utils for processing Querido Diario Census data. 
+long_description = file: README.md +long_description_content_type = text/markdown +license = MIT + +[options] +python_requires = >=3.7 +package_dir = + =src +packages = find: +install_requires = + aiohttp >= 3.7 + kaggle >= 1.5 + pandas >= 1.2 + python-dotenv >= 0.15 + +[options.extras_require] +dev = + black == 20.8b1 + flake8 >= 3.8.4 + isort >= 5.7.0 + mypy >= 0.800 + pandas-stubs >= 1.0.4.4 + pytest >= 6.2.2 + +[options.packages.find] +where = src + +[options.entry_points] +console_scripts = + fetch-portals = fetch_portals.main:main From b287d8fd8bc0bc14ce3fc4f88e1f9a85b45f294b Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Sun, 14 Feb 2021 16:14:20 -0300 Subject: [PATCH 11/19] Fix typing issues --- utils/src/fetch_portals/fetchers.py | 7 ++--- utils/src/fetch_portals/main.py | 31 +++++++++++++------ utils/src/fetch_portals/models.py | 1 + utils/src/fetch_portals/sources.py | 8 ++--- utils/src/fetch_portals/test/test_fetchers.py | 19 ++++++------ utils/src/fetch_portals/test/test_sources.py | 2 +- 6 files changed, 41 insertions(+), 27 deletions(-) diff --git a/utils/src/fetch_portals/fetchers.py b/utils/src/fetch_portals/fetchers.py index d1de8d6..5eb9621 100644 --- a/utils/src/fetch_portals/fetchers.py +++ b/utils/src/fetch_portals/fetchers.py @@ -26,11 +26,11 @@ import asyncio import logging +from typing import cast -# from itertools import chain from typing import List -from .models import FetchMode, PortalList +from .models import AcceptedHttpMethod, FetchMode, PortalList async def _gather_responses( @@ -52,7 +52,6 @@ async def _gather_responses( portals = PortalList(portals) - http_method: str if mode == "ping": http_method = "HEAD" elif mode == "source": @@ -63,7 +62,7 @@ async def _gather_responses( for subset in portals.by_domain(): task: asyncio.Task = asyncio.create_task( subset.fetch_all( - method=http_method, + method=cast(AcceptedHttpMethod, http_method), max_retries=max_retries, timeout=timeout, ) diff --git 
a/utils/src/fetch_portals/main.py b/utils/src/fetch_portals/main.py index e27dfef..3107905 100644 --- a/utils/src/fetch_portals/main.py +++ b/utils/src/fetch_portals/main.py @@ -29,7 +29,7 @@ import os import sys from pathlib import Path -from typing import List, Optional +from typing import cast, Final, List, Optional from dotenv import load_dotenv @@ -51,15 +51,28 @@ load_dotenv(os.path.join(script_path.parents[2], ".env")) +# load settings from environment, or use defaults +SOURCE: Final = os.getenv("FETCHPORTALS_SOURCE", "census") +MODE: Final = os.getenv("FETCHPORTALS_MODE", "ping") +CALLBACK: Final = os.getenv("FETCHPORTALS_CALLBACK", None) +LOCAL_DIR: Final = os.getenv("FETCHPORTALS_LOCALDIR", None) +EXISTING: Final = os.getenv("FETCHPORTALS_EXISTING", "replace") +MAX_RETRIES: Final = int(os.getenv("FETCHPORTALS_MAX_RETRIES", 3)) +TIMEOUT: Final = float(os.getenv("FETCHPORTALS_TIMEOUT", 10.0)) +LOG_LEVEL: Final = os.getenv("FETCHPORTALS_LOG_LEVEL", "warn") + + def main( - source: AcceptedSource = os.getenv("FETCHPORTALS_SOURCE", "census"), - mode: FetchMode = os.getenv("FETCHPORTALS_MODE", "ping"), - callback: Optional[AcceptedCallback] = os.getenv("FETCHPORTALS_CALLBACK"), - local_dir: Optional[PathLike] = os.getenv("FETCHPORTALS_LOCALDIR"), - existing: ExistingBehavior = os.getenv("FETCHPORTALS_EXISTING", "replace"), - max_retries: int = os.getenv("FETCHPORTALS_MAX_RETRIES", 3), - timeout: float = os.getenv("FETCHPORTALS_TIMEOUT", 10.0), - log_level: LogLevel = os.getenv("FETCHPORTALS_LOG_LEVEL", "warn"), + source: AcceptedSource = cast(AcceptedSource, SOURCE), + mode: FetchMode = cast(FetchMode, MODE), + callback: Optional[AcceptedCallback] = cast( + Optional[AcceptedCallback], CALLBACK + ), + local_dir: Optional[PathLike] = LOCAL_DIR, + existing: ExistingBehavior = cast(ExistingBehavior, EXISTING), + max_retries: int = MAX_RETRIES, + timeout: float = TIMEOUT, + log_level: LogLevel = cast(LogLevel, LOG_LEVEL), ) -> None: # init logs 
logging.basicConfig(format="%(asctime)s %(message)s", level=log_level) diff --git a/utils/src/fetch_portals/models.py b/utils/src/fetch_portals/models.py index 68cc16e..bea04c9 100644 --- a/utils/src/fetch_portals/models.py +++ b/utils/src/fetch_portals/models.py @@ -28,6 +28,7 @@ AcceptedSource = Literal["census"] ExistingBehavior = Literal["append", "replace", "skip"] FetchMode = Literal["ping", "source"] +AcceptedHttpMethod = Literal["GET", "HEAD"] IbgeCode = NewType("IbgeCode", int) # TODO: make it a UserString LogLevel = Literal["error", "warn", "info", "debug"] PathLike = Union[str, bytes, "os.PathLike[Any]"] diff --git a/utils/src/fetch_portals/sources.py b/utils/src/fetch_portals/sources.py index ac318c1..ad211a0 100644 --- a/utils/src/fetch_portals/sources.py +++ b/utils/src/fetch_portals/sources.py @@ -51,7 +51,9 @@ def get_portals_from_census() -> PortalList: # filter and process relevant data (cities geocodes and portal URLs) logging.debug("Processing portals information...") portals: List[Portal] = ( - pd.wide_to_long(df_census, "fonte", i="IBGE7", j="fonte_num", sep="_") + pd.wide_to_long( # type: ignore + df_census, "fonte", i="IBGE7", j="fonte_num", sep="_" + ) .reset_index() .dropna() .apply( @@ -67,6 +69,4 @@ def get_portals_from_census() -> PortalList: .to_list() ) - portals = PortalList(portals) - - return portals + return PortalList(portals) diff --git a/utils/src/fetch_portals/test/test_fetchers.py b/utils/src/fetch_portals/test/test_fetchers.py index 8908ee0..9defbc6 100644 --- a/utils/src/fetch_portals/test/test_fetchers.py +++ b/utils/src/fetch_portals/test/test_fetchers.py @@ -8,10 +8,10 @@ from typing import List, Set import pytest -from httpx import URL +from yarl import URL from ..fetchers import fetch_portals -from ..models import Portal, PortalCapture, PortalList +from ..models import IbgeCode, Portal, PortalCapture, PortalList @pytest.fixture @@ -20,19 +20,20 @@ def example_portals() -> PortalList: # Altinho (PE) portal1: Portal = 
Portal( - ibge_code=2600807, + ibge_code=IbgeCode(2600807), url=URL( "http://netuse.inf.br/altinho_pm/portaltransparencia/index.php?" + "link=6" ), ) portal2: Portal = Portal( - ibge_code=2600807, url=URL("http://www.diariomunicipal.com.br/amupe/") + ibge_code=IbgeCode(2600807), + url=URL("http://www.diariomunicipal.com.br/amupe/"), ) # Alto Bela Vista (SC) portal3: Portal = Portal( - ibge_code=4200754, + ibge_code=IbgeCode(4200754), url=URL( "https://diariomunicipal.sc.gov.br/site/" + "?r=site/index&q=cod_entidade%3A13" @@ -41,7 +42,7 @@ def example_portals() -> PortalList: # Anchieta (SC) portal4: Portal = Portal( - ibge_code=4200804, + ibge_code=IbgeCode(4200804), url=URL( "https://diariomunicipal.sc.gov.br/site/" + "?r=site/index&q=cod_entidade%3A14" @@ -50,18 +51,18 @@ def example_portals() -> PortalList: # Angelim (PE) portal5: Portal = Portal( - ibge_code=2601003, + ibge_code=IbgeCode(2601003), url=URL("http://www.diariomunicipal.com.br/amupe/pesquisar"), ) portal6 = Portal( - ibge_code=2601003, + ibge_code=IbgeCode(2601003), url=URL( "http://174.142.65.52:16444/transparencia/angelim/prefeitura/" + "legislacaomunicipal.faces" ), ) portal7 = Portal( - ibge_code=2601003, + ibge_code=IbgeCode(2601003), url=URL( "http://174.142.65.52:16444/transparencia/angelim/prefeitura/" + "outrosatos.faces" diff --git a/utils/src/fetch_portals/test/test_sources.py b/utils/src/fetch_portals/test/test_sources.py index 4592e52..1743b7b 100644 --- a/utils/src/fetch_portals/test/test_sources.py +++ b/utils/src/fetch_portals/test/test_sources.py @@ -17,4 +17,4 @@ def test_get_portals_from_census() -> None: for portal in portals: assert isinstance(portal, Portal) # assert len(portal.ibge_code) == 7 - assert len(portal.url.host) > 5 + assert len(str(portal.url.host or '')) > 5 From c0be8dbbd5f88ced154a728d83c69dc578e74292 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Sun, 14 Feb 2021 19:44:27 -0300 Subject: [PATCH 12/19] Fix log level for warnings --- 
utils/src/fetch_portals/main.py | 2 +- utils/src/fetch_portals/models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/src/fetch_portals/main.py b/utils/src/fetch_portals/main.py index 3107905..986108e 100644 --- a/utils/src/fetch_portals/main.py +++ b/utils/src/fetch_portals/main.py @@ -59,7 +59,7 @@ EXISTING: Final = os.getenv("FETCHPORTALS_EXISTING", "replace") MAX_RETRIES: Final = int(os.getenv("FETCHPORTALS_MAX_RETRIES", 3)) TIMEOUT: Final = float(os.getenv("FETCHPORTALS_TIMEOUT", 10.0)) -LOG_LEVEL: Final = os.getenv("FETCHPORTALS_LOG_LEVEL", "warn") +LOG_LEVEL: Final = os.getenv("FETCHPORTALS_LOG_LEVEL", "warning") def main( diff --git a/utils/src/fetch_portals/models.py b/utils/src/fetch_portals/models.py index bea04c9..e71a6c1 100644 --- a/utils/src/fetch_portals/models.py +++ b/utils/src/fetch_portals/models.py @@ -30,7 +30,7 @@ FetchMode = Literal["ping", "source"] AcceptedHttpMethod = Literal["GET", "HEAD"] IbgeCode = NewType("IbgeCode", int) # TODO: make it a UserString -LogLevel = Literal["error", "warn", "info", "debug"] +LogLevel = Literal["error", "warning", "info", "debug"] PathLike = Union[str, bytes, "os.PathLike[Any]"] From 7c0df86266674a8245399b96ed4e09cefa80f7e3 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Mon, 15 Feb 2021 21:01:53 -0300 Subject: [PATCH 13/19] Fix logging level selection --- utils/src/fetch_portals/main.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/utils/src/fetch_portals/main.py b/utils/src/fetch_portals/main.py index 986108e..e01d742 100644 --- a/utils/src/fetch_portals/main.py +++ b/utils/src/fetch_portals/main.py @@ -74,8 +74,12 @@ def main( timeout: float = TIMEOUT, log_level: LogLevel = cast(LogLevel, LOG_LEVEL), ) -> None: + """Main program entry point.""" # init logs - logging.basicConfig(format="%(asctime)s %(message)s", level=log_level) + logging.basicConfig( + format="%(asctime)s %(message)s", + level=getattr(logging, log_level.upper()) + ) # 
get a list of portals if source == "census": From fdc9035511fe07fab6d617a1126d4b1b362020b0 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Mon, 15 Feb 2021 21:02:39 -0300 Subject: [PATCH 14/19] Enhance response logging --- utils/src/fetch_portals/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/src/fetch_portals/models.py b/utils/src/fetch_portals/models.py index e71a6c1..296cf3a 100644 --- a/utils/src/fetch_portals/models.py +++ b/utils/src/fetch_portals/models.py @@ -207,7 +207,7 @@ async def fetch_all( continue # record answer if it is OK or exceeded max tries - logging.info(str(response_status)) + logging.info(f"<{url}>: {message} ({response_status})") responses.append( { "initial_url": url, From cffdf94604a67323b130ae496238dcc2bd8cbd77 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Mon, 15 Feb 2021 21:03:15 -0300 Subject: [PATCH 15/19] Add Azure Function to ping portals --- .gitignore | 8 +++++++- utils/.funcignore | 5 +++++ utils/azure_ping_portals/__init__.py | 20 ++++++++++++++++++++ utils/azure_ping_portals/function.json | 11 +++++++++++ utils/host.json | 5 +++++ 5 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 utils/.funcignore create mode 100644 utils/azure_ping_portals/__init__.py create mode 100644 utils/azure_ping_portals/function.json create mode 100644 utils/host.json diff --git a/.gitignore b/.gitignore index abcfb33..496e810 100644 --- a/.gitignore +++ b/.gitignore @@ -129,7 +129,7 @@ dmypy.json .pyre/ # Ignore -.vscode/ + censo/bin/ censo/include/ censo/share/ @@ -137,3 +137,9 @@ censo/share/ # Data notebooks/*.csv *.zip + +# Azure stuff +.vscode +__azurite* +__blob* +local.settings.json \ No newline at end of file diff --git a/utils/.funcignore b/utils/.funcignore new file mode 100644 index 0000000..010071a --- /dev/null +++ b/utils/.funcignore @@ -0,0 +1,5 @@ +.env +.venv +local.settings.json +__azurite* +__pycache__ \ No newline at end of file diff --git 
a/utils/azure_ping_portals/__init__.py b/utils/azure_ping_portals/__init__.py new file mode 100644 index 0000000..4507381 --- /dev/null +++ b/utils/azure_ping_portals/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2020 Open Knowledge Brasil + +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +"""Periodically check the availability of Official Gazettes portals. +""" + +import logging + +import azure.functions as func +from fetch_portals.main import main as fetch + + +def main(timer: func.TimerRequest): + """Ping Querido Diario Census portals to check their availability.""" + logging.info(f"Starting function (past due {timer.past_due})") + fetch(mode="ping", existing="append", callback="kaggle") + logging.info("Finished checking portals from Census.") diff --git a/utils/azure_ping_portals/function.json b/utils/azure_ping_portals/function.json new file mode 100644 index 0000000..abb1e13 --- /dev/null +++ b/utils/azure_ping_portals/function.json @@ -0,0 +1,11 @@ +{ + "scriptFile": "__init__.py", + "bindings": [ + { + "name": "timer", + "type": "timerTrigger", + "direction": "in", + "schedule": "0 0 */3 * * *" + } + ] +} \ No newline at end of file diff --git a/utils/host.json b/utils/host.json new file mode 100644 index 0000000..f0abc88 --- /dev/null +++ b/utils/host.json @@ -0,0 +1,5 @@ +{ + "functionTimeout": "00:10:00", + "version": "2.0", + "watchDirectories": [ "src" ] +} \ No newline at end of file From d95dc8f2773d04fa59d7238cbfbb8f7d420c9b60 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Tue, 16 Feb 2021 17:54:10 -0300 Subject: [PATCH 16/19] Remove space characters around attribution operator --- utils/.env.template | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/utils/.env.template b/utils/.env.template index a97b341..03f43af 100644 --- a/utils/.env.template +++ b/utils/.env.template @@ -6,31 +6,31 @@ ## 
General settings ### log verbosity level - choose between 'error', 'warn', 'info', 'debug' -FETCHPORTALS_LOG_LEVEL = "debug" +FETCHPORTALS_LOG_LEVEL="debug" ### set source of portal URLs and geo IDs - only 'census' is currently accepted -FETCHPORTALS_SOURCE = "census" +FETCHPORTALS_SOURCE="census" ### control whether to only ping portal status, or to fetch its source code -FETCHPORTALS_MODE = "ping" +FETCHPORTALS_MODE="ping" ### set maximum number of tries and time waiting -FETCHPORTALS_MAX_RETRIES = 3 -FETCHPORTALS_TIMEOUT = 10.0 +FETCHPORTALS_MAX_RETRIES=3 +FETCHPORTALS_TIMEOUT=10.0 ### control callback to process and/or save the retrived data -FETCHPORTALS_CALLBACK = "kaggle" +FETCHPORTALS_CALLBACK="kaggle" ### control what to do when destination file already exists - must be one of ### 'replace', 'append' or 'skip' -FETCHPORTALS_EXISTING = "replace" +FETCHPORTALS_EXISTING="replace" ### set a local directory where retrieved files may persist -FETCHPORTALS_LOCALDIR = "./data" +FETCHPORTALS_LOCALDIR="./data" ## Kaggle settings -KAGGLE_USERNAME = "exampleuser" -KAGGLE_KEY = "12345678abcdefgh" -KAGGLE_DATASET = "bcbernardo/censusqd2020" -KAGGLE_FILE = "portals-availability.csv" \ No newline at end of file +KAGGLE_USERNAME="exampleuser" +KAGGLE_KEY="12345678abcdefgh" +KAGGLE_DATASET="bcbernardo/censusqd2020" +KAGGLE_FILE="portals-availability.csv" \ No newline at end of file From a02e42af7103c4dde035ac2852e20ed96c2c5b43 Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Tue, 16 Feb 2021 18:49:13 -0300 Subject: [PATCH 17/19] Add subrepo documentation --- utils/README.md | 90 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 utils/README.md diff --git a/utils/README.md b/utils/README.md new file mode 100644 index 0000000..42ffe06 --- /dev/null +++ b/utils/README.md @@ -0,0 +1,90 @@ +# Utilitários do Censo Querido Diário + +Este sub-repositório inclui rotinas e funções elaboradas pela comunidade para 
+processar, analisar e salvar os resultados do Censo Querido Diário. + +Atualmente, o único utilitário desenvolvido é o pacote `fetch_portals`, que se +comunica com todos os endereços web cadastrados no Censo para checar quais +portais estão *online* e/ou para obter seu código-fonte. + +Contribuições na forma de novos pacotes e utilitários para processar os dados +do Censo são bem-vindas. Cheque o [CONTRIBUTING.md](../CONTRIBUTING.md) do +projeto para mais detalhes de como ajudar nas diferentes tarefas do Censo, bem +como a seção [Adicionando um novo utilitário](#adicionando-um-novo-utilitário) +para instruções específicas de como criar uma nova rotina de pré-processamento. + +Se tiver alguma dúvida ou quiser ter uma visão geral dos próximos passos do +Censo, não hesite em visitar as +[issues](https://github.com/okfn-brasil/censo-querido-diario/issues) do projeto +ou entrar em contato pelo [Discord](https://discord.gg/M6ep5VED). + +## Instalação + +Os utilitários contidos nesse sub-repositório podem ser instalados como pacotes +Python avulsos. Para isso, você deve ter instalada na sua máquina uma versão +Python compatível (3.7 ou superior). + +Para instalar a partir do repositório, rode em um terminal de linha de comando: + +```bash +$ git clone https://github.com/okfn-brasil/censo-querido-diario.git +$ cd censo-querido-diario/utils +$ python -m venv .venv +$ source .venv/bin/activate # no PowerShell: $ .venv/Scripts\activate.ps1 +(.venv) $ python -m pip install . +``` + +Para que a instalação funcione e você possa usar o comando `fetch-portals` da +linha de comando, é necessário que antes você exporte algumas variáveis de +ambiente, que controlam o funcionamento do programa. + +Para isso, edite o arquivo `.env.template`, contido no diretório +`censo-querido-diario/utils`, alterando as configurações necessárias. +**Importante:** para rodar a versão atual do coletor de portais, você deve, no +mínimo, alterar as variáveis de ambiente iniciadas em `KAGGLE_*`. 
Você precisa +ter permissão de escrita no dataset utilizado para salvar os resultados. + +Quando finalizar a edição, salve o arquivo `.env.template` e renomeie-o para +`.env`, apenas. + + +## Rodando o programa + +Com o utilitário instalado como um pacote e o respectivo ambiente virtual +ativado, basta rodar o comando `fetch-portals` na linha de comando. Esse +comando fará requisições a todos os portais de publicação de diários oficiais +mapeados no Censo, e salvará os resultados no dataset do Kaggle indicado no +arquivo `.env`. + +```bash +(.venv) $ fetch-portals +``` + +## Adicionando um novo utilitário + +Para desenvolver um pacote Python que consuma e processe os dados do Censo +Querido Diário, [faça um +*fork*](https://github.com/okfn-brasil/censo-querido-diario/fork) do +repositório para a sua própria conta e adicione os scripts em um sub-diretório +da pasta `src`. Para o nome do diretório e dos módulos, utilize apenas letras +minúsculas e *underscores* (\_). Insira também um arquivo `__init__.py` vazio +no diretório criado. 
From 95d1463cd4e00931bc03aac46f9fc13212d536ba Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Wed, 17 Feb 2021 15:13:16 -0300 Subject: [PATCH 18/19] Dockerize portal fetching --- .gitignore | 1 + utils/.dockerignore | 3 ++ utils/README.md | 56 ++++++++++++++++++++++++++++++---- utils/docker-compose.yml | 11 +++++++ utils/fetch_portals.Dockerfile | 17 +++++++++++ 5 files changed, 82 insertions(+), 6 deletions(-) create mode 100644 utils/.dockerignore create mode 100644 utils/docker-compose.yml create mode 100644 utils/fetch_portals.Dockerfile diff --git a/.gitignore b/.gitignore index 496e810..8ef116c 100644 --- a/.gitignore +++ b/.gitignore @@ -137,6 +137,7 @@ censo/share/ # Data notebooks/*.csv *.zip +cache # Azure stuff .vscode diff --git a/utils/.dockerignore b/utils/.dockerignore new file mode 100644 index 0000000..85377aa --- /dev/null +++ b/utils/.dockerignore @@ -0,0 +1,3 @@ +__pycache* +.*cache +.venv \ No newline at end of file diff --git a/utils/README.md b/utils/README.md index 42ffe06..5bf9183 100644 --- a/utils/README.md +++ b/utils/README.md @@ -18,7 +18,39 @@ Censo, não hesite em visitar as [issues](https://github.com/okfn-brasil/censo-querido-diario/issues) do projeto ou entrar em contato pelo [Discord](https://discord.gg/M6ep5VED). -## Instalação +## Instalação e execução + +### Com o Docker (recomendado) + +A forma mais simples de rodar os utilitários é utilizando o Docker. Você deve +ter instalado o Docker Community Edition. Encontre a versão apropriada para o +seu sistema [aqui][Docker CE]. Você também deve ter o [git] instalado para fazer download do repositório. + +Para instalar os utilitários, abra um terminal de linha de comando e rode os +seguintes comandos: + +```bash +$ git clone https://github.com/okfn-brasil/censo-querido-diario.git +$ cd censo-querido-diario/utils +``` + +Em um explorador de arquivos, encontre o diretório onde você fez download do +repositório e abra o arquivo `censo-querido-diario/utils/.env.template`. 
Adapte +as configurações presentes no arquivo de acordo com os dados que pretende obter +(especialmente as iniciadas em `KAGGLE_*`, se for exportar para o Kaggle). +Salve o arquivo modificado e renomeie-o para `.env` (sem o `.template` no final). + +Para inicializar a checagem dos portais, basta voltar ao terminal e inserir o +comando: + +```bash +$ docker-compose up +``` + +[Docker CE]: https://hub.docker.com/search?offering=community&type=edition +[git]: https://git-scm.com/ + +### Como pacote Python Os utilitários contidos nesse sub-repositório podem ser instalados como pacotes Python avulsos. Para isso, você deve ter instalada na sua máquina uma versão @@ -67,8 +99,6 @@ PS> Install-Module -Name Set-PsEnv --> -## Rodando o programa - Com o utilitário instalado como um pacote e o respectivo ambiente virtual ativado, basta rodar o comando `fetch-portals` na linha de comando. Esse comando fará requisições a todos os portais de publicação de diários oficiais @@ -85,6 +115,20 @@ Para desenvolver um pacote Python que consuma e processe os dados do Censo Querido Diário, [faça um *fork*](https://github.com/okfn-brasil/censo-querido-diario/fork) do repositório para a sua própria conta e adicione os scripts em um sub-diretório -da pasta `src`. Para o nome do diretório e dos módulos, utilize apenas letras -minúsculas e *underscores* (\_). Insira também um arquivo `__init__.py` vazio -no diretório criado. +da pasta `censo-querido-diario/utils/src`. + +Para o nome do diretório e dos módulos, utilize apenas letras minúsculas e +*underscores* (\_). Insira também um arquivo `__init__.py` vazio no diretório +criado, e adicione as dependências utilizadas na lista de pacotes abaixo do +item `install_requires` do arquivo +[`censo-querido-diario/utils/setup.cfg`](./setup.cfg). 
+ +Se você quiser que o utilitário seja acessível por meio do Docker, crie um +arquivo chamado `.Dockerfile` em +`censo-querido-diario/utils`, contendo as instruções de construção do contêiner +(veja a [referência do Dockerfile]). Em seguida, adicione uma entrada no +arquivo `docker-compose.yml` localizado no mesmo diretório (veja a [referência +do Docker Compose] para mais detalhes). + +[referência do Dockerfile]: https://docs.docker.com/engine/reference/builder/ +[referência do Docker Compose]: https://docs.docker.com/compose/compose-file/ diff --git a/utils/docker-compose.yml b/utils/docker-compose.yml new file mode 100644 index 0000000..a96e8e0 --- /dev/null +++ b/utils/docker-compose.yml @@ -0,0 +1,11 @@ +version: '3.8' +services: + fetch-portals: + build: + context: . + dockerfile: fetch_portals.Dockerfile + volumes: + - ./cache:/usr/src/data + env_file: .env + environment: + - FETCHPORTALS_LOCALDIR=/usr/src/data \ No newline at end of file diff --git a/utils/fetch_portals.Dockerfile b/utils/fetch_portals.Dockerfile new file mode 100644 index 0000000..ecd8683 --- /dev/null +++ b/utils/fetch_portals.Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.8.5-slim + +# Setup env +ENV LANG C.UTF-8 +ENV LC_ALL C.UTF-8 +ENV PYTHONDONTWRITEBYTECODE 1 +ENV PYTHONFAULTHANDLER 1 + +RUN mkdir /usr/src/app +WORKDIR /usr/src/app + +COPY . . + +RUN python -m pip install . 
+ +# Run the executable +ENTRYPOINT ["fetch-portals"] From 29e451450587ed721bc8e72287e8530235b7779d Mon Sep 17 00:00:00 2001 From: Bernardo Chrispim Baron Date: Wed, 17 Feb 2021 15:25:08 -0300 Subject: [PATCH 19/19] Update gitignore --- .gitignore | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 65e9d1d..8ef116c 100644 --- a/.gitignore +++ b/.gitignore @@ -129,11 +129,18 @@ dmypy.json .pyre/ # Ignore -.vscode/ + censo/bin/ censo/include/ censo/share/ # Data notebooks/*.csv - +*.zip +cache + +# Azure stuff +.vscode +__azurite* +__blob* +local.settings.json \ No newline at end of file