Add base spider #1

Merged 5 commits on Mar 14, 2020. Changes shown are from 3 of the 5 commits.
1 change: 1 addition & 0 deletions .gitignore
@@ -9,6 +9,7 @@
.env
.idea/*
.tox
.scrapy
MANIFEST
build/*
dist/*
26 changes: 26 additions & 0 deletions schema/pagamento-hist.csv
@@ -0,0 +1,26 @@
original_name,field_name,field_type
Código Órgão Superior,codigo_orgao_superior,integer
Nome Órgão Superior,nome_orgao_superior,text
Código Órgão,codigo_orgao,integer
Nome Órgao,nome_orgao,text
Código Unidade Gestora,codigo_unidade_gestora,integer
Nome Unidade Gestora,nome_unidade_gestora,text
Código Grupo Despesa,codigo_grupo_despesa,integer
Nome Grupo Despesa,nome_grupo_despesa,text
Código Elemento Despesa,codigo_elemento_despesa,text
Nome Elemento Despesa,nome_elemento_despesa,text
Código Função,codigo_funcao,text
Nome Função,nome_funcao,text
Código Subfunção,codigo_subfuncao,text
Nome Subfunção,nome_subfuncao,text
Código Programa,codigo_programa,text
Nome Programa,nome_programa,text
Código Ação,codigo_acao,text
Nome Ação,nome_acao,text
Linguagem Cidadã,linguagem_cidada,text
Código Favorecido,codigo_favorecido,text
Nome Favorecido,nome_favorecido,text
Número Documento,numero_documento,text
Gestão Pagamento,gestao_pagamento,text
Data Pagamento,data_pagamento,date
Valor,valor,money_real
4 changes: 2 additions & 2 deletions transparenciagovbr/middlewares.py
@@ -53,7 +53,7 @@ def process_start_requests(self, start_requests, spider):
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
        spider.logger.info("Spider opened: %s" % spider.name)


class TransparenciagovbrDownloaderMiddleware(object):
@@ -100,4 +100,4 @@ def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
        spider.logger.info("Spider opened: %s" % spider.name)
58 changes: 28 additions & 30 deletions transparenciagovbr/settings.py
@@ -11,89 +11,87 @@
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'transparenciagovbr'
BOT_NAME = "transparenciagovbr"

SPIDER_MODULES = ['transparenciagovbr.spiders']
NEWSPIDER_MODULE = 'transparenciagovbr.spiders'
SPIDER_MODULES = ["transparenciagovbr.spiders"]
NEWSPIDER_MODULE = "transparenciagovbr.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'transparenciagovbr (+http://www.yourdomain.com)'
# USER_AGENT = 'transparenciagovbr (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'transparenciagovbr.middlewares.TransparenciagovbrSpiderMiddleware': 543,
#}
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# DOWNLOADER_MIDDLEWARES = {
# 'transparenciagovbr.middlewares.TransparenciagovbrDownloaderMiddleware': 543,
#}
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# ITEM_PIPELINES = {
# 'transparenciagovbr.pipelines.TransparenciagovbrPipeline': 300,
#}
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

FEED_EXPORTERS = {
"csv.gz": "transparenciagovbr.exporters.GzipCsvItemExporter",
}
FEED_EXPORTERS = {"csv.gz": "transparenciagovbr.exporters.GzipCsvItemExporter"}
FEED_FORMAT = "csv.gz"

REPOSITORY_PATH = Path(__file__).parent.parent
36 changes: 36 additions & 0 deletions transparenciagovbr/spiders/base.py
@@ -0,0 +1,36 @@
import csv
import io
import zipfile

import scrapy

from transparenciagovbr.utils.date import date_range, date_to_dict


class TransparenciaBaseSpider(scrapy.Spider):
    allowed_domains = ["portaldatransparencia.gov.br"]

    def start_requests(self):
        for date in date_range(
            start=self.start_date, stop=self.end_date, interval=self.publish_frequency
        ):
            yield scrapy.Request(
                self.base_url.format(**date_to_dict(date)), callback=self.parse_zip
            )

    def convert_row(self, row):
        return {
            field_name: self.schema[field_name].deserialize(row[original_field_name])
            for original_field_name, field_name in self.field_mapping.items()
        }

    def parse_zip(self, response):
        zf = zipfile.ZipFile(io.BytesIO(response.body))
        for file_info in zf.filelist:
            if file_info.filename.endswith(self.filename_suffix):
                fobj = io.TextIOWrapper(
                    zf.open(file_info.filename), encoding="iso-8859-1"
                )
                reader = csv.DictReader(fobj, delimiter=";")
                for row in reader:
                    yield self.convert_row(row)
63 changes: 63 additions & 0 deletions transparenciagovbr/spiders/pagamento-hist/spiders.py
@@ -0,0 +1,63 @@
import csv
import datetime
import io
import zipfile

import rows

from transparenciagovbr import settings
from transparenciagovbr.spiders.base import TransparenciaBaseSpider
from transparenciagovbr.utils.io import NotNullTextWrapper


class BrazilianDateField(rows.fields.DateField):
    INPUT_FORMAT = "%d/%m/%Y"


class MoneyRealField(rows.fields.DecimalField):
    @classmethod
    def deserialize(cls, value):
        """
        >>> MoneyRealField.deserialize("89188,11")
        Decimal('89188.11')
        """
        value = value.replace(",", ".")
        return super().deserialize(value)


SCHEMA_PATH = str(
    (settings.REPOSITORY_PATH / "schema" / "pagamento-hist.csv").absolute()
)
SCHEMA = rows.utils.load_schema(
    SCHEMA_PATH,
    context={
        "date": BrazilianDateField,
        "text": rows.fields.TextField,
        "integer": rows.fields.IntegerField,
        "money_real": MoneyRealField,
    },
)
FIELD_MAPPING = {
    row.original_name: row.field_name for row in rows.import_from_csv(SCHEMA_PATH)
}


class PagamentoHistSpider(TransparenciaBaseSpider):
    name = "pagamento-hist"
    base_url = "http://www.portaltransparencia.gov.br/download-de-dados/historico-gastos-diretos-pagamentos/{year}{month:02d}"
    start_date = datetime.date(2011, 1, 1)
    end_date = datetime.date(2012, 12, 31)
    publish_frequency = "monthly"
    schema = SCHEMA
    field_mapping = FIELD_MAPPING

    def parse_zip(self, response):
        zf = zipfile.ZipFile(io.BytesIO(response.body))
        assert len(zf.filelist) == 1
        fobj = NotNullTextWrapper(
            zf.open(zf.filelist[0].filename), encoding="iso-8859-1"
        )
        reader = csv.DictReader(fobj, delimiter="\t")

        for row in reader:
            yield self.convert_row(row)
111 changes: 0 additions & 111 deletions transparenciagovbr/spiders/pagamento.py

This file was deleted.
