Add base spider #1

Merged 5 commits on Mar 14, 2020. Changes shown are from 3 of the 5 commits.
1 change: 1 addition & 0 deletions .gitignore
@@ -9,6 +9,7 @@
.env
.idea/*
.tox
.scrapy
MANIFEST
build/*
dist/*
26 changes: 26 additions & 0 deletions schema/pagamento-hist.csv
@@ -0,0 +1,26 @@
original_name,field_name,field_type
Código Órgão Superior,codigo_orgao_superior,integer
Nome Órgão Superior,nome_orgao_superior,text
Código Órgão,codigo_orgao,integer
Nome Órgao,nome_orgao,text
Código Unidade Gestora,codigo_unidade_gestora,integer
Nome Unidade Gestora,nome_unidade_gestora,text
Código Grupo Despesa,codigo_grupo_despesa,integer
Nome Grupo Despesa,nome_grupo_despesa,text
Código Elemento Despesa,codigo_elemento_despesa,text
Nome Elemento Despesa,nome_elemento_despesa,text
Código Função,codigo_funcao,text
Nome Função,nome_funcao,text
Código Subfunção,codigo_subfuncao,text
Nome Subfunção,nome_subfuncao,text
Código Programa,codigo_programa,text
Nome Programa,nome_programa,text
Código Ação,codigo_acao,text
Nome Ação,nome_acao,text
Linguagem Cidadã,linguagem_cidada,text
Código Favorecido,codigo_favorecido,text
Nome Favorecido,nome_favorecido,text
Número Documento,numero_documento,text
Gestão Pagamento,gestao_pagamento,text
Data Pagamento,data_pagamento,date
Valor,valor,money_real
4 changes: 2 additions & 2 deletions transparenciagovbr/middlewares.py
@@ -53,7 +53,7 @@ def process_start_requests(self, start_requests, spider):
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
        spider.logger.info("Spider opened: %s" % spider.name)


class TransparenciagovbrDownloaderMiddleware(object):
@@ -100,4 +100,4 @@ def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
        spider.logger.info("Spider opened: %s" % spider.name)
58 changes: 28 additions & 30 deletions transparenciagovbr/settings.py
@@ -11,89 +11,87 @@
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'transparenciagovbr'
BOT_NAME = "transparenciagovbr"

SPIDER_MODULES = ['transparenciagovbr.spiders']
NEWSPIDER_MODULE = 'transparenciagovbr.spiders'
SPIDER_MODULES = ["transparenciagovbr.spiders"]
NEWSPIDER_MODULE = "transparenciagovbr.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'transparenciagovbr (+http://www.yourdomain.com)'
# USER_AGENT = 'transparenciagovbr (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'transparenciagovbr.middlewares.TransparenciagovbrSpiderMiddleware': 543,
#}
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# DOWNLOADER_MIDDLEWARES = {
# 'transparenciagovbr.middlewares.TransparenciagovbrDownloaderMiddleware': 543,
#}
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# ITEM_PIPELINES = {
# 'transparenciagovbr.pipelines.TransparenciagovbrPipeline': 300,
#}
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

FEED_EXPORTERS = {
"csv.gz": "transparenciagovbr.exporters.GzipCsvItemExporter",
}
FEED_EXPORTERS = {"csv.gz": "transparenciagovbr.exporters.GzipCsvItemExporter"}
FEED_FORMAT = "csv.gz"

REPOSITORY_PATH = Path(__file__).parent.parent
36 changes: 36 additions & 0 deletions transparenciagovbr/spiders/base.py
@@ -0,0 +1,36 @@
import csv
import io
import zipfile

import scrapy

from transparenciagovbr.utils.date import date_range, date_to_dict


class TransparenciaBaseSpider(scrapy.Spider):
    allowed_domains = ["portaldatransparencia.gov.br"]

    def start_requests(self):
        for date in date_range(
            start=self.start_date, stop=self.end_date, interval=self.publish_frequency
        ):
            yield scrapy.Request(
                self.base_url.format(**date_to_dict(date)), callback=self.parse_zip
            )

    def convert_row(self, row):
        return {
            field_name: self.schema[field_name].deserialize(row[original_field_name])
            for original_field_name, field_name in self.field_mapping.items()
        }

    def parse_zip(self, response):
        zf = zipfile.ZipFile(io.BytesIO(response.body))
        for file_info in zf.filelist:
            if file_info.filename.endswith(self.filename_suffix):
                fobj = io.TextIOWrapper(
                    zf.open(file_info.filename), encoding="iso-8859-1"
                )
                reader = csv.DictReader(fobj, delimiter=";")
                for row in reader:
                    yield self.convert_row(row)
63 changes: 63 additions & 0 deletions transparenciagovbr/spiders/pagamento-hist/spiders.py
@@ -0,0 +1,63 @@
import csv
import datetime
import io
import zipfile

import rows

from transparenciagovbr import settings
from transparenciagovbr.spiders.base import TransparenciaBaseSpider
from transparenciagovbr.utils.io import NotNullTextWrapper


class BrazilianDateField(rows.fields.DateField):
    INPUT_FORMAT = "%d/%m/%Y"


class MoneyRealField(rows.fields.DecimalField):
    @classmethod
    def deserialize(cls, value):
        """
        >>> MoneyRealField.deserialize("89188,11")
        Decimal('89188.11')
        """
        value = value.replace(",", ".")
        return super().deserialize(value)


SCHEMA_PATH = str(
    (settings.REPOSITORY_PATH / "schema" / "pagamento-hist.csv").absolute()
)
SCHEMA = rows.utils.load_schema(
    SCHEMA_PATH,
    context={
        "date": BrazilianDateField,
        "text": rows.fields.TextField,
        "integer": rows.fields.IntegerField,
        "money_real": MoneyRealField,
    },
)
FIELD_MAPPING = {
    row.original_name: row.field_name for row in rows.import_from_csv(SCHEMA_PATH)
}


class PagamentoHistSpider(TransparenciaBaseSpider):
    name = "pagamento-hist"
    base_url = "http://www.portaltransparencia.gov.br/download-de-dados/historico-gastos-diretos-pagamentos/{year}{month:02d}"
    start_date = datetime.date(2011, 1, 1)
    end_date = datetime.date(2012, 12, 31)
    publish_frequency = "monthly"
    schema = SCHEMA
    field_mapping = FIELD_MAPPING

    def parse_zip(self, response):
        zf = zipfile.ZipFile(io.BytesIO(response.body))
        assert len(zf.filelist) == 1
        fobj = NotNullTextWrapper(
            zf.open(zf.filelist[0].filename), encoding="iso-8859-1"
        )
        reader = csv.DictReader(fobj, delimiter="\t")

        for row in reader:
            yield self.convert_row(row)
111 changes: 0 additions & 111 deletions transparenciagovbr/spiders/pagamento.py

This file was deleted.
