Skip to content

Commit

Permalink
#673 atualiza spider de rj-campos para usar start_date
Browse files Browse the repository at this point in the history
  • Loading branch information
slfabio committed Nov 13, 2024
1 parent 578f7a9 commit 860ba26
Showing 1 changed file with 39 additions and 18 deletions.
57 changes: 39 additions & 18 deletions data_collection/gazette/spiders/rj/rj_campos_goytacazes.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,76 @@
import re
from datetime import date

import dateparser
from fuzzywuzzy import process
from scrapy import Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjCampoGoytacazesSpider(BaseGazetteSpider):
name = "rj_campos_goytacazes"
TERRITORY_ID = "3301009"

allowed_domains = ["www.campos.rj.gov.br"]
name = "rj_campos_goytacazes"
start_urls = [
"https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15"
start_urls = ["https://www.campos.rj.gov.br/diario-oficial.php"]
start_date = date(2013, 11, 1)
months = [
"janeiro",
"fevereiro",
"março",
"abril",
"maio",
"junho",
"julho",
"agosto",
"setembro",
"outubro",
"novembro",
"dezembro",
]

def parse(self, response):
"""
@url https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15
@returns requests 1
@returns items 15 15
@scrapes date file_urls is_extra_edition power
"""

for element in response.css("ul.ul-licitacoes li"):
gazette_data = element.css("h4::text")
gazette_text = element.css("h4::text").get("")

date_re = re.search(r"(\d{2} de (.*) de \d{4})", gazette_text)
if not date_re:
continue

date = date_re.group(0)
# The extra edition for August 28th, 2018 has a typo in the month name.
date = date.replace("Agosoto", "Agosto")
# The edition for December 17th, 2012 has a typo in the month name.
date = date.replace("Dezembrbo", "Dezembro")
date = date_re.group(0).lower()
month = date_re.group(2).lower()
if month not in self.months:
correct_month, id_fuzzy = process.extractOne(month, self.months)
date = date.replace(month, correct_month)
self.logger.warning(
f' Erro de digitação em "{gazette_text}". CORRIGIDO DE {month} PARA {correct_month}'
)

date = dateparser.parse(date, languages=["pt"]).date()
if date > self.end_date:
continue
if date < self.start_date:
return

edition_number = gazette_data.re_first(r"Edição.*\s(\d+)")

path_to_gazette = element.css("a::attr(href)").get().strip()
# For editions dated November 17th, 2017 or earlier, the path to the gazette
# PDF is relative.
if path_to_gazette.startswith("up/diario_oficial.php"):
path_to_gazette = response.urljoin(path_to_gazette)

is_extra_edition = gazette_text.startswith("Suplemento")
is_extra_edition = bool(
re.search(r"extra|supl|revis", gazette_text, re.IGNORECASE)
)

yield Gazette(
date=date,
file_urls=[path_to_gazette],
edition_number=edition_number,
is_extra_edition=is_extra_edition,
file_urls=[path_to_gazette],
power="executive",
)

Expand Down

0 comments on commit 860ba26

Please sign in to comment.