From 2e420c0d4b72f6a914e124565c7ca649d0b93764 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 30 Jul 2024 15:59:30 -0400 Subject: [PATCH] fix: verifier presence de zonage --- alexi/analyse.py | 10 ++++++---- alexi/extract.py | 4 +++- alexi/link.py | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/alexi/analyse.py b/alexi/analyse.py index c5dd99d..15ab897 100644 --- a/alexi/analyse.py +++ b/alexi/analyse.py @@ -9,7 +9,7 @@ from collections import deque from dataclasses import dataclass, field from pathlib import Path -from typing import Iterable, Iterator, NamedTuple, Optional +from typing import Iterable, Iterator, NamedTuple, Optional, Union from pdfplumber.utils.geometry import T_bbox, calculate_area, merge_bboxes @@ -481,7 +481,7 @@ def __call__( return doc -def extract_zonage(doc: Document) -> dict[str, dict[str, dict[str, str]]]: +def extract_zonage(doc: Document) -> Union[dict[str, dict[str, dict[str, str]]], None]: """ Extraire les éléments du zonage d'un règlement et générer des metadonnées pour l'identification des hyperliens et la @@ -490,7 +490,7 @@ def extract_zonage(doc: Document) -> dict[str, dict[str, dict[str, str]]]: mz: Optional[Element] = None if "Chapitre" not in doc.paliers: LOGGER.warning("Aucun chapitre présent dans %s", doc.fileid) - return {} + return None for c in doc.paliers["Chapitre"]: if "milieux et zones" in c.titre.lower(): LOGGER.info("Extraction de milieux et zones") @@ -498,7 +498,7 @@ def extract_zonage(doc: Document) -> dict[str, dict[str, dict[str, str]]]: break if mz is None: LOGGER.info("Chapitre milieux et zones non trouvé") - return {} + return None top = Path(doc.fileid) / "Chapitre" / mz.numero metadata: dict[str, dict[str, dict[str, str]]] = { "categorie_milieu": {}, @@ -520,4 +520,6 @@ def extract_zonage(doc: Document) -> dict[str, dict[str, dict[str, str]]]: "titre": m.group(2), "url": str(subsecdir), } + if len(metadata["categorie_milieu"]) == 0 and len(metadata["milieu"]) == 0: + return None return metadata diff --git a/alexi/extract.py b/alexi/extract.py index 8e049bc..2d9f39d 100644 --- a/alexi/extract.py +++ b/alexi/extract.py @@ -414,7 +414,9 @@ def __call__(self, path: Path) -> Union[Document, None]: if self.pdfdata: doc.pdfurl = self.pdfdata.get(pdf_path.name, {}).get("url", None) if "zonage" in doc.titre.lower() and "zonage" not in self.metadata: - self.metadata["zonage"] = extract_zonage(doc) + zonage = extract_zonage(doc) + if zonage is not None: + self.metadata["zonage"] = zonage return doc def analyse(self, iob: Iterable[T_obj], pdf_path: Path): diff --git a/alexi/link.py b/alexi/link.py index c822ae0..d5ced61 100644 --- a/alexi/link.py +++ b/alexi/link.py @@ -55,7 +55,7 @@ def __init__(self, metadata: Optional[dict] = None): self.metadata = {"docs": {}} if metadata is None else metadata self.numeros = {} self.titles = {} - self.urls = set() + self.urls: set[str] = set() for docpath, info in self.metadata["docs"].items(): self.numeros[info["numero"]] = docpath self.titles[normalize_title(info["titre"])] = docpath