Skip to content

Commit

Permalink
feat: extraire urls de legisquebec
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Jul 29, 2024
1 parent 790292d commit 910aaf1
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 0 deletions.
1 change: 1 addition & 0 deletions alexi/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,7 @@ def analyse(self, iob: Iterable[T_obj], pdf_path: Path):

def output_json(self):
    """Save the collected metadata to ``index.json`` in the output directory.

    The URLs held by ``self.resolver.urls`` are stored, sorted, under the
    ``"urls"`` key of ``self.metadata`` before the dictionary is serialised.

    Reads ``self.metadata``, ``self.resolver.urls`` and ``self.outdir``;
    mutates ``self.metadata`` and writes the file. Returns ``None``.
    """
    # sorted() yields a list in a deterministic order for the JSON output.
    self.metadata["urls"] = sorted(self.resolver.urls)
    index_path = self.outdir / "index.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # pinned to UTF-8 rather than left to the platform's locale default.
    with open(index_path, "wt", encoding="utf-8") as outfh:
        LOGGER.info("Génération de %s", index_path)
        json.dump(self.metadata, outfh, indent=2, ensure_ascii=False)
Expand Down
2 changes: 2 additions & 0 deletions alexi/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def __init__(self, metadata: Optional[dict] = None):
self.metadata = {"docs": {}} if metadata is None else metadata
self.numeros = {}
self.titles = {}
self.urls = set()
for docpath, info in self.metadata["docs"].items():
self.numeros[info["numero"]] = docpath
self.titles[normalize_title(info["titre"])] = docpath
Expand Down Expand Up @@ -202,6 +203,7 @@ def resolve_external(self, text: str) -> Optional[str]:
url = "https://www.legisquebec.gouv.qc.ca/fr/document/lc/Q-2"
else:
return None
self.urls.add(url)
for m in SEC_RE.finditer(text):
sectype = m.group("sec")
num = m.group("num")
Expand Down

0 comments on commit 910aaf1

Please sign in to comment.