dhdaines · dhdaines · Feb 17, 2024 · Feb 16, 2024 · Feb 16, 2024 · Feb 16, 2024
diff --git a/alexi/analyse.py b/alexi/analyse.py
@@ -6,6 +6,7 @@
 import logging
 import operator
 import re
+from collections import deque
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Iterable, Iterator, NamedTuple, Optional
@@ -22,9 +23,10 @@
 class Hyperlien(NamedTuple):
     """Hyperlien dans un bloc de texte."""
 
-    href: Optional[str]
     start: int
     end: int
+    alt: Optional[str]  # standalone text for one item of an enumeration (see match_links), else None
+    href: Optional[str]  # link target, or None when not known yet (match_links always yields None)
 
 
 @dataclass
@@ -74,40 +76,73 @@ def img(self) -> str:
 # should be done with the sequence CRF
 SECTION = r"\b(?:article|chapitre|section|sous-section|annexe)s?"
 NUMERO = r"[\d\.XIV]+"
-NUMEROS = rf"{NUMERO}(?:(?:,|\s+et)\s+{NUMERO})*"
+NUMEROS = rf"{NUMERO}(?P<numeros>(?:,|\s+(?:et|ou))\s+{NUMERO})*"  # group "numeros" is set only when more than one number is listed
 MILIEU = r"\btypes?\s+des?\s+milieux?"
 MTYPE = r"[\dA-Z]+\.\d"
-MTYPES = rf"{MTYPE}(?:(?:,|\s+et)\s+{MTYPE})*"
-REGNUM = (
-    r"(?:(?:SQ-)?\d[\d\.A-Z-]+|\((?:c\.|(?:R\.\s*)?L\.?\s*R\.?\s*Q\.?)\s*,?[^\)]+\))"
-)
-REGLEMENT = rf"règlement\s+(?:{REGNUM}|(?:de|sur|concernant).*?{REGNUM})"
-LOI = r"""
+MTYPES = rf"{MTYPE}(?P<mtypes>(?:,|\s+(?:et|ou))\s+{MTYPE})*"  # group "mtypes" is set only when more than one type is listed
+RLRQ = r"(?:c\.|(?:R\.?\s*)?[LR]\.?\s*R\.?\s*Q\.?)\s*,?[^\)]+"  # inside of a parenthesised citation; shared by REGNUM and LOI
+REGNUM = rf"(?:(?:SQ-)?\d[\d\.A-Z-]+|\({RLRQ}\))"  # either a bare number/code or a parenthesised RLRQ citation
+REGLEMENT = rf"""
+règlement\s+
+(?:
+   {REGNUM}
+  |de\s+zonage
+  |de\s+lotissement
+  |de\s+construction
+  |(?:sur\s+les|relatif\s+aux)\s+(?:PIIA|plans\s+d['’]implantation\s+et\s+d['’]intégration\s+architecturale)
+  |(?:sur\s+les|relatif\s+aux)\s+permis\s+et\s+(?:les\s+)?certificats
+  |(?:de|sur|concernant).*?{REGNUM}
+)"""
+LOI = rf"""
 (?:code\s+civil
-  |loi\s+.*?\((?:R?L\.?R\.?Q\.?|c\.),?[^\)]+\)
+  |(?:loi|code)\s+.*?\({RLRQ}\)
   |loi\s+sur\s+l['’]aménagement\s+et\s+l['’]urbanisme
+  |loi\s+sur\s+la\s+qualité\s+de\s+l['’]environnement
   |loi\s+sur\s+les\s+cités\s+et\s+villes
 )"""
 DU = r"(?:du|de\s+l['’]|de\s+la)"
 MATCHER = re.compile(
     rf"""
 (?:
-   (?:{SECTION}\s+{NUMEROS}
+   (?:{SECTION}\s+(?P<numero>{NUMEROS})
       (?:\s+{DU}\s+{SECTION}\s+{NUMERO})*
-     |{MILIEU}\s+{MTYPES})
+     |{MILIEU}\s+(?P<mtype>{MTYPES}))
    (?:\s+{DU}\s+(?:{REGLEMENT}|{LOI}))?
   |{REGLEMENT}|{LOI})
     """,
     re.IGNORECASE | re.VERBOSE,
 )
+NUMMATCH = re.compile(NUMERO, re.IGNORECASE | re.VERBOSE)  # finds each single number inside a matched "numero" list
+MTMATCH = re.compile(MTYPE, re.IGNORECASE | re.VERBOSE)  # finds each single milieu type inside a matched "mtype" list
 
 
 def match_links(text: str):
     """
     Identifier des hyperliens potentiels dans un texte.
     """
     for m in MATCHER.finditer(text):
-        yield Hyperlien(None, m.start(), m.end())
+        if m.group("numeros") is not None:  # several section numbers listed: one link per number
+            before = re.sub(r"s$", "", text[m.start() : m.start("numero")].strip())  # keyword minus a trailing "s"
+            after = text[m.end("numero") : m.end()]  # whatever the match contains after the numbers
+            for num in NUMMATCH.finditer(m.group("numero")):
+                yield Hyperlien(
+                    m.start("numero") + num.start(),  # span covers only this number within text
+                    m.start("numero") + num.end(),
+                    f"{before} {num.group()}{after}",  # alt: the reference rewritten for this number alone
+                    None,
+                )
+        elif m.group("mtypes") is not None:  # several milieu types listed: one link per type
+            before = text[m.start() : m.start("mtype")]  # kept verbatim (no plural stripping here)
+            after = text[m.end("mtype") : m.end()]
+            for mt in MTMATCH.finditer(m.group("mtype")):
+                yield Hyperlien(
+                    m.start("mtype") + mt.start(),  # span covers only this type within text
+                    m.start("mtype") + mt.end(),
+                    f"{before}{mt.group()}{after}",  # alt: the reference rewritten for this type alone
+                    None,
+                )
+        else:
+            yield Hyperlien(m.start(), m.end(), None, None)  # single reference: link the whole match, no alt
 
 
 def group_iob(words: Iterable[T_obj], key: str = "segment") -> Iterator[Bloc]:
@@ -159,6 +194,23 @@ def fromdict(self, **kwargs) -> "Element":
         el.sub = [Element.fromdict(**subel) for subel in el.sub]
         return el
 
+    def traverse(self) -> Iterator[tuple[list[str], "Element"]]:
+        """Yield (path, element) for each descendant (not self) in pre-order; path is a fresh list."""
+        d = deque(self.sub)  # work queue; a None entry marks the end of one child list
+        path: list[str] = []  # flattened [type, numero, ...] pairs of the current ancestors below self
+        while d:
+            el = d.popleft()
+            if el is None:  # sentinel: every child of the innermost ancestor has been visited
+                path.pop()  # drop that ancestor's numero
+                path.pop()  # drop that ancestor's type
+                continue
+            yield list(path), el  # snapshot: `path` is mutated below, so never hand out the live list
+            if el.sub:
+                path.append(el.type)
+                path.append(el.numero)
+                d.appendleft(None)  # pushed first so it ends up right after the children
+                d.extendleft(reversed(el.sub))  # extendleft reverses, which restores document order
+
 
 ELTYPE = r"(?i:article|chapitre|section|sous-section|titre|annexe)"
 DOTSPACEDASH = r"(?:\.|\s*[:—–-]| )"

diff --git a/alexi/extract.py b/alexi/extract.py
@@ -9,7 +9,6 @@
 import logging
 import operator
 import os
-from collections import deque
 from pathlib import Path
 from typing import Any, Iterable, Optional, TextIO
 
@@ -228,15 +227,12 @@ def make_doc_subtree(doc: Document, outfh: TextIO):
     if doc.pdfurl is not None:
         outfh.write(f'(<a target="_blank" href="{doc.pdfurl}">PDF</a>)')
     outfh.write("</li>\n")
-    top = Path(doc.fileid)
-    d = deque((el, top, 1) for el in doc.structure.sub)
-    prev_level = 1
-    while d:
-        el, parent, level = d.popleft()
+    prev_level = 0
+    for parts, el in doc.structure.traverse():
         if el.type in ("Article", "Annexe"):
-            eldir = top / el.type / el.numero
+            eldir = Path(doc.fileid, el.type, el.numero)
         else:
-            eldir = parent / el.type / el.numero
+            eldir = Path(doc.fileid, *parts, el.type, el.numero)
         if el.numero[0] == "_":
             if el.titre:
                 eltitre = el.titre
@@ -247,6 +243,7 @@ def make_doc_subtree(doc: Document, outfh: TextIO):
                 eltitre = f"{el.type} {el.numero}: {el.titre}"
             else:
                 eltitre = f"{el.type} {el.numero}"
+        level = len(parts) / 2
         while level < prev_level:
             outfh.write("</ul></details></li>\n")
             prev_level -= 1
@@ -269,9 +266,8 @@ def make_doc_subtree(doc: Document, outfh: TextIO):
                     f' (<a target="_blank" href="{doc.pdfurl}#page={el.page}">PDF</a>)'
                 )
             outfh.write(f'<li class="{el.type} leaf">{link}{pdflink}</li>\n')
-        d.extendleft((subel, eldir, level + 1) for subel in reversed(el.sub))
         prev_level = level
-    while prev_level > 1:
+    while prev_level > 0:
         outfh.write("</ul></details></li>\n")
         prev_level -= 1
     outfh.write("</ul>\n")
@@ -518,31 +514,17 @@ def output_html(self, doc: Document):
             self.output_sub_index(doc, doc.structure, path)
         self.output_element(doc, path, doc.structure)
 
-        # Walk the structure of the document (FIXME: make a generator)
-        d = deque(doc.structure.sub)
-        while d:
-            el = d.popleft()
-            if el is None:
-                path = path.parent.parent
-                continue
+        for parts, el in doc.structure.traverse():
             if el.type in ("Article", "Annexe"):
                 continue
             LOGGER.info(
                 "Path: %s Structure: %s %s [%s]",
-                path,
+                parts,
                 el.type,
                 el.numero,
                 ",".join("%s %s" % (sel.type, sel.numero) for sel in el.sub),
             )
-            self.output_element(doc, path / el.type / el.numero, el)
-            if el.sub:
-                path = path / el.type / el.numero
-                self.output_sub_index(doc, el, path)
-                d.appendleft(None)
-                LOGGER.debug("SUB: None")
-                for sel in reversed(el.sub):
-                    d.appendleft(sel)
-                    LOGGER.debug("SUB: %s %s", sel.type, sel.numero)
+            self.output_element(doc, Path(*parts, el.type, el.numero), el)
         return doc
 
 

diff --git a/alexi/format.py b/alexi/format.py
@@ -94,8 +94,9 @@ def bloc_html(self, bloc: Bloc) -> str:
                 link_text = text[link.start : link.end]
                 href = link.href
                 if href is None and self.resolver:
-                    href = self.resolver(link_text, str(self.path), self.doc)
-                    LOGGER.info("%s:%s -> %s", link_text, self.path, href)
+                    href_text = link_text if link.alt is None else link.alt
+                    href = self.resolver(href_text, str(self.path), self.doc)
+                    LOGGER.info("%s:%s -> %s", href_text, self.path, href)
                 if href is None:
                     chunks.append(link_text)
                 else: