diff --git a/alexi/index.py b/alexi/index.py index b1fa0d1..8a2ada1 100644 --- a/alexi/index.py +++ b/alexi/index.py @@ -5,12 +5,12 @@ import json import logging import re -from pathlib import Path from dataclasses import dataclass +from pathlib import Path from bs4 import BeautifulSoup -from lunr import lunr, get_default_builder, trimmer -from lunr.pipeline import Pipeline +from lunr import get_default_builder, lunr, trimmer # type: ignore +from lunr.pipeline import Pipeline # type: ignore from unidecode import unidecode LOGGER = logging.getLogger("index") @@ -24,10 +24,11 @@ class Document: def body_text(soup: BeautifulSoup): - body = soup.div(id="body")[0] - for header in body(class_="header"): + body = soup.find_all("div", id="body") + assert body is not None + for header in body[0](class_="header"): header.extract() - for img in body("img"): + for img in body[0]("img"): alt = soup.new_tag("p") alt.string = img["alt"] img.replace_with(alt) @@ -68,15 +69,22 @@ def index(indir: Path, outdir: Path) -> None: if "Document" in section["class"]: LOGGER.info("Texte complet de %s ne sera pas indexé", title) continue - url = section.a["href"] + a = section.a + assert a is not None + url = a["href"] + assert not isinstance(url, list) # Assume it is a relative URL (we made it) LOGGER.info("Traitement: %s: %s", title, indir / url) with open(indir / url, "rt") as infh: subsoup = BeautifulSoup(infh, features="lxml") textes[url] = {"titre": title, "texte": body_text(subsoup)} for text in soup.select("li.leaf"): - title = text.a.text - url = text.a["href"] + assert text is not None + a = text.a + assert a is not None + title = a.text + url = a["href"] + assert not isinstance(url, list) LOGGER.info("Traitement: %s: %s", title, indir / url) with open(indir / url, "rt") as infh: subsoup = BeautifulSoup(infh, features="lxml") diff --git a/alexi/search.py b/alexi/search.py index dabf11f..e38309c 100644 --- a/alexi/search.py +++ b/alexi/search.py @@ -6,8 +6,9 @@ from pathlib import Path from typing import List -from lunr.languages import get_nltk_builder from lunr.index import Index +from lunr.languages import get_nltk_builder # type: ignore + from alexi.index import unifold # This is just here to register the necessary pipeline functions