Skip to content

Commit

Permalink
chore: gossage pour satisfaire mypy et compagnie
Browse files (browse the repository at this point in the history)
  • Loading branch information
dhdaines committed Jul 5, 2024
1 parent da3bddc commit 0cdc33c
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 12 deletions.
28 changes: 18 additions & 10 deletions alexi/index.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,13 +5,13 @@
import json
import logging
import re
from pathlib import Path
from dataclasses import dataclass
from pathlib import Path

from bs4 import BeautifulSoup
from lunr import lunr, get_default_builder, trimmer
from lunr.pipeline import Pipeline
from unidecode import unidecode
from lunr import get_default_builder, lunr, trimmer # type: ignore
from lunr.pipeline import Pipeline # type: ignore
from unidecode import unidecode # type: ignore

LOGGER = logging.getLogger("index")

Expand All @@ -24,10 +24,11 @@ class Document:


def body_text(soup: BeautifulSoup):
body = soup.div(id="body")[0]
for header in body(class_="header"):
body = soup.find_all("div", id="body")
assert body is not None
for header in body[0](class_="header"):
header.extract()
for img in body("img"):
for img in body[0]("img"):
alt = soup.new_tag("p")
alt.string = img["alt"]
img.replace_with(alt)
Expand Down Expand Up @@ -68,15 +69,22 @@ def index(indir: Path, outdir: Path) -> None:
if "Document" in section["class"]:
LOGGER.info("Texte complet de %s ne sera pas indexé", title)
continue
url = section.a["href"]
a = section.a
assert a is not None
url = a["href"]
assert not isinstance(url, list)
# Assume it is a relative URL (we made it)
LOGGER.info("Traitement: %s: %s", title, indir / url)
with open(indir / url, "rt") as infh:
subsoup = BeautifulSoup(infh, features="lxml")
textes[url] = {"titre": title, "texte": body_text(subsoup)}
for text in soup.select("li.leaf"):
title = text.a.text
url = text.a["href"]
assert text is not None
a = text.a
assert a is not None
title = a.text
url = a["href"]
assert not isinstance(url, list)
LOGGER.info("Traitement: %s: %s", title, indir / url)
with open(indir / url, "rt") as infh:
subsoup = BeautifulSoup(infh, features="lxml")
Expand Down
5 changes: 3 additions & 2 deletions alexi/search.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,8 +6,9 @@
from pathlib import Path
from typing import List

from lunr.languages import get_nltk_builder
from lunr.index import Index
from lunr.index import Index # type: ignore
from lunr.languages import get_nltk_builder # type: ignore

from alexi.index import unifold

# This is just here to register the necessary pipeline functions
Expand Down

0 comments on commit 0cdc33c

Please sign in to comment.