Skip to content

Commit

Permalink
feat(xmllib): check tags of richtext values (RDU-58) (#1282)
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum authored Nov 21, 2024
1 parent 3728089 commit 289251a
Show file tree
Hide file tree
Showing 12 changed files with 110 additions and 26 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ GitHub's dependabot is configured to automatically create a version bumping PR i
Version bumping PRs can also be created manually: run `uv lock --upgrade` and create a PR from the resulting changes.

All developers working with the DSP-TOOLS repository should regularly execute `uv self update` to update uv,
and `uv sync` to update the dependencies from `uv.lock`.
and `uv sync --upgrade` to update the dependencies from `uv.lock`.



Expand Down
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/excel2json/json_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from dsp_tools.commands.excel2json.models.input_error import MissingValuesProblem
from dsp_tools.commands.excel2json.models.input_error import MoreThanOneRowProblem
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.models.input_error import RequiredColumnMissingProblem
from dsp_tools.commands.excel2json.models.json_header import Descriptions
from dsp_tools.commands.excel2json.models.json_header import EmptyJsonHeader
Expand All @@ -29,6 +28,7 @@
from dsp_tools.commands.excel2json.utils import find_missing_required_values
from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets
from dsp_tools.models.exceptions import InputError
from dsp_tools.models.problems import Problem
from dsp_tools.utils.uri_util import is_uri


Expand Down
12 changes: 2 additions & 10 deletions src/dsp_tools/commands/excel2json/models/input_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

from dataclasses import dataclass
from typing import Any
from typing import Protocol

from dsp_tools.models.problems import Problem

separator = "\n "
list_separator = "\n - "
Expand All @@ -11,15 +12,6 @@
grand_separator = "\n\n---------------------------------------\n\n"


class Problem(Protocol):
    """
    Information about input errors.

    This is a structural interface (`typing.Protocol`):
    the only member it declares is `execute_error_protocol()`.
    """

    def execute_error_protocol(self) -> str:
        """
        This function initiates all the steps for successful problem communication with the user.

        Returns:
            a string, presumably the message to be shown to the user (to be confirmed against the implementations)
        """


@dataclass(frozen=True)
class PositionInExcel:
"""This class contains the information about the position of a value in the excel."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from loguru import logger

from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.new_lists.models.deserialise import Columns
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ExcelSheet
from dsp_tools.commands.excel2json.new_lists.models.input_error import CollectedSheetProblems
Expand All @@ -35,6 +34,7 @@
from dsp_tools.commands.excel2json.new_lists.utils import get_lang_string_from_column_name
from dsp_tools.models.custom_warnings import DspToolsUserWarning
from dsp_tools.models.exceptions import InputError
from dsp_tools.models.problems import Problem


def make_all_excel_compliance_checks(sheet_list: list[ExcelSheet]) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@

from dsp_tools.commands.excel2json.models.input_error import ExcelFileProblem
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.models.input_error import grand_separator
from dsp_tools.commands.excel2json.models.input_error import list_separator
from dsp_tools.commands.excel2json.models.input_error import medium_separator
from dsp_tools.models.problems import Problem


@dataclass(frozen=True)
Expand Down
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/excel2json/properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from dsp_tools.commands.excel2json.models.input_error import MissingValuesProblem
from dsp_tools.commands.excel2json.models.input_error import MoreThanOneSheetProblem
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.models.input_error import PropertyProblem
from dsp_tools.commands.excel2json.models.ontology import GuiAttributes
from dsp_tools.commands.excel2json.models.ontology import OntoProperty
Expand All @@ -34,6 +33,7 @@
from dsp_tools.commands.excel2json.utils import get_wrong_row_numbers
from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets
from dsp_tools.models.exceptions import InputError
from dsp_tools.models.problems import Problem

languages = ["en", "de", "fr", "it", "rm"]
language_label_col = ["label_en", "label_de", "label_fr", "label_it", "label_rm"]
Expand Down
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/excel2json/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from dsp_tools.commands.excel2json.models.input_error import MandatorySheetsMissingProblem
from dsp_tools.commands.excel2json.models.input_error import MissingValuesProblem
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.models.input_error import ResourceSheetNotListedProblem
from dsp_tools.commands.excel2json.models.ontology import OntoResource
from dsp_tools.commands.excel2json.models.ontology import ResourceCardinality
Expand All @@ -30,6 +29,7 @@
from dsp_tools.commands.excel2json.utils import get_labels
from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets
from dsp_tools.models.exceptions import InputError
from dsp_tools.models.problems import Problem

languages = ["en", "de", "fr", "it", "rm"]

Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,14 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Protocol

import pandas as pd

from dsp_tools.models.problems import Problem

separator = "\n\n"
list_separator = "\n - "


class Problem(Protocol):
    """
    Information about input errors.

    This is a structural interface (`typing.Protocol`):
    the only member it declares is `execute_error_protocol()`.
    """

    def execute_error_protocol(self) -> str:
        """
        This function initiates all the steps for successful problem communication with the user.

        Returns:
            a string, presumably the message to be shown to the user (to be confirmed against the implementations)
        """


@dataclass(frozen=True)
class FileProblems(Problem):
"""Handle the error communication to the user in case that some files don't exist or are unsupported."""
Expand Down
10 changes: 10 additions & 0 deletions src/dsp_tools/models/problems.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from typing import Protocol


class Problem(Protocol):
    """
    Information about input errors.

    This is a structural interface (`typing.Protocol`):
    the only member it declares is `execute_error_protocol()`.
    """

    def execute_error_protocol(self) -> str:
        """
        This function initiates all the steps for successful problem communication with the user.

        Returns:
            a string, presumably the message to be shown to the user (to be confirmed against the implementations)
        """
18 changes: 18 additions & 0 deletions src/dsp_tools/xmllib/models/problems.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from dataclasses import dataclass

from dsp_tools.models.problems import Problem


@dataclass
class IllegalTagProblem(Problem):
    """Problem describing a richtext that could not be parsed as XML."""

    # message of the parser error that was caught
    orig_err_msg: str
    # the text that was handed to the parser; line/column numbers in the parser message refer to it
    pseudo_xml: str

    def execute_error_protocol(self) -> str:
        """
        Compose the message for the user: a general explanation,
        followed by the original parser message and the text that was parsed.

        Returns:
            the complete message
        """
        explanation = (
            "The XML tags contained in a richtext property (encoding=xml) must be well-formed. "
            "The special characters <, > and & are only allowed to construct a tag. "
        )
        details = (
            f"\nOriginal error message: {self.orig_err_msg}"
            f"\nEventual line/column numbers are relative to this text: {self.pseudo_xml}"
        )
        return explanation + details
2 changes: 2 additions & 0 deletions src/dsp_tools/xmllib/models/values.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from dsp_tools.models.custom_warnings import DspToolsUserWarning
from dsp_tools.utils.uri_util import is_uri
from dsp_tools.xmllib.models.config_options import Permissions
from dsp_tools.xmllib.value_checkers import check_richtext_syntax
from dsp_tools.xmllib.value_checkers import is_bool_like
from dsp_tools.xmllib.value_checkers import is_color
from dsp_tools.xmllib.value_checkers import is_date
Expand Down Expand Up @@ -371,6 +372,7 @@ def __post_init__(self) -> None:
_warn_type_mismatch(
expected_type="string", value=self.value, prop_name=self.prop_name, res_id=self.resource_id
)
check_richtext_syntax(self.value)

def serialise(self) -> etree._Element:
ele = self.make_prop()
Expand Down
70 changes: 70 additions & 0 deletions src/dsp_tools/xmllib/value_checkers.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
import json
import warnings
from typing import Any

import pandas as pd
import regex
from lxml import etree
from namedentities.core import numeric_entities # type: ignore[import-untyped]

from dsp_tools.models.custom_warnings import DspToolsUserWarning
from dsp_tools.xmllib.models.problems import IllegalTagProblem


def is_nonempty_value(value: Any) -> bool:
Expand Down Expand Up @@ -214,3 +220,67 @@ def is_dsp_ark(value: Any) -> bool:
True if it is valid, else false
"""
return bool(regex.search(r"^ark:/", str(value)))


def check_richtext_syntax(richtext: str) -> None:
    """
    DSP richtexts must be convertible into valid XML.
    This checker escapes the reserved characters `<`, `>` and `&`,
    but only if they are not part of a standard standoff tag or escape sequence.
    Then, it tries to parse the resulting XML.

    Note: Only DSP standard standoff tags are allowed in richtexts. They are documented
    [here](https://docs.dasch.swiss/latest/DSP-API/03-endpoints/api-v2/text/standard-standoff/).

    A value that is not a string is converted with `str()` before it is checked (as `is_dsp_ark()` does),
    so that a wrongly typed value leads to a warning at most, but never to a crash.
    `None` is treated as empty text.

    Args:
        richtext: richtext to check

    Warns:
        DspToolsUserWarning: if the input contains XML syntax problems
    """
    # The regex substitutions in _escape_reserved_chars() accept nothing but strings,
    # and truth-testing some objects (e.g. pd.NA) raises, so normalise the input first.
    text = "" if richtext is None else str(richtext)
    escaped_text = _escape_reserved_chars(text)
    # transform named entities (=character references) to numeric entities, e.g. &nbsp; -> &#160;
    num_ent = numeric_entities(escaped_text)
    # wrap the fragment in a single root element, otherwise it could not be parsed as a document
    pseudo_xml = f"<text>{num_ent}</text>"
    try:
        _ = etree.fromstring(pseudo_xml)
    except etree.XMLSyntaxError as err:
        prob = IllegalTagProblem(orig_err_msg=err.msg, pseudo_xml=pseudo_xml)
        warnings.warn(DspToolsUserWarning(prob.execute_error_protocol()))


def _escape_reserved_chars(richtext: str) -> str:
    """
    Replace the characters `<`, `>` and `&` by `&lt;`, `&gt;` and `&amp;`,
    unless they belong to an allowed tag or to an escape sequence.

    Args:
        richtext: text to escape; a falsy value is treated as empty text

    Returns:
        the text with the illegal occurrences of the reserved characters escaped
    """
    # defined at https://docs.dasch.swiss/latest/DSP-API/03-endpoints/api-v2/text/standard-standoff/
    standoff_tags = (
        "a( [^>]+)?",  # <a> is the only tag that can have attributes
        "p",
        "em",
        "strong",
        "u",
        "sub",
        "sup",
        "strike",
        "h1",
        "ol",
        "ul",
        "li",
        "tbody",
        "table",
        "tr",
        "td",
        "br",
        "hr",
        "pre",
        "cite",
        "blockquote",
        "code",
    )
    tag_alternatives = "|".join(standoff_tags)
    # The order matters: "&" must be handled last,
    # so that the "&lt;" / "&gt;" produced by the first two steps are recognised as escape sequences.
    substitutions = (
        # "<" that does not start an opening/closing/self-closing allowed tag
        (rf"<(?!/?({tag_alternatives})/?>)", "&lt;"),
        # ">" that does not end an allowed tag (variable-width lookbehind, hence the regex module)
        (rf"(?<!</?({tag_alternatives})/?)>", "&gt;"),
        # "&" that does not start something that looks like an entity
        (r"&(?![#a-zA-Z0-9]+;)", "&amp;"),
    )
    escaped = richtext or ""
    for pattern, replacement in substitutions:
        escaped = regex.sub(pattern, replacement, escaped)
    return escaped

0 comments on commit 289251a

Please sign in to comment.