From 289251a80a926a0c9599c4666132a0fd7f333c77 Mon Sep 17 00:00:00 2001 From: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com> Date: Thu, 21 Nov 2024 14:05:31 +0100 Subject: [PATCH] feat(xmllib): check tags of richtext values (RDU-58) (#1282) --- README.md | 2 +- .../commands/excel2json/json_header.py | 2 +- .../commands/excel2json/models/input_error.py | 12 +--- .../excel2json/new_lists/compliance_checks.py | 2 +- .../new_lists/models/input_error.py | 2 +- .../commands/excel2json/properties.py | 2 +- .../commands/excel2json/resources.py | 2 +- .../upload_files/input_error.py | 12 +--- src/dsp_tools/models/problems.py | 10 +++ src/dsp_tools/xmllib/models/problems.py | 18 +++++ src/dsp_tools/xmllib/models/values.py | 2 + src/dsp_tools/xmllib/value_checkers.py | 70 +++++++++++++++++++ 12 files changed, 110 insertions(+), 26 deletions(-) create mode 100644 src/dsp_tools/models/problems.py create mode 100644 src/dsp_tools/xmllib/models/problems.py diff --git a/README.md b/README.md index 808757db3a..add5c58b07 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ GitHub's dependabot is configured to automatically create a version bumping PR i Version bumping PRs can also be created manually: run `uv lock --upgrade` and create a PR from the resulting changes. All developers working with the DSP-TOOLS repository should regularly execute `uv self update` to update uv, -and `uv sync` to update the dependencies from `uv.lock`. +and `uv sync --upgrade` to update the dependencies from `uv.lock`. 
diff --git a/src/dsp_tools/commands/excel2json/json_header.py b/src/dsp_tools/commands/excel2json/json_header.py index 8d88593999..e1d09f5e8c 100644 --- a/src/dsp_tools/commands/excel2json/json_header.py +++ b/src/dsp_tools/commands/excel2json/json_header.py @@ -13,7 +13,6 @@ from dsp_tools.commands.excel2json.models.input_error import MissingValuesProblem from dsp_tools.commands.excel2json.models.input_error import MoreThanOneRowProblem from dsp_tools.commands.excel2json.models.input_error import PositionInExcel -from dsp_tools.commands.excel2json.models.input_error import Problem from dsp_tools.commands.excel2json.models.input_error import RequiredColumnMissingProblem from dsp_tools.commands.excel2json.models.json_header import Descriptions from dsp_tools.commands.excel2json.models.json_header import EmptyJsonHeader @@ -29,6 +28,7 @@ from dsp_tools.commands.excel2json.utils import find_missing_required_values from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets from dsp_tools.models.exceptions import InputError +from dsp_tools.models.problems import Problem from dsp_tools.utils.uri_util import is_uri diff --git a/src/dsp_tools/commands/excel2json/models/input_error.py b/src/dsp_tools/commands/excel2json/models/input_error.py index 2cd7a40e56..4e26807cb9 100644 --- a/src/dsp_tools/commands/excel2json/models/input_error.py +++ b/src/dsp_tools/commands/excel2json/models/input_error.py @@ -2,7 +2,8 @@ from dataclasses import dataclass from typing import Any -from typing import Protocol + +from dsp_tools.models.problems import Problem separator = "\n " list_separator = "\n - " @@ -11,15 +12,6 @@ grand_separator = "\n\n---------------------------------------\n\n" -class Problem(Protocol): - """Information about input errors.""" - - def execute_error_protocol(self) -> str: - """ - This function initiates all the steps for successful problem communication with the user. 
- """ - - @dataclass(frozen=True) class PositionInExcel: """This class contains the information about the position of a value in the excel.""" diff --git a/src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py b/src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py index 49c92f8999..e18b583427 100644 --- a/src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py +++ b/src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py @@ -10,7 +10,6 @@ from loguru import logger from dsp_tools.commands.excel2json.models.input_error import PositionInExcel -from dsp_tools.commands.excel2json.models.input_error import Problem from dsp_tools.commands.excel2json.new_lists.models.deserialise import Columns from dsp_tools.commands.excel2json.new_lists.models.deserialise import ExcelSheet from dsp_tools.commands.excel2json.new_lists.models.input_error import CollectedSheetProblems @@ -35,6 +34,7 @@ from dsp_tools.commands.excel2json.new_lists.utils import get_lang_string_from_column_name from dsp_tools.models.custom_warnings import DspToolsUserWarning from dsp_tools.models.exceptions import InputError +from dsp_tools.models.problems import Problem def make_all_excel_compliance_checks(sheet_list: list[ExcelSheet]) -> None: diff --git a/src/dsp_tools/commands/excel2json/new_lists/models/input_error.py b/src/dsp_tools/commands/excel2json/new_lists/models/input_error.py index 5614644588..34d2ae9690 100644 --- a/src/dsp_tools/commands/excel2json/new_lists/models/input_error.py +++ b/src/dsp_tools/commands/excel2json/new_lists/models/input_error.py @@ -7,10 +7,10 @@ from dsp_tools.commands.excel2json.models.input_error import ExcelFileProblem from dsp_tools.commands.excel2json.models.input_error import PositionInExcel -from dsp_tools.commands.excel2json.models.input_error import Problem from dsp_tools.commands.excel2json.models.input_error import grand_separator from dsp_tools.commands.excel2json.models.input_error import list_separator from 
dsp_tools.commands.excel2json.models.input_error import medium_separator +from dsp_tools.models.problems import Problem @dataclass(frozen=True) diff --git a/src/dsp_tools/commands/excel2json/properties.py b/src/dsp_tools/commands/excel2json/properties.py index 0257e3481c..bc94a934b3 100644 --- a/src/dsp_tools/commands/excel2json/properties.py +++ b/src/dsp_tools/commands/excel2json/properties.py @@ -19,7 +19,6 @@ from dsp_tools.commands.excel2json.models.input_error import MissingValuesProblem from dsp_tools.commands.excel2json.models.input_error import MoreThanOneSheetProblem from dsp_tools.commands.excel2json.models.input_error import PositionInExcel -from dsp_tools.commands.excel2json.models.input_error import Problem from dsp_tools.commands.excel2json.models.input_error import PropertyProblem from dsp_tools.commands.excel2json.models.ontology import GuiAttributes from dsp_tools.commands.excel2json.models.ontology import OntoProperty @@ -34,6 +33,7 @@ from dsp_tools.commands.excel2json.utils import get_wrong_row_numbers from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets from dsp_tools.models.exceptions import InputError +from dsp_tools.models.problems import Problem languages = ["en", "de", "fr", "it", "rm"] language_label_col = ["label_en", "label_de", "label_fr", "label_it", "label_rm"] diff --git a/src/dsp_tools/commands/excel2json/resources.py b/src/dsp_tools/commands/excel2json/resources.py index 90dbc7684c..6c2a52e279 100644 --- a/src/dsp_tools/commands/excel2json/resources.py +++ b/src/dsp_tools/commands/excel2json/resources.py @@ -18,7 +18,6 @@ from dsp_tools.commands.excel2json.models.input_error import MandatorySheetsMissingProblem from dsp_tools.commands.excel2json.models.input_error import MissingValuesProblem from dsp_tools.commands.excel2json.models.input_error import PositionInExcel -from dsp_tools.commands.excel2json.models.input_error import Problem from dsp_tools.commands.excel2json.models.input_error import 
ResourceSheetNotListedProblem from dsp_tools.commands.excel2json.models.ontology import OntoResource from dsp_tools.commands.excel2json.models.ontology import ResourceCardinality @@ -30,6 +29,7 @@ from dsp_tools.commands.excel2json.utils import get_labels from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets from dsp_tools.models.exceptions import InputError +from dsp_tools.models.problems import Problem languages = ["en", "de", "fr", "it", "rm"] diff --git a/src/dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py b/src/dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py index 8038b31652..ea62eace07 100644 --- a/src/dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py +++ b/src/dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py @@ -1,22 +1,14 @@ from dataclasses import dataclass from pathlib import Path -from typing import Protocol import pandas as pd +from dsp_tools.models.problems import Problem + separator = "\n\n" list_separator = "\n - " -class Problem(Protocol): - """Information about input errors.""" - - def execute_error_protocol(self) -> str: - """ - This function initiates all the steps for successful problem communication with the user. - """ - - @dataclass(frozen=True) class FileProblems(Problem): """Handle the error communication to the user in case that some files don't exist or are unsupported.""" diff --git a/src/dsp_tools/models/problems.py b/src/dsp_tools/models/problems.py new file mode 100644 index 0000000000..004fbe0bba --- /dev/null +++ b/src/dsp_tools/models/problems.py @@ -0,0 +1,10 @@ +from typing import Protocol + + +class Problem(Protocol): + """Information about input errors.""" + + def execute_error_protocol(self) -> str: + """ + This function initiates all the steps for successful problem communication with the user. 
+ """ diff --git a/src/dsp_tools/xmllib/models/problems.py b/src/dsp_tools/xmllib/models/problems.py new file mode 100644 index 0000000000..9378f94e80 --- /dev/null +++ b/src/dsp_tools/xmllib/models/problems.py @@ -0,0 +1,18 @@ +from dataclasses import dataclass + +from dsp_tools.models.problems import Problem + + +@dataclass +class IllegalTagProblem(Problem): + orig_err_msg: str + pseudo_xml: str + + def execute_error_protocol(self) -> str: + msg = ( + "The XML tags contained in a richtext property (encoding=xml) must be well-formed. " + "The special characters <, > and & are only allowed to construct a tag. " + ) + msg += f"\nOriginal error message: {self.orig_err_msg}" + msg += f"\nEventual line/column numbers are relative to this text: {self.pseudo_xml}" + return msg diff --git a/src/dsp_tools/xmllib/models/values.py b/src/dsp_tools/xmllib/models/values.py index d943e5ac63..e6024c8efe 100644 --- a/src/dsp_tools/xmllib/models/values.py +++ b/src/dsp_tools/xmllib/models/values.py @@ -10,6 +10,7 @@ from dsp_tools.models.custom_warnings import DspToolsUserWarning from dsp_tools.utils.uri_util import is_uri from dsp_tools.xmllib.models.config_options import Permissions +from dsp_tools.xmllib.value_checkers import check_richtext_syntax from dsp_tools.xmllib.value_checkers import is_bool_like from dsp_tools.xmllib.value_checkers import is_color from dsp_tools.xmllib.value_checkers import is_date @@ -371,6 +372,7 @@ def __post_init__(self) -> None: _warn_type_mismatch( expected_type="string", value=self.value, prop_name=self.prop_name, res_id=self.resource_id ) + check_richtext_syntax(self.value) def serialise(self) -> etree._Element: ele = self.make_prop() diff --git a/src/dsp_tools/xmllib/value_checkers.py b/src/dsp_tools/xmllib/value_checkers.py index 64496227bb..e7f937a1c5 100644 --- a/src/dsp_tools/xmllib/value_checkers.py +++ b/src/dsp_tools/xmllib/value_checkers.py @@ -1,8 +1,14 @@ import json +import warnings from typing import Any import pandas as pd import 
regex +from lxml import etree +from namedentities.core import numeric_entities # type: ignore[import-untyped] + +from dsp_tools.models.custom_warnings import DspToolsUserWarning +from dsp_tools.xmllib.models.problems import IllegalTagProblem def is_nonempty_value(value: Any) -> bool: @@ -214,3 +220,67 @@ def is_dsp_ark(value: Any) -> bool: True if it is valid, else false """ return bool(regex.search(r"^ark:/", str(value))) + + +def check_richtext_syntax(richtext: str) -> None: + """ + DSP richtexts must be convertible into valid XML. + This checker escapes the reserved characters `<`, `>` and `&`, + but only if they are not part of a standard standoff tag or escape sequence. + Then, it tries to parse the resulting XML. + + Note: Only DSP standard standoff tags are allowed in richtexts. They are documented + [here](https://docs.dasch.swiss/latest/DSP-API/03-endpoints/api-v2/text/standard-standoff/). + + Args: + richtext: richtext to check + + Warns: + DspToolsUserWarning: if the input contains XML syntax problems + """ + escaped_text = _escape_reserved_chars(richtext) + # transform named entities (=character references) to numeric entities, e.g. &nbsp;
-> &#160; + num_ent = numeric_entities(escaped_text) + pseudo_xml = f"<ignore-this>{num_ent}</ignore-this>" + try: + _ = etree.fromstring(pseudo_xml) + except etree.XMLSyntaxError as err: + prob = IllegalTagProblem(orig_err_msg=err.msg, pseudo_xml=pseudo_xml) + warnings.warn(DspToolsUserWarning(prob.execute_error_protocol())) + + +def _escape_reserved_chars(richtext: str) -> str: + allowed_tags = [ # defined at https://docs.dasch.swiss/latest/DSP-API/03-endpoints/api-v2/text/standard-standoff/ + "a( [^>]+)?", # <a> is the only tag that can have attributes + "p", + "em", + "strong", + "u", + "sub", + "sup", + "strike", + "h1", + "ol", + "ul", + "li", + "tbody", + "table", + "tr", + "td", + "br", + "hr", + "pre", + "cite", + "blockquote", + "code", + ] + allowed_tags_regex = "|".join(allowed_tags) + lookahead = rf"(?!/?({allowed_tags_regex})/?>)" + illegal_lt = rf"<{lookahead}" + lookbehind = rf"(?<!</?({allowed_tags_regex})/?)" + illegal_gt = rf"{lookbehind}>" + illegal_amp = r"&(?![#a-zA-Z0-9]+;)" + richtext = regex.sub(illegal_lt, "&lt;", richtext or "") + richtext = regex.sub(illegal_gt, "&gt;", richtext) + richtext = regex.sub(illegal_amp, "&amp;", richtext) + return richtext