From 289251a80a926a0c9599c4666132a0fd7f333c77 Mon Sep 17 00:00:00 2001
From: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com>
Date: Thu, 21 Nov 2024 14:05:31 +0100
Subject: [PATCH] feat(xmllib): check tags of richtext values (RDU-58) (#1282)
---
README.md | 2 +-
.../commands/excel2json/json_header.py | 2 +-
.../commands/excel2json/models/input_error.py | 12 +---
.../excel2json/new_lists/compliance_checks.py | 2 +-
.../new_lists/models/input_error.py | 2 +-
.../commands/excel2json/properties.py | 2 +-
.../commands/excel2json/resources.py | 2 +-
.../upload_files/input_error.py | 12 +---
src/dsp_tools/models/problems.py | 10 +++
src/dsp_tools/xmllib/models/problems.py | 18 +++++
src/dsp_tools/xmllib/models/values.py | 2 +
src/dsp_tools/xmllib/value_checkers.py | 70 +++++++++++++++++++
12 files changed, 110 insertions(+), 26 deletions(-)
create mode 100644 src/dsp_tools/models/problems.py
create mode 100644 src/dsp_tools/xmllib/models/problems.py
diff --git a/README.md b/README.md
index 808757db3a..add5c58b07 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@ GitHub's dependabot is configured to automatically create a version bumping PR i
Version bumping PRs can also be created manually: run `uv lock --upgrade` and create a PR from the resulting changes.
All developers working with the DSP-TOOLS repository should regularly execute `uv self update` to update uv,
-and `uv sync` to update the dependencies from `uv.lock`.
+and `uv sync --upgrade` to update the dependencies from `uv.lock`.
diff --git a/src/dsp_tools/commands/excel2json/json_header.py b/src/dsp_tools/commands/excel2json/json_header.py
index 8d88593999..e1d09f5e8c 100644
--- a/src/dsp_tools/commands/excel2json/json_header.py
+++ b/src/dsp_tools/commands/excel2json/json_header.py
@@ -13,7 +13,6 @@
from dsp_tools.commands.excel2json.models.input_error import MissingValuesProblem
from dsp_tools.commands.excel2json.models.input_error import MoreThanOneRowProblem
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
-from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.models.input_error import RequiredColumnMissingProblem
from dsp_tools.commands.excel2json.models.json_header import Descriptions
from dsp_tools.commands.excel2json.models.json_header import EmptyJsonHeader
@@ -29,6 +28,7 @@
from dsp_tools.commands.excel2json.utils import find_missing_required_values
from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets
from dsp_tools.models.exceptions import InputError
+from dsp_tools.models.problems import Problem
from dsp_tools.utils.uri_util import is_uri
diff --git a/src/dsp_tools/commands/excel2json/models/input_error.py b/src/dsp_tools/commands/excel2json/models/input_error.py
index 2cd7a40e56..4e26807cb9 100644
--- a/src/dsp_tools/commands/excel2json/models/input_error.py
+++ b/src/dsp_tools/commands/excel2json/models/input_error.py
@@ -2,7 +2,8 @@
from dataclasses import dataclass
from typing import Any
-from typing import Protocol
+
+from dsp_tools.models.problems import Problem
separator = "\n "
list_separator = "\n - "
@@ -11,15 +12,6 @@
grand_separator = "\n\n---------------------------------------\n\n"
-class Problem(Protocol):
- """Information about input errors."""
-
- def execute_error_protocol(self) -> str:
- """
- This function initiates all the steps for successful problem communication with the user.
- """
-
-
@dataclass(frozen=True)
class PositionInExcel:
"""This class contains the information about the position of a value in the excel."""
diff --git a/src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py b/src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py
index 49c92f8999..e18b583427 100644
--- a/src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py
+++ b/src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py
@@ -10,7 +10,6 @@
from loguru import logger
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
-from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.new_lists.models.deserialise import Columns
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ExcelSheet
from dsp_tools.commands.excel2json.new_lists.models.input_error import CollectedSheetProblems
@@ -35,6 +34,7 @@
from dsp_tools.commands.excel2json.new_lists.utils import get_lang_string_from_column_name
from dsp_tools.models.custom_warnings import DspToolsUserWarning
from dsp_tools.models.exceptions import InputError
+from dsp_tools.models.problems import Problem
def make_all_excel_compliance_checks(sheet_list: list[ExcelSheet]) -> None:
diff --git a/src/dsp_tools/commands/excel2json/new_lists/models/input_error.py b/src/dsp_tools/commands/excel2json/new_lists/models/input_error.py
index 5614644588..34d2ae9690 100644
--- a/src/dsp_tools/commands/excel2json/new_lists/models/input_error.py
+++ b/src/dsp_tools/commands/excel2json/new_lists/models/input_error.py
@@ -7,10 +7,10 @@
from dsp_tools.commands.excel2json.models.input_error import ExcelFileProblem
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
-from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.models.input_error import grand_separator
from dsp_tools.commands.excel2json.models.input_error import list_separator
from dsp_tools.commands.excel2json.models.input_error import medium_separator
+from dsp_tools.models.problems import Problem
@dataclass(frozen=True)
diff --git a/src/dsp_tools/commands/excel2json/properties.py b/src/dsp_tools/commands/excel2json/properties.py
index 0257e3481c..bc94a934b3 100644
--- a/src/dsp_tools/commands/excel2json/properties.py
+++ b/src/dsp_tools/commands/excel2json/properties.py
@@ -19,7 +19,6 @@
from dsp_tools.commands.excel2json.models.input_error import MissingValuesProblem
from dsp_tools.commands.excel2json.models.input_error import MoreThanOneSheetProblem
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
-from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.models.input_error import PropertyProblem
from dsp_tools.commands.excel2json.models.ontology import GuiAttributes
from dsp_tools.commands.excel2json.models.ontology import OntoProperty
@@ -34,6 +33,7 @@
from dsp_tools.commands.excel2json.utils import get_wrong_row_numbers
from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets
from dsp_tools.models.exceptions import InputError
+from dsp_tools.models.problems import Problem
languages = ["en", "de", "fr", "it", "rm"]
language_label_col = ["label_en", "label_de", "label_fr", "label_it", "label_rm"]
diff --git a/src/dsp_tools/commands/excel2json/resources.py b/src/dsp_tools/commands/excel2json/resources.py
index 90dbc7684c..6c2a52e279 100644
--- a/src/dsp_tools/commands/excel2json/resources.py
+++ b/src/dsp_tools/commands/excel2json/resources.py
@@ -18,7 +18,6 @@
from dsp_tools.commands.excel2json.models.input_error import MandatorySheetsMissingProblem
from dsp_tools.commands.excel2json.models.input_error import MissingValuesProblem
from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
-from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.models.input_error import ResourceSheetNotListedProblem
from dsp_tools.commands.excel2json.models.ontology import OntoResource
from dsp_tools.commands.excel2json.models.ontology import ResourceCardinality
@@ -30,6 +29,7 @@
from dsp_tools.commands.excel2json.utils import get_labels
from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets
from dsp_tools.models.exceptions import InputError
+from dsp_tools.models.problems import Problem
languages = ["en", "de", "fr", "it", "rm"]
diff --git a/src/dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py b/src/dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py
index 8038b31652..ea62eace07 100644
--- a/src/dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py
+++ b/src/dsp_tools/commands/ingest_xmlupload/upload_files/input_error.py
@@ -1,22 +1,14 @@
from dataclasses import dataclass
from pathlib import Path
-from typing import Protocol
import pandas as pd
+from dsp_tools.models.problems import Problem
+
separator = "\n\n"
list_separator = "\n - "
-class Problem(Protocol):
- """Information about input errors."""
-
- def execute_error_protocol(self) -> str:
- """
- This function initiates all the steps for successful problem communication with the user.
- """
-
-
@dataclass(frozen=True)
class FileProblems(Problem):
"""Handle the error communication to the user in case that some files don't exist or are unsupported."""
diff --git a/src/dsp_tools/models/problems.py b/src/dsp_tools/models/problems.py
new file mode 100644
index 0000000000..004fbe0bba
--- /dev/null
+++ b/src/dsp_tools/models/problems.py
@@ -0,0 +1,10 @@
+from typing import Protocol
+
+
+class Problem(Protocol):
+ """Information about input errors."""
+
+ def execute_error_protocol(self) -> str:
+ """
+ This function initiates all the steps for successful problem communication with the user.
+ """
diff --git a/src/dsp_tools/xmllib/models/problems.py b/src/dsp_tools/xmllib/models/problems.py
new file mode 100644
index 0000000000..9378f94e80
--- /dev/null
+++ b/src/dsp_tools/xmllib/models/problems.py
@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+
+from dsp_tools.models.problems import Problem
+
+
+@dataclass
+class IllegalTagProblem(Problem):
+ orig_err_msg: str
+ pseudo_xml: str
+
+ def execute_error_protocol(self) -> str:
+ msg = (
+ "The XML tags contained in a richtext property (encoding=xml) must be well-formed. "
+ "The special characters <, > and & are only allowed to construct a tag. "
+ )
+ msg += f"\nOriginal error message: {self.orig_err_msg}"
+ msg += f"\nEventual line/column numbers are relative to this text: {self.pseudo_xml}"
+ return msg
diff --git a/src/dsp_tools/xmllib/models/values.py b/src/dsp_tools/xmllib/models/values.py
index d943e5ac63..e6024c8efe 100644
--- a/src/dsp_tools/xmllib/models/values.py
+++ b/src/dsp_tools/xmllib/models/values.py
@@ -10,6 +10,7 @@
from dsp_tools.models.custom_warnings import DspToolsUserWarning
from dsp_tools.utils.uri_util import is_uri
from dsp_tools.xmllib.models.config_options import Permissions
+from dsp_tools.xmllib.value_checkers import check_richtext_syntax
from dsp_tools.xmllib.value_checkers import is_bool_like
from dsp_tools.xmllib.value_checkers import is_color
from dsp_tools.xmllib.value_checkers import is_date
@@ -371,6 +372,7 @@ def __post_init__(self) -> None:
_warn_type_mismatch(
expected_type="string", value=self.value, prop_name=self.prop_name, res_id=self.resource_id
)
+ check_richtext_syntax(self.value)
def serialise(self) -> etree._Element:
ele = self.make_prop()
diff --git a/src/dsp_tools/xmllib/value_checkers.py b/src/dsp_tools/xmllib/value_checkers.py
index 64496227bb..e7f937a1c5 100644
--- a/src/dsp_tools/xmllib/value_checkers.py
+++ b/src/dsp_tools/xmllib/value_checkers.py
@@ -1,8 +1,14 @@
import json
+import warnings
from typing import Any
import pandas as pd
import regex
+from lxml import etree
+from namedentities.core import numeric_entities # type: ignore[import-untyped]
+
+from dsp_tools.models.custom_warnings import DspToolsUserWarning
+from dsp_tools.xmllib.models.problems import IllegalTagProblem
def is_nonempty_value(value: Any) -> bool:
@@ -214,3 +220,67 @@ def is_dsp_ark(value: Any) -> bool:
True if it is valid, else false
"""
return bool(regex.search(r"^ark:/", str(value)))
+
+
+def check_richtext_syntax(richtext: str) -> None:
+ """
+ DSP richtexts must be convertible into valid XML.
+ This checker escapes the reserved characters `<`, `>` and `&`,
+ but only if they are not part of a standard standoff tag or escape sequence.
+ Then, it tries to parse the resulting XML.
+
+ Note: Only DSP standard standoff tags are allowed in richtexts. They are documented
+ [here](https://docs.dasch.swiss/latest/DSP-API/03-endpoints/api-v2/text/standard-standoff/).
+
+ Args:
+ richtext: richtext to check
+
+ Warns:
+ DspToolsUserWarning: if the input contains XML syntax problems
+ """
+ escaped_text = _escape_reserved_chars(richtext)
+ # transform named entities (=character references) to numeric entities, e.g. &nbsp; -> &#160;
+ num_ent = numeric_entities(escaped_text)
+ pseudo_xml = f"<ignore-this>{num_ent}</ignore-this>"
+ try:
+ _ = etree.fromstring(pseudo_xml)
+ except etree.XMLSyntaxError as err:
+ prob = IllegalTagProblem(orig_err_msg=err.msg, pseudo_xml=pseudo_xml)
+ warnings.warn(DspToolsUserWarning(prob.execute_error_protocol()))
+
+
+def _escape_reserved_chars(richtext: str) -> str:
+ allowed_tags = [ # defined at https://docs.dasch.swiss/latest/DSP-API/03-endpoints/api-v2/text/standard-standoff/
+ "a( [^>]+)?", # <a> is the only tag that can have attributes
+ "p",
+ "em",
+ "strong",
+ "u",
+ "sub",
+ "sup",
+ "strike",
+ "h1",
+ "ol",
+ "ul",
+ "li",
+ "tbody",
+ "table",
+ "tr",
+ "td",
+ "br",
+ "hr",
+ "pre",
+ "cite",
+ "blockquote",
+ "code",
+ ]
+ allowed_tags_regex = "|".join(allowed_tags)
+ lookahead = rf"(?!/?({allowed_tags_regex})/?>)"
+ illegal_lt = rf"<{lookahead}"
+ lookbehind = rf"(?<!</?({allowed_tags_regex})/?)"
+ illegal_gt = rf"{lookbehind}>"
+ illegal_amp = r"&(?![#a-zA-Z0-9]+;)"
+ richtext = regex.sub(illegal_lt, "&lt;", richtext or "")
+ richtext = regex.sub(illegal_gt, "&gt;", richtext)
+ richtext = regex.sub(illegal_amp, "&amp;", richtext)
+ return richtext