From 1711e3fde9ae6e9b44a7b6aedead563265135737 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher <grd@gmx.net>
Date: Mon, 23 Oct 2023 12:55:51 +0200
Subject: [PATCH 1/9] add fn:parse-html based on Validator.nu

---
 basex-core/pom.xml                            |   6 +
 .../main/java/org/basex/query/QueryError.java |   4 +
 .../java/org/basex/query/func/Function.java   |   3 +
 .../basex/query/func/html/FnParseHtml.java    | 138 ++++++++++++++++++
 .../org/basex/query/func/FnModuleTest.java    |  14 ++
 pom.xml                                       |   7 +
 6 files changed, 172 insertions(+)
 create mode 100644 basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
diff --git a/basex-core/pom.xml b/basex-core/pom.xml
index 83e0ef7818..7e34f25e58 100644
--- a/basex-core/pom.xml
+++ b/basex-core/pom.xml
@@ -52,6 +52,12 @@
       <scope>provided</scope>
       <optional>true</optional>
     </dependency>
+    <dependency>
+      <groupId>nu.validator</groupId>
+      <artifactId>htmlparser</artifactId>
+      <scope>provided</scope>
+      <optional>true</optional>
+    </dependency>
   </dependencies>
 
   <build>
diff --git a/basex-core/src/main/java/org/basex/query/QueryError.java b/basex-core/src/main/java/org/basex/query/QueryError.java
index 7e30cd41b9..3916df95db 100644
--- a/basex-core/src/main/java/org/basex/query/QueryError.java
+++ b/basex-core/src/main/java/org/basex/query/QueryError.java
@@ -615,6 +615,10 @@ public enum QueryError {
   SAXERR_X(FODC, 6, "SAX: %"),
   /** Error code. */
   RESINV_X(FODC, 7, "Resource path '%' is invalid."),
+  /** Error code. */
+  INVHTML_X(FODC, 11, "String passed to fn:parse-html is not a well-formed HTML document: %"),
+  /** Error code. */
+  INVHTMLOPT_X(FODC, 12, "Unsupported HTML parser option: %"),
 
   /** Error code. */
   FORMNUM_X(FODF, 1280, "Unknown decimal format: '%'."),
diff --git a/basex-core/src/main/java/org/basex/query/func/Function.java b/basex-core/src/main/java/org/basex/query/func/Function.java
index da69a1237c..bf29459289 100644
--- a/basex-core/src/main/java/org/basex/query/func/Function.java
+++ b/basex-core/src/main/java/org/basex/query/func/Function.java
@@ -464,6 +464,9 @@ ITEM_ZM, flag(HOF)),
   PARSE_IETF_DATE(FnParseIetfDate::new, "parse-ietf-date(value)",
       params(STRING_ZO), DATE_TIME_ZO),
   /** XQuery function. */
+  PARSE_HTML(FnParseHtml::new, "parse-html(html[,options])",
+      params(ITEM_ZO, MAP_O), DOCUMENT_NODE_ZO),
+  /** XQuery function. */
   PARSE_INTEGER(FnParseInteger::new, "parse-integer(value[,radix])",
       params(STRING_O, INTEGER_O), INTEGER_O),
   /** XQuery function. */
diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
new file mode 100644
index 0000000000..0f8a3a5801
--- /dev/null
+++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
@@ -0,0 +1,138 @@
+package org.basex.query.func.html;
+
+import static org.basex.query.QueryError.*;
+import static org.basex.util.Token.*;
+
+import java.io.*;
+
+import org.basex.build.html.*;
+import org.basex.build.xml.*;
+import org.basex.core.*;
+import org.basex.io.*;
+import org.basex.io.in.*;
+import org.basex.query.*;
+import org.basex.query.expr.*;
+import org.basex.query.func.*;
+import org.basex.query.value.item.*;
+import org.basex.query.value.node.*;
+import org.basex.query.value.seq.*;
+import org.basex.util.*;
+import org.xml.sax.*;
+
+import nu.validator.htmlparser.common.*;
+import nu.validator.htmlparser.sax.*;
+
+/**
+ * Function implementation.
+ *
+ * @author BaseX Team 2005-23, BSD License
+ * @author Gunther Rademacher
+ */
+public class FnParseHtml extends StandardFunc {
+  // TODO: handle second argument (method, html-version, encoding), produce error code FODC0012
+
+  @Override
+  public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
+    final Item value = arg(0).atomItem(qc, info);
+    return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)));
+  }
+
+  @Override
+  protected final Expr opt(final CompileContext cc) {
+    return optFirst();
+  }
+
+  /**
+   * Parses the input and creates an XML document.
+   * @param io input data
+   * @return node
+   * @throws QueryException query exception
+   */
+  protected final Item parse(final IO io) throws QueryException {
+    try {
+      if (!ParserImpl.available()) {
+        // reader could not be initialized; fall back to html:parse
+        final HtmlOptions htmlOptions = new HtmlOptions();
+        htmlOptions.set(HtmlOptions.LEXICAL, true);
+        htmlOptions.set(HtmlOptions.NONS, false);
+        return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), htmlOptions));
+      }
+      return new DBNode(new ParserImpl(io, new MainOptions()));
+    } catch(final IOException ex) {
+      throw INVHTML_X.get(info, ex);
+    }
+  }
+
+  /**
+   * Parser implementation.
+   */
+  private static class ParserImpl extends XMLParser {
+
+    /**
+     * Checks if Validator.nu is available.
+     * @return result of check
+     */
+    public static boolean available() {
+      return Reflect.available("nu.validator.htmlparser.sax.HtmlParser");
+    }
+
+    /**
+     * Constructor.
+     * @param source document source
+     * @param options main options
+     * @throws IOException I/O exception
+     */
+    ParserImpl(final IO source, final MainOptions options)
+        throws IOException {
+      super(toXml(source), options);
+    }
+
+    /**
+     * Converts an HTML document to XML.
+     * @param io io reference
+     * @return parser
+     * @throws IOException I/O exception
+     */
+    private static IO toXml(final IO io) throws IOException {
+      try(TextInput ti = new TextInput(io)) {
+
+        // tries to extract the encoding from the input
+        // TODO: remove this, in favor of encoding from options, or constant for string input
+        String enc = ti.encoding();
+        final byte[] content = ti.content();
+        // looks for a charset definition
+        final byte[] encoding = token("charset=");
+        int cs = indexOf(content, encoding);
+        if(cs > 0) {
+          // extracts the encoding string
+          cs += encoding.length;
+          int ce = cs;
+          final int cl = content.length;
+          while(++ce < cl && content[ce] > 0x28);
+          enc = string(substring(content, cs, ce));
+        }
+
+        // define output
+        final StringWriter sw = new StringWriter();
+        final nu.validator.htmlparser.sax.HtmlParser reader =
+            new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
+        reader.setFeature("http://xml.org/sax/features/namespaces", true);
+        reader.setFeature("http://xml.org/sax/features/namespace-prefixes", false);
+
+        final ContentHandler writer = new XmlSerializer(sw);
+        reader.setContentHandler(writer);
+        reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
+
+        // define input
+        final InputSource is = new InputSource(new ArrayInput(content));
+        is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8);
+        reader.parse(is);
+        return new IOContent(token(sw.toString()), io.name());
+
+      } catch(final SAXException ex) {
+        Util.errln(ex);
+        throw INVHTML_X.getIO(ex.getLocalizedMessage());
+      }
+    }
+  }
+}
diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
index 90325bd7d4..a57d7be0d0 100644
--- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
@@ -1438,6 +1438,20 @@ public final class FnModuleTest extends SandboxTest {
     query("let $n := <li/> return " + func.args(" ($n, $n)"), "<li/>");
   }
 
+  /** Test method. */
+  @Test public void parseHtml() {
+    final Function func = PARSE_HTML;
+
+    query(func.args("42"),
+        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>42</body></html>");
+    query(func.args(" xs:hexBinary('3432')"),
+        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>42</body></html>");
+    query(func.args(" xs:base64Binary('NDI=')"),
+        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>42</body></html>");
+
+    error(func.args(42), STRBIN_X_X);
+  }
+
   /** Test method. */
   @Test public void parseIetfDate() {
     final Function func = PARSE_IETF_DATE;
diff --git a/pom.xml b/pom.xml
index dabb8b18ca..e9cf162f1a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -174,6 +174,13 @@
         <scope>runtime</scope>
         <optional>true</optional>
       </dependency>
+      <dependency>
+        <groupId>nu.validator</groupId>
+        <artifactId>htmlparser</artifactId>
+        <version>1.4.16</version>
+        <scope>runtime</scope>
+        <optional>true</optional>
+      </dependency>
     </dependencies>
   </dependencyManagement>
 

From dfb0be0b2f594f5137948f9a1987a50fb66da384 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher <grd@gmx.net>
Date: Mon, 30 Oct 2023 13:41:42 +0100
Subject: [PATCH 2/9] add support for Validator.nu options

---
 .../org/basex/build/html/HtmlOptions.java     | 138 ++++++++++++++++++
 .../basex/query/func/html/FnParseHtml.java    |  97 +++++++-----
 .../org/basex/query/func/FnModuleTest.java    |  20 ++-
 3 files changed, 216 insertions(+), 39 deletions(-)

diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
index 5fe399485f..b46e8d6e56 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
@@ -9,6 +9,41 @@
  * @author Christian Gruen
  */
 public final class HtmlOptions extends Options {
+  /** Validator.nu option unicode-normalization-checking. */
+  public static final BooleanOption UNICODE_NORMALIZATION_CHECKING =
+      new BooleanOption("unicode-normalization-checking", false);
+  /** Validator.nu option mapping-lang-to-xml-lang. */
+  public static final BooleanOption MAPPING_LANG_TO_XML_LANG =
+      new BooleanOption("mapping-lang-to-xml-lang", false);
+  /** Validator.nu option scripting-enabled. */
+  public static final BooleanOption SCRIPTING_ENABLED =
+      new BooleanOption("scripting-enabled", false);
+
+  /** Validator.nu option content-space-policy. */
+  public static final EnumOption<XmlViolationPolicy> CONTENT_SPACE_POLICY =
+      new EnumOption<>("content-space-policy", XmlViolationPolicy.class);
+  /** Validator.nu option content-non-xml-char-policy. */
+  public static final EnumOption<XmlViolationPolicy> CONTENT_NON_XML_CHAR_POLICY =
+      new EnumOption<>("content-non-xml-char-policy", XmlViolationPolicy.class);
+  /** Validator.nu option comment-policy. */
+  public static final EnumOption<XmlViolationPolicy> COMMENT_POLICY =
+      new EnumOption<>("comment-policy", XmlViolationPolicy.class);
+  /** Validator.nu option xmlns-policy. */
+  public static final EnumOption<XmlViolationPolicy> XMLNS_POLICY =
+      new EnumOption<>("xmlns-policy", XmlViolationPolicy.class);
+  /** Validator.nu option name-policy. */
+  public static final EnumOption<XmlViolationPolicy> NAME_POLICY =
+      new EnumOption<>("name-policy", XmlViolationPolicy.class);
+  /** Validator.nu option streamability-violation-policy. */
+  public static final EnumOption<XmlViolationPolicy> STREAMABILITY_VIOLATION_POLICY =
+      new EnumOption<>("streamability-violation-policy", XmlViolationPolicy.class);
+  /** Validator.nu option xml-policy. */
+  public static final EnumOption<XmlViolationPolicy> XML_POLICY =
+      new EnumOption<>("xml-policy", XmlViolationPolicy.class);
+  /** Validator.nu option heuristics. */
+  public static final EnumOption<Heuristics> HEURISTICS =
+      new EnumOption<>("heuristics", Heuristics.class);
+
   /** TagSoup option: html. */
   public static final BooleanOption HTML = new BooleanOption("html", false);
   /** TagSoup option: omit-xml-declaration. */
@@ -59,4 +94,107 @@ public HtmlOptions() {
   public HtmlOptions(final Options opts) {
     super(opts);
   }
+
+  /**
+   * Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the
+   * dependency on Validator.nu in the classpath.
+   *
+   * Copyright (c) 2007 Henri Sivonen
+   *
+   * Permission is hereby granted, free of charge, to any person obtaining a
+   * copy of this software and associated documentation files (the "Software"),
+   * to deal in the Software without restriction, including without limitation
+   * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+   * and/or sell copies of the Software, and to permit persons to whom the
+   * Software is furnished to do so, subject to the following conditions:
+   *
+   * The above copyright notice and this permission notice shall be included in
+   * all copies or substantial portions of the Software.
+   *
+   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+   * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   * DEALINGS IN THE SOFTWARE.
+   */
+
+  /**
+   * Policy for XML 1.0 violations.
+   *
+   * @version $Id$
+   * @author hsivonen
+   */
+  public enum XmlViolationPolicy {
+      /**
+       * Conform to HTML 5, allow XML 1.0 to be violated.
+       */
+      ALLOW,
+
+      /**
+       * Halt when something cannot be mapped to XML 1.0.
+       */
+      FATAL,
+
+      /**
+       * Be non-conforming and alter the infoset to fit
+       * XML 1.0 when something would otherwise not be
+       * mappable to XML 1.0.
+       */
+      ALTER_INFOSET
+  }
+
+  /**
+   * Copied from nu.validator.htmlparser.common.Heuristics in order to avoid the
+   * dependency on Validator.nu in the classpath.
+   *
+   * Permission is hereby granted, free of charge, to any person obtaining a
+   * copy of this software and associated documentation files (the "Software"),
+   * to deal in the Software without restriction, including without limitation
+   * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+   * and/or sell copies of the Software, and to permit persons to whom the
+   * Software is furnished to do so, subject to the following conditions:
+   *
+   * The above copyright notice and this permission notice shall be included in
+   * all copies or substantial portions of the Software.
+   *
+   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+   * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   * DEALINGS IN THE SOFTWARE.
+   */
+
+  /**
+   * Indicates a request for character encoding sniffer choice.
+   *
+   * @version $Id$
+   * @author hsivonen
+   */
+  public enum Heuristics {
+
+      /**
+       * Perform no heuristic sniffing.
+       */
+      NONE,
+
+      /**
+       * Use both jchardet and ICU4J.
+       */
+      ALL,
+
+      /**
+       * Use jchardet only.
+       */
+      CHARDET,
+
+      /**
+       * Use ICU4J only.
+       */
+      ICU
+  }
+
 }
diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
index 0f8a3a5801..2ae7c1d525 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
@@ -1,5 +1,6 @@
 package org.basex.query.func.html;
 
+import static org.basex.build.html.HtmlOptions.*;
 import static org.basex.query.QueryError.*;
 import static org.basex.util.Token.*;
 
@@ -9,7 +10,6 @@
 import org.basex.build.xml.*;
 import org.basex.core.*;
 import org.basex.io.*;
-import org.basex.io.in.*;
 import org.basex.query.*;
 import org.basex.query.expr.*;
 import org.basex.query.func.*;
@@ -19,8 +19,9 @@
 import org.basex.util.*;
 import org.xml.sax.*;
 
-import nu.validator.htmlparser.common.*;
 import nu.validator.htmlparser.sax.*;
+import nu.validator.htmlparser.common.XmlViolationPolicy;
+import nu.validator.htmlparser.common.Heuristics;
 
 /**
  * Function implementation.
@@ -34,7 +35,8 @@ public class FnParseHtml extends StandardFunc {
   @Override
   public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
     final Item value = arg(0).atomItem(qc, info);
-    return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)));
+    final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc);
+    return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)), options);
   }
 
   @Override
@@ -45,10 +47,11 @@ protected final Expr opt(final CompileContext cc) {
   /**
    * Parses the input and creates an XML document.
    * @param io input data
+   * @param options HTML options
    * @return node
    * @throws QueryException query exception
    */
-  protected final Item parse(final IO io) throws QueryException {
+  protected final Item parse(final IO io, final HtmlOptions options) throws QueryException {
     try {
       if (!ParserImpl.available()) {
         // reader could not be initialized; fall back to html:parse
@@ -57,7 +60,7 @@ protected final Item parse(final IO io) throws QueryException {
         htmlOptions.set(HtmlOptions.NONS, false);
         return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), htmlOptions));
       }
-      return new DBNode(new ParserImpl(io, new MainOptions()));
+      return new DBNode(new ParserImpl(info, io, options));
     } catch(final IOException ex) {
       throw INVHTML_X.get(info, ex);
     }
@@ -73,65 +76,91 @@ private static class ParserImpl extends XMLParser {
      * @return result of check
      */
     public static boolean available() {
-      return Reflect.available("nu.validator.htmlparser.sax.HtmlParser");
+      return Reflect.available("nu.validator.htmlparser.sax.HtmlParser")
+          && Reflect.available("nu.validator.htmlparser.sax.XmlSerializer")
+          && Reflect.available("nu.validator.htmlparser.common.Heuristics")
+          && Reflect.available("nu.validator.htmlparser.common.XmlViolationPolicy");
     }
 
     /**
      * Constructor.
+     * @param info input info
      * @param source document source
-     * @param options main options
+     * @param options HTML options
      * @throws IOException I/O exception
+     * @throws QueryException query exception
      */
-    ParserImpl(final IO source, final MainOptions options)
-        throws IOException {
-      super(toXml(source), options);
+    ParserImpl(final InputInfo info, final IO source, final HtmlOptions options)
+        throws IOException, QueryException {
+      super(toXml(info, source, options), new MainOptions());
     }
 
     /**
      * Converts an HTML document to XML.
+     * @param info input info
      * @param io io reference
+     * @param hopts HTML options
      * @return parser
      * @throws IOException I/O exception
+     * @throws QueryException query exception
      */
-    private static IO toXml(final IO io) throws IOException {
-      try(TextInput ti = new TextInput(io)) {
-
-        // tries to extract the encoding from the input
-        // TODO: remove this, in favor of encoding from options, or constant for string input
-        String enc = ti.encoding();
-        final byte[] content = ti.content();
-        // looks for a charset definition
-        final byte[] encoding = token("charset=");
-        int cs = indexOf(content, encoding);
-        if(cs > 0) {
-          // extracts the encoding string
-          cs += encoding.length;
-          int ce = cs;
-          final int cl = content.length;
-          while(++ce < cl && content[ce] > 0x28);
-          enc = string(substring(content, cs, ce));
-        }
+    private static IO toXml(final InputInfo info, final IO io, final HtmlOptions hopts)
+        throws IOException, QueryException {
 
+      try {
         // define output
         final StringWriter sw = new StringWriter();
         final nu.validator.htmlparser.sax.HtmlParser reader =
             new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
-        reader.setFeature("http://xml.org/sax/features/namespaces", true);
-        reader.setFeature("http://xml.org/sax/features/namespace-prefixes", false);
-
         final ContentHandler writer = new XmlSerializer(sw);
         reader.setContentHandler(writer);
         reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
 
         // define input
-        final InputSource is = new InputSource(new ArrayInput(content));
-        is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8);
+        final InputSource is = new InputSource(io.inputStream());
+
+        // set Validator.nu options
+        if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
+          reader.setCheckingNormalization(true);
+        if(hopts.get(MAPPING_LANG_TO_XML_LANG))
+          reader.setMappingLangToXmlLang(true);
+        if(hopts.get(SCRIPTING_ENABLED))
+          reader.setScriptingEnabled(true);
+        if(hopts.contains(CONTENT_SPACE_POLICY))
+          reader.setContentSpacePolicy(
+              XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name()));
+        if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY))
+          reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf(
+              hopts.get(CONTENT_NON_XML_CHAR_POLICY).name()));
+        if(hopts.contains(COMMENT_POLICY))
+          reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name()));
+        if(hopts.contains(XMLNS_POLICY))
+          reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name()));
+        if(hopts.contains(NAME_POLICY))
+          reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name()));
+        if(hopts.contains(STREAMABILITY_VIOLATION_POLICY))
+          reader.setStreamabilityViolationPolicy(
+              XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name()));
+        if(hopts.contains(XML_POLICY))
+          reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name()));
+
+        if(hopts.contains(HEURISTICS))
+          reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
+        // end Validator.nu options
+
+        if (hopts.contains(ENCODING)) {
+          String enc = hopts.get(HtmlOptions.ENCODING);
+          if (!Strings.supported(enc))
+            throw INVALIDOPT_X.get(info, "Unsupported encoding: " + enc + '.');
+          is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8);
+        }
+
         reader.parse(is);
         return new IOContent(token(sw.toString()), io.name());
 
       } catch(final SAXException ex) {
         Util.errln(ex);
-        throw INVHTML_X.getIO(ex.getLocalizedMessage());
+        throw INVHTML_X.get(info, ex.getLocalizedMessage());
       }
     }
   }
diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
index a57d7be0d0..5f5365d262 100644
--- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
@@ -11,6 +11,7 @@
 import org.basex.query.expr.path.*;
 import org.basex.query.value.item.*;
 import org.basex.query.value.seq.*;
+import org.basex.util.*;
 import org.junit.jupiter.api.*;
 import org.junit.jupiter.api.Test;
 
@@ -1434,7 +1435,7 @@ public final class FnModuleTest extends SandboxTest {
 
   /** Test method. */
   @Test public void outermost() {
-    final Function func = INNERMOST;
+    final Function func = OUTERMOST;
     query("let $n := <li/> return " + func.args(" ($n, $n)"), "<li/>");
   }
 
@@ -1444,12 +1445,21 @@ public final class FnModuleTest extends SandboxTest {
 
     query(func.args("42"),
         "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>42</body></html>");
-    query(func.args(" xs:hexBinary('3432')"),
-        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>42</body></html>");
-    query(func.args(" xs:base64Binary('NDI=')"),
-        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>42</body></html>");
+    query(func.args(_CONVERT_STRING_TO_HEX.args("<html><head><meta charset='" + Strings.UTF16LE
+        + "'></head><body>42</body>", Strings.UTF16LE),
+        " map {'encoding': '" + Strings.UTF16LE + "', 'xml-policy': 'ALTER_INFOSET'}"),
+        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><meta charset=\"" + Strings.UTF16LE
+        + "\"/></head><body>42</body></html>");
+    query(func.args(_CONVERT_STRING_TO_BASE64.args("<html><head><meta charset='" + Strings.UTF16BE
+        + "'></head><body>42</body>", Strings.UTF16BE),
+        " map {'encoding': '" + Strings.UTF16BE + "', 'heuristics': 'NONE'}"),
+        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><meta charset=\"" + Strings.UTF16BE
+        + "\"/></head><body>42</body></html>");
 
     error(func.args(42), STRBIN_X_X);
+    error(func.args(" \"42\"", 42), MAP_X_X);
+    error(func.args(" \"42\"", " map {'1234': ()}"), INVALIDOPT_X);
+    error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVALIDOPT_X);
   }
 
   /** Test method. */

From 8efe4a1f9bc31ab338ad0d1f408ffcf400c11970 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher <grd@gmx.net>
Date: Mon, 30 Oct 2023 20:50:29 +0100
Subject: [PATCH 3/9] ignore encoding option when parsing a string value

---
 .../org/basex/query/func/html/FnParseHtml.java     | 14 ++++++++++----
 .../java/org/basex/query/func/FnModuleTest.java    |  3 +++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
index 2ae7c1d525..a891859e34 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
@@ -36,7 +36,9 @@ public class FnParseHtml extends StandardFunc {
   public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
     final Item value = arg(0).atomItem(qc, info);
     final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc);
-    return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)), options);
+    final IO io = value instanceof Bin ? new IOContent(toBytes(value))
+                                       : new IOContent(toBytes(value), "", Strings.UTF8);
+    return value.isEmpty() ? Empty.VALUE : parse(io, options);
   }
 
   @Override
@@ -148,11 +150,15 @@ private static IO toXml(final InputInfo info, final IO io, final HtmlOptions hop
           reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
         // end Validator.nu options
 
-        if (hopts.contains(ENCODING)) {
-          String enc = hopts.get(HtmlOptions.ENCODING);
+        String enc = io.encoding() != null
+            ? io.encoding()
+            : hopts.contains(ENCODING)
+              ? hopts.get(HtmlOptions.ENCODING)
+              : null; // TODO: sniff encoding
+        if (enc != null) {
           if (!Strings.supported(enc))
             throw INVALIDOPT_X.get(info, "Unsupported encoding: " + enc + '.');
-          is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8);
+          is.setEncoding(Strings.normEncoding(enc));
         }
 
         reader.parse(is);
diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
index 5f5365d262..ed667b2a68 100644
--- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
@@ -1443,8 +1443,11 @@ public final class FnModuleTest extends SandboxTest {
   @Test public void parseHtml() {
     final Function func = PARSE_HTML;
 
+    query(func.args(" ()"), "");
     query(func.args("42"),
         "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>42</body></html>");
+    query(func.args("42", " map {'encoding': '" + Strings.UTF16LE + "'}"),
+        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>42</body></html>");
     query(func.args(_CONVERT_STRING_TO_HEX.args("<html><head><meta charset='" + Strings.UTF16LE
         + "'></head><body>42</body>", Strings.UTF16LE),
         " map {'encoding': '" + Strings.UTF16LE + "', 'xml-policy': 'ALTER_INFOSET'}"),

From d66e40be30866bcd08abd129c8af9b8bf174e698 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher <grd@gmx.net>
Date: Mon, 6 Nov 2023 13:15:54 +0100
Subject: [PATCH 4/9] replace TagSoup by Validator.nu

---
 basex-core/pom.xml                            |  12 +-
 .../org/basex/build/html/HtmlOptions.java     |  54 ++----
 .../java/org/basex/build/html/HtmlParser.java | 152 ++++++++---------
 .../main/java/org/basex/core/MainOptions.java |   2 +-
 .../main/java/org/basex/query/QueryError.java |   9 +-
 .../org/basex/query/func/FuncOptions.java     |  33 ++--
 .../org/basex/query/func/StandardFunc.java    |  17 +-
 .../basex/query/func/html/FnParseHtml.java    | 159 +-----------------
 .../org/basex/query/func/html/HtmlParse.java  |   9 +-
 .../src/main/resources/lang/Chinese.lang      |   2 +-
 basex-core/src/main/resources/lang/Dutch.lang |   2 +-
 .../src/main/resources/lang/English.lang      |   2 +-
 .../src/main/resources/lang/French.lang       |   2 +-
 .../src/main/resources/lang/German.lang       |   2 +-
 .../src/main/resources/lang/Hungarian.lang    |   2 +-
 .../src/main/resources/lang/Indonesian.lang   |   2 +-
 .../src/main/resources/lang/Italian.lang      |   2 +-
 .../src/main/resources/lang/Japanese.lang     |   2 +-
 .../src/main/resources/lang/Mongolian.lang    |   2 +-
 .../src/main/resources/lang/Romanian.lang     |   2 +-
 .../src/main/resources/lang/Russian.lang      |   2 +-
 .../src/main/resources/lang/Spanish.lang      |   2 +-
 .../org/basex/query/func/FnModuleTest.java    |   4 +-
 .../org/basex/query/func/HtmlModuleTest.java  |   6 +-
 basex-examples/basex-examples.iml             |   2 +-
 basex-examples/pom.xml                        |   4 +-
 .../basex/examples/create/HTMLExample.java    |   2 +-
 pom.xml                                       |  13 +-
 28 files changed, 162 insertions(+), 342 deletions(-)

diff --git a/basex-core/pom.xml b/basex-core/pom.xml
index 94512b712e..f3c6d14885 100644
--- a/basex-core/pom.xml
+++ b/basex-core/pom.xml
@@ -30,9 +30,9 @@
       <optional>true</optional>
     </dependency>
     <dependency>
-      <groupId>org.ccil.cowan.tagsoup</groupId>
-      <artifactId>tagsoup</artifactId>
-      <scope>compile</scope>
+      <groupId>nu.validator</groupId>
+      <artifactId>htmlparser</artifactId>
+      <scope>provided</scope>
       <optional>true</optional>
     </dependency>
     <dependency>
@@ -52,12 +52,6 @@
       <scope>provided</scope>
       <optional>true</optional>
     </dependency>
-    <dependency>
-      <groupId>nu.validator</groupId>
-      <artifactId>htmlparser</artifactId>
-      <scope>provided</scope>
-      <optional>true</optional>
-    </dependency>
   </dependencies>
 
   <build>
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
index b46e8d6e56..a6ce8f6cc9 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
@@ -3,12 +3,22 @@
 import org.basex.util.options.*;
 
 /**
- * Options for parsing and serializing HTML documents with TagSoup.
+ * Options for parsing and serializing HTML documents with Validator.nu.
  *
  * @author BaseX Team 2005-23, BSD License
  * @author Christian Gruen
  */
 public final class HtmlOptions extends Options {
+  /** fn:parse-html option encoding. */
+  public static final StringOption ENCODING = new StringOption("encoding");
+  /** fn:parse-html option method. */
+  public static final StringOption METHOD = new StringOption("method");
+  /** fn:parse-html option html-version. */
+  public static final StringOption HTML_VERSION = new StringOption("html-version");
+  /** fn:parse-html option include-template-content. */
+  public static final BooleanOption INCLUDE_TEMPLATE_CONTENT =
+      new BooleanOption("include-template-content");
+
   /** Validator.nu option unicode-normalization-checking. */
   public static final BooleanOption UNICODE_NORMALIZATION_CHECKING =
       new BooleanOption("unicode-normalization-checking", false);
@@ -18,7 +28,6 @@ public final class HtmlOptions extends Options {
   /** Validator.nu option scripting-enabled. */
   public static final BooleanOption SCRIPTING_ENABLED =
       new BooleanOption("scripting-enabled", false);
-
   /** Validator.nu option content-space-policy. */
   public static final EnumOption<XmlViolationPolicy> CONTENT_SPACE_POLICY =
       new EnumOption<>("content-space-policy", XmlViolationPolicy.class);
@@ -44,43 +53,6 @@ public final class HtmlOptions extends Options {
   public static final EnumOption<Heuristics> HEURISTICS =
       new EnumOption<>("heuristics", Heuristics.class);
 
-  /** TagSoup option: html. */
-  public static final BooleanOption HTML = new BooleanOption("html", false);
-  /** TagSoup option: omit-xml-declaration. */
-  public static final BooleanOption OMIT_XML_DECLARATION =
-      new BooleanOption("omit-xml-declaration", false);
-  /** TagSoup option: nons. */
-  public static final BooleanOption NONS = new BooleanOption("nons", true);
-  /** TagSoup option: nobogons. */
-  public static final BooleanOption NOBOGONS = new BooleanOption("nobogons", false);
-  /** TagSoup option: nodefaults. */
-  public static final BooleanOption NODEFAULTS = new BooleanOption("nodefaults", false);
-  /** TagSoup option: nocolons. */
-  public static final BooleanOption NOCOLONS = new BooleanOption("nocolons", false);
-  /** TagSoup option: norestart. */
-  public static final BooleanOption NORESTART = new BooleanOption("norestart", false);
-  /** TagSoup option: nobogons. */
-  public static final BooleanOption IGNORABLE = new BooleanOption("ignorable", false);
-  /** TagSoup option: emptybogons. */
-  public static final BooleanOption EMPTYBOGONS = new BooleanOption("emptybogons", false);
-  /** TagSoup option: any. */
-  public static final BooleanOption ANY = new BooleanOption("any", false);
-  /** TagSoup option: norootbogons. */
-  public static final BooleanOption NOROOTBOGONS = new BooleanOption("norootbogons", false);
-  /** TagSoup option: nocdata. */
-  public static final BooleanOption NOCDATA = new BooleanOption("nocdata", false);
-  /** TagSoup option: lexical. */
-  public static final BooleanOption LEXICAL = new BooleanOption("lexical", false);
-
-  /** TagSoup option: method (html). */
-  public static final StringOption METHOD = new StringOption("method", "xml");
-  /** TagSoup option: doctype-system=systemid. */
-  public static final StringOption DOCTYPE_SYSTEM = new StringOption("doctype-system");
-  /** TagSoup option: doctype-public=publicid. */
-  public static final StringOption DOCTYPE_PUBLIC = new StringOption("doctype-public");
-  /** TagSoup option: encoding=encoding. */
-  public static final StringOption ENCODING = new StringOption("encoding");
-
   /**
    * Default constructor.
    */
@@ -97,7 +69,7 @@ public HtmlOptions(final Options opts) {
 
   /**
    * Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the
-   * dependency on Validator.nu in the classpath.
+   * class path dependency of HtmlOptions on Validator.nu.
    *
    * Copyright (c) 2007 Henri Sivonen
    *
@@ -147,7 +119,7 @@ public enum XmlViolationPolicy {
 
   /**
    * Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the
-   * dependency on Validator.nu in the classpath.
+   * class path dependency of HtmlOptions on Validator.nu.
    *
    * Permission is hereby granted, free of charge, to any person obtaining a
    * copy of this software and associated documentation files (the "Software"),
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
index acfe3d5882..773defb9db 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
@@ -1,39 +1,57 @@
 package org.basex.build.html;
 
-import static org.basex.util.Token.*;
 import static org.basex.build.html.HtmlOptions.*;
+import static org.basex.query.QueryError.*;
+import static org.basex.util.Token.*;
 
 import java.io.*;
+import java.util.*;
+
 import org.basex.build.xml.*;
 import org.basex.core.*;
 import org.basex.io.*;
-import org.basex.io.in.*;
 import org.basex.util.*;
-import org.ccil.cowan.tagsoup.*;
 import org.xml.sax.*;
 
+import nu.validator.htmlparser.common.Heuristics;
+import nu.validator.htmlparser.common.XmlViolationPolicy;
+import nu.validator.htmlparser.sax.*;
+
 /**
- * This class uses TagSoup to convert HTML input to well-formed XML.
- * If TagSoup is not found in the classpath, the original document is passed on.
+ * This class uses the Validator.nu HTML parser to convert HTML input to well-formed XML.
+ * If the Validator.nu HTML parser is not found in the classpath, the original document is
+ * passed on.
  *
- * TagSoup was written by John Cowan and is based on the Apache 2.0 License:
- * {@code http://home.ccil.org/~cowan/XML/tagsoup/}.
+ * The Validator.nu HTML parser was written by Henri Sivonen and is based on the MIT License:
+ * {@code https://about.validator.nu/htmlparser/}.
  *
  * @author BaseX Team 2005-23, BSD License
  * @author Christian Gruen
  */
 public final class HtmlParser extends XMLParser {
   /** Name of HTML Parser. */
-  private static final String NAME = "TagSoup";
-  /** TagSoup URL. */
-  private static final String FEATURES = "http://www.ccil.org/~cowan/tagsoup/features/";
+  private static final String NAME = "Validator.nu";
 
   /**
-   * Checks if a CatalogResolver is available.
+   * Checks if Validator.nu is available.
    * @return result of check
    */
   public static boolean available() {
-    return Reflect.available("org.ccil.cowan.tagsoup.Parser");
+    return firstUnavailableClass() == null;
+  }
+
+  /**
+   * Checks whether Validator.nu classes are available on the class path.
+   * @return the name of the first class that is not available, or null if all classes are available
+   */
+  public static String firstUnavailableClass() {
+    for(final String className : Arrays.asList("nu.validator.htmlparser.sax.HtmlParser",
+        "nu.validator.htmlparser.sax.XmlSerializer",
+        "nu.validator.htmlparser.common.XmlViolationPolicy",
+        "nu.validator.htmlparser.common.Heuristics")) {
+      if(!Reflect.available(className)) return className;
+    }
+    return null;
   }
 
   /**
@@ -77,81 +95,63 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException
     // reader could not be initialized; fall back to XML
     if(!available()) return io;
 
-    try(TextInput ti = new TextInput(io)) {
-      // tries to extract the encoding from the input
-      String enc = ti.encoding();
-      final byte[] content = ti.content();
-
-      // looks for a charset definition
-      final byte[] encoding = token("charset=");
-      int cs = indexOf(content, encoding);
-      if(cs > 0) {
-        // extracts the encoding string
-        cs += encoding.length;
-        int ce = cs;
-        final int cl = content.length;
-        while(++ce < cl && content[ce] > 0x28);
-        enc = string(substring(content, cs, ce));
-      }
-
+    try {
       // define output
       final StringWriter sw = new StringWriter();
-      final XMLReader reader = new org.ccil.cowan.tagsoup.Parser();
-      final XMLWriter writer = new XMLWriter(sw);
-      writer.setOutputProperty(ENCODING.name(), Strings.UTF8);
+      final nu.validator.htmlparser.sax.HtmlParser reader =
+          new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
+      final ContentHandler writer = new XmlSerializer(sw);
       reader.setContentHandler(writer);
+      reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
 
-      // set TagSoup options
-      if(hopts.get(HTML)) {
-        reader.setFeature("http://xml.org/sax/features/namespaces", false);
-        writer.setOutputProperty(METHOD.name(), "html");
-        writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes");
+      // define input
+      final InputSource is = new InputSource(io.inputStream());
+
+      // set Validator.nu options
+      if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
+        reader.setCheckingNormalization(true);
+      if(hopts.get(MAPPING_LANG_TO_XML_LANG))
+        reader.setMappingLangToXmlLang(true);
+      if(hopts.get(SCRIPTING_ENABLED))
+        reader.setScriptingEnabled(true);
+      if(hopts.contains(CONTENT_SPACE_POLICY))
+        reader.setContentSpacePolicy(
+            XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name()));
+      if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY))
+        reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf(
+            hopts.get(CONTENT_NON_XML_CHAR_POLICY).name()));
+      if(hopts.contains(COMMENT_POLICY))
+        reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name()));
+      if(hopts.contains(XMLNS_POLICY))
+        reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name()));
+      if(hopts.contains(NAME_POLICY))
+        reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name()));
+      if(hopts.contains(STREAMABILITY_VIOLATION_POLICY))
+        reader.setStreamabilityViolationPolicy(
+            XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name()));
+      if(hopts.contains(XML_POLICY))
+        reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name()));
+      if(hopts.contains(HEURISTICS))
+        reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
+      // end Validator.nu options
+
+      String enc = io.encoding() != null
+          ? io.encoding()
+          : hopts.contains(ENCODING)
+            ? hopts.get(HtmlOptions.ENCODING)
+            : null; // TODO: sniff encoding
+      if (enc != null) {
+        if (!Strings.supported(enc))
+          throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.');
+        is.setEncoding(Strings.normEncoding(enc));
       }
-      if(hopts.get(NONS))
-        reader.setFeature("http://xml.org/sax/features/namespaces", false);
-      if(hopts.get(NOBOGONS))
-        reader.setFeature(FEATURES + "ignore-bogons", true);
-      if(hopts.get(NODEFAULTS))
-        reader.setFeature(FEATURES + "default-attributes", false);
-      if(hopts.get(NOCOLONS))
-        reader.setFeature(FEATURES + "translate-colons", true);
-      if(hopts.get(NORESTART))
-        reader.setFeature(FEATURES + "restart-elements", false);
-      if(hopts.get(IGNORABLE))
-        reader.setFeature(FEATURES + "ignorable-whitespace", true);
-      if(hopts.get(EMPTYBOGONS))
-        reader.setFeature(FEATURES + "bogons-empty", true);
-      if(hopts.get(ANY))
-        reader.setFeature(FEATURES + "bogons-empty", false);
-      if(hopts.get(NOROOTBOGONS))
-        reader.setFeature(FEATURES + "root-bogons", false);
-      if(hopts.get(NOCDATA))
-        reader.setFeature(FEATURES + "cdata-elements", false);
-      if(hopts.get(LEXICAL))
-        reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
-
-      if(hopts.get(OMIT_XML_DECLARATION))
-        writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes");
-      if(hopts.contains(METHOD))
-        writer.setOutputProperty(METHOD.name(), hopts.get(METHOD));
-      if(hopts.contains(DOCTYPE_SYSTEM))
-        writer.setOutputProperty(DOCTYPE_SYSTEM.name(), hopts.get(DOCTYPE_SYSTEM));
-      if(hopts.contains(DOCTYPE_PUBLIC))
-        writer.setOutputProperty(DOCTYPE_PUBLIC.name(), hopts.get(DOCTYPE_PUBLIC));
-
-      if(hopts.contains(ENCODING))
-        enc = hopts.get(ENCODING);
-      // end TagSoup options
 
-      // define input
-      final InputSource is = new InputSource(new ArrayInput(content));
-      is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8);
       reader.parse(is);
       return new IOContent(token(sw.toString()), io.name());
 
     } catch(final SAXException ex) {
       Util.errln(ex);
-      return io;
+      throw INVHTML_X.getIO(ex.getLocalizedMessage());
     }
   }
 }
diff --git a/basex-core/src/main/java/org/basex/core/MainOptions.java b/basex-core/src/main/java/org/basex/core/MainOptions.java
index 2357d9a5e7..1afd1cda7a 100644
--- a/basex-core/src/main/java/org/basex/core/MainOptions.java
+++ b/basex-core/src/main/java/org/basex/core/MainOptions.java
@@ -41,7 +41,7 @@ public final class MainOptions extends Options {
   /** Define JSON parser options. */
   public static final OptionsOption<JsonParserOptions> JSONPARSER =
       new OptionsOption<>("JSONPARSER", new JsonParserOptions());
-  /** Define TagSoup HTML options. */
+  /** Define Validator.nu HTML options. */
   public static final OptionsOption<HtmlOptions> HTMLPARSER =
       new OptionsOption<>("HTMLPARSER", new HtmlOptions());
   /** Define import parser. */
diff --git a/basex-core/src/main/java/org/basex/query/QueryError.java b/basex-core/src/main/java/org/basex/query/QueryError.java
index d127bde013..a13468463d 100644
--- a/basex-core/src/main/java/org/basex/query/QueryError.java
+++ b/basex-core/src/main/java/org/basex/query/QueryError.java
@@ -307,11 +307,6 @@ public enum QueryError {
   /** Error code. */
   HASH_ALGORITHM_X(HASH, "algorithm", "Algorithm not supported: '%'."),
 
-  // HTML Module
-
-  /** Error code. */
-  HTML_PARSE_X(HTML, "parse", "%"),
-
   // HTTP Module
 
   /** Invalid URI. */
@@ -616,9 +611,9 @@ public enum QueryError {
   /** Error code. */
   RESINV_X(FODC, 7, "Resource path '%' is invalid."),
   /** Error code. */
-  INVHTML_X(FODC, 11, "String passed to fn:parse-html is not a well-formed HTML document: %"),
+  INVHTML_X(FODC, 11, "HTML parsing failed: %"),
   /** Error code. */
-  INVHTMLOPT_X(FODC, 12, "Unsupported HTML parser option: %"),
+  INVHTMLOPT_X(FODC, 12, "HTML option processing failed: %"),
 
   /** Error code. */
   FORMNUM_X(FODF, 1280, "Unknown decimal format: '%'."),
diff --git a/basex-core/src/main/java/org/basex/query/func/FuncOptions.java b/basex-core/src/main/java/org/basex/query/func/FuncOptions.java
index d65b445dcf..b867d05de2 100644
--- a/basex-core/src/main/java/org/basex/query/func/FuncOptions.java
+++ b/basex-core/src/main/java/org/basex/query/func/FuncOptions.java
@@ -38,14 +38,15 @@ public final class FuncOptions {
   private final InputInfo info;
 
   /** Raise error if a supplied option is unknown. */
-  private boolean enforceKnown;
+  private final boolean enforceKnown;
 
   /**
    * Constructor.
    * @param info input info (can be {@code null})
+   * @param enforceKnown raise error if a supplied option is unknown
    */
-  public FuncOptions(final InputInfo info) {
-    this(null, info);
+  public FuncOptions(final InputInfo info, final boolean enforceKnown) {
+    this(null, info, enforceKnown);
   }
 
   /**
@@ -54,24 +55,20 @@ public FuncOptions(final InputInfo info) {
    * @param info input info (can be {@code null})
    */
   public FuncOptions(final QNm root, final InputInfo info) {
-    test = root == null ? null : new NameTest(root);
-    this.root = root;
-    this.info = info;
+    this(root, info, false);
   }
 
   /**
-   * Assigns values to the specified options.
-   * @param item item to be converted (can be {@link Empty#VALUE})
-   * @param options options
-   * @param <T> option type
-   * @param enforce raise error if a supplied option is unknown
-   * @return specified options
-   * @throws QueryException query exception
+   * Constructor.
+   * @param root name of root node (can be {@code null})
+   * @param info input info (can be {@code null})
+   * @param enforceKnown raise error if a supplied option is unknown
    */
-  public <T extends Options> T assign(final Item item, final T options, final boolean enforce)
-      throws QueryException {
-    enforceKnown = enforce;
-    return assign(item, options, INVALIDOPT_X);
+  private FuncOptions(final QNm root, final InputInfo info, final boolean enforceKnown) {
+    test = root == null ? null : new NameTest(root);
+    this.root = root;
+    this.info = info;
+    this.enforceKnown = enforceKnown;
   }
 
   /**
@@ -83,7 +80,7 @@ public <T extends Options> T assign(final Item item, final T options, final bool
    * @return specified options
    * @throws QueryException query exception
    */
-  private <T extends Options> T assign(final Item item, final T options, final QueryError error)
+  public <T extends Options> T assign(final Item item, final T options, final QueryError error)
       throws QueryException {
 
     if(!item.isEmpty()) {
diff --git a/basex-core/src/main/java/org/basex/query/func/StandardFunc.java b/basex-core/src/main/java/org/basex/query/func/StandardFunc.java
index 9ddd35bfba..40e384c368 100644
--- a/basex-core/src/main/java/org/basex/query/func/StandardFunc.java
+++ b/basex-core/src/main/java/org/basex/query/func/StandardFunc.java
@@ -507,7 +507,22 @@ protected final HashMap<String, String> toOptions(final Expr expr, final QueryCo
    */
   protected final <E extends Options> E toOptions(final Expr expr, final E options,
       final boolean enforce, final QueryContext qc) throws QueryException {
-    return new FuncOptions(info).assign(expr.item(qc, info), options, enforce);
+    return new FuncOptions(info, enforce).assign(expr.item(qc, info), options, INVALIDOPT_X);
+  }
+
+  /**
+   * Evaluates an expression, if it exists, and returns options.
+   * @param <E> options type
+   * @param expr expression (can be {@code Empty#UNDEFINED})
+   * @param options options template
+   * @param error error to raise, if a supplied option is unknown
+   * @param qc query context
+   * @return options
+   * @throws QueryException query exception
+   */
+  protected final <E extends Options> E toOptions(final Expr expr, final E options,
+      final QueryError error, final QueryContext qc) throws QueryException {
+    return new FuncOptions(info, true).assign(expr.item(qc, info), options, error);
   }
 
   /**
diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
index a891859e34..a6b74d1775 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
@@ -1,27 +1,11 @@
 package org.basex.query.func.html;
 
-import static org.basex.build.html.HtmlOptions.*;
 import static org.basex.query.QueryError.*;
-import static org.basex.util.Token.*;
 
-import java.io.*;
-
-import org.basex.build.html.*;
-import org.basex.build.xml.*;
-import org.basex.core.*;
-import org.basex.io.*;
+import org.basex.build.html.HtmlParser;
 import org.basex.query.*;
-import org.basex.query.expr.*;
-import org.basex.query.func.*;
 import org.basex.query.value.item.*;
-import org.basex.query.value.node.*;
-import org.basex.query.value.seq.*;
 import org.basex.util.*;
-import org.xml.sax.*;
-
-import nu.validator.htmlparser.sax.*;
-import nu.validator.htmlparser.common.XmlViolationPolicy;
-import nu.validator.htmlparser.common.Heuristics;
 
 /**
  * Function implementation.
@@ -29,145 +13,12 @@
  * @author BaseX Team 2005-23, BSD License
  * @author Gunther Rademacher
  */
-public class FnParseHtml extends StandardFunc {
-  // TODO: handle second argument (method, html-version, encoding), produce error code FODC0012
+public class FnParseHtml extends HtmlParse {
 
   @Override
   public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
-    final Item value = arg(0).atomItem(qc, info);
-    final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc);
-    final IO io = value instanceof Bin ? new IOContent(toBytes(value))
-                                       : new IOContent(toBytes(value), "", Strings.UTF8);
-    return value.isEmpty() ? Empty.VALUE : parse(io, options);
-  }
-
-  @Override
-  protected final Expr opt(final CompileContext cc) {
-    return optFirst();
-  }
-
-  /**
-   * Parses the input and creates an XML document.
-   * @param io input data
-   * @param options HTML options
-   * @return node
-   * @throws QueryException query exception
-   */
-  protected final Item parse(final IO io, final HtmlOptions options) throws QueryException {
-    try {
-      if (!ParserImpl.available()) {
-        // reader could not be initialized; fall back to html:parse
-        final HtmlOptions htmlOptions = new HtmlOptions();
-        htmlOptions.set(HtmlOptions.LEXICAL, true);
-        htmlOptions.set(HtmlOptions.NONS, false);
-        return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), htmlOptions));
-      }
-      return new DBNode(new ParserImpl(info, io, options));
-    } catch(final IOException ex) {
-      throw INVHTML_X.get(info, ex);
-    }
-  }
-
-  /**
-   * Parser implementation.
-   */
-  private static class ParserImpl extends XMLParser {
-
-    /**
-     * Checks if Validator.nu is available.
-     * @return result of check
-     */
-    public static boolean available() {
-      return Reflect.available("nu.validator.htmlparser.sax.HtmlParser")
-          && Reflect.available("nu.validator.htmlparser.sax.XmlSerializer")
-          && Reflect.available("nu.validator.htmlparser.common.Heuristics")
-          && Reflect.available("nu.validator.htmlparser.common.XmlViolationPolicy");
-    }
-
-    /**
-     * Constructor.
-     * @param info input info
-     * @param source document source
-     * @param options HTML options
-     * @throws IOException I/O exception
-     * @throws QueryException query exception
-     */
-    ParserImpl(final InputInfo info, final IO source, final HtmlOptions options)
-        throws IOException, QueryException {
-      super(toXml(info, source, options), new MainOptions());
-    }
-
-    /**
-     * Converts an HTML document to XML.
-     * @param info input info
-     * @param io io reference
-     * @param hopts HTML options
-     * @return parser
-     * @throws IOException I/O exception
-     * @throws QueryException query exception
-     */
-    private static IO toXml(final InputInfo info, final IO io, final HtmlOptions hopts)
-        throws IOException, QueryException {
-
-      try {
-        // define output
-        final StringWriter sw = new StringWriter();
-        final nu.validator.htmlparser.sax.HtmlParser reader =
-            new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
-        final ContentHandler writer = new XmlSerializer(sw);
-        reader.setContentHandler(writer);
-        reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
-
-        // define input
-        final InputSource is = new InputSource(io.inputStream());
-
-        // set Validator.nu options
-        if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
-          reader.setCheckingNormalization(true);
-        if(hopts.get(MAPPING_LANG_TO_XML_LANG))
-          reader.setMappingLangToXmlLang(true);
-        if(hopts.get(SCRIPTING_ENABLED))
-          reader.setScriptingEnabled(true);
-        if(hopts.contains(CONTENT_SPACE_POLICY))
-          reader.setContentSpacePolicy(
-              XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name()));
-        if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY))
-          reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf(
-              hopts.get(CONTENT_NON_XML_CHAR_POLICY).name()));
-        if(hopts.contains(COMMENT_POLICY))
-          reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name()));
-        if(hopts.contains(XMLNS_POLICY))
-          reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name()));
-        if(hopts.contains(NAME_POLICY))
-          reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name()));
-        if(hopts.contains(STREAMABILITY_VIOLATION_POLICY))
-          reader.setStreamabilityViolationPolicy(
-              XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name()));
-        if(hopts.contains(XML_POLICY))
-          reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name()));
-
-        if(hopts.contains(HEURISTICS))
-          reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
-        // end Validator.nu options
-
-        String enc = io.encoding() != null
-            ? io.encoding()
-            : hopts.contains(ENCODING)
-              ? hopts.get(HtmlOptions.ENCODING)
-              : null; // TODO: sniff encoding
-        if (enc != null) {
-          if (!Strings.supported(enc))
-            throw INVALIDOPT_X.get(info, "Unsupported encoding: " + enc + '.');
-          is.setEncoding(Strings.normEncoding(enc));
-        }
-
-        reader.parse(is);
-        return new IOContent(token(sw.toString()), io.name());
-
-      } catch(final SAXException ex) {
-        Util.errln(ex);
-        throw INVHTML_X.get(info, ex.getLocalizedMessage());
-      }
-    }
+    String className = HtmlParser.firstUnavailableClass();
+    if (className != null) throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className);
+    return super.item(qc, ii);
   }
 }
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
index a582837af8..576fc82b4d 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
@@ -25,7 +25,10 @@ public class HtmlParse extends StandardFunc {
   @Override
   public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
     final Item value = arg(0).atomItem(qc, info);
-    return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)), qc);
+    if (value.isEmpty()) return Empty.VALUE;
+    final IO io = value instanceof Bin ? new IOContent(toBytes(value))
+                                       : new IOContent(toBytes(value), "", Strings.UTF8);
+    return parse(io, qc);
   }
 
   @Override
@@ -41,11 +44,11 @@ protected final Expr opt(final CompileContext cc) {
    * @throws QueryException query exception
    */
   protected final Item parse(final IO io, final QueryContext qc) throws QueryException {
-    final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc);
+    final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), INVHTMLOPT_X, qc);
     try {
       return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), options));
     } catch(final IOException ex) {
-      throw HTML_PARSE_X.get(info, ex);
+      throw INVHTML_X.get(info, ex);
     }
   }
 }
diff --git a/basex-core/src/main/resources/lang/Chinese.lang b/basex-core/src/main/resources/lang/Chinese.lang
index 82caebcf90..610a45e5e4 100644
--- a/basex-core/src/main/resources/lang/Chinese.lang
+++ b/basex-core/src/main/resources/lang/Chinese.lang
@@ -417,7 +417,7 @@ h_db_format          = 数据库结构已经变了，请使用新版软件
 h_db_options_%       = 如果执行'%'，这个选项将会被设置
 h_diacritics         = 索引保留了发音符号
 h_fulltext_index     = 全文索引可以加速全文检索
-h_html_parser        = 将使用TagSoup将HTML转为XML
+h_html_parser        = 将使用Validator.nu将HTML转为XML
 h_index_format       = 索引格式变了，请建新索引
 h_int_parser         = 容错，而且比Java的默认解析器更快
 h_languauge          = 将使用根据语言确定的tokenizer
diff --git a/basex-core/src/main/resources/lang/Dutch.lang b/basex-core/src/main/resources/lang/Dutch.lang
index e52e0e9345..d507ea781f 100644
--- a/basex-core/src/main/resources/lang/Dutch.lang
+++ b/basex-core/src/main/resources/lang/Dutch.lang
@@ -417,7 +417,7 @@ h_db_format          = Het database formaat is gewijzigd; maak een nieuwe databa
 h_db_options_%       = The options will be assigned if '%' is executed. 
 h_diacritics         = Diakritische tekens worden gebruikt in de index.
 h_fulltext_index     = Een full-text index versnelt full-text queries.
-h_html_parser        = De TagSoup parser zal gebruikt worden om HTML naar XML te converteren.
+h_html_parser        = De Validator.nu HTML parser zal gebruikt worden om HTML naar XML te converteren.
 h_index_format       = Het index formaat is gewijzigd; maak nieuwe indexen.
 h_int_parser         = Robuuster en sneller dan Java's standaard parser.
 h_languauge          = Met deze optie zullen taalspecifieke parsers worden gebruikt.
diff --git a/basex-core/src/main/resources/lang/English.lang b/basex-core/src/main/resources/lang/English.lang
index 0b093bd970..5cc46cc7ed 100644
--- a/basex-core/src/main/resources/lang/English.lang
+++ b/basex-core/src/main/resources/lang/English.lang
@@ -417,7 +417,7 @@ h_db_format          = The database format has changed; please use a newer versi
 h_db_options_%       = The options will be assigned if '%' is executed.
 h_diacritics         = Diacritics are retained in the index.
 h_fulltext_index     = A full-text index speeds up full-text queries.
-h_html_parser        = The TagSoup parser will be used to convert HTML to XML.
+h_html_parser        = The Validator.nu HTML parser will be used to convert HTML to XML.
 h_index_format       = The index format has changed; please create new indexes.
 h_int_parser         = Fault tolerant, and faster than Java’s default parser.
 h_languauge          = Language specific tokenizers will be used.
diff --git a/basex-core/src/main/resources/lang/French.lang b/basex-core/src/main/resources/lang/French.lang
index bcbb7a3f41..2f10e53bcf 100644
--- a/basex-core/src/main/resources/lang/French.lang
+++ b/basex-core/src/main/resources/lang/French.lang
@@ -417,7 +417,7 @@ h_db_format          = Le format de base de données a changé ; Veuillez créer
 h_db_options_%       = Les options seront assignées si on exécute '%'.
 h_diacritics         = Les signes diacritiques sont conservés dans l’index.
 h_fulltext_index     = Un index plein texte accélère les requêtes plein texte.
-h_html_parser        = Le parser TagSoup sera utilisé pour convertir le HTML en XML.
+h_html_parser        = Le parser HTML Validator.nu sera utilisé pour convertir le HTML en XML.
 h_index_format       = Le format des index a changé ;  Veuillez créer de nouveaux index.
 h_int_parser         = Tolérant aux fautes, et plus rapide que le parser Java par défaut.
 h_languauge          = Des analyseurs spécifiques à la langue vont être utilisés.
diff --git a/basex-core/src/main/resources/lang/German.lang b/basex-core/src/main/resources/lang/German.lang
index 8a674e17c1..a4056c130d 100644
--- a/basex-core/src/main/resources/lang/German.lang
+++ b/basex-core/src/main/resources/lang/German.lang
@@ -417,7 +417,7 @@ h_db_format          = Das Datenbankformat hat sich geändert; bitte verwenden S
 h_db_options_%       = Die Optionen werden zugewiesen, wenn '%' ausgeführt wird.
 h_diacritics         = Diakritische Zeichen werden im Index beibehalten.
 h_fulltext_index     = Ein Volltext-Index beschleunigt Volltext-Anfragen.
-h_html_parser        = Der TagSoup-Parser wird verwendet, um HTML in XML zu konvertieren.
+h_html_parser        = Der Validator.nu HTML-Parser wird verwendet, um HTML in XML zu konvertieren.
 h_index_format       = Das Indexformat hat sich geändert; bitte erstellen Sie neue Indizes.
 h_int_parser         = Fehlertolerant und schneller als Javas XML-Parser.
 h_languauge          = Sprachspezifische Tokenisierung wird verwendet.
diff --git a/basex-core/src/main/resources/lang/Hungarian.lang b/basex-core/src/main/resources/lang/Hungarian.lang
index bfe804283e..41a1c781c5 100644
--- a/basex-core/src/main/resources/lang/Hungarian.lang
+++ b/basex-core/src/main/resources/lang/Hungarian.lang
@@ -417,7 +417,7 @@ h_db_format          = Az adatbázis formátuma megváltozott; kérem, használj
 h_db_options_%       = Ezek az beállítások csak a következő futtása után lépnek életbe: '%' 
 h_diacritics         = Ékezetek megmaradnak az indexelésben.
 h_fulltext_index     = A teljes-szöveg index gyorsítja a teljes-szöveges (full-text) lekérdezéseket.
-h_html_parser        = A TagSoup elemző HTML formátumot konvertál XML formátumra.
+h_html_parser        = A Validator.nu elemző HTML formátumot konvertál XML formátumra.
 h_index_format       = Az index formátuma megváltozott; kérem, készítsen új indexeket.
 h_int_parser         = Hibatűrő, továbbá a Java alapértelmezett elemzőjénél gyorsabb.
 h_languauge          = Nyelvfüggő szövegelemzések is használatra kerülnek.
diff --git a/basex-core/src/main/resources/lang/Indonesian.lang b/basex-core/src/main/resources/lang/Indonesian.lang
index c23c842fa2..096d98f267 100644
--- a/basex-core/src/main/resources/lang/Indonesian.lang
+++ b/basex-core/src/main/resources/lang/Indonesian.lang
@@ -417,7 +417,7 @@ h_db_format          = Bentuk basisdata telah berubah; mohon gunakan versi yang
 h_db_options_%       = Pilihan akan digunakan jika '%' dijalankan. 
 h_diacritics         = Diakritik dipertahankan dalam indeks.
 h_fulltext_index     = Indeks semua teks mempercepat kueri teks penuh.
-h_html_parser        = Pengurai TagSoup akan digunakan untuk mengubah HTML menjadi XML.
+h_html_parser        = Pengurai Validator.nu akan digunakan untuk mengubah HTML menjadi XML.
 h_index_format       = Bentuk indeks telah berubah; mohon buat indeks baru.
 h_int_parser         = Toleran kesalahan, dan lebih cepat dari pengurai standar Java.
 h_languauge          = Pengurai teks bahasa tertentu akan digunakan.
diff --git a/basex-core/src/main/resources/lang/Italian.lang b/basex-core/src/main/resources/lang/Italian.lang
index f95804268d..dcb10fb853 100644
--- a/basex-core/src/main/resources/lang/Italian.lang
+++ b/basex-core/src/main/resources/lang/Italian.lang
@@ -417,7 +417,7 @@ h_db_format          = Il formato della base di dati è cambiato; creare una nuo
 h_db_options_%       = The options will be assigned if '%' is executed. 
 h_diacritics         = I segni diacritici sono conservati nell'indice.
 h_fulltext_index     = Un indice "full-text" velocizza le interrogazioni sul testo.
-h_html_parser        = Il parser TagSoup verrò usato per convertire HTML in XML.
+h_html_parser        = Il parser Validator.nu verrò usato per convertire HTML in XML.
 h_index_format       = Il formato degli indici è cambiato; creare nuovi indici.
 h_int_parser         = Tollerante ai guasti e più veloce del parser di default di Java.
 h_languauge          = Parser di testo specifici per la lingua verranno usati
diff --git a/basex-core/src/main/resources/lang/Japanese.lang b/basex-core/src/main/resources/lang/Japanese.lang
index e12f3734f7..1aa797d681 100644
--- a/basex-core/src/main/resources/lang/Japanese.lang
+++ b/basex-core/src/main/resources/lang/Japanese.lang
@@ -417,7 +417,7 @@ h_db_format          = データベース形式を変更しました。新しい
 h_db_options_%       = % 実行時にオプションが割り当てられます。
 h_diacritics         = インデックス内で付加記号（ウムラウト等）は保持されます。
 h_fulltext_index     = 全文テキストインデックスは全文検索を高速化します。
-h_html_parser        = TagSoup パーサは HTML を XML に変換します。
+h_html_parser        = Validator.nu パーサは HTML を XML に変換します。
 h_index_format       = インデックス形式を変更しました。新しくインデックスを作成して下さい。
 h_int_parser         = フォールトトレラント、Javaのデフォルトパーサより高速。
 h_languauge          = 指定された言語のテキストパーサが使用されます。
diff --git a/basex-core/src/main/resources/lang/Mongolian.lang b/basex-core/src/main/resources/lang/Mongolian.lang
index 5834f92c48..995f0d178f 100644
--- a/basex-core/src/main/resources/lang/Mongolian.lang
+++ b/basex-core/src/main/resources/lang/Mongolian.lang
@@ -417,7 +417,7 @@ h_db_format          = Өгөгдлийн сангийн формат өөрчл
 h_db_options_%       = The options will be assigned if '%' is executed. 
 h_diacritics         = Индекс дэх санах тэмдгийг авч үлдэх.
 h_fulltext_index     = Бүтэн текст индекс нь бүрэн текст квериг хурдан ажиллагаатай болгоно.
-h_html_parser        = The TagSoup parser will be used to convert HTML to XML.
+h_html_parser        = The Validator.nu HTML parser will be used to convert HTML to XML.
 h_index_format       = Индекс формат өөрчлөгдсөн байна; шинээр үүсгэнэ үү.
 h_int_parser         = Fault tolerant, and faster than Java’s default parser.
 h_languauge          = Хэлний текст Parser тодорхойлогдох болно.
diff --git a/basex-core/src/main/resources/lang/Romanian.lang b/basex-core/src/main/resources/lang/Romanian.lang
index 7903adde32..5cae6261d3 100644
--- a/basex-core/src/main/resources/lang/Romanian.lang
+++ b/basex-core/src/main/resources/lang/Romanian.lang
@@ -417,7 +417,7 @@ h_db_format          = Formatul bazei de date a fost schimbat, vă rugăm să fo
 h_db_options_%       = Optiunile vor fi asignate daca '%' este executată.
 h_diacritics         = Diacritice sunt păstrate în index.
 h_fulltext_index     = Un full-text index accelereaza interogările full-text.
-h_html_parser        = Parserul TagSoup va fi folosit pentru a converti HTML în XML.
+h_html_parser        = Parserul "Validator.nu" va fi folosit pentru a converti HTML în XML.
 h_index_format       = Formatul index s-a schimbat, vă rugăm creati noi indici.
 h_int_parser         = Tolerant la greseli si mai rapid decat parserul default Java.
 h_languauge          = Parsere de text specifice limbii vor fi folosite.
diff --git a/basex-core/src/main/resources/lang/Russian.lang b/basex-core/src/main/resources/lang/Russian.lang
index 01be29bb23..d520598dbd 100644
--- a/basex-core/src/main/resources/lang/Russian.lang
+++ b/basex-core/src/main/resources/lang/Russian.lang
@@ -417,7 +417,7 @@ h_db_format          = Формат хранения баз данных был
 h_db_options_%       = Эти опции будут изменены только после выполнения команды [%] 
 h_diacritics         = Разделительные знаки будут включены в индекс
 h_fulltext_index     = Полнотекстовый индекс ускоряет соответствующие запросы
-h_html_parser        = Для конвертации HTML в XML будет использован парсер TagSoup
+h_html_parser        = Для конвертации HTML в XML будет использован парсер Validator.nu
 h_index_format       = Формат хранения индексов был изменен. Пожалуйста, создайте индексы заново.
 h_int_parser         = Толерантный к ошибкам и быстрее чем стандартный парсер Java
 h_languauge          = Будут использованы специализированные под каждый язык парсеры
diff --git a/basex-core/src/main/resources/lang/Spanish.lang b/basex-core/src/main/resources/lang/Spanish.lang
index 61439dc026..1d10d504bd 100644
--- a/basex-core/src/main/resources/lang/Spanish.lang
+++ b/basex-core/src/main/resources/lang/Spanish.lang
@@ -417,7 +417,7 @@ h_db_format          = El formato de la Base de Datos ha cambiado; por favor uti
 h_db_options_%       = Las opciónes serán asignado si se ejecuta '%'. 
 h_diacritics         = Las diacríticas están retenidas en el índice.
 h_fulltext_index     = Un índice de Texto Completo acelera las consulta de Texto Completo.
-h_html_parser        = Se utilizará el Analizador Sintáctico TagSoup para convertir HTML a XML.
+h_html_parser        = Se utilizará el Analizador Sintáctico Validator.nu para convertir HTML a XML.
 h_index_format       = El formato del índice ha cambiado; for favor, cree nuevos índices.
 h_int_parser         = Tolerante a fallos, y más rápido que el analizador sintáctico por defecto de Java.
 h_languauge          = Se utilizarán analizadores sintácticos de texto específicos del lenguaje.
diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
index 49e1539d5f..f21ec21461 100644
--- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
@@ -1468,8 +1468,8 @@ public final class FnModuleTest extends SandboxTest {
 
     error(func.args(42), STRBIN_X_X);
     error(func.args(" \"42\"", 42), MAP_X_X);
-    error(func.args(" \"42\"", " map {'1234': ()}"), INVALIDOPT_X);
-    error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVALIDOPT_X);
+    error(func.args(" \"42\"", " map {'1234': ()}"), INVHTMLOPT_X);
+    error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVHTMLOPT_X);
   }
 
   /** Test method. */
diff --git a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
index 59636312bf..e85fedcf5a 100644
--- a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
@@ -20,8 +20,7 @@ public final class HtmlModuleTest extends SandboxTest {
     query(func.args(" <_/>/text()"), "");
 
     final String path = "src/test/resources/input.html";
-    query(func.args(path) + "//body ! name()", "body");
-    query(func.args(path, " map { 'nons': false() }") + "//*:body ! name()", "body");
+    query(func.args(path) + "//*:body ! name()", "body");
   }
 
   /** Test method. */
@@ -33,7 +32,8 @@ public final class HtmlModuleTest extends SandboxTest {
     // check if the function returns an HTML root node
     query("exists(" + func.args("&lt;html/&gt;") + "/*:html)", true);
     // check if the function returns <html/>
-    query(func.args("&lt;html/&gt;", " map { 'nons': true() }"), "<html/>");
+    query(func.args("&lt;html/&gt;"),
+        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body/></html>");
   }
 
   /** Test method. */
diff --git a/basex-examples/basex-examples.iml b/basex-examples/basex-examples.iml
index d68f7b229d..c8c6019d38 100644
--- a/basex-examples/basex-examples.iml
+++ b/basex-examples/basex-examples.iml
@@ -30,7 +30,7 @@
     <orderEntry type="library" name="Maven: org.slf4j:slf4j-simple:1.7.12" level="project" />
     <orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.12" level="project" />
     <orderEntry type="library" name="Maven: com.vividsolutions:jts:1.13" level="project" />
-    <orderEntry type="library" scope="RUNTIME" name="Maven: org.ccil.cowan.tagsoup:tagsoup:1.2.1" level="project" />
+    <orderEntry type="library" scope="RUNTIME" name="Maven: nu.validator:htmlparser:1.4.16" level="project" />
     <orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.12" level="project" />
     <orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" />
   </component>
diff --git a/basex-examples/pom.xml b/basex-examples/pom.xml
index 2e4dbd2db5..c350579a51 100644
--- a/basex-examples/pom.xml
+++ b/basex-examples/pom.xml
@@ -18,8 +18,8 @@
       <version>${project.version}</version>
     </dependency>
     <dependency>
-      <groupId>org.ccil.cowan.tagsoup</groupId>
-      <artifactId>tagsoup</artifactId>
+      <groupId>nu.validator</groupId>
+      <artifactId>htmlparser</artifactId>
     </dependency>
     <dependency>
       <groupId>org.junit.jupiter</groupId>
diff --git a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
index 3481bbcc26..40b96ffc23 100644
--- a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
+++ b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
@@ -6,7 +6,7 @@
 /**
  * This example demonstrates how to import a file in the HTML format
  * into the database. The specified input file will be converted to XML
- * if TagSoup is found in the classpath.
+ * if Validator.nu is found in the classpath.
  *
  * @author BaseX Team 2005-23, BSD License
  * @author Christian Gruen
diff --git a/pom.xml b/pom.xml
index 3d09d55fcb..24f5d352c1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -64,9 +64,9 @@
         <optional>true</optional>
       </dependency>
       <dependency>
-        <groupId>org.ccil.cowan.tagsoup</groupId>
-        <artifactId>tagsoup</artifactId>
-        <version>1.2.1</version>
+        <groupId>nu.validator</groupId>
+        <artifactId>htmlparser</artifactId>
+        <version>1.4.16</version>
         <scope>runtime</scope>
         <optional>true</optional>
       </dependency>
@@ -174,13 +174,6 @@
         <scope>runtime</scope>
         <optional>true</optional>
       </dependency>
-      <dependency>
-        <groupId>nu.validator</groupId>
-        <artifactId>htmlparser</artifactId>
-        <version>1.4.16</version>
-        <scope>runtime</scope>
-        <optional>true</optional>
-      </dependency>
     </dependencies>
   </dependencyManagement>
 

From e99d48b0e097f8162bdb6e4fd38fffe36053322d Mon Sep 17 00:00:00 2001
From: Gunther Rademacher <grd@gmx.net>
Date: Mon, 13 Nov 2023 12:41:32 +0100
Subject: [PATCH 5/9] set scope=compile; handle dependencies of "heuristics"
 setting; test meta/@charset

---
 basex-core/pom.xml                            |  2 +-
 .../java/org/basex/query/func/Function.java   |  2 +-
 .../org/basex/query/func/html/HtmlParse.java  | 32 +++++++++++++++++++
 .../org/basex/query/func/FnModuleTest.java    | 18 ++++++-----
 4 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/basex-core/pom.xml b/basex-core/pom.xml
index f3c6d14885..33a2503167 100644
--- a/basex-core/pom.xml
+++ b/basex-core/pom.xml
@@ -32,7 +32,7 @@
     <dependency>
       <groupId>nu.validator</groupId>
       <artifactId>htmlparser</artifactId>
-      <scope>provided</scope>
+      <scope>compile</scope>
       <optional>true</optional>
     </dependency>
     <dependency>
diff --git a/basex-core/src/main/java/org/basex/query/func/Function.java b/basex-core/src/main/java/org/basex/query/func/Function.java
index c58cbe4b6a..8392399bf7 100644
--- a/basex-core/src/main/java/org/basex/query/func/Function.java
+++ b/basex-core/src/main/java/org/basex/query/func/Function.java
@@ -468,7 +468,7 @@ ITEM_ZM, flag(HOF)),
       params(STRING_ZO), DATE_TIME_ZO),
   /** XQuery function. */
   PARSE_HTML(FnParseHtml::new, "parse-html(html[,options])",
-      params(ITEM_ZO, MAP_O), DOCUMENT_NODE_ZO),
+      params(ANY_ATOMIC_TYPE_ZO, MAP_O), DOCUMENT_NODE_ZO),
   /** XQuery function. */
   PARSE_INTEGER(FnParseInteger::new, "parse-integer(value[,radix])",
       params(STRING_O, INTEGER_O), INTEGER_O),
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
index 576fc82b4d..c7cf9d24af 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
@@ -1,5 +1,6 @@
 package org.basex.query.func.html;
 
+import static org.basex.build.html.HtmlOptions.*;
 import static org.basex.query.QueryError.*;
 
 import java.io.*;
@@ -22,6 +23,12 @@
  * @author Christian Gruen
  */
 public class HtmlParse extends StandardFunc {
+  /** Class needed for heuristics=ICU. */
+  private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector";
+  /** Class needed for heuristics=CHARDET. */
+  private static final String CHARDET_CLASS_NAME =
+      "org.mozilla.intl.chardet.nsICharsetDetectionObserver";
+
   @Override
   public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
     final Item value = arg(0).atomItem(qc, info);
@@ -45,10 +52,35 @@ protected final Expr opt(final CompileContext cc) {
    */
   protected final Item parse(final IO io, final QueryContext qc) throws QueryException {
     final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), INVHTMLOPT_X, qc);
+    if(options.contains(HEURISTICS)) { // check classes needed by the heuristics before parsing
+      switch (options.get(HEURISTICS)) {
+      case ALL: // requires both the ICU and the CHARDET class
+        ensureAvailable(ICU_CLASS_NAME);
+        ensureAvailable(CHARDET_CLASS_NAME);
+        break;
+      case ICU:
+        ensureAvailable(ICU_CLASS_NAME);
+        break;
+      case CHARDET:
+        ensureAvailable(CHARDET_CLASS_NAME);
+        break;
+      default: // any other value: no class is checked
+      }
+    }
     try {
       return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), options));
     } catch(final IOException ex) {
       throw INVHTML_X.get(info, ex);
     }
   }
+
+  /**
+   * Ensures that a required class is available; raises an error otherwise.
+   * @param className name of the class to be checked
+   * @throws QueryException if the class is reported as unavailable
+   */
+  private void ensureAvailable(final String className) throws QueryException {
+    if(!Reflect.available(className))
+      throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className);
+  }
 }
diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
index e4eaf33975..44b7e3b8bb 100644
--- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
@@ -1480,16 +1480,18 @@ public final class FnModuleTest extends SandboxTest {
         " map {'encoding': '" + Strings.UTF16LE + "', 'xml-policy': 'ALTER_INFOSET'}"),
         "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><meta charset=\"" + Strings.UTF16LE
         + "\"/></head><body>42</body></html>");
-    query(func.args(_CONVERT_STRING_TO_BASE64.args("<html><head><meta charset='" + Strings.UTF16BE
-        + "'></head><body>42</body>", Strings.UTF16BE),
-        " map {'encoding': '" + Strings.UTF16BE + "', 'heuristics': 'NONE'}"),
-        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><meta charset=\"" + Strings.UTF16BE
-        + "\"/></head><body>42</body></html>");
+    query(func.args(_CONVERT_STRING_TO_BASE64.args("<html><head><meta charset='ISO-8859-7'></head>"
+        + "<body>\u20AC</body>", "ISO-8859-7"), " map {'heuristics': 'NONE'}"),
+        "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><meta charset=\"ISO-8859-7\"/></head>"
+        + "<body>\u20AC</body></html>");
 
     error(func.args(42), STRBIN_X_X);
-    error(func.args(" \"42\"", 42), MAP_X_X);
-    error(func.args(" \"42\"", " map {'1234': ()}"), INVHTMLOPT_X);
-    error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVHTMLOPT_X);
+    error(func.args("42", 42), MAP_X_X);
+    error(func.args("42", " map {'1234': ''}"), INVHTMLOPT_X);
+    error(func.args("42", " map {'heuristics': '5678'}"), INVHTMLOPT_X);
+    error(func.args("42", " map {'heuristics': 'CHARDET'}"), BASEX_CLASSPATH_X_X);
+    error(func.args("42", " map {'heuristics': 'ICU'}"), BASEX_CLASSPATH_X_X);
+    error(func.args("42", " map {'heuristics': 'ALL'}"), BASEX_CLASSPATH_X_X);
   }
 
   /** Test method. */

From 7dd830e38f6b5b7a601943ecb1196d343d0b7878 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher <grd@gmx.net>
Date: Mon, 13 Nov 2023 13:04:00 +0100
Subject: [PATCH 6/9] polish Javadoc; define input after options; shorten parser label

---
 .../org/basex/build/html/HtmlOptions.java     | 30 +++++++++----------
 .../java/org/basex/build/html/HtmlParser.java |  7 ++---
 .../org/basex/query/func/html/HtmlParse.java  |  4 +--
 basex-core/src/main/resources/lang/Dutch.lang |  2 +-
 .../src/main/resources/lang/English.lang      |  2 +-
 .../src/main/resources/lang/French.lang       |  2 +-
 .../src/main/resources/lang/German.lang       |  2 +-
 .../src/main/resources/lang/Mongolian.lang    |  2 +-
 8 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
index a6ce8f6cc9..311a854066 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
@@ -9,47 +9,47 @@
  * @author Christian Gruen
  */
 public final class HtmlOptions extends Options {
-  /** fn:parse-html option encoding. */
+  /** fn:parse-html option: encoding. */
   public static final StringOption ENCODING = new StringOption("encoding");
-  /** fn:parse-html option method. */
+  /** fn:parse-html option: method. */
   public static final StringOption METHOD = new StringOption("method");
-  /** fn:parse-html option html-version. */
+  /** fn:parse-html option: html-version. */
   public static final StringOption HTML_VERSION = new StringOption("html-version");
-  /** fn:parse-html option include-template-content. */
+  /** fn:parse-html option: include-template-content. */
   public static final BooleanOption INCLUDE_TEMPLATE_CONTENT =
       new BooleanOption("include-template-content");
 
-  /** Validator.nu option unicode-normalization-checking. */
+  /** Validator.nu option: unicode-normalization-checking. */
   public static final BooleanOption UNICODE_NORMALIZATION_CHECKING =
       new BooleanOption("unicode-normalization-checking", false);
-  /** Validator.nu option mapping-lang-to-xml-lang. */
+  /** Validator.nu option: mapping-lang-to-xml-lang. */
   public static final BooleanOption MAPPING_LANG_TO_XML_LANG =
       new BooleanOption("mapping-lang-to-xml-lang", false);
-  /** Validator.nu option scripting-enabled. */
+  /** Validator.nu option: scripting-enabled. */
   public static final BooleanOption SCRIPTING_ENABLED =
       new BooleanOption("scripting-enabled", false);
-  /** Validator.nu option content-space-policy. */
+  /** Validator.nu option: content-space-policy. */
   public static final EnumOption<XmlViolationPolicy> CONTENT_SPACE_POLICY =
       new EnumOption<>("content-space-policy", XmlViolationPolicy.class);
-  /** Validator.nu option content-non-xml-char-policy. */
+  /** Validator.nu option: content-non-xml-char-policy. */
   public static final EnumOption<XmlViolationPolicy> CONTENT_NON_XML_CHAR_POLICY =
       new EnumOption<>("content-non-xml-char-policy", XmlViolationPolicy.class);
-  /** Validator.nu option comment-policy. */
+  /** Validator.nu option: comment-policy. */
   public static final EnumOption<XmlViolationPolicy> COMMENT_POLICY =
       new EnumOption<>("comment-policy", XmlViolationPolicy.class);
-  /** Validator.nu option xmlns-policy. */
+  /** Validator.nu option: xmlns-policy. */
   public static final EnumOption<XmlViolationPolicy> XMLNS_POLICY =
       new EnumOption<>("xmlns-policy", XmlViolationPolicy.class);
-  /** Validator.nu option name-policy. */
+  /** Validator.nu option: name-policy. */
   public static final EnumOption<XmlViolationPolicy> NAME_POLICY =
       new EnumOption<>("name-policy", XmlViolationPolicy.class);
-  /** Validator.nu option streamability-violation-policy. */
+  /** Validator.nu option: streamability-violation-policy. */
   public static final EnumOption<XmlViolationPolicy> STREAMABILITY_VIOLATION_POLICY =
       new EnumOption<>("streamability-violation-policy", XmlViolationPolicy.class);
-  /** Validator.nu option xml-policy. */
+  /** Validator.nu option: xml-policy. */
   public static final EnumOption<XmlViolationPolicy> XML_POLICY =
       new EnumOption<>("xml-policy", XmlViolationPolicy.class);
-  /** Validator.nu option heuristics. */
+  /** Validator.nu option: heuristics. */
   public static final EnumOption<Heuristics> HEURISTICS =
       new EnumOption<>("heuristics", Heuristics.class);
 
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
index 773defb9db..8fda826216 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
@@ -104,9 +104,6 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException
       reader.setContentHandler(writer);
       reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
 
-      // define input
-      final InputSource is = new InputSource(io.inputStream());
-
       // set Validator.nu options
       if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
         reader.setCheckingNormalization(true);
@@ -135,11 +132,13 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException
         reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
       // end Validator.nu options
 
+      // define input
+      final InputSource is = new InputSource(io.inputStream());
       String enc = io.encoding() != null
           ? io.encoding()
           : hopts.contains(ENCODING)
             ? hopts.get(HtmlOptions.ENCODING)
-            : null; // TODO: sniff encoding
+            : null;
       if (enc != null) {
         if (!Strings.supported(enc))
           throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.');
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
index c7cf9d24af..41bbf4e1e1 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
@@ -23,9 +23,9 @@
  * @author Christian Gruen
  */
 public class HtmlParse extends StandardFunc {
-  /** Class needed for heuristics=ICU. */
+  /** Class needed for option heuristics=ICU. */
   private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector";
-  /** Class needed for heuristics=CHARDET. */
+  /** Class needed for option heuristics=CHARDET. */
   private static final String CHARDET_CLASS_NAME =
       "org.mozilla.intl.chardet.nsICharsetDetectionObserver";
 
diff --git a/basex-core/src/main/resources/lang/Dutch.lang b/basex-core/src/main/resources/lang/Dutch.lang
index d507ea781f..b438ac5864 100644
--- a/basex-core/src/main/resources/lang/Dutch.lang
+++ b/basex-core/src/main/resources/lang/Dutch.lang
@@ -417,7 +417,7 @@ h_db_format          = Het database formaat is gewijzigd; maak een nieuwe databa
 h_db_options_%       = The options will be assigned if '%' is executed. 
 h_diacritics         = Diakritische tekens worden gebruikt in de index.
 h_fulltext_index     = Een full-text index versnelt full-text queries.
-h_html_parser        = De Validator.nu HTML parser zal gebruikt worden om HTML naar XML te converteren.
+h_html_parser        = De Validator.nu parser zal gebruikt worden om HTML naar XML te converteren.
 h_index_format       = Het index formaat is gewijzigd; maak nieuwe indexen.
 h_int_parser         = Robuuster en sneller dan Java's standaard parser.
 h_languauge          = Met deze optie zullen taalspecifieke parsers worden gebruikt.
diff --git a/basex-core/src/main/resources/lang/English.lang b/basex-core/src/main/resources/lang/English.lang
index 5cc46cc7ed..9cc5ca7d20 100644
--- a/basex-core/src/main/resources/lang/English.lang
+++ b/basex-core/src/main/resources/lang/English.lang
@@ -417,7 +417,7 @@ h_db_format          = The database format has changed; please use a newer versi
 h_db_options_%       = The options will be assigned if '%' is executed.
 h_diacritics         = Diacritics are retained in the index.
 h_fulltext_index     = A full-text index speeds up full-text queries.
-h_html_parser        = The Validator.nu HTML parser will be used to convert HTML to XML.
+h_html_parser        = The Validator.nu parser will be used to convert HTML to XML.
 h_index_format       = The index format has changed; please create new indexes.
 h_int_parser         = Fault tolerant, and faster than Java’s default parser.
 h_languauge          = Language specific tokenizers will be used.
diff --git a/basex-core/src/main/resources/lang/French.lang b/basex-core/src/main/resources/lang/French.lang
index 2f10e53bcf..7fd856822b 100644
--- a/basex-core/src/main/resources/lang/French.lang
+++ b/basex-core/src/main/resources/lang/French.lang
@@ -417,7 +417,7 @@ h_db_format          = Le format de base de données a changé ; Veuillez créer
 h_db_options_%       = Les options seront assignées si on exécute '%'.
 h_diacritics         = Les signes diacritiques sont conservés dans l’index.
 h_fulltext_index     = Un index plein texte accélère les requêtes plein texte.
-h_html_parser        = Le parser HTML Validator.nu sera utilisé pour convertir le HTML en XML.
+h_html_parser        = Le parser Validator.nu sera utilisé pour convertir le HTML en XML.
 h_index_format       = Le format des index a changé ;  Veuillez créer de nouveaux index.
 h_int_parser         = Tolérant aux fautes, et plus rapide que le parser Java par défaut.
 h_languauge          = Des analyseurs spécifiques à la langue vont être utilisés.
diff --git a/basex-core/src/main/resources/lang/German.lang b/basex-core/src/main/resources/lang/German.lang
index a4056c130d..1226bac232 100644
--- a/basex-core/src/main/resources/lang/German.lang
+++ b/basex-core/src/main/resources/lang/German.lang
@@ -417,7 +417,7 @@ h_db_format          = Das Datenbankformat hat sich geändert; bitte verwenden S
 h_db_options_%       = Die Optionen werden zugewiesen, wenn '%' ausgeführt wird.
 h_diacritics         = Diakritische Zeichen werden im Index beibehalten.
 h_fulltext_index     = Ein Volltext-Index beschleunigt Volltext-Anfragen.
-h_html_parser        = Der Validator.nu HTML-Parser wird verwendet, um HTML in XML zu konvertieren.
+h_html_parser        = Der Validator.nu-Parser wird verwendet, um HTML in XML zu konvertieren.
 h_index_format       = Das Indexformat hat sich geändert; bitte erstellen Sie neue Indizes.
 h_int_parser         = Fehlertolerant und schneller als Javas XML-Parser.
 h_languauge          = Sprachspezifische Tokenisierung wird verwendet.
diff --git a/basex-core/src/main/resources/lang/Mongolian.lang b/basex-core/src/main/resources/lang/Mongolian.lang
index 995f0d178f..bbc2140b6e 100644
--- a/basex-core/src/main/resources/lang/Mongolian.lang
+++ b/basex-core/src/main/resources/lang/Mongolian.lang
@@ -417,7 +417,7 @@ h_db_format          = Өгөгдлийн сангийн формат өөрчл
 h_db_options_%       = The options will be assigned if '%' is executed. 
 h_diacritics         = Индекс дэх санах тэмдгийг авч үлдэх.
 h_fulltext_index     = Бүтэн текст индекс нь бүрэн текст квериг хурдан ажиллагаатай болгоно.
-h_html_parser        = The Validator.nu HTML parser will be used to convert HTML to XML.
+h_html_parser        = The Validator.nu parser will be used to convert HTML to XML.
 h_index_format       = Индекс формат өөрчлөгдсөн байна; шинээр үүсгэнэ үү.
 h_int_parser         = Fault tolerant, and faster than Java’s default parser.
 h_languauge          = Хэлний текст Parser тодорхойлогдох болно.

From ff514e1d2f5ead11d913f23853ad9f5e9e74b4ad Mon Sep 17 00:00:00 2001
From: Gunther Rademacher <grd@gmx.net>
Date: Mon, 13 Nov 2023 13:23:53 +0100
Subject: [PATCH 7/9] fix grammar in HtmlParser.available() Javadoc

---
 basex-core/src/main/java/org/basex/build/html/HtmlParser.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
index 8fda826216..5877ce1498 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
@@ -33,7 +33,7 @@ public final class HtmlParser extends XMLParser {
   private static final String NAME = "Validator.nu";
 
   /**
-   * Checks if a Validator.nu is available.
+   * Checks if Validator.nu is available.
    * @return result of check
    */
   public static boolean available() {

From 44f5d131e204e0a63ad1c86a584366745dea1306 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher <grd@gmx.net>
Date: Wed, 22 Jan 2025 18:35:01 +0100
Subject: [PATCH 8/9] support both TagSoup and Validator.NU

---
 basex-core/pom.xml                            |   6 +
 .../org/basex/build/html/HtmlOptions.java     |  41 +-
 .../java/org/basex/build/html/HtmlParser.java | 362 ++++++++++++++----
 .../src/main/java/org/basex/core/Text.java    |   2 +-
 .../basex/gui/dialog/DialogHtmlParser.java    |  10 +-
 .../basex/query/func/html/FnParseHtml.java    |   6 +-
 .../org/basex/query/func/html/HtmlDoc.java    |   3 +-
 .../org/basex/query/func/html/HtmlParse.java  |  61 ++-
 .../org/basex/query/func/html/HtmlParser.java |   6 +-
 .../src/main/resources/lang/Chinese.lang      |   2 +-
 basex-core/src/main/resources/lang/Dutch.lang |   2 +-
 .../src/main/resources/lang/English.lang      |   2 +-
 .../src/main/resources/lang/French.lang       |   2 +-
 .../src/main/resources/lang/German.lang       |   2 +-
 .../src/main/resources/lang/Hungarian.lang    |   2 +-
 .../src/main/resources/lang/Indonesian.lang   |   2 +-
 .../src/main/resources/lang/Italian.lang      |   2 +-
 .../src/main/resources/lang/Japanese.lang     |   2 +-
 .../src/main/resources/lang/Mongolian.lang    |   2 +-
 .../src/main/resources/lang/Romanian.lang     |   2 +-
 .../src/main/resources/lang/Russian.lang      |   2 +-
 .../src/main/resources/lang/Spanish.lang      |   2 +-
 .../org/basex/query/func/HtmlModuleTest.java  |   8 +-
 basex-examples/basex-examples.iml             |   2 +-
 basex-examples/pom.xml                        |   4 +-
 .../basex/examples/create/HTMLExample.java    |   2 +-
 pom.xml                                       |   7 +
 27 files changed, 402 insertions(+), 144 deletions(-)

diff --git a/basex-core/pom.xml b/basex-core/pom.xml
index ee62cdd08e..79b46a0dc5 100644
--- a/basex-core/pom.xml
+++ b/basex-core/pom.xml
@@ -29,6 +29,12 @@
       <artifactId>lucene-stemmers</artifactId>
       <optional>true</optional>
     </dependency>
+    <dependency>
+      <groupId>org.ccil.cowan.tagsoup</groupId>
+      <artifactId>tagsoup</artifactId>
+      <scope>compile</scope>
+      <optional>true</optional>
+    </dependency>
     <dependency>
       <groupId>nu.validator</groupId>
       <artifactId>htmlparser</artifactId>
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
index e1271747d4..7eebede2ce 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
@@ -9,10 +9,45 @@
  * @author Christian Gruen
  */
 public final class HtmlOptions extends Options {
-  /** fn:parse-html option: encoding. */
+  /** TagSoup option: html. */
+  public static final BooleanOption HTML = new BooleanOption("html", false);
+  /** TagSoup option: omit-xml-declaration. */
+  public static final BooleanOption OMIT_XML_DECLARATION =
+      new BooleanOption("omit-xml-declaration", false);
+  /** TagSoup option: nons. */
+  public static final BooleanOption NONS = new BooleanOption("nons", true);
+  /** TagSoup option: nobogons. */
+  public static final BooleanOption NOBOGONS = new BooleanOption("nobogons", false);
+  /** TagSoup option: nodefaults. */
+  public static final BooleanOption NODEFAULTS = new BooleanOption("nodefaults", false);
+  /** TagSoup option: nocolons. */
+  public static final BooleanOption NOCOLONS = new BooleanOption("nocolons", false);
+  /** TagSoup option: norestart. */
+  public static final BooleanOption NORESTART = new BooleanOption("norestart", false);
+  /** TagSoup option: ignorable. */
+  public static final BooleanOption IGNORABLE = new BooleanOption("ignorable", false);
+  /** TagSoup option: emptybogons. */
+  public static final BooleanOption EMPTYBOGONS = new BooleanOption("emptybogons", false);
+  /** TagSoup option: any. */
+  public static final BooleanOption ANY = new BooleanOption("any", false);
+  /** TagSoup option: norootbogons. */
+  public static final BooleanOption NOROOTBOGONS = new BooleanOption("norootbogons", false);
+  /** TagSoup option: nocdata. */
+  public static final BooleanOption NOCDATA = new BooleanOption("nocdata", false);
+  /** TagSoup option: lexical. */
+  public static final BooleanOption LEXICAL = new BooleanOption("lexical", false);
+
+  /** TagSoup option: doctype-system=systemid. */
+  public static final StringOption DOCTYPE_SYSTEM = new StringOption("doctype-system");
+  /** TagSoup option: doctype-public=publicid. */
+  public static final StringOption DOCTYPE_PUBLIC = new StringOption("doctype-public");
+
+  /** Common option: encoding. */
   public static final StringOption ENCODING = new StringOption("encoding");
-  /** fn:parse-html option: method. */
-  public static final StringOption METHOD = new StringOption("method");
+  /** Common option: method. */
+  public static final EnumOption<HtmlParser.Method> METHOD = new EnumOption<>("method",
+      HtmlParser.Method.class);
+
   /** fn:parse-html option: html-version. */
   public static final StringOption HTML_VERSION = new StringOption("html-version");
   /** fn:parse-html option: include-template-content. */
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
index 558cf1b064..3083a5519b 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
@@ -1,16 +1,18 @@
 package org.basex.build.html;
 
 import static org.basex.build.html.HtmlOptions.*;
+import static org.basex.build.html.HtmlOptions.NOCDATA;
 import static org.basex.query.QueryError.*;
 import static org.basex.util.Token.*;
 
 import java.io.*;
-import java.util.*;
 
 import org.basex.build.xml.*;
 import org.basex.core.*;
 import org.basex.io.*;
+import org.basex.query.*;
 import org.basex.util.*;
+import org.ccil.cowan.tagsoup.*;
 import org.xml.sax.*;
 
 import nu.validator.htmlparser.common.Heuristics;
@@ -29,109 +31,57 @@
  * @author Christian Gruen
  */
 public final class HtmlParser extends XMLParser {
-  /** Name of HTML Parser. */
-  private static final String NAME = "Validator.nu";
-
-  /**
-   * Checks if Validator.nu is available.
-   * @return result of check
-   */
-  public static boolean available() {
-    return firstUnavailableClass() == null;
-  }
-
   /**
-   * Check whether Validator.nu classes are available on the class path.
-   * @return the name of the first class that is not available, or null if all classes are available
-   */
-  public static String firstUnavailableClass() {
-    for(final String className : Arrays.asList("nu.validator.htmlparser.sax.HtmlParser",
-        "nu.validator.htmlparser.sax.XmlSerializer",
-        "nu.validator.htmlparser.common.XmlViolationPolicy",
-        "nu.validator.htmlparser.common.Heuristics")) {
-      if(!Reflect.available(className)) return className;
-    }
-    return null;
-  }
-
-  /**
-   * Returns the name of the parser, or an empty string.
-   * @return name of parser
+   * Constructor.
+   * @param source document source
+   * @param options main options
+   * @param hopts html options
+   * @throws IOException I/O exception
    */
-  public static String parser() {
-    return available() ? NAME : "";
+  public HtmlParser(final IO source, final MainOptions options, final HtmlOptions hopts)
+      throws IOException {
+    this(source, Parser.of(hopts), options, hopts);
   }
 
   /**
    * Constructor.
    * @param source document source
+   * @param parser parser to be used
    * @param options main options
    * @param hopts html options
    * @throws IOException I/O exception
    */
-  public HtmlParser(final IO source, final MainOptions options, final HtmlOptions hopts)
-      throws IOException {
-    super(toXml(source, hopts), options);
+  public HtmlParser(final IO source, final Parser parser, final MainOptions options,
+      final HtmlOptions hopts) throws IOException {
+    super(toXml(source, parser, hopts), options);
   }
 
   /**
    * Converts an HTML document to XML.
    * @param io io reference
+   * @param parser parser to be used
    * @param hopts html options
    * @return parser
    * @throws IOException I/O exception
    */
-  private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException {
+  private static IO toXml(final IO io, final Parser parser, final HtmlOptions hopts)
+      throws IOException {
     // reader could not be initialized; fall back to XML
-    if(!available()) return io;
-
+    if(!parser.available(hopts)) return io;
     try {
       // define output
       final StringWriter sw = new StringWriter();
-      final nu.validator.htmlparser.sax.HtmlParser reader =
-          new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
-      final ContentHandler writer = new XmlSerializer(sw);
-      reader.setContentHandler(writer);
-      reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
-
-      // set Validator.nu options
-      if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
-        reader.setCheckingNormalization(true);
-      if(hopts.get(MAPPING_LANG_TO_XML_LANG))
-        reader.setMappingLangToXmlLang(true);
-      if(hopts.get(SCRIPTING_ENABLED))
-        reader.setScriptingEnabled(true);
-      if(hopts.contains(CONTENT_SPACE_POLICY))
-        reader.setContentSpacePolicy(
-            XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name()));
-      if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY))
-        reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf(
-            hopts.get(CONTENT_NON_XML_CHAR_POLICY).name()));
-      if(hopts.contains(COMMENT_POLICY))
-        reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name()));
-      if(hopts.contains(XMLNS_POLICY))
-        reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name()));
-      if(hopts.contains(NAME_POLICY))
-        reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name()));
-      if(hopts.contains(STREAMABILITY_VIOLATION_POLICY))
-        reader.setStreamabilityViolationPolicy(
-            XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name()));
-      if(hopts.contains(XML_POLICY))
-        reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name()));
-      if(hopts.contains(HEURISTICS))
-        reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
-      // end Validator.nu options
+      final XMLReader reader = parser.reader(hopts, sw);
 
       // define input
       final InputSource is = new InputSource(io.inputStream());
-      String enc = io.encoding() != null
+      final String enc = io.encoding() != null
           ? io.encoding()
           : hopts.contains(ENCODING)
             ? hopts.get(HtmlOptions.ENCODING)
             : null;
       if (enc != null) {
-        if (!Strings.supported(enc))
-          throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.');
+        if(!Strings.supported(enc)) throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.');
         is.setEncoding(Strings.normEncoding(enc));
       }
 
@@ -143,4 +93,272 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException
       throw INVHTML_X.getIO(ex.getLocalizedMessage());
     }
   }
+
+  /** Method option values. */
+  public enum Method {
+    /** TagSoup parser with method 'xml'. */
+    xml(Parser.TAGSOUP),
+    /** TagSoup parser with method 'html'. */
+    html(Parser.TAGSOUP),
+    /** Validator.nu parser. */
+    nu(Parser.NU);
+
+    /** Parser associated with this method. */
+    public final Parser parser;
+
+    /**
+     * Constructor.
+     * @param parser parser associated with this method
+     */
+    Method(final Parser parser) {
+      this.parser = parser;
+    }
+  }
+
+  /** Parser type. */
+  public enum Parser {
+    /** TagSoup parser. */
+    TAGSOUP("TagSoup", "org.ccil.cowan.tagsoup.Parser") {
+
+      /** Common URI prefix of TagSoup-specific SAX features. */
+      private static final String FEATURES = "http://www.ccil.org/~cowan/tagsoup/features/";
+
+      @Override
+      public boolean fallbackToXml() {
+        return true;
+      }
+
+      @Override
+      XMLReader reader(final HtmlOptions hopts, final StringWriter sw) throws SAXException {
+        XMLReader reader = new org.ccil.cowan.tagsoup.Parser();
+        final XMLWriter writer = new XMLWriter(sw);
+        writer.setOutputProperty(ENCODING.name(), Strings.UTF8);
+        reader.setContentHandler(writer);
+
+        // set TagSoup options
+        if(hopts.get(HTML)) {
+          reader.setFeature("http://xml.org/sax/features/namespaces", false);
+          writer.setOutputProperty(METHOD.name(), "html");
+          writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes");
+        }
+        if(hopts.get(NONS))
+          reader.setFeature("http://xml.org/sax/features/namespaces", false);
+        if(hopts.get(NOBOGONS))
+          reader.setFeature(FEATURES + "ignore-bogons", true);
+        if(hopts.get(NODEFAULTS))
+          reader.setFeature(FEATURES + "default-attributes", false);
+        if(hopts.get(NOCOLONS))
+          reader.setFeature(FEATURES + "translate-colons", true);
+        if(hopts.get(NORESTART))
+          reader.setFeature(FEATURES + "restart-elements", false);
+        if(hopts.get(IGNORABLE))
+          reader.setFeature(FEATURES + "ignorable-whitespace", true);
+        if(hopts.get(EMPTYBOGONS))
+          reader.setFeature(FEATURES + "bogons-empty", true);
+        if(hopts.get(ANY))
+          reader.setFeature(FEATURES + "bogons-empty", false);
+        if(hopts.get(NOROOTBOGONS))
+          reader.setFeature(FEATURES + "root-bogons", false);
+        if(hopts.get(NOCDATA))
+          reader.setFeature(FEATURES + "cdata-elements", false);
+        if(hopts.get(LEXICAL))
+          reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
+        if(hopts.get(OMIT_XML_DECLARATION))
+          writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes");
+        if(hopts.contains(METHOD))
+          writer.setOutputProperty(METHOD.name(), hopts.get(METHOD).name());
+        if(hopts.contains(DOCTYPE_SYSTEM))
+          writer.setOutputProperty(DOCTYPE_SYSTEM.name(), hopts.get(DOCTYPE_SYSTEM));
+        if(hopts.contains(DOCTYPE_PUBLIC))
+          writer.setOutputProperty(DOCTYPE_PUBLIC.name(), hopts.get(DOCTYPE_PUBLIC));
+        return reader;
+      }
+    },
+
+    /** Validator.nu parser. */
+    NU("Validator.nu", "nu.validator.htmlparser.sax.HtmlParser",
+        "nu.validator.htmlparser.sax.XmlSerializer",
+        "nu.validator.htmlparser.common.XmlViolationPolicy",
+        "nu.validator.htmlparser.common.Heuristics") {
+
+      /** Class needed for option heuristics=ICU. */
+      private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector";
+      /** Class needed for option heuristics=CHARDET. */
+      private static final String CHARDET_CLASS_NAME =
+          "org.mozilla.intl.chardet.nsICharsetDetectionObserver";
+
+      @Override
+      public boolean fallbackToXml() {
+        return false;
+      }
+
+      @Override
+      XMLReader reader(final HtmlOptions hopts, final StringWriter sw) throws SAXException {
+        final nu.validator.htmlparser.sax.HtmlParser reader =
+            new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
+        final ContentHandler writer = new XmlSerializer(sw);
+        reader.setContentHandler(writer);
+        reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
+
+        if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
+          reader.setCheckingNormalization(true);
+        if(hopts.get(MAPPING_LANG_TO_XML_LANG))
+          reader.setMappingLangToXmlLang(true);
+        if(hopts.get(SCRIPTING_ENABLED))
+          reader.setScriptingEnabled(true);
+        if(hopts.contains(CONTENT_SPACE_POLICY))
+          reader.setContentSpacePolicy(
+              XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name()));
+        if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY))
+          reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf(
+              hopts.get(CONTENT_NON_XML_CHAR_POLICY).name()));
+        if(hopts.contains(COMMENT_POLICY))
+          reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name()));
+        if(hopts.contains(XMLNS_POLICY))
+          reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name()));
+        if(hopts.contains(NAME_POLICY))
+          reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name()));
+        if(hopts.contains(STREAMABILITY_VIOLATION_POLICY))
+          reader.setStreamabilityViolationPolicy(
+              XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name()));
+        if(hopts.contains(XML_POLICY))
+          reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name()));
+        if(hopts.contains(HEURISTICS))
+          reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
+        return reader;
+      }
+
+      @Override
+      public void ensureAvailable(final HtmlOptions options, final byte[] func,
+          final InputInfo info) throws QueryException {
+        super.ensureAvailable(options, func, info);
+        if(options.contains(HEURISTICS)) {
+          switch(options.get(HEURISTICS)) {
+          case ALL:
+            ensureAvailable(ICU_CLASS_NAME, func, info);
+            ensureAvailable(CHARDET_CLASS_NAME, func, info);
+            break;
+          case ICU:
+            ensureAvailable(ICU_CLASS_NAME, func, info);
+            break;
+          case CHARDET:
+            ensureAvailable(CHARDET_CLASS_NAME, func, info);
+            break;
+          default:
+          }
+        }
+      }
+
+      @Override
+      public boolean available(final HtmlOptions options) {
+        if(!super.available(options)) return false;
+        if(!options.contains(HEURISTICS)) return true;
+        switch(options.get(HEURISTICS)) {
+        case ALL:
+          if(!Reflect.available(ICU_CLASS_NAME)) return false;
+          if(!Reflect.available(CHARDET_CLASS_NAME)) return false;
+          break;
+        case ICU:
+          if(!Reflect.available(ICU_CLASS_NAME)) return false;
+          break;
+        case CHARDET:
+          if(!Reflect.available(CHARDET_CLASS_NAME)) return false;
+          break;
+        default:
+        }
+        return true;
+      }
+    };
+
+    /** Default parser. */
+    public static final Parser DEFAULT = TAGSOUP;
+
+    /** String representation. */
+    private final String string;
+    /** Required classes. */
+    private final String[] classes;
+
+    /**
+     * Whether to fall back to XML if this parser is not available.
+     * @return result of check
+     */
+    public abstract boolean fallbackToXml();
+
+    /**
+     * Return a reader instance for this parser.
+     * @param options HTML options
+     * @param writer string writer
+     * @return reader
+     * @throws SAXException SAX exception
+     */
+    abstract XMLReader reader(HtmlOptions options, StringWriter writer) throws SAXException;
+
+    /**
+     * Constructor.
+     * @param string string representation
+     * @param classes required classes
+     */
+    Parser(final String string, final String... classes) {
+      this.string = string;
+      this.classes = classes;
+    }
+
+    /**
+     * Checks if this parser is available.
+     * @param options HTML options
+     * @return result of check
+     */
+    public boolean available(@SuppressWarnings("unused") final HtmlOptions options) {
+      for(final String cl : classes) if(!Reflect.available(cl)) return false;
+      return true;
+    }
+
+    /**
+     * Throws an exception if any of the classes required for this parser are unavailable.
+     * @param options HTML options
+     * @param func name of function that is asking for this parser
+     * @param info input info (can be {@code null})
+     * @throws QueryException query exception
+     */
+    public void ensureAvailable(@SuppressWarnings("unused") final HtmlOptions options,
+        final byte[] func, final InputInfo info) throws QueryException {
+      for(final String cl : classes) ensureAvailable(cl, func, info);
+    }
+
+    /**
+     * Throws an exception if a class required for this parser is unavailable.
+     * @param className the class name
+     * @param func name of function that is asking for this parser
+     * @param info input info (can be {@code null})
+     * @throws QueryException query exception
+     */
+    private static void ensureAvailable(final String className, final byte[] func,
+        final InputInfo info) throws QueryException {
+      if(!Reflect.available(className)) throw BASEX_CLASSPATH_X_X.get(info, func, className);
+    }
+
+    /**
+     * Returns the parser associated with the specified HTML options.
+     * @param options HTML options
+     * @return parser
+     */
+    public static Parser of(final HtmlOptions options) {
+      return of(options, Parser.DEFAULT);
+    }
+
+    /**
+     * Returns the parser associated with the specified HTML options.
+     * @param options HTML options
+     * @param defaultParser default parser
+     * @return parser
+     */
+    public static Parser of(final HtmlOptions options, final Parser defaultParser) {
+      return options.contains(METHOD) ? options.get(METHOD).parser : defaultParser;
+    }
+
+    @Override
+    public String toString() {
+      return string;
+    }
+  }
 }
diff --git a/basex-core/src/main/java/org/basex/core/Text.java b/basex-core/src/main/java/org/basex/core/Text.java
index d7f8121800..dd8d080b65 100644
--- a/basex-core/src/main/java/org/basex/core/Text.java
+++ b/basex-core/src/main/java/org/basex/core/Text.java
@@ -1426,7 +1426,7 @@ public interface Text {
   String H_VERSION_NEW_X_X = lang("h_version_new_%_%");
 
   /** HTML Parser. */
-  String H_HTML_PARSER = lang("h_html_parser");
+  String H_HTML_PARSER_X = lang("h_html_parser_%");
   /** No HTML Parser. */
   String H_NO_HTML_PARSER = lang("h_no_html_parser");
 
diff --git a/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java b/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java
index f5b0d625e1..ce48bd3dc4 100644
--- a/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java
+++ b/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java
@@ -6,10 +6,12 @@
 import java.io.*;
 
 import org.basex.build.html.*;
+import org.basex.build.html.HtmlParser.*;
 import org.basex.core.*;
 import org.basex.gui.*;
-import org.basex.gui.GUIConstants.Msg;
+import org.basex.gui.GUIConstants.*;
 import org.basex.gui.layout.*;
+import org.basex.util.*;
 import org.basex.util.options.*;
 
 /**
@@ -33,10 +35,10 @@ final class DialogHtmlParser extends DialogParser {
    */
   DialogHtmlParser(final BaseXDialog dialog, final MainOptions opts) {
     hopts = new HtmlOptions(opts.get(MainOptions.HTMLPARSER));
-
-    final boolean avl = HtmlParser.available();
+    final Parser parser = Parser.of(hopts);
+    final boolean avl = parser.available(hopts);
     final BaseXBack pp  = new BaseXBack(new RowLayout(8));
-    pp.add(new BaseXLabel(avl ? H_HTML_PARSER : H_NO_HTML_PARSER));
+    pp.add(new BaseXLabel(avl ? Util.info(H_HTML_PARSER_X, parser) : H_NO_HTML_PARSER));
 
     options = new BaseXTextField(dialog, hopts.toString());
     options.setToolTipText(tooltip(hopts));
diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
index b6bbce098a..54540389a5 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
@@ -1,7 +1,5 @@
 package org.basex.query.func.html;
 
-import static org.basex.query.QueryError.*;
-
 import org.basex.build.html.HtmlParser;
 import org.basex.query.*;
 import org.basex.query.value.item.*;
@@ -17,8 +15,6 @@ public class FnParseHtml extends HtmlParse {
 
   @Override
   public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
-    String className = HtmlParser.firstUnavailableClass();
-    if (className != null) throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className);
-    return super.item(qc, ii);
+    return parse(htmlInput(qc), HtmlParser.Parser.NU, qc);
   }
 }
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java
index 1e21bff317..a8a08c664c 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java
@@ -1,5 +1,6 @@
 package org.basex.query.func.html;
 
+import org.basex.build.html.HtmlParser.*;
 import org.basex.query.*;
 import org.basex.query.value.item.*;
 import org.basex.query.value.seq.*;
@@ -15,6 +16,6 @@ public final class HtmlDoc extends HtmlParse {
   @Override
   public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
     final String source = toStringOrNull(arg(0), qc);
-    return source != null ? parse(toIO(source), qc) : Empty.VALUE;
+    return source != null ? parse(toIO(source), Parser.DEFAULT, qc) : Empty.VALUE;
   }
 }
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
index 181692e2e7..c4986d2c6b 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
@@ -1,11 +1,11 @@
 package org.basex.query.func.html;
 
-import static org.basex.build.html.HtmlOptions.*;
 import static org.basex.query.QueryError.*;
 
 import java.io.*;
 
 import org.basex.build.html.*;
+import org.basex.build.html.HtmlParser.*;
 import org.basex.core.*;
 import org.basex.io.*;
 import org.basex.query.*;
@@ -23,19 +23,23 @@
  * @author Christian Gruen
  */
 public class HtmlParse extends StandardFunc {
-  /** Class needed for option heuristics=ICU. */
-  private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector";
-  /** Class needed for option heuristics=CHARDET. */
-  private static final String CHARDET_CLASS_NAME =
-      "org.mozilla.intl.chardet.nsICharsetDetectionObserver";
-
   @Override
   public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
+    return parse(htmlInput(qc), Parser.DEFAULT, qc);
+  }
+
+  /**
+   * Converts the HTML input in the first argument to an IOContent instance from a binary or string
+   * item.
+   * @param qc query context
+   * @return input as an IOContent instance ({@code null}, if empty)
+   * @throws QueryException query exception
+   */
+  protected IOContent htmlInput(final QueryContext qc) throws QueryException {
     final Item value = arg(0).atomItem(qc, info);
-    if (value.isEmpty()) return Empty.VALUE;
-    final IO io = value instanceof Bin ? new IOContent(toBytes(value))
-                                       : new IOContent(toBytes(value), "", Strings.UTF8);
-    return parse(io, qc);
+    if(value.isEmpty()) return null;
+    return value instanceof Bin ? new IOContent(toBytes(value))
+                                : new IOContent(toBytes(value), "", Strings.UTF8);
   }
 
   @Override
@@ -46,41 +50,22 @@ protected final Expr opt(final CompileContext cc) {
   /**
    * Parses the input and creates an XML document.
    * @param io input data
+   * @param defaultParser default HTML parser to be used in absence of the METHOD option
    * @param qc query context
    * @return node
    * @throws QueryException query exception
    */
-  protected final Item parse(final IO io, final QueryContext qc) throws QueryException {
+  protected final Item parse(final IO io, final Parser defaultParser, final QueryContext qc)
+      throws QueryException {
+    if(io == null) return Empty.VALUE;
     final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), qc);
-    if(options.contains(HEURISTICS)) {
-      switch (options.get(HEURISTICS)) {
-      case ALL:
-        ensureAvailable(ICU_CLASS_NAME);
-        ensureAvailable(CHARDET_CLASS_NAME);
-        break;
-      case ICU:
-        ensureAvailable(ICU_CLASS_NAME);
-        break;
-      case CHARDET:
-        ensureAvailable(CHARDET_CLASS_NAME);
-        break;
-      default:
-      }
-    }
+    final Parser parser = Parser.of(options, defaultParser);
+    if(!parser.fallbackToXml()) parser.ensureAvailable(options, definition.local(), info);
     try {
-      return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), options));
+      return new DBNode(
+          new org.basex.build.html.HtmlParser(io, parser, new MainOptions(), options));
     } catch(final IOException ex) {
       throw INVHTML_X.get(info, ex);
     }
   }
-
-  /**
-   * Ensure that a required class is available on the class path.
-   * @param className the class name
-   * @throws QueryException query exception,
-   */
-  private void ensureAvailable(final String className) throws QueryException {
-    if(!Reflect.available(className))
-      throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className);
-  }
 }
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java
index 0d5ce3ec59..da95eafa78 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java
@@ -1,5 +1,7 @@
 package org.basex.query.func.html;
 
+import org.basex.build.html.*;
+import org.basex.build.html.HtmlParser.*;
 import org.basex.query.*;
 import org.basex.query.func.*;
 import org.basex.query.value.item.*;
@@ -14,6 +16,8 @@
 public final class HtmlParser extends StandardFunc {
   @Override
   public Item item(final QueryContext qc, final InputInfo ii) {
-    return Str.get(org.basex.build.html.HtmlParser.parser());
+    final HtmlOptions options = new HtmlOptions();
+    final Parser parser = Parser.of(options);
+    return Str.get(parser.available(options) ? parser.toString() : "");
   }
 }
diff --git a/basex-core/src/main/resources/lang/Chinese.lang b/basex-core/src/main/resources/lang/Chinese.lang
index 11f1839948..1a7b4bfb29 100644
--- a/basex-core/src/main/resources/lang/Chinese.lang
+++ b/basex-core/src/main/resources/lang/Chinese.lang
@@ -413,7 +413,7 @@ h_db_format          = 数据库结构已经变了，请使用新版软件
 h_db_options_%       = 如果执行'%'，这个选项将会被设置
 h_diacritics         = 索引保留了发音符号
 h_fulltext_index     = 全文索引可以加速全文检索
-h_html_parser        = 将使用Validator.nu将HTML转为XML
+h_html_parser_%      = 将使用%将HTML转为XML
 h_index_format       = 索引格式变了，请建新索引
 h_int_parser         = 容错，而且比Java的默认解析器更快
 h_languauge          = 将使用根据语言确定的tokenizer
diff --git a/basex-core/src/main/resources/lang/Dutch.lang b/basex-core/src/main/resources/lang/Dutch.lang
index 774465b9b5..d3198c072f 100644
--- a/basex-core/src/main/resources/lang/Dutch.lang
+++ b/basex-core/src/main/resources/lang/Dutch.lang
@@ -413,7 +413,7 @@ h_db_format          = Het database formaat is gewijzigd; maak een nieuwe databa
 h_db_options_%       = The options will be assigned if '%' is executed. 
 h_diacritics         = Diakritische tekens worden gebruikt in de index.
 h_fulltext_index     = Een full-text index versnelt full-text queries.
-h_html_parser        = De Validator.nu parser zal gebruikt worden om HTML naar XML te converteren.
+h_html_parser_%      = De % parser zal gebruikt worden om HTML naar XML te converteren.
 h_index_format       = Het index formaat is gewijzigd; maak nieuwe indexen.
 h_int_parser         = Robuuster en sneller dan Java's standaard parser.
 h_languauge          = Met deze optie zullen taalspecifieke parsers worden gebruikt.
diff --git a/basex-core/src/main/resources/lang/English.lang b/basex-core/src/main/resources/lang/English.lang
index d7274e38a9..4886826085 100644
--- a/basex-core/src/main/resources/lang/English.lang
+++ b/basex-core/src/main/resources/lang/English.lang
@@ -413,7 +413,7 @@ h_db_format          = The database format has changed; please use a newer versi
 h_db_options_%       = The options will be assigned if '%' is executed.
 h_diacritics         = Diacritics are retained in the index.
 h_fulltext_index     = A full-text index speeds up full-text queries.
-h_html_parser        = The Validator.nu parser will be used to convert HTML to XML.
+h_html_parser_%      = The % parser will be used to convert HTML to XML.
 h_index_format       = The index format has changed; please create new indexes.
 h_int_parser         = Fault tolerant, and faster than Java’s default parser.
 h_languauge          = Language specific tokenizers will be used.
diff --git a/basex-core/src/main/resources/lang/French.lang b/basex-core/src/main/resources/lang/French.lang
index 9a02e46376..b1b6c7f100 100644
--- a/basex-core/src/main/resources/lang/French.lang
+++ b/basex-core/src/main/resources/lang/French.lang
@@ -413,7 +413,7 @@ h_db_format          = Le format de base de données a changé ; Veuillez créer
 h_db_options_%       = Les options seront assignées si on exécute '%'.
 h_diacritics         = Les signes diacritiques sont conservés dans l’index.
 h_fulltext_index     = Un index plein texte accélère les requêtes plein texte.
-h_html_parser        = Le parser Validator.nu sera utilisé pour convertir le HTML en XML.
+h_html_parser_%      = Le parser % sera utilisé pour convertir le HTML en XML.
 h_index_format       = Le format des index a changé ;  Veuillez créer de nouveaux index.
 h_int_parser         = Tolérant aux fautes, et plus rapide que le parser Java par défaut.
 h_languauge          = Des analyseurs spécifiques à la langue vont être utilisés.
diff --git a/basex-core/src/main/resources/lang/German.lang b/basex-core/src/main/resources/lang/German.lang
index cedba9ccd8..07d34b07b6 100644
--- a/basex-core/src/main/resources/lang/German.lang
+++ b/basex-core/src/main/resources/lang/German.lang
@@ -413,7 +413,7 @@ h_db_format          = Das Datenbankformat hat sich geändert; bitte verwenden S
 h_db_options_%       = Die Optionen werden zugewiesen, wenn '%' ausgeführt wird.
 h_diacritics         = Diakritische Zeichen werden im Index beibehalten.
 h_fulltext_index     = Ein Volltext-Index beschleunigt Volltext-Anfragen.
-h_html_parser        = Der Validator.nu-Parser wird verwendet, um HTML in XML zu konvertieren.
+h_html_parser_%      = Der %-Parser wird verwendet, um HTML in XML zu konvertieren.
 h_index_format       = Das Indexformat hat sich geändert; bitte erstellen Sie neue Indizes.
 h_int_parser         = Fehlertolerant und schneller als Javas XML-Parser.
 h_languauge          = Sprachspezifische Tokenisierung wird verwendet.
diff --git a/basex-core/src/main/resources/lang/Hungarian.lang b/basex-core/src/main/resources/lang/Hungarian.lang
index 20c823ea28..569fce6e0a 100644
--- a/basex-core/src/main/resources/lang/Hungarian.lang
+++ b/basex-core/src/main/resources/lang/Hungarian.lang
@@ -413,7 +413,7 @@ h_db_format          = Az adatbázis formátuma megváltozott; kérem, használj
 h_db_options_%       = Ezek az beállítások csak a következő futtása után lépnek életbe: '%' 
 h_diacritics         = Ékezetek megmaradnak az indexelésben.
 h_fulltext_index     = A teljes-szöveg index gyorsítja a teljes-szöveges (full-text) lekérdezéseket.
-h_html_parser        = A Validator.nu elemző HTML formátumot konvertál XML formátumra.
+h_html_parser_%      = A % elemző HTML formátumot konvertál XML formátumra.
 h_index_format       = Az index formátuma megváltozott; kérem, készítsen új indexeket.
 h_int_parser         = Hibatűrő, továbbá a Java alapértelmezett elemzőjénél gyorsabb.
 h_languauge          = Nyelvfüggő szövegelemzések is használatra kerülnek.
diff --git a/basex-core/src/main/resources/lang/Indonesian.lang b/basex-core/src/main/resources/lang/Indonesian.lang
index 74356e5ed8..c7d0d9c56a 100644
--- a/basex-core/src/main/resources/lang/Indonesian.lang
+++ b/basex-core/src/main/resources/lang/Indonesian.lang
@@ -413,7 +413,7 @@ h_db_format          = Bentuk basisdata telah berubah; mohon gunakan versi yang
 h_db_options_%       = Pilihan akan digunakan jika '%' dijalankan. 
 h_diacritics         = Diakritik dipertahankan dalam indeks.
 h_fulltext_index     = Indeks semua teks mempercepat kueri teks penuh.
-h_html_parser        = Pengurai Validator.nu akan digunakan untuk mengubah HTML menjadi XML.
+h_html_parser_%      = Pengurai % akan digunakan untuk mengubah HTML menjadi XML.
 h_index_format       = Bentuk indeks telah berubah; mohon buat indeks baru.
 h_int_parser         = Toleran kesalahan, dan lebih cepat dari pengurai standar Java.
 h_languauge          = Pengurai teks bahasa tertentu akan digunakan.
diff --git a/basex-core/src/main/resources/lang/Italian.lang b/basex-core/src/main/resources/lang/Italian.lang
index 168907f804..b62c0c180e 100644
--- a/basex-core/src/main/resources/lang/Italian.lang
+++ b/basex-core/src/main/resources/lang/Italian.lang
@@ -413,7 +413,7 @@ h_db_format          = Il formato della base di dati è cambiato; creare una nuo
 h_db_options_%       = The options will be assigned if '%' is executed. 
 h_diacritics         = I segni diacritici sono conservati nell'indice.
 h_fulltext_index     = Un indice "full-text" velocizza le interrogazioni sul testo.
-h_html_parser        = Il parser Validator.nu verrò usato per convertire HTML in XML.
+h_html_parser_%      = Il parser % verrà usato per convertire HTML in XML.
 h_index_format       = Il formato degli indici è cambiato; creare nuovi indici.
 h_int_parser         = Tollerante ai guasti e più veloce del parser di default di Java.
 h_languauge          = Parser di testo specifici per la lingua verranno usati
diff --git a/basex-core/src/main/resources/lang/Japanese.lang b/basex-core/src/main/resources/lang/Japanese.lang
index d0773f8a4a..84a08cd66f 100644
--- a/basex-core/src/main/resources/lang/Japanese.lang
+++ b/basex-core/src/main/resources/lang/Japanese.lang
@@ -413,7 +413,7 @@ h_db_format          = データベース形式を変更しました。新しい
 h_db_options_%       = % 実行時にオプションが割り当てられます。
 h_diacritics         = インデックス内で付加記号（ウムラウト等）は保持されます。
 h_fulltext_index     = 全文テキストインデックスは全文検索を高速化します。
-h_html_parser        = Validator.nu パーサは HTML を XML に変換します。
+h_html_parser_%      = % パーサは HTML を XML に変換します。
 h_index_format       = インデックス形式を変更しました。新しくインデックスを作成して下さい。
 h_int_parser         = フォールトトレラント、Javaのデフォルトパーサより高速。
 h_languauge          = 指定された言語のテキストパーサが使用されます。
diff --git a/basex-core/src/main/resources/lang/Mongolian.lang b/basex-core/src/main/resources/lang/Mongolian.lang
index 6c9fce85f5..9079d8891c 100644
--- a/basex-core/src/main/resources/lang/Mongolian.lang
+++ b/basex-core/src/main/resources/lang/Mongolian.lang
@@ -413,7 +413,7 @@ h_db_format          = Өгөгдлийн сангийн формат өөрчл
 h_db_options_%       = The options will be assigned if '%' is executed. 
 h_diacritics         = Индекс дэх санах тэмдгийг авч үлдэх.
 h_fulltext_index     = Бүтэн текст индекс нь бүрэн текст квериг хурдан ажиллагаатай болгоно.
-h_html_parser        = The Validator.nu parser will be used to convert HTML to XML.
+h_html_parser_%      = The % parser will be used to convert HTML to XML.
 h_index_format       = Индекс формат өөрчлөгдсөн байна; шинээр үүсгэнэ үү.
 h_int_parser         = Fault tolerant, and faster than Java’s default parser.
 h_languauge          = Хэлний текст Parser тодорхойлогдох болно.
diff --git a/basex-core/src/main/resources/lang/Romanian.lang b/basex-core/src/main/resources/lang/Romanian.lang
index dac622b9d9..aaceae5e82 100644
--- a/basex-core/src/main/resources/lang/Romanian.lang
+++ b/basex-core/src/main/resources/lang/Romanian.lang
@@ -413,7 +413,7 @@ h_db_format          = Formatul bazei de date a fost schimbat, vă rugăm să fo
 h_db_options_%       = Optiunile vor fi asignate daca '%' este executată.
 h_diacritics         = Diacritice sunt păstrate în index.
 h_fulltext_index     = Un full-text index accelereaza interogările full-text.
-h_html_parser        = Parserul "Validator.nu" va fi folosit pentru a converti HTML în XML.
+h_html_parser_%      = Parserul "%" va fi folosit pentru a converti HTML în XML.
 h_index_format       = Formatul index s-a schimbat, vă rugăm creati noi indici.
 h_int_parser         = Tolerant la greseli si mai rapid decat parserul default Java.
 h_languauge          = Parsere de text specifice limbii vor fi folosite.
diff --git a/basex-core/src/main/resources/lang/Russian.lang b/basex-core/src/main/resources/lang/Russian.lang
index 1dda70b524..15a89ecce8 100644
--- a/basex-core/src/main/resources/lang/Russian.lang
+++ b/basex-core/src/main/resources/lang/Russian.lang
@@ -413,7 +413,7 @@ h_db_format          = Формат хранения баз данных был
 h_db_options_%       = Эти опции будут изменены только после выполнения команды [%] 
 h_diacritics         = Разделительные знаки будут включены в индекс
 h_fulltext_index     = Полнотекстовый индекс ускоряет соответствующие запросы
-h_html_parser        = Для конвертации HTML в XML будет использован парсер Validator.nu
+h_html_parser_%      = Для конвертации HTML в XML будет использован парсер %
 h_index_format       = Формат хранения индексов был изменен. Пожалуйста, создайте индексы заново.
 h_int_parser         = Толерантный к ошибкам и быстрее чем стандартный парсер Java
 h_languauge          = Будут использованы специализированные под каждый язык парсеры
diff --git a/basex-core/src/main/resources/lang/Spanish.lang b/basex-core/src/main/resources/lang/Spanish.lang
index 42e0ce4329..4d032eecd5 100644
--- a/basex-core/src/main/resources/lang/Spanish.lang
+++ b/basex-core/src/main/resources/lang/Spanish.lang
@@ -413,7 +413,7 @@ h_db_format          = El formato de la Base de Datos ha cambiado; por favor uti
 h_db_options_%       = Las opciónes serán asignado si se ejecuta '%'. 
 h_diacritics         = Las diacríticas están retenidas en el índice.
 h_fulltext_index     = Un índice de Texto Completo acelera las consulta de Texto Completo.
-h_html_parser        = Se utilizará el Analizador Sintáctico Validator.nu para convertir HTML a XML.
+h_html_parser_%      = Se utilizará el Analizador Sintáctico % para convertir HTML a XML.
 h_index_format       = El formato del índice ha cambiado; for favor, cree nuevos índices.
 h_int_parser         = Tolerante a fallos, y más rápido que el analizador sintáctico por defecto de Java.
 h_languauge          = Se utilizarán analizadores sintácticos de texto específicos del lenguaje.
diff --git a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
index 58b5b8b323..d5cf02e045 100644
--- a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
@@ -20,7 +20,10 @@ public final class HtmlModuleTest extends SandboxTest {
     query(func.args(" <_/>/text()"), "");
 
     final String path = "src/test/resources/input.html";
-    query(func.args(path) + "//*:body ! name()", "body");
+    query(func.args(path) + "//body ! name()", "body");
+    query(func.args(path, " map { 'nons': false() }") + "//*:body ! name()", "body");
+    query(func.args(path, " {'method': 'nu'}") + "//Q{http://www.w3.org/1999/xhtml}body ! name()",
+        "body");
   }
 
   /** Test method. */
@@ -32,7 +35,8 @@ public final class HtmlModuleTest extends SandboxTest {
     // check if the function returns an HTML root node
     query("exists(" + func.args("&lt;html/&gt;") + "/*:html)", true);
     // check if the function returns <html/>
-    query(func.args("&lt;html/&gt;"),
+    query(func.args("&lt;html/&gt;", " map { 'nons': true() }"), "<html/>");
+    query(func.args("&lt;html/&gt;", " {'method': 'nu'}"),
         "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body/></html>");
   }
 
diff --git a/basex-examples/basex-examples.iml b/basex-examples/basex-examples.iml
index c8c6019d38..d68f7b229d 100644
--- a/basex-examples/basex-examples.iml
+++ b/basex-examples/basex-examples.iml
@@ -30,7 +30,7 @@
     <orderEntry type="library" name="Maven: org.slf4j:slf4j-simple:1.7.12" level="project" />
     <orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.12" level="project" />
     <orderEntry type="library" name="Maven: com.vividsolutions:jts:1.13" level="project" />
-    <orderEntry type="library" scope="RUNTIME" name="Maven: nu.validator:htmlparser:1.4.16" level="project" />
+    <orderEntry type="library" scope="RUNTIME" name="Maven: org.ccil.cowan.tagsoup:tagsoup:1.2.1" level="project" />
     <orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.12" level="project" />
     <orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" />
   </component>
diff --git a/basex-examples/pom.xml b/basex-examples/pom.xml
index 03a834b03a..c66dc0ffb0 100644
--- a/basex-examples/pom.xml
+++ b/basex-examples/pom.xml
@@ -18,8 +18,8 @@
       <version>${project.version}</version>
     </dependency>
     <dependency>
-      <groupId>nu.validator</groupId>
-      <artifactId>htmlparser</artifactId>
+      <groupId>org.ccil.cowan.tagsoup</groupId>
+      <artifactId>tagsoup</artifactId>
     </dependency>
     <dependency>
       <groupId>org.junit.jupiter</groupId>
diff --git a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
index 757baa63e8..7f9dcc9701 100644
--- a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
+++ b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
@@ -6,7 +6,7 @@
 /**
  * This example demonstrates how to import a file in the HTML format
  * into the database. The specified input file will be converted to XML
- * if Validator.nu is found in the classpath.
+ * if the HTML parser is found in the classpath.
  *
  * @author BaseX Team, BSD License
  * @author Christian Gruen
diff --git a/pom.xml b/pom.xml
index 8c315215bf..1bb0f41810 100644
--- a/pom.xml
+++ b/pom.xml
@@ -63,6 +63,13 @@
         <scope>runtime</scope>
         <optional>true</optional>
       </dependency>
+      <dependency>
+        <groupId>org.ccil.cowan.tagsoup</groupId>
+        <artifactId>tagsoup</artifactId>
+        <version>1.2.1</version>
+        <scope>runtime</scope>
+        <optional>true</optional>
+      </dependency>
       <dependency>
         <groupId>nu.validator</groupId>
         <artifactId>htmlparser</artifactId>

From 12dab13c46665071e3ff709843d7665aa120b1b4 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher <grd@gmx.net>
Date: Wed, 22 Jan 2025 19:06:06 +0100
Subject: [PATCH 9/9] minor changes

---
 .../main/java/org/basex/build/html/HtmlOptions.java   |  2 +-
 .../main/java/org/basex/build/html/HtmlParser.java    | 11 +++++++----
 .../src/main/java/org/basex/core/MainOptions.java     |  2 +-
 .../src/main/java/org/basex/query/QueryError.java     |  2 --
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
index 7eebede2ce..74e7a80bc3 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
@@ -3,7 +3,7 @@
 import org.basex.util.options.*;
 
 /**
- * Options for parsing and serializing HTML documents with Validator.nu.
+ * Options for parsing and serializing HTML documents with TagSoup and Validator.nu.
  *
  * @author BaseX Team, BSD License
  * @author Christian Gruen
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
index 3083a5519b..62aff78895 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
@@ -20,9 +20,12 @@
 import nu.validator.htmlparser.sax.*;
 
 /**
- * This class uses the Validator.nu HTML parser to convert HTML input to well-formed XML.
- * If the Validator.nu HTML parser is not found in the classpath, the original document is
- * passed on.
+ * This class uses the TagSoup or Validator.nu HTML parser to convert HTML input to well-formed
+ * XML. If TagSoup is selected but is not found in the classpath, the original document is
+ * passed on.
+ *
+ * TagSoup was written by John Cowan and is based on the Apache 2.0 License:
+ * {@code http://home.ccil.org/~cowan/XML/tagsoup/}.
  *
  * The Validator.nu HTML parser was written by Henri Sivonen and is based on the MIT License:
  * {@code https://about.validator.nu/htmlparser/}.
@@ -80,7 +83,7 @@ private static IO toXml(final IO io, final Parser parser, final HtmlOptions hopt
           : hopts.contains(ENCODING)
             ? hopts.get(HtmlOptions.ENCODING)
             : null;
-      if (enc != null) {
+      if(enc != null) {
         if(!Strings.supported(enc)) throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.');
         is.setEncoding(Strings.normEncoding(enc));
       }
diff --git a/basex-core/src/main/java/org/basex/core/MainOptions.java b/basex-core/src/main/java/org/basex/core/MainOptions.java
index bf5c21b49b..2abb02ba61 100644
--- a/basex-core/src/main/java/org/basex/core/MainOptions.java
+++ b/basex-core/src/main/java/org/basex/core/MainOptions.java
@@ -40,7 +40,7 @@ public final class MainOptions extends Options {
   /** Define JSON parser options. */
   public static final OptionsOption<JsonParserOptions> JSONPARSER =
       new OptionsOption<>("JSONPARSER", new JsonParserOptions());
-  /** Define Validator.nu HTML options. */
+  /** Define HTML options. */
   public static final OptionsOption<HtmlOptions> HTMLPARSER =
       new OptionsOption<>("HTMLPARSER", new HtmlOptions());
   /** Define import parser. */
diff --git a/basex-core/src/main/java/org/basex/query/QueryError.java b/basex-core/src/main/java/org/basex/query/QueryError.java
index 746225978a..81211886f6 100644
--- a/basex-core/src/main/java/org/basex/query/QueryError.java
+++ b/basex-core/src/main/java/org/basex/query/QueryError.java
@@ -619,8 +619,6 @@ public enum QueryError {
   RESINV_X(FODC, 7, "Resource path '%' is invalid."),
   /** Error code. */
   INVHTML_X(FODC, 11, "HTML parsing failed: %"),
-  /** Error code. */
-  INVHTMLOPT_X(FODC, 12, "HTML option processing failed: %"),
 
   /** Error code. */
   FORMATWHICH_X(FODF, 1280, "Unknown decimal format: %."),