From 1711e3fde9ae6e9b44a7b6aedead563265135737 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher
Date: Mon, 23 Oct 2023 12:55:51 +0200
Subject: [PATCH 1/9] add fn:parse-html based on Validator.nu
---
basex-core/pom.xml | 6 +
.../main/java/org/basex/query/QueryError.java | 4 +
.../java/org/basex/query/func/Function.java | 3 +
.../basex/query/func/html/FnParseHtml.java | 138 ++++++++++++++++++
.../org/basex/query/func/FnModuleTest.java | 14 ++
pom.xml | 7 +
6 files changed, 172 insertions(+)
create mode 100644 basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
diff --git a/basex-core/pom.xml b/basex-core/pom.xml
index 83e0ef7818..7e34f25e58 100644
--- a/basex-core/pom.xml
+++ b/basex-core/pom.xml
@@ -52,6 +52,12 @@
provided
true
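+ <dependency>
+ <groupId>nu.validator</groupId>
+ <artifactId>htmlparser</artifactId>
+ <scope>provided</scope>
+ <optional>true</optional>
+ </dependency>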
diff --git a/basex-core/src/main/java/org/basex/query/QueryError.java b/basex-core/src/main/java/org/basex/query/QueryError.java
index 7e30cd41b9..3916df95db 100644
--- a/basex-core/src/main/java/org/basex/query/QueryError.java
+++ b/basex-core/src/main/java/org/basex/query/QueryError.java
@@ -615,6 +615,10 @@ public enum QueryError {
SAXERR_X(FODC, 6, "SAX: %"),
/** Error code. */
RESINV_X(FODC, 7, "Resource path '%' is invalid."),
+ /** Error code. */
+ INVHTML_X(FODC, 11, "String passed to fn:parse-html is not a well-formed HTML document: %"),
+ /** Error code. */
+ INVHTMLOPT_X(FODC, 12, "Unsupported HTML parser option: %"),
/** Error code. */
FORMNUM_X(FODF, 1280, "Unknown decimal format: '%'."),
diff --git a/basex-core/src/main/java/org/basex/query/func/Function.java b/basex-core/src/main/java/org/basex/query/func/Function.java
index da69a1237c..bf29459289 100644
--- a/basex-core/src/main/java/org/basex/query/func/Function.java
+++ b/basex-core/src/main/java/org/basex/query/func/Function.java
@@ -464,6 +464,9 @@ ITEM_ZM, flag(HOF)),
PARSE_IETF_DATE(FnParseIetfDate::new, "parse-ietf-date(value)",
params(STRING_ZO), DATE_TIME_ZO),
/** XQuery function. */
+ PARSE_HTML(FnParseHtml::new, "parse-html(html[,options])",
+ params(ITEM_ZO, MAP_O), DOCUMENT_NODE_ZO),
+ /** XQuery function. */
PARSE_INTEGER(FnParseInteger::new, "parse-integer(value[,radix])",
params(STRING_O, INTEGER_O), INTEGER_O),
/** XQuery function. */
diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
new file mode 100644
index 0000000000..0f8a3a5801
--- /dev/null
+++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
@@ -0,0 +1,138 @@
+package org.basex.query.func.html;
+
+import static org.basex.query.QueryError.*;
+import static org.basex.util.Token.*;
+
+import java.io.*;
+
+import org.basex.build.html.*;
+import org.basex.build.xml.*;
+import org.basex.core.*;
+import org.basex.io.*;
+import org.basex.io.in.*;
+import org.basex.query.*;
+import org.basex.query.expr.*;
+import org.basex.query.func.*;
+import org.basex.query.value.item.*;
+import org.basex.query.value.node.*;
+import org.basex.query.value.seq.*;
+import org.basex.util.*;
+import org.xml.sax.*;
+
+import nu.validator.htmlparser.common.*;
+import nu.validator.htmlparser.sax.*;
+
+/**
+ * Function implementation.
+ *
+ * @author BaseX Team 2005-23, BSD License
+ * @author Gunther Rademacher
+ */
+public class FnParseHtml extends StandardFunc {
+ // TODO: handle second argument (method, html-version, encoding), produce error code FODC0012
+
+ @Override
+ public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
+ final Item value = arg(0).atomItem(qc, info);
+ return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)));
+ }
+
+ @Override
+ protected final Expr opt(final CompileContext cc) {
+ return optFirst();
+ }
+
+ /**
+ * Parses the input and creates an XML document.
+ * @param io input data
+ * @return node
+ * @throws QueryException query exception
+ */
+ protected final Item parse(final IO io) throws QueryException {
+ try {
+ if (!ParserImpl.available()) {
+ // reader could not be initialized; fall back to html:parse
+ final HtmlOptions htmlOptions = new HtmlOptions();
+ htmlOptions.set(HtmlOptions.LEXICAL, true);
+ htmlOptions.set(HtmlOptions.NONS, false);
+ return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), htmlOptions));
+ }
+ return new DBNode(new ParserImpl(io, new MainOptions()));
+ } catch(final IOException ex) {
+ throw INVHTML_X.get(info, ex);
+ }
+ }
+
+ /**
+ * Parser implementation.
+ */
+ private static class ParserImpl extends XMLParser {
+
+ /**
+ * Checks if Validator.nu is available.
+ * @return result of check
+ */
+ public static boolean available() {
+ return Reflect.available("nu.validator.htmlparser.sax.HtmlParser");
+ }
+
+ /**
+ * Constructor.
+ * @param source document source
+ * @param options main options
+ * @throws IOException I/O exception
+ */
+ ParserImpl(final IO source, final MainOptions options)
+ throws IOException {
+ super(toXml(source), options);
+ }
+
+ /**
+ * Converts an HTML document to XML.
+ * @param io io reference
+ * @return parser
+ * @throws IOException I/O exception
+ */
+ private static IO toXml(final IO io) throws IOException {
+ try(TextInput ti = new TextInput(io)) {
+
+ // tries to extract the encoding from the input
+ // TODO: remove this, in favor of encoding from options, or constant for string input
+ String enc = ti.encoding();
+ final byte[] content = ti.content();
+ // looks for a charset definition
+ final byte[] encoding = token("charset=");
+ int cs = indexOf(content, encoding);
+ if(cs > 0) {
+ // extracts the encoding string
+ cs += encoding.length;
+ int ce = cs;
+ final int cl = content.length;
+ while(++ce < cl && content[ce] > 0x28);
+ enc = string(substring(content, cs, ce));
+ }
+
+ // define output
+ final StringWriter sw = new StringWriter();
+ final nu.validator.htmlparser.sax.HtmlParser reader =
+ new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
+ reader.setFeature("http://xml.org/sax/features/namespaces", true);
+ reader.setFeature("http://xml.org/sax/features/namespace-prefixes", false);
+
+ final ContentHandler writer = new XmlSerializer(sw);
+ reader.setContentHandler(writer);
+ reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
+
+ // define input
+ final InputSource is = new InputSource(new ArrayInput(content));
+ is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8);
+ reader.parse(is);
+ return new IOContent(token(sw.toString()), io.name());
+
+ } catch(final SAXException ex) {
+ Util.errln(ex);
+ throw INVHTML_X.getIO(ex.getLocalizedMessage());
+ }
+ }
+ }
+}
diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
index 90325bd7d4..a57d7be0d0 100644
--- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
@@ -1438,6 +1438,20 @@ public final class FnModuleTest extends SandboxTest {
query("let $n := return " + func.args(" ($n, $n)"), "");
}
+ /** Test method. */
+ @Test public void parseHtml() {
+ final Function func = PARSE_HTML;
+
+ query(func.args("42"),
+ "42");
+ query(func.args(" xs:hexBinary('3432')"),
+ "42");
+ query(func.args(" xs:base64Binary('NDI=')"),
+ "42");
+
+ error(func.args(42), STRBIN_X_X);
+ }
+
/** Test method. */
@Test public void parseIetfDate() {
final Function func = PARSE_IETF_DATE;
diff --git a/pom.xml b/pom.xml
index dabb8b18ca..e9cf162f1a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -174,6 +174,13 @@
runtime
true
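+ <dependency>
+ <groupId>nu.validator</groupId>
+ <artifactId>htmlparser</artifactId>
+ <version>1.4.16</version>
+ <scope>runtime</scope>
+ <optional>true</optional>
+ </dependency>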
From dfb0be0b2f594f5137948f9a1987a50fb66da384 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher
Date: Mon, 30 Oct 2023 13:41:42 +0100
Subject: [PATCH 2/9] add support for Validator.nu options
---
.../org/basex/build/html/HtmlOptions.java | 138 ++++++++++++++++++
.../basex/query/func/html/FnParseHtml.java | 97 +++++++-----
.../org/basex/query/func/FnModuleTest.java | 20 ++-
3 files changed, 216 insertions(+), 39 deletions(-)
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
index 5fe399485f..b46e8d6e56 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
@@ -9,6 +9,41 @@
* @author Christian Gruen
*/
public final class HtmlOptions extends Options {
+ /** Validator.nu option unicode-normalization-checking. */
+ public static final BooleanOption UNICODE_NORMALIZATION_CHECKING =
+ new BooleanOption("unicode-normalization-checking", false);
+ /** Validator.nu option mapping-lang-to-xml-lang. */
+ public static final BooleanOption MAPPING_LANG_TO_XML_LANG =
+ new BooleanOption("mapping-lang-to-xml-lang", false);
+ /** Validator.nu option scripting-enabled. */
+ public static final BooleanOption SCRIPTING_ENABLED =
+ new BooleanOption("scripting-enabled", false);
+
+ /** Validator.nu option content-space-policy. */
+ public static final EnumOption CONTENT_SPACE_POLICY =
+ new EnumOption<>("content-space-policy", XmlViolationPolicy.class);
+ /** Validator.nu option content-non-xml-char-policy. */
+ public static final EnumOption CONTENT_NON_XML_CHAR_POLICY =
+ new EnumOption<>("content-non-xml-char-policy", XmlViolationPolicy.class);
+ /** Validator.nu option comment-policy. */
+ public static final EnumOption COMMENT_POLICY =
+ new EnumOption<>("comment-policy", XmlViolationPolicy.class);
+ /** Validator.nu option xmlns-policy. */
+ public static final EnumOption XMLNS_POLICY =
+ new EnumOption<>("xmlns-policy", XmlViolationPolicy.class);
+ /** Validator.nu option name-policy. */
+ public static final EnumOption NAME_POLICY =
+ new EnumOption<>("name-policy", XmlViolationPolicy.class);
+ /** Validator.nu option streamability-violation-policy. */
+ public static final EnumOption STREAMABILITY_VIOLATION_POLICY =
+ new EnumOption<>("streamability-violation-policy", XmlViolationPolicy.class);
+ /** Validator.nu option xml-policy. */
+ public static final EnumOption XML_POLICY =
+ new EnumOption<>("xml-policy", XmlViolationPolicy.class);
+ /** Validator.nu option heuristics. */
+ public static final EnumOption HEURISTICS =
+ new EnumOption<>("heuristics", Heuristics.class);
+
/** TagSoup option: html. */
public static final BooleanOption HTML = new BooleanOption("html", false);
/** TagSoup option: omit-xml-declaration. */
@@ -59,4 +94,107 @@ public HtmlOptions() {
public HtmlOptions(final Options opts) {
super(opts);
}
+
+ /**
+ * Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the
+ * dependency on Validator.nu in the classpath.
+ *
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+ /**
+ * Policy for XML 1.0 violations.
+ *
+ * @version $Id$
+ * @author hsivonen
+ */
+ public enum XmlViolationPolicy {
+ /**
+ * Conform to HTML 5, allow XML 1.0 to be violated.
+ */
+ ALLOW,
+
+ /**
+ * Halt when something cannot be mapped to XML 1.0.
+ */
+ FATAL,
+
+ /**
+ * Be non-conforming and alter the infoset to fit
+ * XML 1.0 when something would otherwise not be
+ * mappable to XML 1.0.
+ */
+ ALTER_INFOSET
+ }
+
+ /**
+ * Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the
+ * dependency on Validator.nu in the classpath.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+ /**
+ * Indicates a request for character encoding sniffer choice.
+ *
+ * @version $Id$
+ * @author hsivonen
+ */
+ public enum Heuristics {
+
+ /**
+ * Perform no heuristic sniffing.
+ */
+ NONE,
+
+ /**
+ * Use both jchardet and ICU4J.
+ */
+ ALL,
+
+ /**
+ * Use jchardet only.
+ */
+ CHARDET,
+
+ /**
+ * Use ICU4J only.
+ */
+ ICU
+ }
+
}
diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
index 0f8a3a5801..2ae7c1d525 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
@@ -1,5 +1,6 @@
package org.basex.query.func.html;
+import static org.basex.build.html.HtmlOptions.*;
import static org.basex.query.QueryError.*;
import static org.basex.util.Token.*;
@@ -9,7 +10,6 @@
import org.basex.build.xml.*;
import org.basex.core.*;
import org.basex.io.*;
-import org.basex.io.in.*;
import org.basex.query.*;
import org.basex.query.expr.*;
import org.basex.query.func.*;
@@ -19,8 +19,9 @@
import org.basex.util.*;
import org.xml.sax.*;
-import nu.validator.htmlparser.common.*;
import nu.validator.htmlparser.sax.*;
+import nu.validator.htmlparser.common.XmlViolationPolicy;
+import nu.validator.htmlparser.common.Heuristics;
/**
* Function implementation.
@@ -34,7 +35,8 @@ public class FnParseHtml extends StandardFunc {
@Override
public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
final Item value = arg(0).atomItem(qc, info);
- return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)));
+ final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc);
+ return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)), options);
}
@Override
@@ -45,10 +47,11 @@ protected final Expr opt(final CompileContext cc) {
/**
* Parses the input and creates an XML document.
* @param io input data
+ * @param options HTML options
* @return node
* @throws QueryException query exception
*/
- protected final Item parse(final IO io) throws QueryException {
+ protected final Item parse(final IO io, final HtmlOptions options) throws QueryException {
try {
if (!ParserImpl.available()) {
// reader could not be initialized; fall back to html:parse
@@ -57,7 +60,7 @@ protected final Item parse(final IO io) throws QueryException {
htmlOptions.set(HtmlOptions.NONS, false);
return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), htmlOptions));
}
- return new DBNode(new ParserImpl(io, new MainOptions()));
+ return new DBNode(new ParserImpl(info, io, options));
} catch(final IOException ex) {
throw INVHTML_X.get(info, ex);
}
@@ -73,65 +76,91 @@ private static class ParserImpl extends XMLParser {
* @return result of check
*/
public static boolean available() {
- return Reflect.available("nu.validator.htmlparser.sax.HtmlParser");
+ return Reflect.available("nu.validator.htmlparser.sax.HtmlParser")
+ && Reflect.available("nu.validator.htmlparser.sax.XmlSerializer")
+ && Reflect.available("nu.validator.htmlparser.common.Heuristics")
+ && Reflect.available("nu.validator.htmlparser.common.XmlViolationPolicy");
}
/**
* Constructor.
+ * @param info input info
* @param source document source
- * @param options main options
+ * @param options HTML options
* @throws IOException I/O exception
+ * @throws QueryException query exception
*/
- ParserImpl(final IO source, final MainOptions options)
- throws IOException {
- super(toXml(source), options);
+ ParserImpl(final InputInfo info, final IO source, final HtmlOptions options)
+ throws IOException, QueryException {
+ super(toXml(info, source, options), new MainOptions());
}
/**
* Converts an HTML document to XML.
+ * @param info input info
* @param io io reference
+ * @param hopts HTML options
* @return parser
* @throws IOException I/O exception
+ * @throws QueryException query exception
*/
- private static IO toXml(final IO io) throws IOException {
- try(TextInput ti = new TextInput(io)) {
-
- // tries to extract the encoding from the input
- // TODO: remove this, in favor of encoding from options, or constant for string input
- String enc = ti.encoding();
- final byte[] content = ti.content();
- // looks for a charset definition
- final byte[] encoding = token("charset=");
- int cs = indexOf(content, encoding);
- if(cs > 0) {
- // extracts the encoding string
- cs += encoding.length;
- int ce = cs;
- final int cl = content.length;
- while(++ce < cl && content[ce] > 0x28);
- enc = string(substring(content, cs, ce));
- }
+ private static IO toXml(final InputInfo info, final IO io, final HtmlOptions hopts)
+ throws IOException, QueryException {
+ try {
// define output
final StringWriter sw = new StringWriter();
final nu.validator.htmlparser.sax.HtmlParser reader =
new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
- reader.setFeature("http://xml.org/sax/features/namespaces", true);
- reader.setFeature("http://xml.org/sax/features/namespace-prefixes", false);
-
final ContentHandler writer = new XmlSerializer(sw);
reader.setContentHandler(writer);
reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
// define input
- final InputSource is = new InputSource(new ArrayInput(content));
- is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8);
+ final InputSource is = new InputSource(io.inputStream());
+
+ // set Validator.nu options
+ if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
+ reader.setCheckingNormalization(true);
+ if(hopts.get(MAPPING_LANG_TO_XML_LANG))
+ reader.setMappingLangToXmlLang(true);
+ if(hopts.get(SCRIPTING_ENABLED))
+ reader.setScriptingEnabled(true);
+ if(hopts.contains(CONTENT_SPACE_POLICY))
+ reader.setContentSpacePolicy(
+ XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name()));
+ if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY))
+ reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf(
+ hopts.get(CONTENT_NON_XML_CHAR_POLICY).name()));
+ if(hopts.contains(COMMENT_POLICY))
+ reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name()));
+ if(hopts.contains(XMLNS_POLICY))
+ reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name()));
+ if(hopts.contains(NAME_POLICY))
+ reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name()));
+ if(hopts.contains(STREAMABILITY_VIOLATION_POLICY))
+ reader.setStreamabilityViolationPolicy(
+ XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name()));
+ if(hopts.contains(XML_POLICY))
+ reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name()));
+
+ if(hopts.contains(HEURISTICS))
+ reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
+ // end Validator.nu options
+
+ if (hopts.contains(ENCODING)) {
+ String enc = hopts.get(HtmlOptions.ENCODING);
+ if (!Strings.supported(enc))
+ throw INVALIDOPT_X.get(info, "Unsupported encoding: " + enc + '.');
+ is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8);
+ }
+
reader.parse(is);
return new IOContent(token(sw.toString()), io.name());
} catch(final SAXException ex) {
Util.errln(ex);
- throw INVHTML_X.getIO(ex.getLocalizedMessage());
+ throw INVHTML_X.get(info, ex.getLocalizedMessage());
}
}
}
diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
index a57d7be0d0..5f5365d262 100644
--- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
@@ -11,6 +11,7 @@
import org.basex.query.expr.path.*;
import org.basex.query.value.item.*;
import org.basex.query.value.seq.*;
+import org.basex.util.*;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.Test;
@@ -1434,7 +1435,7 @@ public final class FnModuleTest extends SandboxTest {
/** Test method. */
@Test public void outermost() {
- final Function func = INNERMOST;
+ final Function func = OUTERMOST;
query("let $n := return " + func.args(" ($n, $n)"), "");
}
@@ -1444,12 +1445,21 @@ public final class FnModuleTest extends SandboxTest {
query(func.args("42"),
"42");
- query(func.args(" xs:hexBinary('3432')"),
- "42");
- query(func.args(" xs:base64Binary('NDI=')"),
- "42");
+ query(func.args(_CONVERT_STRING_TO_HEX.args("42", Strings.UTF16LE),
+ " map {'encoding': '" + Strings.UTF16LE + "', 'xml-policy': 'ALTER_INFOSET'}"),
+ "42");
+ query(func.args(_CONVERT_STRING_TO_BASE64.args("42", Strings.UTF16BE),
+ " map {'encoding': '" + Strings.UTF16BE + "', 'heuristics': 'NONE'}"),
+ "42");
error(func.args(42), STRBIN_X_X);
+ error(func.args(" \"42\"", 42), MAP_X_X);
+ error(func.args(" \"42\"", " map {'1234': ()}"), INVALIDOPT_X);
+ error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVALIDOPT_X);
}
/** Test method. */
From 8efe4a1f9bc31ab338ad0d1f408ffcf400c11970 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher
Date: Mon, 30 Oct 2023 20:50:29 +0100
Subject: [PATCH 3/9] ignore encoding option when parsing a string value
---
.../org/basex/query/func/html/FnParseHtml.java | 14 ++++++++++----
.../java/org/basex/query/func/FnModuleTest.java | 3 +++
2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
index 2ae7c1d525..a891859e34 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
@@ -36,7 +36,9 @@ public class FnParseHtml extends StandardFunc {
public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
final Item value = arg(0).atomItem(qc, info);
final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc);
- return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)), options);
+ final IO io = value instanceof Bin ? new IOContent(toBytes(value))
+ : new IOContent(toBytes(value), "", Strings.UTF8);
+ return value.isEmpty() ? Empty.VALUE : parse(io, options);
}
@Override
@@ -148,11 +150,15 @@ private static IO toXml(final InputInfo info, final IO io, final HtmlOptions hop
reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
// end Validator.nu options
- if (hopts.contains(ENCODING)) {
- String enc = hopts.get(HtmlOptions.ENCODING);
+ String enc = io.encoding() != null
+ ? io.encoding()
+ : hopts.contains(ENCODING)
+ ? hopts.get(HtmlOptions.ENCODING)
+ : null; // TODO: sniff encoding
+ if (enc != null) {
if (!Strings.supported(enc))
throw INVALIDOPT_X.get(info, "Unsupported encoding: " + enc + '.');
- is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8);
+ is.setEncoding(Strings.normEncoding(enc));
}
reader.parse(is);
diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
index 5f5365d262..ed667b2a68 100644
--- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
@@ -1443,8 +1443,11 @@ public final class FnModuleTest extends SandboxTest {
@Test public void parseHtml() {
final Function func = PARSE_HTML;
+ query(func.args(" ()"), "");
query(func.args("42"),
"42");
+ query(func.args("42", " map {'encoding': '" + Strings.UTF16LE + "'}"),
+ "42");
query(func.args(_CONVERT_STRING_TO_HEX.args("42", Strings.UTF16LE),
" map {'encoding': '" + Strings.UTF16LE + "', 'xml-policy': 'ALTER_INFOSET'}"),
From d66e40be30866bcd08abd129c8af9b8bf174e698 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher
Date: Mon, 6 Nov 2023 13:15:54 +0100
Subject: [PATCH 4/9] replace TagSoup by Validator.nu
---
basex-core/pom.xml | 12 +-
.../org/basex/build/html/HtmlOptions.java | 54 ++----
.../java/org/basex/build/html/HtmlParser.java | 152 ++++++++---------
.../main/java/org/basex/core/MainOptions.java | 2 +-
.../main/java/org/basex/query/QueryError.java | 9 +-
.../org/basex/query/func/FuncOptions.java | 33 ++--
.../org/basex/query/func/StandardFunc.java | 17 +-
.../basex/query/func/html/FnParseHtml.java | 159 +-----------------
.../org/basex/query/func/html/HtmlParse.java | 9 +-
.../src/main/resources/lang/Chinese.lang | 2 +-
basex-core/src/main/resources/lang/Dutch.lang | 2 +-
.../src/main/resources/lang/English.lang | 2 +-
.../src/main/resources/lang/French.lang | 2 +-
.../src/main/resources/lang/German.lang | 2 +-
.../src/main/resources/lang/Hungarian.lang | 2 +-
.../src/main/resources/lang/Indonesian.lang | 2 +-
.../src/main/resources/lang/Italian.lang | 2 +-
.../src/main/resources/lang/Japanese.lang | 2 +-
.../src/main/resources/lang/Mongolian.lang | 2 +-
.../src/main/resources/lang/Romanian.lang | 2 +-
.../src/main/resources/lang/Russian.lang | 2 +-
.../src/main/resources/lang/Spanish.lang | 2 +-
.../org/basex/query/func/FnModuleTest.java | 4 +-
.../org/basex/query/func/HtmlModuleTest.java | 6 +-
basex-examples/basex-examples.iml | 2 +-
basex-examples/pom.xml | 4 +-
.../basex/examples/create/HTMLExample.java | 2 +-
pom.xml | 13 +-
28 files changed, 162 insertions(+), 342 deletions(-)
diff --git a/basex-core/pom.xml b/basex-core/pom.xml
index 94512b712e..f3c6d14885 100644
--- a/basex-core/pom.xml
+++ b/basex-core/pom.xml
@@ -30,9 +30,9 @@
true
- org.ccil.cowan.tagsoup
- tagsoup
- compile
+ nu.validator
+ htmlparser
+ provided
true
@@ -52,12 +52,6 @@
provided
true
-
- nu.validator
- htmlparser
- provided
- true
-
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
index b46e8d6e56..a6ce8f6cc9 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
@@ -3,12 +3,22 @@
import org.basex.util.options.*;
/**
- * Options for parsing and serializing HTML documents with TagSoup.
+ * Options for parsing and serializing HTML documents with Validator.nu.
*
* @author BaseX Team 2005-23, BSD License
* @author Christian Gruen
*/
public final class HtmlOptions extends Options {
+ /** fn:parse-html option encoding. */
+ public static final StringOption ENCODING = new StringOption("encoding");
+ /** fn:parse-html option method. */
+ public static final StringOption METHOD = new StringOption("method");
+ /** fn:parse-html option html-version. */
+ public static final StringOption HTML_VERSION = new StringOption("html-version");
+ /** fn:parse-html option include-template-content. */
+ public static final BooleanOption INCLUDE_TEMPLATE_CONTENT =
+ new BooleanOption("include-template-content");
+
/** Validator.nu option unicode-normalization-checking. */
public static final BooleanOption UNICODE_NORMALIZATION_CHECKING =
new BooleanOption("unicode-normalization-checking", false);
@@ -18,7 +28,6 @@ public final class HtmlOptions extends Options {
/** Validator.nu option scripting-enabled. */
public static final BooleanOption SCRIPTING_ENABLED =
new BooleanOption("scripting-enabled", false);
-
/** Validator.nu option content-space-policy. */
public static final EnumOption CONTENT_SPACE_POLICY =
new EnumOption<>("content-space-policy", XmlViolationPolicy.class);
@@ -44,43 +53,6 @@ public final class HtmlOptions extends Options {
public static final EnumOption HEURISTICS =
new EnumOption<>("heuristics", Heuristics.class);
- /** TagSoup option: html. */
- public static final BooleanOption HTML = new BooleanOption("html", false);
- /** TagSoup option: omit-xml-declaration. */
- public static final BooleanOption OMIT_XML_DECLARATION =
- new BooleanOption("omit-xml-declaration", false);
- /** TagSoup option: nons. */
- public static final BooleanOption NONS = new BooleanOption("nons", true);
- /** TagSoup option: nobogons. */
- public static final BooleanOption NOBOGONS = new BooleanOption("nobogons", false);
- /** TagSoup option: nodefaults. */
- public static final BooleanOption NODEFAULTS = new BooleanOption("nodefaults", false);
- /** TagSoup option: nocolons. */
- public static final BooleanOption NOCOLONS = new BooleanOption("nocolons", false);
- /** TagSoup option: norestart. */
- public static final BooleanOption NORESTART = new BooleanOption("norestart", false);
- /** TagSoup option: nobogons. */
- public static final BooleanOption IGNORABLE = new BooleanOption("ignorable", false);
- /** TagSoup option: emptybogons. */
- public static final BooleanOption EMPTYBOGONS = new BooleanOption("emptybogons", false);
- /** TagSoup option: any. */
- public static final BooleanOption ANY = new BooleanOption("any", false);
- /** TagSoup option: norootbogons. */
- public static final BooleanOption NOROOTBOGONS = new BooleanOption("norootbogons", false);
- /** TagSoup option: nocdata. */
- public static final BooleanOption NOCDATA = new BooleanOption("nocdata", false);
- /** TagSoup option: lexical. */
- public static final BooleanOption LEXICAL = new BooleanOption("lexical", false);
-
- /** TagSoup option: method (html). */
- public static final StringOption METHOD = new StringOption("method", "xml");
- /** TagSoup option: doctype-system=systemid. */
- public static final StringOption DOCTYPE_SYSTEM = new StringOption("doctype-system");
- /** TagSoup option: doctype-public=publicid. */
- public static final StringOption DOCTYPE_PUBLIC = new StringOption("doctype-public");
- /** TagSoup option: encoding=encoding. */
- public static final StringOption ENCODING = new StringOption("encoding");
-
/**
* Default constructor.
*/
@@ -97,7 +69,7 @@ public HtmlOptions(final Options opts) {
/**
* Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the
- * dependency on Validator.nu in the classpath.
+ * class path dependency of HtmlOptions on Validator.nu.
*
* Copyright (c) 2007 Henri Sivonen
*
@@ -147,7 +119,7 @@ public enum XmlViolationPolicy {
/**
* Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the
- * dependency on Validator.nu in the classpath.
+ * class path dependency of HtmlOptions on Validator.nu.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
index acfe3d5882..773defb9db 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
@@ -1,39 +1,57 @@
package org.basex.build.html;
-import static org.basex.util.Token.*;
import static org.basex.build.html.HtmlOptions.*;
+import static org.basex.query.QueryError.*;
+import static org.basex.util.Token.*;
import java.io.*;
+import java.util.*;
+
import org.basex.build.xml.*;
import org.basex.core.*;
import org.basex.io.*;
-import org.basex.io.in.*;
import org.basex.util.*;
-import org.ccil.cowan.tagsoup.*;
import org.xml.sax.*;
+import nu.validator.htmlparser.common.Heuristics;
+import nu.validator.htmlparser.common.XmlViolationPolicy;
+import nu.validator.htmlparser.sax.*;
+
/**
- * This class uses TagSoup to convert HTML input to well-formed XML.
- * If TagSoup is not found in the classpath, the original document is passed on.
+ * This class uses the Validator.nu HTML parser to convert HTML input to well-formed XML.
+ * If the Validator.nu HTML parser is not found in the classpath, the original document is
+ * passed on.
*
- * TagSoup was written by John Cowan and is based on the Apache 2.0 License:
- * {@code http://home.ccil.org/~cowan/XML/tagsoup/}.
+ * The Validator.nu HTML parser was written by Henri Sivonen and is based on the MIT License:
+ * {@code https://about.validator.nu/htmlparser/}.
*
* @author BaseX Team 2005-23, BSD License
* @author Christian Gruen
*/
public final class HtmlParser extends XMLParser {
/** Name of HTML Parser. */
- private static final String NAME = "TagSoup";
- /** TagSoup URL. */
- private static final String FEATURES = "http://www.ccil.org/~cowan/tagsoup/features/";
+ private static final String NAME = "Validator.nu";
/**
- * Checks if a CatalogResolver is available.
+ * Checks if Validator.nu is available.
* @return result of check
*/
public static boolean available() {
- return Reflect.available("org.ccil.cowan.tagsoup.Parser");
+ return firstUnavailableClass() == null;
+ }
+
+ /**
+ * Checks whether the Validator.nu classes are available on the class path.
+ * @return the name of the first class that is not available, or null if all classes are available
+ */
+ public static String firstUnavailableClass() {
+ for(final String className : Arrays.asList("nu.validator.htmlparser.sax.HtmlParser",
+ "nu.validator.htmlparser.sax.XmlSerializer",
+ "nu.validator.htmlparser.common.XmlViolationPolicy",
+ "nu.validator.htmlparser.common.Heuristics")) {
+ if(!Reflect.available(className)) return className;
+ }
+ return null;
}
/**
@@ -77,81 +95,63 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException
// reader could not be initialized; fall back to XML
if(!available()) return io;
- try(TextInput ti = new TextInput(io)) {
- // tries to extract the encoding from the input
- String enc = ti.encoding();
- final byte[] content = ti.content();
-
- // looks for a charset definition
- final byte[] encoding = token("charset=");
- int cs = indexOf(content, encoding);
- if(cs > 0) {
- // extracts the encoding string
- cs += encoding.length;
- int ce = cs;
- final int cl = content.length;
- while(++ce < cl && content[ce] > 0x28);
- enc = string(substring(content, cs, ce));
- }
-
+ try {
// define output
final StringWriter sw = new StringWriter();
- final XMLReader reader = new org.ccil.cowan.tagsoup.Parser();
- final XMLWriter writer = new XMLWriter(sw);
- writer.setOutputProperty(ENCODING.name(), Strings.UTF8);
+ final nu.validator.htmlparser.sax.HtmlParser reader =
+ new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
+ final ContentHandler writer = new XmlSerializer(sw);
reader.setContentHandler(writer);
+ reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
- // set TagSoup options
- if(hopts.get(HTML)) {
- reader.setFeature("http://xml.org/sax/features/namespaces", false);
- writer.setOutputProperty(METHOD.name(), "html");
- writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes");
+ // define input
+ final InputSource is = new InputSource(io.inputStream());
+
+ // set Validator.nu options
+ if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
+ reader.setCheckingNormalization(true);
+ if(hopts.get(MAPPING_LANG_TO_XML_LANG))
+ reader.setMappingLangToXmlLang(true);
+ if(hopts.get(SCRIPTING_ENABLED))
+ reader.setScriptingEnabled(true);
+ if(hopts.contains(CONTENT_SPACE_POLICY))
+ reader.setContentSpacePolicy(
+ XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name()));
+ if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY))
+ reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf(
+ hopts.get(CONTENT_NON_XML_CHAR_POLICY).name()));
+ if(hopts.contains(COMMENT_POLICY))
+ reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name()));
+ if(hopts.contains(XMLNS_POLICY))
+ reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name()));
+ if(hopts.contains(NAME_POLICY))
+ reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name()));
+ if(hopts.contains(STREAMABILITY_VIOLATION_POLICY))
+ reader.setStreamabilityViolationPolicy(
+ XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name()));
+ if(hopts.contains(XML_POLICY))
+ reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name()));
+ if(hopts.contains(HEURISTICS))
+ reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
+ // end Validator.nu options
+
+ String enc = io.encoding() != null
+ ? io.encoding()
+ : hopts.contains(ENCODING)
+ ? hopts.get(HtmlOptions.ENCODING)
+ : null; // TODO: sniff encoding
+ if (enc != null) {
+ if (!Strings.supported(enc))
+ throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.');
+ is.setEncoding(Strings.normEncoding(enc));
}
- if(hopts.get(NONS))
- reader.setFeature("http://xml.org/sax/features/namespaces", false);
- if(hopts.get(NOBOGONS))
- reader.setFeature(FEATURES + "ignore-bogons", true);
- if(hopts.get(NODEFAULTS))
- reader.setFeature(FEATURES + "default-attributes", false);
- if(hopts.get(NOCOLONS))
- reader.setFeature(FEATURES + "translate-colons", true);
- if(hopts.get(NORESTART))
- reader.setFeature(FEATURES + "restart-elements", false);
- if(hopts.get(IGNORABLE))
- reader.setFeature(FEATURES + "ignorable-whitespace", true);
- if(hopts.get(EMPTYBOGONS))
- reader.setFeature(FEATURES + "bogons-empty", true);
- if(hopts.get(ANY))
- reader.setFeature(FEATURES + "bogons-empty", false);
- if(hopts.get(NOROOTBOGONS))
- reader.setFeature(FEATURES + "root-bogons", false);
- if(hopts.get(NOCDATA))
- reader.setFeature(FEATURES + "cdata-elements", false);
- if(hopts.get(LEXICAL))
- reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
-
- if(hopts.get(OMIT_XML_DECLARATION))
- writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes");
- if(hopts.contains(METHOD))
- writer.setOutputProperty(METHOD.name(), hopts.get(METHOD));
- if(hopts.contains(DOCTYPE_SYSTEM))
- writer.setOutputProperty(DOCTYPE_SYSTEM.name(), hopts.get(DOCTYPE_SYSTEM));
- if(hopts.contains(DOCTYPE_PUBLIC))
- writer.setOutputProperty(DOCTYPE_PUBLIC.name(), hopts.get(DOCTYPE_PUBLIC));
-
- if(hopts.contains(ENCODING))
- enc = hopts.get(ENCODING);
- // end TagSoup options
- // define input
- final InputSource is = new InputSource(new ArrayInput(content));
- is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8);
reader.parse(is);
return new IOContent(token(sw.toString()), io.name());
} catch(final SAXException ex) {
Util.errln(ex);
- return io;
+ throw INVHTML_X.getIO(ex.getLocalizedMessage());
}
}
}
diff --git a/basex-core/src/main/java/org/basex/core/MainOptions.java b/basex-core/src/main/java/org/basex/core/MainOptions.java
index 2357d9a5e7..1afd1cda7a 100644
--- a/basex-core/src/main/java/org/basex/core/MainOptions.java
+++ b/basex-core/src/main/java/org/basex/core/MainOptions.java
@@ -41,7 +41,7 @@ public final class MainOptions extends Options {
/** Define JSON parser options. */
public static final OptionsOption JSONPARSER =
new OptionsOption<>("JSONPARSER", new JsonParserOptions());
- /** Define TagSoup HTML options. */
+ /** Define Validator.nu HTML options. */
public static final OptionsOption HTMLPARSER =
new OptionsOption<>("HTMLPARSER", new HtmlOptions());
/** Define import parser. */
diff --git a/basex-core/src/main/java/org/basex/query/QueryError.java b/basex-core/src/main/java/org/basex/query/QueryError.java
index d127bde013..a13468463d 100644
--- a/basex-core/src/main/java/org/basex/query/QueryError.java
+++ b/basex-core/src/main/java/org/basex/query/QueryError.java
@@ -307,11 +307,6 @@ public enum QueryError {
/** Error code. */
HASH_ALGORITHM_X(HASH, "algorithm", "Algorithm not supported: '%'."),
- // HTML Module
-
- /** Error code. */
- HTML_PARSE_X(HTML, "parse", "%"),
-
// HTTP Module
/** Invalid URI. */
@@ -616,9 +611,9 @@ public enum QueryError {
/** Error code. */
RESINV_X(FODC, 7, "Resource path '%' is invalid."),
/** Error code. */
- INVHTML_X(FODC, 11, "String passed to fn:parse-html is not a well-formed HTML document: %"),
+ INVHTML_X(FODC, 11, "HTML parsing failed: %"),
/** Error code. */
- INVHTMLOPT_X(FODC, 12, "Unsupported HTML parser option: %"),
+ INVHTMLOPT_X(FODC, 12, "HTML option processing failed: %"),
/** Error code. */
FORMNUM_X(FODF, 1280, "Unknown decimal format: '%'."),
diff --git a/basex-core/src/main/java/org/basex/query/func/FuncOptions.java b/basex-core/src/main/java/org/basex/query/func/FuncOptions.java
index d65b445dcf..b867d05de2 100644
--- a/basex-core/src/main/java/org/basex/query/func/FuncOptions.java
+++ b/basex-core/src/main/java/org/basex/query/func/FuncOptions.java
@@ -38,14 +38,15 @@ public final class FuncOptions {
private final InputInfo info;
/** Raise error if a supplied option is unknown. */
- private boolean enforceKnown;
+ private final boolean enforceKnown;
/**
* Constructor.
* @param info input info (can be {@code null})
+ * @param enforceKnown raise error if a supplied option is unknown
*/
- public FuncOptions(final InputInfo info) {
- this(null, info);
+ public FuncOptions(final InputInfo info, final boolean enforceKnown) {
+ this(null, info, enforceKnown);
}
/**
@@ -54,24 +55,20 @@ public FuncOptions(final InputInfo info) {
* @param info input info (can be {@code null})
*/
public FuncOptions(final QNm root, final InputInfo info) {
- test = root == null ? null : new NameTest(root);
- this.root = root;
- this.info = info;
+ this(root, info, false);
}
/**
- * Assigns values to the specified options.
- * @param item item to be converted (can be {@link Empty#VALUE})
- * @param options options
- * @param option type
- * @param enforce raise error if a supplied option is unknown
- * @return specified options
- * @throws QueryException query exception
+ * Constructor.
+ * @param root name of root node (can be {@code null})
+ * @param info input info (can be {@code null})
+ * @param enforceKnown raise error if a supplied option is unknown
*/
- public T assign(final Item item, final T options, final boolean enforce)
- throws QueryException {
- enforceKnown = enforce;
- return assign(item, options, INVALIDOPT_X);
+ private FuncOptions(final QNm root, final InputInfo info, final boolean enforceKnown) {
+ test = root == null ? null : new NameTest(root);
+ this.root = root;
+ this.info = info;
+ this.enforceKnown = enforceKnown;
}
/**
@@ -83,7 +80,7 @@ public T assign(final Item item, final T options, final bool
* @return specified options
* @throws QueryException query exception
*/
- private T assign(final Item item, final T options, final QueryError error)
+ public T assign(final Item item, final T options, final QueryError error)
throws QueryException {
if(!item.isEmpty()) {
diff --git a/basex-core/src/main/java/org/basex/query/func/StandardFunc.java b/basex-core/src/main/java/org/basex/query/func/StandardFunc.java
index 9ddd35bfba..40e384c368 100644
--- a/basex-core/src/main/java/org/basex/query/func/StandardFunc.java
+++ b/basex-core/src/main/java/org/basex/query/func/StandardFunc.java
@@ -507,7 +507,22 @@ protected final HashMap toOptions(final Expr expr, final QueryCo
*/
protected final E toOptions(final Expr expr, final E options,
final boolean enforce, final QueryContext qc) throws QueryException {
- return new FuncOptions(info).assign(expr.item(qc, info), options, enforce);
+ return new FuncOptions(info, enforce).assign(expr.item(qc, info), options, INVALIDOPT_X);
+ }
+
+ /**
+ * Evaluates an expression, if it exists, and returns options.
+ * @param options type
+ * @param expr expression (can be {@code Empty#UNDEFINED})
+ * @param options options template
+ * @param error error to raise, if a supplied option is unknown
+ * @param qc query context
+ * @return options
+ * @throws QueryException query exception
+ */
+ protected final E toOptions(final Expr expr, final E options,
+ final QueryError error, final QueryContext qc) throws QueryException {
+ return new FuncOptions(info, true).assign(expr.item(qc, info), options, error);
}
/**
diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
index a891859e34..a6b74d1775 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
@@ -1,27 +1,11 @@
package org.basex.query.func.html;
-import static org.basex.build.html.HtmlOptions.*;
import static org.basex.query.QueryError.*;
-import static org.basex.util.Token.*;
-import java.io.*;
-
-import org.basex.build.html.*;
-import org.basex.build.xml.*;
-import org.basex.core.*;
-import org.basex.io.*;
+import org.basex.build.html.HtmlParser;
import org.basex.query.*;
-import org.basex.query.expr.*;
-import org.basex.query.func.*;
import org.basex.query.value.item.*;
-import org.basex.query.value.node.*;
-import org.basex.query.value.seq.*;
import org.basex.util.*;
-import org.xml.sax.*;
-
-import nu.validator.htmlparser.sax.*;
-import nu.validator.htmlparser.common.XmlViolationPolicy;
-import nu.validator.htmlparser.common.Heuristics;
/**
* Function implementation.
@@ -29,145 +13,12 @@
* @author BaseX Team 2005-23, BSD License
* @author Gunther Rademacher
*/
-public class FnParseHtml extends StandardFunc {
- // TODO: handle second argument (method, html-version, encoding), produce error code FODC0012
+public class FnParseHtml extends HtmlParse {
@Override
public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
- final Item value = arg(0).atomItem(qc, info);
- final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc);
- final IO io = value instanceof Bin ? new IOContent(toBytes(value))
- : new IOContent(toBytes(value), "", Strings.UTF8);
- return value.isEmpty() ? Empty.VALUE : parse(io, options);
- }
-
- @Override
- protected final Expr opt(final CompileContext cc) {
- return optFirst();
- }
-
- /**
- * Parses the input and creates an XML document.
- * @param io input data
- * @param options HTML options
- * @return node
- * @throws QueryException query exception
- */
- protected final Item parse(final IO io, final HtmlOptions options) throws QueryException {
- try {
- if (!ParserImpl.available()) {
- // reader could not be initialized; fall back to html:parse
- final HtmlOptions htmlOptions = new HtmlOptions();
- htmlOptions.set(HtmlOptions.LEXICAL, true);
- htmlOptions.set(HtmlOptions.NONS, false);
- return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), htmlOptions));
- }
- return new DBNode(new ParserImpl(info, io, options));
- } catch(final IOException ex) {
- throw INVHTML_X.get(info, ex);
- }
- }
-
- /**
- * Parser implementation.
- */
- private static class ParserImpl extends XMLParser {
-
- /**
- * Checks if Validator.nu is available.
- * @return result of check
- */
- public static boolean available() {
- return Reflect.available("nu.validator.htmlparser.sax.HtmlParser")
- && Reflect.available("nu.validator.htmlparser.sax.XmlSerializer")
- && Reflect.available("nu.validator.htmlparser.common.Heuristics")
- && Reflect.available("nu.validator.htmlparser.common.XmlViolationPolicy");
- }
-
- /**
- * Constructor.
- * @param info input info
- * @param source document source
- * @param options HTML options
- * @throws IOException I/O exception
- * @throws QueryException query exception
- */
- ParserImpl(final InputInfo info, final IO source, final HtmlOptions options)
- throws IOException, QueryException {
- super(toXml(info, source, options), new MainOptions());
- }
-
- /**
- * Converts an HTML document to XML.
- * @param info input info
- * @param io io reference
- * @param hopts HTML options
- * @return parser
- * @throws IOException I/O exception
- * @throws QueryException query exception
- */
- private static IO toXml(final InputInfo info, final IO io, final HtmlOptions hopts)
- throws IOException, QueryException {
-
- try {
- // define output
- final StringWriter sw = new StringWriter();
- final nu.validator.htmlparser.sax.HtmlParser reader =
- new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
- final ContentHandler writer = new XmlSerializer(sw);
- reader.setContentHandler(writer);
- reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
-
- // define input
- final InputSource is = new InputSource(io.inputStream());
-
- // set Validator.nu options
- if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
- reader.setCheckingNormalization(true);
- if(hopts.get(MAPPING_LANG_TO_XML_LANG))
- reader.setMappingLangToXmlLang(true);
- if(hopts.get(SCRIPTING_ENABLED))
- reader.setScriptingEnabled(true);
- if(hopts.contains(CONTENT_SPACE_POLICY))
- reader.setContentSpacePolicy(
- XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name()));
- if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY))
- reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf(
- hopts.get(CONTENT_NON_XML_CHAR_POLICY).name()));
- if(hopts.contains(COMMENT_POLICY))
- reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name()));
- if(hopts.contains(XMLNS_POLICY))
- reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name()));
- if(hopts.contains(NAME_POLICY))
- reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name()));
- if(hopts.contains(STREAMABILITY_VIOLATION_POLICY))
- reader.setStreamabilityViolationPolicy(
- XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name()));
- if(hopts.contains(XML_POLICY))
- reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name()));
-
- if(hopts.contains(HEURISTICS))
- reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
- // end Validator.nu options
-
- String enc = io.encoding() != null
- ? io.encoding()
- : hopts.contains(ENCODING)
- ? hopts.get(HtmlOptions.ENCODING)
- : null; // TODO: sniff encoding
- if (enc != null) {
- if (!Strings.supported(enc))
- throw INVALIDOPT_X.get(info, "Unsupported encoding: " + enc + '.');
- is.setEncoding(Strings.normEncoding(enc));
- }
-
- reader.parse(is);
- return new IOContent(token(sw.toString()), io.name());
-
- } catch(final SAXException ex) {
- Util.errln(ex);
- throw INVHTML_X.get(info, ex.getLocalizedMessage());
- }
- }
+ String className = HtmlParser.firstUnavailableClass();
+ if (className != null) throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className);
+ return super.item(qc, ii);
}
}
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
index a582837af8..576fc82b4d 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
@@ -25,7 +25,10 @@ public class HtmlParse extends StandardFunc {
@Override
public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
final Item value = arg(0).atomItem(qc, info);
- return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)), qc);
+ if (value.isEmpty()) return Empty.VALUE;
+ final IO io = value instanceof Bin ? new IOContent(toBytes(value))
+ : new IOContent(toBytes(value), "", Strings.UTF8);
+ return parse(io, qc);
}
@Override
@@ -41,11 +44,11 @@ protected final Expr opt(final CompileContext cc) {
* @throws QueryException query exception
*/
protected final Item parse(final IO io, final QueryContext qc) throws QueryException {
- final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc);
+ final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), INVHTMLOPT_X, qc);
try {
return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), options));
} catch(final IOException ex) {
- throw HTML_PARSE_X.get(info, ex);
+ throw INVHTML_X.get(info, ex);
}
}
}
diff --git a/basex-core/src/main/resources/lang/Chinese.lang b/basex-core/src/main/resources/lang/Chinese.lang
index 82caebcf90..610a45e5e4 100644
--- a/basex-core/src/main/resources/lang/Chinese.lang
+++ b/basex-core/src/main/resources/lang/Chinese.lang
@@ -417,7 +417,7 @@ h_db_format = 数据库结构已经变了,请使用新版软件
h_db_options_% = 如果执行'%',这个选项将会被设置
h_diacritics = 索引保留了发音符号
h_fulltext_index = 全文索引可以加速全文检索
-h_html_parser = 将使用TagSoup将HTML转为XML
+h_html_parser = 将使用Validator.nu将HTML转为XML
h_index_format = 索引格式变了,请建新索引
h_int_parser = 容错,而且比Java的默认解析器更快
h_languauge = 将使用根据语言确定的tokenizer
diff --git a/basex-core/src/main/resources/lang/Dutch.lang b/basex-core/src/main/resources/lang/Dutch.lang
index e52e0e9345..d507ea781f 100644
--- a/basex-core/src/main/resources/lang/Dutch.lang
+++ b/basex-core/src/main/resources/lang/Dutch.lang
@@ -417,7 +417,7 @@ h_db_format = Het database formaat is gewijzigd; maak een nieuwe databa
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = Diakritische tekens worden gebruikt in de index.
h_fulltext_index = Een full-text index versnelt full-text queries.
-h_html_parser = De TagSoup parser zal gebruikt worden om HTML naar XML te converteren.
+h_html_parser = De Validator.nu HTML parser zal gebruikt worden om HTML naar XML te converteren.
h_index_format = Het index formaat is gewijzigd; maak nieuwe indexen.
h_int_parser = Robuuster en sneller dan Java's standaard parser.
h_languauge = Met deze optie zullen taalspecifieke parsers worden gebruikt.
diff --git a/basex-core/src/main/resources/lang/English.lang b/basex-core/src/main/resources/lang/English.lang
index 0b093bd970..5cc46cc7ed 100644
--- a/basex-core/src/main/resources/lang/English.lang
+++ b/basex-core/src/main/resources/lang/English.lang
@@ -417,7 +417,7 @@ h_db_format = The database format has changed; please use a newer versi
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = Diacritics are retained in the index.
h_fulltext_index = A full-text index speeds up full-text queries.
-h_html_parser = The TagSoup parser will be used to convert HTML to XML.
+h_html_parser = The Validator.nu HTML parser will be used to convert HTML to XML.
h_index_format = The index format has changed; please create new indexes.
h_int_parser = Fault tolerant, and faster than Java’s default parser.
h_languauge = Language specific tokenizers will be used.
diff --git a/basex-core/src/main/resources/lang/French.lang b/basex-core/src/main/resources/lang/French.lang
index bcbb7a3f41..2f10e53bcf 100644
--- a/basex-core/src/main/resources/lang/French.lang
+++ b/basex-core/src/main/resources/lang/French.lang
@@ -417,7 +417,7 @@ h_db_format = Le format de base de données a changé ; Veuillez créer
h_db_options_% = Les options seront assignées si on exécute '%'.
h_diacritics = Les signes diacritiques sont conservés dans l’index.
h_fulltext_index = Un index plein texte accélère les requêtes plein texte.
-h_html_parser = Le parser TagSoup sera utilisé pour convertir le HTML en XML.
+h_html_parser = Le parser HTML Validator.nu sera utilisé pour convertir le HTML en XML.
h_index_format = Le format des index a changé ; Veuillez créer de nouveaux index.
h_int_parser = Tolérant aux fautes, et plus rapide que le parser Java par défaut.
h_languauge = Des analyseurs spécifiques à la langue vont être utilisés.
diff --git a/basex-core/src/main/resources/lang/German.lang b/basex-core/src/main/resources/lang/German.lang
index 8a674e17c1..a4056c130d 100644
--- a/basex-core/src/main/resources/lang/German.lang
+++ b/basex-core/src/main/resources/lang/German.lang
@@ -417,7 +417,7 @@ h_db_format = Das Datenbankformat hat sich geändert; bitte verwenden S
h_db_options_% = Die Optionen werden zugewiesen, wenn '%' ausgeführt wird.
h_diacritics = Diakritische Zeichen werden im Index beibehalten.
h_fulltext_index = Ein Volltext-Index beschleunigt Volltext-Anfragen.
-h_html_parser = Der TagSoup-Parser wird verwendet, um HTML in XML zu konvertieren.
+h_html_parser = Der Validator.nu HTML-Parser wird verwendet, um HTML in XML zu konvertieren.
h_index_format = Das Indexformat hat sich geändert; bitte erstellen Sie neue Indizes.
h_int_parser = Fehlertolerant und schneller als Javas XML-Parser.
h_languauge = Sprachspezifische Tokenisierung wird verwendet.
diff --git a/basex-core/src/main/resources/lang/Hungarian.lang b/basex-core/src/main/resources/lang/Hungarian.lang
index bfe804283e..41a1c781c5 100644
--- a/basex-core/src/main/resources/lang/Hungarian.lang
+++ b/basex-core/src/main/resources/lang/Hungarian.lang
@@ -417,7 +417,7 @@ h_db_format = Az adatbázis formátuma megváltozott; kérem, használj
h_db_options_% = Ezek az beállítások csak a következő futtása után lépnek életbe: '%'
h_diacritics = Ékezetek megmaradnak az indexelésben.
h_fulltext_index = A teljes-szöveg index gyorsítja a teljes-szöveges (full-text) lekérdezéseket.
-h_html_parser = A TagSoup elemző HTML formátumot konvertál XML formátumra.
+h_html_parser = A Validator.nu elemző HTML formátumot konvertál XML formátumra.
h_index_format = Az index formátuma megváltozott; kérem, készítsen új indexeket.
h_int_parser = Hibatűrő, továbbá a Java alapértelmezett elemzőjénél gyorsabb.
h_languauge = Nyelvfüggő szövegelemzések is használatra kerülnek.
diff --git a/basex-core/src/main/resources/lang/Indonesian.lang b/basex-core/src/main/resources/lang/Indonesian.lang
index c23c842fa2..096d98f267 100644
--- a/basex-core/src/main/resources/lang/Indonesian.lang
+++ b/basex-core/src/main/resources/lang/Indonesian.lang
@@ -417,7 +417,7 @@ h_db_format = Bentuk basisdata telah berubah; mohon gunakan versi yang
h_db_options_% = Pilihan akan digunakan jika '%' dijalankan.
h_diacritics = Diakritik dipertahankan dalam indeks.
h_fulltext_index = Indeks semua teks mempercepat kueri teks penuh.
-h_html_parser = Pengurai TagSoup akan digunakan untuk mengubah HTML menjadi XML.
+h_html_parser = Pengurai Validator.nu akan digunakan untuk mengubah HTML menjadi XML.
h_index_format = Bentuk indeks telah berubah; mohon buat indeks baru.
h_int_parser = Toleran kesalahan, dan lebih cepat dari pengurai standar Java.
h_languauge = Pengurai teks bahasa tertentu akan digunakan.
diff --git a/basex-core/src/main/resources/lang/Italian.lang b/basex-core/src/main/resources/lang/Italian.lang
index f95804268d..dcb10fb853 100644
--- a/basex-core/src/main/resources/lang/Italian.lang
+++ b/basex-core/src/main/resources/lang/Italian.lang
@@ -417,7 +417,7 @@ h_db_format = Il formato della base di dati è cambiato; creare una nuo
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = I segni diacritici sono conservati nell'indice.
h_fulltext_index = Un indice "full-text" velocizza le interrogazioni sul testo.
-h_html_parser = Il parser TagSoup verrò usato per convertire HTML in XML.
+h_html_parser = Il parser Validator.nu verrò usato per convertire HTML in XML.
h_index_format = Il formato degli indici è cambiato; creare nuovi indici.
h_int_parser = Tollerante ai guasti e più veloce del parser di default di Java.
h_languauge = Parser di testo specifici per la lingua verranno usati
diff --git a/basex-core/src/main/resources/lang/Japanese.lang b/basex-core/src/main/resources/lang/Japanese.lang
index e12f3734f7..1aa797d681 100644
--- a/basex-core/src/main/resources/lang/Japanese.lang
+++ b/basex-core/src/main/resources/lang/Japanese.lang
@@ -417,7 +417,7 @@ h_db_format = データベース形式を変更しました。新しい
h_db_options_% = % 実行時にオプションが割り当てられます。
h_diacritics = インデックス内で付加記号(ウムラウト等)は保持されます。
h_fulltext_index = 全文テキストインデックスは全文検索を高速化します。
-h_html_parser = TagSoup パーサは HTML を XML に変換します。
+h_html_parser = Validator.nu パーサは HTML を XML に変換します。
h_index_format = インデックス形式を変更しました。新しくインデックスを作成して下さい。
h_int_parser = フォールトトレラント、Javaのデフォルトパーサより高速。
h_languauge = 指定された言語のテキストパーサが使用されます。
diff --git a/basex-core/src/main/resources/lang/Mongolian.lang b/basex-core/src/main/resources/lang/Mongolian.lang
index 5834f92c48..995f0d178f 100644
--- a/basex-core/src/main/resources/lang/Mongolian.lang
+++ b/basex-core/src/main/resources/lang/Mongolian.lang
@@ -417,7 +417,7 @@ h_db_format = Өгөгдлийн сангийн формат өөрчл
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = Индекс дэх санах тэмдгийг авч үлдэх.
h_fulltext_index = Бүтэн текст индекс нь бүрэн текст квериг хурдан ажиллагаатай болгоно.
-h_html_parser = The TagSoup parser will be used to convert HTML to XML.
+h_html_parser = The Validator.nu HTML parser will be used to convert HTML to XML.
h_index_format = Индекс формат өөрчлөгдсөн байна; шинээр үүсгэнэ үү.
h_int_parser = Fault tolerant, and faster than Java’s default parser.
h_languauge = Хэлний текст Parser тодорхойлогдох болно.
diff --git a/basex-core/src/main/resources/lang/Romanian.lang b/basex-core/src/main/resources/lang/Romanian.lang
index 7903adde32..5cae6261d3 100644
--- a/basex-core/src/main/resources/lang/Romanian.lang
+++ b/basex-core/src/main/resources/lang/Romanian.lang
@@ -417,7 +417,7 @@ h_db_format = Formatul bazei de date a fost schimbat, vă rugăm să fo
h_db_options_% = Optiunile vor fi asignate daca '%' este executată.
h_diacritics = Diacritice sunt păstrate în index.
h_fulltext_index = Un full-text index accelereaza interogările full-text.
-h_html_parser = Parserul TagSoup va fi folosit pentru a converti HTML în XML.
+h_html_parser = Parserul "Validator.nu" va fi folosit pentru a converti HTML în XML.
h_index_format = Formatul index s-a schimbat, vă rugăm creati noi indici.
h_int_parser = Tolerant la greseli si mai rapid decat parserul default Java.
h_languauge = Parsere de text specifice limbii vor fi folosite.
diff --git a/basex-core/src/main/resources/lang/Russian.lang b/basex-core/src/main/resources/lang/Russian.lang
index 01be29bb23..d520598dbd 100644
--- a/basex-core/src/main/resources/lang/Russian.lang
+++ b/basex-core/src/main/resources/lang/Russian.lang
@@ -417,7 +417,7 @@ h_db_format = Формат хранения баз данных был
h_db_options_% = Эти опции будут изменены только после выполнения команды [%]
h_diacritics = Разделительные знаки будут включены в индекс
h_fulltext_index = Полнотекстовый индекс ускоряет соответствующие запросы
-h_html_parser = Для конвертации HTML в XML будет использован парсер TagSoup
+h_html_parser = Для конвертации HTML в XML будет использован парсер Validator.nu
h_index_format = Формат хранения индексов был изменен. Пожалуйста, создайте индексы заново.
h_int_parser = Толерантный к ошибкам и быстрее чем стандартный парсер Java
h_languauge = Будут использованы специализированные под каждый язык парсеры
diff --git a/basex-core/src/main/resources/lang/Spanish.lang b/basex-core/src/main/resources/lang/Spanish.lang
index 61439dc026..1d10d504bd 100644
--- a/basex-core/src/main/resources/lang/Spanish.lang
+++ b/basex-core/src/main/resources/lang/Spanish.lang
@@ -417,7 +417,7 @@ h_db_format = El formato de la Base de Datos ha cambiado; por favor uti
h_db_options_% = Las opciónes serán asignado si se ejecuta '%'.
h_diacritics = Las diacríticas están retenidas en el índice.
h_fulltext_index = Un índice de Texto Completo acelera las consulta de Texto Completo.
-h_html_parser = Se utilizará el Analizador Sintáctico TagSoup para convertir HTML a XML.
+h_html_parser = Se utilizará el Analizador Sintáctico Validator.nu para convertir HTML a XML.
h_index_format = El formato del índice ha cambiado; for favor, cree nuevos índices.
h_int_parser = Tolerante a fallos, y más rápido que el analizador sintáctico por defecto de Java.
h_languauge = Se utilizarán analizadores sintácticos de texto específicos del lenguaje.
diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
index 49e1539d5f..f21ec21461 100644
--- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
@@ -1468,8 +1468,8 @@ public final class FnModuleTest extends SandboxTest {
error(func.args(42), STRBIN_X_X);
error(func.args(" \"42\"", 42), MAP_X_X);
- error(func.args(" \"42\"", " map {'1234': ()}"), INVALIDOPT_X);
- error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVALIDOPT_X);
+ error(func.args(" \"42\"", " map {'1234': ()}"), INVHTMLOPT_X);
+ error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVHTMLOPT_X);
}
/** Test method. */
diff --git a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
index 59636312bf..e85fedcf5a 100644
--- a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
@@ -20,8 +20,7 @@ public final class HtmlModuleTest extends SandboxTest {
query(func.args(" <_/>/text()"), "");
final String path = "src/test/resources/input.html";
- query(func.args(path) + "//body ! name()", "body");
- query(func.args(path, " map { 'nons': false() }") + "//*:body ! name()", "body");
+ query(func.args(path) + "//*:body ! name()", "body");
}
/** Test method. */
@@ -33,7 +32,8 @@ public final class HtmlModuleTest extends SandboxTest {
// check if the function returns an HTML root node
query("exists(" + func.args("<html/>") + "/*:html)", true);
// check if the function returns
- query(func.args("<html/>", " map { 'nons': true() }"), "");
+ query(func.args("<html/>"),
+ "");
}
/** Test method. */
diff --git a/basex-examples/basex-examples.iml b/basex-examples/basex-examples.iml
index d68f7b229d..c8c6019d38 100644
--- a/basex-examples/basex-examples.iml
+++ b/basex-examples/basex-examples.iml
@@ -30,7 +30,7 @@
-
+
diff --git a/basex-examples/pom.xml b/basex-examples/pom.xml
index 2e4dbd2db5..c350579a51 100644
--- a/basex-examples/pom.xml
+++ b/basex-examples/pom.xml
@@ -18,8 +18,8 @@
${project.version}
- org.ccil.cowan.tagsoup
- tagsoup
+ nu.validator
+ htmlparser
org.junit.jupiter
diff --git a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
index 3481bbcc26..40b96ffc23 100644
--- a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
+++ b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
@@ -6,7 +6,7 @@
/**
* This example demonstrates how to import a file in the HTML format
* into the database. The specified input file will be converted to XML
- * if TagSoup is found in the classpath.
+ * if Validator.nu is found in the classpath.
*
* @author BaseX Team 2005-23, BSD License
* @author Christian Gruen
diff --git a/pom.xml b/pom.xml
index 3d09d55fcb..24f5d352c1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -64,9 +64,9 @@
true
- org.ccil.cowan.tagsoup
- tagsoup
- 1.2.1
+ nu.validator
+ htmlparser
+ 1.4.16
runtime
true
@@ -174,13 +174,6 @@
runtime
true
-
- nu.validator
- htmlparser
- 1.4.16
- runtime
- true
-
From e99d48b0e097f8162bdb6e4fd38fffe36053322d Mon Sep 17 00:00:00 2001
From: Gunther Rademacher
Date: Mon, 13 Nov 2023 12:41:32 +0100
Subject: [PATCH 5/9] set scope=compile; handle dependencies of "heuristics"
setting; test meta/@charset
---
basex-core/pom.xml | 2 +-
.../java/org/basex/query/func/Function.java | 2 +-
.../org/basex/query/func/html/HtmlParse.java | 32 +++++++++++++++++++
.../org/basex/query/func/FnModuleTest.java | 18 ++++++-----
4 files changed, 44 insertions(+), 10 deletions(-)
diff --git a/basex-core/pom.xml b/basex-core/pom.xml
index f3c6d14885..33a2503167 100644
--- a/basex-core/pom.xml
+++ b/basex-core/pom.xml
@@ -32,7 +32,7 @@
nu.validator
htmlparser
- provided
+ compile
true
diff --git a/basex-core/src/main/java/org/basex/query/func/Function.java b/basex-core/src/main/java/org/basex/query/func/Function.java
index c58cbe4b6a..8392399bf7 100644
--- a/basex-core/src/main/java/org/basex/query/func/Function.java
+++ b/basex-core/src/main/java/org/basex/query/func/Function.java
@@ -468,7 +468,7 @@ ITEM_ZM, flag(HOF)),
params(STRING_ZO), DATE_TIME_ZO),
/** XQuery function. */
PARSE_HTML(FnParseHtml::new, "parse-html(html[,options])",
- params(ITEM_ZO, MAP_O), DOCUMENT_NODE_ZO),
+ params(ANY_ATOMIC_TYPE_ZO, MAP_O), DOCUMENT_NODE_ZO),
/** XQuery function. */
PARSE_INTEGER(FnParseInteger::new, "parse-integer(value[,radix])",
params(STRING_O, INTEGER_O), INTEGER_O),
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
index 576fc82b4d..c7cf9d24af 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
@@ -1,5 +1,6 @@
package org.basex.query.func.html;
+import static org.basex.build.html.HtmlOptions.*;
import static org.basex.query.QueryError.*;
import java.io.*;
@@ -22,6 +23,12 @@
* @author Christian Gruen
*/
public class HtmlParse extends StandardFunc {
+ /** Class needed for heuristics=ICU. */
+ private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector";
+ /** Class needed for heuristics=CHARDET. */
+ private static final String CHARDET_CLASS_NAME =
+ "org.mozilla.intl.chardet.nsICharsetDetectionObserver";
+
@Override
public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
final Item value = arg(0).atomItem(qc, info);
@@ -45,10 +52,35 @@ protected final Expr opt(final CompileContext cc) {
*/
protected final Item parse(final IO io, final QueryContext qc) throws QueryException {
final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), INVHTMLOPT_X, qc);
+ if(options.contains(HEURISTICS)) {
+ switch (options.get(HEURISTICS)) {
+ case ALL:
+ ensureAvailable(ICU_CLASS_NAME);
+ ensureAvailable(CHARDET_CLASS_NAME);
+ break;
+ case ICU:
+ ensureAvailable(ICU_CLASS_NAME);
+ break;
+ case CHARDET:
+ ensureAvailable(CHARDET_CLASS_NAME);
+ break;
+ default:
+ }
+ }
try {
return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), options));
} catch(final IOException ex) {
throw INVHTML_X.get(info, ex);
}
}
+
+ /**
+ * Ensure that a required class is available on the class path.
+ * @param className the class name
+ * @throws QueryException query exception
+ */
+ private void ensureAvailable(final String className) throws QueryException {
+ if(!Reflect.available(className))
+ throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className);
+ }
}
diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
index e4eaf33975..44b7e3b8bb 100644
--- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java
@@ -1480,16 +1480,18 @@ public final class FnModuleTest extends SandboxTest {
" map {'encoding': '" + Strings.UTF16LE + "', 'xml-policy': 'ALTER_INFOSET'}"),
"42");
- query(func.args(_CONVERT_STRING_TO_BASE64.args("42", Strings.UTF16BE),
- " map {'encoding': '" + Strings.UTF16BE + "', 'heuristics': 'NONE'}"),
- "42");
+ query(func.args(_CONVERT_STRING_TO_BASE64.args(""
+ + "\u20AC", "ISO-8859-7"), " map {'heuristics': 'NONE'}"),
+ ""
+ + "\u20AC");
error(func.args(42), STRBIN_X_X);
- error(func.args(" \"42\"", 42), MAP_X_X);
- error(func.args(" \"42\"", " map {'1234': ()}"), INVHTMLOPT_X);
- error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVHTMLOPT_X);
+ error(func.args("42", 42), MAP_X_X);
+ error(func.args("42", " map {'1234': ''}"), INVHTMLOPT_X);
+ error(func.args("42", " map {'heuristics': '5678'}"), INVHTMLOPT_X);
+ error(func.args("42", " map {'heuristics': 'CHARDET'}"), BASEX_CLASSPATH_X_X);
+ error(func.args("42", " map {'heuristics': 'ICU'}"), BASEX_CLASSPATH_X_X);
+ error(func.args("42", " map {'heuristics': 'ALL'}"), BASEX_CLASSPATH_X_X);
}
/** Test method. */
From 7dd830e38f6b5b7a601943ecb1196d343d0b7878 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher
Date: Mon, 13 Nov 2023 13:04:00 +0100
Subject: [PATCH 6/9] minor changes
---
.../org/basex/build/html/HtmlOptions.java | 30 +++++++++----------
.../java/org/basex/build/html/HtmlParser.java | 7 ++---
.../org/basex/query/func/html/HtmlParse.java | 4 +--
basex-core/src/main/resources/lang/Dutch.lang | 2 +-
.../src/main/resources/lang/English.lang | 2 +-
.../src/main/resources/lang/French.lang | 2 +-
.../src/main/resources/lang/German.lang | 2 +-
.../src/main/resources/lang/Mongolian.lang | 2 +-
8 files changed, 25 insertions(+), 26 deletions(-)
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
index a6ce8f6cc9..311a854066 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
@@ -9,47 +9,47 @@
* @author Christian Gruen
*/
public final class HtmlOptions extends Options {
- /** fn:parse-html option encoding. */
+ /** fn:parse-html option: encoding. */
public static final StringOption ENCODING = new StringOption("encoding");
- /** fn:parse-html option method. */
+ /** fn:parse-html option: method. */
public static final StringOption METHOD = new StringOption("method");
- /** fn:parse-html option html-version. */
+ /** fn:parse-html option: html-version. */
public static final StringOption HTML_VERSION = new StringOption("html-version");
- /** fn:parse-html option include-template-content. */
+ /** fn:parse-html option: include-template-content. */
public static final BooleanOption INCLUDE_TEMPLATE_CONTENT =
new BooleanOption("include-template-content");
- /** Validator.nu option unicode-normalization-checking. */
+ /** Validator.nu option: unicode-normalization-checking. */
public static final BooleanOption UNICODE_NORMALIZATION_CHECKING =
new BooleanOption("unicode-normalization-checking", false);
- /** Validator.nu option mapping-lang-to-xml-lang. */
+ /** Validator.nu option: mapping-lang-to-xml-lang. */
public static final BooleanOption MAPPING_LANG_TO_XML_LANG =
new BooleanOption("mapping-lang-to-xml-lang", false);
- /** Validator.nu option scripting-enabled. */
+ /** Validator.nu option: scripting-enabled. */
public static final BooleanOption SCRIPTING_ENABLED =
new BooleanOption("scripting-enabled", false);
- /** Validator.nu option content-space-policy. */
+ /** Validator.nu option: content-space-policy. */
public static final EnumOption CONTENT_SPACE_POLICY =
new EnumOption<>("content-space-policy", XmlViolationPolicy.class);
- /** Validator.nu option content-non-xml-char-policy. */
+ /** Validator.nu option: content-non-xml-char-policy. */
public static final EnumOption CONTENT_NON_XML_CHAR_POLICY =
new EnumOption<>("content-non-xml-char-policy", XmlViolationPolicy.class);
- /** Validator.nu option comment-policy. */
+ /** Validator.nu option: comment-policy. */
public static final EnumOption COMMENT_POLICY =
new EnumOption<>("comment-policy", XmlViolationPolicy.class);
- /** Validator.nu option xmlns-policy. */
+ /** Validator.nu option: xmlns-policy. */
public static final EnumOption XMLNS_POLICY =
new EnumOption<>("xmlns-policy", XmlViolationPolicy.class);
- /** Validator.nu option name-policy. */
+ /** Validator.nu option: name-policy. */
public static final EnumOption NAME_POLICY =
new EnumOption<>("name-policy", XmlViolationPolicy.class);
- /** Validator.nu option streamability-violation-policy. */
+ /** Validator.nu option: streamability-violation-policy. */
public static final EnumOption STREAMABILITY_VIOLATION_POLICY =
new EnumOption<>("streamability-violation-policy", XmlViolationPolicy.class);
- /** Validator.nu option xml-policy. */
+ /** Validator.nu option: xml-policy. */
public static final EnumOption XML_POLICY =
new EnumOption<>("xml-policy", XmlViolationPolicy.class);
- /** Validator.nu option heuristics. */
+ /** Validator.nu option: heuristics. */
public static final EnumOption HEURISTICS =
new EnumOption<>("heuristics", Heuristics.class);
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
index 773defb9db..8fda826216 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
@@ -104,9 +104,6 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException
reader.setContentHandler(writer);
reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
- // define input
- final InputSource is = new InputSource(io.inputStream());
-
// set Validator.nu options
if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
reader.setCheckingNormalization(true);
@@ -135,11 +132,13 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException
reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
// end Validator.nu options
+ // define input
+ final InputSource is = new InputSource(io.inputStream());
String enc = io.encoding() != null
? io.encoding()
: hopts.contains(ENCODING)
? hopts.get(HtmlOptions.ENCODING)
- : null; // TODO: sniff encoding
+ : null;
if (enc != null) {
if (!Strings.supported(enc))
throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.');
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
index c7cf9d24af..41bbf4e1e1 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
@@ -23,9 +23,9 @@
* @author Christian Gruen
*/
public class HtmlParse extends StandardFunc {
- /** Class needed for heuristics=ICU. */
+ /** Class needed for option heuristics=ICU. */
private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector";
- /** Class needed for heuristics=CHARDET. */
+ /** Class needed for option heuristics=CHARDET. */
private static final String CHARDET_CLASS_NAME =
"org.mozilla.intl.chardet.nsICharsetDetectionObserver";
diff --git a/basex-core/src/main/resources/lang/Dutch.lang b/basex-core/src/main/resources/lang/Dutch.lang
index d507ea781f..b438ac5864 100644
--- a/basex-core/src/main/resources/lang/Dutch.lang
+++ b/basex-core/src/main/resources/lang/Dutch.lang
@@ -417,7 +417,7 @@ h_db_format = Het database formaat is gewijzigd; maak een nieuwe databa
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = Diakritische tekens worden gebruikt in de index.
h_fulltext_index = Een full-text index versnelt full-text queries.
-h_html_parser = De Validator.nu HTML parser zal gebruikt worden om HTML naar XML te converteren.
+h_html_parser = De Validator.nu parser zal gebruikt worden om HTML naar XML te converteren.
h_index_format = Het index formaat is gewijzigd; maak nieuwe indexen.
h_int_parser = Robuuster en sneller dan Java's standaard parser.
h_languauge = Met deze optie zullen taalspecifieke parsers worden gebruikt.
diff --git a/basex-core/src/main/resources/lang/English.lang b/basex-core/src/main/resources/lang/English.lang
index 5cc46cc7ed..9cc5ca7d20 100644
--- a/basex-core/src/main/resources/lang/English.lang
+++ b/basex-core/src/main/resources/lang/English.lang
@@ -417,7 +417,7 @@ h_db_format = The database format has changed; please use a newer versi
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = Diacritics are retained in the index.
h_fulltext_index = A full-text index speeds up full-text queries.
-h_html_parser = The Validator.nu HTML parser will be used to convert HTML to XML.
+h_html_parser = The Validator.nu parser will be used to convert HTML to XML.
h_index_format = The index format has changed; please create new indexes.
h_int_parser = Fault tolerant, and faster than Java’s default parser.
h_languauge = Language specific tokenizers will be used.
diff --git a/basex-core/src/main/resources/lang/French.lang b/basex-core/src/main/resources/lang/French.lang
index 2f10e53bcf..7fd856822b 100644
--- a/basex-core/src/main/resources/lang/French.lang
+++ b/basex-core/src/main/resources/lang/French.lang
@@ -417,7 +417,7 @@ h_db_format = Le format de base de données a changé ; Veuillez créer
h_db_options_% = Les options seront assignées si on exécute '%'.
h_diacritics = Les signes diacritiques sont conservés dans l’index.
h_fulltext_index = Un index plein texte accélère les requêtes plein texte.
-h_html_parser = Le parser HTML Validator.nu sera utilisé pour convertir le HTML en XML.
+h_html_parser = Le parser Validator.nu sera utilisé pour convertir le HTML en XML.
h_index_format = Le format des index a changé ; Veuillez créer de nouveaux index.
h_int_parser = Tolérant aux fautes, et plus rapide que le parser Java par défaut.
h_languauge = Des analyseurs spécifiques à la langue vont être utilisés.
diff --git a/basex-core/src/main/resources/lang/German.lang b/basex-core/src/main/resources/lang/German.lang
index a4056c130d..1226bac232 100644
--- a/basex-core/src/main/resources/lang/German.lang
+++ b/basex-core/src/main/resources/lang/German.lang
@@ -417,7 +417,7 @@ h_db_format = Das Datenbankformat hat sich geändert; bitte verwenden S
h_db_options_% = Die Optionen werden zugewiesen, wenn '%' ausgeführt wird.
h_diacritics = Diakritische Zeichen werden im Index beibehalten.
h_fulltext_index = Ein Volltext-Index beschleunigt Volltext-Anfragen.
-h_html_parser = Der Validator.nu HTML-Parser wird verwendet, um HTML in XML zu konvertieren.
+h_html_parser = Der Validator.nu-Parser wird verwendet, um HTML in XML zu konvertieren.
h_index_format = Das Indexformat hat sich geändert; bitte erstellen Sie neue Indizes.
h_int_parser = Fehlertolerant und schneller als Javas XML-Parser.
h_languauge = Sprachspezifische Tokenisierung wird verwendet.
diff --git a/basex-core/src/main/resources/lang/Mongolian.lang b/basex-core/src/main/resources/lang/Mongolian.lang
index 995f0d178f..bbc2140b6e 100644
--- a/basex-core/src/main/resources/lang/Mongolian.lang
+++ b/basex-core/src/main/resources/lang/Mongolian.lang
@@ -417,7 +417,7 @@ h_db_format = Өгөгдлийн сангийн формат өөрчл
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = Индекс дэх санах тэмдгийг авч үлдэх.
h_fulltext_index = Бүтэн текст индекс нь бүрэн текст квериг хурдан ажиллагаатай болгоно.
-h_html_parser = The Validator.nu HTML parser will be used to convert HTML to XML.
+h_html_parser = The Validator.nu parser will be used to convert HTML to XML.
h_index_format = Индекс формат өөрчлөгдсөн байна; шинээр үүсгэнэ үү.
h_int_parser = Fault tolerant, and faster than Java’s default parser.
h_languauge = Хэлний текст Parser тодорхойлогдох болно.
From ff514e1d2f5ead11d913f23853ad9f5e9e74b4ad Mon Sep 17 00:00:00 2001
From: Gunther Rademacher
Date: Mon, 13 Nov 2023 13:23:53 +0100
Subject: [PATCH 7/9] very minor change
---
basex-core/src/main/java/org/basex/build/html/HtmlParser.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
index 8fda826216..5877ce1498 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
@@ -33,7 +33,7 @@ public final class HtmlParser extends XMLParser {
private static final String NAME = "Validator.nu";
/**
- * Checks if a Validator.nu is available.
+ * Checks if Validator.nu is available.
* @return result of check
*/
public static boolean available() {
From 44f5d131e204e0a63ad1c86a584366745dea1306 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher
Date: Wed, 22 Jan 2025 18:35:01 +0100
Subject: [PATCH 8/9] support both TagSoup and Validator.NU
---
basex-core/pom.xml | 6 +
.../org/basex/build/html/HtmlOptions.java | 41 +-
.../java/org/basex/build/html/HtmlParser.java | 362 ++++++++++++++----
.../src/main/java/org/basex/core/Text.java | 2 +-
.../basex/gui/dialog/DialogHtmlParser.java | 10 +-
.../basex/query/func/html/FnParseHtml.java | 6 +-
.../org/basex/query/func/html/HtmlDoc.java | 3 +-
.../org/basex/query/func/html/HtmlParse.java | 61 ++-
.../org/basex/query/func/html/HtmlParser.java | 6 +-
.../src/main/resources/lang/Chinese.lang | 2 +-
basex-core/src/main/resources/lang/Dutch.lang | 2 +-
.../src/main/resources/lang/English.lang | 2 +-
.../src/main/resources/lang/French.lang | 2 +-
.../src/main/resources/lang/German.lang | 2 +-
.../src/main/resources/lang/Hungarian.lang | 2 +-
.../src/main/resources/lang/Indonesian.lang | 2 +-
.../src/main/resources/lang/Italian.lang | 2 +-
.../src/main/resources/lang/Japanese.lang | 2 +-
.../src/main/resources/lang/Mongolian.lang | 2 +-
.../src/main/resources/lang/Romanian.lang | 2 +-
.../src/main/resources/lang/Russian.lang | 2 +-
.../src/main/resources/lang/Spanish.lang | 2 +-
.../org/basex/query/func/HtmlModuleTest.java | 8 +-
basex-examples/basex-examples.iml | 2 +-
basex-examples/pom.xml | 4 +-
.../basex/examples/create/HTMLExample.java | 2 +-
pom.xml | 7 +
27 files changed, 402 insertions(+), 144 deletions(-)
diff --git a/basex-core/pom.xml b/basex-core/pom.xml
index ee62cdd08e..79b46a0dc5 100644
--- a/basex-core/pom.xml
+++ b/basex-core/pom.xml
@@ -29,6 +29,12 @@
lucene-stemmers
true
+
+ org.ccil.cowan.tagsoup
+ tagsoup
+ compile
+ true
+
nu.validator
htmlparser
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
index e1271747d4..7eebede2ce 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
@@ -9,10 +9,45 @@
* @author Christian Gruen
*/
public final class HtmlOptions extends Options {
- /** fn:parse-html option: encoding. */
+ /** TagSoup option: html. */
+ public static final BooleanOption HTML = new BooleanOption("html", false);
+ /** TagSoup option: omit-xml-declaration. */
+ public static final BooleanOption OMIT_XML_DECLARATION =
+ new BooleanOption("omit-xml-declaration", false);
+ /** TagSoup option: nons. */
+ public static final BooleanOption NONS = new BooleanOption("nons", true);
+ /** TagSoup option: nobogons. */
+ public static final BooleanOption NOBOGONS = new BooleanOption("nobogons", false);
+ /** TagSoup option: nodefaults. */
+ public static final BooleanOption NODEFAULTS = new BooleanOption("nodefaults", false);
+ /** TagSoup option: nocolons. */
+ public static final BooleanOption NOCOLONS = new BooleanOption("nocolons", false);
+ /** TagSoup option: norestart. */
+ public static final BooleanOption NORESTART = new BooleanOption("norestart", false);
+ /** TagSoup option: ignorable. */
+ public static final BooleanOption IGNORABLE = new BooleanOption("ignorable", false);
+ /** TagSoup option: emptybogons. */
+ public static final BooleanOption EMPTYBOGONS = new BooleanOption("emptybogons", false);
+ /** TagSoup option: any. */
+ public static final BooleanOption ANY = new BooleanOption("any", false);
+ /** TagSoup option: norootbogons. */
+ public static final BooleanOption NOROOTBOGONS = new BooleanOption("norootbogons", false);
+ /** TagSoup option: nocdata. */
+ public static final BooleanOption NOCDATA = new BooleanOption("nocdata", false);
+ /** TagSoup option: lexical. */
+ public static final BooleanOption LEXICAL = new BooleanOption("lexical", false);
+
+ /** TagSoup option: doctype-system=systemid. */
+ public static final StringOption DOCTYPE_SYSTEM = new StringOption("doctype-system");
+ /** TagSoup option: doctype-public=publicid. */
+ public static final StringOption DOCTYPE_PUBLIC = new StringOption("doctype-public");
+
+ /** Common option: encoding. */
public static final StringOption ENCODING = new StringOption("encoding");
- /** fn:parse-html option: method. */
- public static final StringOption METHOD = new StringOption("method");
+ /** Common option: method. */
+ public static final EnumOption METHOD = new EnumOption<>("method",
+ HtmlParser.Method.class);
+
/** fn:parse-html option: html-version. */
public static final StringOption HTML_VERSION = new StringOption("html-version");
/** fn:parse-html option: include-template-content. */
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
index 558cf1b064..3083a5519b 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
@@ -1,16 +1,18 @@
package org.basex.build.html;
import static org.basex.build.html.HtmlOptions.*;
+import static org.basex.build.html.HtmlOptions.NOCDATA;
import static org.basex.query.QueryError.*;
import static org.basex.util.Token.*;
import java.io.*;
-import java.util.*;
import org.basex.build.xml.*;
import org.basex.core.*;
import org.basex.io.*;
+import org.basex.query.*;
import org.basex.util.*;
+import org.ccil.cowan.tagsoup.*;
import org.xml.sax.*;
import nu.validator.htmlparser.common.Heuristics;
@@ -29,109 +31,57 @@
* @author Christian Gruen
*/
public final class HtmlParser extends XMLParser {
- /** Name of HTML Parser. */
- private static final String NAME = "Validator.nu";
-
- /**
- * Checks if Validator.nu is available.
- * @return result of check
- */
- public static boolean available() {
- return firstUnavailableClass() == null;
- }
-
/**
- * Check whether Validator.nu classes are available on the class path.
- * @return the name of the first class that is not available, or null if all classes are available
- */
- public static String firstUnavailableClass() {
- for(final String className : Arrays.asList("nu.validator.htmlparser.sax.HtmlParser",
- "nu.validator.htmlparser.sax.XmlSerializer",
- "nu.validator.htmlparser.common.XmlViolationPolicy",
- "nu.validator.htmlparser.common.Heuristics")) {
- if(!Reflect.available(className)) return className;
- }
- return null;
- }
-
- /**
- * Returns the name of the parser, or an empty string.
- * @return name of parser
+ * Constructor.
+ * @param source document source
+ * @param options main options
+ * @param hopts html options
+ * @throws IOException I/O exception
*/
- public static String parser() {
- return available() ? NAME : "";
+ public HtmlParser(final IO source, final MainOptions options, final HtmlOptions hopts)
+ throws IOException {
+ this(source, Parser.of(hopts), options, hopts);
}
/**
* Constructor.
* @param source document source
+ * @param parser parser to be used
* @param options main options
* @param hopts html options
* @throws IOException I/O exception
*/
- public HtmlParser(final IO source, final MainOptions options, final HtmlOptions hopts)
- throws IOException {
- super(toXml(source, hopts), options);
+ public HtmlParser(final IO source, final Parser parser, final MainOptions options,
+ final HtmlOptions hopts) throws IOException {
+ super(toXml(source, parser, hopts), options);
}
/**
* Converts an HTML document to XML.
* @param io io reference
+ * @param parser parser to be used
* @param hopts html options
* @return parser
* @throws IOException I/O exception
*/
- private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException {
+ private static IO toXml(final IO io, final Parser parser, final HtmlOptions hopts)
+ throws IOException {
// reader could not be initialized; fall back to XML
- if(!available()) return io;
-
+ if(!parser.available(hopts)) return io;
try {
// define output
final StringWriter sw = new StringWriter();
- final nu.validator.htmlparser.sax.HtmlParser reader =
- new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
- final ContentHandler writer = new XmlSerializer(sw);
- reader.setContentHandler(writer);
- reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
-
- // set Validator.nu options
- if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
- reader.setCheckingNormalization(true);
- if(hopts.get(MAPPING_LANG_TO_XML_LANG))
- reader.setMappingLangToXmlLang(true);
- if(hopts.get(SCRIPTING_ENABLED))
- reader.setScriptingEnabled(true);
- if(hopts.contains(CONTENT_SPACE_POLICY))
- reader.setContentSpacePolicy(
- XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name()));
- if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY))
- reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf(
- hopts.get(CONTENT_NON_XML_CHAR_POLICY).name()));
- if(hopts.contains(COMMENT_POLICY))
- reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name()));
- if(hopts.contains(XMLNS_POLICY))
- reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name()));
- if(hopts.contains(NAME_POLICY))
- reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name()));
- if(hopts.contains(STREAMABILITY_VIOLATION_POLICY))
- reader.setStreamabilityViolationPolicy(
- XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name()));
- if(hopts.contains(XML_POLICY))
- reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name()));
- if(hopts.contains(HEURISTICS))
- reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
- // end Validator.nu options
+ final XMLReader reader = parser.reader(hopts, sw);
// define input
final InputSource is = new InputSource(io.inputStream());
- String enc = io.encoding() != null
+ final String enc = io.encoding() != null
? io.encoding()
: hopts.contains(ENCODING)
? hopts.get(HtmlOptions.ENCODING)
: null;
if (enc != null) {
- if (!Strings.supported(enc))
- throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.');
+ if(!Strings.supported(enc)) throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.');
is.setEncoding(Strings.normEncoding(enc));
}
@@ -143,4 +93,272 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException
throw INVHTML_X.getIO(ex.getLocalizedMessage());
}
}
+
+ /** Method option values. */
+ public enum Method {
+ /** TagSoup parser with method 'xml'. */
+ xml(Parser.TAGSOUP),
+ /** TagSoup parser with method 'html'. */
+ html(Parser.TAGSOUP),
+ /** Validator.nu parser. */
+ nu(Parser.NU);
+
+ /** Parser associated with this method. */
+ public final Parser parser;
+
+ /**
+ * Constructor.
+ * @param parser parser associated with this method
+ */
+ Method(final Parser parser) {
+ this.parser = parser;
+ }
+ }
+
+ /** Parser type. */
+ public enum Parser {
+ /** TagSoup parser. */
+ TAGSOUP("TagSoup", "org.ccil.cowan.tagsoup.Parser") {
+
+ /** URI prefix of the TagSoup-specific SAX features. */
+ private static final String FEATURES = "http://www.ccil.org/~cowan/tagsoup/features/";
+
+ @Override
+ public boolean fallbackToXml() {
+ return true;
+ }
+
+ @Override
+ XMLReader reader(final HtmlOptions hopts, final StringWriter sw) throws SAXException {
+ XMLReader reader = new org.ccil.cowan.tagsoup.Parser();
+ final XMLWriter writer = new XMLWriter(sw);
+ writer.setOutputProperty(ENCODING.name(), Strings.UTF8);
+ reader.setContentHandler(writer);
+
+ // set TagSoup options
+ if(hopts.get(HTML)) {
+ reader.setFeature("http://xml.org/sax/features/namespaces", false);
+ writer.setOutputProperty(METHOD.name(), "html");
+ writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes");
+ }
+ if(hopts.get(NONS))
+ reader.setFeature("http://xml.org/sax/features/namespaces", false);
+ if(hopts.get(NOBOGONS))
+ reader.setFeature(FEATURES + "ignore-bogons", true);
+ if(hopts.get(NODEFAULTS))
+ reader.setFeature(FEATURES + "default-attributes", false);
+ if(hopts.get(NOCOLONS))
+ reader.setFeature(FEATURES + "translate-colons", true);
+ if(hopts.get(NORESTART))
+ reader.setFeature(FEATURES + "restart-elements", false);
+ if(hopts.get(IGNORABLE))
+ reader.setFeature(FEATURES + "ignorable-whitespace", true);
+ if(hopts.get(EMPTYBOGONS))
+ reader.setFeature(FEATURES + "bogons-empty", true);
+ if(hopts.get(ANY))
+ reader.setFeature(FEATURES + "bogons-empty", false);
+ if(hopts.get(NOROOTBOGONS))
+ reader.setFeature(FEATURES + "root-bogons", false);
+ if(hopts.get(NOCDATA))
+ reader.setFeature(FEATURES + "cdata-elements", false);
+ if(hopts.get(LEXICAL))
+ reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
+ if(hopts.get(OMIT_XML_DECLARATION))
+ writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes");
+ if(hopts.contains(METHOD))
+ writer.setOutputProperty(METHOD.name(), hopts.get(METHOD).name());
+ if(hopts.contains(DOCTYPE_SYSTEM))
+ writer.setOutputProperty(DOCTYPE_SYSTEM.name(), hopts.get(DOCTYPE_SYSTEM));
+ if(hopts.contains(DOCTYPE_PUBLIC))
+ writer.setOutputProperty(DOCTYPE_PUBLIC.name(), hopts.get(DOCTYPE_PUBLIC));
+ return reader;
+ }
+ },
+
+ /** Validator.nu parser. */
+ NU("Validator.nu", "nu.validator.htmlparser.sax.HtmlParser",
+ "nu.validator.htmlparser.sax.XmlSerializer",
+ "nu.validator.htmlparser.common.XmlViolationPolicy",
+ "nu.validator.htmlparser.common.Heuristics") {
+
+ /** Class needed for option heuristics=ICU. */
+ private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector";
+ /** Class needed for option heuristics=CHARDET. */
+ private static final String CHARDET_CLASS_NAME =
+ "org.mozilla.intl.chardet.nsICharsetDetectionObserver";
+
+ @Override
+ public boolean fallbackToXml() {
+ return false;
+ }
+
+ @Override
+ XMLReader reader(final HtmlOptions hopts, final StringWriter sw) throws SAXException {
+ final nu.validator.htmlparser.sax.HtmlParser reader =
+ new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET);
+ final ContentHandler writer = new XmlSerializer(sw);
+ reader.setContentHandler(writer);
+ reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
+
+ if(hopts.get(UNICODE_NORMALIZATION_CHECKING))
+ reader.setCheckingNormalization(true);
+ if(hopts.get(MAPPING_LANG_TO_XML_LANG))
+ reader.setMappingLangToXmlLang(true);
+ if(hopts.get(SCRIPTING_ENABLED))
+ reader.setScriptingEnabled(true);
+ if(hopts.contains(CONTENT_SPACE_POLICY))
+ reader.setContentSpacePolicy(
+ XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name()));
+ if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY))
+ reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf(
+ hopts.get(CONTENT_NON_XML_CHAR_POLICY).name()));
+ if(hopts.contains(COMMENT_POLICY))
+ reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name()));
+ if(hopts.contains(XMLNS_POLICY))
+ reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name()));
+ if(hopts.contains(NAME_POLICY))
+ reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name()));
+ if(hopts.contains(STREAMABILITY_VIOLATION_POLICY))
+ reader.setStreamabilityViolationPolicy(
+ XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name()));
+ if(hopts.contains(XML_POLICY))
+ reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name()));
+ if(hopts.contains(HEURISTICS))
+ reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name()));
+ return reader;
+ }
+
+ @Override
+ public void ensureAvailable(final HtmlOptions options, final byte[] func,
+ final InputInfo info) throws QueryException {
+ super.ensureAvailable(options, func, info);
+ if(options.contains(HEURISTICS)) {
+ switch(options.get(HEURISTICS)) {
+ case ALL:
+ ensureAvailable(ICU_CLASS_NAME, func, info);
+ ensureAvailable(CHARDET_CLASS_NAME, func, info);
+ break;
+ case ICU:
+ ensureAvailable(ICU_CLASS_NAME, func, info);
+ break;
+ case CHARDET:
+ ensureAvailable(CHARDET_CLASS_NAME, func, info);
+ break;
+ default:
+ }
+ }
+ }
+
+ @Override
+ public boolean available(final HtmlOptions options) {
+ if(!super.available(options)) return false;
+ if(!options.contains(HEURISTICS)) return true;
+ switch(options.get(HEURISTICS)) {
+ case ALL:
+ if(!Reflect.available(ICU_CLASS_NAME)) return false;
+ if(!Reflect.available(CHARDET_CLASS_NAME)) return false;
+ break;
+ case ICU:
+ if(!Reflect.available(ICU_CLASS_NAME)) return false;
+ break;
+ case CHARDET:
+ if(!Reflect.available(CHARDET_CLASS_NAME)) return false;
+ break;
+ default:
+ }
+ return true;
+ }
+ };
+
+ /** Default parser. */
+ public static final Parser DEFAULT = TAGSOUP;
+
+ /** String representation. */
+ private final String string;
+ /** Required classes. */
+ private final String[] classes;
+
+ /**
+ * Whether to fall back to XML if this parser is not available.
+ * @return result of check
+ */
+ public abstract boolean fallbackToXml();
+
+ /**
+ * Return a reader instance for this parser.
+ * @param options HTML options
+ * @param writer string writer
+ * @return reader
+ * @throws SAXException SAX exception
+ */
+ abstract XMLReader reader(HtmlOptions options, StringWriter writer) throws SAXException;
+
+ /**
+ * Constructor.
+ * @param string string representation
+ * @param classes required classes
+ */
+ Parser(final String string, final String... classes) {
+ this.string = string;
+ this.classes = classes;
+ }
+
+ /**
+ * Checks if this parser is available.
+ * @param options HTML options
+ * @return result of check
+ */
+ public boolean available(@SuppressWarnings("unused") final HtmlOptions options) {
+ for(final String cl : classes) if(!Reflect.available(cl)) return false;
+ return true;
+ }
+
+ /**
+ * Throws an exception if any of the classes required for this parser are unavailable.
+ * @param options HTML options
+ * @param func name of function that is asking for this parser
+ * @param info input info (can be {@code null})
+ * @throws QueryException query exception
+ */
+ public void ensureAvailable(@SuppressWarnings("unused") final HtmlOptions options,
+ final byte[] func, final InputInfo info) throws QueryException {
+ for(final String cl : classes) ensureAvailable(cl, func, info);
+ }
+
+ /**
+ * Throws an exception if a class required for this parser is unavailable.
+ * @param className the class name
+ * @param func name of function that is asking for this parser
+ * @param info input info (can be {@code null})
+ * @throws QueryException query exception
+ */
+ private static void ensureAvailable(final String className, final byte[] func,
+ final InputInfo info) throws QueryException {
+ if(!Reflect.available(className)) throw BASEX_CLASSPATH_X_X.get(info, func, className);
+ }
+
+ /**
+ * Returns the parser selected by the METHOD option, or {@link #DEFAULT} if it is not set.
+ * @param options HTML options
+ * @return parser
+ */
+ public static Parser of(final HtmlOptions options) {
+ return of(options, Parser.DEFAULT);
+ }
+
+ /**
+ * Returns the parser selected by the METHOD option, or the specified default if it is not set.
+ * @param options HTML options
+ * @param defaultParser default parser
+ * @return parser
+ */
+ public static Parser of(final HtmlOptions options, final Parser defaultParser) {
+ return options.contains(METHOD) ? options.get(METHOD).parser : defaultParser;
+ }
+
+ @Override
+ public String toString() {
+ return string;
+ }
+ }
}
diff --git a/basex-core/src/main/java/org/basex/core/Text.java b/basex-core/src/main/java/org/basex/core/Text.java
index d7f8121800..dd8d080b65 100644
--- a/basex-core/src/main/java/org/basex/core/Text.java
+++ b/basex-core/src/main/java/org/basex/core/Text.java
@@ -1426,7 +1426,7 @@ public interface Text {
String H_VERSION_NEW_X_X = lang("h_version_new_%_%");
/** HTML Parser. */
- String H_HTML_PARSER = lang("h_html_parser");
+ String H_HTML_PARSER_X = lang("h_html_parser_%");
/** No HTML Parser. */
String H_NO_HTML_PARSER = lang("h_no_html_parser");
diff --git a/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java b/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java
index f5b0d625e1..ce48bd3dc4 100644
--- a/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java
+++ b/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java
@@ -6,10 +6,12 @@
import java.io.*;
import org.basex.build.html.*;
+import org.basex.build.html.HtmlParser.*;
import org.basex.core.*;
import org.basex.gui.*;
-import org.basex.gui.GUIConstants.Msg;
+import org.basex.gui.GUIConstants.*;
import org.basex.gui.layout.*;
+import org.basex.util.*;
import org.basex.util.options.*;
/**
@@ -33,10 +35,10 @@ final class DialogHtmlParser extends DialogParser {
*/
DialogHtmlParser(final BaseXDialog dialog, final MainOptions opts) {
hopts = new HtmlOptions(opts.get(MainOptions.HTMLPARSER));
-
- final boolean avl = HtmlParser.available();
+ final Parser parser = Parser.of(hopts);
+ final boolean avl = parser.available(hopts);
final BaseXBack pp = new BaseXBack(new RowLayout(8));
- pp.add(new BaseXLabel(avl ? H_HTML_PARSER : H_NO_HTML_PARSER));
+ pp.add(new BaseXLabel(avl ? Util.info(H_HTML_PARSER_X, parser) : H_NO_HTML_PARSER));
options = new BaseXTextField(dialog, hopts.toString());
options.setToolTipText(tooltip(hopts));
diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
index b6bbce098a..54540389a5 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java
@@ -1,7 +1,5 @@
package org.basex.query.func.html;
-import static org.basex.query.QueryError.*;
-
import org.basex.build.html.HtmlParser;
import org.basex.query.*;
import org.basex.query.value.item.*;
@@ -17,8 +15,6 @@ public class FnParseHtml extends HtmlParse {
@Override
public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
- String className = HtmlParser.firstUnavailableClass();
- if (className != null) throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className);
- return super.item(qc, ii);
+ return parse(htmlInput(qc), HtmlParser.Parser.NU, qc);
}
}
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java
index 1e21bff317..a8a08c664c 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java
@@ -1,5 +1,6 @@
package org.basex.query.func.html;
+import org.basex.build.html.HtmlParser.*;
import org.basex.query.*;
import org.basex.query.value.item.*;
import org.basex.query.value.seq.*;
@@ -15,6 +16,6 @@ public final class HtmlDoc extends HtmlParse {
@Override
public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
final String source = toStringOrNull(arg(0), qc);
- return source != null ? parse(toIO(source), qc) : Empty.VALUE;
+ return source != null ? parse(toIO(source), Parser.DEFAULT, qc) : Empty.VALUE;
}
}
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
index 181692e2e7..c4986d2c6b 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java
@@ -1,11 +1,11 @@
package org.basex.query.func.html;
-import static org.basex.build.html.HtmlOptions.*;
import static org.basex.query.QueryError.*;
import java.io.*;
import org.basex.build.html.*;
+import org.basex.build.html.HtmlParser.*;
import org.basex.core.*;
import org.basex.io.*;
import org.basex.query.*;
@@ -23,19 +23,23 @@
* @author Christian Gruen
*/
public class HtmlParse extends StandardFunc {
- /** Class needed for option heuristics=ICU. */
- private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector";
- /** Class needed for option heuristics=CHARDET. */
- private static final String CHARDET_CLASS_NAME =
- "org.mozilla.intl.chardet.nsICharsetDetectionObserver";
-
@Override
public Item item(final QueryContext qc, final InputInfo ii) throws QueryException {
+ return parse(htmlInput(qc), Parser.DEFAULT, qc);
+ }
+
+ /**
+ * Converts the HTML input in the first argument to an IOContent instance from a binary or string
+ * item.
+ * @param qc query context
+ * @return input as an IOContent instance ({@code null}, if empty)
+ * @throws QueryException query exception
+ */
+ protected IOContent htmlInput(final QueryContext qc) throws QueryException {
final Item value = arg(0).atomItem(qc, info);
- if (value.isEmpty()) return Empty.VALUE;
- final IO io = value instanceof Bin ? new IOContent(toBytes(value))
- : new IOContent(toBytes(value), "", Strings.UTF8);
- return parse(io, qc);
+ if(value.isEmpty()) return null;
+ return value instanceof Bin ? new IOContent(toBytes(value))
+ : new IOContent(toBytes(value), "", Strings.UTF8);
}
@Override
@@ -46,41 +50,22 @@ protected final Expr opt(final CompileContext cc) {
/**
* Parses the input and creates an XML document.
* @param io input data
+ * @param defaultParser default HTML parser to be used in absence of the METHOD option
* @param qc query context
* @return node
* @throws QueryException query exception
*/
- protected final Item parse(final IO io, final QueryContext qc) throws QueryException {
+ protected final Item parse(final IO io, final Parser defaultParser, final QueryContext qc)
+ throws QueryException {
+ if(io == null) return Empty.VALUE;
final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), qc);
- if(options.contains(HEURISTICS)) {
- switch (options.get(HEURISTICS)) {
- case ALL:
- ensureAvailable(ICU_CLASS_NAME);
- ensureAvailable(CHARDET_CLASS_NAME);
- break;
- case ICU:
- ensureAvailable(ICU_CLASS_NAME);
- break;
- case CHARDET:
- ensureAvailable(CHARDET_CLASS_NAME);
- break;
- default:
- }
- }
+ final Parser parser = Parser.of(options, defaultParser);
+ if(!parser.fallbackToXml()) parser.ensureAvailable(options, definition.local(), info);
try {
- return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), options));
+ return new DBNode(
+ new org.basex.build.html.HtmlParser(io, parser, new MainOptions(), options));
} catch(final IOException ex) {
throw INVHTML_X.get(info, ex);
}
}
-
- /**
- * Ensure that a required class is available on the class path.
- * @param className the class name
- * @throws QueryException query exception,
- */
- private void ensureAvailable(final String className) throws QueryException {
- if(!Reflect.available(className))
- throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className);
- }
}
diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java
index 0d5ce3ec59..da95eafa78 100644
--- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java
@@ -1,5 +1,7 @@
package org.basex.query.func.html;
+import org.basex.build.html.*;
+import org.basex.build.html.HtmlParser.*;
import org.basex.query.*;
import org.basex.query.func.*;
import org.basex.query.value.item.*;
@@ -14,6 +16,8 @@
public final class HtmlParser extends StandardFunc {
@Override
public Item item(final QueryContext qc, final InputInfo ii) {
- return Str.get(org.basex.build.html.HtmlParser.parser());
+ final HtmlOptions options = new HtmlOptions();
+ final Parser parser = Parser.of(options);
+ return Str.get(parser.available(options) ? parser.toString() : "");
}
}
diff --git a/basex-core/src/main/resources/lang/Chinese.lang b/basex-core/src/main/resources/lang/Chinese.lang
index 11f1839948..1a7b4bfb29 100644
--- a/basex-core/src/main/resources/lang/Chinese.lang
+++ b/basex-core/src/main/resources/lang/Chinese.lang
@@ -413,7 +413,7 @@ h_db_format = 数据库结构已经变了,请使用新版软件
h_db_options_% = 如果执行'%',这个选项将会被设置
h_diacritics = 索引保留了发音符号
h_fulltext_index = 全文索引可以加速全文检索
-h_html_parser = 将使用Validator.nu将HTML转为XML
+h_html_parser_% = 将使用%将HTML转为XML
h_index_format = 索引格式变了,请建新索引
h_int_parser = 容错,而且比Java的默认解析器更快
h_languauge = 将使用根据语言确定的tokenizer
diff --git a/basex-core/src/main/resources/lang/Dutch.lang b/basex-core/src/main/resources/lang/Dutch.lang
index 774465b9b5..d3198c072f 100644
--- a/basex-core/src/main/resources/lang/Dutch.lang
+++ b/basex-core/src/main/resources/lang/Dutch.lang
@@ -413,7 +413,7 @@ h_db_format = Het database formaat is gewijzigd; maak een nieuwe databa
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = Diakritische tekens worden gebruikt in de index.
h_fulltext_index = Een full-text index versnelt full-text queries.
-h_html_parser = De Validator.nu parser zal gebruikt worden om HTML naar XML te converteren.
+h_html_parser_% = De % parser zal gebruikt worden om HTML naar XML te converteren.
h_index_format = Het index formaat is gewijzigd; maak nieuwe indexen.
h_int_parser = Robuuster en sneller dan Java's standaard parser.
h_languauge = Met deze optie zullen taalspecifieke parsers worden gebruikt.
diff --git a/basex-core/src/main/resources/lang/English.lang b/basex-core/src/main/resources/lang/English.lang
index d7274e38a9..4886826085 100644
--- a/basex-core/src/main/resources/lang/English.lang
+++ b/basex-core/src/main/resources/lang/English.lang
@@ -413,7 +413,7 @@ h_db_format = The database format has changed; please use a newer versi
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = Diacritics are retained in the index.
h_fulltext_index = A full-text index speeds up full-text queries.
-h_html_parser = The Validator.nu parser will be used to convert HTML to XML.
+h_html_parser_% = The % parser will be used to convert HTML to XML.
h_index_format = The index format has changed; please create new indexes.
h_int_parser = Fault tolerant, and faster than Java’s default parser.
h_languauge = Language specific tokenizers will be used.
diff --git a/basex-core/src/main/resources/lang/French.lang b/basex-core/src/main/resources/lang/French.lang
index 9a02e46376..b1b6c7f100 100644
--- a/basex-core/src/main/resources/lang/French.lang
+++ b/basex-core/src/main/resources/lang/French.lang
@@ -413,7 +413,7 @@ h_db_format = Le format de base de données a changé ; Veuillez créer
h_db_options_% = Les options seront assignées si on exécute '%'.
h_diacritics = Les signes diacritiques sont conservés dans l’index.
h_fulltext_index = Un index plein texte accélère les requêtes plein texte.
-h_html_parser = Le parser Validator.nu sera utilisé pour convertir le HTML en XML.
+h_html_parser_% = Le parser % sera utilisé pour convertir le HTML en XML.
h_index_format = Le format des index a changé ; Veuillez créer de nouveaux index.
h_int_parser = Tolérant aux fautes, et plus rapide que le parser Java par défaut.
h_languauge = Des analyseurs spécifiques à la langue vont être utilisés.
diff --git a/basex-core/src/main/resources/lang/German.lang b/basex-core/src/main/resources/lang/German.lang
index cedba9ccd8..07d34b07b6 100644
--- a/basex-core/src/main/resources/lang/German.lang
+++ b/basex-core/src/main/resources/lang/German.lang
@@ -413,7 +413,7 @@ h_db_format = Das Datenbankformat hat sich geändert; bitte verwenden S
h_db_options_% = Die Optionen werden zugewiesen, wenn '%' ausgeführt wird.
h_diacritics = Diakritische Zeichen werden im Index beibehalten.
h_fulltext_index = Ein Volltext-Index beschleunigt Volltext-Anfragen.
-h_html_parser = Der Validator.nu-Parser wird verwendet, um HTML in XML zu konvertieren.
+h_html_parser_% = Der %-Parser wird verwendet, um HTML in XML zu konvertieren.
h_index_format = Das Indexformat hat sich geändert; bitte erstellen Sie neue Indizes.
h_int_parser = Fehlertolerant und schneller als Javas XML-Parser.
h_languauge = Sprachspezifische Tokenisierung wird verwendet.
diff --git a/basex-core/src/main/resources/lang/Hungarian.lang b/basex-core/src/main/resources/lang/Hungarian.lang
index 20c823ea28..569fce6e0a 100644
--- a/basex-core/src/main/resources/lang/Hungarian.lang
+++ b/basex-core/src/main/resources/lang/Hungarian.lang
@@ -413,7 +413,7 @@ h_db_format = Az adatbázis formátuma megváltozott; kérem, használj
h_db_options_% = Ezek az beállítások csak a következő futtása után lépnek életbe: '%'
h_diacritics = Ékezetek megmaradnak az indexelésben.
h_fulltext_index = A teljes-szöveg index gyorsítja a teljes-szöveges (full-text) lekérdezéseket.
-h_html_parser = A Validator.nu elemző HTML formátumot konvertál XML formátumra.
+h_html_parser_% = A % elemző HTML formátumot konvertál XML formátumra.
h_index_format = Az index formátuma megváltozott; kérem, készítsen új indexeket.
h_int_parser = Hibatűrő, továbbá a Java alapértelmezett elemzőjénél gyorsabb.
h_languauge = Nyelvfüggő szövegelemzések is használatra kerülnek.
diff --git a/basex-core/src/main/resources/lang/Indonesian.lang b/basex-core/src/main/resources/lang/Indonesian.lang
index 74356e5ed8..c7d0d9c56a 100644
--- a/basex-core/src/main/resources/lang/Indonesian.lang
+++ b/basex-core/src/main/resources/lang/Indonesian.lang
@@ -413,7 +413,7 @@ h_db_format = Bentuk basisdata telah berubah; mohon gunakan versi yang
h_db_options_% = Pilihan akan digunakan jika '%' dijalankan.
h_diacritics = Diakritik dipertahankan dalam indeks.
h_fulltext_index = Indeks semua teks mempercepat kueri teks penuh.
-h_html_parser = Pengurai Validator.nu akan digunakan untuk mengubah HTML menjadi XML.
+h_html_parser_% = Pengurai % akan digunakan untuk mengubah HTML menjadi XML.
h_index_format = Bentuk indeks telah berubah; mohon buat indeks baru.
h_int_parser = Toleran kesalahan, dan lebih cepat dari pengurai standar Java.
h_languauge = Pengurai teks bahasa tertentu akan digunakan.
diff --git a/basex-core/src/main/resources/lang/Italian.lang b/basex-core/src/main/resources/lang/Italian.lang
index 168907f804..b62c0c180e 100644
--- a/basex-core/src/main/resources/lang/Italian.lang
+++ b/basex-core/src/main/resources/lang/Italian.lang
@@ -413,7 +413,7 @@ h_db_format = Il formato della base di dati è cambiato; creare una nuo
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = I segni diacritici sono conservati nell'indice.
h_fulltext_index = Un indice "full-text" velocizza le interrogazioni sul testo.
-h_html_parser = Il parser Validator.nu verrò usato per convertire HTML in XML.
+h_html_parser_% = Il parser % verrà usato per convertire HTML in XML.
h_index_format = Il formato degli indici è cambiato; creare nuovi indici.
h_int_parser = Tollerante ai guasti e più veloce del parser di default di Java.
h_languauge = Parser di testo specifici per la lingua verranno usati
diff --git a/basex-core/src/main/resources/lang/Japanese.lang b/basex-core/src/main/resources/lang/Japanese.lang
index d0773f8a4a..84a08cd66f 100644
--- a/basex-core/src/main/resources/lang/Japanese.lang
+++ b/basex-core/src/main/resources/lang/Japanese.lang
@@ -413,7 +413,7 @@ h_db_format = データベース形式を変更しました。新しい
h_db_options_% = % 実行時にオプションが割り当てられます。
h_diacritics = インデックス内で付加記号(ウムラウト等)は保持されます。
h_fulltext_index = 全文テキストインデックスは全文検索を高速化します。
-h_html_parser = Validator.nu パーサは HTML を XML に変換します。
+h_html_parser_% = % パーサは HTML を XML に変換します。
h_index_format = インデックス形式を変更しました。新しくインデックスを作成して下さい。
h_int_parser = フォールトトレラント、Javaのデフォルトパーサより高速。
h_languauge = 指定された言語のテキストパーサが使用されます。
diff --git a/basex-core/src/main/resources/lang/Mongolian.lang b/basex-core/src/main/resources/lang/Mongolian.lang
index 6c9fce85f5..9079d8891c 100644
--- a/basex-core/src/main/resources/lang/Mongolian.lang
+++ b/basex-core/src/main/resources/lang/Mongolian.lang
@@ -413,7 +413,7 @@ h_db_format = Өгөгдлийн сангийн формат өөрчл
h_db_options_% = The options will be assigned if '%' is executed.
h_diacritics = Индекс дэх санах тэмдгийг авч үлдэх.
h_fulltext_index = Бүтэн текст индекс нь бүрэн текст квериг хурдан ажиллагаатай болгоно.
-h_html_parser = The Validator.nu parser will be used to convert HTML to XML.
+h_html_parser_% = The % parser will be used to convert HTML to XML.
h_index_format = Индекс формат өөрчлөгдсөн байна; шинээр үүсгэнэ үү.
h_int_parser = Fault tolerant, and faster than Java’s default parser.
h_languauge = Хэлний текст Parser тодорхойлогдох болно.
diff --git a/basex-core/src/main/resources/lang/Romanian.lang b/basex-core/src/main/resources/lang/Romanian.lang
index dac622b9d9..aaceae5e82 100644
--- a/basex-core/src/main/resources/lang/Romanian.lang
+++ b/basex-core/src/main/resources/lang/Romanian.lang
@@ -413,7 +413,7 @@ h_db_format = Formatul bazei de date a fost schimbat, vă rugăm să fo
h_db_options_% = Optiunile vor fi asignate daca '%' este executată.
h_diacritics = Diacritice sunt păstrate în index.
h_fulltext_index = Un full-text index accelereaza interogările full-text.
-h_html_parser = Parserul "Validator.nu" va fi folosit pentru a converti HTML în XML.
+h_html_parser_% = Parserul "%" va fi folosit pentru a converti HTML în XML.
h_index_format = Formatul index s-a schimbat, vă rugăm creati noi indici.
h_int_parser = Tolerant la greseli si mai rapid decat parserul default Java.
h_languauge = Parsere de text specifice limbii vor fi folosite.
diff --git a/basex-core/src/main/resources/lang/Russian.lang b/basex-core/src/main/resources/lang/Russian.lang
index 1dda70b524..15a89ecce8 100644
--- a/basex-core/src/main/resources/lang/Russian.lang
+++ b/basex-core/src/main/resources/lang/Russian.lang
@@ -413,7 +413,7 @@ h_db_format = Формат хранения баз данных был
h_db_options_% = Эти опции будут изменены только после выполнения команды [%]
h_diacritics = Разделительные знаки будут включены в индекс
h_fulltext_index = Полнотекстовый индекс ускоряет соответствующие запросы
-h_html_parser = Для конвертации HTML в XML будет использован парсер Validator.nu
+h_html_parser_% = Для конвертации HTML в XML будет использован парсер %
h_index_format = Формат хранения индексов был изменен. Пожалуйста, создайте индексы заново.
h_int_parser = Толерантный к ошибкам и быстрее чем стандартный парсер Java
h_languauge = Будут использованы специализированные под каждый язык парсеры
diff --git a/basex-core/src/main/resources/lang/Spanish.lang b/basex-core/src/main/resources/lang/Spanish.lang
index 42e0ce4329..4d032eecd5 100644
--- a/basex-core/src/main/resources/lang/Spanish.lang
+++ b/basex-core/src/main/resources/lang/Spanish.lang
@@ -413,7 +413,7 @@ h_db_format = El formato de la Base de Datos ha cambiado; por favor uti
h_db_options_% = Las opciónes serán asignado si se ejecuta '%'.
h_diacritics = Las diacríticas están retenidas en el índice.
h_fulltext_index = Un índice de Texto Completo acelera las consulta de Texto Completo.
-h_html_parser = Se utilizará el Analizador Sintáctico Validator.nu para convertir HTML a XML.
+h_html_parser_% = Se utilizará el Analizador Sintáctico % para convertir HTML a XML.
h_index_format = El formato del índice ha cambiado; for favor, cree nuevos índices.
h_int_parser = Tolerante a fallos, y más rápido que el analizador sintáctico por defecto de Java.
h_languauge = Se utilizarán analizadores sintácticos de texto específicos del lenguaje.
diff --git a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
index 58b5b8b323..d5cf02e045 100644
--- a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
+++ b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java
@@ -20,7 +20,10 @@ public final class HtmlModuleTest extends SandboxTest {
query(func.args(" <_/>/text()"), "");
final String path = "src/test/resources/input.html";
- query(func.args(path) + "//*:body ! name()", "body");
+ query(func.args(path) + "//body ! name()", "body");
+ query(func.args(path, " map { 'nons': false() }") + "//*:body ! name()", "body");
+ query(func.args(path, " {'method': 'nu'}") + "//Q{http://www.w3.org/1999/xhtml}body ! name()",
+ "body");
}
/** Test method. */
@@ -32,7 +35,8 @@ public final class HtmlModuleTest extends SandboxTest {
// check if the function returns an HTML root node
query("exists(" + func.args("<html/>") + "/*:html)", true);
// check if the function returns
- query(func.args("<html/>"),
+ query(func.args("<html/>", " map { 'nons': true() }"), "");
+ query(func.args("<html/>", " {'method': 'nu'}"),
"");
}
diff --git a/basex-examples/basex-examples.iml b/basex-examples/basex-examples.iml
index c8c6019d38..d68f7b229d 100644
--- a/basex-examples/basex-examples.iml
+++ b/basex-examples/basex-examples.iml
@@ -30,7 +30,7 @@
-
+
diff --git a/basex-examples/pom.xml b/basex-examples/pom.xml
index 03a834b03a..c66dc0ffb0 100644
--- a/basex-examples/pom.xml
+++ b/basex-examples/pom.xml
@@ -18,8 +18,8 @@
${project.version}
- nu.validator
- htmlparser
+ org.ccil.cowan.tagsoup
+ tagsoup
org.junit.jupiter
diff --git a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
index 757baa63e8..7f9dcc9701 100644
--- a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
+++ b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java
@@ -6,7 +6,7 @@
/**
* This example demonstrates how to import a file in the HTML format
* into the database. The specified input file will be converted to XML
- * if Validator.nu is found in the classpath.
+ * if the HTML parser is found in the classpath.
*
* @author BaseX Team, BSD License
* @author Christian Gruen
diff --git a/pom.xml b/pom.xml
index 8c315215bf..1bb0f41810 100644
--- a/pom.xml
+++ b/pom.xml
@@ -63,6 +63,13 @@
runtime
true
+
+ org.ccil.cowan.tagsoup
+ tagsoup
+ 1.2.1
+ runtime
+ true
+
nu.validator
htmlparser
From 12dab13c46665071e3ff709843d7665aa120b1b4 Mon Sep 17 00:00:00 2001
From: Gunther Rademacher
Date: Wed, 22 Jan 2025 19:06:06 +0100
Subject: [PATCH 9/9] minor changes
---
.../main/java/org/basex/build/html/HtmlOptions.java | 2 +-
.../main/java/org/basex/build/html/HtmlParser.java | 11 +++++++----
.../src/main/java/org/basex/core/MainOptions.java | 2 +-
.../src/main/java/org/basex/query/QueryError.java | 2 --
4 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
index 7eebede2ce..74e7a80bc3 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java
@@ -3,7 +3,7 @@
import org.basex.util.options.*;
/**
- * Options for parsing and serializing HTML documents with Validator.nu.
+ * Options for parsing and serializing HTML documents with TagSoup or Validator.nu.
*
* @author BaseX Team, BSD License
* @author Christian Gruen
diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
index 3083a5519b..62aff78895 100644
--- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
+++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java
@@ -20,9 +20,12 @@
import nu.validator.htmlparser.sax.*;
/**
- * This class uses the Validator.nu HTML parser to convert HTML input to well-formed XML.
- * If the Validator.nu HTML parser is not found in the classpath, the original document is
- * passed on.
+ * This class uses the TagSoup or Validator.nu HTML parser to convert HTML input to well-formed
+ * XML. If TagSoup is selected but is not found in the classpath, the original document
+ * is passed on.
+ *
+ * TagSoup was written by John Cowan and is based on the Apache 2.0 License:
+ * {@code http://home.ccil.org/~cowan/XML/tagsoup/}.
*
* The Validator.nu HTML parser was written by Henri Sivonen and is based on the MIT License:
* {@code https://about.validator.nu/htmlparser/}.
@@ -80,7 +83,7 @@ private static IO toXml(final IO io, final Parser parser, final HtmlOptions hopt
: hopts.contains(ENCODING)
? hopts.get(HtmlOptions.ENCODING)
: null;
- if (enc != null) {
+ if(enc != null) {
if(!Strings.supported(enc)) throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.');
is.setEncoding(Strings.normEncoding(enc));
}
diff --git a/basex-core/src/main/java/org/basex/core/MainOptions.java b/basex-core/src/main/java/org/basex/core/MainOptions.java
index bf5c21b49b..2abb02ba61 100644
--- a/basex-core/src/main/java/org/basex/core/MainOptions.java
+++ b/basex-core/src/main/java/org/basex/core/MainOptions.java
@@ -40,7 +40,7 @@ public final class MainOptions extends Options {
/** Define JSON parser options. */
public static final OptionsOption JSONPARSER =
new OptionsOption<>("JSONPARSER", new JsonParserOptions());
- /** Define Validator.nu HTML options. */
+ /** Define HTML parser options. */
public static final OptionsOption HTMLPARSER =
new OptionsOption<>("HTMLPARSER", new HtmlOptions());
/** Define import parser. */
diff --git a/basex-core/src/main/java/org/basex/query/QueryError.java b/basex-core/src/main/java/org/basex/query/QueryError.java
index 746225978a..81211886f6 100644
--- a/basex-core/src/main/java/org/basex/query/QueryError.java
+++ b/basex-core/src/main/java/org/basex/query/QueryError.java
@@ -619,8 +619,6 @@ public enum QueryError {
RESINV_X(FODC, 7, "Resource path '%' is invalid."),
/** Error code. */
INVHTML_X(FODC, 11, "HTML parsing failed: %"),
- /** Error code. */
- INVHTMLOPT_X(FODC, 12, "HTML option processing failed: %"),
/** Error code. */
FORMATWHICH_X(FODF, 1280, "Unknown decimal format: %."),