From 1711e3fde9ae6e9b44a7b6aedead563265135737 Mon Sep 17 00:00:00 2001 From: Gunther Rademacher Date: Mon, 23 Oct 2023 12:55:51 +0200 Subject: [PATCH 1/9] add fn:parse-html based on Validator.nu --- basex-core/pom.xml | 6 + .../main/java/org/basex/query/QueryError.java | 4 + .../java/org/basex/query/func/Function.java | 3 + .../basex/query/func/html/FnParseHtml.java | 138 ++++++++++++++++++ .../org/basex/query/func/FnModuleTest.java | 14 ++ pom.xml | 7 + 6 files changed, 172 insertions(+) create mode 100644 basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java diff --git a/basex-core/pom.xml b/basex-core/pom.xml index 83e0ef7818..7e34f25e58 100644 --- a/basex-core/pom.xml +++ b/basex-core/pom.xml @@ -52,6 +52,12 @@ provided true + + nu.validator + htmlparser + provided + true + diff --git a/basex-core/src/main/java/org/basex/query/QueryError.java b/basex-core/src/main/java/org/basex/query/QueryError.java index 7e30cd41b9..3916df95db 100644 --- a/basex-core/src/main/java/org/basex/query/QueryError.java +++ b/basex-core/src/main/java/org/basex/query/QueryError.java @@ -615,6 +615,10 @@ public enum QueryError { SAXERR_X(FODC, 6, "SAX: %"), /** Error code. */ RESINV_X(FODC, 7, "Resource path '%' is invalid."), + /** Error code. */ + INVHTML_X(FODC, 11, "String passed to fn:parse-html is not a well-formed HTML document: %"), + /** Error code. */ + INVHTMLOPT_X(FODC, 12, "Unsupported HTML parser option: %"), /** Error code. */ FORMNUM_X(FODF, 1280, "Unknown decimal format: '%'."), diff --git a/basex-core/src/main/java/org/basex/query/func/Function.java b/basex-core/src/main/java/org/basex/query/func/Function.java index da69a1237c..bf29459289 100644 --- a/basex-core/src/main/java/org/basex/query/func/Function.java +++ b/basex-core/src/main/java/org/basex/query/func/Function.java @@ -464,6 +464,9 @@ ITEM_ZM, flag(HOF)), PARSE_IETF_DATE(FnParseIetfDate::new, "parse-ietf-date(value)", params(STRING_ZO), DATE_TIME_ZO), /** XQuery function. 
*/ + PARSE_HTML(FnParseHtml::new, "parse-html(html[,options])", + params(ITEM_ZO, MAP_O), DOCUMENT_NODE_ZO), + /** XQuery function. */ PARSE_INTEGER(FnParseInteger::new, "parse-integer(value[,radix])", params(STRING_O, INTEGER_O), INTEGER_O), /** XQuery function. */ diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java new file mode 100644 index 0000000000..0f8a3a5801 --- /dev/null +++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java @@ -0,0 +1,138 @@ +package org.basex.query.func.html; + +import static org.basex.query.QueryError.*; +import static org.basex.util.Token.*; + +import java.io.*; + +import org.basex.build.html.*; +import org.basex.build.xml.*; +import org.basex.core.*; +import org.basex.io.*; +import org.basex.io.in.*; +import org.basex.query.*; +import org.basex.query.expr.*; +import org.basex.query.func.*; +import org.basex.query.value.item.*; +import org.basex.query.value.node.*; +import org.basex.query.value.seq.*; +import org.basex.util.*; +import org.xml.sax.*; + +import nu.validator.htmlparser.common.*; +import nu.validator.htmlparser.sax.*; + +/** + * Function implementation. + * + * @author BaseX Team 2005-23, BSD License + * @author Gunther Rademacher + */ +public class FnParseHtml extends StandardFunc { + // TODO: handle second argument (method, html-version, encoding), produce error code FODC0012 + + @Override + public Item item(final QueryContext qc, final InputInfo ii) throws QueryException { + final Item value = arg(0).atomItem(qc, info); + return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value))); + } + + @Override + protected final Expr opt(final CompileContext cc) { + return optFirst(); + } + + /** + * Parses the input and creates an XML document. 
+ * @param io input data + * @return node + * @throws QueryException query exception + */ + protected final Item parse(final IO io) throws QueryException { + try { + if (!ParserImpl.available()) { + // reader could not be initialized; fall back to html:parse + final HtmlOptions htmlOptions = new HtmlOptions(); + htmlOptions.set(HtmlOptions.LEXICAL, true); + htmlOptions.set(HtmlOptions.NONS, false); + return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), htmlOptions)); + } + return new DBNode(new ParserImpl(io, new MainOptions())); + } catch(final IOException ex) { + throw INVHTML_X.get(info, ex); + } + } + + /** + * Parser implementation. + */ + private static class ParserImpl extends XMLParser { + + /** + * Checks if Validator.nu is available. + * @return result of check + */ + public static boolean available() { + return Reflect.available("nu.validator.htmlparser.sax.HtmlParser"); + } + + /** + * Constructor. + * @param source document source + * @param options main options + * @throws IOException I/O exception + */ + ParserImpl(final IO source, final MainOptions options) + throws IOException { + super(toXml(source), options); + } + + /** + * Converts an HTML document to XML. 
+ * @param io io reference + * @return parser + * @throws IOException I/O exception + */ + private static IO toXml(final IO io) throws IOException { + try(TextInput ti = new TextInput(io)) { + + // tries to extract the encoding from the input + // TODO: remove this, in favor of encoding from options, or constant for string input + String enc = ti.encoding(); + final byte[] content = ti.content(); + // looks for a charset definition + final byte[] encoding = token("charset="); + int cs = indexOf(content, encoding); + if(cs > 0) { + // extracts the encoding string + cs += encoding.length; + int ce = cs; + final int cl = content.length; + while(++ce < cl && content[ce] > 0x28); + enc = string(substring(content, cs, ce)); + } + + // define output + final StringWriter sw = new StringWriter(); + final nu.validator.htmlparser.sax.HtmlParser reader = + new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET); + reader.setFeature("http://xml.org/sax/features/namespaces", true); + reader.setFeature("http://xml.org/sax/features/namespace-prefixes", false); + + final ContentHandler writer = new XmlSerializer(sw); + reader.setContentHandler(writer); + reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer); + + // define input + final InputSource is = new InputSource(new ArrayInput(content)); + is.setEncoding(Strings.supported(enc) ? 
Strings.normEncoding(enc) : Strings.UTF8); + reader.parse(is); + return new IOContent(token(sw.toString()), io.name()); + + } catch(final SAXException ex) { + Util.errln(ex); + throw INVHTML_X.getIO(ex.getLocalizedMessage()); + } + } + } +} diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java index 90325bd7d4..a57d7be0d0 100644 --- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java +++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java @@ -1438,6 +1438,20 @@ public final class FnModuleTest extends SandboxTest { query("let $n :=
  • return " + func.args(" ($n, $n)"), "
  • "); } + /** Test method. */ + @Test public void parseHtml() { + final Function func = PARSE_HTML; + + query(func.args("42"), + "42"); + query(func.args(" xs:hexBinary('3432')"), + "42"); + query(func.args(" xs:base64Binary('NDI=')"), + "42"); + + error(func.args(42), STRBIN_X_X); + } + /** Test method. */ @Test public void parseIetfDate() { final Function func = PARSE_IETF_DATE; diff --git a/pom.xml b/pom.xml index dabb8b18ca..e9cf162f1a 100644 --- a/pom.xml +++ b/pom.xml @@ -174,6 +174,13 @@ runtime true + + nu.validator + htmlparser + 1.4.16 + runtime + true + From dfb0be0b2f594f5137948f9a1987a50fb66da384 Mon Sep 17 00:00:00 2001 From: Gunther Rademacher Date: Mon, 30 Oct 2023 13:41:42 +0100 Subject: [PATCH 2/9] add support for Validator.nu options --- .../org/basex/build/html/HtmlOptions.java | 138 ++++++++++++++++++ .../basex/query/func/html/FnParseHtml.java | 97 +++++++----- .../org/basex/query/func/FnModuleTest.java | 20 ++- 3 files changed, 216 insertions(+), 39 deletions(-) diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java index 5fe399485f..b46e8d6e56 100644 --- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java +++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java @@ -9,6 +9,41 @@ * @author Christian Gruen */ public final class HtmlOptions extends Options { + /** Validator.nu option unicode-normalization-checking. */ + public static final BooleanOption UNICODE_NORMALIZATION_CHECKING = + new BooleanOption("unicode-normalization-checking", false); + /** Validator.nu option mapping-lang-to-xml-lang. */ + public static final BooleanOption MAPPING_LANG_TO_XML_LANG = + new BooleanOption("mapping-lang-to-xml-lang", false); + /** Validator.nu option scripting-enabled. */ + public static final BooleanOption SCRIPTING_ENABLED = + new BooleanOption("scripting-enabled", false); + + /** Validator.nu option content-space-policy. 
*/ + public static final EnumOption CONTENT_SPACE_POLICY = + new EnumOption<>("content-space-policy", XmlViolationPolicy.class); + /** Validator.nu option content-non-xml-char-policy. */ + public static final EnumOption CONTENT_NON_XML_CHAR_POLICY = + new EnumOption<>("content-non-xml-char-policy", XmlViolationPolicy.class); + /** Validator.nu option comment-policy. */ + public static final EnumOption COMMENT_POLICY = + new EnumOption<>("comment-policy", XmlViolationPolicy.class); + /** Validator.nu option xmlns-policy. */ + public static final EnumOption XMLNS_POLICY = + new EnumOption<>("xmlns-policy", XmlViolationPolicy.class); + /** Validator.nu option name-policy. */ + public static final EnumOption NAME_POLICY = + new EnumOption<>("name-policy", XmlViolationPolicy.class); + /** Validator.nu option streamability-violation-policy. */ + public static final EnumOption STREAMABILITY_VIOLATION_POLICY = + new EnumOption<>("streamability-violation-policy", XmlViolationPolicy.class); + /** Validator.nu option xml-policy. */ + public static final EnumOption XML_POLICY = + new EnumOption<>("xml-policy", XmlViolationPolicy.class); + /** Validator.nu option heuristics. */ + public static final EnumOption HEURISTICS = + new EnumOption<>("heuristics", Heuristics.class); + /** TagSoup option: html. */ public static final BooleanOption HTML = new BooleanOption("html", false); /** TagSoup option: omit-xml-declaration. */ @@ -59,4 +94,107 @@ public HtmlOptions() { public HtmlOptions(final Options opts) { super(opts); } + + /** + * Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the + * dependency on Validator.nu in the classpath. 
+ * + * Copyright (c) 2007 Henri Sivonen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + /** + * Policy for XML 1.0 violations. + * + * @version $Id$ + * @author hsivonen + */ + public enum XmlViolationPolicy { + /** + * Conform to HTML 5, allow XML 1.0 to be violated. + */ + ALLOW, + + /** + * Halt when something cannot be mapped to XML 1.0. + */ + FATAL, + + /** + * Be non-conforming and alter the infoset to fit + * XML 1.0 when something would otherwise not be + * mappable to XML 1.0. + */ + ALTER_INFOSET + } + + /** + * Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the + * dependency on Validator.nu in the classpath. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + /** + * Indicates a request for character encoding sniffer choice. + * + * @version $Id$ + * @author hsivonen + */ + public enum Heuristics { + + /** + * Perform no heuristic sniffing. + */ + NONE, + + /** + * Use both jchardet and ICU4J. + */ + ALL, + + /** + * Use jchardet only. + */ + CHARDET, + + /** + * Use ICU4J only. 
+ */ + ICU + } + } diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java index 0f8a3a5801..2ae7c1d525 100644 --- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java +++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java @@ -1,5 +1,6 @@ package org.basex.query.func.html; +import static org.basex.build.html.HtmlOptions.*; import static org.basex.query.QueryError.*; import static org.basex.util.Token.*; @@ -9,7 +10,6 @@ import org.basex.build.xml.*; import org.basex.core.*; import org.basex.io.*; -import org.basex.io.in.*; import org.basex.query.*; import org.basex.query.expr.*; import org.basex.query.func.*; @@ -19,8 +19,9 @@ import org.basex.util.*; import org.xml.sax.*; -import nu.validator.htmlparser.common.*; import nu.validator.htmlparser.sax.*; +import nu.validator.htmlparser.common.XmlViolationPolicy; +import nu.validator.htmlparser.common.Heuristics; /** * Function implementation. @@ -34,7 +35,8 @@ public class FnParseHtml extends StandardFunc { @Override public Item item(final QueryContext qc, final InputInfo ii) throws QueryException { final Item value = arg(0).atomItem(qc, info); - return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value))); + final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc); + return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)), options); } @Override @@ -45,10 +47,11 @@ protected final Expr opt(final CompileContext cc) { /** * Parses the input and creates an XML document. 
* @param io input data + * @param options HTML options * @return node * @throws QueryException query exception */ - protected final Item parse(final IO io) throws QueryException { + protected final Item parse(final IO io, final HtmlOptions options) throws QueryException { try { if (!ParserImpl.available()) { // reader could not be initialized; fall back to html:parse @@ -57,7 +60,7 @@ protected final Item parse(final IO io) throws QueryException { htmlOptions.set(HtmlOptions.NONS, false); return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), htmlOptions)); } - return new DBNode(new ParserImpl(io, new MainOptions())); + return new DBNode(new ParserImpl(info, io, options)); } catch(final IOException ex) { throw INVHTML_X.get(info, ex); } @@ -73,65 +76,91 @@ private static class ParserImpl extends XMLParser { * @return result of check */ public static boolean available() { - return Reflect.available("nu.validator.htmlparser.sax.HtmlParser"); + return Reflect.available("nu.validator.htmlparser.sax.HtmlParser") + && Reflect.available("nu.validator.htmlparser.sax.XmlSerializer") + && Reflect.available("nu.validator.htmlparser.common.Heuristics") + && Reflect.available("nu.validator.htmlparser.common.XmlViolationPolicy"); } /** * Constructor. + * @param info input info * @param source document source - * @param options main options + * @param options HTML options * @throws IOException I/O exception + * @throws QueryException query exception */ - ParserImpl(final IO source, final MainOptions options) - throws IOException { - super(toXml(source), options); + ParserImpl(final InputInfo info, final IO source, final HtmlOptions options) + throws IOException, QueryException { + super(toXml(info, source, options), new MainOptions()); } /** * Converts an HTML document to XML. 
+ * @param info input info * @param io io reference + * @param hopts HTML options * @return parser * @throws IOException I/O exception + * @throws QueryException query exception */ - private static IO toXml(final IO io) throws IOException { - try(TextInput ti = new TextInput(io)) { - - // tries to extract the encoding from the input - // TODO: remove this, in favor of encoding from options, or constant for string input - String enc = ti.encoding(); - final byte[] content = ti.content(); - // looks for a charset definition - final byte[] encoding = token("charset="); - int cs = indexOf(content, encoding); - if(cs > 0) { - // extracts the encoding string - cs += encoding.length; - int ce = cs; - final int cl = content.length; - while(++ce < cl && content[ce] > 0x28); - enc = string(substring(content, cs, ce)); - } + private static IO toXml(final InputInfo info, final IO io, final HtmlOptions hopts) + throws IOException, QueryException { + try { // define output final StringWriter sw = new StringWriter(); final nu.validator.htmlparser.sax.HtmlParser reader = new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET); - reader.setFeature("http://xml.org/sax/features/namespaces", true); - reader.setFeature("http://xml.org/sax/features/namespace-prefixes", false); - final ContentHandler writer = new XmlSerializer(sw); reader.setContentHandler(writer); reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer); // define input - final InputSource is = new InputSource(new ArrayInput(content)); - is.setEncoding(Strings.supported(enc) ? 
Strings.normEncoding(enc) : Strings.UTF8); + final InputSource is = new InputSource(io.inputStream()); + + // set Validator.nu options + if(hopts.get(UNICODE_NORMALIZATION_CHECKING)) + reader.setCheckingNormalization(true); + if(hopts.get(MAPPING_LANG_TO_XML_LANG)) + reader.setMappingLangToXmlLang(true); + if(hopts.get(SCRIPTING_ENABLED)) + reader.setScriptingEnabled(true); + if(hopts.contains(CONTENT_SPACE_POLICY)) + reader.setContentSpacePolicy( + XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name())); + if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY)) + reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf( + hopts.get(CONTENT_NON_XML_CHAR_POLICY).name())); + if(hopts.contains(COMMENT_POLICY)) + reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name())); + if(hopts.contains(XMLNS_POLICY)) + reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name())); + if(hopts.contains(NAME_POLICY)) + reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name())); + if(hopts.contains(STREAMABILITY_VIOLATION_POLICY)) + reader.setStreamabilityViolationPolicy( + XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name())); + if(hopts.contains(XML_POLICY)) + reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name())); + + if(hopts.contains(HEURISTICS)) + reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name())); + // end Validator.nu options + + if (hopts.contains(ENCODING)) { + String enc = hopts.get(HtmlOptions.ENCODING); + if (!Strings.supported(enc)) + throw INVALIDOPT_X.get(info, "Unsupported encoding: " + enc + '.'); + is.setEncoding(Strings.supported(enc) ? 
Strings.normEncoding(enc) : Strings.UTF8); + } + reader.parse(is); return new IOContent(token(sw.toString()), io.name()); } catch(final SAXException ex) { Util.errln(ex); - throw INVHTML_X.getIO(ex.getLocalizedMessage()); + throw INVHTML_X.get(info, ex.getLocalizedMessage()); } } } diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java index a57d7be0d0..5f5365d262 100644 --- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java +++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java @@ -11,6 +11,7 @@ import org.basex.query.expr.path.*; import org.basex.query.value.item.*; import org.basex.query.value.seq.*; +import org.basex.util.*; import org.junit.jupiter.api.*; import org.junit.jupiter.api.Test; @@ -1434,7 +1435,7 @@ public final class FnModuleTest extends SandboxTest { /** Test method. */ @Test public void outermost() { - final Function func = INNERMOST; + final Function func = OUTERMOST; query("let $n :=
  • return " + func.args(" ($n, $n)"), "
  • "); } @@ -1444,12 +1445,21 @@ public final class FnModuleTest extends SandboxTest { query(func.args("42"), "42"); - query(func.args(" xs:hexBinary('3432')"), - "42"); - query(func.args(" xs:base64Binary('NDI=')"), - "42"); + query(func.args(_CONVERT_STRING_TO_HEX.args("42", Strings.UTF16LE), + " map {'encoding': '" + Strings.UTF16LE + "', 'xml-policy': 'ALTER_INFOSET'}"), + "42"); + query(func.args(_CONVERT_STRING_TO_BASE64.args("42", Strings.UTF16BE), + " map {'encoding': '" + Strings.UTF16BE + "', 'heuristics': 'NONE'}"), + "42"); error(func.args(42), STRBIN_X_X); + error(func.args(" \"42\"", 42), MAP_X_X); + error(func.args(" \"42\"", " map {'1234': ()}"), INVALIDOPT_X); + error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVALIDOPT_X); } /** Test method. */ From 8efe4a1f9bc31ab338ad0d1f408ffcf400c11970 Mon Sep 17 00:00:00 2001 From: Gunther Rademacher Date: Mon, 30 Oct 2023 20:50:29 +0100 Subject: [PATCH 3/9] ignore encoding option when parsing a string value --- .../org/basex/query/func/html/FnParseHtml.java | 14 ++++++++++---- .../java/org/basex/query/func/FnModuleTest.java | 3 +++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java index 2ae7c1d525..a891859e34 100644 --- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java +++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java @@ -36,7 +36,9 @@ public class FnParseHtml extends StandardFunc { public Item item(final QueryContext qc, final InputInfo ii) throws QueryException { final Item value = arg(0).atomItem(qc, info); final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc); - return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)), options); + final IO io = value instanceof Bin ? 
new IOContent(toBytes(value)) + : new IOContent(toBytes(value), "", Strings.UTF8); + return value.isEmpty() ? Empty.VALUE : parse(io, options); } @Override @@ -148,11 +150,15 @@ private static IO toXml(final InputInfo info, final IO io, final HtmlOptions hop reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name())); // end Validator.nu options - if (hopts.contains(ENCODING)) { - String enc = hopts.get(HtmlOptions.ENCODING); + String enc = io.encoding() != null + ? io.encoding() + : hopts.contains(ENCODING) + ? hopts.get(HtmlOptions.ENCODING) + : null; // TODO: sniff encoding + if (enc != null) { if (!Strings.supported(enc)) throw INVALIDOPT_X.get(info, "Unsupported encoding: " + enc + '.'); - is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8); + is.setEncoding(Strings.normEncoding(enc)); } reader.parse(is); diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java index 5f5365d262..ed667b2a68 100644 --- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java +++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java @@ -1443,8 +1443,11 @@ public final class FnModuleTest extends SandboxTest { @Test public void parseHtml() { final Function func = PARSE_HTML; + query(func.args(" ()"), ""); query(func.args("42"), "42"); + query(func.args("42", " map {'encoding': '" + Strings.UTF16LE + "'}"), + "42"); query(func.args(_CONVERT_STRING_TO_HEX.args("42", Strings.UTF16LE), " map {'encoding': '" + Strings.UTF16LE + "', 'xml-policy': 'ALTER_INFOSET'}"), From d66e40be30866bcd08abd129c8af9b8bf174e698 Mon Sep 17 00:00:00 2001 From: Gunther Rademacher Date: Mon, 6 Nov 2023 13:15:54 +0100 Subject: [PATCH 4/9] replace TagSoup by Validator.nu --- basex-core/pom.xml | 12 +- .../org/basex/build/html/HtmlOptions.java | 54 ++---- .../java/org/basex/build/html/HtmlParser.java | 152 ++++++++--------- 
.../main/java/org/basex/core/MainOptions.java | 2 +- .../main/java/org/basex/query/QueryError.java | 9 +- .../org/basex/query/func/FuncOptions.java | 33 ++-- .../org/basex/query/func/StandardFunc.java | 17 +- .../basex/query/func/html/FnParseHtml.java | 159 +----------------- .../org/basex/query/func/html/HtmlParse.java | 9 +- .../src/main/resources/lang/Chinese.lang | 2 +- basex-core/src/main/resources/lang/Dutch.lang | 2 +- .../src/main/resources/lang/English.lang | 2 +- .../src/main/resources/lang/French.lang | 2 +- .../src/main/resources/lang/German.lang | 2 +- .../src/main/resources/lang/Hungarian.lang | 2 +- .../src/main/resources/lang/Indonesian.lang | 2 +- .../src/main/resources/lang/Italian.lang | 2 +- .../src/main/resources/lang/Japanese.lang | 2 +- .../src/main/resources/lang/Mongolian.lang | 2 +- .../src/main/resources/lang/Romanian.lang | 2 +- .../src/main/resources/lang/Russian.lang | 2 +- .../src/main/resources/lang/Spanish.lang | 2 +- .../org/basex/query/func/FnModuleTest.java | 4 +- .../org/basex/query/func/HtmlModuleTest.java | 6 +- basex-examples/basex-examples.iml | 2 +- basex-examples/pom.xml | 4 +- .../basex/examples/create/HTMLExample.java | 2 +- pom.xml | 13 +- 28 files changed, 162 insertions(+), 342 deletions(-) diff --git a/basex-core/pom.xml b/basex-core/pom.xml index 94512b712e..f3c6d14885 100644 --- a/basex-core/pom.xml +++ b/basex-core/pom.xml @@ -30,9 +30,9 @@ true - org.ccil.cowan.tagsoup - tagsoup - compile + nu.validator + htmlparser + provided true @@ -52,12 +52,6 @@ provided true - - nu.validator - htmlparser - provided - true - diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java index b46e8d6e56..a6ce8f6cc9 100644 --- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java +++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java @@ -3,12 +3,22 @@ import org.basex.util.options.*; /** - * Options for parsing and 
serializing HTML documents with TagSoup. + * Options for parsing and serializing HTML documents with Validator.nu. * * @author BaseX Team 2005-23, BSD License * @author Christian Gruen */ public final class HtmlOptions extends Options { + /** fn:parse-html option encoding. */ + public static final StringOption ENCODING = new StringOption("encoding"); + /** fn:parse-html option method. */ + public static final StringOption METHOD = new StringOption("method"); + /** fn:parse-html option html-version. */ + public static final StringOption HTML_VERSION = new StringOption("html-version"); + /** fn:parse-html option include-template-content. */ + public static final BooleanOption INCLUDE_TEMPLATE_CONTENT = + new BooleanOption("include-template-content"); + /** Validator.nu option unicode-normalization-checking. */ public static final BooleanOption UNICODE_NORMALIZATION_CHECKING = new BooleanOption("unicode-normalization-checking", false); @@ -18,7 +28,6 @@ public final class HtmlOptions extends Options { /** Validator.nu option scripting-enabled. */ public static final BooleanOption SCRIPTING_ENABLED = new BooleanOption("scripting-enabled", false); - /** Validator.nu option content-space-policy. */ public static final EnumOption CONTENT_SPACE_POLICY = new EnumOption<>("content-space-policy", XmlViolationPolicy.class); @@ -44,43 +53,6 @@ public final class HtmlOptions extends Options { public static final EnumOption HEURISTICS = new EnumOption<>("heuristics", Heuristics.class); - /** TagSoup option: html. */ - public static final BooleanOption HTML = new BooleanOption("html", false); - /** TagSoup option: omit-xml-declaration. */ - public static final BooleanOption OMIT_XML_DECLARATION = - new BooleanOption("omit-xml-declaration", false); - /** TagSoup option: nons. */ - public static final BooleanOption NONS = new BooleanOption("nons", true); - /** TagSoup option: nobogons. 
*/ - public static final BooleanOption NOBOGONS = new BooleanOption("nobogons", false); - /** TagSoup option: nodefaults. */ - public static final BooleanOption NODEFAULTS = new BooleanOption("nodefaults", false); - /** TagSoup option: nocolons. */ - public static final BooleanOption NOCOLONS = new BooleanOption("nocolons", false); - /** TagSoup option: norestart. */ - public static final BooleanOption NORESTART = new BooleanOption("norestart", false); - /** TagSoup option: nobogons. */ - public static final BooleanOption IGNORABLE = new BooleanOption("ignorable", false); - /** TagSoup option: emptybogons. */ - public static final BooleanOption EMPTYBOGONS = new BooleanOption("emptybogons", false); - /** TagSoup option: any. */ - public static final BooleanOption ANY = new BooleanOption("any", false); - /** TagSoup option: norootbogons. */ - public static final BooleanOption NOROOTBOGONS = new BooleanOption("norootbogons", false); - /** TagSoup option: nocdata. */ - public static final BooleanOption NOCDATA = new BooleanOption("nocdata", false); - /** TagSoup option: lexical. */ - public static final BooleanOption LEXICAL = new BooleanOption("lexical", false); - - /** TagSoup option: method (html). */ - public static final StringOption METHOD = new StringOption("method", "xml"); - /** TagSoup option: doctype-system=systemid. */ - public static final StringOption DOCTYPE_SYSTEM = new StringOption("doctype-system"); - /** TagSoup option: doctype-public=publicid. */ - public static final StringOption DOCTYPE_PUBLIC = new StringOption("doctype-public"); - /** TagSoup option: encoding=encoding. */ - public static final StringOption ENCODING = new StringOption("encoding"); - /** * Default constructor. */ @@ -97,7 +69,7 @@ public HtmlOptions(final Options opts) { /** * Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the - * dependency on Validator.nu in the classpath. + * class path dependency of HtmlOptions on Validator.nu. 
* * Copyright (c) 2007 Henri Sivonen * @@ -147,7 +119,7 @@ public enum XmlViolationPolicy { /** * Copied from nu.validator.htmlparser.common.XmlViolationPolicy in order to avoid the - * dependency on Validator.nu in the classpath. + * class path dependency of HtmlOptions on Validator.nu. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java index acfe3d5882..773defb9db 100644 --- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java +++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java @@ -1,39 +1,57 @@ package org.basex.build.html; -import static org.basex.util.Token.*; import static org.basex.build.html.HtmlOptions.*; +import static org.basex.query.QueryError.*; +import static org.basex.util.Token.*; import java.io.*; +import java.util.*; + import org.basex.build.xml.*; import org.basex.core.*; import org.basex.io.*; -import org.basex.io.in.*; import org.basex.util.*; -import org.ccil.cowan.tagsoup.*; import org.xml.sax.*; +import nu.validator.htmlparser.common.Heuristics; +import nu.validator.htmlparser.common.XmlViolationPolicy; +import nu.validator.htmlparser.sax.*; + /** - * This class uses TagSoup to convert HTML input to well-formed XML. - * If TagSoup is not found in the classpath, the original document is passed on. + * This class uses the Validator.nu HTML parser to convert HTML input to well-formed XML. + * If the Validator.nu HTML parser is not found in the classpath, the original document is + * passed on. * - * TagSoup was written by John Cowan and is based on the Apache 2.0 License: - * {@code http://home.ccil.org/~cowan/XML/tagsoup/}. + * The Validator.nu HTML parser was written by Henri Sivonen and is based on the MIT License: + * {@code https://about.validator.nu/htmlparser/}. 
* * @author BaseX Team 2005-23, BSD License * @author Christian Gruen */ public final class HtmlParser extends XMLParser { /** Name of HTML Parser. */ - private static final String NAME = "TagSoup"; - /** TagSoup URL. */ - private static final String FEATURES = "http://www.ccil.org/~cowan/tagsoup/features/"; + private static final String NAME = "Validator.nu"; /** - * Checks if a CatalogResolver is available. + * Checks if a Validator.nu is available. * @return result of check */ public static boolean available() { - return Reflect.available("org.ccil.cowan.tagsoup.Parser"); + return firstUnavailableClass() == null; + } + + /** + * Check whether Validator.nu classes are available on the class path. + * @return the name of the first class that is not available, or null if all classes are available + */ + public static String firstUnavailableClass() { + for(final String className : Arrays.asList("nu.validator.htmlparser.sax.HtmlParser", + "nu.validator.htmlparser.sax.XmlSerializer", + "nu.validator.htmlparser.common.XmlViolationPolicy", + "nu.validator.htmlparser.common.Heuristics")) { + if(!Reflect.available(className)) return className; + } + return null; } /** @@ -77,81 +95,63 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException // reader could not be initialized; fall back to XML if(!available()) return io; - try(TextInput ti = new TextInput(io)) { - // tries to extract the encoding from the input - String enc = ti.encoding(); - final byte[] content = ti.content(); - - // looks for a charset definition - final byte[] encoding = token("charset="); - int cs = indexOf(content, encoding); - if(cs > 0) { - // extracts the encoding string - cs += encoding.length; - int ce = cs; - final int cl = content.length; - while(++ce < cl && content[ce] > 0x28); - enc = string(substring(content, cs, ce)); - } - + try { // define output final StringWriter sw = new StringWriter(); - final XMLReader reader = new org.ccil.cowan.tagsoup.Parser(); - final 
XMLWriter writer = new XMLWriter(sw); - writer.setOutputProperty(ENCODING.name(), Strings.UTF8); + final nu.validator.htmlparser.sax.HtmlParser reader = + new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET); + final ContentHandler writer = new XmlSerializer(sw); reader.setContentHandler(writer); + reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer); - // set TagSoup options - if(hopts.get(HTML)) { - reader.setFeature("http://xml.org/sax/features/namespaces", false); - writer.setOutputProperty(METHOD.name(), "html"); - writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes"); + // define input + final InputSource is = new InputSource(io.inputStream()); + + // set Validator.nu options + if(hopts.get(UNICODE_NORMALIZATION_CHECKING)) + reader.setCheckingNormalization(true); + if(hopts.get(MAPPING_LANG_TO_XML_LANG)) + reader.setMappingLangToXmlLang(true); + if(hopts.get(SCRIPTING_ENABLED)) + reader.setScriptingEnabled(true); + if(hopts.contains(CONTENT_SPACE_POLICY)) + reader.setContentSpacePolicy( + XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name())); + if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY)) + reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf( + hopts.get(CONTENT_NON_XML_CHAR_POLICY).name())); + if(hopts.contains(COMMENT_POLICY)) + reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name())); + if(hopts.contains(XMLNS_POLICY)) + reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name())); + if(hopts.contains(NAME_POLICY)) + reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name())); + if(hopts.contains(STREAMABILITY_VIOLATION_POLICY)) + reader.setStreamabilityViolationPolicy( + XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name())); + if(hopts.contains(XML_POLICY)) + reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name())); + if(hopts.contains(HEURISTICS)) + 
reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name())); + // end Validator.nu options + + String enc = io.encoding() != null + ? io.encoding() + : hopts.contains(ENCODING) + ? hopts.get(HtmlOptions.ENCODING) + : null; // TODO: sniff encoding + if (enc != null) { + if (!Strings.supported(enc)) + throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.'); + is.setEncoding(Strings.normEncoding(enc)); } - if(hopts.get(NONS)) - reader.setFeature("http://xml.org/sax/features/namespaces", false); - if(hopts.get(NOBOGONS)) - reader.setFeature(FEATURES + "ignore-bogons", true); - if(hopts.get(NODEFAULTS)) - reader.setFeature(FEATURES + "default-attributes", false); - if(hopts.get(NOCOLONS)) - reader.setFeature(FEATURES + "translate-colons", true); - if(hopts.get(NORESTART)) - reader.setFeature(FEATURES + "restart-elements", false); - if(hopts.get(IGNORABLE)) - reader.setFeature(FEATURES + "ignorable-whitespace", true); - if(hopts.get(EMPTYBOGONS)) - reader.setFeature(FEATURES + "bogons-empty", true); - if(hopts.get(ANY)) - reader.setFeature(FEATURES + "bogons-empty", false); - if(hopts.get(NOROOTBOGONS)) - reader.setFeature(FEATURES + "root-bogons", false); - if(hopts.get(NOCDATA)) - reader.setFeature(FEATURES + "cdata-elements", false); - if(hopts.get(LEXICAL)) - reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer); - - if(hopts.get(OMIT_XML_DECLARATION)) - writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes"); - if(hopts.contains(METHOD)) - writer.setOutputProperty(METHOD.name(), hopts.get(METHOD)); - if(hopts.contains(DOCTYPE_SYSTEM)) - writer.setOutputProperty(DOCTYPE_SYSTEM.name(), hopts.get(DOCTYPE_SYSTEM)); - if(hopts.contains(DOCTYPE_PUBLIC)) - writer.setOutputProperty(DOCTYPE_PUBLIC.name(), hopts.get(DOCTYPE_PUBLIC)); - - if(hopts.contains(ENCODING)) - enc = hopts.get(ENCODING); - // end TagSoup options - // define input - final InputSource is = new InputSource(new ArrayInput(content)); - 
is.setEncoding(Strings.supported(enc) ? Strings.normEncoding(enc) : Strings.UTF8); reader.parse(is); return new IOContent(token(sw.toString()), io.name()); } catch(final SAXException ex) { Util.errln(ex); - return io; + throw INVHTML_X.getIO(ex.getLocalizedMessage()); } } } diff --git a/basex-core/src/main/java/org/basex/core/MainOptions.java b/basex-core/src/main/java/org/basex/core/MainOptions.java index 2357d9a5e7..1afd1cda7a 100644 --- a/basex-core/src/main/java/org/basex/core/MainOptions.java +++ b/basex-core/src/main/java/org/basex/core/MainOptions.java @@ -41,7 +41,7 @@ public final class MainOptions extends Options { /** Define JSON parser options. */ public static final OptionsOption JSONPARSER = new OptionsOption<>("JSONPARSER", new JsonParserOptions()); - /** Define TagSoup HTML options. */ + /** Define Validator.nu HTML options. */ public static final OptionsOption HTMLPARSER = new OptionsOption<>("HTMLPARSER", new HtmlOptions()); /** Define import parser. */ diff --git a/basex-core/src/main/java/org/basex/query/QueryError.java b/basex-core/src/main/java/org/basex/query/QueryError.java index d127bde013..a13468463d 100644 --- a/basex-core/src/main/java/org/basex/query/QueryError.java +++ b/basex-core/src/main/java/org/basex/query/QueryError.java @@ -307,11 +307,6 @@ public enum QueryError { /** Error code. */ HASH_ALGORITHM_X(HASH, "algorithm", "Algorithm not supported: '%'."), - // HTML Module - - /** Error code. */ - HTML_PARSE_X(HTML, "parse", "%"), - // HTTP Module /** Invalid URI. */ @@ -616,9 +611,9 @@ public enum QueryError { /** Error code. */ RESINV_X(FODC, 7, "Resource path '%' is invalid."), /** Error code. */ - INVHTML_X(FODC, 11, "String passed to fn:parse-html is not a well-formed HTML document: %"), + INVHTML_X(FODC, 11, "HTML parsing failed: %"), /** Error code. */ - INVHTMLOPT_X(FODC, 12, "Unsupported HTML parser option: %"), + INVHTMLOPT_X(FODC, 12, "HTML option processing failed: %"), /** Error code. 
*/ FORMNUM_X(FODF, 1280, "Unknown decimal format: '%'."), diff --git a/basex-core/src/main/java/org/basex/query/func/FuncOptions.java b/basex-core/src/main/java/org/basex/query/func/FuncOptions.java index d65b445dcf..b867d05de2 100644 --- a/basex-core/src/main/java/org/basex/query/func/FuncOptions.java +++ b/basex-core/src/main/java/org/basex/query/func/FuncOptions.java @@ -38,14 +38,15 @@ public final class FuncOptions { private final InputInfo info; /** Raise error if a supplied option is unknown. */ - private boolean enforceKnown; + private final boolean enforceKnown; /** * Constructor. * @param info input info (can be {@code null}) + * @param enforceKnown raise error, if a supplied options is unknown */ - public FuncOptions(final InputInfo info) { - this(null, info); + public FuncOptions(final InputInfo info, final boolean enforceKnown) { + this(null, info, enforceKnown); } /** @@ -54,24 +55,20 @@ public FuncOptions(final InputInfo info) { * @param info input info (can be {@code null}) */ public FuncOptions(final QNm root, final InputInfo info) { - test = root == null ? null : new NameTest(root); - this.root = root; - this.info = info; + this(root, info, false); } /** - * Assigns values to the specified options. - * @param item item to be converted (can be {@link Empty#VALUE}) - * @param options options - * @param option type - * @param enforce raise error if a supplied option is unknown - * @return specified options - * @throws QueryException query exception + * Constructor. + * @param root name of root node (can be {@code null}) + * @param info input info (can be {@code null}) + * @param enforceKnown raise error, if a supplied options is unknown */ - public T assign(final Item item, final T options, final boolean enforce) - throws QueryException { - enforceKnown = enforce; - return assign(item, options, INVALIDOPT_X); + private FuncOptions(final QNm root, final InputInfo info, final boolean enforceKnown) { + test = root == null ? 
null : new NameTest(root); + this.root = root; + this.info = info; + this.enforceKnown = enforceKnown; } /** @@ -83,7 +80,7 @@ public T assign(final Item item, final T options, final bool * @return specified options * @throws QueryException query exception */ - private T assign(final Item item, final T options, final QueryError error) + public T assign(final Item item, final T options, final QueryError error) throws QueryException { if(!item.isEmpty()) { diff --git a/basex-core/src/main/java/org/basex/query/func/StandardFunc.java b/basex-core/src/main/java/org/basex/query/func/StandardFunc.java index 9ddd35bfba..40e384c368 100644 --- a/basex-core/src/main/java/org/basex/query/func/StandardFunc.java +++ b/basex-core/src/main/java/org/basex/query/func/StandardFunc.java @@ -507,7 +507,22 @@ protected final HashMap toOptions(final Expr expr, final QueryCo */ protected final E toOptions(final Expr expr, final E options, final boolean enforce, final QueryContext qc) throws QueryException { - return new FuncOptions(info).assign(expr.item(qc, info), options, enforce); + return new FuncOptions(info, enforce).assign(expr.item(qc, info), options, INVALIDOPT_X); + } + + /** + * Evaluates an expression, if it exists, and returns options. 
+ * @param options type + * @param expr expression (can be {@code Empty#UNDEFINED}) + * @param options options template + * @param error error to raise, if a supplied option is unknown + * @param qc query context + * @return options + * @throws QueryException query exception + */ + protected final E toOptions(final Expr expr, final E options, + final QueryError error, final QueryContext qc) throws QueryException { + return new FuncOptions(info, true).assign(expr.item(qc, info), options, error); } /** diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java index a891859e34..a6b74d1775 100644 --- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java +++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java @@ -1,27 +1,11 @@ package org.basex.query.func.html; -import static org.basex.build.html.HtmlOptions.*; import static org.basex.query.QueryError.*; -import static org.basex.util.Token.*; -import java.io.*; - -import org.basex.build.html.*; -import org.basex.build.xml.*; -import org.basex.core.*; -import org.basex.io.*; +import org.basex.build.html.HtmlParser; import org.basex.query.*; -import org.basex.query.expr.*; -import org.basex.query.func.*; import org.basex.query.value.item.*; -import org.basex.query.value.node.*; -import org.basex.query.value.seq.*; import org.basex.util.*; -import org.xml.sax.*; - -import nu.validator.htmlparser.sax.*; -import nu.validator.htmlparser.common.XmlViolationPolicy; -import nu.validator.htmlparser.common.Heuristics; /** * Function implementation. 
@@ -29,145 +13,12 @@ * @author BaseX Team 2005-23, BSD License * @author Gunther Rademacher */ -public class FnParseHtml extends StandardFunc { - // TODO: handle second argument (method, html-version, encoding), produce error code FODC0012 +public class FnParseHtml extends HtmlParse { @Override public Item item(final QueryContext qc, final InputInfo ii) throws QueryException { - final Item value = arg(0).atomItem(qc, info); - final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc); - final IO io = value instanceof Bin ? new IOContent(toBytes(value)) - : new IOContent(toBytes(value), "", Strings.UTF8); - return value.isEmpty() ? Empty.VALUE : parse(io, options); - } - - @Override - protected final Expr opt(final CompileContext cc) { - return optFirst(); - } - - /** - * Parses the input and creates an XML document. - * @param io input data - * @param options HTML options - * @return node - * @throws QueryException query exception - */ - protected final Item parse(final IO io, final HtmlOptions options) throws QueryException { - try { - if (!ParserImpl.available()) { - // reader could not be initialized; fall back to html:parse - final HtmlOptions htmlOptions = new HtmlOptions(); - htmlOptions.set(HtmlOptions.LEXICAL, true); - htmlOptions.set(HtmlOptions.NONS, false); - return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), htmlOptions)); - } - return new DBNode(new ParserImpl(info, io, options)); - } catch(final IOException ex) { - throw INVHTML_X.get(info, ex); - } - } - - /** - * Parser implementation. - */ - private static class ParserImpl extends XMLParser { - - /** - * Checks if Validator.nu is available. 
- * @return result of check - */ - public static boolean available() { - return Reflect.available("nu.validator.htmlparser.sax.HtmlParser") - && Reflect.available("nu.validator.htmlparser.sax.XmlSerializer") - && Reflect.available("nu.validator.htmlparser.common.Heuristics") - && Reflect.available("nu.validator.htmlparser.common.XmlViolationPolicy"); - } - - /** - * Constructor. - * @param info input info - * @param source document source - * @param options HTML options - * @throws IOException I/O exception - * @throws QueryException query exception - */ - ParserImpl(final InputInfo info, final IO source, final HtmlOptions options) - throws IOException, QueryException { - super(toXml(info, source, options), new MainOptions()); - } - - /** - * Converts an HTML document to XML. - * @param info input info - * @param io io reference - * @param hopts HTML options - * @return parser - * @throws IOException I/O exception - * @throws QueryException query exception - */ - private static IO toXml(final InputInfo info, final IO io, final HtmlOptions hopts) - throws IOException, QueryException { - - try { - // define output - final StringWriter sw = new StringWriter(); - final nu.validator.htmlparser.sax.HtmlParser reader = - new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET); - final ContentHandler writer = new XmlSerializer(sw); - reader.setContentHandler(writer); - reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer); - - // define input - final InputSource is = new InputSource(io.inputStream()); - - // set Validator.nu options - if(hopts.get(UNICODE_NORMALIZATION_CHECKING)) - reader.setCheckingNormalization(true); - if(hopts.get(MAPPING_LANG_TO_XML_LANG)) - reader.setMappingLangToXmlLang(true); - if(hopts.get(SCRIPTING_ENABLED)) - reader.setScriptingEnabled(true); - if(hopts.contains(CONTENT_SPACE_POLICY)) - reader.setContentSpacePolicy( - XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name())); - 
if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY)) - reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf( - hopts.get(CONTENT_NON_XML_CHAR_POLICY).name())); - if(hopts.contains(COMMENT_POLICY)) - reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name())); - if(hopts.contains(XMLNS_POLICY)) - reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name())); - if(hopts.contains(NAME_POLICY)) - reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name())); - if(hopts.contains(STREAMABILITY_VIOLATION_POLICY)) - reader.setStreamabilityViolationPolicy( - XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name())); - if(hopts.contains(XML_POLICY)) - reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name())); - - if(hopts.contains(HEURISTICS)) - reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name())); - // end Validator.nu options - - String enc = io.encoding() != null - ? io.encoding() - : hopts.contains(ENCODING) - ? 
hopts.get(HtmlOptions.ENCODING) - : null; // TODO: sniff encoding - if (enc != null) { - if (!Strings.supported(enc)) - throw INVALIDOPT_X.get(info, "Unsupported encoding: " + enc + '.'); - is.setEncoding(Strings.normEncoding(enc)); - } - - reader.parse(is); - return new IOContent(token(sw.toString()), io.name()); - - } catch(final SAXException ex) { - Util.errln(ex); - throw INVHTML_X.get(info, ex.getLocalizedMessage()); - } - } + String className = HtmlParser.firstUnavailableClass(); + if (className != null) throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className); + return super.item(qc, ii); } } diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java index a582837af8..576fc82b4d 100644 --- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java +++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java @@ -25,7 +25,10 @@ public class HtmlParse extends StandardFunc { @Override public Item item(final QueryContext qc, final InputInfo ii) throws QueryException { final Item value = arg(0).atomItem(qc, info); - return value.isEmpty() ? Empty.VALUE : parse(new IOContent(toBytes(value)), qc); + if (value.isEmpty()) return Empty.VALUE; + final IO io = value instanceof Bin ? 
new IOContent(toBytes(value)) + : new IOContent(toBytes(value), "", Strings.UTF8); + return parse(io, qc); } @Override @@ -41,11 +44,11 @@ protected final Expr opt(final CompileContext cc) { * @throws QueryException query exception */ protected final Item parse(final IO io, final QueryContext qc) throws QueryException { - final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), true, qc); + final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), INVHTMLOPT_X, qc); try { return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), options)); } catch(final IOException ex) { - throw HTML_PARSE_X.get(info, ex); + throw INVHTML_X.get(info, ex); } } } diff --git a/basex-core/src/main/resources/lang/Chinese.lang b/basex-core/src/main/resources/lang/Chinese.lang index 82caebcf90..610a45e5e4 100644 --- a/basex-core/src/main/resources/lang/Chinese.lang +++ b/basex-core/src/main/resources/lang/Chinese.lang @@ -417,7 +417,7 @@ h_db_format = 数据库结构已经变了,请使用新版软件 h_db_options_% = 如果执行'%',这个选项将会被设置 h_diacritics = 索引保留了发音符号 h_fulltext_index = 全文索引可以加速全文检索 -h_html_parser = 将使用TagSoup将HTML转为XML +h_html_parser = 将使用Validator.nu将HTML转为XML h_index_format = 索引格式变了,请建新索引 h_int_parser = 容错,而且比Java的默认解析器更快 h_languauge = 将使用根据语言确定的tokenizer diff --git a/basex-core/src/main/resources/lang/Dutch.lang b/basex-core/src/main/resources/lang/Dutch.lang index e52e0e9345..d507ea781f 100644 --- a/basex-core/src/main/resources/lang/Dutch.lang +++ b/basex-core/src/main/resources/lang/Dutch.lang @@ -417,7 +417,7 @@ h_db_format = Het database formaat is gewijzigd; maak een nieuwe databa h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = Diakritische tekens worden gebruikt in de index. h_fulltext_index = Een full-text index versnelt full-text queries. -h_html_parser = De TagSoup parser zal gebruikt worden om HTML naar XML te converteren. +h_html_parser = De Validator.nu HTML parser zal gebruikt worden om HTML naar XML te converteren. 
h_index_format = Het index formaat is gewijzigd; maak nieuwe indexen. h_int_parser = Robuuster en sneller dan Java's standaard parser. h_languauge = Met deze optie zullen taalspecifieke parsers worden gebruikt. diff --git a/basex-core/src/main/resources/lang/English.lang b/basex-core/src/main/resources/lang/English.lang index 0b093bd970..5cc46cc7ed 100644 --- a/basex-core/src/main/resources/lang/English.lang +++ b/basex-core/src/main/resources/lang/English.lang @@ -417,7 +417,7 @@ h_db_format = The database format has changed; please use a newer versi h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = Diacritics are retained in the index. h_fulltext_index = A full-text index speeds up full-text queries. -h_html_parser = The TagSoup parser will be used to convert HTML to XML. +h_html_parser = The Validator.nu HTML parser will be used to convert HTML to XML. h_index_format = The index format has changed; please create new indexes. h_int_parser = Fault tolerant, and faster than Java’s default parser. h_languauge = Language specific tokenizers will be used. diff --git a/basex-core/src/main/resources/lang/French.lang b/basex-core/src/main/resources/lang/French.lang index bcbb7a3f41..2f10e53bcf 100644 --- a/basex-core/src/main/resources/lang/French.lang +++ b/basex-core/src/main/resources/lang/French.lang @@ -417,7 +417,7 @@ h_db_format = Le format de base de données a changé ; Veuillez créer h_db_options_% = Les options seront assignées si on exécute '%'. h_diacritics = Les signes diacritiques sont conservés dans l’index. h_fulltext_index = Un index plein texte accélère les requêtes plein texte. -h_html_parser = Le parser TagSoup sera utilisé pour convertir le HTML en XML. +h_html_parser = Le parser HTML Validator.nu sera utilisé pour convertir le HTML en XML. h_index_format = Le format des index a changé ; Veuillez créer de nouveaux index. h_int_parser = Tolérant aux fautes, et plus rapide que le parser Java par défaut. 
h_languauge = Des analyseurs spécifiques à la langue vont être utilisés. diff --git a/basex-core/src/main/resources/lang/German.lang b/basex-core/src/main/resources/lang/German.lang index 8a674e17c1..a4056c130d 100644 --- a/basex-core/src/main/resources/lang/German.lang +++ b/basex-core/src/main/resources/lang/German.lang @@ -417,7 +417,7 @@ h_db_format = Das Datenbankformat hat sich geändert; bitte verwenden S h_db_options_% = Die Optionen werden zugewiesen, wenn '%' ausgeführt wird. h_diacritics = Diakritische Zeichen werden im Index beibehalten. h_fulltext_index = Ein Volltext-Index beschleunigt Volltext-Anfragen. -h_html_parser = Der TagSoup-Parser wird verwendet, um HTML in XML zu konvertieren. +h_html_parser = Der Validator.nu HTML-Parser wird verwendet, um HTML in XML zu konvertieren. h_index_format = Das Indexformat hat sich geändert; bitte erstellen Sie neue Indizes. h_int_parser = Fehlertolerant und schneller als Javas XML-Parser. h_languauge = Sprachspezifische Tokenisierung wird verwendet. diff --git a/basex-core/src/main/resources/lang/Hungarian.lang b/basex-core/src/main/resources/lang/Hungarian.lang index bfe804283e..41a1c781c5 100644 --- a/basex-core/src/main/resources/lang/Hungarian.lang +++ b/basex-core/src/main/resources/lang/Hungarian.lang @@ -417,7 +417,7 @@ h_db_format = Az adatbázis formátuma megváltozott; kérem, használj h_db_options_% = Ezek az beállítások csak a következő futtása után lépnek életbe: '%' h_diacritics = Ékezetek megmaradnak az indexelésben. h_fulltext_index = A teljes-szöveg index gyorsítja a teljes-szöveges (full-text) lekérdezéseket. -h_html_parser = A TagSoup elemző HTML formátumot konvertál XML formátumra. +h_html_parser = A Validator.nu elemző HTML formátumot konvertál XML formátumra. h_index_format = Az index formátuma megváltozott; kérem, készítsen új indexeket. h_int_parser = Hibatűrő, továbbá a Java alapértelmezett elemzőjénél gyorsabb. h_languauge = Nyelvfüggő szövegelemzések is használatra kerülnek. 
diff --git a/basex-core/src/main/resources/lang/Indonesian.lang b/basex-core/src/main/resources/lang/Indonesian.lang index c23c842fa2..096d98f267 100644 --- a/basex-core/src/main/resources/lang/Indonesian.lang +++ b/basex-core/src/main/resources/lang/Indonesian.lang @@ -417,7 +417,7 @@ h_db_format = Bentuk basisdata telah berubah; mohon gunakan versi yang h_db_options_% = Pilihan akan digunakan jika '%' dijalankan. h_diacritics = Diakritik dipertahankan dalam indeks. h_fulltext_index = Indeks semua teks mempercepat kueri teks penuh. -h_html_parser = Pengurai TagSoup akan digunakan untuk mengubah HTML menjadi XML. +h_html_parser = Pengurai Validator.nu akan digunakan untuk mengubah HTML menjadi XML. h_index_format = Bentuk indeks telah berubah; mohon buat indeks baru. h_int_parser = Toleran kesalahan, dan lebih cepat dari pengurai standar Java. h_languauge = Pengurai teks bahasa tertentu akan digunakan. diff --git a/basex-core/src/main/resources/lang/Italian.lang b/basex-core/src/main/resources/lang/Italian.lang index f95804268d..dcb10fb853 100644 --- a/basex-core/src/main/resources/lang/Italian.lang +++ b/basex-core/src/main/resources/lang/Italian.lang @@ -417,7 +417,7 @@ h_db_format = Il formato della base di dati è cambiato; creare una nuo h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = I segni diacritici sono conservati nell'indice. h_fulltext_index = Un indice "full-text" velocizza le interrogazioni sul testo. -h_html_parser = Il parser TagSoup verrò usato per convertire HTML in XML. +h_html_parser = Il parser Validator.nu verrò usato per convertire HTML in XML. h_index_format = Il formato degli indici è cambiato; creare nuovi indici. h_int_parser = Tollerante ai guasti e più veloce del parser di default di Java. 
h_languauge = Parser di testo specifici per la lingua verranno usati diff --git a/basex-core/src/main/resources/lang/Japanese.lang b/basex-core/src/main/resources/lang/Japanese.lang index e12f3734f7..1aa797d681 100644 --- a/basex-core/src/main/resources/lang/Japanese.lang +++ b/basex-core/src/main/resources/lang/Japanese.lang @@ -417,7 +417,7 @@ h_db_format = データベース形式を変更しました。新しい h_db_options_% = % 実行時にオプションが割り当てられます。 h_diacritics = インデックス内で付加記号(ウムラウト等)は保持されます。 h_fulltext_index = 全文テキストインデックスは全文検索を高速化します。 -h_html_parser = TagSoup パーサは HTML を XML に変換します。 +h_html_parser = Validator.nu パーサは HTML を XML に変換します。 h_index_format = インデックス形式を変更しました。新しくインデックスを作成して下さい。 h_int_parser = フォールトトレラント、Javaのデフォルトパーサより高速。 h_languauge = 指定された言語のテキストパーサが使用されます。 diff --git a/basex-core/src/main/resources/lang/Mongolian.lang b/basex-core/src/main/resources/lang/Mongolian.lang index 5834f92c48..995f0d178f 100644 --- a/basex-core/src/main/resources/lang/Mongolian.lang +++ b/basex-core/src/main/resources/lang/Mongolian.lang @@ -417,7 +417,7 @@ h_db_format = Өгөгдлийн сангийн формат өөрчл h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = Индекс дэх санах тэмдгийг авч үлдэх. h_fulltext_index = Бүтэн текст индекс нь бүрэн текст квериг хурдан ажиллагаатай болгоно. -h_html_parser = The TagSoup parser will be used to convert HTML to XML. +h_html_parser = The Validator.nu HTML parser will be used to convert HTML to XML. h_index_format = Индекс формат өөрчлөгдсөн байна; шинээр үүсгэнэ үү. h_int_parser = Fault tolerant, and faster than Java’s default parser. h_languauge = Хэлний текст Parser тодорхойлогдох болно. 
diff --git a/basex-core/src/main/resources/lang/Romanian.lang b/basex-core/src/main/resources/lang/Romanian.lang index 7903adde32..5cae6261d3 100644 --- a/basex-core/src/main/resources/lang/Romanian.lang +++ b/basex-core/src/main/resources/lang/Romanian.lang @@ -417,7 +417,7 @@ h_db_format = Formatul bazei de date a fost schimbat, vă rugăm să fo h_db_options_% = Optiunile vor fi asignate daca '%' este executată. h_diacritics = Diacritice sunt păstrate în index. h_fulltext_index = Un full-text index accelereaza interogările full-text. -h_html_parser = Parserul TagSoup va fi folosit pentru a converti HTML în XML. +h_html_parser = Parserul "Validator.nu" va fi folosit pentru a converti HTML în XML. h_index_format = Formatul index s-a schimbat, vă rugăm creati noi indici. h_int_parser = Tolerant la greseli si mai rapid decat parserul default Java. h_languauge = Parsere de text specifice limbii vor fi folosite. diff --git a/basex-core/src/main/resources/lang/Russian.lang b/basex-core/src/main/resources/lang/Russian.lang index 01be29bb23..d520598dbd 100644 --- a/basex-core/src/main/resources/lang/Russian.lang +++ b/basex-core/src/main/resources/lang/Russian.lang @@ -417,7 +417,7 @@ h_db_format = Формат хранения баз данных был h_db_options_% = Эти опции будут изменены только после выполнения команды [%] h_diacritics = Разделительные знаки будут включены в индекс h_fulltext_index = Полнотекстовый индекс ускоряет соответствующие запросы -h_html_parser = Для конвертации HTML в XML будет использован парсер TagSoup +h_html_parser = Для конвертации HTML в XML будет использован парсер Validator.nu h_index_format = Формат хранения индексов был изменен. Пожалуйста, создайте индексы заново. 
h_int_parser = Толерантный к ошибкам и быстрее чем стандартный парсер Java h_languauge = Будут использованы специализированные под каждый язык парсеры diff --git a/basex-core/src/main/resources/lang/Spanish.lang b/basex-core/src/main/resources/lang/Spanish.lang index 61439dc026..1d10d504bd 100644 --- a/basex-core/src/main/resources/lang/Spanish.lang +++ b/basex-core/src/main/resources/lang/Spanish.lang @@ -417,7 +417,7 @@ h_db_format = El formato de la Base de Datos ha cambiado; por favor uti h_db_options_% = Las opciónes serán asignado si se ejecuta '%'. h_diacritics = Las diacríticas están retenidas en el índice. h_fulltext_index = Un índice de Texto Completo acelera las consulta de Texto Completo. -h_html_parser = Se utilizará el Analizador Sintáctico TagSoup para convertir HTML a XML. +h_html_parser = Se utilizará el Analizador Sintáctico Validator.nu para convertir HTML a XML. h_index_format = El formato del índice ha cambiado; for favor, cree nuevos índices. h_int_parser = Tolerante a fallos, y más rápido que el analizador sintáctico por defecto de Java. h_languauge = Se utilizarán analizadores sintácticos de texto específicos del lenguaje. diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java index 49e1539d5f..f21ec21461 100644 --- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java +++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java @@ -1468,8 +1468,8 @@ public final class FnModuleTest extends SandboxTest { error(func.args(42), STRBIN_X_X); error(func.args(" \"42\"", 42), MAP_X_X); - error(func.args(" \"42\"", " map {'1234': ()}"), INVALIDOPT_X); - error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVALIDOPT_X); + error(func.args(" \"42\"", " map {'1234': ()}"), INVHTMLOPT_X); + error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVHTMLOPT_X); } /** Test method. 
*/ diff --git a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java index 59636312bf..e85fedcf5a 100644 --- a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java +++ b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java @@ -20,8 +20,7 @@ public final class HtmlModuleTest extends SandboxTest { query(func.args(" <_/>/text()"), ""); final String path = "src/test/resources/input.html"; - query(func.args(path) + "//body ! name()", "body"); - query(func.args(path, " map { 'nons': false() }") + "//*:body ! name()", "body"); + query(func.args(path) + "//*:body ! name()", "body"); } /** Test method. */ @@ -33,7 +32,8 @@ public final class HtmlModuleTest extends SandboxTest { // check if the function returns an HTML root node query("exists(" + func.args("<html/>") + "/*:html)", true); // check if the function returns - query(func.args("<html/>", " map { 'nons': true() }"), ""); + query(func.args("<html/>"), + ""); } /** Test method. 
*/ diff --git a/basex-examples/basex-examples.iml b/basex-examples/basex-examples.iml index d68f7b229d..c8c6019d38 100644 --- a/basex-examples/basex-examples.iml +++ b/basex-examples/basex-examples.iml @@ -30,7 +30,7 @@ - + diff --git a/basex-examples/pom.xml b/basex-examples/pom.xml index 2e4dbd2db5..c350579a51 100644 --- a/basex-examples/pom.xml +++ b/basex-examples/pom.xml @@ -18,8 +18,8 @@ ${project.version} - org.ccil.cowan.tagsoup - tagsoup + nu.validator + htmlparser org.junit.jupiter diff --git a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java index 3481bbcc26..40b96ffc23 100644 --- a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java +++ b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java @@ -6,7 +6,7 @@ /** * This example demonstrates how to import a file in the HTML format * into the database. The specified input file will be converted to XML - * if TagSoup is found in the classpath. + * if Validator.nu is found in the classpath. 
* * @author BaseX Team 2005-23, BSD License * @author Christian Gruen diff --git a/pom.xml b/pom.xml index 3d09d55fcb..24f5d352c1 100644 --- a/pom.xml +++ b/pom.xml @@ -64,9 +64,9 @@ true - org.ccil.cowan.tagsoup - tagsoup - 1.2.1 + nu.validator + htmlparser + 1.4.16 runtime true @@ -174,13 +174,6 @@ runtime true - - nu.validator - htmlparser - 1.4.16 - runtime - true - From e99d48b0e097f8162bdb6e4fd38fffe36053322d Mon Sep 17 00:00:00 2001 From: Gunther Rademacher Date: Mon, 13 Nov 2023 12:41:32 +0100 Subject: [PATCH 5/9] set scope=compile; handle dependencies of "heuristics" setting; test meta/@charset --- basex-core/pom.xml | 2 +- .../java/org/basex/query/func/Function.java | 2 +- .../org/basex/query/func/html/HtmlParse.java | 32 +++++++++++++++++++ .../org/basex/query/func/FnModuleTest.java | 18 ++++++----- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/basex-core/pom.xml b/basex-core/pom.xml index f3c6d14885..33a2503167 100644 --- a/basex-core/pom.xml +++ b/basex-core/pom.xml @@ -32,7 +32,7 @@ nu.validator htmlparser - provided + compile true diff --git a/basex-core/src/main/java/org/basex/query/func/Function.java b/basex-core/src/main/java/org/basex/query/func/Function.java index c58cbe4b6a..8392399bf7 100644 --- a/basex-core/src/main/java/org/basex/query/func/Function.java +++ b/basex-core/src/main/java/org/basex/query/func/Function.java @@ -468,7 +468,7 @@ ITEM_ZM, flag(HOF)), params(STRING_ZO), DATE_TIME_ZO), /** XQuery function. */ PARSE_HTML(FnParseHtml::new, "parse-html(html[,options])", - params(ITEM_ZO, MAP_O), DOCUMENT_NODE_ZO), + params(ANY_ATOMIC_TYPE_ZO, MAP_O), DOCUMENT_NODE_ZO), /** XQuery function. 
*/ PARSE_INTEGER(FnParseInteger::new, "parse-integer(value[,radix])", params(STRING_O, INTEGER_O), INTEGER_O), diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java index 576fc82b4d..c7cf9d24af 100644 --- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java +++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java @@ -1,5 +1,6 @@ package org.basex.query.func.html; +import static org.basex.build.html.HtmlOptions.*; import static org.basex.query.QueryError.*; import java.io.*; @@ -22,6 +23,12 @@ * @author Christian Gruen */ public class HtmlParse extends StandardFunc { + /** Class needed for heuristics=ICU. */ + private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector"; + /** Class needed for heuristics=CHARDET. */ + private static final String CHARDET_CLASS_NAME = + "org.mozilla.intl.chardet.nsICharsetDetectionObserver"; + @Override public Item item(final QueryContext qc, final InputInfo ii) throws QueryException { final Item value = arg(0).atomItem(qc, info); @@ -45,10 +52,35 @@ protected final Expr opt(final CompileContext cc) { */ protected final Item parse(final IO io, final QueryContext qc) throws QueryException { final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), INVHTMLOPT_X, qc); + if(options.contains(HEURISTICS)) { + switch (options.get(HEURISTICS)) { + case ALL: + ensureAvailable(ICU_CLASS_NAME); + ensureAvailable(CHARDET_CLASS_NAME); + break; + case ICU: + ensureAvailable(ICU_CLASS_NAME); + break; + case CHARDET: + ensureAvailable(CHARDET_CLASS_NAME); + break; + default: + } + } try { return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), options)); } catch(final IOException ex) { throw INVHTML_X.get(info, ex); } } + + /** + * Ensure that a required class is available on the class path. 
+ * @param className the class name + * @throws QueryException query exception, + */ + private void ensureAvailable(final String className) throws QueryException { + if(!Reflect.available(className)) + throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className); + } } diff --git a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java index e4eaf33975..44b7e3b8bb 100644 --- a/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java +++ b/basex-core/src/test/java/org/basex/query/func/FnModuleTest.java @@ -1480,16 +1480,18 @@ public final class FnModuleTest extends SandboxTest { " map {'encoding': '" + Strings.UTF16LE + "', 'xml-policy': 'ALTER_INFOSET'}"), "42"); - query(func.args(_CONVERT_STRING_TO_BASE64.args("42", Strings.UTF16BE), - " map {'encoding': '" + Strings.UTF16BE + "', 'heuristics': 'NONE'}"), - "42"); + query(func.args(_CONVERT_STRING_TO_BASE64.args("" + + "\u20AC", "ISO-8859-7"), " map {'heuristics': 'NONE'}"), + "" + + "\u20AC"); error(func.args(42), STRBIN_X_X); - error(func.args(" \"42\"", 42), MAP_X_X); - error(func.args(" \"42\"", " map {'1234': ()}"), INVHTMLOPT_X); - error(func.args(" \"42\"", " map {'heuristics': '5678'}"), INVHTMLOPT_X); + error(func.args("42", 42), MAP_X_X); + error(func.args("42", " map {'1234': ''}"), INVHTMLOPT_X); + error(func.args("42", " map {'heuristics': '5678'}"), INVHTMLOPT_X); + error(func.args("42", " map {'heuristics': 'CHARDET'}"), BASEX_CLASSPATH_X_X); + error(func.args("42", " map {'heuristics': 'ICU'}"), BASEX_CLASSPATH_X_X); + error(func.args("42", " map {'heuristics': 'ALL'}"), BASEX_CLASSPATH_X_X); } /** Test method. 
*/ From 7dd830e38f6b5b7a601943ecb1196d343d0b7878 Mon Sep 17 00:00:00 2001 From: Gunther Rademacher Date: Mon, 13 Nov 2023 13:04:00 +0100 Subject: [PATCH 6/9] minor changes --- .../org/basex/build/html/HtmlOptions.java | 30 +++++++++---------- .../java/org/basex/build/html/HtmlParser.java | 7 ++--- .../org/basex/query/func/html/HtmlParse.java | 4 +-- basex-core/src/main/resources/lang/Dutch.lang | 2 +- .../src/main/resources/lang/English.lang | 2 +- .../src/main/resources/lang/French.lang | 2 +- .../src/main/resources/lang/German.lang | 2 +- .../src/main/resources/lang/Mongolian.lang | 2 +- 8 files changed, 25 insertions(+), 26 deletions(-) diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java index a6ce8f6cc9..311a854066 100644 --- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java +++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java @@ -9,47 +9,47 @@ * @author Christian Gruen */ public final class HtmlOptions extends Options { - /** fn:parse-html option encoding. */ + /** fn:parse-html option: encoding. */ public static final StringOption ENCODING = new StringOption("encoding"); - /** fn:parse-html option method. */ + /** fn:parse-html option: method. */ public static final StringOption METHOD = new StringOption("method"); - /** fn:parse-html option html-version. */ + /** fn:parse-html option: html-version. */ public static final StringOption HTML_VERSION = new StringOption("html-version"); - /** fn:parse-html option include-template-content. */ + /** fn:parse-html option: include-template-content. */ public static final BooleanOption INCLUDE_TEMPLATE_CONTENT = new BooleanOption("include-template-content"); - /** Validator.nu option unicode-normalization-checking. */ + /** Validator.nu option: unicode-normalization-checking. 
*/ public static final BooleanOption UNICODE_NORMALIZATION_CHECKING = new BooleanOption("unicode-normalization-checking", false); - /** Validator.nu option mapping-lang-to-xml-lang. */ + /** Validator.nu option: mapping-lang-to-xml-lang. */ public static final BooleanOption MAPPING_LANG_TO_XML_LANG = new BooleanOption("mapping-lang-to-xml-lang", false); - /** Validator.nu option scripting-enabled. */ + /** Validator.nu option: scripting-enabled. */ public static final BooleanOption SCRIPTING_ENABLED = new BooleanOption("scripting-enabled", false); - /** Validator.nu option content-space-policy. */ + /** Validator.nu option: content-space-policy. */ public static final EnumOption CONTENT_SPACE_POLICY = new EnumOption<>("content-space-policy", XmlViolationPolicy.class); - /** Validator.nu option content-non-xml-char-policy. */ + /** Validator.nu option: content-non-xml-char-policy. */ public static final EnumOption CONTENT_NON_XML_CHAR_POLICY = new EnumOption<>("content-non-xml-char-policy", XmlViolationPolicy.class); - /** Validator.nu option comment-policy. */ + /** Validator.nu option: comment-policy. */ public static final EnumOption COMMENT_POLICY = new EnumOption<>("comment-policy", XmlViolationPolicy.class); - /** Validator.nu option xmlns-policy. */ + /** Validator.nu option: xmlns-policy. */ public static final EnumOption XMLNS_POLICY = new EnumOption<>("xmlns-policy", XmlViolationPolicy.class); - /** Validator.nu option name-policy. */ + /** Validator.nu option: name-policy. */ public static final EnumOption NAME_POLICY = new EnumOption<>("name-policy", XmlViolationPolicy.class); - /** Validator.nu option streamability-violation-policy. */ + /** Validator.nu option: streamability-violation-policy. */ public static final EnumOption STREAMABILITY_VIOLATION_POLICY = new EnumOption<>("streamability-violation-policy", XmlViolationPolicy.class); - /** Validator.nu option xml-policy. */ + /** Validator.nu option: xml-policy. 
*/ public static final EnumOption XML_POLICY = new EnumOption<>("xml-policy", XmlViolationPolicy.class); - /** Validator.nu option heuristics. */ + /** Validator.nu option: heuristics. */ public static final EnumOption HEURISTICS = new EnumOption<>("heuristics", Heuristics.class); diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java index 773defb9db..8fda826216 100644 --- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java +++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java @@ -104,9 +104,6 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException reader.setContentHandler(writer); reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer); - // define input - final InputSource is = new InputSource(io.inputStream()); - // set Validator.nu options if(hopts.get(UNICODE_NORMALIZATION_CHECKING)) reader.setCheckingNormalization(true); @@ -135,11 +132,13 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name())); // end Validator.nu options + // define input + final InputSource is = new InputSource(io.inputStream()); String enc = io.encoding() != null ? io.encoding() : hopts.contains(ENCODING) ? 
hopts.get(HtmlOptions.ENCODING) - : null; // TODO: sniff encoding + : null; if (enc != null) { if (!Strings.supported(enc)) throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.'); diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java index c7cf9d24af..41bbf4e1e1 100644 --- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java +++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java @@ -23,9 +23,9 @@ * @author Christian Gruen */ public class HtmlParse extends StandardFunc { - /** Class needed for heuristics=ICU. */ + /** Class needed for option heuristics=ICU. */ private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector"; - /** Class needed for heuristics=CHARDET. */ + /** Class needed for option heuristics=CHARDET. */ private static final String CHARDET_CLASS_NAME = "org.mozilla.intl.chardet.nsICharsetDetectionObserver"; diff --git a/basex-core/src/main/resources/lang/Dutch.lang b/basex-core/src/main/resources/lang/Dutch.lang index d507ea781f..b438ac5864 100644 --- a/basex-core/src/main/resources/lang/Dutch.lang +++ b/basex-core/src/main/resources/lang/Dutch.lang @@ -417,7 +417,7 @@ h_db_format = Het database formaat is gewijzigd; maak een nieuwe databa h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = Diakritische tekens worden gebruikt in de index. h_fulltext_index = Een full-text index versnelt full-text queries. -h_html_parser = De Validator.nu HTML parser zal gebruikt worden om HTML naar XML te converteren. +h_html_parser = De Validator.nu parser zal gebruikt worden om HTML naar XML te converteren. h_index_format = Het index formaat is gewijzigd; maak nieuwe indexen. h_int_parser = Robuuster en sneller dan Java's standaard parser. h_languauge = Met deze optie zullen taalspecifieke parsers worden gebruikt. 
diff --git a/basex-core/src/main/resources/lang/English.lang b/basex-core/src/main/resources/lang/English.lang index 5cc46cc7ed..9cc5ca7d20 100644 --- a/basex-core/src/main/resources/lang/English.lang +++ b/basex-core/src/main/resources/lang/English.lang @@ -417,7 +417,7 @@ h_db_format = The database format has changed; please use a newer versi h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = Diacritics are retained in the index. h_fulltext_index = A full-text index speeds up full-text queries. -h_html_parser = The Validator.nu HTML parser will be used to convert HTML to XML. +h_html_parser = The Validator.nu parser will be used to convert HTML to XML. h_index_format = The index format has changed; please create new indexes. h_int_parser = Fault tolerant, and faster than Java’s default parser. h_languauge = Language specific tokenizers will be used. diff --git a/basex-core/src/main/resources/lang/French.lang b/basex-core/src/main/resources/lang/French.lang index 2f10e53bcf..7fd856822b 100644 --- a/basex-core/src/main/resources/lang/French.lang +++ b/basex-core/src/main/resources/lang/French.lang @@ -417,7 +417,7 @@ h_db_format = Le format de base de données a changé ; Veuillez créer h_db_options_% = Les options seront assignées si on exécute '%'. h_diacritics = Les signes diacritiques sont conservés dans l’index. h_fulltext_index = Un index plein texte accélère les requêtes plein texte. -h_html_parser = Le parser HTML Validator.nu sera utilisé pour convertir le HTML en XML. +h_html_parser = Le parser Validator.nu sera utilisé pour convertir le HTML en XML. h_index_format = Le format des index a changé ; Veuillez créer de nouveaux index. h_int_parser = Tolérant aux fautes, et plus rapide que le parser Java par défaut. h_languauge = Des analyseurs spécifiques à la langue vont être utilisés. 
diff --git a/basex-core/src/main/resources/lang/German.lang b/basex-core/src/main/resources/lang/German.lang index a4056c130d..1226bac232 100644 --- a/basex-core/src/main/resources/lang/German.lang +++ b/basex-core/src/main/resources/lang/German.lang @@ -417,7 +417,7 @@ h_db_format = Das Datenbankformat hat sich geändert; bitte verwenden S h_db_options_% = Die Optionen werden zugewiesen, wenn '%' ausgeführt wird. h_diacritics = Diakritische Zeichen werden im Index beibehalten. h_fulltext_index = Ein Volltext-Index beschleunigt Volltext-Anfragen. -h_html_parser = Der Validator.nu HTML-Parser wird verwendet, um HTML in XML zu konvertieren. +h_html_parser = Der Validator.nu-Parser wird verwendet, um HTML in XML zu konvertieren. h_index_format = Das Indexformat hat sich geändert; bitte erstellen Sie neue Indizes. h_int_parser = Fehlertolerant und schneller als Javas XML-Parser. h_languauge = Sprachspezifische Tokenisierung wird verwendet. diff --git a/basex-core/src/main/resources/lang/Mongolian.lang b/basex-core/src/main/resources/lang/Mongolian.lang index 995f0d178f..bbc2140b6e 100644 --- a/basex-core/src/main/resources/lang/Mongolian.lang +++ b/basex-core/src/main/resources/lang/Mongolian.lang @@ -417,7 +417,7 @@ h_db_format = Өгөгдлийн сангийн формат өөрчл h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = Индекс дэх санах тэмдгийг авч үлдэх. h_fulltext_index = Бүтэн текст индекс нь бүрэн текст квериг хурдан ажиллагаатай болгоно. -h_html_parser = The Validator.nu HTML parser will be used to convert HTML to XML. +h_html_parser = The Validator.nu parser will be used to convert HTML to XML. h_index_format = Индекс формат өөрчлөгдсөн байна; шинээр үүсгэнэ үү. h_int_parser = Fault tolerant, and faster than Java’s default parser. h_languauge = Хэлний текст Parser тодорхойлогдох болно. 
From ff514e1d2f5ead11d913f23853ad9f5e9e74b4ad Mon Sep 17 00:00:00 2001 From: Gunther Rademacher Date: Mon, 13 Nov 2023 13:23:53 +0100 Subject: [PATCH 7/9] very minor change --- basex-core/src/main/java/org/basex/build/html/HtmlParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java index 8fda826216..5877ce1498 100644 --- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java +++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java @@ -33,7 +33,7 @@ public final class HtmlParser extends XMLParser { private static final String NAME = "Validator.nu"; /** - * Checks if a Validator.nu is available. + * Checks if Validator.nu is available. * @return result of check */ public static boolean available() { From 44f5d131e204e0a63ad1c86a584366745dea1306 Mon Sep 17 00:00:00 2001 From: Gunther Rademacher Date: Wed, 22 Jan 2025 18:35:01 +0100 Subject: [PATCH 8/9] support both TagSoup and Validator.NU --- basex-core/pom.xml | 6 + .../org/basex/build/html/HtmlOptions.java | 41 +- .../java/org/basex/build/html/HtmlParser.java | 362 ++++++++++++++---- .../src/main/java/org/basex/core/Text.java | 2 +- .../basex/gui/dialog/DialogHtmlParser.java | 10 +- .../basex/query/func/html/FnParseHtml.java | 6 +- .../org/basex/query/func/html/HtmlDoc.java | 3 +- .../org/basex/query/func/html/HtmlParse.java | 61 ++- .../org/basex/query/func/html/HtmlParser.java | 6 +- .../src/main/resources/lang/Chinese.lang | 2 +- basex-core/src/main/resources/lang/Dutch.lang | 2 +- .../src/main/resources/lang/English.lang | 2 +- .../src/main/resources/lang/French.lang | 2 +- .../src/main/resources/lang/German.lang | 2 +- .../src/main/resources/lang/Hungarian.lang | 2 +- .../src/main/resources/lang/Indonesian.lang | 2 +- .../src/main/resources/lang/Italian.lang | 2 +- .../src/main/resources/lang/Japanese.lang | 2 +- 
.../src/main/resources/lang/Mongolian.lang | 2 +- .../src/main/resources/lang/Romanian.lang | 2 +- .../src/main/resources/lang/Russian.lang | 2 +- .../src/main/resources/lang/Spanish.lang | 2 +- .../org/basex/query/func/HtmlModuleTest.java | 8 +- basex-examples/basex-examples.iml | 2 +- basex-examples/pom.xml | 4 +- .../basex/examples/create/HTMLExample.java | 2 +- pom.xml | 7 + 27 files changed, 402 insertions(+), 144 deletions(-) diff --git a/basex-core/pom.xml b/basex-core/pom.xml index ee62cdd08e..79b46a0dc5 100644 --- a/basex-core/pom.xml +++ b/basex-core/pom.xml @@ -29,6 +29,12 @@ lucene-stemmers true + + org.ccil.cowan.tagsoup + tagsoup + compile + true + nu.validator htmlparser diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java index e1271747d4..7eebede2ce 100644 --- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java +++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java @@ -9,10 +9,45 @@ * @author Christian Gruen */ public final class HtmlOptions extends Options { - /** fn:parse-html option: encoding. */ + /** TagSoup option: html. */ + public static final BooleanOption HTML = new BooleanOption("html", false); + /** TagSoup option: omit-xml-declaration. */ + public static final BooleanOption OMIT_XML_DECLARATION = + new BooleanOption("omit-xml-declaration", false); + /** TagSoup option: nons. */ + public static final BooleanOption NONS = new BooleanOption("nons", true); + /** TagSoup option: nobogons. */ + public static final BooleanOption NOBOGONS = new BooleanOption("nobogons", false); + /** TagSoup option: nodefaults. */ + public static final BooleanOption NODEFAULTS = new BooleanOption("nodefaults", false); + /** TagSoup option: nocolons. */ + public static final BooleanOption NOCOLONS = new BooleanOption("nocolons", false); + /** TagSoup option: norestart. 
*/ + public static final BooleanOption NORESTART = new BooleanOption("norestart", false); + /** TagSoup option: ignorable. */ + public static final BooleanOption IGNORABLE = new BooleanOption("ignorable", false); + /** TagSoup option: emptybogons. */ + public static final BooleanOption EMPTYBOGONS = new BooleanOption("emptybogons", false); + /** TagSoup option: any. */ + public static final BooleanOption ANY = new BooleanOption("any", false); + /** TagSoup option: norootbogons. */ + public static final BooleanOption NOROOTBOGONS = new BooleanOption("norootbogons", false); + /** TagSoup option: nocdata. */ + public static final BooleanOption NOCDATA = new BooleanOption("nocdata", false); + /** TagSoup option: lexical. */ + public static final BooleanOption LEXICAL = new BooleanOption("lexical", false); + + /** TagSoup option: doctype-system=systemid. */ + public static final StringOption DOCTYPE_SYSTEM = new StringOption("doctype-system"); + /** TagSoup option: doctype-public=publicid. */ + public static final StringOption DOCTYPE_PUBLIC = new StringOption("doctype-public"); + + /** Common option: encoding. */ public static final StringOption ENCODING = new StringOption("encoding"); - /** fn:parse-html option: method. */ - public static final StringOption METHOD = new StringOption("method"); + /** Common option: method. */ + public static final EnumOption METHOD = new EnumOption<>("method", + HtmlParser.Method.class); + /** fn:parse-html option: html-version. */ public static final StringOption HTML_VERSION = new StringOption("html-version"); /** fn:parse-html option: include-template-content. 
*/ diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java index 558cf1b064..3083a5519b 100644 --- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java +++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java @@ -1,16 +1,18 @@ package org.basex.build.html; import static org.basex.build.html.HtmlOptions.*; +import static org.basex.build.html.HtmlOptions.NOCDATA; import static org.basex.query.QueryError.*; import static org.basex.util.Token.*; import java.io.*; -import java.util.*; import org.basex.build.xml.*; import org.basex.core.*; import org.basex.io.*; +import org.basex.query.*; import org.basex.util.*; +import org.ccil.cowan.tagsoup.*; import org.xml.sax.*; import nu.validator.htmlparser.common.Heuristics; @@ -29,109 +31,57 @@ * @author Christian Gruen */ public final class HtmlParser extends XMLParser { - /** Name of HTML Parser. */ - private static final String NAME = "Validator.nu"; - - /** - * Checks if Validator.nu is available. - * @return result of check - */ - public static boolean available() { - return firstUnavailableClass() == null; - } - /** - * Check whether Validator.nu classes are available on the class path. - * @return the name of the first class that is not available, or null if all classes are available - */ - public static String firstUnavailableClass() { - for(final String className : Arrays.asList("nu.validator.htmlparser.sax.HtmlParser", - "nu.validator.htmlparser.sax.XmlSerializer", - "nu.validator.htmlparser.common.XmlViolationPolicy", - "nu.validator.htmlparser.common.Heuristics")) { - if(!Reflect.available(className)) return className; - } - return null; - } - - /** - * Returns the name of the parser, or an empty string. - * @return name of parser + * Constructor. 
+ * @param source document source + * @param options main options + * @param hopts html options + * @throws IOException I/O exception */ - public static String parser() { - return available() ? NAME : ""; + public HtmlParser(final IO source, final MainOptions options, final HtmlOptions hopts) + throws IOException { + this(source, Parser.of(hopts), options, hopts); } /** * Constructor. * @param source document source + * @param parser parser to be used * @param options main options * @param hopts html options * @throws IOException I/O exception */ - public HtmlParser(final IO source, final MainOptions options, final HtmlOptions hopts) - throws IOException { - super(toXml(source, hopts), options); + public HtmlParser(final IO source, final Parser parser, final MainOptions options, + final HtmlOptions hopts) throws IOException { + super(toXml(source, parser, hopts), options); } /** * Converts an HTML document to XML. * @param io io reference + * @param parser parser to be used * @param hopts html options * @return parser * @throws IOException I/O exception */ - private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException { + private static IO toXml(final IO io, final Parser parser, final HtmlOptions hopts) + throws IOException { // reader could not be initialized; fall back to XML - if(!available()) return io; - + if(!parser.available(hopts)) return io; try { // define output final StringWriter sw = new StringWriter(); - final nu.validator.htmlparser.sax.HtmlParser reader = - new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET); - final ContentHandler writer = new XmlSerializer(sw); - reader.setContentHandler(writer); - reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer); - - // set Validator.nu options - if(hopts.get(UNICODE_NORMALIZATION_CHECKING)) - reader.setCheckingNormalization(true); - if(hopts.get(MAPPING_LANG_TO_XML_LANG)) - reader.setMappingLangToXmlLang(true); - 
if(hopts.get(SCRIPTING_ENABLED)) - reader.setScriptingEnabled(true); - if(hopts.contains(CONTENT_SPACE_POLICY)) - reader.setContentSpacePolicy( - XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name())); - if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY)) - reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf( - hopts.get(CONTENT_NON_XML_CHAR_POLICY).name())); - if(hopts.contains(COMMENT_POLICY)) - reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name())); - if(hopts.contains(XMLNS_POLICY)) - reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name())); - if(hopts.contains(NAME_POLICY)) - reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name())); - if(hopts.contains(STREAMABILITY_VIOLATION_POLICY)) - reader.setStreamabilityViolationPolicy( - XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name())); - if(hopts.contains(XML_POLICY)) - reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name())); - if(hopts.contains(HEURISTICS)) - reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name())); - // end Validator.nu options + final XMLReader reader = parser.reader(hopts, sw); // define input final InputSource is = new InputSource(io.inputStream()); - String enc = io.encoding() != null + final String enc = io.encoding() != null ? io.encoding() : hopts.contains(ENCODING) ? hopts.get(HtmlOptions.ENCODING) : null; if (enc != null) { - if (!Strings.supported(enc)) - throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.'); + if(!Strings.supported(enc)) throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.'); is.setEncoding(Strings.normEncoding(enc)); } @@ -143,4 +93,272 @@ private static IO toXml(final IO io, final HtmlOptions hopts) throws IOException throw INVHTML_X.getIO(ex.getLocalizedMessage()); } } + + /** Method option values. */ + public enum Method { + /** TagSoup parser with method 'xml'. 
*/ + xml(Parser.TAGSOUP), + /** TagSoup parser with method 'html'. */ + html(Parser.TAGSOUP), + /** Validator.nu parser. */ + nu(Parser.NU); + + /** Parser associated with this method. */ + public final Parser parser; + + /** + * Constructor. + * @param parser parser associated with this method + */ + Method(final Parser parser) { + this.parser = parser; + } + } + + /** Parser type. */ + public enum Parser { + /** TagSoup parser. */ + TAGSOUP("TagSoup", "org.ccil.cowan.tagsoup.Parser") { + + /** TagSoup URL. */ + private static final String FEATURES = "http://www.ccil.org/~cowan/tagsoup/features/"; + + @Override + public boolean fallbackToXml() { + return true; + } + + @Override + XMLReader reader(final HtmlOptions hopts, final StringWriter sw) throws SAXException { + XMLReader reader = new org.ccil.cowan.tagsoup.Parser(); + final XMLWriter writer = new XMLWriter(sw); + writer.setOutputProperty(ENCODING.name(), Strings.UTF8); + reader.setContentHandler(writer); + + // set TagSoup options + if(hopts.get(HTML)) { + reader.setFeature("http://xml.org/sax/features/namespaces", false); + writer.setOutputProperty(METHOD.name(), "html"); + writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes"); + } + if(hopts.get(NONS)) + reader.setFeature("http://xml.org/sax/features/namespaces", false); + if(hopts.get(NOBOGONS)) + reader.setFeature(FEATURES + "ignore-bogons", true); + if(hopts.get(NODEFAULTS)) + reader.setFeature(FEATURES + "default-attributes", false); + if(hopts.get(NOCOLONS)) + reader.setFeature(FEATURES + "translate-colons", true); + if(hopts.get(NORESTART)) + reader.setFeature(FEATURES + "restart-elements", false); + if(hopts.get(IGNORABLE)) + reader.setFeature(FEATURES + "ignorable-whitespace", true); + if(hopts.get(EMPTYBOGONS)) + reader.setFeature(FEATURES + "bogons-empty", true); + if(hopts.get(ANY)) + reader.setFeature(FEATURES + "bogons-empty", false); + if(hopts.get(NOROOTBOGONS)) + reader.setFeature(FEATURES + "root-bogons", false); + 
if(hopts.get(NOCDATA)) + reader.setFeature(FEATURES + "cdata-elements", false); + if(hopts.get(LEXICAL)) + reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer); + if(hopts.get(OMIT_XML_DECLARATION)) + writer.setOutputProperty(OMIT_XML_DECLARATION.name(), "yes"); + if(hopts.contains(METHOD)) + writer.setOutputProperty(METHOD.name(), hopts.get(METHOD).name()); + if(hopts.contains(DOCTYPE_SYSTEM)) + writer.setOutputProperty(DOCTYPE_SYSTEM.name(), hopts.get(DOCTYPE_SYSTEM)); + if(hopts.contains(DOCTYPE_PUBLIC)) + writer.setOutputProperty(DOCTYPE_PUBLIC.name(), hopts.get(DOCTYPE_PUBLIC)); + return reader; + } + }, + + /** Validator.nu parser. */ + NU("Validator.nu", "nu.validator.htmlparser.sax.HtmlParser", + "nu.validator.htmlparser.sax.XmlSerializer", + "nu.validator.htmlparser.common.XmlViolationPolicy", + "nu.validator.htmlparser.common.Heuristics") { + + /** Class needed for option heuristics=ICU. */ + private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector"; + /** Class needed for option heuristics=CHARDET. 
*/ + private static final String CHARDET_CLASS_NAME = + "org.mozilla.intl.chardet.nsICharsetDetectionObserver"; + + @Override + public boolean fallbackToXml() { + return false; + } + + @Override + XMLReader reader(final HtmlOptions hopts, final StringWriter sw) throws SAXException { + final nu.validator.htmlparser.sax.HtmlParser reader = + new nu.validator.htmlparser.sax.HtmlParser(XmlViolationPolicy.ALTER_INFOSET); + final ContentHandler writer = new XmlSerializer(sw); + reader.setContentHandler(writer); + reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer); + + if(hopts.get(UNICODE_NORMALIZATION_CHECKING)) + reader.setCheckingNormalization(true); + if(hopts.get(MAPPING_LANG_TO_XML_LANG)) + reader.setMappingLangToXmlLang(true); + if(hopts.get(SCRIPTING_ENABLED)) + reader.setScriptingEnabled(true); + if(hopts.contains(CONTENT_SPACE_POLICY)) + reader.setContentSpacePolicy( + XmlViolationPolicy.valueOf(hopts.get(CONTENT_SPACE_POLICY).name())); + if(hopts.contains(CONTENT_NON_XML_CHAR_POLICY)) + reader.setContentNonXmlCharPolicy(XmlViolationPolicy.valueOf( + hopts.get(CONTENT_NON_XML_CHAR_POLICY).name())); + if(hopts.contains(COMMENT_POLICY)) + reader.setCommentPolicy(XmlViolationPolicy.valueOf(hopts.get(COMMENT_POLICY).name())); + if(hopts.contains(XMLNS_POLICY)) + reader.setXmlnsPolicy(XmlViolationPolicy.valueOf(hopts.get(XMLNS_POLICY).name())); + if(hopts.contains(NAME_POLICY)) + reader.setNamePolicy(XmlViolationPolicy.valueOf(hopts.get(NAME_POLICY).name())); + if(hopts.contains(STREAMABILITY_VIOLATION_POLICY)) + reader.setStreamabilityViolationPolicy( + XmlViolationPolicy.valueOf(hopts.get(STREAMABILITY_VIOLATION_POLICY).name())); + if(hopts.contains(XML_POLICY)) + reader.setXmlPolicy(XmlViolationPolicy.valueOf(hopts.get(XML_POLICY).name())); + if(hopts.contains(HEURISTICS)) + reader.setHeuristics(Heuristics.valueOf(hopts.get(HEURISTICS).name())); + return reader; + } + + @Override + public void ensureAvailable(final HtmlOptions options, 
final byte[] func, + final InputInfo info) throws QueryException { + super.ensureAvailable(options, func, info); + if(options.contains(HEURISTICS)) { + switch(options.get(HEURISTICS)) { + case ALL: + ensureAvailable(ICU_CLASS_NAME, func, info); + ensureAvailable(CHARDET_CLASS_NAME, func, info); + break; + case ICU: + ensureAvailable(ICU_CLASS_NAME, func, info); + break; + case CHARDET: + ensureAvailable(CHARDET_CLASS_NAME, func, info); + break; + default: + } + } + } + + @Override + public boolean available(final HtmlOptions options) { + if(!super.available(options)) return false; + if(!options.contains(HEURISTICS)) return true; + switch(options.get(HEURISTICS)) { + case ALL: + if(!Reflect.available(ICU_CLASS_NAME)) return false; + if(!Reflect.available(CHARDET_CLASS_NAME)) return false; + break; + case ICU: + if(!Reflect.available(ICU_CLASS_NAME)) return false; + break; + case CHARDET: + if(!Reflect.available(CHARDET_CLASS_NAME)) return false; + break; + default: + } + return true; + } + }; + + /** Default parser. */ + public static final Parser DEFAULT = TAGSOUP; + + /** String representation. */ + private final String string; + /** Required classes. */ + private final String[] classes; + + /** + * Whether to fall back to XML if this parser is not available. + * @return result of check + */ + public abstract boolean fallbackToXml(); + + /** + * Return a reader instance for this parser. + * @param options HTML options + * @param writer string writer + * @return reader + * @throws SAXException SAX exception + */ + abstract XMLReader reader(HtmlOptions options, StringWriter writer) throws SAXException; + + /** + * Constructor. + * @param string string representation + * @param classes required classes + */ + Parser(final String string, final String... classes) { + this.string = string; + this.classes = classes; + } + + /** + * Checks if this parser is available. 
+ * @param options HTML options + * @return result of check + */ + public boolean available(@SuppressWarnings("unused") final HtmlOptions options) { + for(final String cl : classes) if(!Reflect.available(cl)) return false; + return true; + } + + /** + * Throws an exception if any of the classes required for this parser are unavailable. + * @param options HTML options + * @param func name of function that is asking for this parser + * @param info input info (can be {@code null}) + * @throws QueryException query exception + */ + public void ensureAvailable(@SuppressWarnings("unused") final HtmlOptions options, + final byte[] func, final InputInfo info) throws QueryException { + for(final String cl : classes) ensureAvailable(cl, func, info); + } + + /** + * Throws an exception if a class required for this parser is unavailable. + * @param className the class name + * @param func name of function that is asking for this parser + * @param info input info (can be {@code null}) + * @throws QueryException query exception + */ + private static void ensureAvailable(final String className, final byte[] func, + final InputInfo info) throws QueryException { + if(!Reflect.available(className)) throw BASEX_CLASSPATH_X_X.get(info, func, className); + } + + /** + * Returns the parser associated with the specified HTML options. + * @param options HTML options + * @return parser + */ + public static Parser of(final HtmlOptions options) { + return of(options, Parser.DEFAULT); + } + + /** + * Returns the parser associated with the specified HTML options. + * @param options HTML options + * @param defaultParser default parser + * @return parser + */ + public static Parser of(final HtmlOptions options, final Parser defaultParser) { + return options.contains(METHOD) ? 
options.get(METHOD).parser : defaultParser; + } + + @Override + public String toString() { + return string; + } + } } diff --git a/basex-core/src/main/java/org/basex/core/Text.java b/basex-core/src/main/java/org/basex/core/Text.java index d7f8121800..dd8d080b65 100644 --- a/basex-core/src/main/java/org/basex/core/Text.java +++ b/basex-core/src/main/java/org/basex/core/Text.java @@ -1426,7 +1426,7 @@ public interface Text { String H_VERSION_NEW_X_X = lang("h_version_new_%_%"); /** HTML Parser. */ - String H_HTML_PARSER = lang("h_html_parser"); + String H_HTML_PARSER_X = lang("h_html_parser_%"); /** No HTML Parser. */ String H_NO_HTML_PARSER = lang("h_no_html_parser"); diff --git a/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java b/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java index f5b0d625e1..ce48bd3dc4 100644 --- a/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java +++ b/basex-core/src/main/java/org/basex/gui/dialog/DialogHtmlParser.java @@ -6,10 +6,12 @@ import java.io.*; import org.basex.build.html.*; +import org.basex.build.html.HtmlParser.*; import org.basex.core.*; import org.basex.gui.*; -import org.basex.gui.GUIConstants.Msg; +import org.basex.gui.GUIConstants.*; import org.basex.gui.layout.*; +import org.basex.util.*; import org.basex.util.options.*; /** @@ -33,10 +35,10 @@ final class DialogHtmlParser extends DialogParser { */ DialogHtmlParser(final BaseXDialog dialog, final MainOptions opts) { hopts = new HtmlOptions(opts.get(MainOptions.HTMLPARSER)); - - final boolean avl = HtmlParser.available(); + final Parser parser = Parser.of(hopts); + final boolean avl = parser.available(hopts); final BaseXBack pp = new BaseXBack(new RowLayout(8)); - pp.add(new BaseXLabel(avl ? H_HTML_PARSER : H_NO_HTML_PARSER)); + pp.add(new BaseXLabel(avl ? 
Util.info(H_HTML_PARSER_X, parser) : H_NO_HTML_PARSER)); options = new BaseXTextField(dialog, hopts.toString()); options.setToolTipText(tooltip(hopts)); diff --git a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java index b6bbce098a..54540389a5 100644 --- a/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java +++ b/basex-core/src/main/java/org/basex/query/func/html/FnParseHtml.java @@ -1,7 +1,5 @@ package org.basex.query.func.html; -import static org.basex.query.QueryError.*; - import org.basex.build.html.HtmlParser; import org.basex.query.*; import org.basex.query.value.item.*; @@ -17,8 +15,6 @@ public class FnParseHtml extends HtmlParse { @Override public Item item(final QueryContext qc, final InputInfo ii) throws QueryException { - String className = HtmlParser.firstUnavailableClass(); - if (className != null) throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className); - return super.item(qc, ii); + return parse(htmlInput(qc), HtmlParser.Parser.NU, qc); } } diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java index 1e21bff317..a8a08c664c 100644 --- a/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java +++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlDoc.java @@ -1,5 +1,6 @@ package org.basex.query.func.html; +import org.basex.build.html.HtmlParser.*; import org.basex.query.*; import org.basex.query.value.item.*; import org.basex.query.value.seq.*; @@ -15,6 +16,6 @@ public final class HtmlDoc extends HtmlParse { @Override public Item item(final QueryContext qc, final InputInfo ii) throws QueryException { final String source = toStringOrNull(arg(0), qc); - return source != null ? parse(toIO(source), qc) : Empty.VALUE; + return source != null ? 
parse(toIO(source), Parser.DEFAULT, qc) : Empty.VALUE; } } diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java index 181692e2e7..c4986d2c6b 100644 --- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java +++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java @@ -1,11 +1,11 @@ package org.basex.query.func.html; -import static org.basex.build.html.HtmlOptions.*; import static org.basex.query.QueryError.*; import java.io.*; import org.basex.build.html.*; +import org.basex.build.html.HtmlParser.*; import org.basex.core.*; import org.basex.io.*; import org.basex.query.*; @@ -23,19 +23,23 @@ * @author Christian Gruen */ public class HtmlParse extends StandardFunc { - /** Class needed for option heuristics=ICU. */ - private static final String ICU_CLASS_NAME = "com.ibm.icu.text.CharsetDetector"; - /** Class needed for option heuristics=CHARDET. */ - private static final String CHARDET_CLASS_NAME = - "org.mozilla.intl.chardet.nsICharsetDetectionObserver"; - @Override public Item item(final QueryContext qc, final InputInfo ii) throws QueryException { + return parse(htmlInput(qc), Parser.DEFAULT, qc); + } + + /** + * Converts the HTML input in the first argument to an IOContent instance from a binary or string + * item. + * @param qc query context + * @return input as an IOContent instance ({@code null}, if empty) + * @throws QueryException query exception + */ + protected IOContent htmlInput(final QueryContext qc) throws QueryException { final Item value = arg(0).atomItem(qc, info); - if (value.isEmpty()) return Empty.VALUE; - final IO io = value instanceof Bin ? new IOContent(toBytes(value)) - : new IOContent(toBytes(value), "", Strings.UTF8); - return parse(io, qc); + if(value.isEmpty()) return null; + return value instanceof Bin ? 
new IOContent(toBytes(value)) + : new IOContent(toBytes(value), "", Strings.UTF8); } @Override @@ -46,41 +50,22 @@ protected final Expr opt(final CompileContext cc) { /** * Parses the input and creates an XML document. * @param io input data + * @param defaultParser default HTML parser to be used in absence of the METHOD option * @param qc query context * @return node * @throws QueryException query exception */ - protected final Item parse(final IO io, final QueryContext qc) throws QueryException { + protected final Item parse(final IO io, final Parser defaultParser, final QueryContext qc) + throws QueryException { + if(io == null) return Empty.VALUE; final HtmlOptions options = toOptions(arg(1), new HtmlOptions(), qc); - if(options.contains(HEURISTICS)) { - switch (options.get(HEURISTICS)) { - case ALL: - ensureAvailable(ICU_CLASS_NAME); - ensureAvailable(CHARDET_CLASS_NAME); - break; - case ICU: - ensureAvailable(ICU_CLASS_NAME); - break; - case CHARDET: - ensureAvailable(CHARDET_CLASS_NAME); - break; - default: - } - } + final Parser parser = Parser.of(options, defaultParser); + if(!parser.fallbackToXml()) parser.ensureAvailable(options, definition.local(), info); try { - return new DBNode(new org.basex.build.html.HtmlParser(io, new MainOptions(), options)); + return new DBNode( + new org.basex.build.html.HtmlParser(io, parser, new MainOptions(), options)); } catch(final IOException ex) { throw INVHTML_X.get(info, ex); } } - - /** - * Ensure that a required class is available on the class path. 
- * @param className the class name - * @throws QueryException query exception, - */ - private void ensureAvailable(final String className) throws QueryException { - if(!Reflect.available(className)) - throw BASEX_CLASSPATH_X_X.get(info, definition.local(), className); - } } diff --git a/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java b/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java index 0d5ce3ec59..da95eafa78 100644 --- a/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java +++ b/basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java @@ -1,5 +1,7 @@ package org.basex.query.func.html; +import org.basex.build.html.*; +import org.basex.build.html.HtmlParser.*; import org.basex.query.*; import org.basex.query.func.*; import org.basex.query.value.item.*; @@ -14,6 +16,8 @@ public final class HtmlParser extends StandardFunc { @Override public Item item(final QueryContext qc, final InputInfo ii) { - return Str.get(org.basex.build.html.HtmlParser.parser()); + final HtmlOptions options = new HtmlOptions(); + final Parser parser = Parser.of(options); + return Str.get(parser.available(options) ? 
parser.toString() : ""); } } diff --git a/basex-core/src/main/resources/lang/Chinese.lang b/basex-core/src/main/resources/lang/Chinese.lang index 11f1839948..1a7b4bfb29 100644 --- a/basex-core/src/main/resources/lang/Chinese.lang +++ b/basex-core/src/main/resources/lang/Chinese.lang @@ -413,7 +413,7 @@ h_db_format = 数据库结构已经变了,请使用新版软件 h_db_options_% = 如果执行'%',这个选项将会被设置 h_diacritics = 索引保留了发音符号 h_fulltext_index = 全文索引可以加速全文检索 -h_html_parser = 将使用Validator.nu将HTML转为XML +h_html_parser_% = 将使用%将HTML转为XML h_index_format = 索引格式变了,请建新索引 h_int_parser = 容错,而且比Java的默认解析器更快 h_languauge = 将使用根据语言确定的tokenizer diff --git a/basex-core/src/main/resources/lang/Dutch.lang b/basex-core/src/main/resources/lang/Dutch.lang index 774465b9b5..d3198c072f 100644 --- a/basex-core/src/main/resources/lang/Dutch.lang +++ b/basex-core/src/main/resources/lang/Dutch.lang @@ -413,7 +413,7 @@ h_db_format = Het database formaat is gewijzigd; maak een nieuwe databa h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = Diakritische tekens worden gebruikt in de index. h_fulltext_index = Een full-text index versnelt full-text queries. -h_html_parser = De Validator.nu parser zal gebruikt worden om HTML naar XML te converteren. +h_html_parser_% = De % parser zal gebruikt worden om HTML naar XML te converteren. h_index_format = Het index formaat is gewijzigd; maak nieuwe indexen. h_int_parser = Robuuster en sneller dan Java's standaard parser. h_languauge = Met deze optie zullen taalspecifieke parsers worden gebruikt. diff --git a/basex-core/src/main/resources/lang/English.lang b/basex-core/src/main/resources/lang/English.lang index d7274e38a9..4886826085 100644 --- a/basex-core/src/main/resources/lang/English.lang +++ b/basex-core/src/main/resources/lang/English.lang @@ -413,7 +413,7 @@ h_db_format = The database format has changed; please use a newer versi h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = Diacritics are retained in the index. 
h_fulltext_index = A full-text index speeds up full-text queries. -h_html_parser = The Validator.nu parser will be used to convert HTML to XML. +h_html_parser_% = The % parser will be used to convert HTML to XML. h_index_format = The index format has changed; please create new indexes. h_int_parser = Fault tolerant, and faster than Java’s default parser. h_languauge = Language specific tokenizers will be used. diff --git a/basex-core/src/main/resources/lang/French.lang b/basex-core/src/main/resources/lang/French.lang index 9a02e46376..b1b6c7f100 100644 --- a/basex-core/src/main/resources/lang/French.lang +++ b/basex-core/src/main/resources/lang/French.lang @@ -413,7 +413,7 @@ h_db_format = Le format de base de données a changé ; Veuillez créer h_db_options_% = Les options seront assignées si on exécute '%'. h_diacritics = Les signes diacritiques sont conservés dans l’index. h_fulltext_index = Un index plein texte accélère les requêtes plein texte. -h_html_parser = Le parser Validator.nu sera utilisé pour convertir le HTML en XML. +h_html_parser_% = Le parser % sera utilisé pour convertir le HTML en XML. h_index_format = Le format des index a changé ; Veuillez créer de nouveaux index. h_int_parser = Tolérant aux fautes, et plus rapide que le parser Java par défaut. h_languauge = Des analyseurs spécifiques à la langue vont être utilisés. diff --git a/basex-core/src/main/resources/lang/German.lang b/basex-core/src/main/resources/lang/German.lang index cedba9ccd8..07d34b07b6 100644 --- a/basex-core/src/main/resources/lang/German.lang +++ b/basex-core/src/main/resources/lang/German.lang @@ -413,7 +413,7 @@ h_db_format = Das Datenbankformat hat sich geändert; bitte verwenden S h_db_options_% = Die Optionen werden zugewiesen, wenn '%' ausgeführt wird. h_diacritics = Diakritische Zeichen werden im Index beibehalten. h_fulltext_index = Ein Volltext-Index beschleunigt Volltext-Anfragen. -h_html_parser = Der Validator.nu-Parser wird verwendet, um HTML in XML zu konvertieren. 
+h_html_parser_% = Der %-Parser wird verwendet, um HTML in XML zu konvertieren. h_index_format = Das Indexformat hat sich geändert; bitte erstellen Sie neue Indizes. h_int_parser = Fehlertolerant und schneller als Javas XML-Parser. h_languauge = Sprachspezifische Tokenisierung wird verwendet. diff --git a/basex-core/src/main/resources/lang/Hungarian.lang b/basex-core/src/main/resources/lang/Hungarian.lang index 20c823ea28..569fce6e0a 100644 --- a/basex-core/src/main/resources/lang/Hungarian.lang +++ b/basex-core/src/main/resources/lang/Hungarian.lang @@ -413,7 +413,7 @@ h_db_format = Az adatbázis formátuma megváltozott; kérem, használj h_db_options_% = Ezek az beállítások csak a következő futtása után lépnek életbe: '%' h_diacritics = Ékezetek megmaradnak az indexelésben. h_fulltext_index = A teljes-szöveg index gyorsítja a teljes-szöveges (full-text) lekérdezéseket. -h_html_parser = A Validator.nu elemző HTML formátumot konvertál XML formátumra. +h_html_parser_% = A % elemző HTML formátumot konvertál XML formátumra. h_index_format = Az index formátuma megváltozott; kérem, készítsen új indexeket. h_int_parser = Hibatűrő, továbbá a Java alapértelmezett elemzőjénél gyorsabb. h_languauge = Nyelvfüggő szövegelemzések is használatra kerülnek. diff --git a/basex-core/src/main/resources/lang/Indonesian.lang b/basex-core/src/main/resources/lang/Indonesian.lang index 74356e5ed8..c7d0d9c56a 100644 --- a/basex-core/src/main/resources/lang/Indonesian.lang +++ b/basex-core/src/main/resources/lang/Indonesian.lang @@ -413,7 +413,7 @@ h_db_format = Bentuk basisdata telah berubah; mohon gunakan versi yang h_db_options_% = Pilihan akan digunakan jika '%' dijalankan. h_diacritics = Diakritik dipertahankan dalam indeks. h_fulltext_index = Indeks semua teks mempercepat kueri teks penuh. -h_html_parser = Pengurai Validator.nu akan digunakan untuk mengubah HTML menjadi XML. +h_html_parser_% = Pengurai % akan digunakan untuk mengubah HTML menjadi XML. 
h_index_format = Bentuk indeks telah berubah; mohon buat indeks baru. h_int_parser = Toleran kesalahan, dan lebih cepat dari pengurai standar Java. h_languauge = Pengurai teks bahasa tertentu akan digunakan. diff --git a/basex-core/src/main/resources/lang/Italian.lang b/basex-core/src/main/resources/lang/Italian.lang index 168907f804..b62c0c180e 100644 --- a/basex-core/src/main/resources/lang/Italian.lang +++ b/basex-core/src/main/resources/lang/Italian.lang @@ -413,7 +413,7 @@ h_db_format = Il formato della base di dati è cambiato; creare una nuo h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = I segni diacritici sono conservati nell'indice. h_fulltext_index = Un indice "full-text" velocizza le interrogazioni sul testo. -h_html_parser = Il parser Validator.nu verrò usato per convertire HTML in XML. +h_html_parser_% = Il parser % verrò usato per convertire HTML in XML. h_index_format = Il formato degli indici è cambiato; creare nuovi indici. h_int_parser = Tollerante ai guasti e più veloce del parser di default di Java. 
h_languauge = Parser di testo specifici per la lingua verranno usati diff --git a/basex-core/src/main/resources/lang/Japanese.lang b/basex-core/src/main/resources/lang/Japanese.lang index d0773f8a4a..84a08cd66f 100644 --- a/basex-core/src/main/resources/lang/Japanese.lang +++ b/basex-core/src/main/resources/lang/Japanese.lang @@ -413,7 +413,7 @@ h_db_format = データベース形式を変更しました。新しい h_db_options_% = % 実行時にオプションが割り当てられます。 h_diacritics = インデックス内で付加記号(ウムラウト等)は保持されます。 h_fulltext_index = 全文テキストインデックスは全文検索を高速化します。 -h_html_parser = Validator.nu パーサは HTML を XML に変換します。 +h_html_parser_% = % パーサは HTML を XML に変換します。 h_index_format = インデックス形式を変更しました。新しくインデックスを作成して下さい。 h_int_parser = フォールトトレラント、Javaのデフォルトパーサより高速。 h_languauge = 指定された言語のテキストパーサが使用されます。 diff --git a/basex-core/src/main/resources/lang/Mongolian.lang b/basex-core/src/main/resources/lang/Mongolian.lang index 6c9fce85f5..9079d8891c 100644 --- a/basex-core/src/main/resources/lang/Mongolian.lang +++ b/basex-core/src/main/resources/lang/Mongolian.lang @@ -413,7 +413,7 @@ h_db_format = Өгөгдлийн сангийн формат өөрчл h_db_options_% = The options will be assigned if '%' is executed. h_diacritics = Индекс дэх санах тэмдгийг авч үлдэх. h_fulltext_index = Бүтэн текст индекс нь бүрэн текст квериг хурдан ажиллагаатай болгоно. -h_html_parser = The Validator.nu parser will be used to convert HTML to XML. +h_html_parser_% = The % parser will be used to convert HTML to XML. h_index_format = Индекс формат өөрчлөгдсөн байна; шинээр үүсгэнэ үү. h_int_parser = Fault tolerant, and faster than Java’s default parser. h_languauge = Хэлний текст Parser тодорхойлогдох болно. 
diff --git a/basex-core/src/main/resources/lang/Romanian.lang b/basex-core/src/main/resources/lang/Romanian.lang index dac622b9d9..aaceae5e82 100644 --- a/basex-core/src/main/resources/lang/Romanian.lang +++ b/basex-core/src/main/resources/lang/Romanian.lang @@ -413,7 +413,7 @@ h_db_format = Formatul bazei de date a fost schimbat, vă rugăm să fo h_db_options_% = Optiunile vor fi asignate daca '%' este executată. h_diacritics = Diacritice sunt păstrate în index. h_fulltext_index = Un full-text index accelereaza interogările full-text. -h_html_parser = Parserul "Validator.nu" va fi folosit pentru a converti HTML în XML. +h_html_parser_% = Parserul "%" va fi folosit pentru a converti HTML în XML. h_index_format = Formatul index s-a schimbat, vă rugăm creati noi indici. h_int_parser = Tolerant la greseli si mai rapid decat parserul default Java. h_languauge = Parsere de text specifice limbii vor fi folosite. diff --git a/basex-core/src/main/resources/lang/Russian.lang b/basex-core/src/main/resources/lang/Russian.lang index 1dda70b524..15a89ecce8 100644 --- a/basex-core/src/main/resources/lang/Russian.lang +++ b/basex-core/src/main/resources/lang/Russian.lang @@ -413,7 +413,7 @@ h_db_format = Формат хранения баз данных был h_db_options_% = Эти опции будут изменены только после выполнения команды [%] h_diacritics = Разделительные знаки будут включены в индекс h_fulltext_index = Полнотекстовый индекс ускоряет соответствующие запросы -h_html_parser = Для конвертации HTML в XML будет использован парсер Validator.nu +h_html_parser_% = Для конвертации HTML в XML будет использован парсер % h_index_format = Формат хранения индексов был изменен. Пожалуйста, создайте индексы заново. 
h_int_parser = Толерантный к ошибкам и быстрее чем стандартный парсер Java h_languauge = Будут использованы специализированные под каждый язык парсеры diff --git a/basex-core/src/main/resources/lang/Spanish.lang b/basex-core/src/main/resources/lang/Spanish.lang index 42e0ce4329..4d032eecd5 100644 --- a/basex-core/src/main/resources/lang/Spanish.lang +++ b/basex-core/src/main/resources/lang/Spanish.lang @@ -413,7 +413,7 @@ h_db_format = El formato de la Base de Datos ha cambiado; por favor uti h_db_options_% = Las opciónes serán asignado si se ejecuta '%'. h_diacritics = Las diacríticas están retenidas en el índice. h_fulltext_index = Un índice de Texto Completo acelera las consulta de Texto Completo. -h_html_parser = Se utilizará el Analizador Sintáctico Validator.nu para convertir HTML a XML. +h_html_parser_% = Se utilizará el Analizador Sintáctico % para convertir HTML a XML. h_index_format = El formato del índice ha cambiado; for favor, cree nuevos índices. h_int_parser = Tolerante a fallos, y más rápido que el analizador sintáctico por defecto de Java. h_languauge = Se utilizarán analizadores sintácticos de texto específicos del lenguaje. diff --git a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java index 58b5b8b323..d5cf02e045 100644 --- a/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java +++ b/basex-core/src/test/java/org/basex/query/func/HtmlModuleTest.java @@ -20,7 +20,10 @@ public final class HtmlModuleTest extends SandboxTest { query(func.args(" <_/>/text()"), ""); final String path = "src/test/resources/input.html"; - query(func.args(path) + "//*:body ! name()", "body"); + query(func.args(path) + "//body ! name()", "body"); + query(func.args(path, " map { 'nons': false() }") + "//*:body ! name()", "body"); + query(func.args(path, " {'method': 'nu'}") + "//Q{http://www.w3.org/1999/xhtml}body ! name()", + "body"); } /** Test method. 
*/ @@ -32,7 +35,8 @@ public final class HtmlModuleTest extends SandboxTest { // check if the function returns an HTML root node query("exists(" + func.args("<html/>") + "/*:html)", true); // check if the function returns - query(func.args("<html/>"), + query(func.args("<html/>", " map { 'nons': true() }"), ""); + query(func.args("<html/>", " {'method': 'nu'}"), ""); } diff --git a/basex-examples/basex-examples.iml b/basex-examples/basex-examples.iml index c8c6019d38..d68f7b229d 100644 --- a/basex-examples/basex-examples.iml +++ b/basex-examples/basex-examples.iml @@ -30,7 +30,7 @@ - + diff --git a/basex-examples/pom.xml b/basex-examples/pom.xml index 03a834b03a..c66dc0ffb0 100644 --- a/basex-examples/pom.xml +++ b/basex-examples/pom.xml @@ -18,8 +18,8 @@ ${project.version} - nu.validator - htmlparser + org.ccil.cowan.tagsoup + tagsoup org.junit.jupiter diff --git a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java index 757baa63e8..7f9dcc9701 100644 --- a/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java +++ b/basex-examples/src/main/java/org/basex/examples/create/HTMLExample.java @@ -6,7 +6,7 @@ /** * This example demonstrates how to import a file in the HTML format * into the database. The specified input file will be converted to XML - * if Validator.nu is found in the classpath. + * if the HTML parser is found in the classpath. 
* * @author BaseX Team, BSD License * @author Christian Gruen diff --git a/pom.xml b/pom.xml index 8c315215bf..1bb0f41810 100644 --- a/pom.xml +++ b/pom.xml @@ -63,6 +63,13 @@ runtime true + + org.ccil.cowan.tagsoup + tagsoup + 1.2.1 + runtime + true + nu.validator htmlparser From 12dab13c46665071e3ff709843d7665aa120b1b4 Mon Sep 17 00:00:00 2001 From: Gunther Rademacher Date: Wed, 22 Jan 2025 19:06:06 +0100 Subject: [PATCH 9/9] minor changes --- .../main/java/org/basex/build/html/HtmlOptions.java | 2 +- .../main/java/org/basex/build/html/HtmlParser.java | 11 +++++++---- .../src/main/java/org/basex/core/MainOptions.java | 2 +- .../src/main/java/org/basex/query/QueryError.java | 2 -- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java index 7eebede2ce..74e7a80bc3 100644 --- a/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java +++ b/basex-core/src/main/java/org/basex/build/html/HtmlOptions.java @@ -3,7 +3,7 @@ import org.basex.util.options.*; /** - * Options for parsing and serializing HTML documents with Validator.nu. + * Options for parsing and serializing HTML documents with TagSoup and Validator.nu. * * @author BaseX Team, BSD License * @author Christian Gruen diff --git a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java index 3083a5519b..62aff78895 100644 --- a/basex-core/src/main/java/org/basex/build/html/HtmlParser.java +++ b/basex-core/src/main/java/org/basex/build/html/HtmlParser.java @@ -20,9 +20,12 @@ import nu.validator.htmlparser.sax.*; /** - * This class uses the Validator.nu HTML parser to convert HTML input to well-formed XML. - * If the Validator.nu HTML parser is not found in the classpath, the original document is - * passed on. 
+ * This class uses the TagSoup or Validator.nu HTML parser to convert HTML input to well-formed + * XML. If TagSoup should be used, and it is not found in the classpath, the original document + * is passed on. + * + * TagSoup was written by John Cowan and is based on the Apache 2.0 License: + * {@code http://home.ccil.org/~cowan/XML/tagsoup/}. * * The Validator.nu HTML parser was written by Henri Sivonen and is based on the MIT License: * {@code https://about.validator.nu/htmlparser/}. @@ -80,7 +83,7 @@ private static IO toXml(final IO io, final Parser parser, final HtmlOptions hopt : hopts.contains(ENCODING) ? hopts.get(HtmlOptions.ENCODING) : null; - if (enc != null) { + if(enc != null) { if(!Strings.supported(enc)) throw INVALIDOPT_X.getIO("Unsupported encoding: " + enc + '.'); is.setEncoding(Strings.normEncoding(enc)); } diff --git a/basex-core/src/main/java/org/basex/core/MainOptions.java b/basex-core/src/main/java/org/basex/core/MainOptions.java index bf5c21b49b..2abb02ba61 100644 --- a/basex-core/src/main/java/org/basex/core/MainOptions.java +++ b/basex-core/src/main/java/org/basex/core/MainOptions.java @@ -40,7 +40,7 @@ public final class MainOptions extends Options { /** Define JSON parser options. */ public static final OptionsOption JSONPARSER = new OptionsOption<>("JSONPARSER", new JsonParserOptions()); - /** Define Validator.nu HTML options. */ + /** Define HTML options. */ public static final OptionsOption HTMLPARSER = new OptionsOption<>("HTMLPARSER", new HtmlOptions()); /** Define import parser. */ diff --git a/basex-core/src/main/java/org/basex/query/QueryError.java b/basex-core/src/main/java/org/basex/query/QueryError.java index 746225978a..81211886f6 100644 --- a/basex-core/src/main/java/org/basex/query/QueryError.java +++ b/basex-core/src/main/java/org/basex/query/QueryError.java @@ -619,8 +619,6 @@ public enum QueryError { RESINV_X(FODC, 7, "Resource path '%' is invalid."), /** Error code. 
*/ INVHTML_X(FODC, 11, "HTML parsing failed: %"), - /** Error code. */ - INVHTMLOPT_X(FODC, 12, "HTML option processing failed: %"), /** Error code. */ FORMATWHICH_X(FODF, 1280, "Unknown decimal format: %."),