diff --git a/CHANGES.xml b/CHANGES.xml index ecc5776a..d9fe1679 100644 --- a/CHANGES.xml +++ b/CHANGES.xml @@ -7,6 +7,15 @@ + + + + New DOMPreserveTransformer. + + + + + diff --git a/pom.xml b/pom.xml index 9764b8b3..d3962c64 100644 --- a/pom.xml +++ b/pom.xml @@ -24,7 +24,7 @@ com.norconex.collectors norconex-importer - 3.0.0 + 3.0.1-SNAPSHOT Norconex Importer diff --git a/src/main/java/com/norconex/importer/handler/transformer/impl/DOMDeleteTransformer.java b/src/main/java/com/norconex/importer/handler/transformer/impl/DOMDeleteTransformer.java index dfcd6520..b5b54383 100644 --- a/src/main/java/com/norconex/importer/handler/transformer/impl/DOMDeleteTransformer.java +++ b/src/main/java/com/norconex/importer/handler/transformer/impl/DOMDeleteTransformer.java @@ -45,9 +45,10 @@ import com.norconex.importer.util.DOMUtil; /** - *

Enables deletion of one or more elements matching a given selector + *

+ * Enables deletion of one or more elements matching a given selector * from a document content. Applies to HTML, XHTML, or XML document. - * To extract DOM elements into metadata fields, use {@link DOMTagger} + * To deal with DOM elements in metadata fields, use {@link DOMTagger} * instead. *

* @@ -124,6 +125,7 @@ * @author Pascal Essiembre * @since 3.0.0 * @see DOMTagger + * @see DOMPreserveTransformer */ @SuppressWarnings("javadoc") public class DOMDeleteTransformer extends AbstractDocumentTransformer { @@ -136,7 +138,6 @@ public class DOMDeleteTransformer extends AbstractDocumentTransformer { * Constructor. */ public DOMDeleteTransformer() { - super(); addRestrictions( CommonRestrictions.domContentTypes(DocMetadata.CONTENT_TYPE)); } @@ -209,7 +210,7 @@ public void setSelectors(List selectors) { } public void addSelector(String selector) { if (StringUtils.isNotBlank(selector)) { - this.selectors.add(selector); + selectors.add(selector); } } diff --git a/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.java b/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.java new file mode 100644 index 00000000..0bbfd9d3 --- /dev/null +++ b/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.java @@ -0,0 +1,471 @@ +/* Copyright 2022 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.norconex.importer.handler.transformer.impl; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.builder.EqualsBuilder; +import org.apache.commons.lang3.builder.HashCodeBuilder; +import org.apache.commons.lang3.builder.ReflectionToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import com.norconex.commons.lang.xml.XML; +import com.norconex.importer.doc.DocMetadata; +import com.norconex.importer.handler.CommonRestrictions; +import com.norconex.importer.handler.HandlerDoc; +import com.norconex.importer.handler.ImporterHandlerException; +import com.norconex.importer.handler.tagger.impl.DOMTagger; +import com.norconex.importer.handler.transformer.AbstractDocumentTransformer; +import com.norconex.importer.parser.ParseState; +import com.norconex.importer.util.CharsetUtil; +import com.norconex.importer.util.DOMUtil; + +/** + *

+ * Preserves only one or more elements matching a given selector from + * a document content. Applies to HTML, XHTML, or XML document. + * To store preserved values into fields, use {@link DOMTagger} + * instead. + *

+ *

+ * This class constructs a DOM tree from a document or field content. + * That DOM tree is loaded entirely into memory. Use this transformer with + * caution if you know you'll need to parse huge files. + *

+ *

+ * The jsoup parser library is used to load a + * document content into a DOM tree. Elements are referenced using a + * + * CSS or JQuery-like syntax. + *

+ *

Should be used as a pre-parse handler.

+ * + *

Content-types

+ *

+ * By default, this transformer is restricted to (applies only to) documents matching + * the restrictions returned by + * {@link CommonRestrictions#domContentTypes(String)}. + * You can specify your own content types if you know they represent a file + * with HTML or XML-like markup tags. + *

+ *

+ * When used as a pre-parse handler, + * this class attempts to detect the content character + * encoding unless the character encoding + * was specified using {@link #setSourceCharset(String)}. Since document + * parsing converts content to UTF-8, UTF-8 is always assumed when + * used as a post-parse handler. + *

+ *

+ * You can control what gets preserved + * exactly thanks to the "extract" argument of + * {@link DOMExtractDetails#setExtract(String)}. Possible values are:

+ *
    + *
  • text: Default option when extract is blank. The text of + * the element, including combined children.
  • + *
  • html: Extracts an element inner + * HTML (including children).
  • + *
  • outerHtml: Extracts an element outer + * HTML (like "html", but includes the "current" tag).
  • + *
  • ownText: Extracts the text owned by this element only; + * does not get the combined text of all children.
  • + *
  • data: Extracts the combined data of a data-element (e.g. + * <script>).
  • + *
  • id: Extracts the ID attribute of the element (if any).
  • + *
  • tagName: Extract the name of the tag of the element.
  • + *
  • val: Extracts the value of a form element + * (input, textarea, etc).
  • + *
  • className: Extracts the literal value of the element's + * "class" attribute, which may include multiple class names, + * space separated.
  • + *
  • cssSelector: Extracts a CSS selector that will uniquely + * select (identify) this element.
  • + *
  • attr(attributeKey): Extracts the value of the element + * attribute matching your replacement for "attributeKey" + * (e.g. "attr(title)" will extract the "title" attribute).
  • + *
+ * + *

+ * You can specify a defaultValue + * on each DOM extraction details entry. When no match occurs for a given selector, + * the default value will be inserted in the modified document content. + * When matching blanks (see below) you will get + * an empty string as opposed to the default value. + * Empty strings and spaces are supported as default values + * (the default value is taken literally). + *

+ *

+ * You can set matchBlanks to + * true to match elements that are present + * but have blank values. Blank values are empty values or values containing + * white spaces only. Because white spaces are normalized by the DOM parser, + * such matches will always return an empty string (spaces will be trimmed). + * By default elements with blank values are not matched and are ignored. + *

+ *

+ * You can specify which parser to use when reading + * documents. The default is "html" and will normalize the content + * as HTML. This is generally a desired behavior, but it can sometimes + * cause your selector to fail. If you encounter this + * problem, try switching to the "xml" parser, which does not attempt normalization + * on the content. The drawback with "xml" is you may not get all HTML-specific + * selector options to work. If you know you are dealing with XML to begin + * with, specifying "xml" should be a good option. + *

+ * + *

Multiple preserved elements

+ *

+ * It is possible to preserve multiple elements or text. Specifying multiple + * DOM selectors will achieve that. Each potential match is always + * performed on the DOM as it was received. + * You can combine this with {@link DOMDeleteTransformer} for additional flexibility. + *

+ *

+ * It is important to note that preserved elements and text may not always form + * valid XML when put back together. If your goal is to have the Importer + * parser extract the raw text from it like any other document, this is not an + * issue, but it could be if you want to use the new document content as XML + * in a different context. + *

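+ *
+ * {@nx.xml.usage
+ * <handler class="com.norconex.importer.handler.transformer.impl.DOMPreserveTransformer"
+ *     parser="[html|xml]"
+ *     sourceCharset="(character encoding)">
+ *
+ *   {@nx.include com.norconex.importer.handler.AbstractImporterHandler#restrictTo}
+ *
+ *   <dom selector="(selector syntax)"
+ *       extract="[text|html|outerHtml|ownText|data|id|tagName|val|className|cssSelector|attr(attributeKey)]"
+ *       matchBlanks="[false|true]"
+ *       defaultValue="(optional value to use when no match)" />
+ *   <!-- multiple "dom" tags allowed -->
+ * </handler>
+ * }
+ *
+ * {@nx.xml.example
+ * <handler class="com.norconex.importer.handler.transformer.impl.DOMPreserveTransformer">
+ *   <dom selector="div.firstName" extract="outerHtml" />
+ *   <dom selector="div.lastName" extract="outerHtml" />
+ * </handler>
+ * }
+ *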

+ * Given this HTML snippet... + *

+ *
+ * <div>
+ *   <div class="firstName">Joe</div>
+ *   <div class="lastName">Dalton</div>
+ *   <div class="city">Daisy Town</div>
+ * </div>
+ * 
+ *

+ * ... the above example will result in the document content having + * the following: + *

+ *
+ *   <div class="firstName">Joe</div>
+ *   <div class="lastName">Dalton</div>
+ * 
+ *
+ * @author Pascal Essiembre
+ * @since 3.0.1
+ * @see DOMTagger
+ * @see DOMDeleteTransformer
+ */
+@SuppressWarnings("javadoc")
+public class DOMPreserveTransformer extends AbstractDocumentTransformer {
+
+    // Extraction details, processed in insertion order when transforming.
+    private final List<DOMExtractDetails> extractions = new ArrayList<>();
+    // Source encoding set via setSourceCharset; null until configured.
+    private String sourceCharset = null;
+    // Parser used to build the DOM tree; HTML unless changed via setParser.
+    private String parser = DOMUtil.PARSER_HTML;
+
+    /**
+     * Creates a transformer restricted by default to the content types
+     * returned by {@link CommonRestrictions#domContentTypes(String)}.
+     */
+    public DOMPreserveTransformer() {
+        addRestrictions(
+                CommonRestrictions.domContentTypes(DocMetadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Gets the assumed source character encoding.
+     * @return character encoding of the source to be transformed
+     */
+    public String getSourceCharset() {
+        return sourceCharset;
+    }
+    /**
+     * Sets the assumed source character encoding.
+     * @param sourceCharset character encoding of the source to be transformed
+     */
+    public void setSourceCharset(String sourceCharset) {
+        this.sourceCharset = sourceCharset;
+    }
+
+    /**
+     * Gets the parser to use when creating the DOM-tree.
+     * @return html (default) or xml.
+     */
+    public String getParser() {
+        return parser;
+    }
+    /**
+     * Sets the parser to use when creating the DOM-tree.
+     * @param parser html or xml.
+     */
+    public void setParser(String parser) {
+        this.parser = parser;
+    }
+
+    /**
+     * Parses the document content into a DOM tree, keeps only the values
+     * selected by the configured extraction details, and writes them to
+     * the output using the same character encoding used for reading.
+     * @param doc document being handled (supplies reference and encoding)
+     * @param document content to parse
+     * @param output where the preserved content is written
+     * @param parseState whether the document has been parsed already
+     * @throws ImporterHandlerException if reading or writing content fails
+     */
+    @Override
+    protected void transformApplicableDocument(HandlerDoc doc,
+            InputStream document, OutputStream output, ParseState parseState)
+                    throws ImporterHandlerException {
+        String ref = doc.getReference();
+        try {
+            String inputCharset = CharsetUtil.firstNonBlankOrUTF8(
+                    parseState,
+                    sourceCharset,
+                    doc.getDocInfo().getContentEncoding());
+            // The document reference doubles as the jsoup base URI.
+            Document jsoupDoc = Jsoup.parse(document, inputCharset, ref,
+                    DOMUtil.toJSoupParser(getParser()));
+            IOUtils.write(handle(jsoupDoc), output, inputCharset);
+        } catch (IOException e) {
+            throw new ImporterHandlerException(
+                    "Cannot process DOM element(s) from DOM-tree.", e);
+        }
+    }
+
+    // Applies every extraction to the same (unmodified) DOM and joins the
+    // collected values with a line feed.
+    private String handle(Document jsoupDoc) {
+        List<String> extractedValues = new ArrayList<>();
+        for (DOMExtractDetails details : extractions) {
+            domExtractDoc(extractedValues, jsoupDoc, details);
+        }
+        return StringUtils.join(extractedValues, '\n');
+    }
+
+    // Appends to extractedValues the value of each element matching the
+    // selector, or the default value (when one is set) for each miss.
+    private void domExtractDoc(List<String> extractedValues,
+            Document doc, DOMExtractDetails details) {
+        Elements elms = doc.select(StringUtils.trim(details.getSelector()));
+        String defaultValue = details.getDefaultValue();
+        boolean hasDefault = defaultValue != null;
+
+        // no elements matching
+        if (elms.isEmpty()) {
+            if (hasDefault) {
+                extractedValues.add(defaultValue);
+            }
+            return;
+        }
+
+        // one or more elements matching
+        for (Element elm : elms) {
+            // JSoup normalizes white spaces and should always trim them,
+            // but we force it here to ensure 100% consistency.
+            String value = StringUtils.trim(
+                    DOMUtil.getElementValue(elm, details.getExtract()));
+            boolean matches = value != null
+                    && (details.isMatchBlanks()
+                            || StringUtils.isNotBlank(value));
+            if (matches) {
+                extractedValues.add(value);
+            } else if (hasDefault) {
+                extractedValues.add(defaultValue);
+            }
+        }
+    }
+
+    /**
+     * Adds DOM extraction details.
+     * @param extractDetails DOM extraction details
+     */
+    public void addDOMExtractDetails(DOMExtractDetails extractDetails) {
+        // Null entries are silently ignored.
+        if (extractDetails != null) {
+            extractions.add(extractDetails);
+        }
+    }
+
+    /**
+     * Gets a list of DOM extraction details.
+     * @return unmodifiable view of the DOM extraction details.
+     */
+    public List<DOMExtractDetails> getDOMExtractDetailsList() {
+        return Collections.unmodifiableList(extractions);
+    }
+
+    /**
+     * Removes all DOM extraction details whose selector equals the given
+     * selector (a <code>null</code> selector matches entries without one).
+     * @param selector DOM selector
+     */
+    public void removeDOMExtractDetails(String selector) {
+        // Look up and remove in one step, under the same monitor used by
+        // removeDOMExtractDetailsList(), instead of scanning the list
+        // outside the lock and removing inside it.
+        synchronized (extractions) {
+            extractions.removeIf(
+                    d -> Objects.equals(d.getSelector(), selector));
+        }
+    }
+
+    /**
+     * Removes all DOM extraction details.
+     */
+    public void removeDOMExtractDetailsList() {
+        synchronized (extractions) {
+            extractions.clear();
+        }
+    }
+
+    /**
+     * Loads the source charset, parser, and "dom" entries from XML.
+     * Existing extraction details are replaced only when the XML
+     * holds at least one "dom" element.
+     * @param xml handler configuration
+     */
+    @Override
+    protected void loadHandlerFromXML(XML xml) {
+        setSourceCharset(xml.getString("@sourceCharset", sourceCharset));
+        setParser(xml.getString("@parser", parser));
+        List<XML> nodes = xml.getXMLList("dom");
+        if (!nodes.isEmpty()) {
+            extractions.clear();
+        }
+        for (XML node : nodes) {
+            // NOTE(review): this class neither reads nor writes an "onSet"
+            // (or "overwrite") attribute; this check looks like a leftover
+            // copied from DOMTagger -- confirm and drop if so.
+            node.checkDeprecated("@overwrite", "onSet", true);
+            DOMExtractDetails details = new DOMExtractDetails(
+                    node.getString("@selector", null),
+                    node.getString("@extract", null));
+            details.setMatchBlanks(node.getBoolean("@matchBlanks", false));
+            details.setDefaultValue(node.getString("@defaultValue", null));
+            addDOMExtractDetails(details);
+        }
+    }
+
+    /**
+     * Saves the source charset, parser, and one "dom" element per
+     * extraction details entry to XML.
+     * @param xml handler configuration
+     */
+    @Override
+    protected void saveHandlerToXML(XML xml) {
+        xml.setAttribute("sourceCharset", sourceCharset);
+        xml.setAttribute("parser", parser);
+        for (DOMExtractDetails details : extractions) {
+            xml.addElement("dom")
+                    .setAttribute("selector", details.getSelector())
+                    .setAttribute("extract", details.getExtract())
+                    .setAttribute("matchBlanks", details.isMatchBlanks())
+                    .setAttribute("defaultValue", details.getDefaultValue());
+        }
+    }
+
+    @Override
+    public boolean equals(final Object other) {
+        return EqualsBuilder.reflectionEquals(this, other);
+    }
+    @Override
+    public int hashCode() {
+        return HashCodeBuilder.reflectionHashCode(this);
+    }
+    @Override
+    public String toString() {
+        return new ReflectionToStringBuilder(
+                this, ToStringStyle.SHORT_PREFIX_STYLE).toString();
+    }
+
+    /**
+     * DOM Extraction Details: a selector, what to extract from each
+     * matching element, and how blank or missing matches are handled.
+     * @author Pascal Essiembre
+     */
+    public static class DOMExtractDetails {
+        private String selector;
+        private String extract;
+        private boolean matchBlanks;
+        private String defaultValue;
+
+        /**
+         * Creates extraction details without a selector or extract type.
+         */
+        public DOMExtractDetails() {
+        }
+        /**
+         * Creates extraction details without an extract type.
+         * @param selector DOM selector
+         */
+        public DOMExtractDetails(
+                String selector) {
+            this(selector, null);
+        }
+        /**
+         * Creates extraction details.
+         * @param selector DOM selector
+         * @param extract what to extract from matching elements
+         */
+        public DOMExtractDetails(String selector, String extract) {
+            this.selector = selector;
+            this.extract = extract;
+        }
+
+        /**
+         * Gets the DOM selector.
+         * @return DOM selector
+         */
+        public String getSelector() {
+            return selector;
+        }
+        /**
+         * Sets the DOM selector.
+         * @param selector DOM selector
+         * @return DOM extraction details
+         */
+        public DOMExtractDetails setSelector(String selector) {
+            this.selector = selector;
+            return this;
+        }
+
+        /**
+         * Gets what is extracted from matching elements.
+         * @return extract type
+         */
+        public String getExtract() {
+            return extract;
+        }
+        /**
+         * Sets what is extracted from matching elements.
+         * @param extract extract type
+         * @return DOM extraction details
+         */
+        public DOMExtractDetails setExtract(String extract) {
+            this.extract = extract;
+            return this;
+        }
+
+        /**
+         * Gets whether elements with blank values should be considered a
+         * match and have an empty string returned as opposed to nothing at all.
+         * Default is <code>false</code>.
+         * @return <code>true</code> if elements with blank values are supported
+         */
+        public boolean isMatchBlanks() {
+            return matchBlanks;
+        }
+        /**
+         * Sets whether elements with blank values should be considered a
+         * match and have an empty string returned as opposed to nothing at all.
+ * @param matchBlanks true to support elements with + * blank values + * @return DOM extraction details + */ + public DOMExtractDetails setMatchBlanks(boolean matchBlanks) { + this.matchBlanks = matchBlanks; + return this; + } + + public String getDefaultValue() { + return defaultValue; + } + public DOMExtractDetails setDefaultValue(String defaultValue) { + this.defaultValue = defaultValue; + return this; + } + + @Override + public boolean equals(final Object other) { + return EqualsBuilder.reflectionEquals(this, other); + } + @Override + public int hashCode() { + return HashCodeBuilder.reflectionHashCode(this); + } + @Override + public String toString() { + return new ReflectionToStringBuilder( + this, ToStringStyle.SHORT_PREFIX_STYLE).toString(); + } + } +} diff --git a/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.xsd b/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.xsd new file mode 100644 index 00000000..090fecc7 --- /dev/null +++ b/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.xsd @@ -0,0 +1,60 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.java b/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.java new file mode 100644 index 00000000..5abf1dfc --- /dev/null +++ b/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.java @@ -0,0 +1,127 @@ +/* Copyright 2022 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.norconex.importer.handler.transformer.impl;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import com.norconex.commons.lang.ResourceLoader;
+import com.norconex.commons.lang.map.Properties;
+import com.norconex.commons.lang.xml.XML;
+import com.norconex.importer.TestUtil;
+import com.norconex.importer.doc.DocMetadata;
+import com.norconex.importer.handler.ImporterHandlerException;
+import com.norconex.importer.handler.transformer.impl.DOMPreserveTransformer.DOMExtractDetails;
+import com.norconex.importer.parser.ParseState;
+
+/**
+ * Tests XML write/read round-tripping and content transformation of
+ * {@link DOMPreserveTransformer}.
+ * @author Pascal Essiembre
+ * @since 3.0.1
+ */
+class DOMPreserveTransformerTest {
+
+    // Configures a transformer, checks it survives an XML write/read
+    // round trip, then checks both removal methods.
+    @Test
+    void testWriteRead() {
+        DOMPreserveTransformer t = new DOMPreserveTransformer();
+        t.setParser("xml");
+
+        DOMExtractDetails extract1 = new DOMExtractDetails("someTag", "text");
+        t.addDOMExtractDetails(extract1);
+        DOMExtractDetails extract2 = new DOMExtractDetails()
+                .setSelector("otherTag")
+                .setExtract("html");
+        t.addDOMExtractDetails(extract2);
+        t.setSourceCharset(StandardCharsets.ISO_8859_1.toString());
+
+        Assertions.assertEquals(2, t.getDOMExtractDetailsList().size());
+        Assertions.assertNotSame(extract1, extract2);
+        Assertions.assertNotEquals(extract1.toString(), extract2.toString());
+        Assertions.assertDoesNotThrow(() -> XML.assertWriteRead(t, "handler"));
+
+        t.removeDOMExtractDetails(extract1.getSelector());
+        Assertions.assertEquals(1, t.getDOMExtractDetailsList().size());
+        t.removeDOMExtractDetailsList();
+        Assertions.assertEquals(0, t.getDOMExtractDetailsList().size());
+    }
+
+
+    // Runs four batches of extraction details against the XML test
+    // resource; each batch starts from an emptied extraction list.
+    @Test
+    void testTransform() throws ImporterHandlerException, IOException {
+        DOMPreserveTransformer t = new DOMPreserveTransformer();
+        t.setParser("xml");
+
+        // Test batch #1
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: tag text
+                "parentA > childA1", "text"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: attribute
+                "parentB > childB1", "attr(name)"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // no match: use default
+                "parentD > childD1").setDefaultValue("Child D1"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // no match: no default
+                "parentE > childE1"));
+        Assertions.assertEquals("Child A1\nchild1\nChild D1", transform(t));
+
+        // Test batch #2
+        t.removeDOMExtractDetailsList();
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: tag html
+                "childA2", "html"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: tag outerHtml
+                "childA2", "outerHtml"));
+        // NOTE(review): the "outerHtml" expectation below reads the same as
+        // the "html" one; the enclosing tag markup was presumably lost from
+        // this literal in this view -- confirm against the repository.
+        Assertions.assertEquals("Child A2\n"
+                + "Child A2", transform(t));
+
+        // Test batch #3
+        t.removeDOMExtractDetailsList();
+        t.addDOMExtractDetails(new DOMExtractDetails( // no match: ownText
+                "parentA", "ownText"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: data
+                "parentB", "data"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: ownText
+                "parentC", "ownText"));
+        Assertions.assertEquals(
+                "I'm Data\nParent C Before Parent C After", transform(t));
+
+        // Test batch #4
+        t.removeDOMExtractDetailsList();
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: tagName
+                "[name=child1]", "tagName"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: cssSelector
+                "childC", "cssSelector"));
+        // NOTE(review): the first expected segment may also have lost
+        // characters in this view -- verify against the repository.
+        Assertions.assertEquals("childB1\nchildC\n"
+                + "DOMTransformerTest > parentC > childC:nth-child(1)\n"
+                + "DOMTransformerTest > parentC > childC:nth-child(2)",
+                transform(t));
+    }
+
+    // Transforms the XML resource loaded for this test class as a
+    // pre-parse "application/xml" document and returns the output
+    // decoded as UTF-8.
+    private static String transform(DOMPreserveTransformer t)
+            throws IOException, ImporterHandlerException {
+        try (InputStream content =
+                ResourceLoader.getXmlStream(DOMPreserveTransformerTest.class);
+                ByteArrayOutputStream os = new ByteArrayOutputStream()) {
+            Properties metadata = new Properties();
+            metadata.set(DocMetadata.CONTENT_TYPE, "application/xml");
+            t.transformDocument(TestUtil.toHandlerDoc(
+                    "n/a", content, metadata), content, os, ParseState.PRE);
+            return os.toString(UTF_8.toString());
+        }
+    }
+}
diff --git a/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.xml b/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.xml new file mode 100644 index 00000000..ac9ece03 --- /dev/null +++ b/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.xml @@ -0,0 +1,38 @@ + + + + + + + Child A1 + Child A2 + + + + Child B1 + Child B2 + + + + + Parent C Before + Twin C1 + Twin C2 + Parent C After + + +