+ * Enables deletion of one or more elements matching a given selector
* from a document content. Applies to HTML, XHTML, or XML document.
- * To extract DOM elements into metadata fields, use {@link DOMTagger}
+ * To deal with DOM elements in metadata fields, use {@link DOMTagger}
* instead.
*
*
@@ -124,6 +125,7 @@
* @author Pascal Essiembre
* @since 3.0.0
* @see DOMTagger
+ * @see DOMPreserveTransformer
*/
@SuppressWarnings("javadoc")
public class DOMDeleteTransformer extends AbstractDocumentTransformer {
@@ -136,7 +138,6 @@ public class DOMDeleteTransformer extends AbstractDocumentTransformer {
* Constructor.
*/
public DOMDeleteTransformer() {
- super();
addRestrictions(
CommonRestrictions.domContentTypes(DocMetadata.CONTENT_TYPE));
}
@@ -209,7 +210,7 @@ public void setSelectors(List selectors) {
}
public void addSelector(String selector) {
if (StringUtils.isNotBlank(selector)) {
- this.selectors.add(selector);
+ selectors.add(selector);
}
}
diff --git a/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.java b/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.java
new file mode 100644
index 00000000..0bbfd9d3
--- /dev/null
+++ b/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.java
@@ -0,0 +1,471 @@
+/* Copyright 2022 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.norconex.importer.handler.transformer.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.builder.EqualsBuilder;
+import org.apache.commons.lang3.builder.HashCodeBuilder;
+import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang3.builder.ToStringStyle;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import com.norconex.commons.lang.xml.XML;
+import com.norconex.importer.doc.DocMetadata;
+import com.norconex.importer.handler.CommonRestrictions;
+import com.norconex.importer.handler.HandlerDoc;
+import com.norconex.importer.handler.ImporterHandlerException;
+import com.norconex.importer.handler.tagger.impl.DOMTagger;
+import com.norconex.importer.handler.transformer.AbstractDocumentTransformer;
+import com.norconex.importer.parser.ParseState;
+import com.norconex.importer.util.CharsetUtil;
+import com.norconex.importer.util.DOMUtil;
+
+/**
+ *
+ * Preserves only one or more elements matching a given selector from
+ * a document content. Applies to HTML, XHTML, or XML document.
+ * To store preserved values into fields, use {@link DOMTagger}
+ * instead.
+ *
+ *
+ * This class constructs a DOM tree from a document or field content.
+ * That DOM tree is loaded entirely into memory. Use this transformer with
+ * caution if you know you'll need to parse huge files.
+ *
+ *
+ * The jsoup parser library is used to load a
+ * document content into a DOM tree. Elements are referenced using a
+ *
+ * CSS or JQuery-like syntax.
+ *
+ * Should be used as a pre-parse handler.
+ *
+ * Content-types
+ *
+ * By default, this transformer is restricted to (applies only to) documents
+ * matching the restrictions returned by
+ * {@link CommonRestrictions#domContentTypes(String)}.
+ * You can specify your own content types if you know they represent a file
+ * with HTML or XML-like markup tags.
+ *
+ *
+ * When used as a pre-parse handler,
+ * this class attempts to detect the content character
+ * encoding unless the character encoding
+ * was specified using {@link #setSourceCharset(String)}. Since document
+ * parsing converts content to UTF-8, UTF-8 is always assumed when
+ * used as a post-parse handler.
+ *
+ *
+ * You can control what gets preserved
+ * exactly thanks to the "extract" argument of
+ * {@link DOMExtractDetails#setExtract(String)}. Possible values are:
+ *
+ * - text: Default option when extract is blank. The text of
+ * the element, including combined children.
+ * - html: Extracts an element inner
+ * HTML (including children).
+ * - outerHtml: Extracts an element outer
+ * HTML (like "html", but includes the "current" tag).
+ * - ownText: Extracts the text owned by this element only;
+ * does not get the combined text of all children.
+ * - data: Extracts the combined data of a data-element (e.g.
+ * <script>).
+ * - id: Extracts the ID attribute of the element (if any).
+ * - tagName: Extract the name of the tag of the element.
+ * - val: Extracts the value of a form element
+ * (input, textarea, etc).
+ * - className: Extracts the literal value of the element's
+ * "class" attribute, which may include multiple class names,
+ * space separated.
+ * - cssSelector: Extracts a CSS selector that will uniquely
+ * select (identify) this element.
+ * - attr(attributeKey): Extracts the value of the element
+ * attribute matching your replacement for "attributeKey"
+ * (e.g. "attr(title)" will extract the "title" attribute).
+ *
+ *
+ *
+ * You can specify a defaultValue
+ * on each DOM extraction details. When no match occurred for a given selector,
+ * the default value will be inserted in the modified document content.
+ * When matching blanks (see below) you will get
+ * an empty string as opposed to the default value.
+ * Empty strings and spaces are supported as default values
+ * (the default value is now taken literally).
+ *
+ *
+ * You can set <code>matchBlanks</code> to
+ * <code>true</code> to match elements that are present
+ * but have blank values. Blank values are empty values or values containing
+ * white spaces only. Because white spaces are normalized by the DOM parser,
+ * such matches will always return an empty string (spaces will be trimmed).
+ * By default elements with blank values are not matched and are ignored.
+ *
+ *
+ * You can specify which parser to use when reading
+ * documents. The default is "html" and will normalize the content
+ * as HTML. This is generally a desired behavior, but this can sometimes
+ * have your selector fail. If you encounter this
+ * problem, try switching to "xml" parser, which does not attempt normalization
+ * on the content. The drawback with "xml" is you may not get all HTML-specific
+ * selector options to work. If you know you are dealing with XML to begin
+ * with, specifying "xml" should be a good option.
+ *
+ *
+ * Multiple preserved elements
+ *
+ * It is possible to preserve multiple elements or text. Specifying multiple
+ * DOM selectors will achieve that. Each potential match is always
+ * performed on the DOM as it was received.
+ * You can use it with {@link DOMDeleteTransformer} for additional flexibility.
+ *
+ *
+ * It is important to note that preserved elements and text may not always form
+ * valid XML when put back together. If your goal is to have the Importer
+ * parser extract the raw text from it like any other document, this is not an
+ * issue, but it could be if you want to use the new document content as XML
+ * in a different context.
+ *
+ *
+ * {@nx.xml.usage
+ *
+ *
+ * {@nx.include com.norconex.importer.handler.AbstractImporterHandler#restrictTo}
+ *
+ *
+ *
+ *
+ *
+ * }
+ *
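+ * {@nx.xml.example
+ * <handler class="com.norconex.importer.handler.transformer.impl.DOMPreserveTransformer">
+ *   <dom selector="div.firstName" extract="outerHtml" />
+ *   <dom selector="div.lastName" extract="outerHtml" />
+ * </handler>
+ * }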
+ *
+ * Given this HTML snippet...
+ *
+ *
+ * <div>
+ * <div class="firstName">Joe</div>
+ * <div class="lastName">Dalton</div>
+ * <div class="city">Daisy Town</div>
+ * </div>
+ *
+ *
+ * ... the above example will result in the document content having
+ * the following:
+ *
+ *
+ * <div class="firstName">Joe</div>
+ * <div class="lastName">Dalton</div>
+ *
+ *
+ * @author Pascal Essiembre
+ * @since 3.0.1
+ * @see DOMTagger
+ * @see DOMDeleteTransformer
+ */
+@SuppressWarnings("javadoc")
+public class DOMPreserveTransformer extends AbstractDocumentTransformer {
+
+    /** DOM extraction details, processed in the order they were added. */
+    private final List<DOMExtractDetails> extractions = new ArrayList<>();
+    /** Explicitly configured source character encoding, if any. */
+    private String sourceCharset = null;
+    /** Parser used to build the DOM tree; HTML unless changed. */
+    private String parser = DOMUtil.PARSER_HTML;
+
+ /**
+ * Constructor. Registers the restrictions returned by
+ * {@link CommonRestrictions#domContentTypes(String)} for the
+ * {@link DocMetadata#CONTENT_TYPE} field.
+ */
+ public DOMPreserveTransformer() {
+ addRestrictions(
+ CommonRestrictions.domContentTypes(DocMetadata.CONTENT_TYPE));
+ }
+
+ /**
+ * Gets the assumed source character encoding.
+ * @return character encoding of the source to be transformed, or
+ * <code>null</code> if none was explicitly set
+ */
+ public String getSourceCharset() {
+ return sourceCharset;
+ }
+ /**
+ * Sets the assumed source character encoding. When transforming, this
+ * value is handed to <code>CharsetUtil.firstNonBlankOrUTF8</code> ahead
+ * of the content encoding recorded on the document.
+ * @param sourceCharset character encoding of the source to be transformed
+ */
+ public void setSourceCharset(String sourceCharset) {
+ this.sourceCharset = sourceCharset;
+ }
+
+ /**
+ * Gets the parser to use when creating the DOM-tree.
+ * @return <code>html</code> (default) or <code>xml</code>.
+ */
+ public String getParser() {
+ return parser;
+ }
+ /**
+ * Sets the parser to use when creating the DOM-tree.
+ * @param parser <code>html</code> or <code>xml</code>.
+ */
+ public void setParser(String parser) {
+ this.parser = parser;
+ }
+
+    /**
+     * Parses the incoming content into a jsoup DOM tree, keeps only the
+     * values selected by the configured extraction details, and writes the
+     * result to the output stream using the same character encoding that
+     * was used for reading.
+     */
+    @Override
+    protected void transformApplicableDocument(HandlerDoc doc,
+            InputStream document, OutputStream output, ParseState parseState)
+                    throws ImporterHandlerException {
+        String baseUri = doc.getReference();
+        try {
+            // The same charset is used for both decoding and encoding.
+            String charset = CharsetUtil.firstNonBlankOrUTF8(
+                    parseState,
+                    sourceCharset,
+                    doc.getDocInfo().getContentEncoding());
+            Document dom = Jsoup.parse(document, charset, baseUri,
+                    DOMUtil.toJSoupParser(getParser()));
+            String preserved = handle(dom);
+            IOUtils.write(preserved, output, charset);
+        } catch (IOException e) {
+            throw new ImporterHandlerException(
+                    "Cannot process DOM element(s) from DOM-tree.", e);
+        }
+    }
+
+    /**
+     * Applies every configured extraction to the parsed document and joins
+     * the preserved values with a line feed, in configuration order.
+     * @param jsoupDoc parsed document
+     * @return preserved values, one per line
+     */
+    private String handle(Document jsoupDoc) {
+        List<String> extractedValues = new ArrayList<>();
+        for (DOMExtractDetails details : extractions) {
+            domExtractDoc(extractedValues, jsoupDoc, details);
+        }
+        return StringUtils.join(extractedValues, '\n');
+    }
+
+    /**
+     * Collects the values to preserve for a single extraction definition.
+     * When the selector matches nothing, or a matched element yields a
+     * value that does not qualify, the default value (if one is set) is
+     * added instead.
+     * @param extractedValues list receiving the preserved values
+     * @param doc parsed document to select from
+     * @param details selector and extraction options
+     */
+    private void domExtractDoc(List<String> extractedValues,
+            Document doc, DOMExtractDetails details) {
+        Elements elms = doc.select(StringUtils.trim(details.getSelector()));
+        String defaultValue = details.getDefaultValue();
+
+        // no elements matching
+        if (elms.isEmpty()) {
+            if (defaultValue != null) {
+                extractedValues.add(defaultValue);
+            }
+            return;
+        }
+
+        // one or more elements matching
+        for (Element elm : elms) {
+            // JSoup normalizes white spaces and should always trim them,
+            // but we force it here to ensure 100% consistency.
+            String value = StringUtils.trim(
+                    DOMUtil.getElementValue(elm, details.getExtract()));
+            boolean matches = value != null
+                    && (details.isMatchBlanks()
+                            || !StringUtils.isBlank(value));
+            if (matches) {
+                extractedValues.add(value);
+            } else if (defaultValue != null) {
+                extractedValues.add(defaultValue);
+            }
+        }
+    }
+
+    /**
+     * Registers DOM extraction details with this transformer.
+     * A <code>null</code> argument is ignored.
+     * @param extractDetails DOM extraction details
+     */
+    public void addDOMExtractDetails(DOMExtractDetails extractDetails) {
+        if (extractDetails == null) {
+            return;
+        }
+        extractions.add(extractDetails);
+    }
+
+    /**
+     * Gets a list of DOM extraction details.
+     * @return read-only view of the DOM extraction details
+     */
+    public List<DOMExtractDetails> getDOMExtractDetailsList() {
+        return Collections.unmodifiableList(extractions);
+    }
+
+    /**
+     * Removes the DOM extraction details matching the given selector.
+     * Selectors are compared with {@link Objects#equals(Object, Object)},
+     * so a <code>null</code> argument removes entries having no selector.
+     * @param selector DOM selector
+     */
+    public void removeDOMExtractDetails(String selector) {
+        // Find and remove under the same lock, so code that also
+        // synchronizes on the list cannot modify it in between.
+        synchronized (extractions) {
+            extractions.removeIf(details ->
+                    Objects.equals(details.getSelector(), selector));
+        }
+    }
+
+ /**
+ * Removes all DOM extraction details. The list is cleared while
+ * holding its monitor.
+ */
+ public void removeDOMExtractDetailsList() {
+ synchronized (extractions) {
+ extractions.clear();
+ }
+ }
+
+    /**
+     * Loads this handler's configuration from XML. Attributes that are
+     * absent keep their current value. Existing extraction details are
+     * replaced only when at least one <code>dom</code> element is present.
+     * @param xml handler configuration
+     */
+    @Override
+    protected void loadHandlerFromXML(XML xml) {
+        setSourceCharset(xml.getString("@sourceCharset", sourceCharset));
+        setParser(xml.getString("@parser", parser));
+        List<XML> nodes = xml.getXMLList("dom");
+        if (!nodes.isEmpty()) {
+            extractions.clear();
+        }
+        // No deprecated attributes to check for: this handler only reads
+        // and writes the four "dom" attributes handled below.
+        for (XML node : nodes) {
+            DOMExtractDetails details = new DOMExtractDetails(
+                    node.getString("@selector", null),
+                    node.getString("@extract", null));
+            details.setMatchBlanks(node.getBoolean("@matchBlanks", false));
+            details.setDefaultValue(node.getString("@defaultValue", null));
+            addDOMExtractDetails(details);
+        }
+    }
+
+    /**
+     * Writes this handler's configuration to XML: the two handler-level
+     * attributes, then one <code>dom</code> element per extraction.
+     */
+    @Override
+    protected void saveHandlerToXML(XML xml) {
+        xml.setAttribute("sourceCharset", sourceCharset);
+        xml.setAttribute("parser", parser);
+        for (DOMExtractDetails extraction : extractions) {
+            XML domXml = xml.addElement("dom");
+            domXml.setAttribute("selector", extraction.getSelector())
+                    .setAttribute("extract", extraction.getExtract())
+                    .setAttribute("matchBlanks", extraction.isMatchBlanks())
+                    .setAttribute(
+                            "defaultValue", extraction.getDefaultValue());
+        }
+    }
+
+ /**
+ * Reflection-based equality, delegating to
+ * <code>EqualsBuilder.reflectionEquals</code>.
+ */
+ @Override
+ public boolean equals(final Object other) {
+ return EqualsBuilder.reflectionEquals(this, other);
+ }
+ /**
+ * Reflection-based hash code, delegating to
+ * <code>HashCodeBuilder.reflectionHashCode</code>.
+ */
+ @Override
+ public int hashCode() {
+ return HashCodeBuilder.reflectionHashCode(this);
+ }
+ /**
+ * Reflection-based string representation using
+ * <code>ToStringStyle.SHORT_PREFIX_STYLE</code>.
+ */
+ @Override
+ public String toString() {
+ return new ReflectionToStringBuilder(
+ this, ToStringStyle.SHORT_PREFIX_STYLE).toString();
+ }
+
+ /**
+ * DOM extraction details: the selector identifying the elements to
+ * preserve, what to extract from each match, whether blank values
+ * count as matches, and an optional default value.
+ * @author Pascal Essiembre
+ */
+ public static class DOMExtractDetails {
+ private String selector;
+ private String extract;
+ private boolean matchBlanks;
+ private String defaultValue;
+
+ /**
+ * Creates extraction details with no selector and no extract type.
+ */
+ public DOMExtractDetails() {
+ }
+ /**
+ * Creates extraction details for the given selector, with no
+ * extract type.
+ * @param selector DOM selector
+ */
+ public DOMExtractDetails(
+ String selector) {
+ this(selector, null);
+ }
+ /**
+ * Creates extraction details for the given selector and extract type.
+ * @param selector DOM selector
+ * @param extract what to extract from matching elements
+ */
+ public DOMExtractDetails(String selector, String extract) {
+ this.selector = selector;
+ this.extract = extract;
+ }
+
+ /**
+ * Gets the DOM selector.
+ * @return DOM selector
+ */
+ public String getSelector() {
+ return selector;
+ }
+ /**
+ * Sets the DOM selector.
+ * @param selector DOM selector
+ * @return DOM extraction details
+ */
+ public DOMExtractDetails setSelector(String selector) {
+ this.selector = selector;
+ return this;
+ }
+
+ /**
+ * Gets what is extracted from matching elements.
+ * @return extract type
+ */
+ public String getExtract() {
+ return extract;
+ }
+ /**
+ * Sets what is extracted from matching elements.
+ * @param extract extract type
+ * @return DOM extraction details
+ */
+ public DOMExtractDetails setExtract(String extract) {
+ this.extract = extract;
+ return this;
+ }
+
+ /**
+ * Gets whether elements with blank values should be considered a
+ * match and have an empty string returned as opposed to nothing at all.
+ * Default is <code>false</code>;
+ * @return <code>true</code> if elements with blank values are supported
+ */
+ public boolean isMatchBlanks() {
+ return matchBlanks;
+ }
+ /**
+ * Sets whether elements with blank values should be considered a
+ * match and have an empty string returned as opposed to nothing at all.
+ * @param matchBlanks <code>true</code> to support elements with
+ * blank values
+ * @return DOM extraction details
+ */
+ public DOMExtractDetails setMatchBlanks(boolean matchBlanks) {
+ this.matchBlanks = matchBlanks;
+ return this;
+ }
+
+ /**
+ * Gets the value used when a selector has no match or a matched
+ * element's value does not qualify.
+ * @return default value, or <code>null</code> for none
+ */
+ public String getDefaultValue() {
+ return defaultValue;
+ }
+ /**
+ * Sets the value used when a selector has no match or a matched
+ * element's value does not qualify.
+ * @param defaultValue default value, or <code>null</code> for none
+ * @return DOM extraction details
+ */
+ public DOMExtractDetails setDefaultValue(String defaultValue) {
+ this.defaultValue = defaultValue;
+ return this;
+ }
+
+ /**
+ * Reflection-based equality, delegating to
+ * <code>EqualsBuilder.reflectionEquals</code>.
+ */
+ @Override
+ public boolean equals(final Object other) {
+ return EqualsBuilder.reflectionEquals(this, other);
+ }
+ /**
+ * Reflection-based hash code, delegating to
+ * <code>HashCodeBuilder.reflectionHashCode</code>.
+ */
+ @Override
+ public int hashCode() {
+ return HashCodeBuilder.reflectionHashCode(this);
+ }
+ /**
+ * Reflection-based string representation using
+ * <code>ToStringStyle.SHORT_PREFIX_STYLE</code>.
+ */
+ @Override
+ public String toString() {
+ return new ReflectionToStringBuilder(
+ this, ToStringStyle.SHORT_PREFIX_STYLE).toString();
+ }
+ }
+}
diff --git a/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.xsd b/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.xsd
new file mode 100644
index 00000000..090fecc7
--- /dev/null
+++ b/src/main/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformer.xsd
@@ -0,0 +1,60 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.java b/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.java
new file mode 100644
index 00000000..5abf1dfc
--- /dev/null
+++ b/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.java
@@ -0,0 +1,127 @@
+/* Copyright 2022 Norconex Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.norconex.importer.handler.transformer.impl;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import com.norconex.commons.lang.ResourceLoader;
+import com.norconex.commons.lang.map.Properties;
+import com.norconex.commons.lang.xml.XML;
+import com.norconex.importer.TestUtil;
+import com.norconex.importer.doc.DocMetadata;
+import com.norconex.importer.handler.ImporterHandlerException;
+import com.norconex.importer.handler.transformer.impl.DOMPreserveTransformer.DOMExtractDetails;
+import com.norconex.importer.parser.ParseState;
+
+/**
+ * @author Pascal Essiembre
+ * @since 3.0.1
+ */
+class DOMPreserveTransformerTest {
+
+ // Checks XML write/read of a configured transformer, then exercises
+ // removal of extraction details by selector and in bulk.
+ @Test
+ void testWriteRead() {
+ DOMPreserveTransformer t = new DOMPreserveTransformer();
+ t.setParser("xml");
+
+ // One extraction built via constructor, one via fluent setters.
+ DOMExtractDetails extract1 = new DOMExtractDetails("someTag", "text");
+ t.addDOMExtractDetails(extract1);
+ DOMExtractDetails extract2 = new DOMExtractDetails()
+ .setSelector("otherTag")
+ .setExtract("html");
+ t.addDOMExtractDetails(extract2);
+ t.setSourceCharset(StandardCharsets.ISO_8859_1.toString());
+
+ Assertions.assertEquals(2, t.getDOMExtractDetailsList().size());
+ Assertions.assertNotSame(extract1, extract2);
+ Assertions.assertNotEquals(extract1.toString(), extract2.toString());
+ Assertions.assertDoesNotThrow(() -> XML.assertWriteRead(t, "handler"));
+
+ // Removing by selector drops only the matching entry.
+ t.removeDOMExtractDetails(extract1.getSelector());
+ Assertions.assertEquals(1, t.getDOMExtractDetailsList().size());
+ t.removeDOMExtractDetailsList();
+ Assertions.assertEquals(0, t.getDOMExtractDetailsList().size());
+ }
+
+
+    // Runs four batches of extraction details against the test XML and
+    // checks the preserved content, one preserved value per line.
+    @Test
+    void testTransform() throws ImporterHandlerException, IOException {
+        DOMPreserveTransformer t = new DOMPreserveTransformer();
+        t.setParser("xml");
+
+        // Test batch #1
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: tag text
+                "parentA > childA1", "text"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: attribute
+                "parentB > childB1", "attr(name)"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // no match: use default
+                "parentD > childD1").setDefaultValue("Child D1"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // no match: no default
+                "parentE > childE1"));
+        Assertions.assertEquals("Child A1\nchild1\nChild D1", transform(t));
+
+        // Test batch #2
+        t.removeDOMExtractDetailsList();
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: tag html
+                "childA2", "html"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: tag outerHtml
+                "childA2", "outerHtml"));
+        // "html" yields the inner content only; "outerHtml" also includes
+        // the element's own tag.
+        Assertions.assertEquals("Child A2\n"
+                + "<childA2>Child A2</childA2>", transform(t));
+
+        // Test batch #3
+        t.removeDOMExtractDetailsList();
+        t.addDOMExtractDetails(new DOMExtractDetails( // no match: ownText
+                "parentA", "ownText"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: data
+                "parentB", "data"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: ownText
+                "parentC", "ownText"));
+        Assertions.assertEquals(
+                "I'm Data\nParent C Before Parent C After", transform(t));
+
+        // Test batch #4
+        t.removeDOMExtractDetailsList();
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: tagName
+                "[name=child1]", "tagName"));
+        t.addDOMExtractDetails(new DOMExtractDetails( // preserve: cssSelector
+                "childC", "cssSelector"));
+        Assertions.assertEquals("childB1\nchildC\n"
+                + "DOMTransformerTest > parentC > childC:nth-child(1)\n"
+                + "DOMTransformerTest > parentC > childC:nth-child(2)",
+                transform(t));
+    }
+
+ // Runs the given transformer, in pre-parse state, over the XML resource
+ // loaded for this test class and returns the output decoded as UTF-8.
+ private static String transform(DOMPreserveTransformer t)
+ throws IOException, ImporterHandlerException {
+ try (InputStream content =
+ ResourceLoader.getXmlStream(DOMPreserveTransformerTest.class);
+ ByteArrayOutputStream os = new ByteArrayOutputStream()) {
+ Properties metadata = new Properties();
+ // Content type must satisfy the transformer's DOM restrictions.
+ metadata.set(DocMetadata.CONTENT_TYPE, "application/xml");
+ t.transformDocument(TestUtil.toHandlerDoc(
+ "n/a", content, metadata), content, os, ParseState.PRE);
+ return os.toString(UTF_8.toString());
+ }
+ }
+}
diff --git a/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.xml b/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.xml
new file mode 100644
index 00000000..ac9ece03
--- /dev/null
+++ b/src/test/java/com/norconex/importer/handler/transformer/impl/DOMPreserveTransformerTest.xml
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+ Child A1
+ Child A2
+
+
+
+ Child B1
+ Child B2
+
+
+
+
+ Parent C Before
+ Twin C1
+ Twin C2
+ Parent C After
+
+
+