Skip to content

Commit

Permalink
Fixed "caseSensitive" flag sometimes having no effect in
RegexMetadataFilter and RegexReferenceFilter.
Browse files Browse the repository at this point in the history
  • Loading branch information
essiembre committed May 26, 2017
1 parent 30cbeb2 commit cfe026a
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 25 deletions.
4 changes: 2 additions & 2 deletions norconex-collector-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.norconex.collectors</groupId>
<artifactId>norconex-collector-core</artifactId>
<version>1.8.1</version>
<version>1.8.2-SNAPSHOT</version>
<name>Norconex Collector Core</name>

<properties>
Expand Down Expand Up @@ -65,7 +65,7 @@
<dependency>
<groupId>com.norconex.collectors</groupId>
<artifactId>norconex-importer</artifactId>
<version>2.7.1</version>
<version>2.7.2-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.norconex.collectors</groupId>
Expand Down
10 changes: 10 additions & 0 deletions norconex-collector-core/src/changes/changes.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@
</properties>
<body>

<release version="1.8.2-SNAPSHOT" date="2017-??-??" description="Bugfix release">
<action dev="danizen" type="update">
Dependency updates: Norconex Importer 2.7.2.
</action>
<action dev="essiembre" type="fix">
Fixed "caseSensitive" flag sometimes having no effect in
RegexMetadataFilter and RegexReferenceFilter.
</action>
</release>

<release version="1.8.1" date="2017-05-25" description="Maintenance release">
<action dev="essiembre,danizen" type="add">
MongoCrawlDataStore now supports specifying the MongoDB authentication
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,9 @@ public final void setExtensions(String extensions) {
this.extensions = extensions;
if (extensions != null) {
this.extensionParts = extensions.split("\\s*,\\s*");
for (int i = 0; i < this.extensionParts.length; i++)
for (int i = 0; i < this.extensionParts.length; i++) {
this.extensionParts[i] = this.extensionParts[i].trim();
}
} else {
this.extensionParts = ArrayUtils.EMPTY_STRING_ARRAY;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ public class RegexMetadataFilter extends AbstractOnMatchFilter
private boolean caseSensitive;
private String field;
private String regex;
private Pattern pattern;
private Pattern cachedPattern;

public RegexMetadataFilter() {
this(null, null, OnMatch.INCLUDE);
Expand Down Expand Up @@ -110,21 +110,14 @@ public String getField() {
}
/**
 * Sets whether the regular expression is matched case-sensitively.
 * The cached compiled pattern is cleared so that
 * {@code getCachedPattern()} recompiles it with the new flag the next
 * time it is called.
 *
 * @param caseSensitive {@code true} to match case-sensitively;
 *        {@code false} to have the pattern compiled with
 *        {@code CASE_INSENSITIVE} and {@code UNICODE_CASE}
 */
public final void setCaseSensitive(boolean caseSensitive) {
this.caseSensitive = caseSensitive;
// Invalidate the cache: the compiled flags depend on caseSensitive.
cachedPattern = null;
}
/**
 * Sets the name of the metadata field this filter reads its values from.
 *
 * @param header the field name, stored as {@code field}
 */
public final void setField(String header) {
this.field = header;
}
public final void setRegex(String regex) {
this.regex = regex;
if (regex != null) {
int flags = Pattern.DOTALL;
if (!caseSensitive) {
flags = flags | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
}
this.pattern = Pattern.compile(regex, flags);
} else {
this.pattern = Pattern.compile(".*");
}
cachedPattern = null;
}

@Override
Expand All @@ -135,13 +128,31 @@ public boolean acceptMetadata(String reference, Properties metadata) {
Collection<String> values = metadata.getStrings(field);
for (Object value : values) {
String strVal = Objects.toString(value, StringUtils.EMPTY);
if (pattern.matcher(strVal).matches()) {
if (getCachedPattern().matcher(strVal).matches()) {
return getOnMatch() == OnMatch.INCLUDE;
}
}
return getOnMatch() == OnMatch.EXCLUDE;
}

/**
 * Returns the compiled pattern for the current {@code regex} and
 * {@code caseSensitive} settings, compiling and caching it when no
 * cached instance is available.
 *
 * @return the compiled pattern; a match-anything {@code ".*"} pattern
 *         when {@code regex} is {@code null}
 */
private synchronized Pattern getCachedPattern() {
    Pattern compiled = cachedPattern;
    if (compiled == null) {
        if (regex != null) {
            // DOTALL is always applied; the case-folding flags are added
            // only when matching must ignore case.
            final int flags = caseSensitive
                    ? Pattern.DOTALL
                    : Pattern.DOTALL
                            | Pattern.CASE_INSENSITIVE
                            | Pattern.UNICODE_CASE;
            compiled = Pattern.compile(regex, flags);
        } else {
            compiled = Pattern.compile(".*");
        }
        cachedPattern = compiled;
    }
    return compiled;
}

@Override
public boolean acceptDocument(ImporterDocument document) {
if (document == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ public class RegexReferenceFilter extends AbstractOnMatchFilter implements

private boolean caseSensitive;
private String regex;
private Pattern pattern;
private Pattern cachedPattern;

public RegexReferenceFilter() {
this(null, OnMatch.INCLUDE);
Expand Down Expand Up @@ -103,18 +103,11 @@ public boolean isCaseSensitive() {
}
/**
 * Sets whether the regular expression is matched case-sensitively.
 * The cached compiled pattern is cleared so that
 * {@code getCachedPattern()} recompiles it with the new flag the next
 * time it is called.
 *
 * @param caseSensitive {@code true} to match case-sensitively;
 *        {@code false} to have the pattern compiled with
 *        {@code CASE_INSENSITIVE} and {@code UNICODE_CASE}
 */
public final void setCaseSensitive(boolean caseSensitive) {
this.caseSensitive = caseSensitive;
// Invalidate the cache: the compiled flags depend on caseSensitive.
cachedPattern = null;
}
public final void setRegex(String regex) {
this.regex = regex;
if (regex != null) {
int flags = Pattern.DOTALL;
if (!caseSensitive) {
flags = flags | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
}
this.pattern = Pattern.compile(regex, flags);
} else {
this.pattern = Pattern.compile(".*");
}
cachedPattern = null;
}

@Override
Expand All @@ -123,9 +116,27 @@ public boolean acceptReference(String url) {
if (StringUtils.isBlank(regex)) {
return isInclude;
}
boolean matches = pattern.matcher(url).matches();
boolean matches = getCachedPattern().matcher(url).matches();
return matches && isInclude || !matches && !isInclude;
}

/**
 * Returns the compiled pattern for the current {@code regex} and
 * {@code caseSensitive} settings, compiling and caching it when no
 * cached instance is available.
 *
 * @return the compiled pattern; a match-anything {@code ".*"} pattern
 *         when {@code regex} is {@code null}
 */
private synchronized Pattern getCachedPattern() {
    Pattern compiled = cachedPattern;
    if (compiled == null) {
        if (regex != null) {
            // DOTALL is always applied; the case-folding flags are added
            // only when matching must ignore case.
            final int flags = caseSensitive
                    ? Pattern.DOTALL
                    : Pattern.DOTALL
                            | Pattern.CASE_INSENSITIVE
                            | Pattern.UNICODE_CASE;
            compiled = Pattern.compile(regex, flags);
        } else {
            compiled = Pattern.compile(".*");
        }
        cachedPattern = compiled;
    }
    return compiled;
}

@Override
public void loadFromXML(Reader in) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
*/
package com.norconex.collector.core.filter.impl;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;

import org.junit.Test;
Expand All @@ -22,6 +25,24 @@
import com.norconex.importer.handler.filter.OnMatch;

public class RegexReferenceFilterTest {

/**
 * Verifies that changing the case-sensitivity flag after the regex has
 * been set still takes effect on subsequent matches.
 */
@Test
public void testCaseSensitivity() throws IOException {
    final RegexReferenceFilter filter = new RegexReferenceFilter();
    filter.setOnMatch(OnMatch.INCLUDE);
    filter.setRegex("case");

    // Case-insensitive: references are accepted whatever their casing.
    filter.setCaseSensitive(false);
    assertTrue(filter.acceptReference("case"));
    assertTrue(filter.acceptReference("CASE"));

    // Case-sensitive: only the reference with identical casing is accepted.
    filter.setCaseSensitive(true);
    assertTrue(filter.acceptReference("case"));
    assertFalse(filter.acceptReference("CASE"));
}


@Test
public void testWriteRead() throws IOException {
Expand Down

0 comments on commit cfe026a

Please sign in to comment.