Skip to content

Commit

Permalink
Releases crawler4j 4.7.2 (make langdetect optional)
Browse files Browse the repository at this point in the history
  • Loading branch information
rzo1 committed Oct 27, 2021
1 parent 1576272 commit be5f68f
Show file tree
Hide file tree
Showing 16 changed files with 79 additions and 25 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Add the following dependency to your pom.xml:
<dependency>
<groupId>de.hs-heilbronn.mi</groupId>
<artifactId>crawler4j-with-sleepycat</artifactId>
<version>4.7.1</version>
<version>4.7.2</version>
<type>pom</type>
</dependency>
```
Expand Down
2 changes: 1 addition & 1 deletion crawler4j-boms/crawler4j-with-hsqldb/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<artifactId>crawler4j-boms</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion crawler4j-boms/crawler4j-with-sleepycat/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<artifactId>crawler4j-boms</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
Expand Down
2 changes: 1 addition & 1 deletion crawler4j-boms/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<artifactId>crawler4j-parent</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion crawler4j-commons/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<artifactId>crawler4j-parent</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,12 @@ public class CrawlConfig {
*/
private CookieStore cookieStore;

/**
* Used to enable language detection via the tika-langdetect module.
* If set to {@code true}, additional runtime dependencies (tika-langdetect-optimaize) are required.
*/
private boolean languageDetection;

/**
* DNS resolver to use, {@link SystemDefaultDnsResolver} is default.
*/
Expand Down Expand Up @@ -733,6 +739,14 @@ public void setBatchReadSize(int batchReadSize) {
this.batchReadSize = batchReadSize;
}

/**
 * Returns the language detection flag of this configuration.
 *
 * @return the current value of the {@code languageDetection} field
 */
public boolean isLanguageDetection() {
return languageDetection;
}

/**
 * Sets the language detection flag of this configuration.
 *
 * @param languageDetection the new value stored in the {@code languageDetection} field
 */
public void setLanguageDetection(boolean languageDetection) {
this.languageDetection = languageDetection;
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
Expand Down Expand Up @@ -762,6 +776,7 @@ public String toString() {
sb.append("Halt on error: " + isHaltOnError() + "\n");
sb.append("Allow single level domain:" + isAllowSingleLevelDomain() + "\n");
sb.append("Batch read size: " + getBatchReadSize() + "\n");
sb.append("Language Detection enabled:: " + isLanguageDetection() + "\n");
return sb.toString();
}
}
7 changes: 2 additions & 5 deletions crawler4j-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<artifactId>crawler4j-parent</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>

<artifactId>crawler4j-core</artifactId>
Expand Down Expand Up @@ -209,15 +209,12 @@
<groupId>org.apache.tika</groupId>
<artifactId>tika-langdetect-optimaize</artifactId>
<version>${apache.tika.version}</version>
<optional>true</optional>
<exclusions>
<exclusion>
<groupId>com.intellij</groupId>
<artifactId>annotations</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@

import crawlercommons.filters.basic.BasicURLNormalizer;
import edu.uci.ics.crawler4j.url.WebURLFactory;
import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageDetector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -46,7 +44,7 @@ public class Parser {

private final HtmlParser htmlContentParser;

private final LanguageDetector languageDetector;
private TikaLanguageDetector languageDetector;

private final Net net;
private final WebURLFactory factory;
Expand All @@ -60,9 +58,10 @@ public Parser(CrawlConfig config, BasicURLNormalizer normalizer, HtmlParser html
this.htmlContentParser = htmlParser;
this.net = new Net(config, tldList, webURLFactory);
this.factory = webURLFactory;
this.languageDetector = new OptimaizeLangDetector();
this.languageDetector.loadModels();
this.normalizer = normalizer;
if(config.isLanguageDetection()) {
this.languageDetector = new TikaLanguageDetector();
}
}

public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
Expand Down Expand Up @@ -128,11 +127,14 @@ public void parse(Page page, String contextURL) throws NotAllowedContentExceptio
page.setContentCharset(parsedData.getContentCharset());
}

// Please note that identifying language takes less than 10 milliseconds
page.setLanguage(languageDetector.detect(parsedData.getText()).getLanguage());
if(config.isLanguageDetection()) {
// Please note that identifying language takes less than 10 milliseconds
page.setLanguage(languageDetector.detect(parsedData.getText()));
} else {
page.setLanguage("");
}

page.setParseData(parsedData);

}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*-
* #%L
* de.hs-heilbronn.mi:crawler4j-core
* %%
* Copyright (C) 2010 - 2021 crawler4j-fork
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package edu.uci.ics.crawler4j.parser;

import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageResult;

import java.io.IOException;

/**
 * Small wrapper around Tika's {@link OptimaizeLangDetector} that reduces a
 * detection result to its language string.
 *
 * <p>Keeping the Tika language-detection types confined to this class means that
 * callers only reference them when they actually instantiate this wrapper.
 */
public class TikaLanguageDetector {

    /** Underlying Tika detector; its models are loaded once by the constructor. */
    private final LanguageDetector languageDetector;

    /**
     * Creates the underlying detector and loads its language models.
     *
     * @throws IOException if loading the language models fails
     */
    public TikaLanguageDetector() throws IOException {
        this.languageDetector = new OptimaizeLangDetector();
        this.languageDetector.loadModels();
    }

    /**
     * Detects the language of the given text.
     *
     * @param text the text to analyse; may be {@code null}
     * @return the language reported by the underlying detector, or an empty
     *         string if {@code text} is {@code null}
     */
    public String detect(CharSequence text) {
        // Guard against missing text instead of letting the underlying detector
        // fail with a NullPointerException; the empty string is the same
        // placeholder the parser sets when language detection is disabled.
        if (text == null) {
            return "";
        }
        return languageDetector.detect(text).getLanguage();
    }
}
2 changes: 1 addition & 1 deletion crawler4j-examples/crawler4j-examples-base/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>crawler4j-examples</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>
<artifactId>crawler4j-examples-base</artifactId>
<name>${project.groupId}:${project.artifactId}</name>
Expand Down
2 changes: 1 addition & 1 deletion crawler4j-examples/crawler4j-examples-postgres/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>crawler4j-examples</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>
<name>${project.groupId}:${project.artifactId}</name>
<artifactId>crawler4j-examples-postgres</artifactId>
Expand Down
2 changes: 1 addition & 1 deletion crawler4j-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>crawler4j-parent</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>

<name>${project.groupId}:${project.artifactId}</name>
Expand Down
2 changes: 1 addition & 1 deletion crawler4j-frontier/crawler4j-frontier-hsqldb/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<artifactId>crawler4j-frontier</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion crawler4j-frontier/crawler4j-frontier-sleepycat/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<artifactId>crawler4j-frontier</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion crawler4j-frontier/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<artifactId>crawler4j-parent</artifactId>
<groupId>de.hs-heilbronn.mi</groupId>
<version>4.7.1</version>
<version>4.7.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<groupId>de.hs-heilbronn.mi</groupId>
<artifactId>crawler4j-parent</artifactId>
<packaging>pom</packaging>
<version>4.7.1</version>
<version>4.7.2</version>
<name>${project.groupId}:${project.artifactId}</name>

<description>Open Source Web Crawler for Java</description>
Expand Down

0 comments on commit be5f68f

Please sign in to comment.