Skip to content

Commit

Permalink
Removes Google Guava
Browse files: browse the repository at this point in the history
Replaces "public-suffix-list" with crawler-commons
  • Loading branch information
rzo1 committed Oct 27, 2021
1 parent b0b458b commit c0da390
Show file tree
Hide file tree
Showing 11 changed files with 30 additions and 71 deletions.
22 changes: 5 additions & 17 deletions crawler4j-commons/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
<name>${project.groupId}:${project.artifactId}</name>

<dependencies>
<dependency>
<groupId>com.github.crawler-commons</groupId>
<artifactId>crawler-commons</artifactId>
<version>${crawler-commons.version}</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents.client5</groupId>
<artifactId>httpclient5</artifactId>
Expand Down Expand Up @@ -43,23 +48,6 @@
<artifactId>url-detector</artifactId>
<version>${url-detector.version}</version>
</dependency>
<dependency>
<groupId>de.malkusch.whois-server-list</groupId>
<artifactId>public-suffix-list</artifactId>
<version>${public.suffix.list.version}</version>
<exclusions>
<exclusion>
<!-- we already have a newer version (via Guava) of it in the classpath -->
<groupId>com.google.code.findbugs</groupId>
<artifactId>jsr305</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<!-- Logging API -->
<dependency>
<groupId>org.slf4j</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,8 @@
import java.io.InputStream;
import java.net.URL;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import crawlercommons.domains.EffectiveTldFinder;

import com.google.common.net.InternetDomainName;

import de.malkusch.whoisServerList.publicSuffixList.PublicSuffixList;
import de.malkusch.whoisServerList.publicSuffixList.PublicSuffixListFactory;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;

/**
Expand All @@ -40,16 +35,8 @@
*/
public class TLDList {

@SuppressWarnings("unused")
private final Logger logger = LoggerFactory.getLogger(TLDList.class);

private final boolean onlineUpdate;

private PublicSuffixList publicSuffixList;

public TLDList(CrawlConfig config) throws IOException {
this.onlineUpdate = config.isOnlineTldListUpdate();
if (onlineUpdate) {
if (config.isOnlineTldListUpdate()) {
InputStream stream;
String filename = config.getPublicSuffixLocalFile();
if (filename == null) {
Expand All @@ -59,26 +46,18 @@ public TLDList(CrawlConfig config) throws IOException {
stream = new FileInputStream(filename);
}
try {
this.publicSuffixList = new PublicSuffixListFactory().build(stream);
EffectiveTldFinder.getInstance().initialize(stream);
} finally {
stream.close();
}
}
}

public boolean contains(String domain) {
if (onlineUpdate) {
return publicSuffixList.isPublicSuffix(domain);
} else {
return InternetDomainName.from(domain).isPublicSuffix();
}
return EffectiveTldFinder.getAssignedDomain(domain) != null;
}

public boolean isRegisteredDomain(String domain) {
if (onlineUpdate) {
return publicSuffixList.isRegistrable(domain);
} else {
return InternetDomainName.from(domain).isTopPrivateDomain();
}
return EffectiveTldFinder.getEffectiveTLD(domain) != null;
}
}
4 changes: 4 additions & 0 deletions crawler4j-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,10 @@
<artifactId>tika-langdetect-optimaize</artifactId>
<version>${apache.tika.version}</version>
<exclusions>
<exclusion>
<groupId>com.intellij</groupId>
<artifactId>annotations</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Collections;
import java.util.List;
import java.util.UUID;
import java.util.regex.Pattern;

import com.google.common.collect.ImmutableList;
import com.google.common.io.Files;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.BinaryParseData;
Expand All @@ -52,7 +53,7 @@ public class ImageCrawler extends WebCrawler {

public ImageCrawler(File storageFolder, List<String> crawlDomains) {
this.storageFolder = storageFolder;
this.crawlDomains = ImmutableList.copyOf(crawlDomains);
this.crawlDomains = Collections.unmodifiableList(crawlDomains);
}

@Override
Expand Down Expand Up @@ -92,7 +93,7 @@ public void visit(Page page) {
// Store image
String filename = storageFolder.getAbsolutePath() + '/' + hashedName;
try {
Files.write(page.getContentData(), new File(filename));
Files.write(Paths.get(filename), page.getContentData(), StandardOpenOption.CREATE_NEW);
WebCrawler.logger.info("Stored: {}", url);
} catch (IOException iox) {
WebCrawler.logger.error("Failed to write file: {}", filename, iox);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@
*/
package edu.uci.ics.crawler4j.examples.multiple;

import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.ImmutableList;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
Expand All @@ -43,7 +42,7 @@ public class BasicCrawler extends WebCrawler {
private final List<String> myCrawlDomains;

public BasicCrawler(List<String> myCrawlDomains) {
this.myCrawlDomains = ImmutableList.copyOf(myCrawlDomains);
this.myCrawlDomains = Collections.unmodifiableList(myCrawlDomains);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,9 @@
import edu.uci.ics.crawler4j.frontier.FrontierConfiguration;
import edu.uci.ics.crawler4j.frontier.SleepycatFrontierConfiguration;
import edu.uci.ics.crawler4j.url.SleepycatWebURLFactory;
import edu.uci.ics.crawler4j.url.WebURLFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.ImmutableList;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
Expand Down Expand Up @@ -75,8 +72,8 @@ public static void main(String[] args) throws Exception {
CrawlController controller1 = new CrawlController(config1, normalizer1, pageFetcher1, robotstxtServer, frontierConfiguration);
CrawlController controller2 = new CrawlController(config2, normalizer2, pageFetcher2, robotstxtServer, frontierConfiguration2);

List<String> crawler1Domains = ImmutableList.of("https://www.ics.uci.edu/", "https://www.cnn.com/");
List<String> crawler2Domains = ImmutableList.of("https://en.wikipedia.org/");
List<String> crawler1Domains = List.of("https://www.ics.uci.edu/", "https://www.cnn.com/");
List<String> crawler2Domains = List.of("https://en.wikipedia.org/");

controller1.addSeed("https://www.ics.uci.edu/");
controller1.addSeed("https://www.cnn.com/");
Expand Down
6 changes: 0 additions & 6 deletions crawler4j-examples/crawler4j-examples-postgres/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,6 @@
<artifactId>flyway-core</artifactId>
<version>${flyway.db.version}</version>
</dependency>
<dependency>
<!-- Google's core Java libraries -->
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import edu.uci.ics.crawler4j.url.SleepycatWebURLFactory;
import org.flywaydb.core.Flyway;

import com.google.common.io.Files;
import com.mchange.v2.c3p0.ComboPooledDataSource;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
Expand All @@ -35,11 +34,14 @@
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

import java.nio.file.Files;

public class SampleLauncher {

public static void main(String[] args) throws Exception {

String crawlStorageFolder = Files.createTempDir().getAbsolutePath();
String crawlStorageFolder = Files.createTempDirectory("crawler4j-").toAbsolutePath().toString();

final int numberOfCrawlers = Integer.parseInt(args[2]);

CrawlConfig config = new CrawlConfig();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
*/
package de.hshn.mi.crawler4j.url;

import com.google.common.net.InternetDomainName;
import edu.uci.ics.crawler4j.url.TLDList;
import edu.uci.ics.crawler4j.url.WebURL;

Expand Down Expand Up @@ -78,7 +77,7 @@ public void setURL(String url) {
String domain = url.substring(domainStartIdx, domainEndIdx);
registeredDomain = domain;
subDomain = "";
if (tldList != null && !(domain.isEmpty()) && InternetDomainName.isValid(domain)) {
if (tldList != null && !(domain.isEmpty())) {
String candidate = null;
String rd = null;
String sd = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

import java.util.Map;

import com.google.common.net.InternetDomainName;
import com.sleepycat.persist.model.Entity;
import com.sleepycat.persist.model.PrimaryKey;

Expand Down Expand Up @@ -86,7 +85,7 @@ public void setURL(String url) {
String domain = url.substring(domainStartIdx, domainEndIdx);
registeredDomain = domain;
subDomain = "";
if (tldList != null && !(domain.isEmpty()) && InternetDomainName.isValid(domain)) {
if (tldList != null && !(domain.isEmpty())) {
String candidate = null;
String rd = null;
String sd = null;
Expand Down
3 changes: 0 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,6 @@
<apache.http.core.h2.version>5.1.1</apache.http.core.h2.version>

<apache.tika.version>2.1.0</apache.tika.version>
<guava.version>31.0.1-jre</guava.version>
<!-- XXX replace with crawler commons -->
<public.suffix.list.version>2.2.0</public.suffix.list.version>
<!-- XXX replace with crawler commons -->
<url-detector.version>0.1.23</url-detector.version>
<crawler-commons.version>1.2</crawler-commons.version>
Expand Down

0 comments on commit c0da390

Please sign in to comment.