From c0da3907637280ae1d799e1e18995f277252fdf1 Mon Sep 17 00:00:00 2001
From: Richard Zowalla
Date: Wed, 27 Oct 2021 14:38:56 +0200
Subject: [PATCH] Removes Google Guava

Replaces "public-suffix-list" with crawler-commons
---
 crawler4j-commons/pom.xml                     | 22 +++----------
 .../edu/uci/ics/crawler4j/url/TLDList.java    | 31 +++----------------
 crawler4j-core/pom.xml                        |  4 +++
 .../examples/imagecrawler/ImageCrawler.java   | 11 ++++---
 .../examples/multiple/BasicCrawler.java       |  5 ++-
 .../multiple/MultipleCrawlerController.java   |  7 ++---
 .../crawler4j-examples-postgres/pom.xml       |  6 ----
 .../crawler4j/examples/SampleLauncher.java    |  6 ++--
 .../mi/crawler4j/url/HSQLDBWebURLImpl.java    |  3 +-
 .../edu/uci/ics/crawler4j/url/WebURLImpl.java |  3 +-
 pom.xml                                       |  3 ---
 11 files changed, 30 insertions(+), 71 deletions(-)

diff --git a/crawler4j-commons/pom.xml b/crawler4j-commons/pom.xml
index d12efd943..ba519186a 100644
--- a/crawler4j-commons/pom.xml
+++ b/crawler4j-commons/pom.xml
@@ -13,6 +13,11 @@
     <name>${project.groupId}:${project.artifactId}</name>
 
     <dependencies>
+        <dependency>
+            <groupId>com.github.crawler-commons</groupId>
+            <artifactId>crawler-commons</artifactId>
+            <version>${crawler-commons.version}</version>
+        </dependency>
         <dependency>
             <groupId>org.apache.httpcomponents.client5</groupId>
             <artifactId>httpclient5</artifactId>
@@ -43,23 +48,6 @@
             <artifactId>url-detector</artifactId>
             <version>${url-detector.version}</version>
         </dependency>
-        <dependency>
-            <groupId>de.malkusch.whois-server-list</groupId>
-            <artifactId>public-suffix-list</artifactId>
-            <version>${public.suffix.list.version}</version>
-
-            <exclusions>
-                <exclusion>
-                    <groupId>com.google.code.findbugs</groupId>
-                    <artifactId>jsr305</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-        <dependency>
-            <groupId>com.google.guava</groupId>
-            <artifactId>guava</artifactId>
-            <version>${guava.version}</version>
-        </dependency>
         <dependency>
             <groupId>org.slf4j</groupId>
diff --git a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/url/TLDList.java b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/url/TLDList.java
index be86f0f75..0045cc069 100644
--- a/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/url/TLDList.java
+++ b/crawler4j-commons/src/main/java/edu/uci/ics/crawler4j/url/TLDList.java
@@ -24,13 +24,8 @@
 import java.io.InputStream;
 import java.net.URL;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import crawlercommons.domains.EffectiveTldFinder;
 
-import com.google.common.net.InternetDomainName;
-
-import de.malkusch.whoisServerList.publicSuffixList.PublicSuffixList;
-import de.malkusch.whoisServerList.publicSuffixList.PublicSuffixListFactory;
 import edu.uci.ics.crawler4j.crawler.CrawlConfig;
 
 /**
@@ -40,16 +35,8 @@
  */
 public class TLDList {
 
-    @SuppressWarnings("unused")
-    private final Logger logger = LoggerFactory.getLogger(TLDList.class);
-
-    private final boolean onlineUpdate;
-
-    private PublicSuffixList publicSuffixList;
-
     public TLDList(CrawlConfig config) throws IOException {
-        this.onlineUpdate = config.isOnlineTldListUpdate();
-        if (onlineUpdate) {
+        if (config.isOnlineTldListUpdate()) {
             InputStream stream;
             String filename = config.getPublicSuffixLocalFile();
             if (filename == null) {
@@ -59,7 +46,7 @@ public TLDList(CrawlConfig config) throws IOException {
                 stream = new FileInputStream(filename);
             }
             try {
-                this.publicSuffixList = new PublicSuffixListFactory().build(stream);
+                EffectiveTldFinder.getInstance().initialize(stream);
             } finally {
                 stream.close();
             }
@@ -67,18 +54,10 @@ public TLDList(CrawlConfig config) throws IOException {
     }
 
     public boolean contains(String domain) {
-        if (onlineUpdate) {
-            return publicSuffixList.isPublicSuffix(domain);
-        } else {
-            return InternetDomainName.from(domain).isPublicSuffix();
-        }
+        return EffectiveTldFinder.getAssignedDomain(domain) != null;
    }
 
     public boolean isRegisteredDomain(String domain) {
-        if (onlineUpdate) {
-            return publicSuffixList.isRegistrable(domain);
-        } else {
-            return InternetDomainName.from(domain).isTopPrivateDomain();
-        }
+        return EffectiveTldFinder.getEffectiveTLD(domain) != null;
     }
 }
diff --git a/crawler4j-core/pom.xml b/crawler4j-core/pom.xml
index 704e212a9..a733dae0d 100644
--- a/crawler4j-core/pom.xml
+++ b/crawler4j-core/pom.xml
@@ -210,6 +210,10 @@
             <artifactId>tika-langdetect-optimaize</artifactId>
             <version>${apache.tika.version}</version>
         </dependency>
+        <dependency>
+            <groupId>com.intellij</groupId>
+            <artifactId>annotations</artifactId>
+        </dependency>
         <dependency>
             <groupId>com.google.guava</groupId>
             <artifactId>guava</artifactId>
diff --git a/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java b/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java
index f134c98c5..d253e5216 100644
--- a/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java
+++ b/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java
@@ -21,13 +21,14 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.Collections;
 import java.util.List;
 import java.util.UUID;
 import java.util.regex.Pattern;
 
-import com.google.common.collect.ImmutableList;
-import com.google.common.io.Files;
-
 import edu.uci.ics.crawler4j.crawler.Page;
 import edu.uci.ics.crawler4j.crawler.WebCrawler;
 import edu.uci.ics.crawler4j.parser.BinaryParseData;
@@ -52,7 +53,7 @@ public class ImageCrawler extends WebCrawler {
 
     public ImageCrawler(File storageFolder, List<String> crawlDomains) {
         this.storageFolder = storageFolder;
-        this.crawlDomains = ImmutableList.copyOf(crawlDomains);
+        this.crawlDomains = Collections.unmodifiableList(crawlDomains);
     }
 
     @Override
@@ -92,7 +93,7 @@ public void visit(Page page) {
             // Store image
             String filename = storageFolder.getAbsolutePath() + '/' + hashedName;
             try {
-                Files.write(page.getContentData(), new File(filename));
+                Files.write(Paths.get(filename), page.getContentData(), StandardOpenOption.CREATE_NEW);
                 WebCrawler.logger.info("Stored: {}", url);
             } catch (IOException iox) {
                 WebCrawler.logger.error("Failed to write file: {}", filename, iox);
diff --git a/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/BasicCrawler.java b/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/BasicCrawler.java
index 3004eec3f..d9cd14f89 100644
--- a/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/BasicCrawler.java
+++ b/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/BasicCrawler.java
@@ -19,6 +19,7 @@
  */
 package edu.uci.ics.crawler4j.examples.multiple;
 
+import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 import java.util.regex.Pattern;
@@ -26,8 +27,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.common.collect.ImmutableList;
-
 import edu.uci.ics.crawler4j.crawler.Page;
 import edu.uci.ics.crawler4j.crawler.WebCrawler;
 import edu.uci.ics.crawler4j.parser.HtmlParseData;
@@ -43,7 +42,7 @@ public class BasicCrawler extends WebCrawler {
     private final List<String> myCrawlDomains;
 
     public BasicCrawler(List<String> myCrawlDomains) {
-        this.myCrawlDomains = ImmutableList.copyOf(myCrawlDomains);
+        this.myCrawlDomains = Collections.unmodifiableList(myCrawlDomains);
     }
 
     @Override
diff --git a/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/MultipleCrawlerController.java b/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/MultipleCrawlerController.java
index f47b17229..3794f50a1 100644
--- a/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/MultipleCrawlerController.java
+++ b/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/MultipleCrawlerController.java
@@ -25,12 +25,9 @@
 import edu.uci.ics.crawler4j.frontier.FrontierConfiguration;
 import edu.uci.ics.crawler4j.frontier.SleepycatFrontierConfiguration;
 import edu.uci.ics.crawler4j.url.SleepycatWebURLFactory;
-import edu.uci.ics.crawler4j.url.WebURLFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.common.collect.ImmutableList;
-
 import edu.uci.ics.crawler4j.crawler.CrawlConfig;
 import edu.uci.ics.crawler4j.crawler.CrawlController;
 import edu.uci.ics.crawler4j.fetcher.PageFetcher;
@@ -75,8 +72,8 @@ public static void main(String[] args) throws Exception {
         CrawlController controller1 = new CrawlController(config1, normalizer1, pageFetcher1, robotstxtServer, frontierConfiguration);
         CrawlController controller2 = new CrawlController(config2, normalizer2, pageFetcher2, robotstxtServer, frontierConfiguration2);
 
-        List<String> crawler1Domains = ImmutableList.of("https://www.ics.uci.edu/", "https://www.cnn.com/");
-        List<String> crawler2Domains = ImmutableList.of("https://en.wikipedia.org/");
+        List<String> crawler1Domains = List.of("https://www.ics.uci.edu/", "https://www.cnn.com/");
+        List<String> crawler2Domains = List.of("https://en.wikipedia.org/");
 
         controller1.addSeed("https://www.ics.uci.edu/");
         controller1.addSeed("https://www.cnn.com/");
diff --git a/crawler4j-examples/crawler4j-examples-postgres/pom.xml b/crawler4j-examples/crawler4j-examples-postgres/pom.xml
index 8728e3ba1..c5bb7c866 100644
--- a/crawler4j-examples/crawler4j-examples-postgres/pom.xml
+++ b/crawler4j-examples/crawler4j-examples-postgres/pom.xml
@@ -37,12 +37,6 @@
             <artifactId>flyway-core</artifactId>
             <version>${flyway.db.version}</version>
         </dependency>
-
-        <dependency>
-            <groupId>com.google.guava</groupId>
-            <artifactId>guava</artifactId>
-            <version>${guava.version}</version>
-        </dependency>
         <dependency>
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
diff --git a/crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/SampleLauncher.java b/crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/SampleLauncher.java
index 4056c2508..24d0c8f05 100644
--- a/crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/SampleLauncher.java
+++ b/crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/SampleLauncher.java
@@ -25,7 +25,6 @@
 import edu.uci.ics.crawler4j.url.SleepycatWebURLFactory;
 import org.flywaydb.core.Flyway;
 
-import com.google.common.io.Files;
 import com.mchange.v2.c3p0.ComboPooledDataSource;
 
 import edu.uci.ics.crawler4j.crawler.CrawlConfig;
@@ -35,11 +34,14 @@
 import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
 import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
 
+import java.nio.file.Files;
+
 public class SampleLauncher {
 
     public static void main(String[] args) throws Exception {
-        String crawlStorageFolder = Files.createTempDir().getAbsolutePath();
+        String crawlStorageFolder = Files.createTempDirectory("crawler4j-").toAbsolutePath().toString();
+
         final int numberOfCrawlers = Integer.parseInt(args[2]);
 
         CrawlConfig config = new CrawlConfig();
diff --git a/crawler4j-frontier/crawler4j-frontier-hsqldb/src/main/java/de/hshn/mi/crawler4j/url/HSQLDBWebURLImpl.java b/crawler4j-frontier/crawler4j-frontier-hsqldb/src/main/java/de/hshn/mi/crawler4j/url/HSQLDBWebURLImpl.java
index 464d47683..52e027728 100644
--- a/crawler4j-frontier/crawler4j-frontier-hsqldb/src/main/java/de/hshn/mi/crawler4j/url/HSQLDBWebURLImpl.java
+++ b/crawler4j-frontier/crawler4j-frontier-hsqldb/src/main/java/de/hshn/mi/crawler4j/url/HSQLDBWebURLImpl.java
@@ -19,7 +19,6 @@
  */
 package de.hshn.mi.crawler4j.url;
 
-import com.google.common.net.InternetDomainName;
 import edu.uci.ics.crawler4j.url.TLDList;
 import edu.uci.ics.crawler4j.url.WebURL;
 
@@ -78,7 +77,7 @@ public void setURL(String url) {
             String domain = url.substring(domainStartIdx, domainEndIdx);
             registeredDomain = domain;
             subDomain = "";
-            if (tldList != null && !(domain.isEmpty()) && InternetDomainName.isValid(domain)) {
+            if (tldList != null && !(domain.isEmpty())) {
                 String candidate = null;
                 String rd = null;
                 String sd = null;
diff --git a/crawler4j-frontier/crawler4j-frontier-sleepycat/src/main/java/edu/uci/ics/crawler4j/url/WebURLImpl.java b/crawler4j-frontier/crawler4j-frontier-sleepycat/src/main/java/edu/uci/ics/crawler4j/url/WebURLImpl.java
index e938d4974..ad6f5d11f 100644
--- a/crawler4j-frontier/crawler4j-frontier-sleepycat/src/main/java/edu/uci/ics/crawler4j/url/WebURLImpl.java
+++ b/crawler4j-frontier/crawler4j-frontier-sleepycat/src/main/java/edu/uci/ics/crawler4j/url/WebURLImpl.java
@@ -21,7 +21,6 @@
 
 import java.util.Map;
 
-import com.google.common.net.InternetDomainName;
 import com.sleepycat.persist.model.Entity;
 import com.sleepycat.persist.model.PrimaryKey;
 
@@ -86,7 +85,7 @@ public void setURL(String url) {
             String domain = url.substring(domainStartIdx, domainEndIdx);
             registeredDomain = domain;
             subDomain = "";
-            if (tldList != null && !(domain.isEmpty()) && InternetDomainName.isValid(domain)) {
+            if (tldList != null && !(domain.isEmpty())) {
                 String candidate = null;
                 String rd = null;
                 String sd = null;
diff --git a/pom.xml b/pom.xml
index 2f279395b..f2a2065b6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -103,9 +103,6 @@
         5.1.1
         2.1.0
 
-        <guava.version>31.0.1-jre</guava.version>
-
-        <public.suffix.list.version>2.2.0</public.suffix.list.version>
        0.1.23
        1.2
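
The heart of the change is in TLDList: both public-suffix checks now delegate to crawler-commons' EffectiveTldFinder, which is either left on its bundled suffix list or re-initialized from the stream read in the constructor. Below is a minimal sketch of the calls the patched class relies on; only the method names come from the diff above, while the host name and printed values are illustrative assumptions.

import crawlercommons.domains.EffectiveTldFinder;

public class PublicSuffixSketch {
    public static void main(String[] args) {
        // Registered ("paid-level") domain of a host name, resolved against the
        // public suffix list; the patched TLDList#contains(...) is this null check.
        String assigned = EffectiveTldFinder.getAssignedDomain("www.example.co.uk");
        boolean contains = assigned != null;

        // Effective TLD (public suffix) lookup; the patched
        // TLDList#isRegisteredDomain(...) is this null check.
        boolean registered = EffectiveTldFinder.getEffectiveTLD("www.example.co.uk") != null;

        System.out.println(assigned + " / contains=" + contains + " / registered=" + registered);
    }
}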
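
The remaining Guava usages in the examples fall back on JDK equivalents: Collections.unmodifiableList and List.of replace ImmutableList, and java.nio.file.Files replaces Guava's Files. A compact sketch of those replacements, assuming nothing beyond the standard library (paths and values are placeholders):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Collections;
import java.util.List;

public class GuavaToJdkSketch {
    public static void main(String[] args) throws IOException {
        // ImmutableList.of(...) -> List.of(...)
        List<String> seeds = List.of("https://www.ics.uci.edu/", "https://en.wikipedia.org/");

        // ImmutableList.copyOf(list) -> Collections.unmodifiableList(list)
        // (an unmodifiable view over the caller's list, not a defensive copy)
        List<String> domains = Collections.unmodifiableList(seeds);

        // Guava Files.createTempDir() -> java.nio.file.Files.createTempDirectory(prefix)
        Path storage = Files.createTempDirectory("crawler4j-");

        // Guava Files.write(bytes, file) -> java.nio.file.Files.write(path, bytes, options);
        // CREATE_NEW fails if the target already exists instead of silently overwriting it.
        Files.write(storage.resolve("page.bin"), new byte[] {1, 2, 3}, StandardOpenOption.CREATE_NEW);

        System.out.println(domains.size() + " seed(s), stored under " + storage);
    }
}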