diff --git a/README.md b/README.md index 4dd692e..408e14f 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Add the following dependency to your pom.xml: com.github.peterbencze serritor - 1.1 + 1.2 ``` @@ -26,38 +26,58 @@ See the [Wiki](https://github.com/peterbencze/serritor/wiki) page. BaseCrawler provides a skeletal implementation of a crawler to minimize the effort to create your own. First, create a class that extends BaseCrawler. In this class, you can customize the behavior of your crawler. There are callbacks available for every stage of crawling. Below you can find a sample implementation: ```java public class MyCrawler extends BaseCrawler { - + public MyCrawler() { - config.addSeedAsString("http://yourspecificwebsite.com"); - config.setFilterOffsiteRequests(true); + // Enable offsite request filtering + config.setOffsiteRequestFiltering(true); + + // Add a crawl seed; this is where the crawling starts + CrawlRequest request = new CrawlRequestBuilder("http://example.com").build(); + config.addCrawlSeed(request); } @Override - protected void onResponseComplete(HtmlResponse response) { - List links = response.getWebDriver().findElements(By.tagName("a")); - links.stream().forEach((WebElement link) -> crawlUrlAsString(link.getAttribute("href"))); + protected void onResponseComplete(final HtmlResponse response) { + // Crawl every link that can be found on the page + response.getWebDriver().findElements(By.tagName("a")) + .stream() + .forEach((WebElement link) -> { + CrawlRequest request = new CrawlRequestBuilder(link.getAttribute("href")).build(); + crawl(request); + }); } @Override - protected void onNonHtmlResponse(NonHtmlResponse response) { - System.out.println("Received a non-HTML response from: " + response.getCurrentUrl()); + protected void onNonHtmlResponse(final NonHtmlResponse response) { + System.out.println("Received a non-HTML response from: " + response.getCrawlRequest().getRequestUrl()); } - + @Override - protected void onUnsuccessfulRequest(UnsuccessfulRequest request) { - System.out.println("Could not get response from: " + request.getCurrentUrl()); + protected void onUnsuccessfulRequest(final UnsuccessfulRequest request) { + System.out.println("Could not get response from: " + request.getCrawlRequest().getRequestUrl()); } } ``` That's it! In just a few lines you can make a crawler that extracts and crawls every URL it finds, while filtering duplicate and offsite requests. You also get access to the WebDriver, so you can use all the features that are provided by Selenium. -By default, the crawler uses [HtmlUnitDriver](https://github.com/SeleniumHQ/selenium/wiki/HtmlUnitDriver) but you can also set your preferred WebDriver: +By default, the crawler uses the [HtmlUnit headless browser](http://htmlunit.sourceforge.net/): ```java -config.setWebDriver(new ChromeDriver()); +public static void main(String[] args) { + MyCrawler myCrawler = new MyCrawler(); + + // Use HtmlUnit headless browser + myCrawler.start(); +} ``` +Of course, you can also use any other browser by specifying a corresponding WebDriver instance: +```java +public static void main(String[] args) { + MyCrawler myCrawler = new MyCrawler(); -## Support -The developers would like to thank [Precognox](http://precognox.com/) for the support. + // Use Google Chrome + myCrawler.start(new ChromeDriver()); +} +``` ## License The source code of Serritor is made available under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
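The CrawlRequest API introduced in this change also supports a per-request priority and arbitrary Serializable metadata (see CrawlRequest.java further down in this diff). A minimal sketch of how those options could be combined with the example above; the class name, seed URL, priority value, and metadata value are illustrative only:

```java
public class MyPrioritizedCrawler extends BaseCrawler {

    public MyPrioritizedCrawler() {
        // Enable offsite request filtering, as in the example above
        config.setOffsiteRequestFiltering(true);

        // Illustrative seed; a higher priority value means the candidate is
        // preferred over other candidates at the same crawl depth
        CrawlRequest seed = new CrawlRequestBuilder("http://example.com")
                .setPriority(1)
                .setMetadata("landing-page") // any Serializable value can be attached
                .build();
        config.addCrawlSeed(seed);
    }

    @Override
    protected void onResponseComplete(final HtmlResponse response) {
        // The metadata travels with the request and can be read back in the callbacks
        response.getCrawlRequest().getMetadata()
                .ifPresent(metadata -> System.out.println("Metadata: " + metadata));
    }
}
```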
diff --git a/pom.xml b/pom.xml index 63e55ad..ae261e0 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 com.github.peterbencze serritor - 1.1 + 1.2 jar Serritor @@ -61,12 +61,17 @@ org.seleniumhq.selenium selenium-java - 3.0.1 + 3.4.0 org.seleniumhq.selenium htmlunit-driver - 2.23.2 + 2.27 + + + com.google.guava + guava + 22.0 @@ -115,7 +120,7 @@ org.sonatype.plugins nexus-staging-maven-plugin - 1.6.7 + 1.6.8 true ossrh diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 054ebbe..9ca72cc 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Peter Bencze. + * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,20 +15,18 @@ */ package com.github.peterbencze.serritor.api; -import com.google.common.net.InternetDomainName; -import com.github.peterbencze.serritor.internal.CrawlFrontier; -import com.github.peterbencze.serritor.internal.CrawlRequest; -import com.github.peterbencze.serritor.internal.CrawlRequest.CrawlRequestBuilder; -import com.github.peterbencze.serritor.internal.CrawlerConfiguration; +import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; import com.github.peterbencze.serritor.api.HtmlResponse.HtmlResponseBuilder; import com.github.peterbencze.serritor.api.NonHtmlResponse.NonHtmlResponseBuilder; import com.github.peterbencze.serritor.api.UnsuccessfulRequest.UnsuccessfulRequestBuilder; +import com.github.peterbencze.serritor.internal.CrawlCandidate; +import com.github.peterbencze.serritor.internal.CrawlFrontier; +import com.github.peterbencze.serritor.internal.CrawlerConfiguration; import java.io.IOException; import java.io.InputStream; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.OutputStream; -import java.net.MalformedURLException; import java.net.URI; import java.net.URL; import java.util.List; @@ -41,6 +39,7 @@ import org.apache.http.impl.client.HttpClientBuilder; import org.openqa.selenium.TimeoutException; import org.openqa.selenium.WebDriver; +import org.openqa.selenium.htmlunit.HtmlUnitDriver; /** * Provides a skeletal implementation of a crawler to minimize the effort for @@ -50,61 +49,66 @@ */ public abstract class BaseCrawler { - /** - * Allows the application to configure the crawler. - */ + // Allows the application to configure the crawler protected final CrawlerConfiguration config; - private boolean stopCrawling; + // Indicates if the crawler is currently running or not private boolean isStopped; + + // Indicates if the crawling should be stopped (used for cancelling the loop in the run method) + private boolean stopCrawling; + + // Used for sending HTTP HEAD requests and receiving associated responses private HttpClient httpClient; + private WebDriver webDriver; + private CrawlFrontier frontier; - private int currentCrawlDepth; - private URL currentRequestUrl; protected BaseCrawler() { - // Create the default configuration + // Create a default configuration config = new CrawlerConfiguration(); + // Indicate that the crawler is not running isStopped = true; } /** - * Starts the crawler. + * Starts the crawler using HtmlUnit headless browser.
*/ public final void start() { - start(null); + start(new HtmlUnitDriver(true), null); } /** - * Resumes a previously saved state. + * Starts the crawler using the browser specified by the WebDriver instance. * - * @param in The input stream to use - * @throws IOException Any of the usual Input/Output related exceptions. - * @throws ClassNotFoundException Class of a serialized object cannot be - * found. + * @param driver The WebDriver instance that will be used by the crawler */ - public final void resume(InputStream in) throws IOException, ClassNotFoundException { - try (ObjectInputStream objectInputStream = new ObjectInputStream(in)) { - CrawlFrontier frontierToUse = (CrawlFrontier) objectInputStream.readObject(); - start(frontierToUse); - } + public final void start(final WebDriver driver) { + start(driver, null); } /** - * Stops the crawler. + * Constructs all the necessary objects and runs the crawler. + * + * @param frontierToUse Previously saved frontier to be used by the crawler. */ - public final void stop() { - if (isStopped) { - throw new IllegalStateException("The crawler is not started."); + private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { + // Check if the crawler is running + if (!isStopped) { + throw new IllegalStateException("The crawler is already started."); } - if (stopCrawling) { - throw new IllegalStateException("Stop has already been called."); - } + isStopped = false; - stopCrawling = true; + httpClient = HttpClientBuilder.create().build(); + + webDriver = driver; + + frontier = frontierToUse != null ? frontierToUse : new CrawlFrontier(config); + + run(); } /** @@ -113,88 +117,80 @@ public final void stop() { * @param out The output stream to use * @throws IOException Any exception thrown by the underlying OutputStream. */ - public final void saveState(OutputStream out) throws IOException { + public final void saveState(final OutputStream out) throws IOException { + // Check if the crawler has been started, otherwise we have nothing to save if (frontier == null) { throw new IllegalStateException("No state to save."); } + // Save the frontier's current state try (ObjectOutputStream objectOutputStream = new ObjectOutputStream(out)) { objectOutputStream.writeObject(frontier); } } /** - * Appends an URL to the list of URLs that should be crawled. + * Resumes a previously saved state using HtmlUnit headless browser. * - * @param urlToCrawl The URL to be crawled + * @param in The input stream to use + * @throws IOException Any of the usual Input/Output related exceptions. + * @throws ClassNotFoundException Class of a serialized object cannot be + * found. */ - protected final void crawlUrl(URL urlToCrawl) { - try { - String topPrivateDomain = getTopPrivateDomain(urlToCrawl); - - CrawlRequest newCrawlRequest = new CrawlRequestBuilder() - .setRefererUrl(currentRequestUrl) - .setRequestUrl(urlToCrawl) - .setTopPrivateDomain(topPrivateDomain) - .setCrawlDepth(currentCrawlDepth + 1) - .build(); - - frontier.feedRequest(newCrawlRequest); - } catch (IllegalStateException ex) { - throw new IllegalArgumentException(ex); - } + public final void resume(final InputStream in) throws IOException, ClassNotFoundException { + resume(new HtmlUnitDriver(true), in); } /** - * Appends an URL (as String) to the list of URLs that should be crawled. + * Resumes a previously saved state using the browser specified by the + * WebDriver instance. 
* - * @param urlToCrawl The URL to be crawled + * @param driver The WebDriver instance that will be used by the crawler + * @param in The input stream to use + * @throws IOException Any of the usual Input/Output related exceptions. + * @throws ClassNotFoundException Class of a serialized object cannot be + * found. */ - protected final void crawlUrlAsString(String urlToCrawl) { - try { - crawlUrl(new URL(urlToCrawl)); - } catch (MalformedURLException ex) { - throw new IllegalArgumentException(ex); + public final void resume(final WebDriver driver, final InputStream in) throws IOException, ClassNotFoundException { + try (ObjectInputStream objectInputStream = new ObjectInputStream(in)) { + CrawlFrontier frontierToUse = (CrawlFrontier) objectInputStream.readObject(); + start(driver, frontierToUse); } } /** - * Extends the list of URLs that should be crawled with a list of URLs. - * - * @param urlsToCrawl The list of URLs to be crawled + * Stops the crawler. */ - protected final void crawlUrls(List urlsToCrawl) { - urlsToCrawl.stream().forEach(this::crawlUrl); + public final void stop() { + // Check if the crawler is running + if (isStopped) { + throw new IllegalStateException("The crawler is not started."); + } + + if (stopCrawling) { + throw new IllegalStateException("Stop has already been called."); + } + + // Indicate that the crawling should be stopped + stopCrawling = true; } /** - * Extends the list of URLs (as Strings) that should be crawled with a list - * of URLs. + * Passes a crawl request to the crawl frontier. * - * @param urlsToCrawl The list of URLs to be crawled + * @param request The crawl request */ - protected final void crawlUrlsAsStrings(List urlsToCrawl) { - urlsToCrawl.stream().forEach(this::crawlUrlAsString); + protected final void crawl(final CrawlRequest request) { + frontier.feedRequest(request, false); } /** - * Constructs all the necessary objects and runs the crawler. + * Passes multiple crawl requests to the crawl frontier. * - * @param frontierToUse Previously saved frontier to be used by the crawler. + * @param requests The list of crawl requests */ - private void start(CrawlFrontier frontierToUse) { - if (!isStopped) { - throw new IllegalStateException("The crawler is already started."); - } - - isStopped = false; - - httpClient = HttpClientBuilder.create().build(); - webDriver = config.getWebDriver(); - - frontier = frontierToUse != null ? 
frontierToUse : new CrawlFrontier(config); - - run(); + protected final void crawl(final List requests) { + requests.stream().forEach(this::crawl); } /** @@ -204,21 +200,21 @@ private void run() { try { onBegin(); - while (!stopCrawling && frontier.hasNextRequest()) { - CrawlRequest currentRequest = frontier.getNextRequest(); + while (!stopCrawling && frontier.hasNextCandidate()) { + // Get the next crawl candidate from the queue + CrawlCandidate currentCandidate = frontier.getNextCandidate(); - currentRequestUrl = currentRequest.getRequestUrl(); - String currentRequestUrlAsString = currentRequestUrl.toString(); - currentCrawlDepth = currentRequest.getCrawlDepth(); + URL currentCandidateUrl = currentCandidate.getCandidateUrl(); + String currentRequestUrlAsString = currentCandidateUrl.toString(); HttpHeadResponse httpHeadResponse; - URL responseUrl = currentRequestUrl; + URL responseUrl = currentCandidateUrl; try { HttpClientContext context = HttpClientContext.create(); // Send an HTTP HEAD request to the current URL to determine its availability and content type - httpHeadResponse = getHttpHeadResponse(currentRequestUrl, context); + httpHeadResponse = getHttpHeadResponse(currentCandidateUrl, context); // If the request has been redirected, get the final URL List redirectLocations = context.getRedirectLocations(); @@ -226,10 +222,8 @@ private void run() { responseUrl = redirectLocations.get(redirectLocations.size() - 1).toURL(); } } catch (IOException ex) { - UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder() - .setCrawlDepth(currentCrawlDepth) - .setRefererUrl(currentRequest.getRefererUrl()) - .setCurrentUrl(currentRequestUrl) + UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), + currentCandidate.getCrawlRequest()) .setException(ex) .build(); @@ -239,20 +233,14 @@ private void run() { // If the request has been redirected, a new crawl request should be created for the redirected URL if (!responseUrl.toString().equals(currentRequestUrlAsString)) { - CrawlRequest newCrawlRequest = new CrawlRequestBuilder() - .setRefererUrl(currentRequestUrl) - .setRequestUrl(responseUrl) - .setTopPrivateDomain(getTopPrivateDomain(responseUrl)) - .setCrawlDepth(currentCrawlDepth) - .build(); + CrawlRequest redirectedCrawlRequest = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()).build(); + frontier.feedRequest(redirectedCrawlRequest, false); - frontier.feedRequest(newCrawlRequest); continue; } - // Get the content type of the response - String contentType = getContentType(httpHeadResponse); - if (contentType != null && contentType.contains("text/html")) { + // Check if the content of the response is HTML + if (isContentHtml(httpHeadResponse)) { boolean timedOut = false; try { @@ -262,10 +250,8 @@ private void run() { timedOut = true; } - HtmlResponse htmlResponse = new HtmlResponseBuilder() - .setCrawlDepth(currentCrawlDepth) - .setRefererUrl(currentRequest.getRefererUrl()) - .setCurrentUrl(currentRequestUrl) + HtmlResponse htmlResponse = new HtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), + currentCandidate.getCrawlRequest()) .setHttpHeadResponse(httpHeadResponse) .setWebDriver(webDriver) .build(); @@ -279,10 +265,8 @@ private void run() { } else { // URLs that point to non-HTML content should not be opened in the browser - NonHtmlResponse nonHtmlResponse = new NonHtmlResponseBuilder() - .setCrawlDepth(currentCrawlDepth) - 
.setRefererUrl(currentRequest.getRefererUrl()) - .setCurrentUrl(currentRequestUrl) + NonHtmlResponse nonHtmlResponse = new NonHtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), + currentCandidate.getCrawlRequest()) .setHttpHeadResponse(httpHeadResponse) .build(); @@ -304,43 +288,27 @@ private void run() { } } - /** - * Returns the top private domain for the given URL. - * - * @param url The URL to parse - * @return The top private domain - */ - private String getTopPrivateDomain(URL url) { - return InternetDomainName.from(url.getHost()).topPrivateDomain().toString(); - } - /** * Returns a HTTP HEAD response for the given URL. * * @param destinationUrl The URL to crawl * @return The HTTP HEAD response */ - private HttpHeadResponse getHttpHeadResponse(URL destinationUrl, HttpClientContext context) throws IOException { + private HttpHeadResponse getHttpHeadResponse(final URL destinationUrl, final HttpClientContext context) throws IOException { HttpHead headRequest = new HttpHead(destinationUrl.toString()); HttpResponse response = httpClient.execute(headRequest, context); return new HttpHeadResponse(response); } /** - * Returns the content type of the response. + * Indicates if the content of the response is HTML or not. * * @param httpHeadResponse The HTTP HEAD response - * @return The content type of the response + * @return True if the content is HTML, false otherwise */ - private String getContentType(HttpHeadResponse httpHeadResponse) { - String contentType = null; - + private boolean isContentHtml(final HttpHeadResponse httpHeadResponse) { Header contentTypeHeader = httpHeadResponse.getFirstHeader("Content-Type"); - if (contentTypeHeader != null) { - contentType = contentTypeHeader.getValue(); - } - - return contentType; + return contentTypeHeader != null && contentTypeHeader.getValue().contains("text/html"); } /** @@ -354,7 +322,7 @@ protected void onBegin() { * * @param response The HTML response */ - protected void onResponseComplete(HtmlResponse response) { + protected void onResponseComplete(final HtmlResponse response) { } /** @@ -364,7 +332,7 @@ protected void onResponseComplete(HtmlResponse response) { * * @param response The HTML response */ - protected void onResponseTimeout(HtmlResponse response) { + protected void onResponseTimeout(final HtmlResponse response) { } /** @@ -372,7 +340,7 @@ protected void onResponseTimeout(HtmlResponse response) { * * @param response The non-HTML response */ - protected void onNonHtmlResponse(NonHtmlResponse response) { + protected void onNonHtmlResponse(final NonHtmlResponse response) { } /** @@ -381,7 +349,7 @@ protected void onNonHtmlResponse(NonHtmlResponse response) { * * @param request The unsuccessful request */ - protected void onUnsuccessfulRequest(UnsuccessfulRequest request) { + protected void onUnsuccessfulRequest(final UnsuccessfulRequest request) { } /** diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java new file mode 100644 index 0000000..6f8c674 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -0,0 +1,171 @@ +/* + * Copyright 2017 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.api; + +import com.google.common.net.InternetDomainName; +import java.io.Serializable; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Optional; + +/** + * Represents a crawl request that might be processed by the crawler in the + * future. Processing is not guaranteed because the request might be filtered + * out by one of the enabled filters. + * + * @author Krisztian Mozsi + * @author Peter Bencze + */ +public final class CrawlRequest implements Serializable { + + private final URL requestUrl; + private final String topPrivateDomain; + private final int priority; + private final Serializable metadata; + + private CrawlRequest(final CrawlRequestBuilder builder) { + requestUrl = builder.requestUrl; + topPrivateDomain = builder.topPrivateDomain; + priority = builder.priority; + metadata = builder.metadata; + } + + /** + * Returns the request's URL. + * + * @return The URL of the request + */ + public URL getRequestUrl() { + return requestUrl; + } + + /** + * Returns the top private domain of the request's URL. + * + * @return The top private domain of the URL + */ + public String getTopPrivateDomain() { + return topPrivateDomain; + } + + /** + * Returns the request's priority. + * + * @return The priority of the request + */ + public int getPriority() { + return priority; + } + + /** + * Returns metadata associated with the request. + * + * @return The request's metadata + */ + public Optional<Serializable> getMetadata() { + return Optional.ofNullable(metadata); + } + + public static final class CrawlRequestBuilder { + + private final URL requestUrl; + + private String topPrivateDomain; + private int priority; + private Serializable metadata; + + /** + * Constructs a CrawlRequestBuilder instance that can be used to create + * CrawlRequest instances. + * + * @param requestUrl The request's URL given as a URL instance + */ + public CrawlRequestBuilder(final URL requestUrl) { + this.requestUrl = requestUrl; + + // Extract the top private domain from the request URL + try { + topPrivateDomain = InternetDomainName.from(requestUrl.getHost()) + .topPrivateDomain() + .toString(); + } catch (IllegalStateException ex) { + throw new IllegalArgumentException(String.format("The top private domain cannot be extracted from the given request URL (\"%s\").", requestUrl), ex); + } + + // Default priority is 0 + priority = 0; + } + + /** + * Constructs a CrawlRequestBuilder instance that can be used to create + * CrawlRequest instances. + * + * @param requestUrl The request's URL given as a String instance + */ + public CrawlRequestBuilder(final String requestUrl) { + this(getUrlFromString(requestUrl)); + } + + /** + * Sets the request's priority. + * + * @param priority The priority of the request (higher number means + * higher priority) + * @return The builder instance + */ + public CrawlRequestBuilder setPriority(final int priority) { + this.priority = priority; + return this; + } + + /** + * Sets additional metadata for the request which can later be accessed + * when the crawler processes the request.
* + * @param metadata The metadata associated with the request + * @return The builder instance + */ + public CrawlRequestBuilder setMetadata(final Serializable metadata) { + this.metadata = metadata; + return this; + } + + /** + * Builds the configured CrawlRequest instance. + * + * @return The configured CrawlRequest instance + */ + public CrawlRequest build() { + return new CrawlRequest(this); + } + + /** + * Constructs a URL instance based on the specified URL string. Since a + * call to this() must be the first statement in a constructor, this + * helper method is needed to perform the conversion. + * + * @param requestUrl The request URL as String + * @return The request URL + */ + private static URL getUrlFromString(final String requestUrl) { + try { + return new URL(requestUrl); + } catch (MalformedURLException ex) { + throw new IllegalArgumentException(String.format("The given request URL (\"%s\") is malformed.", requestUrl), ex); + } + } + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlingStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlingStrategy.java index 4225947..c4da75a 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlingStrategy.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlingStrategy.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Peter Bencze. + * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java index 1ddfe7f..12a47a2 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Peter Bencze. + * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@ package com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.internal.CallbackParameter; +import java.net.URL; import org.openqa.selenium.WebDriver; /** @@ -28,7 +29,7 @@ public final class HtmlResponse extends CallbackParameter { private final HttpHeadResponse httpHeadResponse; private final WebDriver webDriver; - private HtmlResponse(HtmlResponseBuilder builder) { + private HtmlResponse(final HtmlResponseBuilder builder) { super(builder); httpHeadResponse = builder.httpHeadResponse; @@ -53,17 +54,21 @@ public WebDriver getWebDriver() { return webDriver; } - public static class HtmlResponseBuilder extends CallbackParameterBuilder { + public static final class HtmlResponseBuilder extends CallbackParameterBuilder { private HttpHeadResponse httpHeadResponse; private WebDriver webDriver; - public HtmlResponseBuilder setHttpHeadResponse(HttpHeadResponse httpHeadResponse) { + public HtmlResponseBuilder(URL refererUrl, int crawlDepth, CrawlRequest crawlRequest) { + super(refererUrl, crawlDepth, crawlRequest); + } + + public HtmlResponseBuilder setHttpHeadResponse(final HttpHeadResponse httpHeadResponse) { this.httpHeadResponse = httpHeadResponse; return this; } - public HtmlResponseBuilder setWebDriver(WebDriver webDriver) { + public HtmlResponseBuilder setWebDriver(final WebDriver webDriver) { this.webDriver = webDriver; return this; } diff --git a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java index 593b71d..847b281 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Peter Bencze. + * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,7 +31,7 @@ public final class HttpHeadResponse { private final HttpResponse response; - public HttpHeadResponse(HttpResponse response) { + public HttpHeadResponse(final HttpResponse response) { this.response = response; } @@ -41,7 +41,7 @@ public HttpHeadResponse(HttpResponse response) { * @param name The name of the header * @return True if it is present, false otherwise */ - public boolean containsHeader(String name) { + public boolean containsHeader(final String name) { return response.containsHeader(name); } @@ -60,7 +60,7 @@ public Header[] getAllHeaders() { * @param name The name of the header * @return The first header with the specified name */ - public Header getFirstHeader(String name) { + public Header getFirstHeader(final String name) { return response.getFirstHeader(name); } @@ -70,7 +70,7 @@ public Header getFirstHeader(String name) { * @param name The name of the headers * @return All the headers */ - public Header[] getHeaders(String name) { + public Header[] getHeaders(final String name) { return response.getHeaders(name); } @@ -80,7 +80,7 @@ public Header[] getHeaders(String name) { * @param name The name of the header * @return The last header with a specified name */ - public Header getLastHeader(String name) { + public Header getLastHeader(final String name) { return response.getLastHeader(name); } @@ -108,7 +108,7 @@ public HeaderIterator headerIterator() { * @param name The name of the headers * @return An iterator of the headers with a given name */ - public HeaderIterator headerIterator(String name) { + public HeaderIterator headerIterator(final String name) { return response.headerIterator(name); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java index f67410b..9245beb 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Peter Bencze. + * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ package com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.internal.CallbackParameter; +import java.net.URL; /** * Represents a non-HTML response. 
@@ -26,7 +27,7 @@ public final class NonHtmlResponse extends CallbackParameter { private final HttpHeadResponse httpHeadResponse; - private NonHtmlResponse(NonHtmlResponseBuilder builder) { + private NonHtmlResponse(final NonHtmlResponseBuilder builder) { super(builder); httpHeadResponse = builder.httpHeadResponse; @@ -41,11 +42,15 @@ public HttpHeadResponse getHttpHeadResponse() { return httpHeadResponse; } - public static class NonHtmlResponseBuilder extends CallbackParameterBuilder { + public static final class NonHtmlResponseBuilder extends CallbackParameterBuilder { private HttpHeadResponse httpHeadResponse; - public NonHtmlResponseBuilder setHttpHeadResponse(HttpHeadResponse httpHeadResponse) { + public NonHtmlResponseBuilder(URL refererUrl, int crawlDepth, CrawlRequest crawlRequest) { + super(refererUrl, crawlDepth, crawlRequest); + } + + public NonHtmlResponseBuilder setHttpHeadResponse(final HttpHeadResponse httpHeadResponse) { this.httpHeadResponse = httpHeadResponse; return this; } diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java index 2dee4fc..c545c01 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Peter Bencze. + * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ import com.github.peterbencze.serritor.internal.CallbackParameter; import java.io.IOException; +import java.net.URL; /** * Represents an unsuccessful request. @@ -27,7 +28,7 @@ public final class UnsuccessfulRequest extends CallbackParameter { private final IOException exception; - private UnsuccessfulRequest(UnsuccessfulRequestBuilder builder) { + private UnsuccessfulRequest(final UnsuccessfulRequestBuilder builder) { super(builder); exception = builder.exception; @@ -42,11 +43,15 @@ public IOException getException() { return exception; } - public static class UnsuccessfulRequestBuilder extends CallbackParameterBuilder { + public static final class UnsuccessfulRequestBuilder extends CallbackParameterBuilder { private IOException exception; - public UnsuccessfulRequestBuilder setException(IOException exception) { + public UnsuccessfulRequestBuilder(URL refererUrl, int crawlDepth, CrawlRequest crawlRequest) { + super(refererUrl, crawlDepth, crawlRequest); + } + + public UnsuccessfulRequestBuilder setException(final IOException exception) { this.exception = exception; return this; } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java index 5e961e3..4359aa5 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Peter Bencze. + * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,12 @@ */ package com.github.peterbencze.serritor.internal; +import com.github.peterbencze.serritor.api.CrawlRequest; import java.net.URL; +import java.util.Optional; /** - * The base class that all callback parameters inherit from. 
+ * The base class from which all callback parameters inherit. * * @author Peter Bencze */ @@ -26,12 +28,21 @@ public abstract class CallbackParameter { private final int crawlDepth; private final URL refererUrl; - private final URL currentUrl; + private final CrawlRequest crawlRequest; - protected CallbackParameter(CallbackParameterBuilder builder) { + protected CallbackParameter(final CallbackParameterBuilder builder) { crawlDepth = builder.crawlDepth; refererUrl = builder.refererUrl; - currentUrl = builder.currentUrl; + crawlRequest = builder.crawlRequest; + } + + /** + * Returns the referer URL. + * + * @return The referer URL + */ + public final Optional<URL> getRefererUrl() { + return Optional.ofNullable(refererUrl); } /** @@ -44,42 +55,36 @@ public final int getCrawlDepth() { } /** - * Returns the referer URL. + * Returns the crawl request that was processed by the crawler. * - * @return The referer URL + * @return The processed crawl request */ - public final URL getRefererUrl() { - return refererUrl; + public final CrawlRequest getCrawlRequest() { + return crawlRequest; } /** - * Returns the current URL. + * Returns the request's URL. + * + * @return The request's URL * - * @return The current URL + * @deprecated As of release 1.2, replaced by {@link #getCrawlRequest()} */ + @Deprecated public final URL getCurrentUrl() { - return currentUrl; + return crawlRequest.getRequestUrl(); } public static abstract class CallbackParameterBuilder> { - private int crawlDepth; - private URL refererUrl; - private URL currentUrl; - - public T setCrawlDepth(int crawlDepth) { - this.crawlDepth = crawlDepth; - return (T) this; - } + private final URL refererUrl; + private final int crawlDepth; + private final CrawlRequest crawlRequest; - public T setRefererUrl(URL refererUrl) { + public CallbackParameterBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { this.refererUrl = refererUrl; - return (T) this; - } - - public T setCurrentUrl(URL currentUrl) { - this.currentUrl = currentUrl; - return (T) this; + this.crawlDepth = crawlDepth; + this.crawlRequest = crawlRequest; } public abstract CallbackParameter build(); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java new file mode 100644 index 0000000..8d599ab --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java @@ -0,0 +1,119 @@ +/* + * Copyright 2017 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.internal; + +import com.github.peterbencze.serritor.api.CrawlRequest; +import java.io.Serializable; +import java.net.URL; + +/** + * Represents a candidate for crawling that is guaranteed to be processed by + * the crawler.
+ * + * @author Peter Bencze + */ +public final class CrawlCandidate implements Serializable { + + private final URL refererUrl; + private final int crawlDepth; + private final CrawlRequest crawlRequest; + + public CrawlCandidate(final CrawlCandidateBuilder builder) { + this.crawlRequest = builder.crawlRequest; + this.refererUrl = builder.refererUrl; + this.crawlDepth = builder.crawlDepth; + } + + /** + * Returns the referer's URL. + * + * @return The URL of the referer + */ + public URL getRefererUrl() { + return refererUrl; + } + + /** + * Returns the candidate's URL. + * + * @return The URL of the candidate + */ + public URL getCandidateUrl() { + return crawlRequest.getRequestUrl(); + } + + /** + * Returns the top private domain of the candidate's URL. + * + * @return The top private domain of the URL + */ + public String getTopPrivateDomain() { + return crawlRequest.getTopPrivateDomain(); + } + + /** + * Returns the crawl depth of the candidate. + * + * @return The crawl depth + */ + public int getCrawlDepth() { + return crawlDepth; + } + + /** + * Returns the priority of the candidate. + * + * @return The priority + */ + public int getPriority() { + return crawlRequest.getPriority(); + } + + /** + * Returns the crawl request from which this candidate was constructed. + * + * @return The crawl request + */ + public CrawlRequest getCrawlRequest() { + return crawlRequest; + } + + public static final class CrawlCandidateBuilder { + + private final CrawlRequest crawlRequest; + + private URL refererUrl; + private int crawlDepth; + + public CrawlCandidateBuilder(final CrawlRequest request) { + crawlRequest = request; + } + + public CrawlCandidateBuilder setRefererUrl(final URL refererUrl) { + this.refererUrl = refererUrl; + return this; + } + + public CrawlCandidateBuilder setCrawlDepth(final int crawlDepth) { + this.crawlDepth = crawlDepth; + return this; + } + + public CrawlCandidate build() { + return new CrawlCandidate(this); + } + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 9855cc4..408710e 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Peter Bencze. + * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,15 +15,19 @@ */ package com.github.peterbencze.serritor.internal; -import com.github.peterbencze.serritor.api.CrawlingStrategy; +import com.github.peterbencze.serritor.api.CrawlRequest; +import com.github.peterbencze.serritor.internal.CrawlCandidate.CrawlCandidateBuilder; import java.io.Serializable; import java.net.URL; import java.util.Arrays; +import java.util.Comparator; +import static java.util.Comparator.reverseOrder; import java.util.HashSet; import java.util.List; import java.util.PriorityQueue; import java.util.Queue; import java.util.Set; +import java.util.function.Function; import org.apache.commons.codec.digest.DigestUtils; /** @@ -40,77 +44,93 @@ public final class CrawlFrontier implements Serializable { private final Set allowedDomains; private final Set urlFingerprints; - private final Queue requests; + private final Queue candidates; - public CrawlFrontier(CrawlerConfiguration config) { + private CrawlCandidate currentCandidate; + + public CrawlFrontier(final CrawlerConfiguration config) { this.config = config; allowedDomains = new HashSet<>(); urlFingerprints = new HashSet<>(); - requests = getPriorityQueue(config.getCrawlingStrategy()); + // Construct a priority queue according to the crawling strategy specified in the configuration + candidates = getPriorityQueue(); - config.getSeeds().stream() + // Feed initial crawl requests (seeds) + config.getCrawlSeeds().stream() .forEach((CrawlRequest request) -> { - if (config.getFilterOffsiteRequests()) { - allowedDomains.add(request.getTopPrivateDomain()); - } - - if (config.getFilterDuplicateRequests()) { - String urlFingerprint = getFingerprintForUrl(request.getRequestUrl()); - - if (!urlFingerprints.contains(urlFingerprint)) { - addRequest(request, urlFingerprint); - } - } + feedRequest(request, true); }); } /** - * Method for the crawler to feed requests to the frontier. + * Feeds a crawl request to the frontier. * * @param request The request to be fed + * @param isCrawlSeed True if the request is a crawl seed, false otherwise */ - public void feedRequest(CrawlRequest request) { - String urlFingerprint = getFingerprintForUrl(request.getRequestUrl()); - - if (config.getFilterDuplicateRequests() && urlFingerprints.contains(urlFingerprint)) { - return; + public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { + if (config.isOffsiteRequestFilteringEnabled()) { + if (isCrawlSeed) { + allowedDomains.add(request.getTopPrivateDomain()); + } else { + if (!allowedDomains.contains(request.getTopPrivateDomain())) { + return; + } + } } - if (config.getFilterOffsiteRequests() && !allowedDomains.contains(request.getTopPrivateDomain())) { - return; + if (config.isDuplicateRequestFilteringEnabled()) { + String urlFingerprint = getFingerprintForUrl(request.getRequestUrl()); + + // Check if the URL has already been crawled + if (urlFingerprints.contains(urlFingerprint)) { + return; + } + + // If not, add its fingerprint to the set of URL fingerprints + urlFingerprints.add(urlFingerprint); } - addRequest(request, urlFingerprint); - } + CrawlCandidateBuilder builder; - /** - * Indicates if there are any requests left in the queue. 
- * - * @return True if there are requests in the queue, false otherwise - */ - public boolean hasNextRequest() { - return !requests.isEmpty(); + if (!isCrawlSeed) { + int crawlDepthLimit = config.getMaxCrawlDepth(); + int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1; + + // If a crawl depth limit is set, check if the candidate's crawl depth is less than or equal to the limit + if (crawlDepthLimit != 0 && nextCrawlDepth > crawlDepthLimit) { + return; + } + + builder = new CrawlCandidateBuilder(request).setRefererUrl(currentCandidate.getCandidateUrl()) + .setCrawlDepth(nextCrawlDepth); + } else { + builder = new CrawlCandidateBuilder(request); + } + + // Finally, add constructed candidate to the queue + candidates.add(builder.build()); } /** - * Gets the next request from the queue. + * Indicates if there are any candidates left in the queue. * - * @return The next request + * @return True if there are candidates in the queue, false otherwise */ - public CrawlRequest getNextRequest() { - return requests.poll(); + public boolean hasNextCandidate() { + return !candidates.isEmpty(); } /** - * Adds a request to the queue and stores its fingerprint. + * Gets the next candidate from the queue. * - * @param request The request to be added to the queue + * @return The next candidate */ - private void addRequest(CrawlRequest request, String urlFingerprint) { - urlFingerprints.add(urlFingerprint); - requests.add(request); + public CrawlCandidate getNextCandidate() { + currentCandidate = candidates.poll(); + return currentCandidate; } /** @@ -119,14 +139,18 @@ private void addRequest(CrawlRequest request, String urlFingerprint) { * @param url The URL that the fingerprint will be created for * @return The fingerprint of the URL */ - private String getFingerprintForUrl(URL url) { + private String getFingerprintForUrl(final URL url) { + // First, we start off with the host only StringBuilder truncatedUrl = new StringBuilder(url.getHost()); + // If there is a path in the URL, we append it after the host String path = url.getPath(); if (path != null && !"/".equals(path)) { truncatedUrl.append(path); } + // If there are any query params, we sort and append them to what we got so far + // This is required in order to detect already crawled URLs with different order of query params String query = url.getQuery(); if (query != null) { truncatedUrl.append("?"); @@ -139,24 +163,26 @@ private String getFingerprintForUrl(URL url) { .forEachOrdered(truncatedUrl::append); } + // Finally, create the SHA-256 hash return DigestUtils.sha256Hex(truncatedUrl.toString()); } /** - * Creates a new priority queue using the given strategy related comparator. + * Creates a new priority queue using the specified strategy. 
* - * @param strategy The URL traversal strategy * @return A new PriorityQueue instance for CrawlRequests using the given * comparator */ - private PriorityQueue getPriorityQueue(CrawlingStrategy strategy) { - switch (strategy) { + private PriorityQueue getPriorityQueue() { + switch (config.getCrawlingStrategy()) { case BREADTH_FIRST: - return new PriorityQueue<>(new CrawlRequestComparator()); + return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth) + .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder())); case DEPTH_FIRST: - return new PriorityQueue<>(new CrawlRequestComparator().reversed()); + return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth, reverseOrder()) + .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder())); } - throw new IllegalArgumentException("Not supported crawling strategy."); + throw new IllegalArgumentException("Unsupported crawling strategy."); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlRequest.java deleted file mode 100644 index e6701ad..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlRequest.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright 2016 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.github.peterbencze.serritor.internal; - -import java.io.Serializable; -import java.net.URL; - -/** - * Represents a crawl request that should be processed by the crawler. - * - * @author Krisztian Mozsi - * @author Peter Bencze - */ -public final class CrawlRequest implements Serializable { - - private final URL refererUrl; - private final URL requestUrl; - private final String topPrivateDomain; - private final int crawlDepth; - - private CrawlRequest(CrawlRequestBuilder builder) { - refererUrl = builder.refererUrl; - requestUrl = builder.requestUrl; - topPrivateDomain = builder.topPrivateDomain; - crawlDepth = builder.crawlDepth; - } - - /** - * Returns the referer's URL. - * - * @return The URL of the referer. - */ - public URL getRefererUrl() { - return refererUrl; - } - - /** - * Returns the request's URL. - * - * @return The URL of the request - */ - public URL getRequestUrl() { - return requestUrl; - } - - /** - * Returns the top private domain of the request's URL. - * - * @return The top private domain of the URL - */ - public String getTopPrivateDomain() { - return topPrivateDomain; - } - - /** - * Returns the crawl depth of the request. 
- * - * @return The crawl depth - */ - public int getCrawlDepth() { - return crawlDepth; - } - - public static class CrawlRequestBuilder { - - private URL refererUrl; - private URL requestUrl; - private String topPrivateDomain; - private int crawlDepth; - - public CrawlRequestBuilder setRefererUrl(URL refererUrl) { - this.refererUrl = refererUrl; - return this; - } - - public CrawlRequestBuilder setRequestUrl(URL requestUrl) { - this.requestUrl = requestUrl; - return this; - } - - public CrawlRequestBuilder setTopPrivateDomain(String topPrivateDomain) { - this.topPrivateDomain = topPrivateDomain; - return this; - } - - public CrawlRequestBuilder setCrawlDepth(int crawlDepth) { - this.crawlDepth = crawlDepth; - return this; - } - - public CrawlRequest build() { - return new CrawlRequest(this); - } - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlRequestComparator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlRequestComparator.java deleted file mode 100644 index b809d1c..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlRequestComparator.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2016 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.github.peterbencze.serritor.internal; - -import java.io.Serializable; -import java.util.Comparator; - -/** - * Defines a comparator for CrawlRequests to decide the next URL to crawl. Using - * this implementation, the first element in the ordering is the request with - * the least depth (equals to breadth-first search). Reversing the comparator - * will result in depth-first search. - * - * @author Krisztian Mozsi - */ -public final class CrawlRequestComparator implements Comparator, Serializable { - - @Override - public int compare(CrawlRequest request1, CrawlRequest request2) { - if (request1.getCrawlDepth() < request2.getCrawlDepth()) { - return -1; - } - - if (request1.getCrawlDepth() > request2.getCrawlDepth()) { - return 1; - } - - return 0; - } - -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java index a46a814..c7e065a 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Peter Bencze. + * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,17 +15,12 @@ */ package com.github.peterbencze.serritor.internal; +import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlingStrategy; -import com.google.common.net.InternetDomainName; -import com.github.peterbencze.serritor.internal.CrawlRequest.CrawlRequestBuilder; import java.io.Serializable; -import java.net.MalformedURLException; -import java.net.URL; import java.time.Duration; import java.util.ArrayList; import java.util.List; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.htmlunit.HtmlUnitDriver; /** * Provides an interface to configure the crawler. @@ -35,96 +30,137 @@ */ public final class CrawlerConfiguration implements Serializable { - private final List seeds; + private final List crawlSeeds; - private transient WebDriver webDriver; private CrawlingStrategy crawlingStrategy; private boolean filterDuplicateRequests; private boolean filterOffsiteRequests; private Duration delayBetweenRequests; + private int maxCrawlDepth; public CrawlerConfiguration() { - webDriver = new HtmlUnitDriver(true); - seeds = new ArrayList<>(); + // Default configuration + crawlSeeds = new ArrayList<>(); crawlingStrategy = CrawlingStrategy.BREADTH_FIRST; filterDuplicateRequests = true; delayBetweenRequests = Duration.ZERO; + maxCrawlDepth = 0; } - public WebDriver getWebDriver() { - return webDriver; + /** + * Returns the list of crawl seeds. + * + * @return The list of crawl seeds + */ + public List getCrawlSeeds() { + return crawlSeeds; } - public void setWebDriver(WebDriver webDriver) { - this.webDriver = webDriver; + /** + * Appends a crawl request to the list of crawl seeds. + * + * @param request The crawl request + */ + public void addCrawlSeed(final CrawlRequest request) { + crawlSeeds.add(request); } - public List getSeeds() { - return seeds; - } - - public void addSeed(URL seed) { - try { - String topPrivateDomain = InternetDomainName.from(seed.getHost()) - .topPrivateDomain() - .toString(); - - CrawlRequest newCrawlRequest = new CrawlRequestBuilder() - .setRequestUrl(seed) - .setTopPrivateDomain(topPrivateDomain) - .build(); - - seeds.add(newCrawlRequest); - } catch (IllegalStateException ex) { - throw new IllegalArgumentException(ex); - } - } - - public void addSeedAsString(String seed) { - try { - addSeed(new URL(seed)); - } catch (MalformedURLException ex) { - throw new IllegalArgumentException(ex); - } - } - - public void addSeeds(List seeds) { - seeds.stream().forEach(this::addSeed); - } - - public void addSeedsAsStrings(List seeds) { - seeds.stream().forEach(this::addSeedAsString); + /** + * Appends a list of crawl requests to the list of crawl seeds. + * + * @param requests The list of crawl requests + */ + public void addCrawlSeeds(final List requests) { + crawlSeeds.addAll(requests); } + /** + * Returns the crawling strategy of the crawler. + * + * @return The crawling strategy + */ public CrawlingStrategy getCrawlingStrategy() { return crawlingStrategy; } - public void setCrawlingStrategy(CrawlingStrategy crawlingStrategy) { + /** + * Sets the crawling strategy of the crawler. + * + * @param crawlingStrategy The crawling strategy + */ + public void setCrawlingStrategy(final CrawlingStrategy crawlingStrategy) { this.crawlingStrategy = crawlingStrategy; } - public boolean getFilterDuplicateRequests() { + /** + * Indicates if duplicate request filtering is enabled or not. 
+ * + * @return True if it is enabled, false otherwise + */ + public boolean isDuplicateRequestFilteringEnabled() { return filterDuplicateRequests; } - public void setFilterDuplicateRequests(boolean filterDuplicateRequests) { + /** + * Sets duplicate request filtering. + * + * @param filterDuplicateRequests True means enabled, false means disabled + */ + public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) { this.filterDuplicateRequests = filterDuplicateRequests; } - public boolean getFilterOffsiteRequests() { + /** + * Indicates if offsite request filtering is enabled or not. + * + * @return True if it is enabled, false otherwise + */ + public boolean isOffsiteRequestFilteringEnabled() { return filterOffsiteRequests; } - public void setFilterOffsiteRequests(boolean filterOffsiteRequests) { + /** + * Sets offsite request filtering. + * + * @param filterOffsiteRequests True means enabled, false means disabled + */ + public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { this.filterOffsiteRequests = filterOffsiteRequests; } + /** + * Returns the delay between each request. + * + * @return The delay between each request + */ public Duration getDelayBetweenRequests() { return delayBetweenRequests; } - public void setDelayBetweenRequests(Duration delayBetweenRequests) { + /** + * Sets the delay between each request. + * + * @param delayBetweenRequests The delay between each request + */ + public void setDelayBetweenRequests(final Duration delayBetweenRequests) { this.delayBetweenRequests = delayBetweenRequests; } + + /** + * Returns the maximum possible crawl depth. + * + * @return The maximum crawl depth + */ + public int getMaxCrawlDepth() { + return maxCrawlDepth; + } + + /** + * Sets the maximum possible crawl depth. + * + * @param maxCrawlDepth The maximum crawl depth, zero means no limit + */ + public void setMaxCrawlDepth(int maxCrawlDepth) { + this.maxCrawlDepth = maxCrawlDepth; + } } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index a56a5b3..2c8fb29 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Peter Bencze. + * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,13 +15,14 @@
  */
 package com.github.peterbencze.serritor.internal;
 
-import com.github.peterbencze.serritor.internal.CrawlRequest.CrawlRequestBuilder;
+import com.github.peterbencze.serritor.api.CrawlRequest;
+import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
+import com.github.peterbencze.serritor.api.CrawlingStrategy;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Arrays;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 import org.junit.Before;
 import org.junit.Test;
@@ -32,200 +33,402 @@
  * @author Krisztian Mozsi
  * @author Peter Bencze
  */
-public class CrawlFrontierTest {
+public final class CrawlFrontierTest {
 
-    private static final CrawlRequestBuilder CRAWL_REQUEST_BUILDER = new CrawlRequestBuilder();
+    // Root URLs
+    private static final URL ROOT_URL_0;
+    private static final URL ROOT_URL_1;
 
-    private final static URL ROOT_URL1;
-    private final static CrawlRequest ROOT_URL1_CRAWL_REQUEST;
+    // Root URL crawl depth
+    private static final int ROOT_URL_CRAWL_DEPTH = 0;
 
-    private final static URL ROOT_URL2;
+    // Root URL priorities
+    private static final int ROOT_URL_0_PRIORITY = 0;
+    private static final int ROOT_URL_1_PRIORITY = 1;
 
-    private final static String ROOT_URL_DOMAIN = "root-url.com";
-    private final static String ROOT_URL2_DOMAIN = "root-url2.com";
+    // Root URL crawl requests
+    private static final CrawlRequest ROOT_URL_0_CRAWL_REQUEST;
+    private static final CrawlRequest ROOT_URL_1_CRAWL_REQUEST;
 
-    private final static URL CHILD_URL1;
-    private final static CrawlRequest CHILD_URL1_CRAWL_REQUEST;
+    // Child URLs
+    private static final URL CHILD_URL_0;
+    private static final URL CHILD_URL_1;
+    private static final URL CHILD_URL_2;
 
-    private final static URL CHILD_URL2;
-    private final static CrawlRequest CHILD_URL2_CRAWL_REQUEST;
+    // Child URL crawl depth
+    private static final int CHILD_URL_CRAWL_DEPTH = 1;
 
-    private final static URL CHILD_URL3;
-    private final static CrawlRequest CHILD_URL3_CRAWL_REQUEST;
+    // Child URL priorities
+    private static final int CHILD_URL_0_PRIORITY = 0;
+    private static final int CHILD_URL_1_PRIORITY = CHILD_URL_0_PRIORITY;
+    private static final int CHILD_URL_2_PRIORITY = 1;
 
-    private final static URL OFFSITE_URL;
-    private final static CrawlRequest OFFSITE_URL_CRAWL_REQUEST;
-    private final static String OFFSITE_URL_DOMAIN = "offsite-url.com";
+    // Child URL crawl requests
+    private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST;
+    private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST;
+    private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST;
+
+    // Child URL path
+    private static final String CHILD_URL_PATH = "/child";
 
-    private CrawlerConfiguration config;
-    private CrawlFrontier frontier;
+    // Offsite URL
+    private static final URL OFFSITE_URL;
+
+    // Offsite URL priority
+    private static final int OFFSITE_URL_PRIORITY = 0;
+
+    // Offsite URL crawl request
+    private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST;
+
+    // Max crawl depth
+    private static final int MAX_CRAWL_DEPTH = 1;
 
     static {
         try {
-            ROOT_URL1 = new URL("http://root-url.com");
-            ROOT_URL2 = new URL("http://root-url2.com");
+            // Initialization of root URLs
+            ROOT_URL_0 = new URL("http://root_url_0.com");
+            ROOT_URL_1 = new URL("http://root_url_1.com");
+
+            // Initialization of child URLs
+            CHILD_URL_0 = new URL(String.format("http://root_url_0.com%s_0.html", CHILD_URL_PATH));
+            CHILD_URL_1 = new URL(String.format("http://root_url_0.com%s_1.html", CHILD_URL_PATH));
-            CHILD_URL1 = new URL("http://root-url.com/child1.html");
-            CHILD_URL2 = new URL("http://root-url.com/child2.html");
-            CHILD_URL3 = new URL("http://root-url2.com/child3.html");
+            CHILD_URL_2 = new URL(String.format("http://root_url_1.com%s_0.html", CHILD_URL_PATH));
 
-            OFFSITE_URL = new URL("http://offsite-url.com");
+            // Initialization of the offsite URL
+            OFFSITE_URL = new URL("http://offsite_url.com");
         } catch (MalformedURLException ex) {
             throw new Error(ex);
         }
 
-        ROOT_URL1_CRAWL_REQUEST = CRAWL_REQUEST_BUILDER.setRefererUrl(ROOT_URL1)
-                .setRequestUrl(ROOT_URL1)
-                .setTopPrivateDomain(ROOT_URL_DOMAIN)
-                .setCrawlDepth(1)
+        // Initialize crawl requests
+        ROOT_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_0)
+                .setPriority(ROOT_URL_0_PRIORITY)
                 .build();
 
-        CHILD_URL1_CRAWL_REQUEST = CRAWL_REQUEST_BUILDER.setRefererUrl(ROOT_URL1)
-                .setRequestUrl(CHILD_URL1)
-                .setTopPrivateDomain(ROOT_URL_DOMAIN)
-                .setCrawlDepth(1)
+        ROOT_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_1)
+                .setPriority(ROOT_URL_1_PRIORITY)
                 .build();
 
-        CHILD_URL2_CRAWL_REQUEST = CRAWL_REQUEST_BUILDER.setRefererUrl(ROOT_URL2)
-                .setRequestUrl(CHILD_URL2)
-                .setTopPrivateDomain(ROOT_URL2_DOMAIN)
+        CHILD_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_0)
+                .setPriority(CHILD_URL_0_PRIORITY)
                 .build();
 
-        CHILD_URL3_CRAWL_REQUEST = CRAWL_REQUEST_BUILDER.setRefererUrl(ROOT_URL2)
-                .setRequestUrl(CHILD_URL3)
-                .setTopPrivateDomain(ROOT_URL2_DOMAIN)
-                .setCrawlDepth(2)
+        CHILD_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_1)
+                .setPriority(CHILD_URL_1_PRIORITY)
                 .build();
 
-        OFFSITE_URL_CRAWL_REQUEST = CRAWL_REQUEST_BUILDER.setRefererUrl(ROOT_URL1)
-                .setRequestUrl(OFFSITE_URL)
-                .setTopPrivateDomain(OFFSITE_URL_DOMAIN)
-                .setCrawlDepth(1)
+        CHILD_URL_2_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_2)
+                .setPriority(CHILD_URL_2_PRIORITY)
+                .build();
+
+        OFFSITE_URL_CRAWL_REQUEST = new CrawlRequestBuilder(OFFSITE_URL)
+                .setPriority(OFFSITE_URL_PRIORITY)
                 .build();
     }
 
+    private CrawlerConfiguration config;
+    private CrawlFrontier frontier;
+
     @Before
     public void initialize() {
+        // Create configuration
         config = new CrawlerConfiguration();
-        config.addSeeds(Arrays.asList(ROOT_URL1, ROOT_URL2));
-        config.setFilterOffsiteRequests(true);
+        config.setOffsiteRequestFiltering(true);
+        config.addCrawlSeeds(Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST));
 
+        // Create frontier
         frontier = new CrawlFrontier(config);
     }
 
     @Test
     public void hasNextRequestTest() {
-        assertTrue(frontier.hasNextRequest());
+        // At this point, there are 2 candidates in the queue
+
+        // Check if there are any candidates in the queue, the method should return true
+        assertTrue(frontier.hasNextCandidate());
 
-        frontier.getNextRequest();
-        assertTrue(frontier.hasNextRequest());
+        // Get the next candidate from the queue
+        frontier.getNextCandidate();
 
-        frontier.getNextRequest();
-        assertFalse(frontier.hasNextRequest());
+        // Check if there are any candidates in the queue, the method should return true again
+        assertTrue(frontier.hasNextCandidate());
 
-        frontier.feedRequest(CHILD_URL1_CRAWL_REQUEST);
-        frontier.feedRequest(CHILD_URL2_CRAWL_REQUEST);
-        assertTrue(frontier.hasNextRequest());
+        // Get the next candidate from the queue
+        frontier.getNextCandidate();
 
-        frontier.getNextRequest();
-        assertTrue(frontier.hasNextRequest());
+        // Check if there are any candidates in the queue, the method should return false at this point
+        assertFalse(frontier.hasNextCandidate());
 
-        frontier.getNextRequest();
-        assertFalse(frontier.hasNextRequest());
+        // Feed 2 crawl requests
+        frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
+        frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
+
+        // Check if there are any candidates in the queue, the method should return true
+        assertTrue(frontier.hasNextCandidate());
+
+        // Get the next candidate from the queue
+        frontier.getNextCandidate();
+
+        // Check if there are any candidates in the queue, the method should return true once again
+        assertTrue(frontier.hasNextCandidate());
+
+        // Get the next candidate from the queue
+        frontier.getNextCandidate();
+
+        // Finally, check if there are any candidates in the queue, the method should return false at this point
+        assertFalse(frontier.hasNextCandidate());
     }
 
     @Test
-    public void hasNextRequestEmptyFrontierTest() {
+    public void hasNextRequestEmptyQueueTest() {
+        // Create frontier without any seeds
         frontier = new CrawlFrontier(new CrawlerConfiguration());
-        assertFalse(frontier.hasNextRequest());
+
+        // Check if there are any candidates in the queue, the method should return false
+        assertFalse(frontier.hasNextCandidate());
     }
 
     @Test
-    public void getNextRequestTest() {
-        CrawlRequest currentRequest = frontier.getNextRequest();
-        assertEquals(0, currentRequest.getCrawlDepth());
+    public void getNextRequestWithDuplicateRequestFilterTest() {
+        // Clear the crawl candidate queue of the frontier
+        clearCrawlCandidateQueue();
 
-        // feed child URLs
-        frontier.feedRequest(CHILD_URL1_CRAWL_REQUEST);
-        frontier.feedRequest(CHILD_URL2_CRAWL_REQUEST);
+        // Feed a duplicate crawl request (root URL 0 is a seed)
+        frontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true);
 
-        currentRequest = frontier.getNextRequest();
-        assertEquals(0, currentRequest.getCrawlDepth());
+        // Check if the candidate was added to the queue, the method should return false
+        assertFalse(frontier.hasNextCandidate());
+    }
 
-        currentRequest = frontier.getNextRequest();
-        assertEquals(1, currentRequest.getCrawlDepth());
+    @Test
+    public void getNextRequestWithOffsiteRequestFilterTest() {
+        // Clear the crawl candidate queue of the frontier
+        clearCrawlCandidateQueue();
 
-        frontier.feedRequest(CHILD_URL3_CRAWL_REQUEST);
+        // Feed an offsite request
+        frontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, false);
 
-        currentRequest = frontier.getNextRequest();
-        assertEquals(1, currentRequest.getCrawlDepth());
+        // Check if the candidate was added to the queue, the method should return false
+        assertFalse(frontier.hasNextCandidate());
+    }
 
-        currentRequest = frontier.getNextRequest();
-        assertEquals(CHILD_URL3.toString(), currentRequest.getRequestUrl().toString());
-        assertEquals(2, currentRequest.getCrawlDepth());
-        assertEquals(ROOT_URL2_DOMAIN, currentRequest.getTopPrivateDomain());
+    @Test
+    public void getNextRequestWithoutDuplicateRequestFilterTest() {
+        // Turn off duplicate request filtering
+        config.setDuplicateRequestFiltering(false);
+
+        // Clear the crawl candidate queue of the frontier
+        clearCrawlCandidateQueue();
 
-        currentRequest = frontier.getNextRequest();
-        assertNull(currentRequest);
+        // Feed a duplicate crawl request
+        frontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true);
+
+        // Check if the candidate was added to the queue, the method should return true
+        assertTrue(frontier.hasNextCandidate());
+
+        // Check if the URLs match
+        assertEquals(ROOT_URL_0.toString(), frontier.getNextCandidate().getCandidateUrl().toString());
     }
 
     @Test
-    public void getNextRequestWithDuplicateRequestFilterTest() {
-        CrawlRequest currentRequest = frontier.getNextRequest();
-        assertEquals(0, currentRequest.getCrawlDepth());
-        currentRequest = frontier.getNextRequest();
-        assertEquals(0, currentRequest.getCrawlDepth());
-
-        // feed a duplicate request
-        frontier.feedRequest(ROOT_URL1_CRAWL_REQUEST);
-        currentRequest = frontier.getNextRequest();
-        assertNull(currentRequest);
+    public void getNextRequestWithoutOffsiteRequestFilterTest() {
+        // Turn off offsite request filtering
+        config.setOffsiteRequestFiltering(false);
+
+        // Clear the crawl candidate queue of the frontier
+        clearCrawlCandidateQueue();
+
+        // Feed an offsite request
+        frontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, false);
+
+        // Check if the candidate was added to the queue, the method should return true
+        assertTrue(frontier.hasNextCandidate());
+
+        // Check if the URLs match
+        assertEquals(OFFSITE_URL.toString(), frontier.getNextCandidate().getCandidateUrl().toString());
     }
 
     @Test
-    public void getNextRequestWithOffsiteRequestFilterTest() {
-        CrawlRequest currentRequest = frontier.getNextRequest();
-        assertEquals(0, currentRequest.getCrawlDepth());
-        currentRequest = frontier.getNextRequest();
-        assertEquals(0, currentRequest.getCrawlDepth());
-
-        // feed an offsite request
-        frontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST);
-        currentRequest = frontier.getNextRequest();
-        assertNull(currentRequest);
+    public void getNextRequestBreadthFirstTest() {
+        // Get the crawl candidate of root URL 1.
+        CrawlCandidate nextCandidate = frontier.getNextCandidate();
+
+        // Check the URL of this candidate, it should be root URL 1.
+        assertEquals(ROOT_URL_1.toString(), nextCandidate.getCandidateUrl().toString());
+
+        // Check the crawl depth of this candidate, it should be 0 because it is a root URL.
+        assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+
+        // Check the priority of this candidate, it should be 1.
+        assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority());
+
+        // Feed a child request that comes from root URL 1.
+        frontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false);
+
+        // Get the crawl candidate of root URL 0.
+        nextCandidate = frontier.getNextCandidate();
+
+        // Check the URL of this candidate, it should be root URL 0.
+        assertEquals(ROOT_URL_0.toString(), nextCandidate.getCandidateUrl().toString());
+
+        // Check the crawl depth of this candidate, it should be 0 again because it is also a root URL.
+        assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+
+        // Check the priority of this candidate, it should be 0.
+        assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority());
+
+        // Feed 2 child requests that come from root URL 0.
+        frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
+        frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
+
+        // Get the crawl candidate of child URL 2.
+        nextCandidate = frontier.getNextCandidate();
+
+        // Check the URL of this candidate, it should be child URL 2.
+        assertEquals(CHILD_URL_2.toString(), nextCandidate.getCandidateUrl().toString());
+
+        // Check the crawl depth of this candidate, it should be 1 because it is a child URL that comes from root URL 1.
+        assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+
+        // Check the priority of this candidate, it should be 1.
+        assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority());
+
+        // Get the crawl candidate of a child URL.
+        // Note: a priority queue does not ensure FIFO order when elements have the same depth and priority
+        nextCandidate = frontier.getNextCandidate();
+
+        // Check the URL of this candidate, it should be a child URL.
+        assertTrue(nextCandidate.getCandidateUrl().toString().contains(CHILD_URL_PATH));
+
+        // Check the crawl depth of this candidate, it should be 1 again because it is a child URL that comes from root URL 0.
+        assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+
+        // Get the priority of this candidate
+        int previousChildCandidatePriority = nextCandidate.getPriority();
+
+        // Get the crawl candidate of the next child URL.
+        nextCandidate = frontier.getNextCandidate();
+
+        // Check the URL of this candidate, it should be a child URL.
+        assertTrue(nextCandidate.getCandidateUrl().toString().contains(CHILD_URL_PATH));
+
+        // Check the crawl depth of this candidate, it should be 1 again because it is another child URL that also comes from root URL 0.
+        assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+
+        // Compare the priority of this candidate to the previous candidate's priority.
+        assertEquals(previousChildCandidatePriority, nextCandidate.getPriority());
+
+        // There should be no more candidates left at this point.
+        assertFalse(frontier.hasNextCandidate());
     }
 
     @Test
-    public void getNextRequestWithoutDuplicateRequestFilterTest() {
-        config.setFilterDuplicateRequests(false);
-
-        CrawlRequest currentRequest = frontier.getNextRequest();
-        assertEquals(0, currentRequest.getCrawlDepth());
-        currentRequest = frontier.getNextRequest();
-        assertEquals(0, currentRequest.getCrawlDepth());
-
-        // feed a duplicate request
-        frontier.feedRequest(ROOT_URL1_CRAWL_REQUEST);
-        currentRequest = frontier.getNextRequest();
-        assertEquals(ROOT_URL1.toString(), currentRequest.getRequestUrl().toString());
-        assertEquals(ROOT_URL_DOMAIN, currentRequest.getTopPrivateDomain());
-        assertEquals(1, currentRequest.getCrawlDepth());
-    }
+    public void getNextRequestDepthFirstTest() {
+        // Set the crawling strategy to depth-first
+        config.setCrawlingStrategy(CrawlingStrategy.DEPTH_FIRST);
+        frontier = new CrawlFrontier(config);
+
+        // Get the crawl candidate of root URL 1
+        CrawlCandidate nextCandidate = frontier.getNextCandidate();
+
+        // Check the URL of this candidate, it should be root URL 1
+        assertEquals(ROOT_URL_1.toString(), nextCandidate.getCandidateUrl().toString());
+
+        // Check the crawl depth of this candidate, it should be 0 because it is a root URL
+        assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+
+        // Check the priority of this candidate, it should be 1
+        assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority());
+
+        // Feed a child request that comes from root URL 1
+        frontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false);
+
+        // Get the crawl candidate of a child URL
+        // Note: a priority queue does not ensure FIFO order when elements have the same depth and priority
+        nextCandidate = frontier.getNextCandidate();
+        // Check the URL of this candidate, it should be a child URL
+        assertTrue(nextCandidate.getCandidateUrl().toString().contains(CHILD_URL_PATH));
+
+        // Check the crawl depth of this candidate, it should be 1 because it is a child URL that comes from root URL 1
+        assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+
+        // Check the priority of this candidate, it should be 1
+        assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority());
+
+        // Get the crawl candidate of root URL 0.
+        nextCandidate = frontier.getNextCandidate();
+
+        // Check the URL of this candidate, it should be root URL 0
+        assertEquals(ROOT_URL_0.toString(), nextCandidate.getCandidateUrl().toString());
+
+        // Check the crawl depth of this candidate, it should be 0 again because it is also a root URL
+        assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+
+        // Check the priority of this candidate, it should be 0
+        assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority());
+
+        // Feed 2 child requests that come from root URL 0
+        frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
+        frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
+
+        // Get the crawl candidate of child URL 0
+        nextCandidate = frontier.getNextCandidate();
+
+        // Check the URL of this candidate, it should be child URL 0
+        assertEquals(CHILD_URL_0.toString(), nextCandidate.getCandidateUrl().toString());
+
+        // Check the crawl depth of this candidate, it should be 1 again because it is a child URL that comes from root URL 0
+        assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+
+        // Check the priority of this candidate, it should be 0
+        assertEquals(CHILD_URL_0_PRIORITY, nextCandidate.getPriority());
+
+        // Get the crawl candidate of child URL 1
+        nextCandidate = frontier.getNextCandidate();
+
+        // Check the URL of this candidate, it should be child URL 1
+        assertEquals(CHILD_URL_1.toString(), nextCandidate.getCandidateUrl().toString());
+
+        // Check the crawl depth of this candidate, it should be 1 again because it is a child URL that also comes from root URL 0
+        assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+
+        // Check the priority of this candidate, it should be 0
+        assertEquals(CHILD_URL_1_PRIORITY, nextCandidate.getPriority());
+
+        // There should be no more candidates left at this point
+        assertFalse(frontier.hasNextCandidate());
+    }
+
     @Test
-    public void getNextRequestWithoutOffsiteRequestFilterTest() {
-        config.setFilterOffsiteRequests(false);
-
-        CrawlRequest currentRequest = frontier.getNextRequest();
-        assertEquals(0, currentRequest.getCrawlDepth());
-        currentRequest = frontier.getNextRequest();
-        assertEquals(0, currentRequest.getCrawlDepth());
-
-        // feed an offsite request
-        frontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST);
-        currentRequest = frontier.getNextRequest();
-        assertEquals(OFFSITE_URL.toString(), currentRequest.getRequestUrl().toString());
-        assertEquals(OFFSITE_URL_DOMAIN, currentRequest.getTopPrivateDomain());
-        assertEquals(1, currentRequest.getCrawlDepth());
+    public void maxCrawlDepthTest() {
+        // Set max crawl depth
+        config.setMaxCrawlDepth(MAX_CRAWL_DEPTH);
+
+        // Clear the crawl candidate queue of the frontier
+        clearCrawlCandidateQueue();
+
+        // Feed a child request, its crawl depth will be 1
+        frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
+
+        // Get the crawl candidate of the previously added child URL
+        CrawlCandidate nextCandidate = frontier.getNextCandidate();
+
+        // Check its crawl depth, it should be less than or equal to the limit
+        assertTrue(nextCandidate.getCrawlDepth() <= MAX_CRAWL_DEPTH);
+
+        // Feed another child request, its crawl depth will be 2 which is above the limit
+        frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
+
+        // There should be no more candidates at this point
+        assertFalse(frontier.hasNextCandidate());
+    }
+
+    private void clearCrawlCandidateQueue() {
+        // Loop until there are no remaining candidates in the queue
+        while (frontier.hasNextCandidate()) {
+            frontier.getNextCandidate();
+        }
+    }
 }
diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlRequestComparatorTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlRequestComparatorTest.java
deleted file mode 100644
index a6e6135..0000000
--- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlRequestComparatorTest.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright 2016 Peter Bencze.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.github.peterbencze.serritor.internal;
-
-import com.github.peterbencze.serritor.internal.CrawlRequest.CrawlRequestBuilder;
-import java.util.PriorityQueue;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import org.junit.Test;
-
-/**
- * Test cases for CrawlRequestComparator.
- *
- * @author Krisztian Mozsi
- * @author Peter Bencze
- */
-public class CrawlRequestComparatorTest {
-
-    private static final CrawlRequest.CrawlRequestBuilder CRAWL_REQUEST_BUILDER = new CrawlRequestBuilder();
-
-    private static final CrawlRequest CRAWL_REQUEST_WITH_0_DEPTH;
-    private static final CrawlRequest CRAWL_REQUEST_WITH_1_DEPTH;
-    private static final CrawlRequest CRAWL_REQUEST_WITH_2_DEPTH;
-
-    private PriorityQueue crawlRequests;
-    private CrawlRequestComparator comparator;
-
-    static {
-        CRAWL_REQUEST_WITH_0_DEPTH = CRAWL_REQUEST_BUILDER.setCrawlDepth(0)
-                .build();
-
-        CRAWL_REQUEST_WITH_1_DEPTH = CRAWL_REQUEST_BUILDER.setCrawlDepth(1)
-                .build();
-
-        CRAWL_REQUEST_WITH_2_DEPTH = CRAWL_REQUEST_BUILDER.setCrawlDepth(2)
-                .build();
-    }
-
-    @Test
-    public void breadthFirstCrawlTest() {
-        comparator = new CrawlRequestComparator();
-        crawlRequests = new PriorityQueue<>(comparator);
-        priorityQueueInitialize();
-
-        CrawlRequest currentRequest = crawlRequests.poll();
-        assertEquals(0, currentRequest.getCrawlDepth());
-
-        crawlRequests.add(CRAWL_REQUEST_WITH_1_DEPTH);
-        currentRequest = crawlRequests.poll();
-        assertEquals(0, currentRequest.getCrawlDepth());
-
-        crawlRequests.add(CRAWL_REQUEST_WITH_2_DEPTH);
-        crawlRequests.add(CRAWL_REQUEST_WITH_1_DEPTH);
-        currentRequest = crawlRequests.poll();
-        assertEquals(1, currentRequest.getCrawlDepth());
-
-        currentRequest = crawlRequests.poll();
-        assertEquals(1, currentRequest.getCrawlDepth());
-
-        currentRequest = crawlRequests.poll();
-        assertEquals(2, currentRequest.getCrawlDepth());
-
-        currentRequest = crawlRequests.poll();
-        assertNull(currentRequest);
-    }
-
-    @Test
-    public void depthFirstCrawlTest() {
-        comparator = new CrawlRequestComparator();
-        crawlRequests = new PriorityQueue<>(comparator.reversed());
-        priorityQueueInitialize();
-
-        CrawlRequest currentRequest = crawlRequests.poll();
-        assertEquals(0, currentRequest.getCrawlDepth());
-
-        crawlRequests.add(CRAWL_REQUEST_WITH_1_DEPTH);
-        currentRequest = crawlRequests.poll();
-        assertEquals(1, currentRequest.getCrawlDepth());
-
-        currentRequest = crawlRequests.poll();
-        assertEquals(0, currentRequest.getCrawlDepth());
-
-        crawlRequests.add(CRAWL_REQUEST_WITH_2_DEPTH);
-        crawlRequests.add(CRAWL_REQUEST_WITH_1_DEPTH);
-        currentRequest = crawlRequests.poll();
-        assertEquals(2, currentRequest.getCrawlDepth());
-
-        currentRequest = crawlRequests.poll();
-        assertEquals(1, currentRequest.getCrawlDepth());
-
-        currentRequest = crawlRequests.poll();
-        assertNull(currentRequest);
-    }
-
-    private void priorityQueueInitialize() {
-        crawlRequests.add(CRAWL_REQUEST_WITH_0_DEPTH);
-        crawlRequests.add(CRAWL_REQUEST_WITH_0_DEPTH);
-    }
-}
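The changes above rework seed handling, request filtering, and depth limiting around `CrawlRequest`, `CrawlerConfiguration`, and `CrawlFrontier`. As a quick orientation aid, the snippet below is a minimal, illustrative sketch of how these pieces fit together, mirroring the set-up used in `CrawlFrontierTest`. The class name, seed URL, priority, delay, and depth limit are made-up example values, and `CrawlerConfiguration`/`CrawlFrontier` live in the `internal` package, so application code would normally drive them through a crawler subclass rather than directly.

```java
import com.github.peterbencze.serritor.api.CrawlRequest;
import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
import com.github.peterbencze.serritor.api.CrawlingStrategy;
import com.github.peterbencze.serritor.internal.CrawlCandidate;
import com.github.peterbencze.serritor.internal.CrawlFrontier;
import com.github.peterbencze.serritor.internal.CrawlerConfiguration;
import java.net.URL;
import java.time.Duration;

// Illustrative sketch only; not part of the patch.
public final class FrontierSketch {

    public static void main(final String[] args) throws Exception {
        // Build a crawl seed; the priority value is only meaningful relative to other requests
        CrawlRequest seed = new CrawlRequestBuilder(new URL("http://example.com"))
                .setPriority(1)
                .build();

        // Configure the crawler: seeds, strategy, filtering, politeness delay and depth limit
        CrawlerConfiguration config = new CrawlerConfiguration();
        config.addCrawlSeed(seed);
        config.setCrawlingStrategy(CrawlingStrategy.BREADTH_FIRST);
        config.setDuplicateRequestFiltering(true);
        config.setOffsiteRequestFiltering(true);
        config.setDelayBetweenRequests(Duration.ofSeconds(1));
        config.setMaxCrawlDepth(2); // zero would mean no depth limit

        // The frontier is seeded from the configuration and hands out crawl candidates
        CrawlFrontier frontier = new CrawlFrontier(config);
        while (frontier.hasNextCandidate()) {
            CrawlCandidate candidate = frontier.getNextCandidate();
            System.out.println(candidate.getCandidateUrl() + " (depth " + candidate.getCrawlDepth() + ")");

            // Links extracted from the response would be fed back here, e.g.:
            // frontier.feedRequest(new CrawlRequestBuilder(extractedUrl).build(), false);
        }
    }
}
```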