From c1d66df83b7d4d56f9f54df6a8378ce9c1be308f Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Mon, 19 Feb 2018 18:21:44 +0100
Subject: [PATCH 01/24] Remove unnecessary generic parameters, add missing final modifiers

---
 pom.xml                                                       | 2 +-
 .../com/github/peterbencze/serritor/api/HtmlResponse.java    | 4 ++--
 .../com/github/peterbencze/serritor/api/NonHtmlResponse.java | 4 ++--
 .../github/peterbencze/serritor/api/UnsuccessfulRequest.java | 4 ++--
 .../peterbencze/serritor/internal/CallbackParameter.java     | 2 +-
 .../peterbencze/serritor/internal/CrawlerConfiguration.java  | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pom.xml b/pom.xml
index 2e9cb15..33c497b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>com.github.peterbencze</groupId>
     <artifactId>serritor</artifactId>
-    <version>1.2.1</version>
+    <version>1.2.2</version>
     <packaging>jar</packaging>
     <name>Serritor</name>

diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java
index 12a47a2..4138abb 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java
@@ -54,12 +54,12 @@ public WebDriver getWebDriver() {
         return webDriver;
     }
 
-    public static final class HtmlResponseBuilder extends CallbackParameterBuilder<HtmlResponseBuilder> {
+    public static final class HtmlResponseBuilder extends CallbackParameterBuilder {
 
         private HttpHeadResponse httpHeadResponse;
         private WebDriver webDriver;
 
-        public HtmlResponseBuilder(URL refererUrl, int crawlDepth, CrawlRequest crawlRequest) {
+        public HtmlResponseBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) {
             super(refererUrl, crawlDepth, crawlRequest);
         }

diff --git a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java
index 9245beb..c1f58bf 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java
@@ -42,11 +42,11 @@ public HttpHeadResponse getHttpHeadResponse() {
         return httpHeadResponse;
     }
 
-    public static final class NonHtmlResponseBuilder extends CallbackParameterBuilder<NonHtmlResponseBuilder> {
+    public static final class NonHtmlResponseBuilder extends CallbackParameterBuilder {
 
         private HttpHeadResponse httpHeadResponse;
 
-        public NonHtmlResponseBuilder(URL refererUrl, int crawlDepth, CrawlRequest crawlRequest) {
+        public NonHtmlResponseBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) {
             super(refererUrl, crawlDepth, crawlRequest);
         }

diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
index c545c01..f809a6a 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
@@ -43,11 +43,11 @@ public IOException getException() {
         return exception;
     }
 
-    public static final class UnsuccessfulRequestBuilder extends CallbackParameterBuilder<UnsuccessfulRequestBuilder> {
+    public static final class UnsuccessfulRequestBuilder extends CallbackParameterBuilder {
 
         private IOException exception;
 
-        public UnsuccessfulRequestBuilder(URL refererUrl, int crawlDepth, CrawlRequest crawlRequest) {
+        public UnsuccessfulRequestBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) {
             super(refererUrl, crawlDepth, crawlRequest);
         }

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java
index 1817733..28af583 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java
@@ -63,7 +63,7 @@ public final CrawlRequest getCrawlRequest() {
         return crawlRequest;
     }
 
-    public static abstract class CallbackParameterBuilder<T extends CallbackParameterBuilder<T>> {
+    public static abstract class CallbackParameterBuilder {
 
         private final URL refererUrl;
         private final int crawlDepth;

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index c7e065a..8b1d306 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -160,7 +160,7 @@ public int getMaxCrawlDepth() {
      *
      * @param maxCrawlDepth The maximum crawl depth, zero means no limit
      */
-    public void setMaxCrawlDepth(int maxCrawlDepth) {
+    public void setMaxCrawlDepth(final int maxCrawlDepth) {
         this.maxCrawlDepth = maxCrawlDepth;
     }
 }

From ed28065762a540194375c53e4d392b13d9afb816 Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Mon, 19 Feb 2018 18:32:42 +0100
Subject: [PATCH 02/24] Add initialization of variable

---
 .../peterbencze/serritor/internal/CrawlerConfiguration.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index 8b1d306..09574b4 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -43,6 +43,7 @@ public CrawlerConfiguration() {
         crawlSeeds = new ArrayList<>();
         crawlingStrategy = CrawlingStrategy.BREADTH_FIRST;
         filterDuplicateRequests = true;
+        filterOffsiteRequests = false;
         delayBetweenRequests = Duration.ZERO;
         maxCrawlDepth = 0;
     }
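With patches 01 and 02 applied, every configuration field has an explicit default. As a quick illustration (not part of the patch series), a subclass could override the now-explicit offsite-filtering default from its constructor; the response callbacks a real crawler needs are omitted here:

    public final class MyCrawler extends BaseCrawler {

        public MyCrawler() {
            // filterOffsiteRequests now defaults to false (initialized above);
            // enable it when the crawl should not leave the seed domains.
            config.setOffsiteRequestFiltering(true);
        }
    }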
From cb53c1d355d7ab3e1b1222e0bf925705a51db51f Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Mon, 19 Feb 2018 22:11:28 +0100
Subject: [PATCH 03/24] Refactor: extract configuration defaults into named constants

---
 .../serritor/internal/CrawlerConfiguration.java | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index 09574b4..4e9170f 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -29,6 +29,12 @@
  * @author Peter Bencze
 */
 public final class CrawlerConfiguration implements Serializable {
+
+    private static final CrawlingStrategy DEFAULT_CRAWLING_STRATEGY = CrawlingStrategy.BREADTH_FIRST;
+    private static final boolean FILTER_DUPLICATE_REQUESTS_BY_DEFAULT = true;
+    private static final boolean FILTER_OFFSITE_REQUESTS_BY_DEFAULT = false;
+    private static final Duration DEFAULT_DELAY_BETWEEN_REQUESTS = Duration.ZERO;
+    private static final int DEFAULT_MAX_CRAWL_DEPTH = 0;
 
     private final List<CrawlRequest> crawlSeeds;
 
@@ -41,11 +47,11 @@ public CrawlerConfiguration() {
         // Default configuration
         crawlSeeds = new ArrayList<>();
-        crawlingStrategy = CrawlingStrategy.BREADTH_FIRST;
-        filterDuplicateRequests = true;
-        filterOffsiteRequests = false;
-        delayBetweenRequests = Duration.ZERO;
-        maxCrawlDepth = 0;
+        crawlingStrategy = DEFAULT_CRAWLING_STRATEGY;
+        filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT;
+        filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT;
+        delayBetweenRequests = DEFAULT_DELAY_BETWEEN_REQUESTS;
+        maxCrawlDepth = DEFAULT_MAX_CRAWL_DEPTH;
     }
 
     /**

From 6ac261adeda41525f95d1b1cd21e48eeeaa0de56 Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Tue, 20 Feb 2018 01:59:52 +0100
Subject: [PATCH 04/24] Add license file

---
 LICENSE | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   [The remaining added lines are the standard, unmodified text of the
+   Apache License, Version 2.0, as published at
+   http://www.apache.org/licenses/LICENSE-2.0; they are elided here.]
From 52110b7626ae8419fe09f6d0cfe7c37aa75be234 Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Tue, 20 Feb 2018 18:28:03 +0100
Subject: [PATCH 05/24] Rename CrawlingStrategy to CrawlStrategy

---
 ...awlingStrategy.java => CrawlStrategy.java} |  2 +-
 .../serritor/internal/CrawlFrontier.java      |  6 +++---
 .../internal/CrawlerConfiguration.java        | 24 +++++++++----------
 .../serritor/internal/CrawlFrontierTest.java  |  6 +++---
 4 files changed, 19 insertions(+), 19 deletions(-)
 rename src/main/java/com/github/peterbencze/serritor/api/{CrawlingStrategy.java => CrawlStrategy.java} (96%)

diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlingStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java
similarity index 96%
rename from src/main/java/com/github/peterbencze/serritor/api/CrawlingStrategy.java
rename to src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java
index c4da75a..c88435b 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/CrawlingStrategy.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java
@@ -20,7 +20,7 @@
 *
 * @author Peter Bencze
 */
-public enum CrawlingStrategy {
+public enum CrawlStrategy {
 
     BREADTH_FIRST,
     DEPTH_FIRST

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
index 408710e..ed6c20e 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
@@ -54,7 +54,7 @@ public CrawlFrontier(final CrawlerConfiguration config) {
         allowedDomains = new HashSet<>();
         urlFingerprints = new HashSet<>();
 
-        // Construct a priority queue according to the crawling strategy specified in the configuration
+        // Construct a priority queue according to the crawl strategy specified in the configuration
         candidates = getPriorityQueue();
 
         // Feed initial crawl requests (seeds)
@@ -174,7 +174,7 @@ private String getFingerprintForUrl(final URL url) {
     * comparator
     */
    private PriorityQueue<CrawlCandidate> getPriorityQueue() {
-        switch (config.getCrawlingStrategy()) {
+        switch (config.getCrawlStrategy()) {
            case BREADTH_FIRST:
                return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth)
                        .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder()));
@@ -183,6 +183,6 @@ private PriorityQueue<CrawlCandidate> getPriorityQueue() {
                        .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder()));
        }
 
-        throw new IllegalArgumentException("Unsupported crawling strategy.");
+        throw new IllegalArgumentException("Unsupported crawl strategy.");
    }
 }

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index 4e9170f..b275c9a 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -16,7 +16,7 @@
 package com.github.peterbencze.serritor.internal;
 
 import com.github.peterbencze.serritor.api.CrawlRequest;
-import com.github.peterbencze.serritor.api.CrawlingStrategy;
+import com.github.peterbencze.serritor.api.CrawlStrategy;
 import java.io.Serializable;
 import java.time.Duration;
 import java.util.ArrayList;
@@ -30,7 +30,7 @@
 */
 public final class CrawlerConfiguration implements Serializable {
 
-    private static final CrawlingStrategy DEFAULT_CRAWLING_STRATEGY = CrawlingStrategy.BREADTH_FIRST;
+    private static final CrawlStrategy DEFAULT_CRAWL_STRATEGY = CrawlStrategy.BREADTH_FIRST;
     private static final boolean FILTER_DUPLICATE_REQUESTS_BY_DEFAULT = true;
     private static final boolean FILTER_OFFSITE_REQUESTS_BY_DEFAULT = false;
     private static final Duration DEFAULT_DELAY_BETWEEN_REQUESTS = Duration.ZERO;
@@ -38,7 +38,7 @@ public final class CrawlerConfiguration implements Serializable {
 
     private final List<CrawlRequest> crawlSeeds;
 
-    private CrawlingStrategy crawlingStrategy;
+    private CrawlStrategy crawlStrategy;
     private boolean filterDuplicateRequests;
     private boolean filterOffsiteRequests;
     private Duration delayBetweenRequests;
@@ -47,7 +47,7 @@ public final class CrawlerConfiguration implements Serializable {
     public CrawlerConfiguration() {
         // Default configuration
         crawlSeeds = new ArrayList<>();
-        crawlingStrategy = DEFAULT_CRAWLING_STRATEGY;
+        crawlStrategy = DEFAULT_CRAWL_STRATEGY;
         filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT;
         filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT;
         delayBetweenRequests = DEFAULT_DELAY_BETWEEN_REQUESTS;
@@ -82,21 +82,21 @@ public void addCrawlSeeds(final List<CrawlRequest> requests) {
     }
 
     /**
-     * Returns the crawling strategy of the crawler.
+     * Returns the crawl strategy of the crawler.
     *
-     * @return The crawling strategy
+     * @return The crawl strategy
     */
-    public CrawlingStrategy getCrawlingStrategy() {
-        return crawlingStrategy;
+    public CrawlStrategy getCrawlStrategy() {
+        return crawlStrategy;
    }
 
    /**
-     * Sets the crawling strategy of the crawler.
+     * Sets the crawl strategy of the crawler.
     *
-     * @param crawlingStrategy The crawling strategy
+     * @param crawlStrategy The crawl strategy
     */
-    public void setCrawlingStrategy(final CrawlingStrategy crawlingStrategy) {
-        this.crawlingStrategy = crawlingStrategy;
+    public void setCrawlStrategy(final CrawlStrategy crawlStrategy) {
+        this.crawlStrategy = crawlStrategy;
    }
 
    /**

diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
index 2c8fb29..621da01 100644
--- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
@@ -17,7 +17,7 @@
 
 import com.github.peterbencze.serritor.api.CrawlRequest;
 import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
-import com.github.peterbencze.serritor.api.CrawlingStrategy;
+import com.github.peterbencze.serritor.api.CrawlStrategy;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Arrays;
@@ -325,8 +325,8 @@ public void getNextRequestBreadthFirstTest() {
 
     @Test
     public void getNextRequestDepthFirstTest() {
-        // Set the crawling strategy to depth-first
-        config.setCrawlingStrategy(CrawlingStrategy.DEPTH_FIRST);
+        // Set the crawl strategy to depth-first
+        config.setCrawlStrategy(CrawlStrategy.DEPTH_FIRST);
 
         frontier = new CrawlFrontier(config);
 
         // Get the crawl candidate of root URL 1
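After the rename, selecting a traversal order reads as follows; an illustrative fragment from a BaseCrawler subclass constructor, not part of the series:

    // Depth-first traversal; the default remains BREADTH_FIRST.
    config.setCrawlStrategy(CrawlStrategy.DEPTH_FIRST);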
From a35b8962e9ad4ebbddf19f0bf61bac4514bb456c Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Sun, 25 Feb 2018 02:49:39 +0100
Subject: [PATCH 06/24] Add different crawl delay strategies

---
 .../peterbencze/serritor/api/BaseCrawler.java | 199 ++++++++++--------
 .../serritor/api/CrawlDelayStrategy.java      |  28 +++
 .../serritor/internal/AdaptiveCrawlDelay.java |  77 +++++++
 .../serritor/internal/CrawlDelay.java         |  31 +++
 .../serritor/internal/CrawlDelayFactory.java  |  68 ++++++
 .../internal/CrawlerConfiguration.java        | 130 ++++++++++--
 .../serritor/internal/FixedCrawlDelay.java    |  46 ++++
 .../serritor/internal/RandomCrawlDelay.java   |  52 +++++
 8 files changed, 521 insertions(+), 110 deletions(-)
 create mode 100644 src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java
 create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java
 create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java
 create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java
 create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java
 create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java

diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java
index 4d8bca3..cd48aa1 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java
@@ -20,6 +20,8 @@
 import com.github.peterbencze.serritor.api.NonHtmlResponse.NonHtmlResponseBuilder;
 import com.github.peterbencze.serritor.api.UnsuccessfulRequest.UnsuccessfulRequestBuilder;
 import com.github.peterbencze.serritor.internal.CrawlCandidate;
+import com.github.peterbencze.serritor.internal.CrawlDelay;
+import com.github.peterbencze.serritor.internal.CrawlDelayFactory;
 import com.github.peterbencze.serritor.internal.CrawlFrontier;
 import com.github.peterbencze.serritor.internal.CrawlerConfiguration;
 import java.io.IOException;
@@ -37,6 +39,7 @@
 import org.apache.http.client.methods.HttpHead;
 import org.apache.http.client.protocol.HttpClientContext;
 import org.apache.http.impl.client.HttpClientBuilder;
+import org.openqa.selenium.JavascriptExecutor;
 import org.openqa.selenium.TimeoutException;
 import org.openqa.selenium.WebDriver;
 import org.openqa.selenium.htmlunit.HtmlUnitDriver;
@@ -49,7 +52,7 @@
 */
 public abstract class BaseCrawler {
 
-    //Allows the application to configure the crawler
+    // Allows the application to configure the crawler
     protected final CrawlerConfiguration config;
 
     // Indicates if the crawler is currently running or not
@@ -63,7 +66,10 @@ public abstract class BaseCrawler {
 
     private WebDriver webDriver;
 
-    private CrawlFrontier frontier;
+    private CrawlFrontier crawlFrontier;
+
+    // Specifies which type of crawl delay to use
+    private CrawlDelay crawlDelay;
 
     protected BaseCrawler() {
         // Create a default configuration
@@ -92,23 +98,34 @@ public final void start(final WebDriver driver) {
     /**
     * Constructs all the necessary objects and runs the crawler.
     *
-     * @param frontierToUse Previously saved frontier to be used by the crawler.
+     * @param frontierToUse Crawl frontier to be used by the crawler.
     */
    private void start(final WebDriver driver, final CrawlFrontier frontierToUse) {
-        // Check if the crawler is running
-        if (!isStopped) {
-            throw new IllegalStateException("The crawler is already started.");
-        }
-
-        isStopped = false;
-
-        httpClient = HttpClientBuilder.create().build();
-
-        webDriver = driver;
-
-        frontier = frontierToUse;
-
-        run();
+        try {
+            // Check if the crawler is running
+            if (!isStopped) {
+                throw new IllegalStateException("The crawler is already started.");
+            }
+
+            isStopped = false;
+
+            httpClient = HttpClientBuilder.create().build();
+
+            webDriver = driver;
+
+            crawlFrontier = frontierToUse;
+
+            CrawlDelayFactory crawlDelayFactory = new CrawlDelayFactory(config, (JavascriptExecutor) driver);
+            crawlDelay = crawlDelayFactory.getInstanceOf(config.getCrawlDelayStrategy());
+
+            run();
+        } finally {
+            // Always close the WebDriver
+            webDriver.quit();
+
+            stopCrawling = false;
+            isStopped = true;
+        }
    }
 
    /**
@@ -119,13 +136,13 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) {
     */
    public final void saveState(final OutputStream out) throws IOException {
        // Check if the crawler has been started, otherwise we have nothing to save
-        if (frontier == null) {
+        if (crawlFrontier == null) {
            throw new IllegalStateException("No state to save.");
        }
 
        // Save the frontier's current state
        ObjectOutputStream objectOutputStream = new ObjectOutputStream(out);
-        objectOutputStream.writeObject(frontier);
+        objectOutputStream.writeObject(crawlFrontier);
    }
 
    /**
@@ -153,7 +170,7 @@ public final void resumeState(final InputStream in) throws IOException, ClassNotFoundException {
    public final void resumeState(final WebDriver driver, final InputStream in) throws IOException, ClassNotFoundException {
        ObjectInputStream objectInputStream = new ObjectInputStream(in);
        CrawlFrontier frontierToUse = (CrawlFrontier) objectInputStream.readObject();
-        
+
        start(driver, frontierToUse);
    }
 
@@ -188,7 +205,7 @@ protected final void crawl(final CrawlRequest request) {
            throw new IllegalStateException("The crawler is not started. Maybe you meant to add this request as a crawl seed?");
        }
 
-        frontier.feedRequest(request, false);
+        crawlFrontier.feedRequest(request, false);
    }
 
    /**
@@ -204,95 +221,85 @@ protected final void crawl(final List<CrawlRequest> requests) {
     * Defines the workflow of the crawler.
     */
    private void run() {
-        try {
-            onBegin();
-
-            while (!stopCrawling && frontier.hasNextCandidate()) {
-                // Get the next crawl candidate from the queue
-                CrawlCandidate currentCandidate = frontier.getNextCandidate();
-
-                URL currentCandidateUrl = currentCandidate.getCandidateUrl();
-                String currentRequestUrlAsString = currentCandidateUrl.toString();
-
-                HttpHeadResponse httpHeadResponse;
-                URL responseUrl = currentCandidateUrl;
-
-                try {
-                    HttpClientContext context = HttpClientContext.create();
-
-                    // Send an HTTP HEAD request to the current URL to determine its availability and content type
-                    httpHeadResponse = getHttpHeadResponse(currentCandidateUrl, context);
-
-                    // If the request has been redirected, get the final URL
-                    List redirectLocations = context.getRedirectLocations();
-                    if (redirectLocations != null) {
-                        responseUrl = redirectLocations.get(redirectLocations.size() - 1).toURL();
-                    }
-                } catch (IOException ex) {
-                    UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
-                            currentCandidate.getCrawlRequest())
-                            .setException(ex)
-                            .build();
-
-                    onUnsuccessfulRequest(unsuccessfulRequest);
-                    continue;
-                }
-
-                // If the request has been redirected, a new crawl request should be created for the redirected URL
-                if (!responseUrl.toString().equals(currentRequestUrlAsString)) {
-                    CrawlRequest redirectedCrawlRequest = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()).build();
-                    frontier.feedRequest(redirectedCrawlRequest, false);
-
-                    continue;
-                }
-
-                // Check if the content of the response is HTML
-                if (isContentHtml(httpHeadResponse)) {
-                    boolean timedOut = false;
-
-                    try {
-                        // Open the URL in the browser
-                        webDriver.get(currentRequestUrlAsString);
-                    } catch (TimeoutException ex) {
-                        timedOut = true;
-                    }
-
-                    HtmlResponse htmlResponse = new HtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
-                            currentCandidate.getCrawlRequest())
-                            .setHttpHeadResponse(httpHeadResponse)
-                            .setWebDriver(webDriver)
-                            .build();
-
-                    // Check if the request has timed out
-                    if (!timedOut) {
-                        onResponseComplete(htmlResponse);
-                    } else {
-                        onResponseTimeout(htmlResponse);
-                    }
-                } else {
-                    // URLs that point to non-HTML content should not be opened in the browser
-
-                    NonHtmlResponse nonHtmlResponse = new NonHtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
-                            currentCandidate.getCrawlRequest())
-                            .setHttpHeadResponse(httpHeadResponse)
-                            .build();
-
-                    onNonHtmlResponse(nonHtmlResponse);
-                }
-
-                TimeUnit.MILLISECONDS.sleep(config.getDelayBetweenRequests().toMillis());
-            }
-
-            onFinish();
-        } catch (InterruptedException ex) {
-            Thread.currentThread().interrupt();
-        } finally {
-            // Always close the WebDriver
-            webDriver.quit();
-
-            stopCrawling = false;
-            isStopped = true;
-        }
+        onBegin();
+
+        while (!stopCrawling && crawlFrontier.hasNextCandidate()) {
+            // Get the next crawl candidate from the queue
+            CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate();
+
+            URL currentCandidateUrl = currentCandidate.getCandidateUrl();
+            String currentRequestUrlAsString = currentCandidateUrl.toString();
+
+            HttpHeadResponse httpHeadResponse;
+            URL responseUrl = currentCandidateUrl;
+
+            try {
+                HttpClientContext context = HttpClientContext.create();
+
+                // Send an HTTP HEAD request to the current URL to determine its availability and content type
+                httpHeadResponse = getHttpHeadResponse(currentCandidateUrl, context);
+
+                // If the request has been redirected, get the final URL
+                List redirectLocations = context.getRedirectLocations();
+                if (redirectLocations != null) {
+                    responseUrl = redirectLocations.get(redirectLocations.size() - 1).toURL();
+                }
+            } catch (IOException ex) {
+                UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
+                        currentCandidate.getCrawlRequest())
+                        .setException(ex)
+                        .build();
+
+                onUnsuccessfulRequest(unsuccessfulRequest);
+                continue;
+            }
+
+            // If the request has been redirected, a new crawl request should be created for the redirected URL
+            if (!responseUrl.toString().equals(currentRequestUrlAsString)) {
+                CrawlRequest redirectedCrawlRequest = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()).build();
+                crawlFrontier.feedRequest(redirectedCrawlRequest, false);
+
+                continue;
+            }
+
+            // Check if the content of the response is HTML
+            if (isContentHtml(httpHeadResponse)) {
+                boolean timedOut = false;
+
+                try {
+                    // Open the URL in the browser
+                    webDriver.get(currentRequestUrlAsString);
+                } catch (TimeoutException ex) {
+                    timedOut = true;
+                }
+
+                HtmlResponse htmlResponse = new HtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
+                        currentCandidate.getCrawlRequest())
+                        .setHttpHeadResponse(httpHeadResponse)
+                        .setWebDriver(webDriver)
+                        .build();
+
+                // Check if the request has timed out
+                if (!timedOut) {
+                    onResponseComplete(htmlResponse);
+                } else {
+                    onResponseTimeout(htmlResponse);
+                }
+            } else {
+                // URLs that point to non-HTML content should not be opened in the browser
+
+                NonHtmlResponse nonHtmlResponse = new NonHtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
+                        currentCandidate.getCrawlRequest())
+                        .setHttpHeadResponse(httpHeadResponse)
+                        .build();
+
+                onNonHtmlResponse(nonHtmlResponse);
+            }
+
+            performDelay();
+        }
+
+        onFinish();
    }
 
    /**
@@ -318,6 +325,18 @@ private boolean isContentHtml(final HttpHeadResponse httpHeadResponse) {
        return contentTypeHeader != null && contentTypeHeader.getValue().contains("text/html");
    }
 
+    /**
+     * Delays the next request.
+     */
+    private void performDelay() {
+        try {
+            TimeUnit.MILLISECONDS.sleep(crawlDelay.getDelay());
+        } catch (InterruptedException ex) {
+            Thread.currentThread().interrupt();
+            stopCrawling = true;
+        }
+    }
+
    /**
     * Called when the crawler is about to begin its operation.
     */
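Two behavioral changes above are worth calling out: the WebDriver is now closed in a finally block even when start() throws, and an interrupt during the delay now stops the crawl loop instead of being swallowed. A sketch of how a host application might rely on the latter (illustrative only; MyCrawler stands in for any BaseCrawler subclass):

    Thread crawlerThread = new Thread(() -> new MyCrawler().start());
    crawlerThread.start();

    // Later, from another thread: performDelay() restores the interrupt
    // flag and sets stopCrawling, so the crawler winds down gracefully.
    crawlerThread.interrupt();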
diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java
new file mode 100644
index 0000000..0c10e7b
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.api;
+
+/**
+ * Available crawl delay strategies that can be used by the crawler.
+ *
+ * @author Peter Bencze
+ */
+public enum CrawlDelayStrategy {
+
+    FIXED,
+    ADAPTIVE,
+    RANDOM
+}

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java
new file mode 100644
index 0000000..c3e6b4c
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.internal;
+
+import org.openqa.selenium.JavascriptExecutor;
+
+/**
+ * A type of crawl delay in which the delay corresponds to the page loading
+ * time if it falls within the specified range; otherwise the minimum or
+ * maximum duration is used.
+ *
+ * @author Peter Bencze
+ */
+public final class AdaptiveCrawlDelay implements CrawlDelay {
+
+    private final long minDelayInMillis;
+    private final long maxDelayInMillis;
+    private final JavascriptExecutor javascriptExecutor;
+
+    /**
+     * Constructs a new AdaptiveCrawlDelay instance.
+     *
+     * @param config A CrawlerConfiguration instance which
+     * specifies the minimum and maximum delay.
+     * @param javascriptExecutor A WebDriver instance which is
+     * capable of executing JavaScript.
+     */
+    public AdaptiveCrawlDelay(final CrawlerConfiguration config, final JavascriptExecutor javascriptExecutor) {
+        minDelayInMillis = config.getMinimumCrawlDelayInMillis();
+        maxDelayInMillis = config.getMaximumCrawlDelayInMillis();
+        this.javascriptExecutor = javascriptExecutor;
+    }
+
+    /**
+     * Checks if the browser supports the Navigation Timing API.
+     *
+     * @return true if the browser is compatible, false otherwise
+     */
+    public boolean isBrowserCompatible() {
+        return (boolean) javascriptExecutor.executeScript("return ('performance' in window) && ('timing' in window.performance)");
+    }
+
+    /**
+     * Calculates the page loading time and returns the delay accordingly,
+     * within the specified min-max range. If the calculated delay is smaller
+     * than the minimum, it returns the minimum delay. If the calculated delay
+     * is higher than the maximum, it returns the maximum delay.
+     *
+     * @return The delay in milliseconds
+     */
+    @Override
+    public long getDelay() {
+        long delayInMillis = (long) javascriptExecutor.executeScript("return performance.timing.loadEventEnd - performance.timing.navigationStart;");
+
+        if (delayInMillis < minDelayInMillis) {
+            return minDelayInMillis;
+        } else if (delayInMillis > maxDelayInMillis) {
+            return maxDelayInMillis;
+        }
+
+        return delayInMillis;
+    }
+}

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java
new file mode 100644
index 0000000..652b2e9
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.internal;
+
+/**
+ * An interface that every type of crawl delay should implement.
+ *
+ * @author Peter Bencze
+ */
+public interface CrawlDelay {
+
+    /**
+     * Returns the delay that should pass between each request.
+     *
+     * @return The delay in milliseconds
+     */
+    long getDelay();
+}
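The CrawlDelay contract is deliberately small. For illustration, a hypothetical implementation (not part of the patch) that backs off after every request would satisfy the same interface:

    public final class BackoffCrawlDelay implements CrawlDelay {

        private long delayInMillis = 100;

        @Override
        public long getDelay() {
            // Double the delay on every call, capped at ten seconds.
            delayInMillis = Math.min(delayInMillis * 2, 10_000);
            return delayInMillis;
        }
    }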
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java
new file mode 100644
index 0000000..97e78d1
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.internal;
+
+import com.github.peterbencze.serritor.api.CrawlDelayStrategy;
+import org.openqa.selenium.JavascriptExecutor;
+
+/**
+ * Factory class which is used to construct the required crawl delay instance
+ * specified in the configuration.
+ *
+ * @author Peter Bencze
+ */
+public final class CrawlDelayFactory {
+
+    private final CrawlerConfiguration config;
+    private final JavascriptExecutor javascriptExecutor;
+
+    /**
+     * Constructs a new CrawlDelayFactory instance.
+     *
+     * @param config A CrawlerConfiguration instance which
+     * specifies the minimum and maximum delay.
+     * @param javascriptExecutor A WebDriver instance which is
+     * capable of executing JavaScript.
+     */
+    public CrawlDelayFactory(final CrawlerConfiguration config, final JavascriptExecutor javascriptExecutor) {
+        this.config = config;
+        this.javascriptExecutor = javascriptExecutor;
+    }
+
+    /**
+     * Constructs the specific crawl delay instance determined by the strategy.
+     *
+     * @param crawlDelayStrategy The crawl delay strategy
+     * @return The specific crawl delay instance
+     */
+    public CrawlDelay getInstanceOf(final CrawlDelayStrategy crawlDelayStrategy) {
+        switch (crawlDelayStrategy) {
+            case FIXED:
+                return new FixedCrawlDelay(config);
+            case RANDOM:
+                return new RandomCrawlDelay(config);
+            case ADAPTIVE:
+                AdaptiveCrawlDelay adaptiveCrawlDelay = new AdaptiveCrawlDelay(config, javascriptExecutor);
+                if (!adaptiveCrawlDelay.isBrowserCompatible()) {
+                    throw new UnsupportedOperationException("The Navigation Timing API is not supported by the browser.");
+                }
+
+                return adaptiveCrawlDelay;
+        }
+
+        throw new IllegalArgumentException("Unsupported crawl delay strategy.");
+    }
+}

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index b275c9a..0a3b1e2 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -15,6 +15,7 @@
 */
 package com.github.peterbencze.serritor.internal;
 
+import com.github.peterbencze.serritor.api.CrawlDelayStrategy;
 import com.github.peterbencze.serritor.api.CrawlRequest;
 import com.github.peterbencze.serritor.api.CrawlStrategy;
 import java.io.Serializable;
@@ -29,20 +30,26 @@
 * @author Peter Bencze
 */
 public final class CrawlerConfiguration implements Serializable {
-    
+
     private static final CrawlStrategy DEFAULT_CRAWL_STRATEGY = CrawlStrategy.BREADTH_FIRST;
     private static final boolean FILTER_DUPLICATE_REQUESTS_BY_DEFAULT = true;
     private static final boolean FILTER_OFFSITE_REQUESTS_BY_DEFAULT = false;
-    private static final Duration DEFAULT_DELAY_BETWEEN_REQUESTS = Duration.ZERO;
     private static final int DEFAULT_MAX_CRAWL_DEPTH = 0;
+    private static final CrawlDelayStrategy DEFAULT_CRAWL_DELAY = CrawlDelayStrategy.FIXED;
+    private static final long DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS = Duration.ZERO.toMillis();
+    private static final long DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS = Duration.ofSeconds(1).toMillis();
+    private static final long DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS = Duration.ofMinutes(1).toMillis();
 
     private final List<CrawlRequest> crawlSeeds;
 
     private CrawlStrategy crawlStrategy;
     private boolean filterDuplicateRequests;
     private boolean filterOffsiteRequests;
-    private Duration delayBetweenRequests;
     private int maxCrawlDepth;
+    private CrawlDelayStrategy crawlDelayStrategy;
+    private long fixedCrawlDelayInMillis;
+    private long minCrawlDelayInMillis;
+    private long maxCrawlDelayInMillis;
 
     public CrawlerConfiguration() {
         // Default configuration
@@ -50,8 +57,11 @@ public CrawlerConfiguration() {
         crawlStrategy = DEFAULT_CRAWL_STRATEGY;
         filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT;
         filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT;
-        delayBetweenRequests = DEFAULT_DELAY_BETWEEN_REQUESTS;
         maxCrawlDepth = DEFAULT_MAX_CRAWL_DEPTH;
+        crawlDelayStrategy = DEFAULT_CRAWL_DELAY;
+        fixedCrawlDelayInMillis = DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS;
+        minCrawlDelayInMillis = DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS;
+        maxCrawlDelayInMillis = DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS;
     }
 
     /**
@@ -136,38 +146,118 @@ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) {
     }
 
     /**
-     * Returns the delay between each request.
+     * Returns the maximum possible crawl depth.
     *
-     * @return The delay between each request
+     * @return The maximum crawl depth
     */
-    public Duration getDelayBetweenRequests() {
-        return delayBetweenRequests;
+    public int getMaxCrawlDepth() {
+        return maxCrawlDepth;
    }
 
    /**
-     * Sets the delay between each request.
+     * Sets the maximum possible crawl depth.
     *
-     * @param delayBetweenRequests The delay between each request
+     * @param maxCrawlDepth The maximum crawl depth, zero means no limit
     */
-    public void setDelayBetweenRequests(final Duration delayBetweenRequests) {
-        this.delayBetweenRequests = delayBetweenRequests;
+    public void setMaxCrawlDepth(final int maxCrawlDepth) {
+        this.maxCrawlDepth = maxCrawlDepth;
    }
 
    /**
-     * Returns the maximum possible crawl depth.
+     * Sets the crawl delay strategy to be used by the crawler.
     *
-     * @return The maximum crawl depth
+     * @param crawlDelayStrategy The crawl delay strategy
     */
-    public int getMaxCrawlDepth() {
-        return maxCrawlDepth;
+    public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) {
+        this.crawlDelayStrategy = crawlDelayStrategy;
    }
 
    /**
-     * Sets the maximum possible crawl depth.
+     * Returns the crawl delay strategy used by the crawler.
     *
-     * @param maxCrawlDepth The maximum crawl depth, zero means no limit
+     * @return The crawl delay type
     */
-    public void setMaxCrawlDepth(final int maxCrawlDepth) {
-        this.maxCrawlDepth = maxCrawlDepth;
+    public CrawlDelayStrategy getCrawlDelayStrategy() {
+        return crawlDelayStrategy;
+    }
+
+    /**
+     * Sets the exact duration of delay between each request.
+     *
+     * @param fixedCrawlDelayDuration The duration of delay
+     */
+    public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) {
+        try {
+            fixedCrawlDelayInMillis = fixedCrawlDelayDuration.toMillis();
+        } catch (ArithmeticException ex) {
+            throw new IllegalArgumentException("The duration is too large.");
+        }
+    }
+
+    /**
+     * Returns the exact duration of delay between each request.
+     *
+     * @return The duration of delay in milliseconds
+     */
+    public long getFixedCrawlDelayInMillis() {
+        return fixedCrawlDelayInMillis;
+    }
+
+    /**
+     * Sets the minimum duration of delay between each request.
+     *
+     * @param minCrawlDelayDuration The minimum duration of delay
+     */
+    public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) {
+        if (minCrawlDelayDuration.isNegative()) {
+            throw new IllegalArgumentException("The minimum crawl delay should be positive.");
+        }
+
+        try {
+            long delayInMillis = minCrawlDelayDuration.toMillis();
+            if (delayInMillis >= maxCrawlDelayInMillis) {
+                throw new IllegalArgumentException("The minimum crawl delay should be less than the maximum.");
+            }
+
+            minCrawlDelayInMillis = delayInMillis;
+        } catch (ArithmeticException ex) {
+            throw new IllegalArgumentException("The duration is too large.");
+        }
+    }
+
+    /**
+     * Returns the minimum duration of delay between each request.
+     *
+     * @return The minimum duration of delay in milliseconds
+     */
+    public long getMinimumCrawlDelayInMillis() {
+        return minCrawlDelayInMillis;
+    }
+
+    /**
+     * Sets the maximum duration of delay between each request.
+     *
+     * @param maxCrawlDelayDuration The maximum duration of delay
+     */
+    public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) {
+        try {
+            long delayInMillis = maxCrawlDelayDuration.toMillis();
+            if (delayInMillis <= minCrawlDelayInMillis) {
+                throw new IllegalArgumentException("The maximum crawl delay should be higher than the minimum.");
+            }
+
+            maxCrawlDelayInMillis = delayInMillis;
+        } catch (ArithmeticException ex) {
+            throw new IllegalArgumentException("The duration is too large.");
+        }
+    }
+
+    /**
+     * Returns the maximum duration of delay between each request.
+     *
+     * @return The maximum duration of delay in milliseconds
+     */
+    public long getMaximumCrawlDelayInMillis() {
+        return maxCrawlDelayInMillis;
    }
 }
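Putting the new configuration surface together: a crawler that waits a random one to five seconds between requests would be set up as follows (a sketch using the setters introduced above, from a BaseCrawler subclass constructor; java.time.Duration is assumed to be imported). Note that choosing ADAPTIVE instead makes the factory throw at start-up if the browser does not expose the Navigation Timing API.

    config.setCrawlDelayStrategy(CrawlDelayStrategy.RANDOM);
    config.setMinimumCrawlDelayDuration(Duration.ofSeconds(1));
    config.setMaximumCrawlDelayDuration(Duration.ofSeconds(5));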
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java
new file mode 100644
index 0000000..cea20d0
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.internal;
+
+/**
+ * A type of crawl delay in which the delay is constant and equal to the
+ * duration specified in the configuration.
+ *
+ * @author Peter Bencze
+ */
+public final class FixedCrawlDelay implements CrawlDelay {
+
+    private final long delayInMillis;
+
+    /**
+     * Constructs a new FixedCrawlDelay instance.
+     *
+     * @param config A CrawlerConfiguration instance which specifies the fixed delay
+     */
+    public FixedCrawlDelay(final CrawlerConfiguration config) {
+        delayInMillis = config.getFixedCrawlDelayInMillis();
+    }
+
+    /**
+     * Returns the fixed delay specified in the configuration.
+     *
+     * @return The delay in milliseconds
+     */
+    @Override
+    public long getDelay() {
+        return delayInMillis;
+    }
+}
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java
new file mode 100644
index 0000000..3bc9871
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.internal;
+
+import java.util.concurrent.ThreadLocalRandom;
+
+/**
+ * A type of crawl delay in which the duration is randomized within the
+ * specified minimum-maximum range.
+ *
+ * @author Peter Bencze
+ */
+public final class RandomCrawlDelay implements CrawlDelay {
+
+    private final long origin;
+    private final long bound;
+
+    /**
+     * Constructs a new RandomCrawlDelay instance.
+     *
+     * @param config A CrawlerConfiguration instance which
+     * specifies the minimum and maximum delay.
+     */
+    public RandomCrawlDelay(final CrawlerConfiguration config) {
+        origin = config.getMinimumCrawlDelayInMillis();
+        bound = config.getMaximumCrawlDelayInMillis() + 1;
+    }
+
+    /**
+     * Returns a random delay within the minimum and maximum range specified
+     * in the configuration.
+     *
+     * @return The delay in milliseconds
+     */
+    @Override
+    public long getDelay() {
+        return ThreadLocalRandom.current().nextLong(origin, bound);
+    }
+}

From cc24efcba519db2658df10d28be56d41a9f0f61a Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Sun, 25 Feb 2018 02:51:58 +0100
Subject: [PATCH 07/24] Rename crawl depth getter and setter

---
 .../github/peterbencze/serritor/internal/CrawlFrontier.java | 2 +-
 .../peterbencze/serritor/internal/CrawlerConfiguration.java | 4 ++--
 .../peterbencze/serritor/internal/CrawlFrontierTest.java    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
index ed6c20e..2d58e03 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
@@ -96,7 +96,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) {
         CrawlCandidateBuilder builder;
 
         if (!isCrawlSeed) {
-            int crawlDepthLimit = config.getMaxCrawlDepth();
+            int crawlDepthLimit = config.getMaximumCrawlDepth();
             int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1;
 
             // If a crawl depth limit is set, check if the candidate's crawl depth is less than or equal to the limit

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index 0a3b1e2..2dfb8fc 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -150,7 +150,7 @@ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) {
     *
     * @return The maximum crawl depth
     */
-    public int getMaxCrawlDepth() {
+    public int getMaximumCrawlDepth() {
        return maxCrawlDepth;
    }
 
@@ -159,7 +159,7 @@ public int getMaximumCrawlDepth() {
     *
     * @param maxCrawlDepth The maximum crawl depth, zero means no limit
     */
-    public void setMaxCrawlDepth(final int maxCrawlDepth) {
+    public void setMaximumCrawlDepth(final int maxCrawlDepth) {
        this.maxCrawlDepth = maxCrawlDepth;
    }

diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
index 621da01..df7788c 100644
--- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
@@ -404,7 +404,7 @@ public void getNextRequestDepthFirstTest() {
     @Test
     public void maxCrawlDepthTest() {
         // Set max crawl depth
-        config.setMaxCrawlDepth(MAX_CRAWL_DEPTH);
+        config.setMaximumCrawlDepth(MAX_CRAWL_DEPTH);
 
         // Clear the crawl candidate queue of the frontier
         clearCrawlCandidateQueue();

From b53b2698b259ced68a4a4fdd7a0e8eb01ca831ba Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Sun, 25 Feb 2018 03:06:23 +0100
Subject: [PATCH 08/24] Refactor: extract the default priority into a named constant

---
 .../com/github/peterbencze/serritor/api/CrawlRequest.java | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
index 6f8c674..26ff078 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
@@ -80,6 +80,8 @@ public Optional getMetadata() {
     }
 
     public static final class CrawlRequestBuilder {
+
+        private static final int DEFAULT_PRIORITY = 0;
 
         private final URL requestUrl;
 
@@ -105,8 +107,8 @@ public CrawlRequestBuilder(final URL requestUrl) {
             throw new IllegalArgumentException(String.format("The top private domain cannot be extracted from the given request URL (\"%s\").", requestUrl), ex);
         }
 
-        // Default priority is 0
-        priority = 0;
+        // Set default priority
+        priority = DEFAULT_PRIORITY;
     }
running isStopped = true; @@ -92,7 +93,7 @@ public final void start() { * @param driver The WebDriver instance that will be used by the crawler */ public final void start(final WebDriver driver) { - start(driver, new CrawlFrontier(config)); + start(driver, new CrawlFrontier(configuration)); } /** @@ -115,8 +116,8 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { crawlFrontier = frontierToUse; - CrawlDelayFactory crawlDelayFactory = new CrawlDelayFactory(config, (JavascriptExecutor) driver); - crawlDelay = crawlDelayFactory.getInstanceOf(config.getCrawlDelayStrategy()); + CrawlDelayFactory crawlDelayFactory = new CrawlDelayFactory(configuration, (JavascriptExecutor) driver); + crawlDelay = crawlDelayFactory.getInstanceOf(configuration.getCrawlDelayStrategy()); run(); } finally { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java index c3e6b4c..9fd9e9a 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java @@ -39,8 +39,8 @@ public final class AdaptiveCrawlDelay implements CrawlDelay { * capable of executing JavaScript. */ public AdaptiveCrawlDelay(final CrawlerConfiguration config, final JavascriptExecutor javascriptExecutor) { - minDelayInMillis = config.getMinimumCrawlDelayInMillis(); - maxDelayInMillis = config.getMaximumCrawlDelayInMillis(); + minDelayInMillis = config.getMinimumCrawlDelayDurationInMillis(); + maxDelayInMillis = config.getMaximumCrawlDelayDurationInMillis(); this.javascriptExecutor = javascriptExecutor; } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java index 2dfb8fc..1fa666f 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java @@ -24,9 +24,8 @@ import java.util.List; /** - * Provides an interface to configure the crawler. + * This class contains the settings of the crawler. 
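// Illustrative sketch, not part of the patch: with this change a subclass no
// longer touches CrawlerConfiguration directly; it goes through the protected
// configurator field instead. Any callback overrides are omitted for brevity.
public final class MyCrawler extends BaseCrawler {

    public MyCrawler() {
        configurator.addCrawlSeed(new CrawlRequestBuilder("http://example.com").build());
    }
}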
* - * @author Krisztian Mozsi * @author Peter Bencze */ public final class CrawlerConfiguration implements Serializable { @@ -47,21 +46,22 @@ public final class CrawlerConfiguration implements Serializable { private boolean filterOffsiteRequests; private int maxCrawlDepth; private CrawlDelayStrategy crawlDelayStrategy; - private long fixedCrawlDelayInMillis; - private long minCrawlDelayInMillis; - private long maxCrawlDelayInMillis; + private long fixedCrawlDelayDurationInMillis; + private long minCrawlDelayDurationInMillis; + private long maxCrawlDelayDurationInMillis; public CrawlerConfiguration() { - // Default configuration + // Initialize configuration with default values + crawlSeeds = new ArrayList<>(); crawlStrategy = DEFAULT_CRAWL_STRATEGY; filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT; filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT; maxCrawlDepth = DEFAULT_MAX_CRAWL_DEPTH; crawlDelayStrategy = DEFAULT_CRAWL_DELAY; - fixedCrawlDelayInMillis = DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS; - minCrawlDelayInMillis = DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS; - maxCrawlDelayInMillis = DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS; + fixedCrawlDelayDurationInMillis = DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS; + minCrawlDelayDurationInMillis = DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS; + maxCrawlDelayDurationInMillis = DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS; } /** @@ -76,21 +76,13 @@ public List getCrawlSeeds() { /** * Appends a crawl request to the list of crawl seeds. * - * @param request The crawl request + * @param request The CrawlRequest instance which represents + * the crawl seed */ public void addCrawlSeed(final CrawlRequest request) { crawlSeeds.add(request); } - /** - * Appends a list of crawl requests to the list of crawl seeds. - * - * @param requests The list of crawl requests - */ - public void addCrawlSeeds(final List requests) { - crawlSeeds.addAll(requests); - } - /** * Returns the crawl strategy of the crawler. * @@ -101,7 +93,9 @@ public CrawlStrategy getCrawlStrategy() { } /** - * Sets the crawl strategy of the crawler. + * Sets the crawl strategy to be used by the crawler. Breadth-first strategy + * orders crawl requests by the lowest crawl depth, whereas depth-first + * orders them by the highest crawl depth. * * @param crawlStrategy The crawl strategy */ @@ -112,16 +106,17 @@ public void setCrawlStrategy(final CrawlStrategy crawlStrategy) { /** * Indicates if duplicate request filtering is enabled or not. * - * @return True if it is enabled, false otherwise + * @return true if enabled, false otherwise */ public boolean isDuplicateRequestFilteringEnabled() { return filterDuplicateRequests; } /** - * Sets duplicate request filtering. + * Enables or disables duplicate request filtering. * - * @param filterDuplicateRequests True means enabled, false means disabled + * @param filterDuplicateRequests true means enabled, + * false means disabled */ public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) { this.filterDuplicateRequests = filterDuplicateRequests; @@ -130,16 +125,17 @@ public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) /** * Indicates if offsite request filtering is enabled or not. * - * @return True if it is enabled, false otherwise + * @return true if enabled, false otherwise */ public boolean isOffsiteRequestFilteringEnabled() { return filterOffsiteRequests; } /** - * Sets offsite request filtering. + * Enables or disables offsite request filtering. 
* - * @param filterOffsiteRequests True means enabled, false means disabled + * @param filterOffsiteRequests true means enabled, + * false means disabled */ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { this.filterOffsiteRequests = filterOffsiteRequests; @@ -155,9 +151,10 @@ public int getMaximumCrawlDepth() { } /** - * Sets the maximum possible crawl depth. + * Sets the maximum possible crawl depth. It should be a non-negative number + * where 0 means there is no limit. * - * @param maxCrawlDepth The maximum crawl depth, zero means no limit + * @param maxCrawlDepth The maximum crawl depth */ public void setMaximumCrawlDepth(final int maxCrawlDepth) { this.maxCrawlDepth = maxCrawlDepth; @@ -175,7 +172,7 @@ public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { /** * Returns the crawl delay strategy used by the crawler. * - * @return The crawl delay type + * @return The crawl delay strategy */ public CrawlDelayStrategy getCrawlDelayStrategy() { return crawlDelayStrategy; @@ -184,14 +181,11 @@ public CrawlDelayStrategy getCrawlDelayStrategy() { /** * Sets the exact duration of delay between each request. * - * @param fixedCrawlDelayDuration The duration of delay + * @param fixedCrawlDelayDurationInMillis The duration of delay in + * milliseconds */ - public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { - try { - fixedCrawlDelayInMillis = fixedCrawlDelayDuration.toMillis(); - } catch (ArithmeticException ex) { - throw new IllegalArgumentException("The duration is too large."); - } + public void setFixedCrawlDelayDurationInMillis(final long fixedCrawlDelayDurationInMillis) { + this.fixedCrawlDelayDurationInMillis = fixedCrawlDelayDurationInMillis; } /** @@ -199,30 +193,18 @@ public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { * * @return The duration of delay in milliseconds */ - public long getFixedCrawlDelayInMillis() { - return fixedCrawlDelayInMillis; + public long getFixedCrawlDelayDurationInMillis() { + return fixedCrawlDelayDurationInMillis; } /** * Sets the minimum duration of delay between each request. * - * @param minCrawlDelayDuration The minimum duration of delay + * @param minCrawlDelayDurationInMillis The minimum duration of delay in + * milliseconds */ - public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { - if (minCrawlDelayDuration.isNegative()) { - throw new IllegalArgumentException("The minimum crawl delay should be positive."); - } - - try { - long delayInMillis = minCrawlDelayDuration.toMillis(); - if (delayInMillis >= maxCrawlDelayInMillis) { - throw new IllegalArgumentException("The minimum crawl delay should be less than the maximum."); - } - - minCrawlDelayInMillis = delayInMillis; - } catch (ArithmeticException ex) { - throw new IllegalArgumentException("The duration is too large."); - } + public void setMinimumCrawlDelayDurationInMillis(final long minCrawlDelayDurationInMillis) { + this.minCrawlDelayDurationInMillis = minCrawlDelayDurationInMillis; } /** @@ -230,26 +212,18 @@ public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { * * @return The minimum duration of delay in milliseconds */ - public long getMinimumCrawlDelayInMillis() { - return minCrawlDelayInMillis; + public long getMinimumCrawlDelayDurationInMillis() { + return minCrawlDelayDurationInMillis; } /** * Sets the maximum duration of delay between each request. 
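// Illustrative sketch, not part of the patch: after this refactor the
// configuration is a plain holder of millisecond values; the range checks that
// previously lived here move into the CrawlerConfigurator added below.
CrawlerConfiguration configuration = new CrawlerConfiguration();
configuration.setFixedCrawlDelayDurationInMillis(Duration.ofSeconds(2).toMillis());
configuration.setMinimumCrawlDelayDurationInMillis(500L);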
* - * @param maxCrawlDelayDuration The maximum duration of delay + * @param maxCrawlDelayDurationInMillis The maximum duration of delay in + * milliseconds */ - public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { - try { - long delayInMillis = maxCrawlDelayDuration.toMillis(); - if (delayInMillis <= minCrawlDelayInMillis) { - throw new IllegalArgumentException("The maximum crawl delay should be higher than the minimum."); - } - - maxCrawlDelayInMillis = delayInMillis; - } catch (ArithmeticException ex) { - throw new IllegalArgumentException("The duration is too large."); - } + public void setMaximumCrawlDelayDuration(final long maxCrawlDelayDurationInMillis) { + this.maxCrawlDelayDurationInMillis = maxCrawlDelayDurationInMillis; } /** @@ -257,7 +231,7 @@ public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { * * @return The maximum duration of delay in milliseconds */ - public long getMaximumCrawlDelayInMillis() { - return maxCrawlDelayInMillis; + public long getMaximumCrawlDelayDurationInMillis() { + return maxCrawlDelayDurationInMillis; } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java new file mode 100644 index 0000000..378f591 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java @@ -0,0 +1,148 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.internal; + +import com.github.peterbencze.serritor.api.CrawlDelayStrategy; +import com.github.peterbencze.serritor.api.CrawlRequest; +import com.github.peterbencze.serritor.api.CrawlStrategy; +import com.google.common.base.Preconditions; +import java.time.Duration; +import java.util.List; + +/** + * This class provides an interface for the user to configure the crawler. + * + * @author Peter Bencze + */ +public class CrawlerConfigurator { + + private final CrawlerConfiguration config; + + public CrawlerConfigurator(CrawlerConfiguration config) { + this.config = config; + } + + /** + * Appends a crawl request to the list of crawl seeds. + * + * @param request The CrawlRequest instance which represents + * the crawl seed + */ + public void addCrawlSeed(final CrawlRequest request) { + config.addCrawlSeed(Preconditions.checkNotNull(request)); + } + + /** + * Appends a list of crawl requests to the list of crawl seeds. + * + * @param requests The list of CrawlRequest instances which + * represent the crawl seeds + */ + public void addCrawlSeeds(final List requests) { + requests.forEach(this::addCrawlSeed); + } + + /** + * Sets the crawl strategy to be used by the crawler. Breadth-first strategy + * orders crawl requests by the lowest crawl depth, whereas depth-first + * orders them by the highest crawl depth. 
+ * + * @param crawlStrategy The crawl strategy + */ + public void setCrawlStrategy(final CrawlStrategy crawlStrategy) { + config.setCrawlStrategy(Preconditions.checkNotNull(crawlStrategy)); + } + + /** + * Enables or disables duplicate request filtering. + * + * @param filterDuplicateRequests true means enabled, + * false means disabled + */ + public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) { + config.setDuplicateRequestFiltering(filterDuplicateRequests); + } + + /** + * Enables or disables offsite request filtering. + * + * @param filterOffsiteRequests true means enabled, + * false means disabled + */ + public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { + config.setOffsiteRequestFiltering(filterOffsiteRequests); + } + + /** + * Sets the maximum possible crawl depth. It should be a non-negative number + * where 0 means there is no limit. + * + * @param maxCrawlDepth The maximum crawl depth + */ + public void setMaximumCrawlDepth(final int maxCrawlDepth) { + Preconditions.checkArgument(maxCrawlDepth >= 0, "The maximum crawl depth cannot be negative."); + + config.setMaximumCrawlDepth(maxCrawlDepth); + } + + /** + * Sets the crawl delay strategy to be used by the crawler. + * + * @param crawlDelayStrategy The crawl delay strategy + */ + public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { + config.setCrawlDelayStrategy(Preconditions.checkNotNull(crawlDelayStrategy)); + } + + /** + * Sets the exact duration of delay between each request. + * + * @param fixedCrawlDelayDuration The duration of delay + */ + public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { + config.setFixedCrawlDelayDurationInMillis(fixedCrawlDelayDuration.toMillis()); + } + + /** + * Sets the minimum duration of delay between each request. + * + * @param minCrawlDelayDuration The minimum duration of delay + */ + public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { + Preconditions.checkArgument(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative."); + + long minCrawlDelayDurationInMillis = minCrawlDelayDuration.toMillis(); + long maxCrawlDelayInMillis = config.getMaximumCrawlDelayDurationInMillis(); + + Preconditions.checkArgument(minCrawlDelayDurationInMillis < maxCrawlDelayInMillis, "The minimum crawl delay should be less than the maximum."); + + config.setMinimumCrawlDelayDurationInMillis(minCrawlDelayDurationInMillis); + } + + /** + * Sets the maximum duration of delay between each request. 
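// Illustrative sketch, not part of the patch: the preconditions in these
// setters validate each bound against the currently stored value of the other,
// so when raising both bounds, set the maximum before the minimum.
configurator.setMaximumCrawlDelayDuration(Duration.ofSeconds(10));
configurator.setMinimumCrawlDelayDuration(Duration.ofSeconds(5));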
+ * + * @param maxCrawlDelayDuration The maximum duration of delay + */ + public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { + long minCrawlDelayDurationInMillis = config.getMinimumCrawlDelayDurationInMillis(); + long maxCrawlDelayDurationInMillis = maxCrawlDelayDuration.toMillis(); + + Preconditions.checkArgument(maxCrawlDelayDurationInMillis > minCrawlDelayDurationInMillis, "The maximum crawl delay should be higher than the minimum."); + + config.setMaximumCrawlDelayDuration(maxCrawlDelayDurationInMillis); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java index cea20d0..3eb0f87 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java @@ -31,7 +31,7 @@ public final class FixedCrawlDelay implements CrawlDelay { * @param config A CrawlerConfiguration instance which specifies the fixed delay */ public FixedCrawlDelay(final CrawlerConfiguration config) { - delayInMillis = config.getFixedCrawlDelayInMillis(); + delayInMillis = config.getFixedCrawlDelayDurationInMillis(); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java index 3bc9871..6c16073 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java @@ -35,8 +35,8 @@ public final class RandomCrawlDelay implements CrawlDelay { * specifies the minimum and maximum delay. */ public RandomCrawlDelay(final CrawlerConfiguration config) { - origin = config.getMinimumCrawlDelayInMillis(); - bound = config.getMaximumCrawlDelayInMillis() + 1; + origin = config.getMinimumCrawlDelayDurationInMillis(); + bound = config.getMaximumCrawlDelayDurationInMillis() + 1; } /** diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index df7788c..6cab603 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -28,7 +28,7 @@ import org.junit.Test; /** - * Test cases for CrawlFrontier. + * Test cases for CrawlFrontier. 
* * @author Krisztian Mozsi * @author Peter Bencze @@ -67,7 +67,7 @@ public final class CrawlFrontierTest { private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST; private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST; private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST; - + // Child URL path private static final String CHILD_URL_PATH = "/child"; @@ -79,7 +79,7 @@ public final class CrawlFrontierTest { // Offsite URL crawl request private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST; - + // Max crawl depth private static final int MAX_CRAWL_DEPTH = 1; @@ -132,10 +132,11 @@ public final class CrawlFrontierTest { @Before public void initialize() { - // Create configuration config = new CrawlerConfiguration(); + config.setOffsiteRequestFiltering(true); - config.addCrawlSeeds(Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST)); + Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST) + .forEach(config::addCrawlSeed); // Create frontier frontier = new CrawlFrontier(config); @@ -400,27 +401,27 @@ public void getNextRequestDepthFirstTest() { // There should be no more candidates left at this point assertFalse(frontier.hasNextCandidate()); } - + @Test public void maxCrawlDepthTest() { // Set max crawl depth config.setMaximumCrawlDepth(MAX_CRAWL_DEPTH); - + // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); - + // Feed a child request, its crawl depth will be 1 frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); - + // Get the crawl candidate of the previously added child URL CrawlCandidate nextCandidate = frontier.getNextCandidate(); - + // Check its crawl depth, it should be less than or equal to the limit assertTrue(nextCandidate.getCrawlDepth() <= MAX_CRAWL_DEPTH); - + // Feed another child request, its crawl depth will be 2 which is above the limit frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - + // There should be no more candidates at this point assertFalse(frontier.hasNextCandidate()); } From 7cf0abd262f0d62fd1643be25cd476b59110108a Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 7 Mar 2018 00:43:49 +0100 Subject: [PATCH 10/24] Refactor crawl delays, alter comments --- pom.xml | 7 -- .../peterbencze/serritor/api/BaseCrawler.java | 76 +++++++++++++------ .../serritor/api/CrawlRequest.java | 35 ++++----- .../serritor/api/HtmlResponse.java | 4 +- .../serritor/api/HttpHeadResponse.java | 6 +- .../serritor/api/UnsuccessfulRequest.java | 5 +- ....java => AdaptiveCrawlDelayMechanism.java} | 28 +++---- .../serritor/internal/CallbackParameter.java | 2 +- .../serritor/internal/CrawlCandidate.java | 2 +- .../serritor/internal/CrawlDelayFactory.java | 68 ----------------- ...awlDelay.java => CrawlDelayMechanism.java} | 6 +- .../serritor/internal/CrawlFrontier.java | 39 +++++----- .../internal/CrawlerConfigurator.java | 30 ++++---- ...lay.java => FixedCrawlDelayMechanism.java} | 19 ++--- ...ay.java => RandomCrawlDelayMechanism.java} | 20 ++--- .../serritor/internal/CrawlFrontierTest.java | 1 - 16 files changed, 151 insertions(+), 197 deletions(-) rename src/main/java/com/github/peterbencze/serritor/internal/{AdaptiveCrawlDelay.java => AdaptiveCrawlDelayMechanism.java} (61%) delete mode 100644 src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java rename src/main/java/com/github/peterbencze/serritor/internal/{CrawlDelay.java => CrawlDelayMechanism.java} (83%) rename src/main/java/com/github/peterbencze/serritor/internal/{FixedCrawlDelay.java => FixedCrawlDelayMechanism.java} (62%) 
rename src/main/java/com/github/peterbencze/serritor/internal/{RandomCrawlDelay.java => RandomCrawlDelayMechanism.java} (60%) diff --git a/pom.xml b/pom.xml index 33c497b..a3ccc27 100644 --- a/pom.xml +++ b/pom.xml @@ -25,13 +25,6 @@ Owner - - Krisztian Mozsi - mozsik0@gmail.com - - Committer - - diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 392fc68..17b01c1 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -19,12 +19,14 @@ import com.github.peterbencze.serritor.api.HtmlResponse.HtmlResponseBuilder; import com.github.peterbencze.serritor.api.NonHtmlResponse.NonHtmlResponseBuilder; import com.github.peterbencze.serritor.api.UnsuccessfulRequest.UnsuccessfulRequestBuilder; +import com.github.peterbencze.serritor.internal.AdaptiveCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.CrawlCandidate; -import com.github.peterbencze.serritor.internal.CrawlDelay; -import com.github.peterbencze.serritor.internal.CrawlDelayFactory; +import com.github.peterbencze.serritor.internal.CrawlDelayMechanism; import com.github.peterbencze.serritor.internal.CrawlFrontier; import com.github.peterbencze.serritor.internal.CrawlerConfiguration; import com.github.peterbencze.serritor.internal.CrawlerConfigurator; +import com.github.peterbencze.serritor.internal.FixedCrawlDelayMechanism; +import com.github.peterbencze.serritor.internal.RandomCrawlDelayMechanism; import java.io.IOException; import java.io.InputStream; import java.io.ObjectInputStream; @@ -52,7 +54,7 @@ * @author Peter Bencze */ public abstract class BaseCrawler { - + protected final CrawlerConfigurator configurator; private final CrawlerConfiguration configuration; @@ -70,7 +72,7 @@ public abstract class BaseCrawler { private CrawlFrontier crawlFrontier; - private CrawlDelay crawlDelay; + private CrawlDelayMechanism crawlDelayMechanism; protected BaseCrawler() { configuration = new CrawlerConfiguration(); @@ -88,9 +90,11 @@ public final void start() { } /** - * Starts the crawler using the browser specified by the WebDriver instance. + * Starts the crawler using the browser specified by the + * WebDriver instance. * - * @param driver The WebDriver instance that will be used by the crawler + * @param driver The WebDriver instance that will be used by + * the crawler */ public final void start(final WebDriver driver) { start(driver, new CrawlFrontier(configuration)); @@ -99,7 +103,8 @@ public final void start(final WebDriver driver) { /** * Constructs all the necessary objects and runs the crawler. * - * @param frontierToUse Crawl frontier to be used by the crawler. + * @param frontierToUse The CrawlFrontier instance to be used + * by the crawler. 
*/ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { try { @@ -109,15 +114,10 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { } isStopped = false; - httpClient = HttpClientBuilder.create().build(); - webDriver = driver; - crawlFrontier = frontierToUse; - - CrawlDelayFactory crawlDelayFactory = new CrawlDelayFactory(configuration, (JavascriptExecutor) driver); - crawlDelay = crawlDelayFactory.getInstanceOf(configuration.getCrawlDelayStrategy()); + crawlDelayMechanism = createCrawlDelayMechanism(); run(); } finally { @@ -132,8 +132,9 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { /** * Saves the current state of the crawler to the specified output stream. * - * @param out The output stream to use - * @throws IOException Any exception thrown by the underlying OutputStream. + * @param out The OutputStream instance to use + * @throws IOException Any exception thrown by the underlying + * OutputStream. */ public final void saveState(final OutputStream out) throws IOException { // Check if the crawler has been started, otherwise we have nothing to save @@ -149,8 +150,8 @@ public final void saveState(final OutputStream out) throws IOException { /** * Resumes a previously saved state using HtmlUnit headless browser. * - * @param in The input stream to use - * @throws IOException Any of the usual Input/Output related exceptions. + * @param in The InputStream instance to use + * @throws IOException Any of the usual input/output related exceptions. * @throws ClassNotFoundException Class of a serialized object cannot be * found. */ @@ -162,9 +163,10 @@ public final void resumeState(final InputStream in) throws IOException, ClassNot * Resumes a previously saved state using the browser specified by the * WebDriver instance. * - * @param driver The WebDriver instance that will be used by the crawler - * @param in The input stream to use - * @throws IOException Any of the usual Input/Output related exceptions. + * @param driver The WebDriver instance to be used by the + * crawler + * @param in The InputStream instance to use + * @throws IOException Any of the usual input/output related exceptions. * @throws ClassNotFoundException Class of a serialized object cannot be * found. */ @@ -198,7 +200,7 @@ public final void stop() { * {@link CrawlerConfiguration#addCrawlSeed(com.github.peterbencze.serritor.api.CrawlRequest)} * for adding crawl seeds. * - * @param request The crawl request + * @param request The CrawlRequest instance */ protected final void crawl(final CrawlRequest request) { // Check if the crawler is running @@ -212,7 +214,7 @@ protected final void crawl(final CrawlRequest request) { /** * Passes multiple crawl requests to the crawl frontier. * - * @param requests The list of crawl requests + * @param requests The list of CrawlRequest instances */ protected final void crawl(final List requests) { requests.stream().forEach(this::crawl); @@ -319,19 +321,43 @@ private HttpHeadResponse getHttpHeadResponse(final URL destinationUrl, final Htt * Indicates if the content of the response is HTML or not. 
      *
      * @param httpHeadResponse The HTTP HEAD response
-     * @return True if the content is HTML, false otherwise
+     * @return true if the content is HTML, false
+     * otherwise
      */
-    private boolean isContentHtml(final HttpHeadResponse httpHeadResponse) {
+    private static boolean isContentHtml(final HttpHeadResponse httpHeadResponse) {
         Header contentTypeHeader = httpHeadResponse.getFirstHeader("Content-Type");
         return contentTypeHeader != null && contentTypeHeader.getValue().contains("text/html");
     }
 
+    /**
+     * Constructs the crawl delay mechanism specified in the configuration.
+     *
+     * @return The crawl delay mechanism
+     */
+    private CrawlDelayMechanism createCrawlDelayMechanism() {
+        switch (configuration.getCrawlDelayStrategy()) {
+            case FIXED:
+                return new FixedCrawlDelayMechanism(configuration);
+            case RANDOM:
+                return new RandomCrawlDelayMechanism(configuration);
+            case ADAPTIVE:
+                AdaptiveCrawlDelayMechanism adaptiveCrawlDelay = new AdaptiveCrawlDelayMechanism(configuration, (JavascriptExecutor) webDriver);
+                if (!adaptiveCrawlDelay.isBrowserCompatible()) {
+                    throw new UnsupportedOperationException("The Navigation Timing API is not supported by the browser.");
+                }
+
+                return adaptiveCrawlDelay;
+        }
+
+        throw new IllegalArgumentException("Unsupported crawl delay strategy.");
+    }
+
     /**
      * Delays the next request.
      */
     private void performDelay() {
         try {
-            TimeUnit.MILLISECONDS.sleep(crawlDelay.getDelay());
+            TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay());
         } catch (InterruptedException ex) {
             Thread.currentThread().interrupt();
             stopCrawling = true;
diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
index 26ff078..34f26f0 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
@@ -26,7 +26,6 @@
  * future. The reason why it is not sure that it will be processed is because it
  * might get filtered out by one of the enabled filters.
  *
- * @author Krisztian Mozsi
  * @author Peter Bencze
  */
 public final class CrawlRequest implements Serializable {
@@ -80,7 +79,7 @@ public Optional getMetadata() {
     }
 
     public static final class CrawlRequestBuilder {
-
+
         private static final int DEFAULT_PRIORITY = 0;
 
         private final URL requestUrl;
@@ -90,10 +89,11 @@ public static final class CrawlRequestBuilder {
         private Serializable metadata;
 
         /**
-         * Constructs a CrawlRequestBuilder instance that can be used to create
-         * CrawRequest instances.
+         * Constructs a CrawlRequestBuilder instance that can be
+         * used to create CrawlRequest instances.
          *
-         * @param requestUrl The request's URL given as a URL instance
+         * @param requestUrl The request's URL given as a URL
+         * instance
          */
         public CrawlRequestBuilder(final URL requestUrl) {
             this.requestUrl = requestUrl;
@@ -112,10 +112,11 @@ public CrawlRequestBuilder(final URL requestUrl) {
         }
 
         /**
-         * Constructs a CrawlRequestBuilder instance that can be used to create
-         * CrawRequest instances.
+         * Constructs a CrawlRequestBuilder instance that can be
+         * used to create CrawlRequest instances.
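// Illustrative sketch, not part of the patch: selecting the adaptive strategy
// handled by the switch above. If the browser lacks the Navigation Timing API,
// start() now fails fast with an UnsupportedOperationException.
configurator.setCrawlDelayStrategy(CrawlDelayStrategy.ADAPTIVE);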
* - * @param requestUrl The request's URL given as a String instance + * @param requestUrl The request's URL given as a String + * instance */ public CrawlRequestBuilder(final String requestUrl) { this(getUrlFromString(requestUrl)); @@ -126,7 +127,7 @@ public CrawlRequestBuilder(final String requestUrl) { * * @param priority The priority of the request (higher number means * higher priority) - * @return The builder instance + * @return The CrawlRequestBuilder instance */ public CrawlRequestBuilder setPriority(final int priority) { this.priority = priority; @@ -138,7 +139,7 @@ public CrawlRequestBuilder setPriority(final int priority) { * when the crawler processed the request. * * @param metadata The metadata associated with the request - * @return The builder instance + * @return The CrawlRequestBuilder instance */ public CrawlRequestBuilder setMetadata(final Serializable metadata) { this.metadata = metadata; @@ -146,21 +147,21 @@ public CrawlRequestBuilder setMetadata(final Serializable metadata) { } /** - * Builds the specified CrawlRequest instance. + * Builds the configured CrawlRequest instance. * - * @return The specified CrawlRequest instance + * @return The configured CrawlRequest instance */ public CrawlRequest build() { return new CrawlRequest(this); } /** - * Constructs a URL instance based on the specified URL string. Since - * call to this must be the first statement in a constructor, this - * method is necessary for the conversion to be made. + * Constructs a URL instance based on the specified URL + * string. Since call to this must be the first statement in a + * constructor, this method is necessary for the conversion to be made. * - * @param requestUrl The request URL as String - * @return The request URL + * @param requestUrl The request URL as String + * @return The URL instance */ private static URL getUrlFromString(final String requestUrl) { try { diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java index 4138abb..a7be956 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java @@ -46,9 +46,9 @@ public HttpHeadResponse getHttpHeadResponse() { } /** - * Returns the WebDriver instance for the browser. + * Returns the WebDriver instance for the browser. * - * @return The WebDriver instance + * @return The WebDriver instance */ public WebDriver getWebDriver() { return webDriver; diff --git a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java index 847b281..93f2aed 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java @@ -39,7 +39,7 @@ public HttpHeadResponse(final HttpResponse response) { * Checks if a certain header is present in this message. * * @param name The name of the header - * @return True if it is present, false otherwise + * @return true if present, false otherwise */ public boolean containsHeader(final String name) { return response.containsHeader(name); @@ -48,7 +48,7 @@ public boolean containsHeader(final String name) { /** * Returns all the headers of this response. 
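// Illustrative sketch, not part of the patch: the fluent builder documented
// above in action.
CrawlRequest request = new CrawlRequestBuilder("http://example.com")
        .setPriority(1)              // higher number means higher priority
        .setMetadata("landing page") // any Serializable value
        .build();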
      *
-     * @return All the headers
+     * @return The array of headers
      */
     public Header[] getAllHeaders() {
         return response.getAllHeaders();
@@ -68,7 +68,7 @@ public Header getFirstHeader(final String name) {
      * Returns all the headers with a specified name of this response.
      *
      * @param name The name of the headers
-     * @return All the headers
+     * @return The array of headers
      */
     public Header[] getHeaders(final String name) {
         return response.getHeaders(name);
diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
index f809a6a..12c67cc 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
@@ -35,9 +35,10 @@ private UnsuccessfulRequest(final UnsuccessfulRequestBuilder builder) {
     }
 
     /**
-     * Returns the exception that was thrown.
+     * Returns the exception that was thrown while trying to fulfill the
+     * request.
      *
-     * @return The thrown exception
+     * @return The IOException instance
      */
     public IOException getException() {
         return exception;
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java
similarity index 61%
rename from src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java
rename to src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java
index 9fd9e9a..ac8913e 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java
@@ -18,30 +18,30 @@ import org.openqa.selenium.JavascriptExecutor;
 
 /**
- * A type of crawl delay, in which case the delay corresponds to the page
- * loading time, if it's between the specified range, otherwise the minimum or
+ * A crawl delay mechanism, in which case the delay corresponds to the page
+ * loading time, if it is within the specified range, otherwise the minimum or
  * maximum duration is used.
  *
  * @author Peter Bencze
 */
-public final class AdaptiveCrawlDelay implements CrawlDelay {
+public final class AdaptiveCrawlDelayMechanism implements CrawlDelayMechanism {
 
     private final long minDelayInMillis;
     private final long maxDelayInMillis;
-    private final JavascriptExecutor javascriptExecutor;
+    private final JavascriptExecutor jsExecutor;
 
     /**
-     * Constructs a new AdaptiveCrawlDelay instance.
+     * Constructs a new AdaptiveCrawlDelayMechanism instance.
      *
-     * @param config A CrawlerConfiguration instance which
+     * @param configuration The CrawlerConfiguration instance which
      * specifies the minimum and maximum delay.
-     * @param javascriptExecutor A WebDriver instance which is
-     * capable of executing JavaScript.
+     * @param jsExecutor The WebDriver instance which is capable of
+     * executing JavaScript.
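// Illustrative sketch, not part of the patch: the getDelay() implementation
// below amounts to clamping the measured page load time into the configured
// range; the three variables here are stand-ins for the values involved.
long delay = Math.min(Math.max(loadTimeInMillis, minDelayInMillis), maxDelayInMillis);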
*/ - public AdaptiveCrawlDelay(final CrawlerConfiguration config, final JavascriptExecutor javascriptExecutor) { - minDelayInMillis = config.getMinimumCrawlDelayDurationInMillis(); - maxDelayInMillis = config.getMaximumCrawlDelayDurationInMillis(); - this.javascriptExecutor = javascriptExecutor; + public AdaptiveCrawlDelayMechanism(final CrawlerConfiguration configuration, final JavascriptExecutor jsExecutor) { + minDelayInMillis = configuration.getMinimumCrawlDelayDurationInMillis(); + maxDelayInMillis = configuration.getMaximumCrawlDelayDurationInMillis(); + this.jsExecutor = jsExecutor; } /** @@ -51,7 +51,7 @@ public AdaptiveCrawlDelay(final CrawlerConfiguration config, final JavascriptExe * false otherwise */ public boolean isBrowserCompatible() { - return (boolean) javascriptExecutor.executeScript("return ('performance' in window) && ('timing' in window.performance)"); + return (boolean) jsExecutor.executeScript("return ('performance' in window) && ('timing' in window.performance)"); } /** @@ -64,7 +64,7 @@ public boolean isBrowserCompatible() { */ @Override public long getDelay() { - long delayInMillis = (long) javascriptExecutor.executeScript("return performance.timing.loadEventEnd - performance.timing.navigationStart;"); + long delayInMillis = (long) jsExecutor.executeScript("return performance.timing.loadEventEnd - performance.timing.navigationStart;"); if (delayInMillis < minDelayInMillis) { return minDelayInMillis; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java index 28af583..9ca1d75 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java @@ -57,7 +57,7 @@ public final int getCrawlDepth() { /** * Returns the crawl request that was processed by the crawler. * - * @return The processed crawl request + * @return The processed CrawlRequest instance */ public final CrawlRequest getCrawlRequest() { return crawlRequest; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java index 8d599ab..49b0721 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java @@ -85,7 +85,7 @@ public int getPriority() { /** * Returns the crawl request from which this candidate was constructed. * - * @return The crawl request + * @return The CrawlRequest instance */ public CrawlRequest getCrawlRequest() { return crawlRequest; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java deleted file mode 100644 index 97e78d1..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2018 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.github.peterbencze.serritor.internal; - -import com.github.peterbencze.serritor.api.CrawlDelayStrategy; -import org.openqa.selenium.JavascriptExecutor; - -/** - * Factory class which is used to construct the required crawl delay instance - * specified in the configuration. - * - * @author Peter Bencze - */ -public final class CrawlDelayFactory { - - private final CrawlerConfiguration config; - private final JavascriptExecutor javascriptExecutor; - - /** - * Constructs a new CrawlDelayFactory instance. - * - * @param config A CrawlerConfiguration instance which - * specifies the minimum and maximum delay. - * @param javascriptExecutor A WebDriver instance which is - * capable of executing JavaScript. - */ - public CrawlDelayFactory(final CrawlerConfiguration config, final JavascriptExecutor javascriptExecutor) { - this.config = config; - this.javascriptExecutor = javascriptExecutor; - } - - /** - * Constructs the specific crawl delay instance determined by the strategy. - * - * @param crawlDelayStrategy The crawl delay strategy - * @return The specific crawl delay instance - */ - public CrawlDelay getInstanceOf(final CrawlDelayStrategy crawlDelayStrategy) { - switch (crawlDelayStrategy) { - case FIXED: - return new FixedCrawlDelay(config); - case RANDOM: - return new RandomCrawlDelay(config); - case ADAPTIVE: - AdaptiveCrawlDelay adaptiveCrawlDelay = new AdaptiveCrawlDelay(config, javascriptExecutor); - if (!adaptiveCrawlDelay.isBrowserCompatible()) { - throw new UnsupportedOperationException("The Navigation Timing API is not supported by the browser."); - } - - return adaptiveCrawlDelay; - } - - throw new IllegalArgumentException("Unsupported crawl delay strategy."); - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java similarity index 83% rename from src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java rename to src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java index 652b2e9..34317b1 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java @@ -16,16 +16,16 @@ package com.github.peterbencze.serritor.internal; /** - * An interface that every type of crawl delay should implement. + * An interface that every crawl delay mechanism should implement. * * @author Peter Bencze */ -public interface CrawlDelay { +public interface CrawlDelayMechanism { /** * Returns the delay that should pass between each request. * - * @return The delay in milliseconds + * @return The duration of delay in milliseconds */ long getDelay(); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 2d58e03..58e56e2 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -35,11 +35,10 @@ * crawling. 
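// Illustrative sketch, not part of the patch: the renamed interface keeps a
// single method, so a custom mechanism stays trivial to implement.
final class NoDelayMechanism implements CrawlDelayMechanism {

    @Override
    public long getDelay() {
        return 0L; // no politeness delay between requests
    }
}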
* * @author Peter Bencze - * @author Krisztian Mozsi */ public final class CrawlFrontier implements Serializable { - private final CrawlerConfiguration config; + private final CrawlerConfiguration configuration; private final Set allowedDomains; private final Set urlFingerprints; @@ -48,17 +47,17 @@ public final class CrawlFrontier implements Serializable { private CrawlCandidate currentCandidate; - public CrawlFrontier(final CrawlerConfiguration config) { - this.config = config; + public CrawlFrontier(final CrawlerConfiguration configuration) { + this.configuration = configuration; allowedDomains = new HashSet<>(); urlFingerprints = new HashSet<>(); // Construct a priority queue according to the crawl strategy specified in the configuration - candidates = getPriorityQueue(); + candidates = createPriorityQueue(); // Feed initial crawl requests (seeds) - config.getCrawlSeeds().stream() + configuration.getCrawlSeeds().stream() .forEach((CrawlRequest request) -> { feedRequest(request, true); }); @@ -67,11 +66,12 @@ public CrawlFrontier(final CrawlerConfiguration config) { /** * Feeds a crawl request to the frontier. * - * @param request The request to be fed - * @param isCrawlSeed True if the request is a crawl seed, false otherwise + * @param request The CrawlRequest instance to be fed + * @param isCrawlSeed true if the request is a crawl seed, + * false otherwise */ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { - if (config.isOffsiteRequestFilteringEnabled()) { + if (configuration.isOffsiteRequestFilteringEnabled()) { if (isCrawlSeed) { allowedDomains.add(request.getTopPrivateDomain()); } else { @@ -81,8 +81,8 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { } } - if (config.isDuplicateRequestFilteringEnabled()) { - String urlFingerprint = getFingerprintForUrl(request.getRequestUrl()); + if (configuration.isDuplicateRequestFilteringEnabled()) { + String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); // Check if the URL has already been crawled if (urlFingerprints.contains(urlFingerprint)) { @@ -96,7 +96,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { CrawlCandidateBuilder builder; if (!isCrawlSeed) { - int crawlDepthLimit = config.getMaximumCrawlDepth(); + int crawlDepthLimit = configuration.getMaximumCrawlDepth(); int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1; // If a crawl depth limit is set, check if the candidate's crawl depth is less than or equal to the limit @@ -117,7 +117,8 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { /** * Indicates if there are any candidates left in the queue. * - * @return True if there are candidates in the queue, false otherwise + * @return true if there are candidates in the queue, + * false otherwise */ public boolean hasNextCandidate() { return !candidates.isEmpty(); @@ -126,7 +127,7 @@ public boolean hasNextCandidate() { /** * Gets the next candidate from the queue. 
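// Illustrative sketch, not part of the patch: the frontier is drained in the
// order imposed by the configured crawl strategy.
while (frontier.hasNextCandidate()) {
    CrawlCandidate candidate = frontier.getNextCandidate();
    // fetch and process candidate.getCrawlRequest() here
}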
* - * @return The next candidate + * @return The next CrawlCandidate instance */ public CrawlCandidate getNextCandidate() { currentCandidate = candidates.poll(); @@ -139,7 +140,7 @@ public CrawlCandidate getNextCandidate() { * @param url The URL that the fingerprint will be created for * @return The fingerprint of the URL */ - private String getFingerprintForUrl(final URL url) { + private static String createFingerprintForUrl(final URL url) { // First, we start off with the host only StringBuilder truncatedUrl = new StringBuilder(url.getHost()); @@ -170,11 +171,11 @@ private String getFingerprintForUrl(final URL url) { /** * Creates a new priority queue using the specified strategy. * - * @return A new PriorityQueue instance for CrawlRequests using the given - * comparator + * @return The PriorityQueue instance for crawl requests using + * the given comparator */ - private PriorityQueue getPriorityQueue() { - switch (config.getCrawlStrategy()) { + private PriorityQueue createPriorityQueue() { + switch (configuration.getCrawlStrategy()) { case BREADTH_FIRST: return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth) .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder())); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java index 378f591..e2f4527 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java @@ -27,12 +27,12 @@ * * @author Peter Bencze */ -public class CrawlerConfigurator { +public final class CrawlerConfigurator { - private final CrawlerConfiguration config; + private final CrawlerConfiguration configuration; - public CrawlerConfigurator(CrawlerConfiguration config) { - this.config = config; + public CrawlerConfigurator(CrawlerConfiguration configuration) { + this.configuration = configuration; } /** @@ -42,7 +42,7 @@ public CrawlerConfigurator(CrawlerConfiguration config) { * the crawl seed */ public void addCrawlSeed(final CrawlRequest request) { - config.addCrawlSeed(Preconditions.checkNotNull(request)); + configuration.addCrawlSeed(Preconditions.checkNotNull(request)); } /** @@ -63,7 +63,7 @@ public void addCrawlSeeds(final List requests) { * @param crawlStrategy The crawl strategy */ public void setCrawlStrategy(final CrawlStrategy crawlStrategy) { - config.setCrawlStrategy(Preconditions.checkNotNull(crawlStrategy)); + configuration.setCrawlStrategy(Preconditions.checkNotNull(crawlStrategy)); } /** @@ -73,7 +73,7 @@ public void setCrawlStrategy(final CrawlStrategy crawlStrategy) { * false means disabled */ public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) { - config.setDuplicateRequestFiltering(filterDuplicateRequests); + configuration.setDuplicateRequestFiltering(filterDuplicateRequests); } /** @@ -83,7 +83,7 @@ public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) * false means disabled */ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { - config.setOffsiteRequestFiltering(filterOffsiteRequests); + configuration.setOffsiteRequestFiltering(filterOffsiteRequests); } /** @@ -95,7 +95,7 @@ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { public void setMaximumCrawlDepth(final int maxCrawlDepth) { Preconditions.checkArgument(maxCrawlDepth >= 0, "The 
maximum crawl depth cannot be negative."); - config.setMaximumCrawlDepth(maxCrawlDepth); + configuration.setMaximumCrawlDepth(maxCrawlDepth); } /** @@ -104,7 +104,7 @@ public void setMaximumCrawlDepth(final int maxCrawlDepth) { * @param crawlDelayStrategy The crawl delay strategy */ public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { - config.setCrawlDelayStrategy(Preconditions.checkNotNull(crawlDelayStrategy)); + configuration.setCrawlDelayStrategy(Preconditions.checkNotNull(crawlDelayStrategy)); } /** @@ -113,7 +113,7 @@ public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { * @param fixedCrawlDelayDuration The duration of delay */ public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { - config.setFixedCrawlDelayDurationInMillis(fixedCrawlDelayDuration.toMillis()); + configuration.setFixedCrawlDelayDurationInMillis(fixedCrawlDelayDuration.toMillis()); } /** @@ -125,11 +125,11 @@ public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { Preconditions.checkArgument(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative."); long minCrawlDelayDurationInMillis = minCrawlDelayDuration.toMillis(); - long maxCrawlDelayInMillis = config.getMaximumCrawlDelayDurationInMillis(); + long maxCrawlDelayInMillis = configuration.getMaximumCrawlDelayDurationInMillis(); Preconditions.checkArgument(minCrawlDelayDurationInMillis < maxCrawlDelayInMillis, "The minimum crawl delay should be less than the maximum."); - config.setMinimumCrawlDelayDurationInMillis(minCrawlDelayDurationInMillis); + configuration.setMinimumCrawlDelayDurationInMillis(minCrawlDelayDurationInMillis); } /** @@ -138,11 +138,11 @@ public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { * @param maxCrawlDelayDuration The maximum duration of delay */ public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { - long minCrawlDelayDurationInMillis = config.getMinimumCrawlDelayDurationInMillis(); + long minCrawlDelayDurationInMillis = configuration.getMinimumCrawlDelayDurationInMillis(); long maxCrawlDelayDurationInMillis = maxCrawlDelayDuration.toMillis(); Preconditions.checkArgument(maxCrawlDelayDurationInMillis > minCrawlDelayDurationInMillis, "The maximum crawl delay should be higher than the minimum."); - config.setMaximumCrawlDelayDuration(maxCrawlDelayDurationInMillis); + configuration.setMaximumCrawlDelayDuration(maxCrawlDelayDurationInMillis); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java similarity index 62% rename from src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java rename to src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java index 3eb0f87..0ca8307 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java @@ -16,27 +16,28 @@ package com.github.peterbencze.serritor.internal; /** - * A type of crawl delay, in which case the delay is constant and equals to the - * duration specified in the configuration. + * A crawl delay mechanism, in which case the delay is constant and equals to + * the duration specified in the configuration. 
* * @author Peter Bencze */ -public final class FixedCrawlDelay implements CrawlDelay { +public final class FixedCrawlDelayMechanism implements CrawlDelayMechanism { private final long delayInMillis; /** - * Constructs a new FixedCrawlDelay instance. - * - * @param config A CrawlerConfiguration instance which specifies the fixed delay + * Constructs a new FixedCrawlDelayMechanism instance. + * + * @param configuration The CrawlerConfiguration instance which + * specifies the fixed delay duration. */ - public FixedCrawlDelay(final CrawlerConfiguration config) { - delayInMillis = config.getFixedCrawlDelayDurationInMillis(); + public FixedCrawlDelayMechanism(final CrawlerConfiguration configuration) { + this.delayInMillis = configuration.getFixedCrawlDelayDurationInMillis(); } /** * Returns the fixed delay specified in the configuration. - * + * * @return The delay in milliseconds */ @Override diff --git a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java similarity index 60% rename from src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java rename to src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java index 6c16073..13b33b0 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java @@ -18,25 +18,25 @@ import java.util.concurrent.ThreadLocalRandom; /** - * A type of crawl delay in which case the duration is randomized between the + * A crawl delay mechanism in which case the duration is randomized between the * specified minimum and maximum range. * * @author Peter Bencze */ -public final class RandomCrawlDelay implements CrawlDelay { +public final class RandomCrawlDelayMechanism implements CrawlDelayMechanism { - private final long origin; - private final long bound; + private final long lowerLimit; + private final long upperLimit; /** - * Constructs a new RandomCrawlDelay instance. + * Constructs a new RandomCrawlDelayMechanism instance. * - * @param config A CrawlerConfiguration instance which + * @param configuration The CrawlerConfiguration instance which * specifies the minimum and maximum delay. */ - public RandomCrawlDelay(final CrawlerConfiguration config) { - origin = config.getMinimumCrawlDelayDurationInMillis(); - bound = config.getMaximumCrawlDelayDurationInMillis() + 1; + public RandomCrawlDelayMechanism(final CrawlerConfiguration configuration) { + lowerLimit = configuration.getMinimumCrawlDelayDurationInMillis(); + upperLimit = configuration.getMaximumCrawlDelayDurationInMillis() + 1; } /** @@ -47,6 +47,6 @@ public RandomCrawlDelay(final CrawlerConfiguration config) { */ @Override public long getDelay() { - return ThreadLocalRandom.current().nextLong(origin, bound); + return ThreadLocalRandom.current().nextLong(lowerLimit, upperLimit); } } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 6cab603..047e202 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -30,7 +30,6 @@ /** * Test cases for CrawlFrontier. 
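One detail in the random delay mechanism above deserves a note: ThreadLocalRandom.nextLong(origin, bound) treats the bound as exclusive, which is why the constructor adds 1 to the configured maximum. A tiny illustration (java.util.concurrent.ThreadLocalRandom; the values are example milliseconds):

    long lowerLimit = 1000;      // configured minimum delay
    long upperLimit = 3000 + 1;  // configured maximum delay + 1 makes it inclusive
    long delay = ThreadLocalRandom.current().nextLong(lowerLimit, upperLimit);
    // delay is uniformly distributed over 1000..3000, both endpoints included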
* - * @author Krisztian Mozsi * @author Peter Bencze */ public final class CrawlFrontierTest { From f175f90099fac0d649e0273057fca377918770ce Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 8 Mar 2018 00:40:36 +0100 Subject: [PATCH 11/24] Refactor serialization --- .../peterbencze/serritor/api/BaseCrawler.java | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 17b01c1..489a6f0 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -29,13 +29,12 @@ import com.github.peterbencze.serritor.internal.RandomCrawlDelayMechanism; import java.io.IOException; import java.io.InputStream; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; import java.io.OutputStream; import java.net.URI; import java.net.URL; import java.util.List; import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.SerializationUtils; import org.apache.http.Header; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; @@ -133,29 +132,23 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { * Saves the current state of the crawler to the specified output stream. * * @param out The OutputStream instance to use - * @throws IOException Any exception thrown by the underlying - * OutputStream. */ - public final void saveState(final OutputStream out) throws IOException { - // Check if the crawler has been started, otherwise we have nothing to save + public final void saveState(final OutputStream out) { + // Check if the crawler has been started at least once, otherwise we have nothing to save if (crawlFrontier == null) { throw new IllegalStateException("No state to save."); } - // Save the frontier's current state - ObjectOutputStream objectOutputStream = new ObjectOutputStream(out); - objectOutputStream.writeObject(crawlFrontier); + // Save the crawl frontier's current state + SerializationUtils.serialize(crawlFrontier, out); } /** * Resumes a previously saved state using HtmlUnit headless browser. * * @param in The InputStream instance to use - * @throws IOException Any of the usual input/output related exceptions. - * @throws ClassNotFoundException Class of a serialized object cannot be - * found. */ - public final void resumeState(final InputStream in) throws IOException, ClassNotFoundException { + public final void resumeState(final InputStream in) { resumeState(new HtmlUnitDriver(true), in); } @@ -166,13 +159,10 @@ public final void resumeState(final InputStream in) throws IOException, ClassNot * @param driver The WebDriver instance to be used by the * crawler * @param in The InputStream instance to use - * @throws IOException Any of the usual input/output related exceptions. - * @throws ClassNotFoundException Class of a serialized object cannot be - * found. 
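Putting the new serialization API together, usage might look like the following sketch (MyCrawler and the file name are hypothetical; Files and Paths come from java.nio.file, and HtmlUnitDriver is the Selenium driver already used above):

    MyCrawler crawler = new MyCrawler();     // some BaseCrawler subclass
    crawler.start(new HtmlUnitDriver(true)); // runs until the frontier is exhausted or stop() is called

    try (OutputStream out = Files.newOutputStream(Paths.get("crawler-state.bin"))) {
        crawler.saveState(out);              // serializes the crawl frontier
    }

    try (InputStream in = Files.newInputStream(Paths.get("crawler-state.bin"))) {
        crawler.resumeState(in);             // deserializes it and restarts with HtmlUnit
    }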
*/ - public final void resumeState(final WebDriver driver, final InputStream in) throws IOException, ClassNotFoundException { - ObjectInputStream objectInputStream = new ObjectInputStream(in); - CrawlFrontier frontierToUse = (CrawlFrontier) objectInputStream.readObject(); + public final void resumeState(final WebDriver driver, final InputStream in) { + // Re-create crawl frontier from the saved state + CrawlFrontier frontierToUse = SerializationUtils.deserialize(in); start(driver, frontierToUse); } From c481cd46abf7982e9ec26bb0ce7113f5ec580a50 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 8 Mar 2018 16:10:23 +0100 Subject: [PATCH 12/24] Add the possibility of specifying allowed crawl domains --- .../serritor/api/CrawlRequest.java | 37 ++++---- .../serritor/internal/CrawlCandidate.java | 9 +- .../serritor/internal/CrawlDomain.java | 91 +++++++++++++++++++ .../serritor/internal/CrawlFrontier.java | 27 ++++-- .../internal/CrawlerConfiguration.java | 23 +++++ .../internal/CrawlerConfigurator.java | 23 +++++ .../serritor/internal/CrawlDomainTest.java | 59 ++++++++++++ .../serritor/internal/CrawlFrontierTest.java | 31 +++++-- 8 files changed, 260 insertions(+), 40 deletions(-) create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java create mode 100644 src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 34f26f0..5cb9a23 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -16,6 +16,8 @@ package com.github.peterbencze.serritor.api; import com.google.common.net.InternetDomainName; +import java.io.IOException; +import java.io.ObjectInputStream; import java.io.Serializable; import java.net.MalformedURLException; import java.net.URL; @@ -31,13 +33,14 @@ public final class CrawlRequest implements Serializable { private final URL requestUrl; - private final String topPrivateDomain; private final int priority; private final Serializable metadata; + + private transient InternetDomainName domain; private CrawlRequest(final CrawlRequestBuilder builder) { requestUrl = builder.requestUrl; - topPrivateDomain = builder.topPrivateDomain; + domain = builder.domain; priority = builder.priority; metadata = builder.metadata; } @@ -52,12 +55,12 @@ public URL getRequestUrl() { } /** - * Returns the top private domain of the request's URL. + * Returns the domain of the request's URL. 
* - * @return The top private domain of the URL + * @return The domain of the request URL */ - public String getTopPrivateDomain() { - return topPrivateDomain; + public InternetDomainName getDomain() { + return domain; } /** @@ -83,8 +86,8 @@ public static final class CrawlRequestBuilder { private static final int DEFAULT_PRIORITY = 0; private final URL requestUrl; - - private String topPrivateDomain; + private final InternetDomainName domain; + private int priority; private Serializable metadata; @@ -98,14 +101,8 @@ public static final class CrawlRequestBuilder { public CrawlRequestBuilder(final URL requestUrl) { this.requestUrl = requestUrl; - // Extract the top private domain from the request URL - try { - topPrivateDomain = InternetDomainName.from(requestUrl.getHost()) - .topPrivateDomain() - .toString(); - } catch (IllegalStateException ex) { - throw new IllegalArgumentException(String.format("The top private domain cannot be extracted from the given request URL (\"%s\").", requestUrl), ex); - } + // Extract the domain from the request URL + domain = InternetDomainName.from(requestUrl.getHost()); // Set default priority priority = DEFAULT_PRIORITY; @@ -167,8 +164,14 @@ private static URL getUrlFromString(final String requestUrl) { try { return new URL(requestUrl); } catch (MalformedURLException ex) { - throw new IllegalArgumentException(String.format("The given request URL (\"%s\") is malformed.", requestUrl), ex); + throw new IllegalArgumentException(String.format("The URL (\"%s\") is malformed.", requestUrl), ex); } } } + + private void readObject(final ObjectInputStream in) throws IOException, ClassNotFoundException { + in.defaultReadObject(); + + domain = InternetDomainName.from(requestUrl.getHost()); + } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java index 49b0721..7a4acbd 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java @@ -16,6 +16,7 @@ package com.github.peterbencze.serritor.internal; import com.github.peterbencze.serritor.api.CrawlRequest; +import com.google.common.net.InternetDomainName; import java.io.Serializable; import java.net.URL; @@ -56,12 +57,12 @@ public URL getCandidateUrl() { } /** - * Returns the top private domain of the candidate's URL. + * Returns the domain of the candidate's URL. * - * @return The top private domain of the URL + * @return The domain of the candidate URL */ - public String getTopPrivateDomain() { - return crawlRequest.getTopPrivateDomain(); + public InternetDomainName getDomain() { + return crawlRequest.getDomain(); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java new file mode 100644 index 0000000..89bba42 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java @@ -0,0 +1,91 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.internal; + +import com.google.common.collect.ImmutableList; +import com.google.common.net.InternetDomainName; +import java.io.Serializable; + +/** + * Represents an internet domain in which crawling is allowed. + * + * @author Peter Bencze + */ +public final class CrawlDomain implements Serializable { + + private final ImmutableList parts; + + /** + * Constructs a new CrawlDomain instance. + * + * @param domain An immutable well-formed internet domain name + */ + public CrawlDomain(final InternetDomainName domain) { + parts = domain.parts(); + } + + /** + * Indicates if two CrawlDomain instances are equal or not. + * Crawl domains with the same domain name are considered equal. + * + * @param obj A CrawlDomain instance + * @return true if equal, false otherwise + */ + @Override + public boolean equals(final Object obj) { + if (obj == this) { + return true; + } + + if (obj instanceof CrawlDomain) { + CrawlDomain other = (CrawlDomain) obj; + return parts.equals(other.parts); + } + + return false; + } + + /** + * Calculates the hash code from the individual components of the domain + * name. + * + * @return The hash code for the crawl domain + */ + @Override + public int hashCode() { + return parts.hashCode(); + } + + /** + * Indicates if this crawl domain contains the specific internet domain. 
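Concretely, the suffix comparison implemented just below works on the reversed domain-name parts (Guava's InternetDomainName.parts()), so a crawl domain contains itself and every subdomain of itself, but never its parent. A worked example:

    CrawlDomain crawlDomain = new CrawlDomain(InternetDomainName.from("example.com"));

    // reversed parts ["com", "example"] prefix-match ["com", "example", "sub"]
    boolean containsSub = crawlDomain.contains(InternetDomainName.from("sub.example.com")); // true

    // the size check rejects the reverse direction: 3 parts cannot be a suffix of 2
    boolean parentContained = new CrawlDomain(InternetDomainName.from("sub.example.com"))
            .contains(InternetDomainName.from("example.com")); // false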
+ * + * @param domain An immutable well-formed internet domain name + * @return true if belongs, false otherwise + */ + public boolean contains(final InternetDomainName domain) { + ImmutableList otherDomainParts = domain.parts(); + + if (parts.size() > otherDomainParts.size()) { + return false; + } + + otherDomainParts = otherDomainParts.reverse() + .subList(0, parts.size()); + + return parts.reverse() + .equals(otherDomainParts); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 58e56e2..5fa6283 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -40,7 +40,7 @@ public final class CrawlFrontier implements Serializable { private final CrawlerConfiguration configuration; - private final Set allowedDomains; + private final Set allowedCrawlDomains; private final Set urlFingerprints; private final Queue candidates; @@ -50,7 +50,8 @@ public final class CrawlFrontier implements Serializable { public CrawlFrontier(final CrawlerConfiguration configuration) { this.configuration = configuration; - allowedDomains = new HashSet<>(); + allowedCrawlDomains = configuration.getAllowedCrawlDomains(); + urlFingerprints = new HashSet<>(); // Construct a priority queue according to the crawl strategy specified in the configuration @@ -72,24 +73,32 @@ public CrawlFrontier(final CrawlerConfiguration configuration) { */ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { if (configuration.isOffsiteRequestFilteringEnabled()) { - if (isCrawlSeed) { - allowedDomains.add(request.getTopPrivateDomain()); - } else { - if (!allowedDomains.contains(request.getTopPrivateDomain())) { - return; + // Check if the request's domain is in the allowed crawl domains + + boolean inCrawlDomain = false; + + for (CrawlDomain allowedCrawlDomain : allowedCrawlDomains) { + if (allowedCrawlDomain.contains(request.getDomain())) { + inCrawlDomain = true; + break; } } + + if (!inCrawlDomain) { + return; + } } if (configuration.isDuplicateRequestFilteringEnabled()) { + // Check if the URL has already been crawled + String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); - // Check if the URL has already been crawled + if (urlFingerprints.contains(urlFingerprint)) { return; } - // If not, add its fingerprint to the set of URL fingerprints urlFingerprints.add(urlFingerprint); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java index 1fa666f..303bc52 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java @@ -21,7 +21,9 @@ import java.io.Serializable; import java.time.Duration; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; /** * This class contains the settings of the crawler. 
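The imperative loop in feedRequest above is equivalent to a short stream pipeline; a sketch of the same check, should a more declarative style be preferred:

    boolean inCrawlDomain = allowedCrawlDomains.stream()
            .anyMatch(allowedCrawlDomain -> allowedCrawlDomain.contains(request.getDomain()));

    if (!inCrawlDomain) {
        return;
    }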
@@ -39,6 +41,7 @@ public final class CrawlerConfiguration implements Serializable { private static final long DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS = Duration.ofSeconds(1).toMillis(); private static final long DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS = Duration.ofMinutes(1).toMillis(); + private final Set allowedCrawlDomains; private final List crawlSeeds; private CrawlStrategy crawlStrategy; @@ -53,6 +56,7 @@ public final class CrawlerConfiguration implements Serializable { public CrawlerConfiguration() { // Initialize configuration with default values + allowedCrawlDomains = new HashSet<>(); crawlSeeds = new ArrayList<>(); crawlStrategy = DEFAULT_CRAWL_STRATEGY; filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT; @@ -64,6 +68,25 @@ public CrawlerConfiguration() { maxCrawlDelayDurationInMillis = DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS; } + /** + * Returns the set of allowed crawl domains. + * + * @return The set of allowed crawl domains + */ + public Set getAllowedCrawlDomains() { + return allowedCrawlDomains; + } + + /** + * Appends a crawl domain to the list of allowed ones. + * + * @param allowedCrawlDomain The CrawlDomain instance which + * represents the allowed crawl domain + */ + public void addAllowedCrawlDomain(CrawlDomain allowedCrawlDomain) { + allowedCrawlDomains.add(allowedCrawlDomain); + } + /** * Returns the list of crawl seeds. * diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java index e2f4527..5c4a4df 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java @@ -19,6 +19,7 @@ import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlStrategy; import com.google.common.base.Preconditions; +import com.google.common.net.InternetDomainName; import java.time.Duration; import java.util.List; @@ -35,6 +36,28 @@ public CrawlerConfigurator(CrawlerConfiguration configuration) { this.configuration = configuration; } + /** + * Appends an internet domain to the list of allowed crawl domains. + * + * @param allowedCrawlDomain A well-formed internet domain name + */ + public void addAllowedCrawlDomain(final String allowedCrawlDomain) { + InternetDomainName domain = InternetDomainName.from(allowedCrawlDomain); + + Preconditions.checkArgument(domain.isUnderPublicSuffix(), String.format("The domain (\"%s\") is not under public suffix.", allowedCrawlDomain)); + + configuration.addAllowedCrawlDomain(new CrawlDomain(domain)); + } + + /** + * Appends a list of internet domains to the list of allowed crawl domains. + * + * @param allowedCrawlDomains A list of well-formed internet domain names + */ + public void addAllowedCrawlDomains(final List allowedCrawlDomains) { + allowedCrawlDomains.forEach(this::addAllowedCrawlDomain); + } + /** * Appends a crawl request to the list of crawl seeds. * diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java new file mode 100644 index 0000000..3148b0c --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java @@ -0,0 +1,59 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
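The isUnderPublicSuffix check above means only registrable domains (and their subdomains) are accepted, while a bare public suffix is rejected with an IllegalArgumentException. For example, with the configurator wiring from the earlier sketch:

    configurator.addAllowedCrawlDomain("example.com");     // accepted: under the "com" public suffix
    configurator.addAllowedCrawlDomain("sub.example.com"); // accepted: subdomains qualify too
    configurator.addAllowedCrawlDomain("com");             // throws IllegalArgumentException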
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.internal; + +import com.google.common.net.InternetDomainName; +import org.junit.Assert; +import org.junit.Test; + +/** + * Test cases for CrawlDomain. + * + * @author Peter Bencze + */ +public class CrawlDomainTest { + + private static final InternetDomainName DOMAIN = InternetDomainName.from("test.com"); + private static final InternetDomainName SUBDOMAIN = InternetDomainName.from("sub.test.com"); + + private static final CrawlDomain CRAWL_DOMAIN_0 = new CrawlDomain(DOMAIN); + private static final CrawlDomain CRAWL_DOMAIN_1 = new CrawlDomain(DOMAIN); + private static final CrawlDomain CRAWL_DOMAIN_2 = new CrawlDomain(SUBDOMAIN); + + @Test + public void testEquals() { + // A crawl domain should be equal with itself + Assert.assertTrue(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_0)); + + // Crawl domains with the same domain should be equal + Assert.assertTrue(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_1)); + + // Crawl domains with different domains should not be equal + Assert.assertFalse(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_2)); + } + + @Test + public void testContains() { + // A crawl domain should contain its own domain + Assert.assertTrue(CRAWL_DOMAIN_0.contains(DOMAIN)); + + // A crawl domain should contain its own domain's subdomain + Assert.assertTrue(CRAWL_DOMAIN_0.contains(SUBDOMAIN)); + + // A crawl domain should not contain a domain different from its own domain + Assert.assertFalse(CRAWL_DOMAIN_2.contains(DOMAIN)); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 047e202..74854bf 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -18,6 +18,7 @@ import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; import com.github.peterbencze.serritor.api.CrawlStrategy; +import com.google.common.net.InternetDomainName; import java.net.MalformedURLException; import java.net.URL; import java.util.Arrays; @@ -34,6 +35,10 @@ */ public final class CrawlFrontierTest { + // Allowed crawl domains + private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_0 = new CrawlDomain(InternetDomainName.from("root_url_0.com")); + private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_1 = new CrawlDomain(InternetDomainName.from("root_url_1.com")); + // Root URLs private static final URL ROOT_URL_0; private static final URL ROOT_URL_1; @@ -126,19 +131,25 @@ public final class CrawlFrontierTest { .build(); } - private CrawlerConfiguration config; + private CrawlerConfiguration configuration; private CrawlFrontier frontier; @Before public void initialize() { - config = new CrawlerConfiguration(); + configuration = new CrawlerConfiguration(); + + configuration.setOffsiteRequestFiltering(true); + + // Add allowed crawl domains + Arrays.asList(ALLOWED_CRAWL_DOMAIN_0, ALLOWED_CRAWL_DOMAIN_1) + .forEach(configuration::addAllowedCrawlDomain); - 
config.setOffsiteRequestFiltering(true); Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST) - .forEach(config::addCrawlSeed); + // Add crawl seeds + Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST) + .forEach(configuration::addCrawlSeed); // Create frontier - frontier = new CrawlFrontier(config); + frontier = new CrawlFrontier(configuration); } @Test @@ -216,7 +227,7 @@ public void getNextRequestWithOffsiteRequestFilterTest() { @Test public void getNextRequestWithoutDuplicateRequestFilterTest() { // Turn off duplicate request filtering - config.setDuplicateRequestFiltering(false); + configuration.setDuplicateRequestFiltering(false); // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); @@ -234,7 +245,7 @@ public void getNextRequestWithoutDuplicateRequestFilterTest() { @Test public void getNextRequestWithoutOffsiteRequestFilterTest() { // Turn off offsite request filtering - config.setOffsiteRequestFiltering(false); + configuration.setOffsiteRequestFiltering(false); // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); @@ -326,8 +337,8 @@ public void getNextRequestBreadthFirstTest() { @Test public void getNextRequestDepthFirstTest() { // Set the crawl strategy to depth-first - config.setCrawlStrategy(CrawlStrategy.DEPTH_FIRST); - frontier = new CrawlFrontier(config); + configuration.setCrawlStrategy(CrawlStrategy.DEPTH_FIRST); + frontier = new CrawlFrontier(configuration); // Get the crawl candidate of root URL 1 CrawlCandidate nextCandidate = frontier.getNextCandidate(); @@ -404,7 +415,7 @@ public void getNextRequestDepthFirstTest() { @Test public void maxCrawlDepthTest() { // Set max crawl depth - config.setMaximumCrawlDepth(MAX_CRAWL_DEPTH); + configuration.setMaximumCrawlDepth(MAX_CRAWL_DEPTH); // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); From 4c78f444ce546cc9d01dc80a81ef73d2f75d68db Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 8 Mar 2018 21:38:43 +0100 Subject: [PATCH 13/24] Refactor assertions --- .../peterbencze/serritor/internal/CrawlDomainTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java index 3148b0c..c420965 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java @@ -36,13 +36,13 @@ public class CrawlDomainTest { @Test public void testEquals() { // A crawl domain should be equal with itself - Assert.assertTrue(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_0)); + Assert.assertEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_0); // Crawl domains with the same domain should be equal - Assert.assertTrue(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_1)); + Assert.assertEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_1); // Crawl domains with different domains should not be equal - Assert.assertFalse(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_2)); + Assert.assertNotEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_2); } @Test From e74679e46ed139c953e8c89a274fe743b9130504 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sat, 10 Mar 2018 19:17:26 +0100 Subject: [PATCH 14/24] Remove static import --- .../peterbencze/serritor/internal/CrawlFrontier.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 5fa6283..350a6ad 100644 --- 
a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -21,7 +21,6 @@ import java.net.URL; import java.util.Arrays; import java.util.Comparator; -import static java.util.Comparator.reverseOrder; import java.util.HashSet; import java.util.List; import java.util.PriorityQueue; @@ -187,10 +186,10 @@ private PriorityQueue createPriorityQueue() { switch (configuration.getCrawlStrategy()) { case BREADTH_FIRST: return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth) - .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder())); + .thenComparing((Function & Serializable) CrawlCandidate::getPriority, Comparator.reverseOrder())); case DEPTH_FIRST: - return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth, reverseOrder()) - .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder())); + return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth, Comparator.reverseOrder()) + .thenComparing((Function & Serializable) CrawlCandidate::getPriority, Comparator.reverseOrder())); } throw new IllegalArgumentException("Unsupported crawl strategy."); From 9ad94c6d0926df64edea2d4858c43def4c109d36 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Mon, 12 Mar 2018 18:53:11 +0100 Subject: [PATCH 15/24] Add validations --- pom.xml | 2 +- .../peterbencze/serritor/api/BaseCrawler.java | 30 +++++---------- .../internal/CrawlerConfigurator.java | 37 ++++++++++++------- 3 files changed, 34 insertions(+), 35 deletions(-) diff --git a/pom.xml b/pom.xml index a3ccc27..f87d10a 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 com.github.peterbencze serritor - 1.2.2 + 1.3.0 jar Serritor diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 489a6f0..4c47e45 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -35,6 +35,7 @@ import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.SerializationUtils; +import org.apache.commons.lang3.Validate; import org.apache.http.Header; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; @@ -107,20 +108,17 @@ public final void start(final WebDriver driver) { */ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { try { - // Check if the crawler is running - if (!isStopped) { - throw new IllegalStateException("The crawler is already started."); - } + Validate.validState(isStopped, "The crawler is already started."); isStopped = false; httpClient = HttpClientBuilder.create().build(); - webDriver = driver; + webDriver = Validate.notNull(driver, "The webdriver cannot be null."); crawlFrontier = frontierToUse; crawlDelayMechanism = createCrawlDelayMechanism(); run(); } finally { - // Always close the WebDriver + // Always close the browser webDriver.quit(); stopCrawling = false; @@ -135,9 +133,7 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { */ public final void saveState(final OutputStream out) { // Check if the crawler has been started at least once, otherwise we have nothing to save - if (crawlFrontier == null) { - throw new IllegalStateException("No state to save."); - 
} + Validate.validState(crawlFrontier != null, "Cannot save state at this point. The crawler should be started first."); // Save the crawl frontier's current state SerializationUtils.serialize(crawlFrontier, out); @@ -171,14 +167,8 @@ public final void resumeState(final WebDriver driver, final InputStream in) { * Stops the crawler. */ public final void stop() { - // Check if the crawler is running - if (isStopped) { - throw new IllegalStateException("The crawler is not started."); - } - - if (stopCrawling) { - throw new IllegalStateException("Stop has already been called."); - } + Validate.validState(!isStopped, "The crawler is not started."); + Validate.validState(!stopCrawling, "The stop method has already been called."); // Indicate that the crawling should be stopped stopCrawling = true; @@ -193,10 +183,8 @@ public final void stop() { * @param request The CrawlRequest instance */ protected final void crawl(final CrawlRequest request) { - // Check if the crawler is running - if (isStopped) { - throw new IllegalStateException("The crawler is not started. Maybe you meant to add this request as a crawl seed?"); - } + Validate.notNull(request, "The request cannot be null."); + Validate.validState(!isStopped, "The crawler is not started. Maybe you meant to add this request as a crawl seed?"); crawlFrontier.feedRequest(request, false); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java index 5c4a4df..594eebe 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java @@ -18,10 +18,10 @@ import com.github.peterbencze.serritor.api.CrawlDelayStrategy; import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlStrategy; -import com.google.common.base.Preconditions; import com.google.common.net.InternetDomainName; import java.time.Duration; import java.util.List; +import org.apache.commons.lang3.Validate; /** * This class provides an interface for the user to configure the crawler. @@ -44,7 +44,7 @@ public CrawlerConfigurator(CrawlerConfiguration configuration) { public void addAllowedCrawlDomain(final String allowedCrawlDomain) { InternetDomainName domain = InternetDomainName.from(allowedCrawlDomain); - Preconditions.checkArgument(domain.isUnderPublicSuffix(), String.format("The domain (\"%s\") is not under public suffix.", allowedCrawlDomain)); + Validate.isTrue(domain.isUnderPublicSuffix(), String.format("The domain (\"%s\") is not under public suffix.", allowedCrawlDomain)); configuration.addAllowedCrawlDomain(new CrawlDomain(domain)); } @@ -65,7 +65,9 @@ public void addAllowedCrawlDomains(final List allowedCrawlDomains) { * the crawl seed */ public void addCrawlSeed(final CrawlRequest request) { - configuration.addCrawlSeed(Preconditions.checkNotNull(request)); + Validate.notNull(request, "The request cannot be null."); + + configuration.addCrawlSeed(request); } /** @@ -83,10 +85,12 @@ public void addCrawlSeeds(final List requests) { * orders crawl requests by the lowest crawl depth, whereas depth-first * orders them by the highest crawl depth. 
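Combined with the priority comparators shown earlier, the two strategies order the frontier as in this worked example, with candidates written as (crawlDepth, priority):

    // candidates fed: (0, 0), (0, 1), (1, 5)
    // breadth-first polls (0, 1), (0, 0), (1, 5): lowest depth first, higher priority breaks ties
    // depth-first polls (1, 5), (0, 1), (0, 0): highest depth first, higher priority breaks ties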
* - * @param crawlStrategy The crawl strategy + * @param strategy The crawl strategy */ - public void setCrawlStrategy(final CrawlStrategy crawlStrategy) { - configuration.setCrawlStrategy(Preconditions.checkNotNull(crawlStrategy)); + public void setCrawlStrategy(final CrawlStrategy strategy) { + Validate.notNull(strategy, "The strategy cannot be null."); + + configuration.setCrawlStrategy(strategy); } /** @@ -116,7 +120,7 @@ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { * @param maxCrawlDepth The maximum crawl depth */ public void setMaximumCrawlDepth(final int maxCrawlDepth) { - Preconditions.checkArgument(maxCrawlDepth >= 0, "The maximum crawl depth cannot be negative."); + Validate.isTrue(maxCrawlDepth >= 0, "The maximum crawl depth cannot be negative."); configuration.setMaximumCrawlDepth(maxCrawlDepth); } @@ -124,10 +128,12 @@ public void setMaximumCrawlDepth(final int maxCrawlDepth) { /** * Sets the crawl delay strategy to be used by the crawler. * - * @param crawlDelayStrategy The crawl delay strategy + * @param strategy The crawl delay strategy */ - public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { - configuration.setCrawlDelayStrategy(Preconditions.checkNotNull(crawlDelayStrategy)); + public void setCrawlDelayStrategy(final CrawlDelayStrategy strategy) { + Validate.notNull(strategy, "The strategy cannot be null."); + + configuration.setCrawlDelayStrategy(strategy); } /** @@ -136,6 +142,8 @@ public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { * @param fixedCrawlDelayDuration The duration of delay */ public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { + Validate.notNull(fixedCrawlDelayDuration, "The duration cannot be null."); + configuration.setFixedCrawlDelayDurationInMillis(fixedCrawlDelayDuration.toMillis()); } @@ -145,12 +153,13 @@ public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { * @param minCrawlDelayDuration The minimum duration of delay */ public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { - Preconditions.checkArgument(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative."); + Validate.notNull(minCrawlDelayDuration, "The duration cannot be null."); + Validate.isTrue(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative."); long minCrawlDelayDurationInMillis = minCrawlDelayDuration.toMillis(); long maxCrawlDelayInMillis = configuration.getMaximumCrawlDelayDurationInMillis(); - Preconditions.checkArgument(minCrawlDelayDurationInMillis < maxCrawlDelayInMillis, "The minimum crawl delay should be less than the maximum."); + Validate.isTrue(minCrawlDelayDurationInMillis < maxCrawlDelayInMillis, "The minimum crawl delay should be less than the maximum."); configuration.setMinimumCrawlDelayDurationInMillis(minCrawlDelayDurationInMillis); } @@ -161,10 +170,12 @@ public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { * @param maxCrawlDelayDuration The maximum duration of delay */ public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { + Validate.notNull(maxCrawlDelayDuration, "The duration cannot be null."); + long minCrawlDelayDurationInMillis = configuration.getMinimumCrawlDelayDurationInMillis(); long maxCrawlDelayDurationInMillis = maxCrawlDelayDuration.toMillis(); - Preconditions.checkArgument(maxCrawlDelayDurationInMillis > minCrawlDelayDurationInMillis, "The maximum crawl delay should be 
higher than the minimum."); + Validate.isTrue(maxCrawlDelayDurationInMillis > minCrawlDelayDurationInMillis, "The maximum crawl delay should be higher than the minimum."); configuration.setMaximumCrawlDelayDuration(maxCrawlDelayDurationInMillis); } From e157a08b8cfa60cb42abee25bdabe5904df71028 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Mon, 12 Mar 2018 20:33:54 +0100 Subject: [PATCH 16/24] Refact --- .../peterbencze/serritor/api/BaseCrawler.java | 2 +- .../serritor/internal/CrawlFrontier.java | 2 +- .../serritor/internal/CrawlerConfiguration.java | 14 ++++++-------- .../serritor/internal/CrawlerConfigurator.java | 4 ++-- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 4c47e45..598ba73 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -195,7 +195,7 @@ protected final void crawl(final CrawlRequest request) { * @param requests The list of CrawlRequest instances */ protected final void crawl(final List requests) { - requests.stream().forEach(this::crawl); + requests.forEach(this::crawl); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 350a6ad..c49f8e4 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -57,7 +57,7 @@ public CrawlFrontier(final CrawlerConfiguration configuration) { candidates = createPriorityQueue(); // Feed initial crawl requests (seeds) - configuration.getCrawlSeeds().stream() + configuration.getCrawlSeeds() .forEach((CrawlRequest request) -> { feedRequest(request, true); }); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java index 303bc52..3916c34 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java @@ -20,9 +20,7 @@ import com.github.peterbencze.serritor.api.CrawlStrategy; import java.io.Serializable; import java.time.Duration; -import java.util.ArrayList; import java.util.HashSet; -import java.util.List; import java.util.Set; /** @@ -42,7 +40,7 @@ public final class CrawlerConfiguration implements Serializable { private static final long DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS = Duration.ofMinutes(1).toMillis(); private final Set allowedCrawlDomains; - private final List crawlSeeds; + private final Set crawlSeeds; private CrawlStrategy crawlStrategy; private boolean filterDuplicateRequests; @@ -57,7 +55,7 @@ public CrawlerConfiguration() { // Initialize configuration with default values allowedCrawlDomains = new HashSet<>(); - crawlSeeds = new ArrayList<>(); + crawlSeeds = new HashSet<>(); crawlStrategy = DEFAULT_CRAWL_STRATEGY; filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT; filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT; @@ -88,16 +86,16 @@ public void addAllowedCrawlDomain(CrawlDomain allowedCrawlDomain) { } /** - * Returns the list of crawl seeds. + * Returns the set of crawl seeds. 
* - * @return The list of crawl seeds + * @return The set of crawl seeds */ - public List getCrawlSeeds() { + public Set getCrawlSeeds() { return crawlSeeds; } /** - * Appends a crawl request to the list of crawl seeds. + * Appends a crawl request to the set of crawl seeds. * * @param request The CrawlRequest instance which represents * the crawl seed diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java index 594eebe..be6c6f4 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java @@ -59,7 +59,7 @@ public void addAllowedCrawlDomains(final List allowedCrawlDomains) { } /** - * Appends a crawl request to the list of crawl seeds. + * Appends a crawl request to the set of crawl seeds. * * @param request The CrawlRequest instance which represents * the crawl seed @@ -71,7 +71,7 @@ public void addCrawlSeed(final CrawlRequest request) { } /** - * Appends a list of crawl requests to the list of crawl seeds. + * Appends a list of crawl requests to the set of crawl seeds. * * @param requests The list of CrawlRequest instances which * represent the crawl seeds From 7214dc7eb2f39b7abf6036c9d1cc3cf60073f2df Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 13 Mar 2018 02:17:52 +0100 Subject: [PATCH 17/24] Add UrlFinder helper class --- .../serritor/api/helper/UrlFinder.java | 189 ++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java new file mode 100644 index 0000000..2bcbe83 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -0,0 +1,189 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.api.helper; + +import com.github.peterbencze.serritor.api.HtmlResponse; +import com.google.common.collect.Sets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.Validate; +import org.openqa.selenium.By; +import org.openqa.selenium.WebElement; + +/** + * A helper class which can be used to find URLs in HTML sources using regular + * expressions. 
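Before the implementation, a brief usage sketch (the pattern and the htmlResponse variable are illustrative; by default the builder scans the href attribute of anchor elements, as its constructor below shows):

    Pattern pattern = Pattern.compile("https?://example\\.com/[^\\s\"']+");
    UrlFinder urlFinder = new UrlFinderBuilder(pattern).build();

    // typically called from an HTML response callback
    List<String> foundUrls = urlFinder.findUrlsInResponse(htmlResponse);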
+ * + * @author Peter Bencze + */ +public final class UrlFinder { + + private final Set urlPatterns; + private final Set locatingMechanisms; + private final Set attributes; + + private UrlFinder(final UrlFinderBuilder builder) { + urlPatterns = builder.urlPatterns; + locatingMechanisms = builder.locatingMechanisms; + attributes = builder.attributes; + } + + /** + * Returns a list of (unvalidated) URLs found in the response's HTML source. + * + * @param response The HtmlResponse instance + * @return The list of found URLs + */ + public List findUrlsInResponse(final HtmlResponse response) { + Set foundUrls = new HashSet<>(); + + // Find elements using the specified locating mechanisms + Set extractedElements = locatingMechanisms.stream() + .map(response.getWebDriver()::findElements) + .flatMap(List::stream) + .collect(Collectors.toSet()); + + // Find URLs in the attribute values of the found elements + extractedElements.forEach((WebElement element) -> { + attributes.stream() + .map(element::getAttribute) + .filter(StringUtils::isNotBlank) + .map(this::findUrlsInAttributeValue) + .flatMap(List::stream) + .forEach(foundUrls::add); + }); + + return foundUrls.stream() + .collect(Collectors.toList()); + } + + /** + * Returns a list of (unvalidated) URLs found in the attribute's value. + * + * @param attributeValue The value of the attribute + * @return The list of found URLs + */ + private List findUrlsInAttributeValue(final String attributeValue) { + List foundUrls = new ArrayList<>(); + + urlPatterns.stream() + .map((Pattern urlPattern) -> urlPattern.matcher(attributeValue)) + .forEach((Matcher urlPatternMatcher) -> { + while (urlPatternMatcher.find()) { + String foundUrl = urlPatternMatcher.group(); + + if (StringUtils.isNotBlank(foundUrl)) { + foundUrls.add(foundUrl); + } + } + }); + + return foundUrls; + } + + public static final class UrlFinderBuilder { + + private final Set urlPatterns; + + private Set locatingMechanisms; + private Set attributes; + + /** + * Constructs a UrlFinderBuilder instance that can be used + * to create UrlFinder instances. + * + * @param urlPattern The pattern which will be used to find URLs + */ + public UrlFinderBuilder(final Pattern urlPattern) { + this(Arrays.asList(urlPattern)); + } + + /** + * Constructs a UrlFinderBuilder instance that can be used + * to create UrlFinder instances. + * + * @param urlPatterns The list of patterns which will be used to find + * URLs + */ + public UrlFinderBuilder(final List urlPatterns) { + Validate.noNullElements(urlPatterns, "URL patterns cannot be null."); + + this.urlPatterns = Sets.newHashSet(urlPatterns); + locatingMechanisms = Sets.newHashSet(By.tagName("a")); + attributes = Sets.newHashSet("href"); + } + + /** + * Sets the locating mechanism used by the finder. Only elements matched + * by the locator will be considered when searching for URLs. + * + * @param locatingMechanism The By locating mechanism + * instance + */ + public void setLocatingMechanism(final By locatingMechanism) { + setLocatingMechanisms(Arrays.asList(locatingMechanism)); + } + + /** + * Sets the locating mechanisms used by the finder. Only elements + * matched by the locators will be considered when searching for URLs. 
+ * + * @param locatingMechanisms The list of By locating + * mechanism instances + */ + public void setLocatingMechanisms(final List locatingMechanisms) { + Validate.noNullElements(locatingMechanisms, "Locating mechanisms cannot be null."); + + this.locatingMechanisms = Sets.newHashSet(locatingMechanisms); + } + + /** + * Sets which attributes to search for URLs. + * + * @param attributes The list of attribute names + */ + public void setAttributes(final List attributes) { + Validate.noNullElements(attributes, "Attributes cannot be null."); + + this.attributes = Sets.newHashSet(attributes); + } + + /** + * Sets which attribute to search for URLs. + * + * @param attribute The name of the attribute + */ + public void setAttribute(final String attribute) { + setAttributes(Arrays.asList(attribute)); + } + + /** + * Builds the configured URL finder. + * + * @return The configured UrlFinder instance + */ + public UrlFinder build() { + return new UrlFinder(this); + } + } +} From 030896bfeb947f2ebbde47985d845911f25dc31c Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 15 Mar 2018 02:53:16 +0100 Subject: [PATCH 18/24] Replace URL with URI --- .../peterbencze/serritor/api/BaseCrawler.java | 9 +-- .../serritor/api/CrawlRequest.java | 29 ++----- .../serritor/api/HtmlResponse.java | 4 +- .../serritor/api/NonHtmlResponse.java | 4 +- .../serritor/api/UnsuccessfulRequest.java | 4 +- .../serritor/internal/CallbackParameter.java | 10 +-- .../serritor/internal/CrawlCandidate.java | 12 +-- .../serritor/internal/CrawlFrontier.java | 4 +- .../serritor/internal/CrawlFrontierTest.java | 81 +++++-------------- 9 files changed, 47 insertions(+), 110 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 598ba73..97da904 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -31,7 +31,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.net.URI; -import java.net.URL; import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.SerializationUtils; @@ -208,11 +207,11 @@ private void run() { // Get the next crawl candidate from the queue CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); - URL currentCandidateUrl = currentCandidate.getCandidateUrl(); + URI currentCandidateUrl = currentCandidate.getCandidateUrl(); String currentRequestUrlAsString = currentCandidateUrl.toString(); HttpHeadResponse httpHeadResponse; - URL responseUrl = currentCandidateUrl; + URI responseUrl = currentCandidateUrl; try { HttpClientContext context = HttpClientContext.create(); @@ -223,7 +222,7 @@ private void run() { // If the request has been redirected, get the final URL List redirectLocations = context.getRedirectLocations(); if (redirectLocations != null) { - responseUrl = redirectLocations.get(redirectLocations.size() - 1).toURL(); + responseUrl = redirectLocations.get(redirectLocations.size() - 1); } } catch (IOException ex) { UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), @@ -289,7 +288,7 @@ private void run() { * @param destinationUrl The URL to crawl * @return The HTTP HEAD response */ - private HttpHeadResponse getHttpHeadResponse(final URL destinationUrl, final HttpClientContext context) throws IOException { + private HttpHeadResponse 
getHttpHeadResponse(final URI destinationUrl, final HttpClientContext context) throws IOException { HttpHead headRequest = new HttpHead(destinationUrl.toString()); HttpResponse response = httpClient.execute(headRequest, context); return new HttpHeadResponse(response); diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 5cb9a23..4188a54 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -19,8 +19,7 @@ import java.io.IOException; import java.io.ObjectInputStream; import java.io.Serializable; -import java.net.MalformedURLException; -import java.net.URL; +import java.net.URI; import java.util.Optional; /** @@ -32,7 +31,7 @@ */ public final class CrawlRequest implements Serializable { - private final URL requestUrl; + private final URI requestUrl; private final int priority; private final Serializable metadata; @@ -50,7 +49,7 @@ private CrawlRequest(final CrawlRequestBuilder builder) { * * @return The URL of the request */ - public URL getRequestUrl() { + public URI getRequestUrl() { return requestUrl; } @@ -85,7 +84,7 @@ public static final class CrawlRequestBuilder { private static final int DEFAULT_PRIORITY = 0; - private final URL requestUrl; + private final URI requestUrl; private final InternetDomainName domain; private int priority; @@ -98,7 +97,7 @@ public static final class CrawlRequestBuilder { * @param requestUrl The request's URL given as a URL * instance */ - public CrawlRequestBuilder(final URL requestUrl) { + public CrawlRequestBuilder(final URI requestUrl) { this.requestUrl = requestUrl; // Extract the domain from the request URL @@ -116,7 +115,7 @@ public CrawlRequestBuilder(final URL requestUrl) { * instance */ public CrawlRequestBuilder(final String requestUrl) { - this(getUrlFromString(requestUrl)); + this(URI.create(requestUrl)); } /** @@ -151,22 +150,6 @@ public CrawlRequestBuilder setMetadata(final Serializable metadata) { public CrawlRequest build() { return new CrawlRequest(this); } - - /** - * Constructs a URL instance based on the specified URL - * string. Since call to this must be the first statement in a - * constructor, this method is necessary for the conversion to be made. 
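The helper removed here became unnecessary because URI.create, unlike the URL constructor, reports a malformed argument with an unchecked IllegalArgumentException, so it can be invoked directly in the this(...) constructor delegation. One behavioral difference worth noting (the inputs are illustrative):

    new CrawlRequestBuilder("http://exa mple.com"); // IllegalArgumentException from URI.create
    new CrawlRequestBuilder("example.com");         // accepted by URI.create as a relative URI;
                                                    // getHost() is then null, so the failure surfaces
                                                    // in the domain extraction instead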
- * - * @param requestUrl The request URL as String - * @return The URL instance - */ - private static URL getUrlFromString(final String requestUrl) { - try { - return new URL(requestUrl); - } catch (MalformedURLException ex) { - throw new IllegalArgumentException(String.format("The URL (\"%s\") is malformed.", requestUrl), ex); - } - } } private void readObject(final ObjectInputStream in) throws IOException, ClassNotFoundException { diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java index a7be956..442d493 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java @@ -16,7 +16,7 @@ package com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.internal.CallbackParameter; -import java.net.URL; +import java.net.URI; import org.openqa.selenium.WebDriver; /** @@ -59,7 +59,7 @@ public static final class HtmlResponseBuilder extends CallbackParameterBuilder { private HttpHeadResponse httpHeadResponse; private WebDriver webDriver; - public HtmlResponseBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { + public HtmlResponseBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { super(refererUrl, crawlDepth, crawlRequest); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java index c1f58bf..fc5e701 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java @@ -16,7 +16,7 @@ package com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.internal.CallbackParameter; -import java.net.URL; +import java.net.URI; /** * Represents a non-HTML response. @@ -46,7 +46,7 @@ public static final class NonHtmlResponseBuilder extends CallbackParameterBuilde private HttpHeadResponse httpHeadResponse; - public NonHtmlResponseBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { + public NonHtmlResponseBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { super(refererUrl, crawlDepth, crawlRequest); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java index 12c67cc..7d379d5 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java @@ -17,7 +17,7 @@ import com.github.peterbencze.serritor.internal.CallbackParameter; import java.io.IOException; -import java.net.URL; +import java.net.URI; /** * Represents an unsuccessful request. 
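Stepping back to the CrawlRequest changes above: the domain field is transient because Guava's InternetDomainName is not serializable, and the readObject hook re-derives it after default deserialization. A round-trip sketch using the Commons Lang utility already imported by the crawler:

    CrawlRequest original = new CrawlRequestBuilder("http://example.com").build();
    CrawlRequest copy = SerializationUtils.roundtrip(original);

    InternetDomainName domain = copy.getDomain(); // rebuilt from requestUrl.getHost(), not read from the stream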
@@ -48,7 +48,7 @@ public static final class UnsuccessfulRequestBu private IOException exception; - public UnsuccessfulRequestBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { + public UnsuccessfulRequestBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { super(refererUrl, crawlDepth, crawlRequest); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java index 9ca1d75..cb6ae0b 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java @@ -16,7 +16,7 @@ package com.github.peterbencze.serritor.internal; import com.github.peterbencze.serritor.api.CrawlRequest; -import java.net.URL; +import java.net.URI; import java.util.Optional; /** @@ -27,7 +27,7 @@ public abstract class CallbackParameter { private final int crawlDepth; - private final URL refererUrl; + private final URI refererUrl; private final CrawlRequest crawlRequest; protected CallbackParameter(final CallbackParameterBuilder builder) { @@ -41,7 +41,7 @@ protected CallbackParameter(final CallbackParameterBuilder builder) { * * @return The referer URL */ - public final Optional<URL> getRefererUrl() { + public final Optional<URI> getRefererUrl() { return Optional.ofNullable(refererUrl); } @@ -65,11 +65,11 @@ public final CrawlRequest getCrawlRequest() { public static abstract class CallbackParameterBuilder { - private final URL refererUrl; + private final URI refererUrl; private final int crawlDepth; private final CrawlRequest crawlRequest; - public CallbackParameterBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { + public CallbackParameterBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { this.refererUrl = refererUrl; this.crawlDepth = crawlDepth; this.crawlRequest = crawlRequest; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java index 7a4acbd..b5041b9 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java @@ -18,7 +18,7 @@ import com.github.peterbencze.serritor.api.CrawlRequest; import com.google.common.net.InternetDomainName; import java.io.Serializable; -import java.net.URL; +import java.net.URI; /** * Represents a candidate for crawling that will be surely processed by the @@ -28,7 +28,7 @@ */ public final class CrawlCandidate implements Serializable { - private final URL refererUrl; + private final URI refererUrl; private final int crawlDepth; private final CrawlRequest crawlRequest; @@ -43,7 +43,7 @@ public CrawlCandidate(final CrawlCandidateBuilder builder) { * * @return The URL of the referer */ - public URL getRefererUrl() { + public URI getRefererUrl() { return refererUrl; } @@ -52,7 +52,7 @@ public URL getRefererUrl() { * * @return The URL of the candidate */ - public URL getCandidateUrl() { + public URI getCandidateUrl() { return crawlRequest.getRequestUrl(); } @@ -96,14 +96,14 @@ public static final class CrawlCandidateBuilder { private final CrawlRequest crawlRequest; - private URL refererUrl; + private URI refererUrl; private int crawlDepth; public CrawlCandidateBuilder(final CrawlRequest request) {
crawlRequest = request; } - public CrawlCandidateBuilder setRefererUrl(final URL refererUrl) { + public CrawlCandidateBuilder setRefererUrl(final URI refererUrl) { this.refererUrl = refererUrl; return this; } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index c49f8e4..74b1b05 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -18,7 +18,7 @@ import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.internal.CrawlCandidate.CrawlCandidateBuilder; import java.io.Serializable; -import java.net.URL; +import java.net.URI; import java.util.Arrays; import java.util.Comparator; import java.util.HashSet; @@ -148,7 +148,7 @@ public CrawlCandidate getNextCandidate() { * @param url The URL that the fingerprint will be created for * @return The fingerprint of the URL */ - private static String createFingerprintForUrl(final URL url) { + private static String createFingerprintForUrl(final URI url) { // First, we start off with the host only StringBuilder truncatedUrl = new StringBuilder(url.getHost()); diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 74854bf..3789aae 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -19,8 +19,7 @@ import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; import com.github.peterbencze.serritor.api.CrawlStrategy; import com.google.common.net.InternetDomainName; -import java.net.MalformedURLException; -import java.net.URL; +import java.net.URI; import java.util.Arrays; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -36,12 +35,12 @@ public final class CrawlFrontierTest { // Allowed crawl domains - private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_0 = new CrawlDomain(InternetDomainName.from("root_url_0.com")); - private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_1 = new CrawlDomain(InternetDomainName.from("root_url_1.com")); + private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_0 = new CrawlDomain(InternetDomainName.from("root-url-0.com")); + private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_1 = new CrawlDomain(InternetDomainName.from("root-url-1.com")); // Root URLs - private static final URL ROOT_URL_0; - private static final URL ROOT_URL_1; + private static final URI ROOT_URL_0 = URI.create("http://root-url-0.com"); + private static final URI ROOT_URL_1 = URI.create("http://root-url-1.com"); // Root URL crawl depth private static final int ROOT_URL_CRAWL_DEPTH = 0; @@ -51,13 +50,16 @@ public final class CrawlFrontierTest { private static final int ROOT_URL_1_PRIORITY = 1; // Root URL crawl requests - private static final CrawlRequest ROOT_URL_0_CRAWL_REQUEST; - private static final CrawlRequest ROOT_URL_1_CRAWL_REQUEST; + private static final CrawlRequest ROOT_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_0).setPriority(ROOT_URL_0_PRIORITY).build(); + private static final CrawlRequest ROOT_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_1).setPriority(ROOT_URL_1_PRIORITY).build(); + + // Child URL path + private static final String CHILD_URL_PATH = "/child"; // 
Child URLs - private static final URL CHILD_URL_0; - private static final URL CHILD_URL_1; - private static final URL CHILD_URL_2; + private static final URI CHILD_URL_0 = URI.create(String.format("http://root-url-0.com%s-0.html", CHILD_URL_PATH)); + private static final URI CHILD_URL_1 = URI.create(String.format("http://root-url-0.com%s-1.html", CHILD_URL_PATH)); + private static final URI CHILD_URL_2 = URI.create(String.format("http://root-url-1.com%s-0.html", CHILD_URL_PATH)); // Child URL crawl depth private static final int CHILD_URL_CRAWL_DEPTH = 1; @@ -68,69 +70,22 @@ public final class CrawlFrontierTest { private static final int CHILD_URL_2_PRIORITY = 1; // Child URL crawl requests - private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST; - private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST; - private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST; - - // Child URL path - private static final String CHILD_URL_PATH = "/child"; + private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_0).setPriority(CHILD_URL_0_PRIORITY).build(); + private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_1).setPriority(CHILD_URL_1_PRIORITY).build(); + private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_2).setPriority(CHILD_URL_2_PRIORITY).build(); // Offsite URL - private static final URL OFFSITE_URL; + private static final URI OFFSITE_URL = URI.create("http://offsite-url.com"); // Offsite URL priority private static final int OFFSITE_URL_PRIORITY = 0; // Offsite URL crawl request - private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST; + private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST = new CrawlRequestBuilder(OFFSITE_URL).setPriority(OFFSITE_URL_PRIORITY).build(); // Max crawl depth private static final int MAX_CRAWL_DEPTH = 1; - static { - try { - // Initialization of root URLs - ROOT_URL_0 = new URL("http://root_url_0.com"); - ROOT_URL_1 = new URL("http://root_url_1.com"); - - // Initialization of child URLs - CHILD_URL_0 = new URL(String.format("http://root_url_0.com%s_0.html", CHILD_URL_PATH)); - CHILD_URL_1 = new URL(String.format("http://root_url_0.com%s_1.html", CHILD_URL_PATH)); - - CHILD_URL_2 = new URL(String.format("http://root_url_1.com%s_0.html", CHILD_URL_PATH)); - - // Initialization of the offsite URL - OFFSITE_URL = new URL("http://offsite_url.com"); - } catch (MalformedURLException ex) { - throw new Error(ex); - } - - // Initialize crawl requests - ROOT_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_0) - .setPriority(ROOT_URL_0_PRIORITY) - .build(); - - ROOT_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_1) - .setPriority(ROOT_URL_1_PRIORITY) - .build(); - - CHILD_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_0) - .setPriority(CHILD_URL_0_PRIORITY) - .build(); - - CHILD_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_1) - .setPriority(CHILD_URL_1_PRIORITY) - .build(); - - CHILD_URL_2_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_2) - .setPriority(CHILD_URL_2_PRIORITY) - .build(); - - OFFSITE_URL_CRAWL_REQUEST = new CrawlRequestBuilder(OFFSITE_URL) - .setPriority(OFFSITE_URL_PRIORITY) - .build(); - } - private CrawlerConfiguration configuration; private CrawlFrontier frontier; From a14aacde935548f72bbaec3438757162dc50cf9e Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 17:48:37 +0100 Subject: [PATCH 19/24] Add URL validation --- .../serritor/api/helper/UrlFinder.java 
| 59 ++++++++++++++++--- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index 2bcbe83..a79b278 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -17,11 +17,14 @@ import com.github.peterbencze.serritor.api.HtmlResponse; import com.google.common.collect.Sets; +import com.google.common.net.InternetDomainName; +import java.net.URI; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -41,15 +44,17 @@ public final class UrlFinder { private final Set<Pattern> urlPatterns; private final Set<By> locatingMechanisms; private final Set<String> attributes; + private final Function<String, Boolean> validator; private UrlFinder(final UrlFinderBuilder builder) { urlPatterns = builder.urlPatterns; locatingMechanisms = builder.locatingMechanisms; attributes = builder.attributes; + validator = builder.validator; } /** - * Returns a list of (unvalidated) URLs found in the response's HTML source. + * Returns a list of validated URLs found in the response's HTML source. * * @param response The HtmlResponse instance * @return The list of found URLs */ @@ -78,7 +83,7 @@ public List<String> findUrlsInResponse(final HtmlResponse response) { } /** - * Returns a list of (unvalidated) URLs found in the attribute's value. + * Returns a list of validated URLs found in the attribute's value. * * @param attributeValue The value of the attribute * @return The list of found URLs */ @@ -92,7 +97,7 @@ private List<String> findUrlsInAttributeValue(final String attributeValue) { while (urlPatternMatcher.find()) { String foundUrl = urlPatternMatcher.group(); - if (StringUtils.isNotBlank(foundUrl)) { + if (validator.apply(foundUrl)) { foundUrls.add(foundUrl); } } @@ -107,6 +112,7 @@ public static final class UrlFinderBuilder { private Set<By> locatingMechanisms; private Set<String> attributes; + private Function<String, Boolean> validator; /** * Constructs a UrlFinderBuilder instance that can be used @@ -131,6 +137,7 @@ public UrlFinderBuilder(final List<Pattern> urlPatterns) { this.urlPatterns = Sets.newHashSet(urlPatterns); locatingMechanisms = Sets.newHashSet(By.tagName("a")); attributes = Sets.newHashSet("href"); + validator = this::isValidUrl; } /** @@ -139,9 +146,10 @@ public UrlFinderBuilder(final List<Pattern> urlPatterns) { * * @param locatingMechanism The By locating mechanism * instance + * @return The UrlFinderBuilder instance */ - public void setLocatingMechanism(final By locatingMechanism) { - setLocatingMechanisms(Arrays.asList(locatingMechanism)); + public UrlFinderBuilder setLocatingMechanism(final By locatingMechanism) { + return setLocatingMechanisms(Arrays.asList(locatingMechanism)); } /** @@ -150,31 +158,49 @@ public void setLocatingMechanism(final By locatingMechanism) { * * @param locatingMechanisms The list of By locating * mechanism instances + * @return The UrlFinderBuilder instance */ - public void setLocatingMechanisms(final List<By> locatingMechanisms) { + public UrlFinderBuilder setLocatingMechanisms(final List<By> locatingMechanisms) { Validate.noNullElements(locatingMechanisms, "Locating mechanisms cannot be null."); this.locatingMechanisms = Sets.newHashSet(locatingMechanisms); + return this; } /** * Sets which attributes to
search for URLs. * * @param attributes The list of attribute names + * @return The UrlFinderBuilder instance */ - public void setAttributes(final List<String> attributes) { + public UrlFinderBuilder setAttributes(final List<String> attributes) { Validate.noNullElements(attributes, "Attributes cannot be null."); this.attributes = Sets.newHashSet(attributes); + return this; } /** * Sets which attribute to search for URLs. * * @param attribute The name of the attribute + * @return The UrlFinderBuilder instance */ - public void setAttribute(final String attribute) { - setAttributes(Arrays.asList(attribute)); + public UrlFinderBuilder setAttribute(final String attribute) { + return setAttributes(Arrays.asList(attribute)); + } + + /** + * Sets a function to be used for validating found URLs. + * + * @param validator The validator function + * @return The UrlFinderBuilder instance + */ + public UrlFinderBuilder setValidator(final Function<String, Boolean> validator) { + Validate.notNull(validator, "The validator function cannot be null."); + + this.validator = validator; + return this; } /** @@ -185,5 +211,20 @@ public void setAttribute(final String attribute) { public UrlFinder build() { return new UrlFinder(this); } + + /** + * The default URL validator function. + * + * @param url The URL to be validated + * @return true if the URL is valid, false + * otherwise + */ + private boolean isValidUrl(final String url) { + try { + return InternetDomainName.isValid(URI.create(url).getHost()); + } catch (IllegalArgumentException e) { + return false; + } + } } } From 4903726617ecbda924bbfe7b8f94b97a91f71fcc Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 18:16:04 +0100 Subject: [PATCH 20/24] Update README --- README.md | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 185dfba..40db8dc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Add the following dependency to your pom.xml: <dependency> <groupId>com.github.peterbencze</groupId> <artifactId>serritor</artifactId> - <version>1.2.1</version> + <version>1.3.0</version> </dependency> ``` @@ -27,24 +27,31 @@ BaseCrawler provides a skeletal implementation of a crawler to minimize the effo ```java public class MyCrawler extends BaseCrawler { + private final UrlFinder urlFinder; + public MyCrawler() { // Enable offsite request filtering - config.setOffsiteRequestFiltering(true); + configurator.setOffsiteRequestFiltering(true); + + // Specify the allowed crawl domain + configurator.addAllowedCrawlDomain("example.com"); // Add a crawl seed, this is where the crawling starts CrawlRequest request = new CrawlRequestBuilder("http://example.com").build(); - config.addCrawlSeed(request); + configurator.addCrawlSeed(request); + + // Extract URLs from links on the crawled page + urlFinder = new UrlFinderBuilder(Pattern.compile(".+")).build(); } @Override protected void onResponseComplete(final HtmlResponse response) { - // Crawl every link that can be found on the page - response.getWebDriver().findElements(By.tagName("a")) + // Crawl every URL that matches the given pattern + urlFinder.findUrlsInResponse(response) .stream() - .forEach((WebElement link) -> { - CrawlRequest request = new CrawlRequestBuilder(link.getAttribute("href")).build(); - crawl(request); - }); + .map(CrawlRequestBuilder::new) + .map(CrawlRequestBuilder::build) + .forEach(this::crawl); } @Override @@ -58,7 +65,7 @@ public class MyCrawler extends BaseCrawler { } } ``` -That's it! In just a few lines you can make a crawler that extracts and crawls every URL it finds, while filtering duplicate and offsite requests.
You also get access to the WebDriver, so you can use all the features that are provided by Selenium. +That's it! In just a few lines you can make a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the WebDriver, so you can use all the features that are provided by Selenium. By default, the crawler uses [HtmlUnit headless browser](http://htmlunit.sourceforge.net/): ```java From aee21535a46242982a7fb9a0fbb20f4dbb2ffd90 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 19:17:47 +0100 Subject: [PATCH 21/24] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 40db8dc..4be6493 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ public class MyCrawler extends BaseCrawler { } } ``` -That's it! In just a few lines you can make a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the WebDriver, so you can use all the features that are provided by Selenium. +That's it! In just a few lines you can make a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the WebDriver instance, so you can use all the features that are provided by Selenium. By default, the crawler uses [HtmlUnit headless browser](http://htmlunit.sourceforge.net/): ```java From 018b1cda36a8ecddadbf1ecb6191071013d455fb Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 22:24:24 +0100 Subject: [PATCH 22/24] Refactor --- .../peterbencze/serritor/api/helper/UrlFinder.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index a79b278..24ca816 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -95,7 +95,7 @@ private List<String> findUrlsInAttributeValue(final String attributeValue) { .map((Pattern urlPattern) -> urlPattern.matcher(attributeValue)) .forEach((Matcher urlPatternMatcher) -> { while (urlPatternMatcher.find()) { - String foundUrl = urlPatternMatcher.group(); + String foundUrl = urlPatternMatcher.group().trim(); if (validator.apply(foundUrl)) { foundUrls.add(foundUrl); @@ -107,6 +107,10 @@ private List<String> findUrlsInAttributeValue(final String attributeValue) { } public static final class UrlFinderBuilder { + + private static final Set<By> DEFAULT_LOCATING_MECHANISMS = Sets.newHashSet(By.tagName("a")); + private static final Set<String> DEFAULT_ATTRIBUTES = Sets.newHashSet("href"); + private static final Function<String, Boolean> DEFAULT_VALIDATOR = UrlFinderBuilder::isValidUrl; private final Set<Pattern> urlPatterns; @@ -135,9 +139,9 @@ public UrlFinderBuilder(final List<Pattern> urlPatterns) { Validate.noNullElements(urlPatterns, "URL patterns cannot be null."); this.urlPatterns = Sets.newHashSet(urlPatterns); - locatingMechanisms = Sets.newHashSet(By.tagName("a")); - attributes = Sets.newHashSet("href"); - validator = this::isValidUrl; + locatingMechanisms = DEFAULT_LOCATING_MECHANISMS; + attributes = DEFAULT_ATTRIBUTES; + validator = DEFAULT_VALIDATOR; } /** @@ -219,7 +223,7 @@ public UrlFinder build() { * @return true if the URL is valid, false * otherwise */ - private boolean isValidUrl(final String url) { + private static boolean isValidUrl(final String url) { try { return
InternetDomainName.isValid(URI.create(url).getHost()); } catch (IllegalArgumentException e) { return false; From c99848d0236757f76de39cbe18a659c021b6c649 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 22:29:16 +0100 Subject: [PATCH 23/24] Update dependency versions --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index f87d10a..cfe92f6 100644 --- a/pom.xml +++ b/pom.xml @@ -54,12 +54,12 @@ <dependency> <groupId>org.seleniumhq.selenium</groupId> <artifactId>selenium-java</artifactId> - <version>3.9.1</version> + <version>3.11.0</version> </dependency> <dependency> <groupId>org.seleniumhq.selenium</groupId> <artifactId>htmlunit-driver</artifactId> - <version>2.29.1</version> + <version>2.29.2</version> </dependency> <dependency> <groupId>com.google.guava</groupId> From e58c1d2d65e18fd2d4a3502a931ae6115d52424f Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 23:55:25 +0100 Subject: [PATCH 24/24] Add test for UrlFinder --- pom.xml | 6 ++ .../serritor/api/helper/UrlFinderTest.java | 84 +++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java diff --git a/pom.xml b/pom.xml index cfe92f6..8747531 100644 --- a/pom.xml +++ b/pom.xml @@ -72,6 +72,12 @@ <version>4.12</version> <scope>test</scope> </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-core</artifactId> + <version>2.16.0</version> + <scope>test</scope> + </dependency> diff --git a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java new file mode 100644 index 0000000..f89c0fd --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java @@ -0,0 +1,84 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.api.helper; + +import com.github.peterbencze.serritor.api.HtmlResponse; +import com.github.peterbencze.serritor.api.HtmlResponse.HtmlResponseBuilder; +import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +/** + * Test cases for UrlFinder.
* * @author Peter Bencze */ +public class UrlFinderTest { + + private static final Pattern URL_PATTERN = Pattern.compile(".+valid-url.+"); + private static final String ATTRIBUTE = "href"; + private static final String TAG_NAME = "a"; + private static final String VALID_URL = "http://valid-url.com"; + private static final String INVALID_URL = "invalid-url"; + private static final String URL_WITH_INVALID_DOMAIN = "http://invalid.domain"; + + private UrlFinder urlFinder; + private HtmlResponse mockResponse; + private WebDriver mockDriver; + private WebElement mockElementWithValidUrl; + private WebElement mockElementWithInvalidUrlFormat; + private WebElement mockElementWithInvalidDomain; + + @Before + public void initialize() { + urlFinder = new UrlFinderBuilder(URL_PATTERN).build(); + + // Create mocks + mockDriver = Mockito.mock(WebDriver.class); + + // Cannot mock because of the final modifier + mockResponse = new HtmlResponseBuilder(null, 0, null).setWebDriver(mockDriver).build(); + + mockElementWithValidUrl = Mockito.mock(WebElement.class); + Mockito.when(mockElementWithValidUrl.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(VALID_URL); + + mockElementWithInvalidUrlFormat = Mockito.mock(WebElement.class); + Mockito.when(mockElementWithInvalidUrlFormat.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(INVALID_URL); + + mockElementWithInvalidDomain = Mockito.mock(WebElement.class); + Mockito.when(mockElementWithInvalidDomain.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(URL_WITH_INVALID_DOMAIN); + + List<WebElement> elementList = Arrays.asList(mockElementWithValidUrl, mockElementWithInvalidUrlFormat, mockElementWithInvalidDomain); + Mockito.when(mockDriver.findElements(By.tagName(TAG_NAME))) .thenReturn(elementList); + } + + @Test + public void findUrlsInResponseTest() { + Assert.assertEquals(Arrays.asList(VALID_URL), urlFinder.findUrlsInResponse(mockResponse)); + } +}
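A brief illustration of the API that emerges from patches 19 and 22: UrlFinderBuilder now exposes fluent setters and a pluggable validator alongside the default domain-based isValidUrl check. The sketch below shows how a caller might supply a custom validator; the HTTPS-only rule and the example class name are illustrative assumptions, not code from the patches above.

```java
import com.github.peterbencze.serritor.api.helper.UrlFinder;
import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder;
import java.net.URI;
import java.util.regex.Pattern;

public final class HttpsOnlyUrlFinderExample {

    public static UrlFinder createFinder() {
        // Match anything that looks like an absolute HTTP(S) URL;
        // the validator below decides what is actually kept
        return new UrlFinderBuilder(Pattern.compile("https?://\\S+"))
                // Hypothetical stricter rule: accept HTTPS URLs only
                .setValidator(url -> {
                    try {
                        return "https".equals(URI.create(url).getScheme());
                    } catch (IllegalArgumentException e) {
                        // URI.create rejects syntactically invalid URLs,
                        // mirroring the default validator's catch clause
                        return false;
                    }
                })
                .build();
    }
}
```

Because the setters return the builder, the whole configuration chains in one expression, and the strings returned by findUrlsInResponse can be mapped through CrawlRequestBuilder and passed back to crawl, exactly as the updated README example does.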