From c1d66df83b7d4d56f9f54df6a8378ce9c1be308f Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Mon, 19 Feb 2018 18:21:44 +0100
Subject: [PATCH 01/24] Remove unnecessary generic parameters, add missing final modifiers

---
 pom.xml                                                       | 2 +-
 .../com/github/peterbencze/serritor/api/HtmlResponse.java    | 4 ++--
 .../com/github/peterbencze/serritor/api/NonHtmlResponse.java | 4 ++--
 .../github/peterbencze/serritor/api/UnsuccessfulRequest.java | 4 ++--
 .../peterbencze/serritor/internal/CallbackParameter.java     | 2 +-
 .../peterbencze/serritor/internal/CrawlerConfiguration.java  | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pom.xml b/pom.xml
index 2e9cb15..33c497b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>com.github.peterbencze</groupId>
     <artifactId>serritor</artifactId>
-    <version>1.2.1</version>
+    <version>1.2.2</version>
     <packaging>jar</packaging>
     <name>Serritor</name>

diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java
index 12a47a2..4138abb 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java
@@ -54,12 +54,12 @@ public WebDriver getWebDriver() {
         return webDriver;
     }
 
-    public static final class HtmlResponseBuilder extends CallbackParameterBuilder<HtmlResponseBuilder> {
+    public static final class HtmlResponseBuilder extends CallbackParameterBuilder {
 
         private HttpHeadResponse httpHeadResponse;
         private WebDriver webDriver;
 
-        public HtmlResponseBuilder(URL refererUrl, int crawlDepth, CrawlRequest crawlRequest) {
+        public HtmlResponseBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) {
             super(refererUrl, crawlDepth, crawlRequest);
         }

diff --git a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java
index 9245beb..c1f58bf 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java
@@ -42,11 +42,11 @@ public HttpHeadResponse getHttpHeadResponse() {
         return httpHeadResponse;
     }
 
-    public static final class NonHtmlResponseBuilder extends CallbackParameterBuilder<NonHtmlResponseBuilder> {
+    public static final class NonHtmlResponseBuilder extends CallbackParameterBuilder {
 
         private HttpHeadResponse httpHeadResponse;
 
-        public NonHtmlResponseBuilder(URL refererUrl, int crawlDepth, CrawlRequest crawlRequest) {
+        public NonHtmlResponseBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) {
             super(refererUrl, crawlDepth, crawlRequest);
         }

diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
index c545c01..f809a6a 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
@@ -43,11 +43,11 @@ public IOException getException() {
         return exception;
     }
 
-    public static final class UnsuccessfulRequestBuilder extends CallbackParameterBuilder<UnsuccessfulRequestBuilder> {
+    public static final class UnsuccessfulRequestBuilder extends CallbackParameterBuilder {
 
         private IOException exception;
 
-        public UnsuccessfulRequestBuilder(URL refererUrl, int crawlDepth, CrawlRequest crawlRequest) {
+        public UnsuccessfulRequestBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) {
             super(refererUrl, crawlDepth, crawlRequest);
         }

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java
index 1817733..28af583 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java
@@ -63,7 +63,7 @@ public final CrawlRequest getCrawlRequest() {
         return crawlRequest;
     }
 
-    public static abstract class CallbackParameterBuilder<T extends CallbackParameterBuilder<T>> {
+    public static abstract class CallbackParameterBuilder {
 
         private final URL refererUrl;
         private final int crawlDepth;

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index c7e065a..8b1d306 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -160,7 +160,7 @@ public int getMaxCrawlDepth() {
      *
      * @param maxCrawlDepth The maximum crawl depth, zero means no limit
      */
-    public void setMaxCrawlDepth(int maxCrawlDepth) {
+    public void setMaxCrawlDepth(final int maxCrawlDepth) {
         this.maxCrawlDepth = maxCrawlDepth;
     }
 }

From ed28065762a540194375c53e4d392b13d9afb816 Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Mon, 19 Feb 2018 18:32:42 +0100
Subject: [PATCH 02/24] Add initialization of variable

---
 .../peterbencze/serritor/internal/CrawlerConfiguration.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index 8b1d306..09574b4 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -43,6 +43,7 @@ public CrawlerConfiguration() {
         crawlSeeds = new ArrayList<>();
         crawlingStrategy = CrawlingStrategy.BREADTH_FIRST;
         filterDuplicateRequests = true;
+        filterOffsiteRequests = false;
         delayBetweenRequests = Duration.ZERO;
         maxCrawlDepth = 0;
     }
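With patches 01 and 02 applied, every configuration field has an explicit default. As a quick illustration (not part of the patch series), a subclass could override the now-explicit offsite-filtering default from its constructor; the response callbacks a real crawler needs are omitted here:

    public final class MyCrawler extends BaseCrawler {

        public MyCrawler() {
            // filterOffsiteRequests now defaults to false (initialized above);
            // enable it when the crawl should not leave the seed domains.
            config.setOffsiteRequestFiltering(true);
        }
    }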
From cb53c1d355d7ab3e1b1222e0bf925705a51db51f Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Mon, 19 Feb 2018 22:11:28 +0100
Subject: [PATCH 03/24] Refactor: extract configuration defaults into named constants

---
 .../serritor/internal/CrawlerConfiguration.java | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index 09574b4..4e9170f 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -29,6 +29,12 @@
  * @author Peter Bencze
 */
 public final class CrawlerConfiguration implements Serializable {
+
+    private static final CrawlingStrategy DEFAULT_CRAWLING_STRATEGY = CrawlingStrategy.BREADTH_FIRST;
+    private static final boolean FILTER_DUPLICATE_REQUESTS_BY_DEFAULT = true;
+    private static final boolean FILTER_OFFSITE_REQUESTS_BY_DEFAULT = false;
+    private static final Duration DEFAULT_DELAY_BETWEEN_REQUESTS = Duration.ZERO;
+    private static final int DEFAULT_MAX_CRAWL_DEPTH = 0;
 
     private final List<CrawlRequest> crawlSeeds;
 
@@ -41,11 +47,11 @@ public CrawlerConfiguration() {
         // Default configuration
         crawlSeeds = new ArrayList<>();
-        crawlingStrategy = CrawlingStrategy.BREADTH_FIRST;
-        filterDuplicateRequests = true;
-        filterOffsiteRequests = false;
-        delayBetweenRequests = Duration.ZERO;
-        maxCrawlDepth = 0;
+        crawlingStrategy = DEFAULT_CRAWLING_STRATEGY;
+        filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT;
+        filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT;
+        delayBetweenRequests = DEFAULT_DELAY_BETWEEN_REQUESTS;
+        maxCrawlDepth = DEFAULT_MAX_CRAWL_DEPTH;
     }
 
     /**

From 6ac261adeda41525f95d1b1cd21e48eeeaa0de56 Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Tue, 20 Feb 2018 01:59:52 +0100
Subject: [PATCH 04/24] Add license file

---
 LICENSE | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   [The remaining added lines are the standard, unmodified text of the
+   Apache License, Version 2.0, as published at
+   http://www.apache.org/licenses/LICENSE-2.0; they are elided here.]
From 52110b7626ae8419fe09f6d0cfe7c37aa75be234 Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Tue, 20 Feb 2018 18:28:03 +0100
Subject: [PATCH 05/24] Rename CrawlingStrategy to CrawlStrategy

---
 ...awlingStrategy.java => CrawlStrategy.java} |  2 +-
 .../serritor/internal/CrawlFrontier.java      |  6 +++---
 .../internal/CrawlerConfiguration.java        | 24 +++++++++----------
 .../serritor/internal/CrawlFrontierTest.java  |  6 +++---
 4 files changed, 19 insertions(+), 19 deletions(-)
 rename src/main/java/com/github/peterbencze/serritor/api/{CrawlingStrategy.java => CrawlStrategy.java} (96%)

diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlingStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java
similarity index 96%
rename from src/main/java/com/github/peterbencze/serritor/api/CrawlingStrategy.java
rename to src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java
index c4da75a..c88435b 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/CrawlingStrategy.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java
@@ -20,7 +20,7 @@
 *
 * @author Peter Bencze
 */
-public enum CrawlingStrategy {
+public enum CrawlStrategy {
 
     BREADTH_FIRST,
     DEPTH_FIRST

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
index 408710e..ed6c20e 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
@@ -54,7 +54,7 @@ public CrawlFrontier(final CrawlerConfiguration config) {
         allowedDomains = new HashSet<>();
         urlFingerprints = new HashSet<>();
 
-        // Construct a priority queue according to the crawling strategy specified in the configuration
+        // Construct a priority queue according to the crawl strategy specified in the configuration
         candidates = getPriorityQueue();
 
         // Feed initial crawl requests (seeds)
@@ -174,7 +174,7 @@ private String getFingerprintForUrl(final URL url) {
     * comparator
     */
    private PriorityQueue<CrawlCandidate> getPriorityQueue() {
-        switch (config.getCrawlingStrategy()) {
+        switch (config.getCrawlStrategy()) {
            case BREADTH_FIRST:
                return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth)
                        .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder()));
@@ -183,6 +183,6 @@ private PriorityQueue<CrawlCandidate> getPriorityQueue() {
                        .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder()));
        }
 
-        throw new IllegalArgumentException("Unsupported crawling strategy.");
+        throw new IllegalArgumentException("Unsupported crawl strategy.");
    }
 }

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index 4e9170f..b275c9a 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -16,7 +16,7 @@
 package com.github.peterbencze.serritor.internal;
 
 import com.github.peterbencze.serritor.api.CrawlRequest;
-import com.github.peterbencze.serritor.api.CrawlingStrategy;
+import com.github.peterbencze.serritor.api.CrawlStrategy;
 import java.io.Serializable;
 import java.time.Duration;
 import java.util.ArrayList;
@@ -30,7 +30,7 @@
 */
 public final class CrawlerConfiguration implements Serializable {
 
-    private static final CrawlingStrategy DEFAULT_CRAWLING_STRATEGY = CrawlingStrategy.BREADTH_FIRST;
+    private static final CrawlStrategy DEFAULT_CRAWL_STRATEGY = CrawlStrategy.BREADTH_FIRST;
     private static final boolean FILTER_DUPLICATE_REQUESTS_BY_DEFAULT = true;
     private static final boolean FILTER_OFFSITE_REQUESTS_BY_DEFAULT = false;
     private static final Duration DEFAULT_DELAY_BETWEEN_REQUESTS = Duration.ZERO;
@@ -38,7 +38,7 @@ public final class CrawlerConfiguration implements Serializable {
 
     private final List<CrawlRequest> crawlSeeds;
 
-    private CrawlingStrategy crawlingStrategy;
+    private CrawlStrategy crawlStrategy;
     private boolean filterDuplicateRequests;
     private boolean filterOffsiteRequests;
     private Duration delayBetweenRequests;
@@ -47,7 +47,7 @@ public final class CrawlerConfiguration implements Serializable {
     public CrawlerConfiguration() {
         // Default configuration
         crawlSeeds = new ArrayList<>();
-        crawlingStrategy = DEFAULT_CRAWLING_STRATEGY;
+        crawlStrategy = DEFAULT_CRAWL_STRATEGY;
         filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT;
         filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT;
         delayBetweenRequests = DEFAULT_DELAY_BETWEEN_REQUESTS;
@@ -82,21 +82,21 @@ public void addCrawlSeeds(final List<CrawlRequest> requests) {
     }
 
     /**
-     * Returns the crawling strategy of the crawler.
+     * Returns the crawl strategy of the crawler.
     *
-     * @return The crawling strategy
+     * @return The crawl strategy
     */
-    public CrawlingStrategy getCrawlingStrategy() {
-        return crawlingStrategy;
+    public CrawlStrategy getCrawlStrategy() {
+        return crawlStrategy;
    }
 
    /**
-     * Sets the crawling strategy of the crawler.
+     * Sets the crawl strategy of the crawler.
     *
-     * @param crawlingStrategy The crawling strategy
+     * @param crawlStrategy The crawl strategy
     */
-    public void setCrawlingStrategy(final CrawlingStrategy crawlingStrategy) {
-        this.crawlingStrategy = crawlingStrategy;
+    public void setCrawlStrategy(final CrawlStrategy crawlStrategy) {
+        this.crawlStrategy = crawlStrategy;
    }
 
    /**

diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
index 2c8fb29..621da01 100644
--- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
@@ -17,7 +17,7 @@
 
 import com.github.peterbencze.serritor.api.CrawlRequest;
 import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
-import com.github.peterbencze.serritor.api.CrawlingStrategy;
+import com.github.peterbencze.serritor.api.CrawlStrategy;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Arrays;
@@ -325,8 +325,8 @@ public void getNextRequestBreadthFirstTest() {
 
     @Test
     public void getNextRequestDepthFirstTest() {
-        // Set the crawling strategy to depth-first
-        config.setCrawlingStrategy(CrawlingStrategy.DEPTH_FIRST);
+        // Set the crawl strategy to depth-first
+        config.setCrawlStrategy(CrawlStrategy.DEPTH_FIRST);
 
         frontier = new CrawlFrontier(config);
 
         // Get the crawl candidate of root URL 1
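After the rename, selecting a traversal order reads as follows; an illustrative fragment from a BaseCrawler subclass constructor, not part of the series:

    // Depth-first traversal; the default remains BREADTH_FIRST.
    config.setCrawlStrategy(CrawlStrategy.DEPTH_FIRST);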
From a35b8962e9ad4ebbddf19f0bf61bac4514bb456c Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Sun, 25 Feb 2018 02:49:39 +0100
Subject: [PATCH 06/24] Add different crawl delay strategies

---
 .../peterbencze/serritor/api/BaseCrawler.java | 199 ++++++++++--------
 .../serritor/api/CrawlDelayStrategy.java      |  28 +++
 .../serritor/internal/AdaptiveCrawlDelay.java |  77 +++++++
 .../serritor/internal/CrawlDelay.java         |  31 +++
 .../serritor/internal/CrawlDelayFactory.java  |  68 ++++++
 .../internal/CrawlerConfiguration.java        | 130 ++++++++++--
 .../serritor/internal/FixedCrawlDelay.java    |  46 ++++
 .../serritor/internal/RandomCrawlDelay.java   |  52 +++++
 8 files changed, 521 insertions(+), 110 deletions(-)
 create mode 100644 src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java
 create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java
 create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java
 create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java
 create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java
 create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java

diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java
index 4d8bca3..cd48aa1 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java
@@ -20,6 +20,8 @@
 import com.github.peterbencze.serritor.api.NonHtmlResponse.NonHtmlResponseBuilder;
 import com.github.peterbencze.serritor.api.UnsuccessfulRequest.UnsuccessfulRequestBuilder;
 import com.github.peterbencze.serritor.internal.CrawlCandidate;
+import com.github.peterbencze.serritor.internal.CrawlDelay;
+import com.github.peterbencze.serritor.internal.CrawlDelayFactory;
 import com.github.peterbencze.serritor.internal.CrawlFrontier;
 import com.github.peterbencze.serritor.internal.CrawlerConfiguration;
 import java.io.IOException;
@@ -37,6 +39,7 @@
 import org.apache.http.client.methods.HttpHead;
 import org.apache.http.client.protocol.HttpClientContext;
 import org.apache.http.impl.client.HttpClientBuilder;
+import org.openqa.selenium.JavascriptExecutor;
 import org.openqa.selenium.TimeoutException;
 import org.openqa.selenium.WebDriver;
 import org.openqa.selenium.htmlunit.HtmlUnitDriver;
@@ -49,7 +52,7 @@
 */
 public abstract class BaseCrawler {
 
-    //Allows the application to configure the crawler
+    // Allows the application to configure the crawler
     protected final CrawlerConfiguration config;
 
     // Indicates if the crawler is currently running or not
@@ -63,7 +66,10 @@ public abstract class BaseCrawler {
 
     private WebDriver webDriver;
 
-    private CrawlFrontier frontier;
+    private CrawlFrontier crawlFrontier;
+
+    // Specifies which type of crawl delay to use
+    private CrawlDelay crawlDelay;
 
     protected BaseCrawler() {
         // Create a default configuration
@@ -92,23 +98,34 @@ public final void start(final WebDriver driver) {
     /**
     * Constructs all the necessary objects and runs the crawler.
     *
-     * @param frontierToUse Previously saved frontier to be used by the crawler.
+     * @param frontierToUse Crawl frontier to be used by the crawler.
     */
    private void start(final WebDriver driver, final CrawlFrontier frontierToUse) {
-        // Check if the crawler is running
-        if (!isStopped) {
-            throw new IllegalStateException("The crawler is already started.");
-        }
-
-        isStopped = false;
-
-        httpClient = HttpClientBuilder.create().build();
-
-        webDriver = driver;
-
-        frontier = frontierToUse;
-
-        run();
+        try {
+            // Check if the crawler is running
+            if (!isStopped) {
+                throw new IllegalStateException("The crawler is already started.");
+            }
+
+            isStopped = false;
+
+            httpClient = HttpClientBuilder.create().build();
+
+            webDriver = driver;
+
+            crawlFrontier = frontierToUse;
+
+            CrawlDelayFactory crawlDelayFactory = new CrawlDelayFactory(config, (JavascriptExecutor) driver);
+            crawlDelay = crawlDelayFactory.getInstanceOf(config.getCrawlDelayStrategy());
+
+            run();
+        } finally {
+            // Always close the WebDriver
+            webDriver.quit();
+
+            stopCrawling = false;
+            isStopped = true;
+        }
    }
 
    /**
@@ -119,13 +136,13 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) {
     */
    public final void saveState(final OutputStream out) throws IOException {
        // Check if the crawler has been started, otherwise we have nothing to save
-        if (frontier == null) {
+        if (crawlFrontier == null) {
            throw new IllegalStateException("No state to save.");
        }
 
        // Save the frontier's current state
        ObjectOutputStream objectOutputStream = new ObjectOutputStream(out);
-        objectOutputStream.writeObject(frontier);
+        objectOutputStream.writeObject(crawlFrontier);
    }
 
    /**
@@ -153,7 +170,7 @@ public final void resumeState(final InputStream in) throws IOException, ClassNotFoundException {
    public final void resumeState(final WebDriver driver, final InputStream in) throws IOException, ClassNotFoundException {
        ObjectInputStream objectInputStream = new ObjectInputStream(in);
        CrawlFrontier frontierToUse = (CrawlFrontier) objectInputStream.readObject();
-        
+
        start(driver, frontierToUse);
    }
 
@@ -188,7 +205,7 @@ protected final void crawl(final CrawlRequest request) {
            throw new IllegalStateException("The crawler is not started. Maybe you meant to add this request as a crawl seed?");
        }
 
-        frontier.feedRequest(request, false);
+        crawlFrontier.feedRequest(request, false);
    }
 
    /**
@@ -204,95 +221,85 @@ protected final void crawl(final List<CrawlRequest> requests) {
     * Defines the workflow of the crawler.
     */
    private void run() {
-        try {
-            onBegin();
-
-            while (!stopCrawling && frontier.hasNextCandidate()) {
-                // Get the next crawl candidate from the queue
-                CrawlCandidate currentCandidate = frontier.getNextCandidate();
-
-                URL currentCandidateUrl = currentCandidate.getCandidateUrl();
-                String currentRequestUrlAsString = currentCandidateUrl.toString();
-
-                HttpHeadResponse httpHeadResponse;
-                URL responseUrl = currentCandidateUrl;
-
-                try {
-                    HttpClientContext context = HttpClientContext.create();
-
-                    // Send an HTTP HEAD request to the current URL to determine its availability and content type
-                    httpHeadResponse = getHttpHeadResponse(currentCandidateUrl, context);
-
-                    // If the request has been redirected, get the final URL
-                    List redirectLocations = context.getRedirectLocations();
-                    if (redirectLocations != null) {
-                        responseUrl = redirectLocations.get(redirectLocations.size() - 1).toURL();
-                    }
-                } catch (IOException ex) {
-                    UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
-                            currentCandidate.getCrawlRequest())
-                            .setException(ex)
-                            .build();
-
-                    onUnsuccessfulRequest(unsuccessfulRequest);
-                    continue;
-                }
-
-                // If the request has been redirected, a new crawl request should be created for the redirected URL
-                if (!responseUrl.toString().equals(currentRequestUrlAsString)) {
-                    CrawlRequest redirectedCrawlRequest = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()).build();
-                    frontier.feedRequest(redirectedCrawlRequest, false);
-
-                    continue;
-                }
-
-                // Check if the content of the response is HTML
-                if (isContentHtml(httpHeadResponse)) {
-                    boolean timedOut = false;
-
-                    try {
-                        // Open the URL in the browser
-                        webDriver.get(currentRequestUrlAsString);
-                    } catch (TimeoutException ex) {
-                        timedOut = true;
-                    }
-
-                    HtmlResponse htmlResponse = new HtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
-                            currentCandidate.getCrawlRequest())
-                            .setHttpHeadResponse(httpHeadResponse)
-                            .setWebDriver(webDriver)
-                            .build();
-
-                    // Check if the request has timed out
-                    if (!timedOut) {
-                        onResponseComplete(htmlResponse);
-                    } else {
-                        onResponseTimeout(htmlResponse);
-                    }
-                } else {
-                    // URLs that point to non-HTML content should not be opened in the browser
-
-                    NonHtmlResponse nonHtmlResponse = new NonHtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
-                            currentCandidate.getCrawlRequest())
-                            .setHttpHeadResponse(httpHeadResponse)
-                            .build();
-
-                    onNonHtmlResponse(nonHtmlResponse);
-                }
-
-                TimeUnit.MILLISECONDS.sleep(config.getDelayBetweenRequests().toMillis());
-            }
-
-            onFinish();
-        } catch (InterruptedException ex) {
-            Thread.currentThread().interrupt();
-        } finally {
-            // Always close the WebDriver
-            webDriver.quit();
-
-            stopCrawling = false;
-            isStopped = true;
-        }
+        onBegin();
+
+        while (!stopCrawling && crawlFrontier.hasNextCandidate()) {
+            // Get the next crawl candidate from the queue
+            CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate();
+
+            URL currentCandidateUrl = currentCandidate.getCandidateUrl();
+            String currentRequestUrlAsString = currentCandidateUrl.toString();
+
+            HttpHeadResponse httpHeadResponse;
+            URL responseUrl = currentCandidateUrl;
+
+            try {
+                HttpClientContext context = HttpClientContext.create();
+
+                // Send an HTTP HEAD request to the current URL to determine its availability and content type
+                httpHeadResponse = getHttpHeadResponse(currentCandidateUrl, context);
+
+                // If the request has been redirected, get the final URL
+                List redirectLocations = context.getRedirectLocations();
+                if (redirectLocations != null) {
+                    responseUrl = redirectLocations.get(redirectLocations.size() - 1).toURL();
+                }
+            } catch (IOException ex) {
+                UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
+                        currentCandidate.getCrawlRequest())
+                        .setException(ex)
+                        .build();
+
+                onUnsuccessfulRequest(unsuccessfulRequest);
+                continue;
+            }
+
+            // If the request has been redirected, a new crawl request should be created for the redirected URL
+            if (!responseUrl.toString().equals(currentRequestUrlAsString)) {
+                CrawlRequest redirectedCrawlRequest = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()).build();
+                crawlFrontier.feedRequest(redirectedCrawlRequest, false);
+
+                continue;
+            }
+
+            // Check if the content of the response is HTML
+            if (isContentHtml(httpHeadResponse)) {
+                boolean timedOut = false;
+
+                try {
+                    // Open the URL in the browser
+                    webDriver.get(currentRequestUrlAsString);
+                } catch (TimeoutException ex) {
+                    timedOut = true;
+                }
+
+                HtmlResponse htmlResponse = new HtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
+                        currentCandidate.getCrawlRequest())
+                        .setHttpHeadResponse(httpHeadResponse)
+                        .setWebDriver(webDriver)
+                        .build();
+
+                // Check if the request has timed out
+                if (!timedOut) {
+                    onResponseComplete(htmlResponse);
+                } else {
+                    onResponseTimeout(htmlResponse);
+                }
+            } else {
+                // URLs that point to non-HTML content should not be opened in the browser
+
+                NonHtmlResponse nonHtmlResponse = new NonHtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
+                        currentCandidate.getCrawlRequest())
+                        .setHttpHeadResponse(httpHeadResponse)
+                        .build();
+
+                onNonHtmlResponse(nonHtmlResponse);
+            }
+
+            performDelay();
+        }
+
+        onFinish();
    }
 
    /**
@@ -318,6 +325,18 @@ private boolean isContentHtml(final HttpHeadResponse httpHeadResponse) {
        return contentTypeHeader != null && contentTypeHeader.getValue().contains("text/html");
    }
 
+    /**
+     * Delays the next request.
+     */
+    private void performDelay() {
+        try {
+            TimeUnit.MILLISECONDS.sleep(crawlDelay.getDelay());
+        } catch (InterruptedException ex) {
+            Thread.currentThread().interrupt();
+            stopCrawling = true;
+        }
+    }
+
    /**
     * Called when the crawler is about to begin its operation.
     */
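Two behavioral changes above are worth calling out: the WebDriver is now closed in a finally block even when start() throws, and an interrupt during the delay now stops the crawl loop instead of being swallowed. A sketch of how a host application might rely on the latter (illustrative only; MyCrawler stands in for any BaseCrawler subclass):

    Thread crawlerThread = new Thread(() -> new MyCrawler().start());
    crawlerThread.start();

    // Later, from another thread: performDelay() restores the interrupt
    // flag and sets stopCrawling, so the crawler winds down gracefully.
    crawlerThread.interrupt();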
diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java
new file mode 100644
index 0000000..0c10e7b
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.api;
+
+/**
+ * Available crawl delay strategies that can be used by the crawler.
+ *
+ * @author Peter Bencze
+ */
+public enum CrawlDelayStrategy {
+
+    FIXED,
+    ADAPTIVE,
+    RANDOM
+}

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java
new file mode 100644
index 0000000..c3e6b4c
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.internal;
+
+import org.openqa.selenium.JavascriptExecutor;
+
+/**
+ * A type of crawl delay in which the delay corresponds to the page loading
+ * time if it falls within the specified range; otherwise the minimum or
+ * maximum duration is used.
+ *
+ * @author Peter Bencze
+ */
+public final class AdaptiveCrawlDelay implements CrawlDelay {
+
+    private final long minDelayInMillis;
+    private final long maxDelayInMillis;
+    private final JavascriptExecutor javascriptExecutor;
+
+    /**
+     * Constructs a new AdaptiveCrawlDelay instance.
+     *
+     * @param config A CrawlerConfiguration instance which
+     * specifies the minimum and maximum delay.
+     * @param javascriptExecutor A WebDriver instance which is
+     * capable of executing JavaScript.
+     */
+    public AdaptiveCrawlDelay(final CrawlerConfiguration config, final JavascriptExecutor javascriptExecutor) {
+        minDelayInMillis = config.getMinimumCrawlDelayInMillis();
+        maxDelayInMillis = config.getMaximumCrawlDelayInMillis();
+        this.javascriptExecutor = javascriptExecutor;
+    }
+
+    /**
+     * Checks if the browser supports the Navigation Timing API.
+     *
+     * @return true if the browser is compatible, false otherwise
+     */
+    public boolean isBrowserCompatible() {
+        return (boolean) javascriptExecutor.executeScript("return ('performance' in window) && ('timing' in window.performance)");
+    }
+
+    /**
+     * Calculates the page loading time and returns the delay accordingly,
+     * within the specified min-max range. If the calculated delay is smaller
+     * than the minimum, it returns the minimum delay. If the calculated delay
+     * is higher than the maximum, it returns the maximum delay.
+     *
+     * @return The delay in milliseconds
+     */
+    @Override
+    public long getDelay() {
+        long delayInMillis = (long) javascriptExecutor.executeScript("return performance.timing.loadEventEnd - performance.timing.navigationStart;");
+
+        if (delayInMillis < minDelayInMillis) {
+            return minDelayInMillis;
+        } else if (delayInMillis > maxDelayInMillis) {
+            return maxDelayInMillis;
+        }
+
+        return delayInMillis;
+    }
+}

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java
new file mode 100644
index 0000000..652b2e9
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.internal;
+
+/**
+ * An interface that every type of crawl delay should implement.
+ *
+ * @author Peter Bencze
+ */
+public interface CrawlDelay {
+
+    /**
+     * Returns the delay that should pass between each request.
+     *
+     * @return The delay in milliseconds
+     */
+    long getDelay();
+}
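The CrawlDelay contract is deliberately small. For illustration, a hypothetical implementation (not part of the patch) that backs off after every request would satisfy the same interface:

    public final class BackoffCrawlDelay implements CrawlDelay {

        private long delayInMillis = 100;

        @Override
        public long getDelay() {
            // Double the delay on every call, capped at ten seconds.
            delayInMillis = Math.min(delayInMillis * 2, 10_000);
            return delayInMillis;
        }
    }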
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java
new file mode 100644
index 0000000..97e78d1
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.internal;
+
+import com.github.peterbencze.serritor.api.CrawlDelayStrategy;
+import org.openqa.selenium.JavascriptExecutor;
+
+/**
+ * Factory class which is used to construct the required crawl delay instance
+ * specified in the configuration.
+ *
+ * @author Peter Bencze
+ */
+public final class CrawlDelayFactory {
+
+    private final CrawlerConfiguration config;
+    private final JavascriptExecutor javascriptExecutor;
+
+    /**
+     * Constructs a new CrawlDelayFactory instance.
+     *
+     * @param config A CrawlerConfiguration instance which
+     * specifies the minimum and maximum delay.
+     * @param javascriptExecutor A WebDriver instance which is
+     * capable of executing JavaScript.
+     */
+    public CrawlDelayFactory(final CrawlerConfiguration config, final JavascriptExecutor javascriptExecutor) {
+        this.config = config;
+        this.javascriptExecutor = javascriptExecutor;
+    }
+
+    /**
+     * Constructs the specific crawl delay instance determined by the strategy.
+     *
+     * @param crawlDelayStrategy The crawl delay strategy
+     * @return The specific crawl delay instance
+     */
+    public CrawlDelay getInstanceOf(final CrawlDelayStrategy crawlDelayStrategy) {
+        switch (crawlDelayStrategy) {
+            case FIXED:
+                return new FixedCrawlDelay(config);
+            case RANDOM:
+                return new RandomCrawlDelay(config);
+            case ADAPTIVE:
+                AdaptiveCrawlDelay adaptiveCrawlDelay = new AdaptiveCrawlDelay(config, javascriptExecutor);
+                if (!adaptiveCrawlDelay.isBrowserCompatible()) {
+                    throw new UnsupportedOperationException("The Navigation Timing API is not supported by the browser.");
+                }
+
+                return adaptiveCrawlDelay;
+        }
+
+        throw new IllegalArgumentException("Unsupported crawl delay strategy.");
+    }
+}

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index b275c9a..0a3b1e2 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -15,6 +15,7 @@
 */
 package com.github.peterbencze.serritor.internal;
 
+import com.github.peterbencze.serritor.api.CrawlDelayStrategy;
 import com.github.peterbencze.serritor.api.CrawlRequest;
 import com.github.peterbencze.serritor.api.CrawlStrategy;
 import java.io.Serializable;
@@ -29,20 +30,26 @@
 * @author Peter Bencze
 */
 public final class CrawlerConfiguration implements Serializable {
-    
+
     private static final CrawlStrategy DEFAULT_CRAWL_STRATEGY = CrawlStrategy.BREADTH_FIRST;
     private static final boolean FILTER_DUPLICATE_REQUESTS_BY_DEFAULT = true;
     private static final boolean FILTER_OFFSITE_REQUESTS_BY_DEFAULT = false;
-    private static final Duration DEFAULT_DELAY_BETWEEN_REQUESTS = Duration.ZERO;
     private static final int DEFAULT_MAX_CRAWL_DEPTH = 0;
+    private static final CrawlDelayStrategy DEFAULT_CRAWL_DELAY = CrawlDelayStrategy.FIXED;
+    private static final long DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS = Duration.ZERO.toMillis();
+    private static final long DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS = Duration.ofSeconds(1).toMillis();
+    private static final long DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS = Duration.ofMinutes(1).toMillis();
 
     private final List<CrawlRequest> crawlSeeds;
 
     private CrawlStrategy crawlStrategy;
     private boolean filterDuplicateRequests;
     private boolean filterOffsiteRequests;
-    private Duration delayBetweenRequests;
     private int maxCrawlDepth;
+    private CrawlDelayStrategy crawlDelayStrategy;
+    private long fixedCrawlDelayInMillis;
+    private long minCrawlDelayInMillis;
+    private long maxCrawlDelayInMillis;
 
     public CrawlerConfiguration() {
         // Default configuration
@@ -50,8 +57,11 @@ public CrawlerConfiguration() {
         crawlStrategy = DEFAULT_CRAWL_STRATEGY;
         filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT;
         filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT;
-        delayBetweenRequests = DEFAULT_DELAY_BETWEEN_REQUESTS;
         maxCrawlDepth = DEFAULT_MAX_CRAWL_DEPTH;
+        crawlDelayStrategy = DEFAULT_CRAWL_DELAY;
+        fixedCrawlDelayInMillis = DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS;
+        minCrawlDelayInMillis = DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS;
+        maxCrawlDelayInMillis = DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS;
     }
 
     /**
@@ -136,38 +146,118 @@ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) {
     }
 
     /**
-     * Returns the delay between each request.
+     * Returns the maximum possible crawl depth.
     *
-     * @return The delay between each request
+     * @return The maximum crawl depth
     */
-    public Duration getDelayBetweenRequests() {
-        return delayBetweenRequests;
+    public int getMaxCrawlDepth() {
+        return maxCrawlDepth;
    }
 
    /**
-     * Sets the delay between each request.
+     * Sets the maximum possible crawl depth.
     *
-     * @param delayBetweenRequests The delay between each request
+     * @param maxCrawlDepth The maximum crawl depth, zero means no limit
     */
-    public void setDelayBetweenRequests(final Duration delayBetweenRequests) {
-        this.delayBetweenRequests = delayBetweenRequests;
+    public void setMaxCrawlDepth(final int maxCrawlDepth) {
+        this.maxCrawlDepth = maxCrawlDepth;
    }
 
    /**
-     * Returns the maximum possible crawl depth.
+     * Sets the crawl delay strategy to be used by the crawler.
     *
-     * @return The maximum crawl depth
+     * @param crawlDelayStrategy The crawl delay strategy
     */
-    public int getMaxCrawlDepth() {
-        return maxCrawlDepth;
+    public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) {
+        this.crawlDelayStrategy = crawlDelayStrategy;
    }
 
    /**
-     * Sets the maximum possible crawl depth.
+     * Returns the crawl delay strategy used by the crawler.
     *
-     * @param maxCrawlDepth The maximum crawl depth, zero means no limit
+     * @return The crawl delay type
     */
-    public void setMaxCrawlDepth(final int maxCrawlDepth) {
-        this.maxCrawlDepth = maxCrawlDepth;
+    public CrawlDelayStrategy getCrawlDelayStrategy() {
+        return crawlDelayStrategy;
+    }
+
+    /**
+     * Sets the exact duration of delay between each request.
+     *
+     * @param fixedCrawlDelayDuration The duration of delay
+     */
+    public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) {
+        try {
+            fixedCrawlDelayInMillis = fixedCrawlDelayDuration.toMillis();
+        } catch (ArithmeticException ex) {
+            throw new IllegalArgumentException("The duration is too large.");
+        }
+    }
+
+    /**
+     * Returns the exact duration of delay between each request.
+     *
+     * @return The duration of delay in milliseconds
+     */
+    public long getFixedCrawlDelayInMillis() {
+        return fixedCrawlDelayInMillis;
+    }
+
+    /**
+     * Sets the minimum duration of delay between each request.
+     *
+     * @param minCrawlDelayDuration The minimum duration of delay
+     */
+    public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) {
+        if (minCrawlDelayDuration.isNegative()) {
+            throw new IllegalArgumentException("The minimum crawl delay should be positive.");
+        }
+
+        try {
+            long delayInMillis = minCrawlDelayDuration.toMillis();
+            if (delayInMillis >= maxCrawlDelayInMillis) {
+                throw new IllegalArgumentException("The minimum crawl delay should be less than the maximum.");
+            }
+
+            minCrawlDelayInMillis = delayInMillis;
+        } catch (ArithmeticException ex) {
+            throw new IllegalArgumentException("The duration is too large.");
+        }
+    }
+
+    /**
+     * Returns the minimum duration of delay between each request.
+     *
+     * @return The minimum duration of delay in milliseconds
+     */
+    public long getMinimumCrawlDelayInMillis() {
+        return minCrawlDelayInMillis;
+    }
+
+    /**
+     * Sets the maximum duration of delay between each request.
+     *
+     * @param maxCrawlDelayDuration The maximum duration of delay
+     */
+    public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) {
+        try {
+            long delayInMillis = maxCrawlDelayDuration.toMillis();
+            if (delayInMillis <= minCrawlDelayInMillis) {
+                throw new IllegalArgumentException("The maximum crawl delay should be higher than the minimum.");
+            }
+
+            maxCrawlDelayInMillis = delayInMillis;
+        } catch (ArithmeticException ex) {
+            throw new IllegalArgumentException("The duration is too large.");
+        }
+    }
+
+    /**
+     * Returns the maximum duration of delay between each request.
+     *
+     * @return The maximum duration of delay in milliseconds
+     */
+    public long getMaximumCrawlDelayInMillis() {
+        return maxCrawlDelayInMillis;
    }
 }
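Putting the new configuration surface together: a crawler that waits a random one to five seconds between requests would be set up as follows (a sketch using the setters introduced above, from a BaseCrawler subclass constructor; java.time.Duration is assumed to be imported). Note that choosing ADAPTIVE instead makes the factory throw at start-up if the browser does not expose the Navigation Timing API.

    config.setCrawlDelayStrategy(CrawlDelayStrategy.RANDOM);
    config.setMinimumCrawlDelayDuration(Duration.ofSeconds(1));
    config.setMaximumCrawlDelayDuration(Duration.ofSeconds(5));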
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java
new file mode 100644
index 0000000..cea20d0
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.internal;
+
+/**
+ * A type of crawl delay in which the delay is constant and equal to the
+ * duration specified in the configuration.
+ *
+ * @author Peter Bencze
+ */
+public final class FixedCrawlDelay implements CrawlDelay {
+
+    private final long delayInMillis;
+
+    /**
+     * Constructs a new FixedCrawlDelay instance.
+     *
+     * @param config A CrawlerConfiguration instance which specifies the fixed delay
+     */
+    public FixedCrawlDelay(final CrawlerConfiguration config) {
+        delayInMillis = config.getFixedCrawlDelayInMillis();
+    }
+
+    /**
+     * Returns the fixed delay specified in the configuration.
+     *
+     * @return The delay in milliseconds
+     */
+    @Override
+    public long getDelay() {
+        return delayInMillis;
+    }
+}
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java
new file mode 100644
index 0000000..3bc9871
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * [standard Apache-2.0 source header, abbreviated]
+ */
+package com.github.peterbencze.serritor.internal;
+
+import java.util.concurrent.ThreadLocalRandom;
+
+/**
+ * A type of crawl delay in which the duration is randomized within the
+ * specified minimum-maximum range.
+ *
+ * @author Peter Bencze
+ */
+public final class RandomCrawlDelay implements CrawlDelay {
+
+    private final long origin;
+    private final long bound;
+
+    /**
+     * Constructs a new RandomCrawlDelay instance.
+     *
+     * @param config A CrawlerConfiguration instance which
+     * specifies the minimum and maximum delay.
+     */
+    public RandomCrawlDelay(final CrawlerConfiguration config) {
+        origin = config.getMinimumCrawlDelayInMillis();
+        bound = config.getMaximumCrawlDelayInMillis() + 1;
+    }
+
+    /**
+     * Returns a random delay within the minimum and maximum range specified
+     * in the configuration.
+     *
+     * @return The delay in milliseconds
+     */
+    @Override
+    public long getDelay() {
+        return ThreadLocalRandom.current().nextLong(origin, bound);
+    }
+}

From cc24efcba519db2658df10d28be56d41a9f0f61a Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Sun, 25 Feb 2018 02:51:58 +0100
Subject: [PATCH 07/24] Rename crawl depth getter and setter

---
 .../github/peterbencze/serritor/internal/CrawlFrontier.java | 2 +-
 .../peterbencze/serritor/internal/CrawlerConfiguration.java | 4 ++--
 .../peterbencze/serritor/internal/CrawlFrontierTest.java    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
index ed6c20e..2d58e03 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
@@ -96,7 +96,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) {
         CrawlCandidateBuilder builder;
 
         if (!isCrawlSeed) {
-            int crawlDepthLimit = config.getMaxCrawlDepth();
+            int crawlDepthLimit = config.getMaximumCrawlDepth();
             int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1;
 
             // If a crawl depth limit is set, check if the candidate's crawl depth is less than or equal to the limit

diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
index 0a3b1e2..2dfb8fc 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java
@@ -150,7 +150,7 @@ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) {
     *
     * @return The maximum crawl depth
     */
-    public int getMaxCrawlDepth() {
+    public int getMaximumCrawlDepth() {
        return maxCrawlDepth;
    }
 
@@ -159,7 +159,7 @@ public int getMaximumCrawlDepth() {
     *
     * @param maxCrawlDepth The maximum crawl depth, zero means no limit
     */
-    public void setMaxCrawlDepth(final int maxCrawlDepth) {
+    public void setMaximumCrawlDepth(final int maxCrawlDepth) {
        this.maxCrawlDepth = maxCrawlDepth;
    }

diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
index 621da01..df7788c 100644
--- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
@@ -404,7 +404,7 @@ public void getNextRequestDepthFirstTest() {
     @Test
     public void maxCrawlDepthTest() {
         // Set max crawl depth
-        config.setMaxCrawlDepth(MAX_CRAWL_DEPTH);
+        config.setMaximumCrawlDepth(MAX_CRAWL_DEPTH);
 
         // Clear the crawl candidate queue of the frontier
         clearCrawlCandidateQueue();

From b53b2698b259ced68a4a4fdd7a0e8eb01ca831ba Mon Sep 17 00:00:00 2001
From: Peter Bencze
Date: Sun, 25 Feb 2018 03:06:23 +0100
Subject: [PATCH 08/24] Refactor: extract the default priority into a named constant

---
 .../com/github/peterbencze/serritor/api/CrawlRequest.java | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
index 6f8c674..26ff078 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
@@ -80,6 +80,8 @@ public Optional getMetadata() {
     }
 
     public static final class CrawlRequestBuilder {
+
+        private static final int DEFAULT_PRIORITY = 0;
 
         private final URL requestUrl;
 
@@ -105,8 +107,8 @@ public CrawlRequestBuilder(final URL requestUrl) {
             throw new IllegalArgumentException(String.format("The top private domain cannot be extracted from the given request URL (\"%s\").", requestUrl), ex);
         }
 
-        // Default priority is 0
-        priority = 0;
+        // Set default priority
+        priority = DEFAULT_PRIORITY;
     }
running isStopped = true; @@ -92,7 +93,7 @@ public final void start() { * @param driver The WebDriver instance that will be used by the crawler */ public final void start(final WebDriver driver) { - start(driver, new CrawlFrontier(config)); + start(driver, new CrawlFrontier(configuration)); } /** @@ -115,8 +116,8 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { crawlFrontier = frontierToUse; - CrawlDelayFactory crawlDelayFactory = new CrawlDelayFactory(config, (JavascriptExecutor) driver); - crawlDelay = crawlDelayFactory.getInstanceOf(config.getCrawlDelayStrategy()); + CrawlDelayFactory crawlDelayFactory = new CrawlDelayFactory(configuration, (JavascriptExecutor) driver); + crawlDelay = crawlDelayFactory.getInstanceOf(configuration.getCrawlDelayStrategy()); run(); } finally { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java index c3e6b4c..9fd9e9a 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java @@ -39,8 +39,8 @@ public final class AdaptiveCrawlDelay implements CrawlDelay { * capable of executing JavaScript. */ public AdaptiveCrawlDelay(final CrawlerConfiguration config, final JavascriptExecutor javascriptExecutor) { - minDelayInMillis = config.getMinimumCrawlDelayInMillis(); - maxDelayInMillis = config.getMaximumCrawlDelayInMillis(); + minDelayInMillis = config.getMinimumCrawlDelayDurationInMillis(); + maxDelayInMillis = config.getMaximumCrawlDelayDurationInMillis(); this.javascriptExecutor = javascriptExecutor; } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java index 2dfb8fc..1fa666f 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java @@ -24,9 +24,8 @@ import java.util.List; /** - * Provides an interface to configure the crawler. + * This class contains the settings of the crawler. 
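// Illustrative sketch, not part of the patch: with this change a subclass no
// longer touches CrawlerConfiguration directly; it goes through the protected
// configurator field instead. Any callback overrides are omitted for brevity.
public final class MyCrawler extends BaseCrawler {

    public MyCrawler() {
        configurator.addCrawlSeed(new CrawlRequestBuilder("http://example.com").build());
    }
}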
* - * @author Krisztian Mozsi * @author Peter Bencze */ public final class CrawlerConfiguration implements Serializable { @@ -47,21 +46,22 @@ public final class CrawlerConfiguration implements Serializable { private boolean filterOffsiteRequests; private int maxCrawlDepth; private CrawlDelayStrategy crawlDelayStrategy; - private long fixedCrawlDelayInMillis; - private long minCrawlDelayInMillis; - private long maxCrawlDelayInMillis; + private long fixedCrawlDelayDurationInMillis; + private long minCrawlDelayDurationInMillis; + private long maxCrawlDelayDurationInMillis; public CrawlerConfiguration() { - // Default configuration + // Initialize configuration with default values + crawlSeeds = new ArrayList<>(); crawlStrategy = DEFAULT_CRAWL_STRATEGY; filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT; filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT; maxCrawlDepth = DEFAULT_MAX_CRAWL_DEPTH; crawlDelayStrategy = DEFAULT_CRAWL_DELAY; - fixedCrawlDelayInMillis = DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS; - minCrawlDelayInMillis = DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS; - maxCrawlDelayInMillis = DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS; + fixedCrawlDelayDurationInMillis = DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS; + minCrawlDelayDurationInMillis = DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS; + maxCrawlDelayDurationInMillis = DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS; } /** @@ -76,21 +76,13 @@ public List getCrawlSeeds() { /** * Appends a crawl request to the list of crawl seeds. * - * @param request The crawl request + * @param request The CrawlRequest instance which represents + * the crawl seed */ public void addCrawlSeed(final CrawlRequest request) { crawlSeeds.add(request); } - /** - * Appends a list of crawl requests to the list of crawl seeds. - * - * @param requests The list of crawl requests - */ - public void addCrawlSeeds(final List requests) { - crawlSeeds.addAll(requests); - } - /** * Returns the crawl strategy of the crawler. * @@ -101,7 +93,9 @@ public CrawlStrategy getCrawlStrategy() { } /** - * Sets the crawl strategy of the crawler. + * Sets the crawl strategy to be used by the crawler. Breadth-first strategy + * orders crawl requests by the lowest crawl depth, whereas depth-first + * orders them by the highest crawl depth. * * @param crawlStrategy The crawl strategy */ @@ -112,16 +106,17 @@ public void setCrawlStrategy(final CrawlStrategy crawlStrategy) { /** * Indicates if duplicate request filtering is enabled or not. * - * @return True if it is enabled, false otherwise + * @return true if enabled, false otherwise */ public boolean isDuplicateRequestFilteringEnabled() { return filterDuplicateRequests; } /** - * Sets duplicate request filtering. + * Enables or disables duplicate request filtering. * - * @param filterDuplicateRequests True means enabled, false means disabled + * @param filterDuplicateRequests true means enabled, + * false means disabled */ public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) { this.filterDuplicateRequests = filterDuplicateRequests; @@ -130,16 +125,17 @@ public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) /** * Indicates if offsite request filtering is enabled or not. * - * @return True if it is enabled, false otherwise + * @return true if enabled, false otherwise */ public boolean isOffsiteRequestFilteringEnabled() { return filterOffsiteRequests; } /** - * Sets offsite request filtering. + * Enables or disables offsite request filtering. 
* - * @param filterOffsiteRequests True means enabled, false means disabled + * @param filterOffsiteRequests true means enabled, + * false means disabled */ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { this.filterOffsiteRequests = filterOffsiteRequests; @@ -155,9 +151,10 @@ public int getMaximumCrawlDepth() { } /** - * Sets the maximum possible crawl depth. + * Sets the maximum possible crawl depth. It should be a non-negative number + * where 0 means there is no limit. * - * @param maxCrawlDepth The maximum crawl depth, zero means no limit + * @param maxCrawlDepth The maximum crawl depth */ public void setMaximumCrawlDepth(final int maxCrawlDepth) { this.maxCrawlDepth = maxCrawlDepth; @@ -175,7 +172,7 @@ public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { /** * Returns the crawl delay strategy used by the crawler. * - * @return The crawl delay type + * @return The crawl delay strategy */ public CrawlDelayStrategy getCrawlDelayStrategy() { return crawlDelayStrategy; @@ -184,14 +181,11 @@ public CrawlDelayStrategy getCrawlDelayStrategy() { /** * Sets the exact duration of delay between each request. * - * @param fixedCrawlDelayDuration The duration of delay + * @param fixedCrawlDelayDurationInMillis The duration of delay in + * milliseconds */ - public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { - try { - fixedCrawlDelayInMillis = fixedCrawlDelayDuration.toMillis(); - } catch (ArithmeticException ex) { - throw new IllegalArgumentException("The duration is too large."); - } + public void setFixedCrawlDelayDurationInMillis(final long fixedCrawlDelayDurationInMillis) { + this.fixedCrawlDelayDurationInMillis = fixedCrawlDelayDurationInMillis; } /** @@ -199,30 +193,18 @@ public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { * * @return The duration of delay in milliseconds */ - public long getFixedCrawlDelayInMillis() { - return fixedCrawlDelayInMillis; + public long getFixedCrawlDelayDurationInMillis() { + return fixedCrawlDelayDurationInMillis; } /** * Sets the minimum duration of delay between each request. * - * @param minCrawlDelayDuration The minimum duration of delay + * @param minCrawlDelayDurationInMillis The minimum duration of delay in + * milliseconds */ - public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { - if (minCrawlDelayDuration.isNegative()) { - throw new IllegalArgumentException("The minimum crawl delay should be positive."); - } - - try { - long delayInMillis = minCrawlDelayDuration.toMillis(); - if (delayInMillis >= maxCrawlDelayInMillis) { - throw new IllegalArgumentException("The minimum crawl delay should be less than the maximum."); - } - - minCrawlDelayInMillis = delayInMillis; - } catch (ArithmeticException ex) { - throw new IllegalArgumentException("The duration is too large."); - } + public void setMinimumCrawlDelayDurationInMillis(final long minCrawlDelayDurationInMillis) { + this.minCrawlDelayDurationInMillis = minCrawlDelayDurationInMillis; } /** @@ -230,26 +212,18 @@ public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { * * @return The minimum duration of delay in milliseconds */ - public long getMinimumCrawlDelayInMillis() { - return minCrawlDelayInMillis; + public long getMinimumCrawlDelayDurationInMillis() { + return minCrawlDelayDurationInMillis; } /** * Sets the maximum duration of delay between each request. 
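// Illustrative sketch, not part of the patch: after this refactor the
// configuration is a plain holder of millisecond values; the range checks that
// previously lived here move into the CrawlerConfigurator added below.
CrawlerConfiguration configuration = new CrawlerConfiguration();
configuration.setFixedCrawlDelayDurationInMillis(Duration.ofSeconds(2).toMillis());
configuration.setMinimumCrawlDelayDurationInMillis(500L);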
* - * @param maxCrawlDelayDuration The maximum duration of delay + * @param maxCrawlDelayDurationInMillis The maximum duration of delay in + * milliseconds */ - public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { - try { - long delayInMillis = maxCrawlDelayDuration.toMillis(); - if (delayInMillis <= minCrawlDelayInMillis) { - throw new IllegalArgumentException("The maximum crawl delay should be higher than the minimum."); - } - - maxCrawlDelayInMillis = delayInMillis; - } catch (ArithmeticException ex) { - throw new IllegalArgumentException("The duration is too large."); - } + public void setMaximumCrawlDelayDuration(final long maxCrawlDelayDurationInMillis) { + this.maxCrawlDelayDurationInMillis = maxCrawlDelayDurationInMillis; } /** @@ -257,7 +231,7 @@ public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { * * @return The maximum duration of delay in milliseconds */ - public long getMaximumCrawlDelayInMillis() { - return maxCrawlDelayInMillis; + public long getMaximumCrawlDelayDurationInMillis() { + return maxCrawlDelayDurationInMillis; } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java new file mode 100644 index 0000000..378f591 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java @@ -0,0 +1,148 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.internal; + +import com.github.peterbencze.serritor.api.CrawlDelayStrategy; +import com.github.peterbencze.serritor.api.CrawlRequest; +import com.github.peterbencze.serritor.api.CrawlStrategy; +import com.google.common.base.Preconditions; +import java.time.Duration; +import java.util.List; + +/** + * This class provides an interface for the user to configure the crawler. + * + * @author Peter Bencze + */ +public class CrawlerConfigurator { + + private final CrawlerConfiguration config; + + public CrawlerConfigurator(CrawlerConfiguration config) { + this.config = config; + } + + /** + * Appends a crawl request to the list of crawl seeds. + * + * @param request The CrawlRequest instance which represents + * the crawl seed + */ + public void addCrawlSeed(final CrawlRequest request) { + config.addCrawlSeed(Preconditions.checkNotNull(request)); + } + + /** + * Appends a list of crawl requests to the list of crawl seeds. + * + * @param requests The list of CrawlRequest instances which + * represent the crawl seeds + */ + public void addCrawlSeeds(final List requests) { + requests.forEach(this::addCrawlSeed); + } + + /** + * Sets the crawl strategy to be used by the crawler. Breadth-first strategy + * orders crawl requests by the lowest crawl depth, whereas depth-first + * orders them by the highest crawl depth. 
+ * + * @param crawlStrategy The crawl strategy + */ + public void setCrawlStrategy(final CrawlStrategy crawlStrategy) { + config.setCrawlStrategy(Preconditions.checkNotNull(crawlStrategy)); + } + + /** + * Enables or disables duplicate request filtering. + * + * @param filterDuplicateRequests true means enabled, + * false means disabled + */ + public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) { + config.setDuplicateRequestFiltering(filterDuplicateRequests); + } + + /** + * Enables or disables offsite request filtering. + * + * @param filterOffsiteRequests true means enabled, + * false means disabled + */ + public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { + config.setOffsiteRequestFiltering(filterOffsiteRequests); + } + + /** + * Sets the maximum possible crawl depth. It should be a non-negative number + * where 0 means there is no limit. + * + * @param maxCrawlDepth The maximum crawl depth + */ + public void setMaximumCrawlDepth(final int maxCrawlDepth) { + Preconditions.checkArgument(maxCrawlDepth >= 0, "The maximum crawl depth cannot be negative."); + + config.setMaximumCrawlDepth(maxCrawlDepth); + } + + /** + * Sets the crawl delay strategy to be used by the crawler. + * + * @param crawlDelayStrategy The crawl delay strategy + */ + public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { + config.setCrawlDelayStrategy(Preconditions.checkNotNull(crawlDelayStrategy)); + } + + /** + * Sets the exact duration of delay between each request. + * + * @param fixedCrawlDelayDuration The duration of delay + */ + public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { + config.setFixedCrawlDelayDurationInMillis(fixedCrawlDelayDuration.toMillis()); + } + + /** + * Sets the minimum duration of delay between each request. + * + * @param minCrawlDelayDuration The minimum duration of delay + */ + public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { + Preconditions.checkArgument(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative."); + + long minCrawlDelayDurationInMillis = minCrawlDelayDuration.toMillis(); + long maxCrawlDelayInMillis = config.getMaximumCrawlDelayDurationInMillis(); + + Preconditions.checkArgument(minCrawlDelayDurationInMillis < maxCrawlDelayInMillis, "The minimum crawl delay should be less than the maximum."); + + config.setMinimumCrawlDelayDurationInMillis(minCrawlDelayDurationInMillis); + } + + /** + * Sets the maximum duration of delay between each request. 
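// Illustrative sketch, not part of the patch: the preconditions in these
// setters validate each bound against the currently stored value of the other,
// so when raising both bounds, set the maximum before the minimum.
configurator.setMaximumCrawlDelayDuration(Duration.ofSeconds(10));
configurator.setMinimumCrawlDelayDuration(Duration.ofSeconds(5));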
+ * + * @param maxCrawlDelayDuration The maximum duration of delay + */ + public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { + long minCrawlDelayDurationInMillis = config.getMinimumCrawlDelayDurationInMillis(); + long maxCrawlDelayDurationInMillis = maxCrawlDelayDuration.toMillis(); + + Preconditions.checkArgument(maxCrawlDelayDurationInMillis > minCrawlDelayDurationInMillis, "The maximum crawl delay should be higher than the minimum."); + + config.setMaximumCrawlDelayDuration(maxCrawlDelayDurationInMillis); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java index cea20d0..3eb0f87 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java @@ -31,7 +31,7 @@ public final class FixedCrawlDelay implements CrawlDelay { * @param config A CrawlerConfiguration instance which specifies the fixed delay */ public FixedCrawlDelay(final CrawlerConfiguration config) { - delayInMillis = config.getFixedCrawlDelayInMillis(); + delayInMillis = config.getFixedCrawlDelayDurationInMillis(); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java index 3bc9871..6c16073 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java @@ -35,8 +35,8 @@ public final class RandomCrawlDelay implements CrawlDelay { * specifies the minimum and maximum delay. */ public RandomCrawlDelay(final CrawlerConfiguration config) { - origin = config.getMinimumCrawlDelayInMillis(); - bound = config.getMaximumCrawlDelayInMillis() + 1; + origin = config.getMinimumCrawlDelayDurationInMillis(); + bound = config.getMaximumCrawlDelayDurationInMillis() + 1; } /** diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index df7788c..6cab603 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -28,7 +28,7 @@ import org.junit.Test; /** - * Test cases for CrawlFrontier. + * Test cases for CrawlFrontier. 
* * @author Krisztian Mozsi * @author Peter Bencze @@ -67,7 +67,7 @@ public final class CrawlFrontierTest { private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST; private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST; private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST; - + // Child URL path private static final String CHILD_URL_PATH = "/child"; @@ -79,7 +79,7 @@ public final class CrawlFrontierTest { // Offsite URL crawl request private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST; - + // Max crawl depth private static final int MAX_CRAWL_DEPTH = 1; @@ -132,10 +132,11 @@ public final class CrawlFrontierTest { @Before public void initialize() { - // Create configuration config = new CrawlerConfiguration(); + config.setOffsiteRequestFiltering(true); - config.addCrawlSeeds(Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST)); + Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST) + .forEach(config::addCrawlSeed); // Create frontier frontier = new CrawlFrontier(config); @@ -400,27 +401,27 @@ public void getNextRequestDepthFirstTest() { // There should be no more candidates left at this point assertFalse(frontier.hasNextCandidate()); } - + @Test public void maxCrawlDepthTest() { // Set max crawl depth config.setMaximumCrawlDepth(MAX_CRAWL_DEPTH); - + // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); - + // Feed a child request, its crawl depth will be 1 frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); - + // Get the crawl candidate of the previously added child URL CrawlCandidate nextCandidate = frontier.getNextCandidate(); - + // Check its crawl depth, it should be less than or equal to the limit assertTrue(nextCandidate.getCrawlDepth() <= MAX_CRAWL_DEPTH); - + // Feed another child request, its crawl depth will be 2 which is above the limit frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - + // There should be no more candidates at this point assertFalse(frontier.hasNextCandidate()); } From 7cf0abd262f0d62fd1643be25cd476b59110108a Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 7 Mar 2018 00:43:49 +0100 Subject: [PATCH 10/24] Refactor crawl delays, alter comments --- pom.xml | 7 -- .../peterbencze/serritor/api/BaseCrawler.java | 76 +++++++++++++------ .../serritor/api/CrawlRequest.java | 35 ++++----- .../serritor/api/HtmlResponse.java | 4 +- .../serritor/api/HttpHeadResponse.java | 6 +- .../serritor/api/UnsuccessfulRequest.java | 5 +- ....java => AdaptiveCrawlDelayMechanism.java} | 28 +++---- .../serritor/internal/CallbackParameter.java | 2 +- .../serritor/internal/CrawlCandidate.java | 2 +- .../serritor/internal/CrawlDelayFactory.java | 68 ----------------- ...awlDelay.java => CrawlDelayMechanism.java} | 6 +- .../serritor/internal/CrawlFrontier.java | 39 +++++----- .../internal/CrawlerConfigurator.java | 30 ++++---- ...lay.java => FixedCrawlDelayMechanism.java} | 19 ++--- ...ay.java => RandomCrawlDelayMechanism.java} | 20 ++--- .../serritor/internal/CrawlFrontierTest.java | 1 - 16 files changed, 151 insertions(+), 197 deletions(-) rename src/main/java/com/github/peterbencze/serritor/internal/{AdaptiveCrawlDelay.java => AdaptiveCrawlDelayMechanism.java} (61%) delete mode 100644 src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java rename src/main/java/com/github/peterbencze/serritor/internal/{CrawlDelay.java => CrawlDelayMechanism.java} (83%) rename src/main/java/com/github/peterbencze/serritor/internal/{FixedCrawlDelay.java => FixedCrawlDelayMechanism.java} (62%) 
rename src/main/java/com/github/peterbencze/serritor/internal/{RandomCrawlDelay.java => RandomCrawlDelayMechanism.java} (60%) diff --git a/pom.xml b/pom.xml index 33c497b..a3ccc27 100644 --- a/pom.xml +++ b/pom.xml @@ -25,13 +25,6 @@ Owner - - Krisztian Mozsi - mozsik0@gmail.com - - Committer - - diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 392fc68..17b01c1 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -19,12 +19,14 @@ import com.github.peterbencze.serritor.api.HtmlResponse.HtmlResponseBuilder; import com.github.peterbencze.serritor.api.NonHtmlResponse.NonHtmlResponseBuilder; import com.github.peterbencze.serritor.api.UnsuccessfulRequest.UnsuccessfulRequestBuilder; +import com.github.peterbencze.serritor.internal.AdaptiveCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.CrawlCandidate; -import com.github.peterbencze.serritor.internal.CrawlDelay; -import com.github.peterbencze.serritor.internal.CrawlDelayFactory; +import com.github.peterbencze.serritor.internal.CrawlDelayMechanism; import com.github.peterbencze.serritor.internal.CrawlFrontier; import com.github.peterbencze.serritor.internal.CrawlerConfiguration; import com.github.peterbencze.serritor.internal.CrawlerConfigurator; +import com.github.peterbencze.serritor.internal.FixedCrawlDelayMechanism; +import com.github.peterbencze.serritor.internal.RandomCrawlDelayMechanism; import java.io.IOException; import java.io.InputStream; import java.io.ObjectInputStream; @@ -52,7 +54,7 @@ * @author Peter Bencze */ public abstract class BaseCrawler { - + protected final CrawlerConfigurator configurator; private final CrawlerConfiguration configuration; @@ -70,7 +72,7 @@ public abstract class BaseCrawler { private CrawlFrontier crawlFrontier; - private CrawlDelay crawlDelay; + private CrawlDelayMechanism crawlDelayMechanism; protected BaseCrawler() { configuration = new CrawlerConfiguration(); @@ -88,9 +90,11 @@ public final void start() { } /** - * Starts the crawler using the browser specified by the WebDriver instance. + * Starts the crawler using the browser specified by the + * WebDriver instance. * - * @param driver The WebDriver instance that will be used by the crawler + * @param driver The WebDriver instance that will be used by + * the crawler */ public final void start(final WebDriver driver) { start(driver, new CrawlFrontier(configuration)); @@ -99,7 +103,8 @@ public final void start(final WebDriver driver) { /** * Constructs all the necessary objects and runs the crawler. * - * @param frontierToUse Crawl frontier to be used by the crawler. + * @param frontierToUse The CrawlFrontier instance to be used + * by the crawler. 
*/ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { try { @@ -109,15 +114,10 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { } isStopped = false; - httpClient = HttpClientBuilder.create().build(); - webDriver = driver; - crawlFrontier = frontierToUse; - - CrawlDelayFactory crawlDelayFactory = new CrawlDelayFactory(configuration, (JavascriptExecutor) driver); - crawlDelay = crawlDelayFactory.getInstanceOf(configuration.getCrawlDelayStrategy()); + crawlDelayMechanism = createCrawlDelayMechanism(); run(); } finally { @@ -132,8 +132,9 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { /** * Saves the current state of the crawler to the specified output stream. * - * @param out The output stream to use - * @throws IOException Any exception thrown by the underlying OutputStream. + * @param out The OutputStream instance to use + * @throws IOException Any exception thrown by the underlying + * OutputStream. */ public final void saveState(final OutputStream out) throws IOException { // Check if the crawler has been started, otherwise we have nothing to save @@ -149,8 +150,8 @@ public final void saveState(final OutputStream out) throws IOException { /** * Resumes a previously saved state using HtmlUnit headless browser. * - * @param in The input stream to use - * @throws IOException Any of the usual Input/Output related exceptions. + * @param in The InputStream instance to use + * @throws IOException Any of the usual input/output related exceptions. * @throws ClassNotFoundException Class of a serialized object cannot be * found. */ @@ -162,9 +163,10 @@ public final void resumeState(final InputStream in) throws IOException, ClassNot * Resumes a previously saved state using the browser specified by the * WebDriver instance. * - * @param driver The WebDriver instance that will be used by the crawler - * @param in The input stream to use - * @throws IOException Any of the usual Input/Output related exceptions. + * @param driver The WebDriver instance to be used by the + * crawler + * @param in The InputStream instance to use + * @throws IOException Any of the usual input/output related exceptions. * @throws ClassNotFoundException Class of a serialized object cannot be * found. */ @@ -198,7 +200,7 @@ public final void stop() { * {@link CrawlerConfiguration#addCrawlSeed(com.github.peterbencze.serritor.api.CrawlRequest)} * for adding crawl seeds. * - * @param request The crawl request + * @param request The CrawlRequest instance */ protected final void crawl(final CrawlRequest request) { // Check if the crawler is running @@ -212,7 +214,7 @@ protected final void crawl(final CrawlRequest request) { /** * Passes multiple crawl requests to the crawl frontier. * - * @param requests The list of crawl requests + * @param requests The list of CrawlRequest instances */ protected final void crawl(final List requests) { requests.stream().forEach(this::crawl); @@ -319,19 +321,43 @@ private HttpHeadResponse getHttpHeadResponse(final URL destinationUrl, final Htt * Indicates if the content of the response is HTML or not. 
      *
      * @param httpHeadResponse The HTTP HEAD response
-     * @return True if the content is HTML, false otherwise
+     * @return true if the content is HTML, false
+     * otherwise
      */
-    private boolean isContentHtml(final HttpHeadResponse httpHeadResponse) {
+    private static boolean isContentHtml(final HttpHeadResponse httpHeadResponse) {
         Header contentTypeHeader = httpHeadResponse.getFirstHeader("Content-Type");
         return contentTypeHeader != null && contentTypeHeader.getValue().contains("text/html");
     }
 
+    /**
+     * Constructs the crawl delay mechanism specified in the configuration.
+     *
+     * @return The crawl delay mechanism
+     */
+    private CrawlDelayMechanism createCrawlDelayMechanism() {
+        switch (configuration.getCrawlDelayStrategy()) {
+            case FIXED:
+                return new FixedCrawlDelayMechanism(configuration);
+            case RANDOM:
+                return new RandomCrawlDelayMechanism(configuration);
+            case ADAPTIVE:
+                AdaptiveCrawlDelayMechanism adaptiveCrawlDelay = new AdaptiveCrawlDelayMechanism(configuration, (JavascriptExecutor) webDriver);
+                if (!adaptiveCrawlDelay.isBrowserCompatible()) {
+                    throw new UnsupportedOperationException("The Navigation Timing API is not supported by the browser.");
+                }
+
+                return adaptiveCrawlDelay;
+        }
+
+        throw new IllegalArgumentException("Unsupported crawl delay strategy.");
+    }
+
     /**
      * Delays the next request.
      */
     private void performDelay() {
         try {
-            TimeUnit.MILLISECONDS.sleep(crawlDelay.getDelay());
+            TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay());
         } catch (InterruptedException ex) {
             Thread.currentThread().interrupt();
             stopCrawling = true;
diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
index 26ff078..34f26f0 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
@@ -26,7 +26,6 @@
  * future. The reason why it is not sure that it will be processed is because it
  * might get filtered out by one of the enabled filters.
  *
- * @author Krisztian Mozsi
  * @author Peter Bencze
  */
 public final class CrawlRequest implements Serializable {
@@ -80,7 +79,7 @@ public Optional getMetadata() {
     }
 
     public static final class CrawlRequestBuilder {
-
+
         private static final int DEFAULT_PRIORITY = 0;
 
         private final URL requestUrl;
@@ -90,10 +89,11 @@ public static final class CrawlRequestBuilder {
         private Serializable metadata;
 
         /**
-         * Constructs a CrawlRequestBuilder instance that can be used to create
-         * CrawRequest instances.
+         * Constructs a CrawlRequestBuilder instance that can be
+         * used to create CrawlRequest instances.
          *
-         * @param requestUrl The request's URL given as a URL instance
+         * @param requestUrl The request's URL given as a URL
+         * instance
          */
         public CrawlRequestBuilder(final URL requestUrl) {
             this.requestUrl = requestUrl;
@@ -112,10 +112,11 @@ public CrawlRequestBuilder(final URL requestUrl) {
         }
 
         /**
-         * Constructs a CrawlRequestBuilder instance that can be used to create
-         * CrawRequest instances.
+         * Constructs a CrawlRequestBuilder instance that can be
+         * used to create CrawlRequest instances.
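// Illustrative sketch, not part of the patch: selecting the adaptive strategy
// handled by the switch above. If the browser lacks the Navigation Timing API,
// start() now fails fast with an UnsupportedOperationException.
configurator.setCrawlDelayStrategy(CrawlDelayStrategy.ADAPTIVE);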
* - * @param requestUrl The request's URL given as a String instance + * @param requestUrl The request's URL given as a String + * instance */ public CrawlRequestBuilder(final String requestUrl) { this(getUrlFromString(requestUrl)); @@ -126,7 +127,7 @@ public CrawlRequestBuilder(final String requestUrl) { * * @param priority The priority of the request (higher number means * higher priority) - * @return The builder instance + * @return The CrawlRequestBuilder instance */ public CrawlRequestBuilder setPriority(final int priority) { this.priority = priority; @@ -138,7 +139,7 @@ public CrawlRequestBuilder setPriority(final int priority) { * when the crawler processed the request. * * @param metadata The metadata associated with the request - * @return The builder instance + * @return The CrawlRequestBuilder instance */ public CrawlRequestBuilder setMetadata(final Serializable metadata) { this.metadata = metadata; @@ -146,21 +147,21 @@ public CrawlRequestBuilder setMetadata(final Serializable metadata) { } /** - * Builds the specified CrawlRequest instance. + * Builds the configured CrawlRequest instance. * - * @return The specified CrawlRequest instance + * @return The configured CrawlRequest instance */ public CrawlRequest build() { return new CrawlRequest(this); } /** - * Constructs a URL instance based on the specified URL string. Since - * call to this must be the first statement in a constructor, this - * method is necessary for the conversion to be made. + * Constructs a URL instance based on the specified URL + * string. Since call to this must be the first statement in a + * constructor, this method is necessary for the conversion to be made. * - * @param requestUrl The request URL as String - * @return The request URL + * @param requestUrl The request URL as String + * @return The URL instance */ private static URL getUrlFromString(final String requestUrl) { try { diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java index 4138abb..a7be956 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java @@ -46,9 +46,9 @@ public HttpHeadResponse getHttpHeadResponse() { } /** - * Returns the WebDriver instance for the browser. + * Returns the WebDriver instance for the browser. * - * @return The WebDriver instance + * @return The WebDriver instance */ public WebDriver getWebDriver() { return webDriver; diff --git a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java index 847b281..93f2aed 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java @@ -39,7 +39,7 @@ public HttpHeadResponse(final HttpResponse response) { * Checks if a certain header is present in this message. * * @param name The name of the header - * @return True if it is present, false otherwise + * @return true if present, false otherwise */ public boolean containsHeader(final String name) { return response.containsHeader(name); @@ -48,7 +48,7 @@ public boolean containsHeader(final String name) { /** * Returns all the headers of this response. 
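// Illustrative sketch, not part of the patch: the fluent builder documented
// above in action.
CrawlRequest request = new CrawlRequestBuilder("http://example.com")
        .setPriority(1)              // higher number means higher priority
        .setMetadata("landing page") // any Serializable value
        .build();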
      *
-     * @return All the headers
+     * @return The array of headers
      */
     public Header[] getAllHeaders() {
         return response.getAllHeaders();
@@ -68,7 +68,7 @@ public Header getFirstHeader(final String name) {
      * Returns all the headers with a specified name of this response.
      *
      * @param name The name of the headers
-     * @return All the headers
+     * @return The array of headers
      */
     public Header[] getHeaders(final String name) {
         return response.getHeaders(name);
diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
index f809a6a..12c67cc 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
@@ -35,9 +35,10 @@ private UnsuccessfulRequest(final UnsuccessfulRequestBuilder builder) {
     }
 
     /**
-     * Returns the exception that was thrown.
+     * Returns the exception that was thrown while trying to fulfill the
+     * request.
      *
-     * @return The thrown exception
+     * @return The IOException instance
      */
     public IOException getException() {
         return exception;
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java
similarity index 61%
rename from src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java
rename to src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java
index 9fd9e9a..ac8913e 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelay.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java
@@ -18,30 +18,30 @@ import org.openqa.selenium.JavascriptExecutor;
 
 /**
- * A type of crawl delay, in which case the delay corresponds to the page
- * loading time, if it's between the specified range, otherwise the minimum or
+ * A crawl delay mechanism, in which case the delay corresponds to the page
+ * loading time, if it is within the specified range, otherwise the minimum or
  * maximum duration is used.
  *
  * @author Peter Bencze
 */
-public final class AdaptiveCrawlDelay implements CrawlDelay {
+public final class AdaptiveCrawlDelayMechanism implements CrawlDelayMechanism {
 
     private final long minDelayInMillis;
     private final long maxDelayInMillis;
-    private final JavascriptExecutor javascriptExecutor;
+    private final JavascriptExecutor jsExecutor;
 
     /**
-     * Constructs a new AdaptiveCrawlDelay instance.
+     * Constructs a new AdaptiveCrawlDelayMechanism instance.
      *
-     * @param config A CrawlerConfiguration instance which
+     * @param configuration The CrawlerConfiguration instance which
      * specifies the minimum and maximum delay.
-     * @param javascriptExecutor A WebDriver instance which is
-     * capable of executing JavaScript.
+     * @param jsExecutor The WebDriver instance which is capable of
+     * executing JavaScript.
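// Illustrative sketch, not part of the patch: the getDelay() implementation
// below amounts to clamping the measured page load time into the configured
// range; the three variables here are stand-ins for the values involved.
long delay = Math.min(Math.max(loadTimeInMillis, minDelayInMillis), maxDelayInMillis);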
*/ - public AdaptiveCrawlDelay(final CrawlerConfiguration config, final JavascriptExecutor javascriptExecutor) { - minDelayInMillis = config.getMinimumCrawlDelayDurationInMillis(); - maxDelayInMillis = config.getMaximumCrawlDelayDurationInMillis(); - this.javascriptExecutor = javascriptExecutor; + public AdaptiveCrawlDelayMechanism(final CrawlerConfiguration configuration, final JavascriptExecutor jsExecutor) { + minDelayInMillis = configuration.getMinimumCrawlDelayDurationInMillis(); + maxDelayInMillis = configuration.getMaximumCrawlDelayDurationInMillis(); + this.jsExecutor = jsExecutor; } /** @@ -51,7 +51,7 @@ public AdaptiveCrawlDelay(final CrawlerConfiguration config, final JavascriptExe * false otherwise */ public boolean isBrowserCompatible() { - return (boolean) javascriptExecutor.executeScript("return ('performance' in window) && ('timing' in window.performance)"); + return (boolean) jsExecutor.executeScript("return ('performance' in window) && ('timing' in window.performance)"); } /** @@ -64,7 +64,7 @@ public boolean isBrowserCompatible() { */ @Override public long getDelay() { - long delayInMillis = (long) javascriptExecutor.executeScript("return performance.timing.loadEventEnd - performance.timing.navigationStart;"); + long delayInMillis = (long) jsExecutor.executeScript("return performance.timing.loadEventEnd - performance.timing.navigationStart;"); if (delayInMillis < minDelayInMillis) { return minDelayInMillis; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java index 28af583..9ca1d75 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java @@ -57,7 +57,7 @@ public final int getCrawlDepth() { /** * Returns the crawl request that was processed by the crawler. * - * @return The processed crawl request + * @return The processed CrawlRequest instance */ public final CrawlRequest getCrawlRequest() { return crawlRequest; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java index 8d599ab..49b0721 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java @@ -85,7 +85,7 @@ public int getPriority() { /** * Returns the crawl request from which this candidate was constructed. * - * @return The crawl request + * @return The CrawlRequest instance */ public CrawlRequest getCrawlRequest() { return crawlRequest; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java deleted file mode 100644 index 97e78d1..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayFactory.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2018 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.github.peterbencze.serritor.internal; - -import com.github.peterbencze.serritor.api.CrawlDelayStrategy; -import org.openqa.selenium.JavascriptExecutor; - -/** - * Factory class which is used to construct the required crawl delay instance - * specified in the configuration. - * - * @author Peter Bencze - */ -public final class CrawlDelayFactory { - - private final CrawlerConfiguration config; - private final JavascriptExecutor javascriptExecutor; - - /** - * Constructs a new CrawlDelayFactory instance. - * - * @param config A CrawlerConfiguration instance which - * specifies the minimum and maximum delay. - * @param javascriptExecutor A WebDriver instance which is - * capable of executing JavaScript. - */ - public CrawlDelayFactory(final CrawlerConfiguration config, final JavascriptExecutor javascriptExecutor) { - this.config = config; - this.javascriptExecutor = javascriptExecutor; - } - - /** - * Constructs the specific crawl delay instance determined by the strategy. - * - * @param crawlDelayStrategy The crawl delay strategy - * @return The specific crawl delay instance - */ - public CrawlDelay getInstanceOf(final CrawlDelayStrategy crawlDelayStrategy) { - switch (crawlDelayStrategy) { - case FIXED: - return new FixedCrawlDelay(config); - case RANDOM: - return new RandomCrawlDelay(config); - case ADAPTIVE: - AdaptiveCrawlDelay adaptiveCrawlDelay = new AdaptiveCrawlDelay(config, javascriptExecutor); - if (!adaptiveCrawlDelay.isBrowserCompatible()) { - throw new UnsupportedOperationException("The Navigation Timing API is not supported by the browser."); - } - - return adaptiveCrawlDelay; - } - - throw new IllegalArgumentException("Unsupported crawl delay strategy."); - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java similarity index 83% rename from src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java rename to src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java index 652b2e9..34317b1 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java @@ -16,16 +16,16 @@ package com.github.peterbencze.serritor.internal; /** - * An interface that every type of crawl delay should implement. + * An interface that every crawl delay mechanism should implement. * * @author Peter Bencze */ -public interface CrawlDelay { +public interface CrawlDelayMechanism { /** * Returns the delay that should pass between each request. * - * @return The delay in milliseconds + * @return The duration of delay in milliseconds */ long getDelay(); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 2d58e03..58e56e2 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -35,11 +35,10 @@ * crawling. 
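// Illustrative sketch, not part of the patch: the renamed interface keeps a
// single method, so a custom mechanism stays trivial to implement.
final class NoDelayMechanism implements CrawlDelayMechanism {

    @Override
    public long getDelay() {
        return 0L; // no politeness delay between requests
    }
}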
* * @author Peter Bencze - * @author Krisztian Mozsi */ public final class CrawlFrontier implements Serializable { - private final CrawlerConfiguration config; + private final CrawlerConfiguration configuration; private final Set allowedDomains; private final Set urlFingerprints; @@ -48,17 +47,17 @@ public final class CrawlFrontier implements Serializable { private CrawlCandidate currentCandidate; - public CrawlFrontier(final CrawlerConfiguration config) { - this.config = config; + public CrawlFrontier(final CrawlerConfiguration configuration) { + this.configuration = configuration; allowedDomains = new HashSet<>(); urlFingerprints = new HashSet<>(); // Construct a priority queue according to the crawl strategy specified in the configuration - candidates = getPriorityQueue(); + candidates = createPriorityQueue(); // Feed initial crawl requests (seeds) - config.getCrawlSeeds().stream() + configuration.getCrawlSeeds().stream() .forEach((CrawlRequest request) -> { feedRequest(request, true); }); @@ -67,11 +66,12 @@ public CrawlFrontier(final CrawlerConfiguration config) { /** * Feeds a crawl request to the frontier. * - * @param request The request to be fed - * @param isCrawlSeed True if the request is a crawl seed, false otherwise + * @param request The CrawlRequest instance to be fed + * @param isCrawlSeed true if the request is a crawl seed, + * false otherwise */ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { - if (config.isOffsiteRequestFilteringEnabled()) { + if (configuration.isOffsiteRequestFilteringEnabled()) { if (isCrawlSeed) { allowedDomains.add(request.getTopPrivateDomain()); } else { @@ -81,8 +81,8 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { } } - if (config.isDuplicateRequestFilteringEnabled()) { - String urlFingerprint = getFingerprintForUrl(request.getRequestUrl()); + if (configuration.isDuplicateRequestFilteringEnabled()) { + String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); // Check if the URL has already been crawled if (urlFingerprints.contains(urlFingerprint)) { @@ -96,7 +96,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { CrawlCandidateBuilder builder; if (!isCrawlSeed) { - int crawlDepthLimit = config.getMaximumCrawlDepth(); + int crawlDepthLimit = configuration.getMaximumCrawlDepth(); int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1; // If a crawl depth limit is set, check if the candidate's crawl depth is less than or equal to the limit @@ -117,7 +117,8 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { /** * Indicates if there are any candidates left in the queue. * - * @return True if there are candidates in the queue, false otherwise + * @return true if there are candidates in the queue, + * false otherwise */ public boolean hasNextCandidate() { return !candidates.isEmpty(); @@ -126,7 +127,7 @@ public boolean hasNextCandidate() { /** * Gets the next candidate from the queue. 
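// Illustrative sketch, not part of the patch: the frontier is drained in the
// order imposed by the configured crawl strategy.
while (frontier.hasNextCandidate()) {
    CrawlCandidate candidate = frontier.getNextCandidate();
    // fetch and process candidate.getCrawlRequest() here
}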
* - * @return The next candidate + * @return The next CrawlCandidate instance */ public CrawlCandidate getNextCandidate() { currentCandidate = candidates.poll(); @@ -139,7 +140,7 @@ public CrawlCandidate getNextCandidate() { * @param url The URL that the fingerprint will be created for * @return The fingerprint of the URL */ - private String getFingerprintForUrl(final URL url) { + private static String createFingerprintForUrl(final URL url) { // First, we start off with the host only StringBuilder truncatedUrl = new StringBuilder(url.getHost()); @@ -170,11 +171,11 @@ private String getFingerprintForUrl(final URL url) { /** * Creates a new priority queue using the specified strategy. * - * @return A new PriorityQueue instance for CrawlRequests using the given - * comparator + * @return The PriorityQueue instance for crawl requests using + * the given comparator */ - private PriorityQueue getPriorityQueue() { - switch (config.getCrawlStrategy()) { + private PriorityQueue createPriorityQueue() { + switch (configuration.getCrawlStrategy()) { case BREADTH_FIRST: return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth) .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder())); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java index 378f591..e2f4527 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java @@ -27,12 +27,12 @@ * * @author Peter Bencze */ -public class CrawlerConfigurator { +public final class CrawlerConfigurator { - private final CrawlerConfiguration config; + private final CrawlerConfiguration configuration; - public CrawlerConfigurator(CrawlerConfiguration config) { - this.config = config; + public CrawlerConfigurator(CrawlerConfiguration configuration) { + this.configuration = configuration; } /** @@ -42,7 +42,7 @@ public CrawlerConfigurator(CrawlerConfiguration config) { * the crawl seed */ public void addCrawlSeed(final CrawlRequest request) { - config.addCrawlSeed(Preconditions.checkNotNull(request)); + configuration.addCrawlSeed(Preconditions.checkNotNull(request)); } /** @@ -63,7 +63,7 @@ public void addCrawlSeeds(final List requests) { * @param crawlStrategy The crawl strategy */ public void setCrawlStrategy(final CrawlStrategy crawlStrategy) { - config.setCrawlStrategy(Preconditions.checkNotNull(crawlStrategy)); + configuration.setCrawlStrategy(Preconditions.checkNotNull(crawlStrategy)); } /** @@ -73,7 +73,7 @@ public void setCrawlStrategy(final CrawlStrategy crawlStrategy) { * false means disabled */ public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) { - config.setDuplicateRequestFiltering(filterDuplicateRequests); + configuration.setDuplicateRequestFiltering(filterDuplicateRequests); } /** @@ -83,7 +83,7 @@ public void setDuplicateRequestFiltering(final boolean filterDuplicateRequests) * false means disabled */ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { - config.setOffsiteRequestFiltering(filterOffsiteRequests); + configuration.setOffsiteRequestFiltering(filterOffsiteRequests); } /** @@ -95,7 +95,7 @@ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { public void setMaximumCrawlDepth(final int maxCrawlDepth) { Preconditions.checkArgument(maxCrawlDepth >= 0, "The 
maximum crawl depth cannot be negative."); - config.setMaximumCrawlDepth(maxCrawlDepth); + configuration.setMaximumCrawlDepth(maxCrawlDepth); } /** @@ -104,7 +104,7 @@ public void setMaximumCrawlDepth(final int maxCrawlDepth) { * @param crawlDelayStrategy The crawl delay strategy */ public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { - config.setCrawlDelayStrategy(Preconditions.checkNotNull(crawlDelayStrategy)); + configuration.setCrawlDelayStrategy(Preconditions.checkNotNull(crawlDelayStrategy)); } /** @@ -113,7 +113,7 @@ public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { * @param fixedCrawlDelayDuration The duration of delay */ public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { - config.setFixedCrawlDelayDurationInMillis(fixedCrawlDelayDuration.toMillis()); + configuration.setFixedCrawlDelayDurationInMillis(fixedCrawlDelayDuration.toMillis()); } /** @@ -125,11 +125,11 @@ public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { Preconditions.checkArgument(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative."); long minCrawlDelayDurationInMillis = minCrawlDelayDuration.toMillis(); - long maxCrawlDelayInMillis = config.getMaximumCrawlDelayDurationInMillis(); + long maxCrawlDelayInMillis = configuration.getMaximumCrawlDelayDurationInMillis(); Preconditions.checkArgument(minCrawlDelayDurationInMillis < maxCrawlDelayInMillis, "The minimum crawl delay should be less than the maximum."); - config.setMinimumCrawlDelayDurationInMillis(minCrawlDelayDurationInMillis); + configuration.setMinimumCrawlDelayDurationInMillis(minCrawlDelayDurationInMillis); } /** @@ -138,11 +138,11 @@ public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { * @param maxCrawlDelayDuration The maximum duration of delay */ public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { - long minCrawlDelayDurationInMillis = config.getMinimumCrawlDelayDurationInMillis(); + long minCrawlDelayDurationInMillis = configuration.getMinimumCrawlDelayDurationInMillis(); long maxCrawlDelayDurationInMillis = maxCrawlDelayDuration.toMillis(); Preconditions.checkArgument(maxCrawlDelayDurationInMillis > minCrawlDelayDurationInMillis, "The maximum crawl delay should be higher than the minimum."); - config.setMaximumCrawlDelayDuration(maxCrawlDelayDurationInMillis); + configuration.setMaximumCrawlDelayDuration(maxCrawlDelayDurationInMillis); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java similarity index 62% rename from src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java rename to src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java index 3eb0f87..0ca8307 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java @@ -16,27 +16,28 @@ package com.github.peterbencze.serritor.internal; /** - * A type of crawl delay, in which case the delay is constant and equals to the - * duration specified in the configuration. + * A crawl delay mechanism, in which case the delay is constant and equals to + * the duration specified in the configuration. 
* * @author Peter Bencze */ -public final class FixedCrawlDelay implements CrawlDelay { +public final class FixedCrawlDelayMechanism implements CrawlDelayMechanism { private final long delayInMillis; /** - * Constructs a new FixedCrawlDelay instance. - * - * @param config A CrawlerConfiguration instance which specifies the fixed delay + * Constructs a new FixedCrawlDelayMechanism instance. + * + * @param configuration The CrawlerConfiguration instance which + * specifies the fixed delay duration. */ - public FixedCrawlDelay(final CrawlerConfiguration config) { - delayInMillis = config.getFixedCrawlDelayDurationInMillis(); + public FixedCrawlDelayMechanism(final CrawlerConfiguration configuration) { + this.delayInMillis = configuration.getFixedCrawlDelayDurationInMillis(); } /** * Returns the fixed delay specified in the configuration. - * + * * @return The delay in milliseconds */ @Override diff --git a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java similarity index 60% rename from src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java rename to src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java index 6c16073..13b33b0 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelay.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java @@ -18,25 +18,25 @@ import java.util.concurrent.ThreadLocalRandom; /** - * A type of crawl delay in which case the duration is randomized between the + * A crawl delay mechanism in which case the duration is randomized between the * specified minimum and maximum range. * * @author Peter Bencze */ -public final class RandomCrawlDelay implements CrawlDelay { +public final class RandomCrawlDelayMechanism implements CrawlDelayMechanism { - private final long origin; - private final long bound; + private final long lowerLimit; + private final long upperLimit; /** - * Constructs a new RandomCrawlDelay instance. + * Constructs a new RandomCrawlDelayMechanism instance. * - * @param config A CrawlerConfiguration instance which + * @param configuration The CrawlerConfiguration instance which * specifies the minimum and maximum delay. */ - public RandomCrawlDelay(final CrawlerConfiguration config) { - origin = config.getMinimumCrawlDelayDurationInMillis(); - bound = config.getMaximumCrawlDelayDurationInMillis() + 1; + public RandomCrawlDelayMechanism(final CrawlerConfiguration configuration) { + lowerLimit = configuration.getMinimumCrawlDelayDurationInMillis(); + upperLimit = configuration.getMaximumCrawlDelayDurationInMillis() + 1; } /** @@ -47,6 +47,6 @@ public RandomCrawlDelay(final CrawlerConfiguration config) { */ @Override public long getDelay() { - return ThreadLocalRandom.current().nextLong(origin, bound); + return ThreadLocalRandom.current().nextLong(lowerLimit, upperLimit); } } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 6cab603..047e202 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -30,7 +30,6 @@ /** * Test cases for CrawlFrontier. 
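One detail in the random delay mechanism above deserves a note: ThreadLocalRandom.nextLong(origin, bound) treats the bound as exclusive, which is why the constructor adds 1 to the configured maximum. A tiny illustration (java.util.concurrent.ThreadLocalRandom; the values are example milliseconds):

    long lowerLimit = 1000;      // configured minimum delay
    long upperLimit = 3000 + 1;  // configured maximum delay + 1 makes it inclusive
    long delay = ThreadLocalRandom.current().nextLong(lowerLimit, upperLimit);
    // delay is uniformly distributed over 1000..3000, both endpoints included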
* - * @author Krisztian Mozsi * @author Peter Bencze */ public final class CrawlFrontierTest { From f175f90099fac0d649e0273057fca377918770ce Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 8 Mar 2018 00:40:36 +0100 Subject: [PATCH 11/24] Refactor serialization --- .../peterbencze/serritor/api/BaseCrawler.java | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 17b01c1..489a6f0 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -29,13 +29,12 @@ import com.github.peterbencze.serritor.internal.RandomCrawlDelayMechanism; import java.io.IOException; import java.io.InputStream; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; import java.io.OutputStream; import java.net.URI; import java.net.URL; import java.util.List; import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.SerializationUtils; import org.apache.http.Header; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; @@ -133,29 +132,23 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { * Saves the current state of the crawler to the specified output stream. * * @param out The OutputStream instance to use - * @throws IOException Any exception thrown by the underlying - * OutputStream. */ - public final void saveState(final OutputStream out) throws IOException { - // Check if the crawler has been started, otherwise we have nothing to save + public final void saveState(final OutputStream out) { + // Check if the crawler has been started at least once, otherwise we have nothing to save if (crawlFrontier == null) { throw new IllegalStateException("No state to save."); } - // Save the frontier's current state - ObjectOutputStream objectOutputStream = new ObjectOutputStream(out); - objectOutputStream.writeObject(crawlFrontier); + // Save the crawl frontier's current state + SerializationUtils.serialize(crawlFrontier, out); } /** * Resumes a previously saved state using HtmlUnit headless browser. * * @param in The InputStream instance to use - * @throws IOException Any of the usual input/output related exceptions. - * @throws ClassNotFoundException Class of a serialized object cannot be - * found. */ - public final void resumeState(final InputStream in) throws IOException, ClassNotFoundException { + public final void resumeState(final InputStream in) { resumeState(new HtmlUnitDriver(true), in); } @@ -166,13 +159,10 @@ public final void resumeState(final InputStream in) throws IOException, ClassNot * @param driver The WebDriver instance to be used by the * crawler * @param in The InputStream instance to use - * @throws IOException Any of the usual input/output related exceptions. - * @throws ClassNotFoundException Class of a serialized object cannot be - * found. 
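Putting the new serialization API together, usage might look like the following sketch (MyCrawler and the file name are hypothetical; Files and Paths come from java.nio.file, and HtmlUnitDriver is the Selenium driver already used above):

    MyCrawler crawler = new MyCrawler();     // some BaseCrawler subclass
    crawler.start(new HtmlUnitDriver(true)); // runs until the frontier is exhausted or stop() is called

    try (OutputStream out = Files.newOutputStream(Paths.get("crawler-state.bin"))) {
        crawler.saveState(out);              // serializes the crawl frontier
    }

    try (InputStream in = Files.newInputStream(Paths.get("crawler-state.bin"))) {
        crawler.resumeState(in);             // deserializes it and restarts with HtmlUnit
    }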
*/ - public final void resumeState(final WebDriver driver, final InputStream in) throws IOException, ClassNotFoundException { - ObjectInputStream objectInputStream = new ObjectInputStream(in); - CrawlFrontier frontierToUse = (CrawlFrontier) objectInputStream.readObject(); + public final void resumeState(final WebDriver driver, final InputStream in) { + // Re-create crawl frontier from the saved state + CrawlFrontier frontierToUse = SerializationUtils.deserialize(in); start(driver, frontierToUse); } From c481cd46abf7982e9ec26bb0ce7113f5ec580a50 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 8 Mar 2018 16:10:23 +0100 Subject: [PATCH 12/24] Add the possibility of specifying allowed crawl domains --- .../serritor/api/CrawlRequest.java | 37 ++++---- .../serritor/internal/CrawlCandidate.java | 9 +- .../serritor/internal/CrawlDomain.java | 91 +++++++++++++++++++ .../serritor/internal/CrawlFrontier.java | 27 ++++-- .../internal/CrawlerConfiguration.java | 23 +++++ .../internal/CrawlerConfigurator.java | 23 +++++ .../serritor/internal/CrawlDomainTest.java | 59 ++++++++++++ .../serritor/internal/CrawlFrontierTest.java | 31 +++++-- 8 files changed, 260 insertions(+), 40 deletions(-) create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java create mode 100644 src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 34f26f0..5cb9a23 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -16,6 +16,8 @@ package com.github.peterbencze.serritor.api; import com.google.common.net.InternetDomainName; +import java.io.IOException; +import java.io.ObjectInputStream; import java.io.Serializable; import java.net.MalformedURLException; import java.net.URL; @@ -31,13 +33,14 @@ public final class CrawlRequest implements Serializable { private final URL requestUrl; - private final String topPrivateDomain; private final int priority; private final Serializable metadata; + + private transient InternetDomainName domain; private CrawlRequest(final CrawlRequestBuilder builder) { requestUrl = builder.requestUrl; - topPrivateDomain = builder.topPrivateDomain; + domain = builder.domain; priority = builder.priority; metadata = builder.metadata; } @@ -52,12 +55,12 @@ public URL getRequestUrl() { } /** - * Returns the top private domain of the request's URL. + * Returns the domain of the request's URL. 
* - * @return The top private domain of the URL + * @return The domain of the request URL */ - public String getTopPrivateDomain() { - return topPrivateDomain; + public InternetDomainName getDomain() { + return domain; } /** @@ -83,8 +86,8 @@ public static final class CrawlRequestBuilder { private static final int DEFAULT_PRIORITY = 0; private final URL requestUrl; - - private String topPrivateDomain; + private final InternetDomainName domain; + private int priority; private Serializable metadata; @@ -98,14 +101,8 @@ public static final class CrawlRequestBuilder { public CrawlRequestBuilder(final URL requestUrl) { this.requestUrl = requestUrl; - // Extract the top private domain from the request URL - try { - topPrivateDomain = InternetDomainName.from(requestUrl.getHost()) - .topPrivateDomain() - .toString(); - } catch (IllegalStateException ex) { - throw new IllegalArgumentException(String.format("The top private domain cannot be extracted from the given request URL (\"%s\").", requestUrl), ex); - } + // Extract the domain from the request URL + domain = InternetDomainName.from(requestUrl.getHost()); // Set default priority priority = DEFAULT_PRIORITY; @@ -167,8 +164,14 @@ private static URL getUrlFromString(final String requestUrl) { try { return new URL(requestUrl); } catch (MalformedURLException ex) { - throw new IllegalArgumentException(String.format("The given request URL (\"%s\") is malformed.", requestUrl), ex); + throw new IllegalArgumentException(String.format("The URL (\"%s\") is malformed.", requestUrl), ex); } } } + + private void readObject(final ObjectInputStream in) throws IOException, ClassNotFoundException { + in.defaultReadObject(); + + domain = InternetDomainName.from(requestUrl.getHost()); + } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java index 49b0721..7a4acbd 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java @@ -16,6 +16,7 @@ package com.github.peterbencze.serritor.internal; import com.github.peterbencze.serritor.api.CrawlRequest; +import com.google.common.net.InternetDomainName; import java.io.Serializable; import java.net.URL; @@ -56,12 +57,12 @@ public URL getCandidateUrl() { } /** - * Returns the top private domain of the candidate's URL. + * Returns the domain of the candidate's URL. * - * @return The top private domain of the URL + * @return The domain of the candidate URL */ - public String getTopPrivateDomain() { - return crawlRequest.getTopPrivateDomain(); + public InternetDomainName getDomain() { + return crawlRequest.getDomain(); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java new file mode 100644 index 0000000..89bba42 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java @@ -0,0 +1,91 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.internal; + +import com.google.common.collect.ImmutableList; +import com.google.common.net.InternetDomainName; +import java.io.Serializable; + +/** + * Represents an internet domain in which crawling is allowed. + * + * @author Peter Bencze + */ +public final class CrawlDomain implements Serializable { + + private final ImmutableList parts; + + /** + * Constructs a new CrawlDomain instance. + * + * @param domain An immutable well-formed internet domain name + */ + public CrawlDomain(final InternetDomainName domain) { + parts = domain.parts(); + } + + /** + * Indicates if two CrawlDomain instances are equal or not. + * Crawl domains with the same domain name are considered equal. + * + * @param obj A CrawlDomain instance + * @return true if equal, false otherwise + */ + @Override + public boolean equals(final Object obj) { + if (obj == this) { + return true; + } + + if (obj instanceof CrawlDomain) { + CrawlDomain other = (CrawlDomain) obj; + return parts.equals(other.parts); + } + + return false; + } + + /** + * Calculates the hash code from the individual components of the domain + * name. + * + * @return The hash code for the crawl domain + */ + @Override + public int hashCode() { + return parts.hashCode(); + } + + /** + * Indicates if this crawl domain contains the specific internet domain. 
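Concretely, the suffix comparison implemented just below works on the reversed domain-name parts (Guava's InternetDomainName.parts()), so a crawl domain contains itself and every subdomain of itself, but never its parent. A worked example:

    CrawlDomain crawlDomain = new CrawlDomain(InternetDomainName.from("example.com"));

    // reversed parts ["com", "example"] prefix-match ["com", "example", "sub"]
    boolean containsSub = crawlDomain.contains(InternetDomainName.from("sub.example.com")); // true

    // the size check rejects the reverse direction: 3 parts cannot be a suffix of 2
    boolean parentContained = new CrawlDomain(InternetDomainName.from("sub.example.com"))
            .contains(InternetDomainName.from("example.com")); // false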
+ * + * @param domain An immutable well-formed internet domain name + * @return true if belongs, false otherwise + */ + public boolean contains(final InternetDomainName domain) { + ImmutableList otherDomainParts = domain.parts(); + + if (parts.size() > otherDomainParts.size()) { + return false; + } + + otherDomainParts = otherDomainParts.reverse() + .subList(0, parts.size()); + + return parts.reverse() + .equals(otherDomainParts); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 58e56e2..5fa6283 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -40,7 +40,7 @@ public final class CrawlFrontier implements Serializable { private final CrawlerConfiguration configuration; - private final Set allowedDomains; + private final Set allowedCrawlDomains; private final Set urlFingerprints; private final Queue candidates; @@ -50,7 +50,8 @@ public final class CrawlFrontier implements Serializable { public CrawlFrontier(final CrawlerConfiguration configuration) { this.configuration = configuration; - allowedDomains = new HashSet<>(); + allowedCrawlDomains = configuration.getAllowedCrawlDomains(); + urlFingerprints = new HashSet<>(); // Construct a priority queue according to the crawl strategy specified in the configuration @@ -72,24 +73,32 @@ public CrawlFrontier(final CrawlerConfiguration configuration) { */ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { if (configuration.isOffsiteRequestFilteringEnabled()) { - if (isCrawlSeed) { - allowedDomains.add(request.getTopPrivateDomain()); - } else { - if (!allowedDomains.contains(request.getTopPrivateDomain())) { - return; + // Check if the request's domain is in the allowed crawl domains + + boolean inCrawlDomain = false; + + for (CrawlDomain allowedCrawlDomain : allowedCrawlDomains) { + if (allowedCrawlDomain.contains(request.getDomain())) { + inCrawlDomain = true; + break; } } + + if (!inCrawlDomain) { + return; + } } if (configuration.isDuplicateRequestFilteringEnabled()) { + // Check if the URL has already been crawled + String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); - // Check if the URL has already been crawled + if (urlFingerprints.contains(urlFingerprint)) { return; } - // If not, add its fingerprint to the set of URL fingerprints urlFingerprints.add(urlFingerprint); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java index 1fa666f..303bc52 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java @@ -21,7 +21,9 @@ import java.io.Serializable; import java.time.Duration; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; /** * This class contains the settings of the crawler. 
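The imperative loop in feedRequest above is equivalent to a short stream pipeline; a sketch of the same check, should a more declarative style be preferred:

    boolean inCrawlDomain = allowedCrawlDomains.stream()
            .anyMatch(allowedCrawlDomain -> allowedCrawlDomain.contains(request.getDomain()));

    if (!inCrawlDomain) {
        return;
    }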
@@ -39,6 +41,7 @@ public final class CrawlerConfiguration implements Serializable { private static final long DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS = Duration.ofSeconds(1).toMillis(); private static final long DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS = Duration.ofMinutes(1).toMillis(); + private final Set allowedCrawlDomains; private final List crawlSeeds; private CrawlStrategy crawlStrategy; @@ -53,6 +56,7 @@ public final class CrawlerConfiguration implements Serializable { public CrawlerConfiguration() { // Initialize configuration with default values + allowedCrawlDomains = new HashSet<>(); crawlSeeds = new ArrayList<>(); crawlStrategy = DEFAULT_CRAWL_STRATEGY; filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT; @@ -64,6 +68,25 @@ public CrawlerConfiguration() { maxCrawlDelayDurationInMillis = DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS; } + /** + * Returns the set of allowed crawl domains. + * + * @return The set of allowed crawl domains + */ + public Set getAllowedCrawlDomains() { + return allowedCrawlDomains; + } + + /** + * Appends a crawl domain to the list of allowed ones. + * + * @param allowedCrawlDomain The CrawlDomain instance which + * represents the allowed crawl domain + */ + public void addAllowedCrawlDomain(CrawlDomain allowedCrawlDomain) { + allowedCrawlDomains.add(allowedCrawlDomain); + } + /** * Returns the list of crawl seeds. * diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java index e2f4527..5c4a4df 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java @@ -19,6 +19,7 @@ import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlStrategy; import com.google.common.base.Preconditions; +import com.google.common.net.InternetDomainName; import java.time.Duration; import java.util.List; @@ -35,6 +36,28 @@ public CrawlerConfigurator(CrawlerConfiguration configuration) { this.configuration = configuration; } + /** + * Appends an internet domain to the list of allowed crawl domains. + * + * @param allowedCrawlDomain A well-formed internet domain name + */ + public void addAllowedCrawlDomain(final String allowedCrawlDomain) { + InternetDomainName domain = InternetDomainName.from(allowedCrawlDomain); + + Preconditions.checkArgument(domain.isUnderPublicSuffix(), String.format("The domain (\"%s\") is not under public suffix.", allowedCrawlDomain)); + + configuration.addAllowedCrawlDomain(new CrawlDomain(domain)); + } + + /** + * Appends a list of internet domains to the list of allowed crawl domains. + * + * @param allowedCrawlDomains A list of well-formed internet domain names + */ + public void addAllowedCrawlDomains(final List allowedCrawlDomains) { + allowedCrawlDomains.forEach(this::addAllowedCrawlDomain); + } + /** * Appends a crawl request to the list of crawl seeds. * diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java new file mode 100644 index 0000000..3148b0c --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java @@ -0,0 +1,59 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
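The isUnderPublicSuffix check above means only registrable domains (and their subdomains) are accepted, while a bare public suffix is rejected with an IllegalArgumentException. For example, with the configurator wiring from the earlier sketch:

    configurator.addAllowedCrawlDomain("example.com");     // accepted: under the "com" public suffix
    configurator.addAllowedCrawlDomain("sub.example.com"); // accepted: subdomains qualify too
    configurator.addAllowedCrawlDomain("com");             // throws IllegalArgumentException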
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.internal; + +import com.google.common.net.InternetDomainName; +import org.junit.Assert; +import org.junit.Test; + +/** + * Test cases for CrawlDomain. + * + * @author Peter Bencze + */ +public class CrawlDomainTest { + + private static final InternetDomainName DOMAIN = InternetDomainName.from("test.com"); + private static final InternetDomainName SUBDOMAIN = InternetDomainName.from("sub.test.com"); + + private static final CrawlDomain CRAWL_DOMAIN_0 = new CrawlDomain(DOMAIN); + private static final CrawlDomain CRAWL_DOMAIN_1 = new CrawlDomain(DOMAIN); + private static final CrawlDomain CRAWL_DOMAIN_2 = new CrawlDomain(SUBDOMAIN); + + @Test + public void testEquals() { + // A crawl domain should be equal with itself + Assert.assertTrue(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_0)); + + // Crawl domains with the same domain should be equal + Assert.assertTrue(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_1)); + + // Crawl domains with different domains should not be equal + Assert.assertFalse(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_2)); + } + + @Test + public void testContains() { + // A crawl domain should contain its own domain + Assert.assertTrue(CRAWL_DOMAIN_0.contains(DOMAIN)); + + // A crawl domain should contain its own domain's subdomain + Assert.assertTrue(CRAWL_DOMAIN_0.contains(SUBDOMAIN)); + + // A crawl domain should not contain a domain different from its own domain + Assert.assertFalse(CRAWL_DOMAIN_2.contains(DOMAIN)); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 047e202..74854bf 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -18,6 +18,7 @@ import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; import com.github.peterbencze.serritor.api.CrawlStrategy; +import com.google.common.net.InternetDomainName; import java.net.MalformedURLException; import java.net.URL; import java.util.Arrays; @@ -34,6 +35,10 @@ */ public final class CrawlFrontierTest { + // Allowed crawl domains + private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_0 = new CrawlDomain(InternetDomainName.from("root_url_0.com")); + private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_1 = new CrawlDomain(InternetDomainName.from("root_url_1.com")); + // Root URLs private static final URL ROOT_URL_0; private static final URL ROOT_URL_1; @@ -126,19 +131,25 @@ public final class CrawlFrontierTest { .build(); } - private CrawlerConfiguration config; + private CrawlerConfiguration configuration; private CrawlFrontier frontier; @Before public void initialize() { - config = new CrawlerConfiguration(); + configuration = new CrawlerConfiguration(); + + configuration.setOffsiteRequestFiltering(true); + + // Add allowed crawl domains + Arrays.asList(ALLOWED_CRAWL_DOMAIN_0, ALLOWED_CRAWL_DOMAIN_1) + .forEach(configuration::addAllowedCrawlDomain); - 
config.setOffsiteRequestFiltering(true); Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST) - .forEach(config::addCrawlSeed); + // Add crawl seeds + Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST) + .forEach(configuration::addCrawlSeed); // Create frontier - frontier = new CrawlFrontier(config); + frontier = new CrawlFrontier(configuration); } @Test @@ -216,7 +227,7 @@ public void getNextRequestWithOffsiteRequestFilterTest() { @Test public void getNextRequestWithoutDuplicateRequestFilterTest() { // Turn off duplicate request filtering - config.setDuplicateRequestFiltering(false); + configuration.setDuplicateRequestFiltering(false); // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); @@ -234,7 +245,7 @@ public void getNextRequestWithoutDuplicateRequestFilterTest() { @Test public void getNextRequestWithoutOffsiteRequestFilterTest() { // Turn off offsite request filtering - config.setOffsiteRequestFiltering(false); + configuration.setOffsiteRequestFiltering(false); // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); @@ -326,8 +337,8 @@ public void getNextRequestBreadthFirstTest() { @Test public void getNextRequestDepthFirstTest() { // Set the crawl strategy to depth-first - config.setCrawlStrategy(CrawlStrategy.DEPTH_FIRST); - frontier = new CrawlFrontier(config); + configuration.setCrawlStrategy(CrawlStrategy.DEPTH_FIRST); + frontier = new CrawlFrontier(configuration); // Get the crawl candidate of root URL 1 CrawlCandidate nextCandidate = frontier.getNextCandidate(); @@ -404,7 +415,7 @@ public void getNextRequestDepthFirstTest() { @Test public void maxCrawlDepthTest() { // Set max crawl depth - config.setMaximumCrawlDepth(MAX_CRAWL_DEPTH); + configuration.setMaximumCrawlDepth(MAX_CRAWL_DEPTH); // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); From 4c78f444ce546cc9d01dc80a81ef73d2f75d68db Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 8 Mar 2018 21:38:43 +0100 Subject: [PATCH 13/24] Refactor assertions --- .../peterbencze/serritor/internal/CrawlDomainTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java index 3148b0c..c420965 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java @@ -36,13 +36,13 @@ public class CrawlDomainTest { @Test public void testEquals() { // A crawl domain should be equal with itself - Assert.assertTrue(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_0)); + Assert.assertEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_0); // Crawl domains with the same domain should be equal - Assert.assertTrue(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_1)); + Assert.assertEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_1); // Crawl domains with different domains should not be equal - Assert.assertFalse(CRAWL_DOMAIN_0.equals(CRAWL_DOMAIN_2)); + Assert.assertNotEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_2); } @Test From e74679e46ed139c953e8c89a274fe743b9130504 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sat, 10 Mar 2018 19:17:26 +0100 Subject: [PATCH 14/24] Remove static import --- .../peterbencze/serritor/internal/CrawlFrontier.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 5fa6283..350a6ad 100644 --- 
a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -21,7 +21,6 @@ import java.net.URL; import java.util.Arrays; import java.util.Comparator; -import static java.util.Comparator.reverseOrder; import java.util.HashSet; import java.util.List; import java.util.PriorityQueue; @@ -187,10 +186,10 @@ private PriorityQueue createPriorityQueue() { switch (configuration.getCrawlStrategy()) { case BREADTH_FIRST: return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth) - .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder())); + .thenComparing((Function & Serializable) CrawlCandidate::getPriority, Comparator.reverseOrder())); case DEPTH_FIRST: - return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth, reverseOrder()) - .thenComparing((Function & Serializable) CrawlCandidate::getPriority, reverseOrder())); + return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth, Comparator.reverseOrder()) + .thenComparing((Function & Serializable) CrawlCandidate::getPriority, Comparator.reverseOrder())); } throw new IllegalArgumentException("Unsupported crawl strategy."); From 9ad94c6d0926df64edea2d4858c43def4c109d36 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Mon, 12 Mar 2018 18:53:11 +0100 Subject: [PATCH 15/24] Add validations --- pom.xml | 2 +- .../peterbencze/serritor/api/BaseCrawler.java | 30 +++++---------- .../internal/CrawlerConfigurator.java | 37 ++++++++++++------- 3 files changed, 34 insertions(+), 35 deletions(-) diff --git a/pom.xml b/pom.xml index a3ccc27..f87d10a 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 com.github.peterbencze serritor - 1.2.2 + 1.3.0 jar Serritor diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 489a6f0..4c47e45 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -35,6 +35,7 @@ import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.SerializationUtils; +import org.apache.commons.lang3.Validate; import org.apache.http.Header; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; @@ -107,20 +108,17 @@ public final void start(final WebDriver driver) { */ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { try { - // Check if the crawler is running - if (!isStopped) { - throw new IllegalStateException("The crawler is already started."); - } + Validate.validState(isStopped, "The crawler is already started."); isStopped = false; httpClient = HttpClientBuilder.create().build(); - webDriver = driver; + webDriver = Validate.notNull(driver, "The webdriver cannot be null."); crawlFrontier = frontierToUse; crawlDelayMechanism = createCrawlDelayMechanism(); run(); } finally { - // Always close the WebDriver + // Always close the browser webDriver.quit(); stopCrawling = false; @@ -135,9 +133,7 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { */ public final void saveState(final OutputStream out) { // Check if the crawler has been started at least once, otherwise we have nothing to save - if (crawlFrontier == null) { - throw new IllegalStateException("No state to save."); - 
} + Validate.validState(crawlFrontier != null, "Cannot save state at this point. The crawler should be started first."); // Save the crawl frontier's current state SerializationUtils.serialize(crawlFrontier, out); @@ -171,14 +167,8 @@ public final void resumeState(final WebDriver driver, final InputStream in) { * Stops the crawler. */ public final void stop() { - // Check if the crawler is running - if (isStopped) { - throw new IllegalStateException("The crawler is not started."); - } - - if (stopCrawling) { - throw new IllegalStateException("Stop has already been called."); - } + Validate.validState(!isStopped, "The crawler is not started."); + Validate.validState(!stopCrawling, "The stop method has already been called."); // Indicate that the crawling should be stopped stopCrawling = true; @@ -193,10 +183,8 @@ public final void stop() { * @param request The CrawlRequest instance */ protected final void crawl(final CrawlRequest request) { - // Check if the crawler is running - if (isStopped) { - throw new IllegalStateException("The crawler is not started. Maybe you meant to add this request as a crawl seed?"); - } + Validate.notNull(request, "The request cannot be null."); + Validate.validState(!isStopped, "The crawler is not started. Maybe you meant to add this request as a crawl seed?"); crawlFrontier.feedRequest(request, false); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java index 5c4a4df..594eebe 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java @@ -18,10 +18,10 @@ import com.github.peterbencze.serritor.api.CrawlDelayStrategy; import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlStrategy; -import com.google.common.base.Preconditions; import com.google.common.net.InternetDomainName; import java.time.Duration; import java.util.List; +import org.apache.commons.lang3.Validate; /** * This class provides an interface for the user to configure the crawler. @@ -44,7 +44,7 @@ public CrawlerConfigurator(CrawlerConfiguration configuration) { public void addAllowedCrawlDomain(final String allowedCrawlDomain) { InternetDomainName domain = InternetDomainName.from(allowedCrawlDomain); - Preconditions.checkArgument(domain.isUnderPublicSuffix(), String.format("The domain (\"%s\") is not under public suffix.", allowedCrawlDomain)); + Validate.isTrue(domain.isUnderPublicSuffix(), String.format("The domain (\"%s\") is not under public suffix.", allowedCrawlDomain)); configuration.addAllowedCrawlDomain(new CrawlDomain(domain)); } @@ -65,7 +65,9 @@ public void addAllowedCrawlDomains(final List allowedCrawlDomains) { * the crawl seed */ public void addCrawlSeed(final CrawlRequest request) { - configuration.addCrawlSeed(Preconditions.checkNotNull(request)); + Validate.notNull(request, "The request cannot be null."); + + configuration.addCrawlSeed(request); } /** @@ -83,10 +85,12 @@ public void addCrawlSeeds(final List requests) { * orders crawl requests by the lowest crawl depth, whereas depth-first * orders them by the highest crawl depth. 
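Combined with the priority comparators shown earlier, the two strategies order the frontier as in this worked example, with candidates written as (crawlDepth, priority):

    // candidates fed: (0, 0), (0, 1), (1, 5)
    // breadth-first polls (0, 1), (0, 0), (1, 5): lowest depth first, higher priority breaks ties
    // depth-first polls (1, 5), (0, 1), (0, 0): highest depth first, higher priority breaks ties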
* - * @param crawlStrategy The crawl strategy + * @param strategy The crawl strategy */ - public void setCrawlStrategy(final CrawlStrategy crawlStrategy) { - configuration.setCrawlStrategy(Preconditions.checkNotNull(crawlStrategy)); + public void setCrawlStrategy(final CrawlStrategy strategy) { + Validate.notNull(strategy, "The strategy cannot be null."); + + configuration.setCrawlStrategy(strategy); } /** @@ -116,7 +120,7 @@ public void setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { * @param maxCrawlDepth The maximum crawl depth */ public void setMaximumCrawlDepth(final int maxCrawlDepth) { - Preconditions.checkArgument(maxCrawlDepth >= 0, "The maximum crawl depth cannot be negative."); + Validate.isTrue(maxCrawlDepth >= 0, "The maximum crawl depth cannot be negative."); configuration.setMaximumCrawlDepth(maxCrawlDepth); } @@ -124,10 +128,12 @@ public void setMaximumCrawlDepth(final int maxCrawlDepth) { /** * Sets the crawl delay strategy to be used by the crawler. * - * @param crawlDelayStrategy The crawl delay strategy + * @param strategy The crawl delay strategy */ - public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { - configuration.setCrawlDelayStrategy(Preconditions.checkNotNull(crawlDelayStrategy)); + public void setCrawlDelayStrategy(final CrawlDelayStrategy strategy) { + Validate.notNull(strategy, "The strategy cannot be null."); + + configuration.setCrawlDelayStrategy(strategy); } /** @@ -136,6 +142,8 @@ public void setCrawlDelayStrategy(final CrawlDelayStrategy crawlDelayStrategy) { * @param fixedCrawlDelayDuration The duration of delay */ public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { + Validate.notNull(fixedCrawlDelayDuration, "The duration cannot be null."); + configuration.setFixedCrawlDelayDurationInMillis(fixedCrawlDelayDuration.toMillis()); } @@ -145,12 +153,13 @@ public void setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { * @param minCrawlDelayDuration The minimum duration of delay */ public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { - Preconditions.checkArgument(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative."); + Validate.notNull(minCrawlDelayDuration, "The duration cannot be null."); + Validate.isTrue(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative."); long minCrawlDelayDurationInMillis = minCrawlDelayDuration.toMillis(); long maxCrawlDelayInMillis = configuration.getMaximumCrawlDelayDurationInMillis(); - Preconditions.checkArgument(minCrawlDelayDurationInMillis < maxCrawlDelayInMillis, "The minimum crawl delay should be less than the maximum."); + Validate.isTrue(minCrawlDelayDurationInMillis < maxCrawlDelayInMillis, "The minimum crawl delay should be less than the maximum."); configuration.setMinimumCrawlDelayDurationInMillis(minCrawlDelayDurationInMillis); } @@ -161,10 +170,12 @@ public void setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { * @param maxCrawlDelayDuration The maximum duration of delay */ public void setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { + Validate.notNull(maxCrawlDelayDuration, "The duration cannot be null."); + long minCrawlDelayDurationInMillis = configuration.getMinimumCrawlDelayDurationInMillis(); long maxCrawlDelayDurationInMillis = maxCrawlDelayDuration.toMillis(); - Preconditions.checkArgument(maxCrawlDelayDurationInMillis > minCrawlDelayDurationInMillis, "The maximum crawl delay should be 
higher than the minimum."); + Validate.isTrue(maxCrawlDelayDurationInMillis > minCrawlDelayDurationInMillis, "The maximum crawl delay should be higher than the minimum."); configuration.setMaximumCrawlDelayDuration(maxCrawlDelayDurationInMillis); } From e157a08b8cfa60cb42abee25bdabe5904df71028 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Mon, 12 Mar 2018 20:33:54 +0100 Subject: [PATCH 16/24] Refact --- .../peterbencze/serritor/api/BaseCrawler.java | 2 +- .../serritor/internal/CrawlFrontier.java | 2 +- .../serritor/internal/CrawlerConfiguration.java | 14 ++++++-------- .../serritor/internal/CrawlerConfigurator.java | 4 ++-- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 4c47e45..598ba73 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -195,7 +195,7 @@ protected final void crawl(final CrawlRequest request) { * @param requests The list of CrawlRequest instances */ protected final void crawl(final List requests) { - requests.stream().forEach(this::crawl); + requests.forEach(this::crawl); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 350a6ad..c49f8e4 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -57,7 +57,7 @@ public CrawlFrontier(final CrawlerConfiguration configuration) { candidates = createPriorityQueue(); // Feed initial crawl requests (seeds) - configuration.getCrawlSeeds().stream() + configuration.getCrawlSeeds() .forEach((CrawlRequest request) -> { feedRequest(request, true); }); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java index 303bc52..3916c34 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfiguration.java @@ -20,9 +20,7 @@ import com.github.peterbencze.serritor.api.CrawlStrategy; import java.io.Serializable; import java.time.Duration; -import java.util.ArrayList; import java.util.HashSet; -import java.util.List; import java.util.Set; /** @@ -42,7 +40,7 @@ public final class CrawlerConfiguration implements Serializable { private static final long DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS = Duration.ofMinutes(1).toMillis(); private final Set allowedCrawlDomains; - private final List crawlSeeds; + private final Set crawlSeeds; private CrawlStrategy crawlStrategy; private boolean filterDuplicateRequests; @@ -57,7 +55,7 @@ public CrawlerConfiguration() { // Initialize configuration with default values allowedCrawlDomains = new HashSet<>(); - crawlSeeds = new ArrayList<>(); + crawlSeeds = new HashSet<>(); crawlStrategy = DEFAULT_CRAWL_STRATEGY; filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT; filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT; @@ -88,16 +86,16 @@ public void addAllowedCrawlDomain(CrawlDomain allowedCrawlDomain) { } /** - * Returns the list of crawl seeds. + * Returns the set of crawl seeds. 
* - * @return The list of crawl seeds + * @return The set of crawl seeds */ - public List getCrawlSeeds() { + public Set getCrawlSeeds() { return crawlSeeds; } /** - * Appends a crawl request to the list of crawl seeds. + * Appends a crawl request to the set of crawl seeds. * * @param request The CrawlRequest instance which represents * the crawl seed diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java index 594eebe..be6c6f4 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlerConfigurator.java @@ -59,7 +59,7 @@ public void addAllowedCrawlDomains(final List allowedCrawlDomains) { } /** - * Appends a crawl request to the list of crawl seeds. + * Appends a crawl request to the set of crawl seeds. * * @param request The CrawlRequest instance which represents * the crawl seed @@ -71,7 +71,7 @@ public void addCrawlSeed(final CrawlRequest request) { } /** - * Appends a list of crawl requests to the list of crawl seeds. + * Appends a list of crawl requests to the set of crawl seeds. * * @param requests The list of CrawlRequest instances which * represent the crawl seeds From 7214dc7eb2f39b7abf6036c9d1cc3cf60073f2df Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 13 Mar 2018 02:17:52 +0100 Subject: [PATCH 17/24] Add UrlFinder helper class --- .../serritor/api/helper/UrlFinder.java | 189 ++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java new file mode 100644 index 0000000..2bcbe83 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -0,0 +1,189 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.api.helper; + +import com.github.peterbencze.serritor.api.HtmlResponse; +import com.google.common.collect.Sets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.Validate; +import org.openqa.selenium.By; +import org.openqa.selenium.WebElement; + +/** + * A helper class which can be used to find URLs in HTML sources using regular + * expressions. 
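Before the implementation, a brief usage sketch (the pattern and the htmlResponse variable are illustrative; by default the builder scans the href attribute of anchor elements, as its constructor below shows):

    Pattern pattern = Pattern.compile("https?://example\\.com/[^\\s\"']+");
    UrlFinder urlFinder = new UrlFinderBuilder(pattern).build();

    // typically called from an HTML response callback
    List<String> foundUrls = urlFinder.findUrlsInResponse(htmlResponse);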
+ * + * @author Peter Bencze + */ +public final class UrlFinder { + + private final Set urlPatterns; + private final Set locatingMechanisms; + private final Set attributes; + + private UrlFinder(final UrlFinderBuilder builder) { + urlPatterns = builder.urlPatterns; + locatingMechanisms = builder.locatingMechanisms; + attributes = builder.attributes; + } + + /** + * Returns a list of (unvalidated) URLs found in the response's HTML source. + * + * @param response The HtmlResponse instance + * @return The list of found URLs + */ + public List findUrlsInResponse(final HtmlResponse response) { + Set foundUrls = new HashSet<>(); + + // Find elements using the specified locating mechanisms + Set extractedElements = locatingMechanisms.stream() + .map(response.getWebDriver()::findElements) + .flatMap(List::stream) + .collect(Collectors.toSet()); + + // Find URLs in the attribute values of the found elements + extractedElements.forEach((WebElement element) -> { + attributes.stream() + .map(element::getAttribute) + .filter(StringUtils::isNotBlank) + .map(this::findUrlsInAttributeValue) + .flatMap(List::stream) + .forEach(foundUrls::add); + }); + + return foundUrls.stream() + .collect(Collectors.toList()); + } + + /** + * Returns a list of (unvalidated) URLs found in the attribute's value. + * + * @param attributeValue The value of the attribute + * @return The list of found URLs + */ + private List findUrlsInAttributeValue(final String attributeValue) { + List foundUrls = new ArrayList<>(); + + urlPatterns.stream() + .map((Pattern urlPattern) -> urlPattern.matcher(attributeValue)) + .forEach((Matcher urlPatternMatcher) -> { + while (urlPatternMatcher.find()) { + String foundUrl = urlPatternMatcher.group(); + + if (StringUtils.isNotBlank(foundUrl)) { + foundUrls.add(foundUrl); + } + } + }); + + return foundUrls; + } + + public static final class UrlFinderBuilder { + + private final Set urlPatterns; + + private Set locatingMechanisms; + private Set attributes; + + /** + * Constructs a UrlFinderBuilder instance that can be used + * to create UrlFinder instances. + * + * @param urlPattern The pattern which will be used to find URLs + */ + public UrlFinderBuilder(final Pattern urlPattern) { + this(Arrays.asList(urlPattern)); + } + + /** + * Constructs a UrlFinderBuilder instance that can be used + * to create UrlFinder instances. + * + * @param urlPatterns The list of patterns which will be used to find + * URLs + */ + public UrlFinderBuilder(final List urlPatterns) { + Validate.noNullElements(urlPatterns, "URL patterns cannot be null."); + + this.urlPatterns = Sets.newHashSet(urlPatterns); + locatingMechanisms = Sets.newHashSet(By.tagName("a")); + attributes = Sets.newHashSet("href"); + } + + /** + * Sets the locating mechanism used by the finder. Only elements matched + * by the locator will be considered when searching for URLs. + * + * @param locatingMechanism The By locating mechanism + * instance + */ + public void setLocatingMechanism(final By locatingMechanism) { + setLocatingMechanisms(Arrays.asList(locatingMechanism)); + } + + /** + * Sets the locating mechanisms used by the finder. Only elements + * matched by the locators will be considered when searching for URLs. 
+ * + * @param locatingMechanisms The list of By locating + * mechanism instances + */ + public void setLocatingMechanisms(final List locatingMechanisms) { + Validate.noNullElements(locatingMechanisms, "Locating mechanisms cannot be null."); + + this.locatingMechanisms = Sets.newHashSet(locatingMechanisms); + } + + /** + * Sets which attributes to search for URLs. + * + * @param attributes The list of attribute names + */ + public void setAttributes(final List attributes) { + Validate.noNullElements(attributes, "Attributes cannot be null."); + + this.attributes = Sets.newHashSet(attributes); + } + + /** + * Sets which attribute to search for URLs. + * + * @param attribute The name of the attribute + */ + public void setAttribute(final String attribute) { + setAttributes(Arrays.asList(attribute)); + } + + /** + * Builds the configured URL finder. + * + * @return The configured UrlFinder instance + */ + public UrlFinder build() { + return new UrlFinder(this); + } + } +} From 030896bfeb947f2ebbde47985d845911f25dc31c Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 15 Mar 2018 02:53:16 +0100 Subject: [PATCH 18/24] Replace URL with URI --- .../peterbencze/serritor/api/BaseCrawler.java | 9 +-- .../serritor/api/CrawlRequest.java | 29 ++----- .../serritor/api/HtmlResponse.java | 4 +- .../serritor/api/NonHtmlResponse.java | 4 +- .../serritor/api/UnsuccessfulRequest.java | 4 +- .../serritor/internal/CallbackParameter.java | 10 +-- .../serritor/internal/CrawlCandidate.java | 12 +-- .../serritor/internal/CrawlFrontier.java | 4 +- .../serritor/internal/CrawlFrontierTest.java | 81 +++++-------------- 9 files changed, 47 insertions(+), 110 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 598ba73..97da904 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -31,7 +31,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.net.URI; -import java.net.URL; import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.SerializationUtils; @@ -208,11 +207,11 @@ private void run() { // Get the next crawl candidate from the queue CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); - URL currentCandidateUrl = currentCandidate.getCandidateUrl(); + URI currentCandidateUrl = currentCandidate.getCandidateUrl(); String currentRequestUrlAsString = currentCandidateUrl.toString(); HttpHeadResponse httpHeadResponse; - URL responseUrl = currentCandidateUrl; + URI responseUrl = currentCandidateUrl; try { HttpClientContext context = HttpClientContext.create(); @@ -223,7 +222,7 @@ private void run() { // If the request has been redirected, get the final URL List redirectLocations = context.getRedirectLocations(); if (redirectLocations != null) { - responseUrl = redirectLocations.get(redirectLocations.size() - 1).toURL(); + responseUrl = redirectLocations.get(redirectLocations.size() - 1); } } catch (IOException ex) { UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), @@ -289,7 +288,7 @@ private void run() { * @param destinationUrl The URL to crawl * @return The HTTP HEAD response */ - private HttpHeadResponse getHttpHeadResponse(final URL destinationUrl, final HttpClientContext context) throws IOException { + private HttpHeadResponse 
getHttpHeadResponse(final URI destinationUrl, final HttpClientContext context) throws IOException { HttpHead headRequest = new HttpHead(destinationUrl.toString()); HttpResponse response = httpClient.execute(headRequest, context); return new HttpHeadResponse(response); diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 5cb9a23..4188a54 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -19,8 +19,7 @@ import java.io.IOException; import java.io.ObjectInputStream; import java.io.Serializable; -import java.net.MalformedURLException; -import java.net.URL; +import java.net.URI; import java.util.Optional; /** @@ -32,7 +31,7 @@ */ public final class CrawlRequest implements Serializable { - private final URL requestUrl; + private final URI requestUrl; private final int priority; private final Serializable metadata; @@ -50,7 +49,7 @@ private CrawlRequest(final CrawlRequestBuilder builder) { * * @return The URL of the request */ - public URL getRequestUrl() { + public URI getRequestUrl() { return requestUrl; } @@ -85,7 +84,7 @@ public static final class CrawlRequestBuilder { private static final int DEFAULT_PRIORITY = 0; - private final URL requestUrl; + private final URI requestUrl; private final InternetDomainName domain; private int priority; @@ -98,7 +97,7 @@ public static final class CrawlRequestBuilder { * @param requestUrl The request's URL given as a URL * instance */ - public CrawlRequestBuilder(final URL requestUrl) { + public CrawlRequestBuilder(final URI requestUrl) { this.requestUrl = requestUrl; // Extract the domain from the request URL @@ -116,7 +115,7 @@ public CrawlRequestBuilder(final URL requestUrl) { * instance */ public CrawlRequestBuilder(final String requestUrl) { - this(getUrlFromString(requestUrl)); + this(URI.create(requestUrl)); } /** @@ -151,22 +150,6 @@ public CrawlRequestBuilder setMetadata(final Serializable metadata) { public CrawlRequest build() { return new CrawlRequest(this); } - - /** - * Constructs a URL instance based on the specified URL - * string. Since call to this must be the first statement in a - * constructor, this method is necessary for the conversion to be made. 
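The helper removed here became unnecessary because URI.create, unlike the URL constructor, reports a malformed argument with an unchecked IllegalArgumentException, so it can be invoked directly in the this(...) constructor delegation. One behavioral difference worth noting (the inputs are illustrative):

    new CrawlRequestBuilder("http://exa mple.com"); // IllegalArgumentException from URI.create
    new CrawlRequestBuilder("example.com");         // accepted by URI.create as a relative URI;
                                                    // getHost() is then null, so the failure surfaces
                                                    // in the domain extraction instead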
- * - * @param requestUrl The request URL as String - * @return The URL instance - */ - private static URL getUrlFromString(final String requestUrl) { - try { - return new URL(requestUrl); - } catch (MalformedURLException ex) { - throw new IllegalArgumentException(String.format("The URL (\"%s\") is malformed.", requestUrl), ex); - } - } } private void readObject(final ObjectInputStream in) throws IOException, ClassNotFoundException { diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java index a7be956..442d493 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java @@ -16,7 +16,7 @@ package com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.internal.CallbackParameter; -import java.net.URL; +import java.net.URI; import org.openqa.selenium.WebDriver; /** @@ -59,7 +59,7 @@ public static final class HtmlResponseBuilder extends CallbackParameterBuilder { private HttpHeadResponse httpHeadResponse; private WebDriver webDriver; - public HtmlResponseBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { + public HtmlResponseBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { super(refererUrl, crawlDepth, crawlRequest); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java index c1f58bf..fc5e701 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java @@ -16,7 +16,7 @@ package com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.internal.CallbackParameter; -import java.net.URL; +import java.net.URI; /** * Represents a non-HTML response. @@ -46,7 +46,7 @@ public static final class NonHtmlResponseBuilder extends CallbackParameterBuilde private HttpHeadResponse httpHeadResponse; - public NonHtmlResponseBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { + public NonHtmlResponseBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { super(refererUrl, crawlDepth, crawlRequest); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java index 12c67cc..7d379d5 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java @@ -17,7 +17,7 @@ import com.github.peterbencze.serritor.internal.CallbackParameter; import java.io.IOException; -import java.net.URL; +import java.net.URI; /** * Represents an unsuccessful request. 
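Stepping back to the CrawlRequest changes above: the domain field is transient because Guava's InternetDomainName is not serializable, and the readObject hook re-derives it after default deserialization. A round-trip sketch using the Commons Lang utility already imported by the crawler:

    CrawlRequest original = new CrawlRequestBuilder("http://example.com").build();
    CrawlRequest copy = SerializationUtils.roundtrip(original);

    InternetDomainName domain = copy.getDomain(); // rebuilt from requestUrl.getHost(), not read from the stream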
@@ -48,7 +48,7 @@ public static final class UnsuccessfulRequestBu private IOException exception; - public UnsuccessfulRequestBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { + public UnsuccessfulRequestBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { super(refererUrl, crawlDepth, crawlRequest); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java index 9ca1d75..cb6ae0b 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java @@ -16,7 +16,7 @@ package com.github.peterbencze.serritor.internal; import com.github.peterbencze.serritor.api.CrawlRequest; -import java.net.URL; +import java.net.URI; import java.util.Optional; /** @@ -27,7 +27,7 @@ public abstract class CallbackParameter { private final int crawlDepth; - private final URL refererUrl; + private final URI refererUrl; private final CrawlRequest crawlRequest; protected CallbackParameter(final CallbackParameterBuilder builder) { @@ -41,7 +41,7 @@ protected CallbackParameter(final CallbackParameterBuilder builder) { * * @return The referer URL */ - public final Optional<URL> getRefererUrl() { + public final Optional<URI> getRefererUrl() { return Optional.ofNullable(refererUrl); } @@ -65,11 +65,11 @@ public final CrawlRequest getCrawlRequest() { public static abstract class CallbackParameterBuilder { - private final URL refererUrl; + private final URI refererUrl; private final int crawlDepth; private final CrawlRequest crawlRequest; - public CallbackParameterBuilder(final URL refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { + public CallbackParameterBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { this.refererUrl = refererUrl; this.crawlDepth = crawlDepth; this.crawlRequest = crawlRequest; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java index 7a4acbd..b5041b9 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java @@ -18,7 +18,7 @@ import com.github.peterbencze.serritor.api.CrawlRequest; import com.google.common.net.InternetDomainName; import java.io.Serializable; -import java.net.URL; +import java.net.URI; /** * Represents a candidate for crawling that will be surely processed by the @@ -28,7 +28,7 @@ */ public final class CrawlCandidate implements Serializable { - private final URL refererUrl; + private final URI refererUrl; private final int crawlDepth; private final CrawlRequest crawlRequest; @@ -43,7 +43,7 @@ public CrawlCandidate(final CrawlCandidateBuilder builder) { * * @return The URL of the referer */ - public URL getRefererUrl() { + public URI getRefererUrl() { return refererUrl; } @@ -52,7 +52,7 @@ public URL getRefererUrl() { * * @return The URL of the candidate */ - public URL getCandidateUrl() { + public URI getCandidateUrl() { return crawlRequest.getRequestUrl(); } @@ -96,14 +96,14 @@ public static final class CrawlCandidateBuilder { private final CrawlRequest crawlRequest; - private URL refererUrl; + private URI refererUrl; private int crawlDepth; public CrawlCandidateBuilder(final CrawlRequest request) {
crawlRequest = request; } - public CrawlCandidateBuilder setRefererUrl(final URL refererUrl) { + public CrawlCandidateBuilder setRefererUrl(final URI refererUrl) { this.refererUrl = refererUrl; return this; } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index c49f8e4..74b1b05 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -18,7 +18,7 @@ import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.internal.CrawlCandidate.CrawlCandidateBuilder; import java.io.Serializable; -import java.net.URL; +import java.net.URI; import java.util.Arrays; import java.util.Comparator; import java.util.HashSet; @@ -148,7 +148,7 @@ public CrawlCandidate getNextCandidate() { * @param url The URL that the fingerprint will be created for * @return The fingerprint of the URL */ - private static String createFingerprintForUrl(final URL url) { + private static String createFingerprintForUrl(final URI url) { // First, we start off with the host only StringBuilder truncatedUrl = new StringBuilder(url.getHost()); diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 74854bf..3789aae 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -19,8 +19,7 @@ import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; import com.github.peterbencze.serritor.api.CrawlStrategy; import com.google.common.net.InternetDomainName; -import java.net.MalformedURLException; -import java.net.URL; +import java.net.URI; import java.util.Arrays; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -36,12 +35,12 @@ public final class CrawlFrontierTest { // Allowed crawl domains - private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_0 = new CrawlDomain(InternetDomainName.from("root_url_0.com")); - private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_1 = new CrawlDomain(InternetDomainName.from("root_url_1.com")); + private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_0 = new CrawlDomain(InternetDomainName.from("root-url-0.com")); + private static final CrawlDomain ALLOWED_CRAWL_DOMAIN_1 = new CrawlDomain(InternetDomainName.from("root-url-1.com")); // Root URLs - private static final URL ROOT_URL_0; - private static final URL ROOT_URL_1; + private static final URI ROOT_URL_0 = URI.create("http://root-url-0.com"); + private static final URI ROOT_URL_1 = URI.create("http://root-url-1.com"); // Root URL crawl depth private static final int ROOT_URL_CRAWL_DEPTH = 0; @@ -51,13 +50,16 @@ public final class CrawlFrontierTest { private static final int ROOT_URL_1_PRIORITY = 1; // Root URL crawl requests - private static final CrawlRequest ROOT_URL_0_CRAWL_REQUEST; - private static final CrawlRequest ROOT_URL_1_CRAWL_REQUEST; + private static final CrawlRequest ROOT_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_0).setPriority(ROOT_URL_0_PRIORITY).build(); + private static final CrawlRequest ROOT_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_1).setPriority(ROOT_URL_1_PRIORITY).build(); + + // Child URL path + private static final String CHILD_URL_PATH = "/child"; // 
Child URLs - private static final URL CHILD_URL_0; - private static final URL CHILD_URL_1; - private static final URL CHILD_URL_2; + private static final URI CHILD_URL_0 = URI.create(String.format("http://root-url-0.com%s-0.html", CHILD_URL_PATH)); + private static final URI CHILD_URL_1 = URI.create(String.format("http://root-url-0.com%s-1.html", CHILD_URL_PATH)); + private static final URI CHILD_URL_2 = URI.create(String.format("http://root-url-1.com%s-0.html", CHILD_URL_PATH)); // Child URL crawl depth private static final int CHILD_URL_CRAWL_DEPTH = 1; @@ -68,69 +70,22 @@ public final class CrawlFrontierTest { private static final int CHILD_URL_2_PRIORITY = 1; // Child URL crawl requests - private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST; - private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST; - private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST; - - // Child URL path - private static final String CHILD_URL_PATH = "/child"; + private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_0).setPriority(CHILD_URL_0_PRIORITY).build(); + private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_1).setPriority(CHILD_URL_1_PRIORITY).build(); + private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_2).setPriority(CHILD_URL_2_PRIORITY).build(); // Offsite URL - private static final URL OFFSITE_URL; + private static final URI OFFSITE_URL = URI.create("http://offsite-url.com"); // Offsite URL priority private static final int OFFSITE_URL_PRIORITY = 0; // Offsite URL crawl request - private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST; + private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST = new CrawlRequestBuilder(OFFSITE_URL).setPriority(OFFSITE_URL_PRIORITY).build(); // Max crawl depth private static final int MAX_CRAWL_DEPTH = 1; - static { - try { - // Initialization of root URLs - ROOT_URL_0 = new URL("http://root_url_0.com"); - ROOT_URL_1 = new URL("http://root_url_1.com"); - - // Initialization of child URLs - CHILD_URL_0 = new URL(String.format("http://root_url_0.com%s_0.html", CHILD_URL_PATH)); - CHILD_URL_1 = new URL(String.format("http://root_url_0.com%s_1.html", CHILD_URL_PATH)); - - CHILD_URL_2 = new URL(String.format("http://root_url_1.com%s_0.html", CHILD_URL_PATH)); - - // Initialization of the offsite URL - OFFSITE_URL = new URL("http://offsite_url.com"); - } catch (MalformedURLException ex) { - throw new Error(ex); - } - - // Initialize crawl requests - ROOT_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_0) - .setPriority(ROOT_URL_0_PRIORITY) - .build(); - - ROOT_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_1) - .setPriority(ROOT_URL_1_PRIORITY) - .build(); - - CHILD_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_0) - .setPriority(CHILD_URL_0_PRIORITY) - .build(); - - CHILD_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_1) - .setPriority(CHILD_URL_1_PRIORITY) - .build(); - - CHILD_URL_2_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_2) - .setPriority(CHILD_URL_2_PRIORITY) - .build(); - - OFFSITE_URL_CRAWL_REQUEST = new CrawlRequestBuilder(OFFSITE_URL) - .setPriority(OFFSITE_URL_PRIORITY) - .build(); - } - private CrawlerConfiguration configuration; private CrawlFrontier frontier; From a14aacde935548f72bbaec3438757162dc50cf9e Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 17:48:37 +0100 Subject: [PATCH 19/24] Add URL validation --- .../serritor/api/helper/UrlFinder.java 
| 59 ++++++++++++++++--- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index 2bcbe83..a79b278 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -17,11 +17,14 @@ import com.github.peterbencze.serritor.api.HtmlResponse; import com.google.common.collect.Sets; +import com.google.common.net.InternetDomainName; +import java.net.URI; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -41,15 +44,17 @@ public final class UrlFinder { private final Set<Pattern> urlPatterns; private final Set<By> locatingMechanisms; private final Set<String> attributes; + private final Function<String, Boolean> validator; private UrlFinder(final UrlFinderBuilder builder) { urlPatterns = builder.urlPatterns; locatingMechanisms = builder.locatingMechanisms; attributes = builder.attributes; + validator = builder.validator; } /** - * Returns a list of (unvalidated) URLs found in the response's HTML source. + * Returns a list of validated URLs found in the response's HTML source. * * @param response The HtmlResponse instance * @return The list of found URLs */ @@ -78,7 +83,7 @@ public List<String> findUrlsInResponse(final HtmlResponse response) { } /** - * Returns a list of (unvalidated) URLs found in the attribute's value. + * Returns a list of validated URLs found in the attribute's value. * * @param attributeValue The value of the attribute * @return The list of found URLs */ @@ -92,7 +97,7 @@ private List<String> findUrlsInAttributeValue(final String attributeValue) { while (urlPatternMatcher.find()) { String foundUrl = urlPatternMatcher.group(); - if (StringUtils.isNotBlank(foundUrl)) { + if (validator.apply(foundUrl)) { foundUrls.add(foundUrl); } } @@ -107,6 +112,7 @@ public static final class UrlFinderBuilder { private Set<By> locatingMechanisms; private Set<String> attributes; + private Function<String, Boolean> validator; /** * Constructs a UrlFinderBuilder instance that can be used @@ -131,6 +137,7 @@ public UrlFinderBuilder(final List<Pattern> urlPatterns) { this.urlPatterns = Sets.newHashSet(urlPatterns); locatingMechanisms = Sets.newHashSet(By.tagName("a")); attributes = Sets.newHashSet("href"); + validator = this::isValidUrl; } /** @@ -139,9 +146,10 @@ public UrlFinderBuilder(final List<Pattern> urlPatterns) { * * @param locatingMechanism The By locating mechanism * instance + * @return The UrlFinderBuilder instance */ - public void setLocatingMechanism(final By locatingMechanism) { - setLocatingMechanisms(Arrays.asList(locatingMechanism)); + public UrlFinderBuilder setLocatingMechanism(final By locatingMechanism) { + return setLocatingMechanisms(Arrays.asList(locatingMechanism)); } /** @@ -150,31 +158,49 @@ public void setLocatingMechanism(final By locatingMechanism) { * * @param locatingMechanisms The list of By locating * mechanism instances + * @return The UrlFinderBuilder instance */ - public void setLocatingMechanisms(final List<By> locatingMechanisms) { + public UrlFinderBuilder setLocatingMechanisms(final List<By> locatingMechanisms) { Validate.noNullElements(locatingMechanisms, "Locating mechanisms cannot be null."); this.locatingMechanisms = Sets.newHashSet(locatingMechanisms); + return this; } /** * Sets which attributes to
search for URLs. * * @param attributes The list of attribute names + * @return The UrlFinderBuilder instance */ - public void setAttributes(final List<String> attributes) { + public UrlFinderBuilder setAttributes(final List<String> attributes) { Validate.noNullElements(attributes, "Attributes cannot be null."); this.attributes = Sets.newHashSet(attributes); + return this; } /** * Sets which attribute to search for URLs. * * @param attribute The name of the attribute + * @return The UrlFinderBuilder instance */ - public void setAttribute(final String attribute) { - setAttributes(Arrays.asList(attribute)); + public UrlFinderBuilder setAttribute(final String attribute) { + return setAttributes(Arrays.asList(attribute)); + } + + /** + * Sets a function to be used for validating found URLs. + * + * @param validator The validator function + * @return The UrlFinderBuilder instance + */ + public UrlFinderBuilder setValidator(final Function<String, Boolean> validator) { + Validate.notNull(validator, "The validator function cannot be null."); + + this.validator = validator; + return this; } /** @@ -185,5 +211,20 @@ public void setAttribute(final String attribute) { public UrlFinder build() { return new UrlFinder(this); } + + /** + * The default URL validator function. + * + * @param url The URL to be validated + * @return true if the URL is valid, false + * otherwise + */ + private boolean isValidUrl(final String url) { + try { + return InternetDomainName.isValid(URI.create(url).getHost()); + } catch (IllegalArgumentException e) { + return false; + } + } } } From 4903726617ecbda924bbfe7b8f94b97a91f71fcc Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 18:16:04 +0100 Subject: [PATCH 20/24] Update README --- README.md | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 185dfba..40db8dc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Add the following dependency to your pom.xml: <dependency> <groupId>com.github.peterbencze</groupId> <artifactId>serritor</artifactId> - <version>1.2.1</version> + <version>1.3.0</version> </dependency> ``` @@ -27,24 +27,31 @@ BaseCrawler provides a skeletal implementation of a crawler to minimize the effo ```java public class MyCrawler extends BaseCrawler { + private final UrlFinder urlFinder; + public MyCrawler() { // Enable offsite request filtering - config.setOffsiteRequestFiltering(true); + configurator.setOffsiteRequestFiltering(true); + + // Specify the allowed crawl domain + configurator.addAllowedCrawlDomain("example.com"); // Add a crawl seed, this is where the crawling starts CrawlRequest request = new CrawlRequestBuilder("http://example.com").build(); - config.addCrawlSeed(request); + configurator.addCrawlSeed(request); + + // Extract URLs from links on the crawled page + urlFinder = new UrlFinderBuilder(Pattern.compile(".+")).build(); } @Override protected void onResponseComplete(final HtmlResponse response) { - // Crawl every link that can be found on the page - response.getWebDriver().findElements(By.tagName("a")) + // Crawl every URL that matches the given pattern + urlFinder.findUrlsInResponse(response) .stream() - .forEach((WebElement link) -> { - CrawlRequest request = new CrawlRequestBuilder(link.getAttribute("href")).build(); - crawl(request); - }); + .map(CrawlRequestBuilder::new) + .map(CrawlRequestBuilder::build) + .forEach(this::crawl); } @Override @@ -58,7 +65,7 @@ public class MyCrawler extends BaseCrawler { } } ``` -That's it! In just a few lines you can make a crawler that extracts and crawls every URL it finds, while filtering duplicate and offsite requests.
You also get access to the WebDriver, so you can use all the features that are provided by Selenium. +That's it! In just a few lines you can make a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the WebDriver, so you can use all the features that are provided by Selenium. By default, the crawler uses [HtmlUnit headless browser](http://htmlunit.sourceforge.net/): ```java From aee21535a46242982a7fb9a0fbb20f4dbb2ffd90 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 19:17:47 +0100 Subject: [PATCH 21/24] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 40db8dc..4be6493 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ public class MyCrawler extends BaseCrawler { } } ``` -That's it! In just a few lines you can make a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the WebDriver, so you can use all the features that are provided by Selenium. +That's it! In just a few lines you can make a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the WebDriver instance, so you can use all the features that are provided by Selenium. By default, the crawler uses [HtmlUnit headless browser](http://htmlunit.sourceforge.net/): ```java From 018b1cda36a8ecddadbf1ecb6191071013d455fb Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 22:24:24 +0100 Subject: [PATCH 22/24] Refactor --- .../peterbencze/serritor/api/helper/UrlFinder.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index a79b278..24ca816 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -95,7 +95,7 @@ private List<String> findUrlsInAttributeValue(final String attributeValue) { .map((Pattern urlPattern) -> urlPattern.matcher(attributeValue)) .forEach((Matcher urlPatternMatcher) -> { while (urlPatternMatcher.find()) { - String foundUrl = urlPatternMatcher.group(); + String foundUrl = urlPatternMatcher.group().trim(); if (validator.apply(foundUrl)) { foundUrls.add(foundUrl); @@ -107,6 +107,10 @@ private List<String> findUrlsInAttributeValue(final String attributeValue) { } public static final class UrlFinderBuilder { + + private static final Set<By> DEFAULT_LOCATING_MECHANISMS = Sets.newHashSet(By.tagName("a")); + private static final Set<String> DEFAULT_ATTRIBUTES = Sets.newHashSet("href"); + private static final Function<String, Boolean> DEFAULT_VALIDATOR = UrlFinderBuilder::isValidUrl; private final Set<Pattern> urlPatterns; @@ -135,9 +139,9 @@ public UrlFinderBuilder(final List<Pattern> urlPatterns) { Validate.noNullElements(urlPatterns, "URL patterns cannot be null."); this.urlPatterns = Sets.newHashSet(urlPatterns); - locatingMechanisms = Sets.newHashSet(By.tagName("a")); - attributes = Sets.newHashSet("href"); - validator = this::isValidUrl; + locatingMechanisms = DEFAULT_LOCATING_MECHANISMS; + attributes = DEFAULT_ATTRIBUTES; + validator = DEFAULT_VALIDATOR; } /** @@ -219,7 +223,7 @@ public UrlFinder build() { * @return true if the URL is valid, false * otherwise */ - private boolean isValidUrl(final String url) { + private static boolean isValidUrl(final String url) { try { return
InternetDomainName.isValid(URI.create(url).getHost()); } catch (IllegalArgumentException e) { return false; From c99848d0236757f76de39cbe18a659c021b6c649 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 22:29:16 +0100 Subject: [PATCH 23/24] Update dependency versions --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index f87d10a..cfe92f6 100644 --- a/pom.xml +++ b/pom.xml @@ -54,12 +54,12 @@ <dependency> <groupId>org.seleniumhq.selenium</groupId> <artifactId>selenium-java</artifactId> - <version>3.9.1</version> + <version>3.11.0</version> </dependency> <dependency> <groupId>org.seleniumhq.selenium</groupId> <artifactId>htmlunit-driver</artifactId> - <version>2.29.1</version> + <version>2.29.2</version> </dependency> <dependency> <groupId>com.google.guava</groupId> From e58c1d2d65e18fd2d4a3502a931ae6115d52424f Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 16 Mar 2018 23:55:25 +0100 Subject: [PATCH 24/24] Add test for UrlFinder --- pom.xml | 6 ++ .../serritor/api/helper/UrlFinderTest.java | 84 +++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java diff --git a/pom.xml b/pom.xml index cfe92f6..8747531 100644 --- a/pom.xml +++ b/pom.xml @@ -72,6 +72,12 @@ <version>4.12</version> <scope>test</scope> </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-core</artifactId> + <version>2.16.0</version> + <scope>test</scope> + </dependency> diff --git a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java new file mode 100644 index 0000000..f89c0fd --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java @@ -0,0 +1,84 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.api.helper; + +import com.github.peterbencze.serritor.api.HtmlResponse; +import com.github.peterbencze.serritor.api.HtmlResponse.HtmlResponseBuilder; +import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +/** + * Test cases for UrlFinder.
* * @author Peter Bencze */ +public class UrlFinderTest { + + private static final Pattern URL_PATTERN = Pattern.compile(".+valid-url.+"); + private static final String ATTRIBUTE = "href"; + private static final String TAG_NAME = "a"; + private static final String VALID_URL = "http://valid-url.com"; + private static final String INVALID_URL = "invalid-url"; + private static final String URL_WITH_INVALID_DOMAIN = "http://invalid.domain"; + + private UrlFinder urlFinder; + private HtmlResponse mockResponse; + private WebDriver mockDriver; + private WebElement mockElementWithValidUrl; + private WebElement mockElementWithInvalidUrlFormat; + private WebElement mockElementWithInvalidDomain; + + @Before + public void initialize() { + urlFinder = new UrlFinderBuilder(URL_PATTERN).build(); + + // Create mocks + mockDriver = Mockito.mock(WebDriver.class); + + // Cannot mock because of the final modifier + mockResponse = new HtmlResponseBuilder(null, 0, null).setWebDriver(mockDriver).build(); + + mockElementWithValidUrl = Mockito.mock(WebElement.class); + Mockito.when(mockElementWithValidUrl.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(VALID_URL); + + mockElementWithInvalidUrlFormat = Mockito.mock(WebElement.class); + Mockito.when(mockElementWithInvalidUrlFormat.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(INVALID_URL); + + mockElementWithInvalidDomain = Mockito.mock(WebElement.class); + Mockito.when(mockElementWithInvalidDomain.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(URL_WITH_INVALID_DOMAIN); + + List<WebElement> elementList = Arrays.asList(mockElementWithValidUrl, mockElementWithInvalidUrlFormat, mockElementWithInvalidDomain); + Mockito.when(mockDriver.findElements(By.tagName(TAG_NAME))) .thenReturn(elementList); + } + + @Test + public void findUrlsInResponseTest() { + Assert.assertEquals(Arrays.asList(VALID_URL), urlFinder.findUrlsInResponse(mockResponse)); + } +}
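A brief illustration of the API that emerges from patches 19 and 22: UrlFinderBuilder now exposes fluent setters and a pluggable validator alongside the default domain-based isValidUrl check. The sketch below shows how a caller might supply a custom validator; the HTTPS-only rule and the example class name are illustrative assumptions, not code from the patches above.

```java
import com.github.peterbencze.serritor.api.helper.UrlFinder;
import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder;
import java.net.URI;
import java.util.regex.Pattern;

public final class HttpsOnlyUrlFinderExample {

    public static UrlFinder createFinder() {
        // Match anything that looks like an absolute HTTP(S) URL;
        // the validator below decides what is actually kept
        return new UrlFinderBuilder(Pattern.compile("https?://\\S+"))
                // Hypothetical stricter rule: accept HTTPS URLs only
                .setValidator(url -> {
                    try {
                        return "https".equals(URI.create(url).getScheme());
                    } catch (IllegalArgumentException e) {
                        // URI.create rejects syntactically invalid URLs,
                        // mirroring the default validator's catch clause
                        return false;
                    }
                })
                .build();
    }
}
```

Because the setters return the builder, the whole configuration chains in one expression, and the strings returned by findUrlsInResponse can be mapped through CrawlRequestBuilder and passed back to crawl, exactly as the updated README example does.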