From c04041c463abb4edec9ec48e3675f84948b0d9e3 Mon Sep 17 00:00:00 2001
From: grimsa
Date: Sat, 1 Jul 2017 20:58:02 +0300
Subject: [PATCH] 88: Add per-host config for bypassing robots file

---
 .../crawler/connectors/web/WebConnector.java  |  6 ++-
 .../java/fr/eolya/utils/ConfigHelper.java     | 38 ++++++++++++++++
 .../java/fr/eolya/utils/ConfigHelperTest.java | 43 +++++++++++++++++++
 .../bypass-robots-crawler-config.xml          | 11 +++++
 4 files changed, 96 insertions(+), 2 deletions(-)
 create mode 100644 java/utils/src/main/java/fr/eolya/utils/ConfigHelper.java
 create mode 100644 java/utils/src/test/java/fr/eolya/utils/ConfigHelperTest.java
 create mode 100644 java/utils/src/test/resources/bypass-robots-crawler-config.xml

diff --git a/java/crawler/src/main/java/fr/eolya/crawler/connectors/web/WebConnector.java b/java/crawler/src/main/java/fr/eolya/crawler/connectors/web/WebConnector.java
index f236e88..ec3a394 100644
--- a/java/crawler/src/main/java/fr/eolya/crawler/connectors/web/WebConnector.java
+++ b/java/crawler/src/main/java/fr/eolya/crawler/connectors/web/WebConnector.java
@@ -26,6 +26,8 @@
 import org.dom4j.Node;
 import org.dom4j.io.SAXReader;
 
+import fr.eolya.utils.ConfigHelper;
+
 import crawlercommons.sitemaps.AbstractSiteMap;
 import crawlercommons.sitemaps.SiteMap;
 import crawlercommons.sitemaps.SiteMapIndex;
@@ -238,7 +240,7 @@ public int processItem(Map itemData, long threadId) {
         /*
          * Ignore this page if robots.txt ask it
          */
-        if (robots!=null && !"1".equals(config.getProperty("/crawler/param[@name='bypass_robots_file']", "0")) && !robots.isUrlAllowed(pageURL)) return 0;
+        if (robots!=null && !ConfigHelper.isBypassRobotsFile(config, pageURL) && !robots.isUrlAllowed(pageURL)) return 0;
 
         int maxCrawlDepth = src.getDepth();
         if (maxCrawlDepth==0) maxCrawlDepth = Integer.parseInt(config.getProperty("/crawler/param[@name='max_depth']", "2"));
@@ -1398,7 +1400,7 @@ private boolean isAccepetedUrl (String strLink, String normalizedStartUrl, List<
          */
 
         // Filtre l'url par rapport aux règles du fichier robots.txt
-        if (robots!=null && !isStartingUrl(strLink) && !"1".equals(config.getProperty("/crawler/param[@name='bypass_robots_file']", "0")) && !robots.isUrlAllowed(urlLink))
+        if (robots!=null && !isStartingUrl(strLink) && !ConfigHelper.isBypassRobotsFile(config, pageURL) && !robots.isUrlAllowed(urlLink))
         {
             logger.log("[" + String.valueOf(threadId) + "] " + strLink + " rejected due to robots.txt exclusion rules");
             if (depth<=memlogMaxDepth) src.memLogAppend(" " + strLink + " rejected due to robots.txt exclusion rules");
diff --git a/java/utils/src/main/java/fr/eolya/utils/ConfigHelper.java b/java/utils/src/main/java/fr/eolya/utils/ConfigHelper.java
new file mode 100644
index 0000000..f67af33
--- /dev/null
+++ b/java/utils/src/main/java/fr/eolya/utils/ConfigHelper.java
@@ -0,0 +1,38 @@
+package fr.eolya.utils;
+
+import java.net.URL;
+import java.util.Objects;
+
+import fr.eolya.utils.XMLConfig;
+
+public final class ConfigHelper {
+
+    private ConfigHelper() {
+    }
+
+    /**
+     * Checks if the robots.txt file should be bypassed by the crawler.<br/>
+     * The base setting is the bypass_robots_file param, with overrides nested in a special element:<br/>
+     * <br/>
+     * <pre>
+     * <bypassRobotsFile>
+     *   <param name="overriden.host.com">1</param>
+     * </bypassRobotsFile>
+     * </pre>
+     *
+     * @param config
+     * @param url optional parameter. If null, no overrides are applied
+     * @return
+     */
+    public static boolean isBypassRobotsFile(XMLConfig config, URL url) {
+        Objects.requireNonNull(config, "Parameter config is missing");
+
+        if (url != null) {
+            String perHostBypassValue = config.getProperty("/crawler/bypassRobotsFile/param[@name='" + url.getHost() + "']");
+            return "1".equals(perHostBypassValue);
+        }
+
+        String globalBypassValue = config.getProperty("/crawler/param[@name='bypass_robots_file']", "0");
+        return "1".equals(globalBypassValue);
+    }
+}
diff --git a/java/utils/src/test/java/fr/eolya/utils/ConfigHelperTest.java b/java/utils/src/test/java/fr/eolya/utils/ConfigHelperTest.java
new file mode 100644
index 0000000..0dd12f5
--- /dev/null
+++ b/java/utils/src/test/java/fr/eolya/utils/ConfigHelperTest.java
@@ -0,0 +1,43 @@
+package fr.eolya.utils;
+
+import java.io.IOException;
+import java.net.URL;
+
+import fr.eolya.utils.ConfigHelper;
+import org.junit.Assert;
+import org.junit.Test;
+
+import fr.eolya.utils.XMLConfig;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class ConfigHelperTest {
+
+    @Test
+    public void isBypassRobotsFile_noOverride_false() throws IOException {
+        // given
+        XMLConfig config = new XMLConfig();
+        config.loadFile("src/test/resources/bypass-robots-crawler-config.xml");
+
+        // when
+        boolean bypass = ConfigHelper.isBypassRobotsFile(config, new URL("http://any-url.com"));
+
+        // then
+        assertFalse(bypass);
+    }
+
+    @Test
+    public void isBypassRobotsFile_override_true() throws IOException {
+        // given
+        XMLConfig config = new XMLConfig();
+        config.loadFile("src/test/resources/bypass-robots-crawler-config.xml");
+
+        // when
+        boolean bypass = ConfigHelper.isBypassRobotsFile(config, new URL("http://bypass-robots-for-this.com/context"));
+
+        // then
+        assertTrue(bypass);
+    }
+}
diff --git a/java/utils/src/test/resources/bypass-robots-crawler-config.xml b/java/utils/src/test/resources/bypass-robots-crawler-config.xml
new file mode 100644
index 0000000..e093a7a
--- /dev/null
+++ b/java/utils/src/test/resources/bypass-robots-crawler-config.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<crawler>
+    <param name="bypass_robots_file">0</param>
+
+    <bypassRobotsFile>
+        <param name="bypass-robots-for-this.com">1</param>
+    </bypassRobotsFile>
+
+</crawler>
+
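Usage sketch (not part of the patch itself): the snippet below illustrates how the new helper is expected to resolve the global bypass_robots_file param against the per-host <bypassRobotsFile> override, using the test resource added by this patch. ConfigHelper, XMLConfig, the config file path and the host names come from the diff above; the example class and the sample URLs are illustrative assumptions.

package fr.eolya.utils;

import java.io.IOException;
import java.net.URL;

// Illustrative example only (hypothetical class, not in the patch). Assumes it lives in the
// same package as ConfigHelper/XMLConfig and is run from the java/utils module directory.
public class BypassRobotsFileExample {

    public static void main(String[] args) throws IOException {
        // Config with global bypass_robots_file=0 and a per-host override for
        // bypass-robots-for-this.com (the test resource shipped with this patch).
        XMLConfig config = new XMLConfig();
        config.loadFile("src/test/resources/bypass-robots-crawler-config.xml");

        // Host without an override: robots.txt is not bypassed (prints false)
        System.out.println(ConfigHelper.isBypassRobotsFile(config, new URL("http://any-url.com/page")));

        // Host listed under <bypassRobotsFile>: robots.txt is bypassed (prints true)
        System.out.println(ConfigHelper.isBypassRobotsFile(config, new URL("http://bypass-robots-for-this.com/page")));

        // No URL supplied: only the global bypass_robots_file param is consulted (prints false here)
        System.out.println(ConfigHelper.isBypassRobotsFile(config, null));
    }
}

Note that when a URL is supplied, isBypassRobotsFile consults only the per-host override; the global bypass_robots_file param is read only when the URL is null.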