88: Add per-host config for bypassing robots file #91

Open · wants to merge 1 commit into master
@@ -26,6 +26,8 @@
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

+ import fr.eolya.utils.ConfigHelper;
+
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
@@ -238,7 +240,7 @@ public int processItem(Map<String,Object> itemData, long threadId) {
/*
* Ignore this page if robots.txt asks for it
*/
if (robots!=null && !"1".equals(config.getProperty("/crawler/param[@name='bypass_robots_file']", "0")) && !robots.isUrlAllowed(pageURL)) return 0;
if (robots!=null && !ConfigHelper.isBypassRobotsFile(config, pageURL) && !robots.isUrlAllowed(pageURL)) return 0;

int maxCrawlDepth = src.getDepth();
if (maxCrawlDepth==0) maxCrawlDepth = Integer.parseInt(config.getProperty("/crawler/param[@name='max_depth']", "2"));
@@ -1398,7 +1400,7 @@ private boolean isAccepetedUrl (String strLink, String normalizedStartUrl, List<
*/

// Filter the URL against the robots.txt exclusion rules
- if (robots!=null && !isStartingUrl(strLink) && !"1".equals(config.getProperty("/crawler/param[@name='bypass_robots_file']", "0")) && !robots.isUrlAllowed(urlLink))
+ if (robots!=null && !isStartingUrl(strLink) && !ConfigHelper.isBypassRobotsFile(config, pageURL) && !robots.isUrlAllowed(urlLink))
{
logger.log("[" + String.valueOf(threadId) + "] " + strLink + " rejected due to robots.txt exclusion rules");
if (depth<=memlogMaxDepth) src.memLogAppend(" " + strLink + " rejected due to robots.txt exclusion rules");
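Both call sites now delegate the bypass decision to the new ConfigHelper (shown in full below), so robots.txt can be ignored for individual hosts rather than only globally. A minimal sketch of the consolidated guard, assuming only what the diff shows — the class, method, and boolean stand-ins below are illustrative and not part of this PR:

import java.net.URL;

import fr.eolya.utils.ConfigHelper;
import fr.eolya.utils.XMLConfig;

// Illustrative only: mirrors the guard used at both call sites above; the
// crawler's robots object is not part of this diff, so booleans stand in for it.
public final class RobotsGuardSketch {

    public static boolean rejectedByRobots(XMLConfig config, URL url,
                                           boolean robotsLoaded, boolean robotsAllowsUrl) {
        // Reject only when robots rules exist, no bypass (per-host for this
        // URL's host) is configured, and robots.txt disallows the URL.
        return robotsLoaded
                && !ConfigHelper.isBypassRobotsFile(config, url)
                && !robotsAllowsUrl;
    }
}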
38 changes: 38 additions & 0 deletions java/utils/src/main/java/fr/eolya/utils/ConfigHelper.java
@@ -0,0 +1,38 @@
package fr.eolya.utils;

import java.net.URL;
import java.util.Objects;

import fr.eolya.utils.XMLConfig;

public final class ConfigHelper {

    private ConfigHelper() {
    }

    /**
     * Checks whether the crawler should bypass the robots.txt file.<br>
     * The base setting is the bypass_robots_file param, with per-host overrides nested in a dedicated element:
     * <p>
     * <pre>
     * &lt;bypassRobotsFile&gt;
     *     &lt;param name=&quot;overridden.host.com&quot;&gt;1&lt;/param&gt;
     * &lt;/bypassRobotsFile&gt;
     * </pre>
     *
     * @param config the crawler XML configuration, must not be null
     * @param url optional; if null, no per-host override is applied and the global param is used
     * @return true if robots.txt should be bypassed
     */
    public static boolean isBypassRobotsFile(XMLConfig config, URL url) {
        Objects.requireNonNull(config, "Parameter config is missing");

        if (url != null) {
            String perHostBypassValue = config.getProperty("/crawler/bypassRobotsFile/param[@name='" + url.getHost() + "']");
            return "1".equals(perHostBypassValue);
        }

        String globalBypassValue = config.getProperty("/crawler/param[@name='bypass_robots_file']", "0");
        return "1".equals(globalBypassValue);
    }
}
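For reference, a hedged usage sketch against a configuration shaped like the test resource added below; the file path, host name, and the XMLConfig.loadFile / ConfigHelper calls are taken from this PR, while the wrapper class and main method are illustrative only:

import java.io.IOException;
import java.net.URL;

import fr.eolya.utils.ConfigHelper;
import fr.eolya.utils.XMLConfig;

public class BypassRobotsUsageSketch {
    public static void main(String[] args) throws IOException {
        XMLConfig config = new XMLConfig();
        config.loadFile("src/test/resources/bypass-robots-crawler-config.xml");

        // Per-host check: true only when <bypassRobotsFile> lists this host with value "1".
        boolean perHost = ConfigHelper.isBypassRobotsFile(
                config, new URL("http://bypass-robots-for-this.com/context"));   // true

        // No URL given: falls back to the global bypass_robots_file param ("0" in the test resource).
        boolean global = ConfigHelper.isBypassRobotsFile(config, null);          // false

        System.out.println("per-host=" + perHost + " global=" + global);
    }
}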
43 changes: 43 additions & 0 deletions java/utils/src/test/java/fr/eolya/utils/ConfigHelperTest.java
@@ -0,0 +1,43 @@
package fr.eolya.utils;

import java.io.IOException;
import java.net.URL;

import fr.eolya.utils.ConfigHelper;
import org.junit.Assert;
import org.junit.Test;

import fr.eolya.utils.XMLConfig;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

public class ConfigHelperTest {

    @Test
    public void isBypassRobotsFile_noOverride_false() throws IOException {
        // given
        XMLConfig config = new XMLConfig();
        config.loadFile("src/test/resources/bypass-robots-crawler-config.xml");

        // when
        boolean bypass = ConfigHelper.isBypassRobotsFile(config, new URL("http://any-url.com"));

        // then
        assertFalse(bypass);
    }

    @Test
    public void isBypassRobotsFile_override_true() throws IOException {
        // given
        XMLConfig config = new XMLConfig();
        config.loadFile("src/test/resources/bypass-robots-crawler-config.xml");

        // when
        boolean bypass = ConfigHelper.isBypassRobotsFile(config, new URL("http://bypass-robots-for-this.com/context"));

        // then
        assertTrue(bypass);
    }
}
11 changes: 11 additions & 0 deletions java/utils/src/test/resources/bypass-robots-crawler-config.xml
@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<crawler>

    <param name="bypass_robots_file">0</param>

    <bypassRobotsFile>
        <param name="bypass-robots-for-this.com">1</param>
    </bypassRobotsFile>

</crawler>
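Note on the semantics encoded here: whenever a URL is passed, isBypassRobotsFile consults only the per-host bypassRobotsFile override, so with this resource it returns true for URLs on bypass-robots-for-this.com and false for every other host, regardless of the global bypass_robots_file value; the global param applies only when the URL argument is null.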