Fix #177 by adding recent TLDs list to URL validator utils

aecio committed Apr 2, 2019
1 parent ba95ee1 commit f8c3495
Showing 7 changed files with 267 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -17,6 +17,7 @@
- Upgrade `crawler-commons` library to version 1.0
- Upgrade `commons-validator` library to version 1.6
- Upgrade `okhttp3` library to version 3.14.0
- Fix issue #177: Links from recent TLDs are considered invalid

## Version 0.11.0

5 changes: 2 additions & 3 deletions src/main/java/focusedCrawler/seedfinder/BingSearch.java
@@ -1,12 +1,12 @@
package focusedCrawler.seedfinder;

import focusedCrawler.util.Urls;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.validator.routines.UrlValidator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -23,7 +23,6 @@ public class BingSearch implements SearchEngineApi {
private final SimpleHttpFetcher fetcher;

private int docsPerPage = 10;
private UrlValidator urlValidator = new UrlValidator();
private TimeDelay timer = new TimeDelay(5000);


@@ -52,7 +51,7 @@ public List<BackLinkNeighborhood> submitQuery(String query, int page) throws IOE
List<BackLinkNeighborhood> links = new ArrayList<>();
for (Element link : linksUrl) {
String linkStr = link.attr("href");
if(urlValidator.isValid(linkStr)) {
if(Urls.isValid(linkStr)) {
BackLinkNeighborhood bl = new BackLinkNeighborhood();
bl.setLink(linkStr);
bl.setTitle(link.text());
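Why this change matters: a locally constructed new UrlValidator() consults only the TLD tables bundled with commons-validator, so links under TLDs added to the root zone after the library's release (such as .africa, which appears in the generated lists below) are rejected. The following standalone sketch is not part of this commit; the class name and URL are illustrative only, and the behavior described is that of commons-validator 1.6 without any override installed.

import org.apache.commons.validator.routines.UrlValidator;

public class DefaultValidatorDemo {
    public static void main(String[] args) {
        // With no TLD override installed, the tables bundled with commons-validator 1.6
        // do not yet contain "africa", so this prints "false".
        UrlValidator defaultValidator = new UrlValidator();
        System.out.println(defaultValidator.isValid("http://registry.africa"));
    }
}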
144 changes: 144 additions & 0 deletions src/main/java/focusedCrawler/tools/GenerateTLDLists.java
@@ -0,0 +1,144 @@
package focusedCrawler.tools;

import focusedCrawler.crawler.async.fetcher.OkHttpFetcher;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.crawler.crawlercommons.fetcher.http.UserAgent;
import focusedCrawler.crawler.crawlercommons.fetcher.http.UserAgent.Builder;
import focusedCrawler.util.CliTool;
import io.airlift.airline.Command;
import java.net.IDN;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringJoiner;
import java.util.TreeSet;
import org.apache.commons.validator.routines.DomainValidator;
import org.apache.commons.validator.routines.DomainValidator.ArrayType;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

@Command(name = "GenerateTLDLists", description = "Prints recent TLDs from the IANA root zone database that are missing from commons-validator")
public class GenerateTLDLists extends CliTool {

public static void main(String[] args) throws Exception {
CliTool.run(args, new GenerateTLDLists());
}

@Override
public void execute() throws Exception {

UserAgent userAgent = new Builder().setAgentName("ACHE").build();
OkHttpFetcher fetcher = new OkHttpFetcher(userAgent);
fetcher.setDefaultMaxContentSize(10 * 1024 * 1024);
FetchedResult result = fetcher.get("https://www.iana.org/domains/root/db");
String html = new String(result.getContent());

Map<String, TreeSet<String>> tlds = extractTLDsFromHTML(result, html);

checkCategoryExists(tlds, "country-code");
checkCategoryExists(tlds, "infrastructure");
checkCategoryExists(tlds, "test");
checkCategoryExists(tlds, "sponsored");
checkCategoryExists(tlds, "generic");

System.out.println();

System.out.println("// Recent Country Code TLDs absent in commons-validator");
printMissing("recentCountryCodeTLDs", tlds.get("country-code"), ArrayType.COUNTRY_CODE_RO);

System.out.println("// Recent Generic TLDs absent in commons-validator");
printMissing("recentGenericTLDs", tlds.get("generic"), ArrayType.GENERIC_RO);

System.out.println("// Recent Sponsored TLDs absent in commons-validator");
// sponsored category is stored in array type GENERIC_RO in commons-validator
printMissing("recentSponsoredTLDs", tlds.get("sponsored"), ArrayType.GENERIC_RO);
}

private Map<String, TreeSet<String>> extractTLDsFromHTML(FetchedResult result, String html) {
Document dom = Jsoup.parse(html, result.getFetchedUrl());

Map<String, TreeSet<String>> tlds = new HashMap<>();
Elements tr = dom.select("#tld-table tbody > tr");
for (Element element : tr) {
Elements children = element.children();
if (children.size() != 3) {
System.err.println(
"WARN: Found a table row (tr) that does not have 3 children. The HTML template may have changed.");
continue;
}
String tld = children.get(0).text();
String type = children.get(1).text();

tld = normalizeTld(tld);
if (tld == null) {
continue;
}

TreeSet<String> tldList = tlds.get(type);
if (tldList == null) {
tldList = new TreeSet<>();
tlds.put(type, tldList);
}

tldList.add(tld);
}

System.out.println("Found TLDs per category:");
for (Map.Entry<String, TreeSet<String>> entry : tlds.entrySet()) {
String type = entry.getKey();
System.out.print(type + ": ");
System.out.println(entry.getValue().size());
}
return tlds;
}

private String normalizeTld(final String tld) {
int lastChar = tld.length() - 1;
if (!(
(tld.charAt(0) == '.') ||
(tld.charAt(0) == '\u200F' && tld.charAt(1) == '.'
&& tld.charAt(lastChar) == '\u200E')
)
) {
System.err.printf("WARN: Found a TLD without leading dot: [%s]."
+ " The HTML template may have changed.\n", tld);
}
String normalized = null;
if (tld.charAt(0) == '\u200F' && tld.charAt(1) == '.' && tld.charAt(lastChar) == '\u200E') {
normalized = tld.substring(2, tld.length() - 1);
} else if (tld.charAt(0) == '.') {
normalized = tld.substring(1);
}
if (normalized == null) {
// Unrecognized TLD format (already reported above), skip it
return null;
}
try {
normalized = IDN.toASCII(normalized);
} catch (Exception e) {
System.err.printf("WARN: Failed to convert normalized string [%s]"
+ " from TLD [%s] to punnycode.\n", normalized, tld);
return null;
}
return normalized;
}

private void printMissing(String variableName, TreeSet<String> tlds, ArrayType tldType) {
List<String> existingTlds = Arrays.asList(DomainValidator.getTLDEntries(tldType));
StringJoiner str = new StringJoiner(",");
for (String tld : tlds) {
if (!existingTlds.contains(tld)) {
str.add("\n\"" + tld + "\"");
}
}
System.out.printf("String[] %s = new String[]{%s};\n\n", variableName, str.toString());
}

private void checkCategoryExists(Map<String, TreeSet<String>> tlds, String tldCategory) {
if (!tlds.containsKey(tldCategory)) {
System.out.println("WARN: TLD category not found: " + tldCategory
+ ". Site template may have changed.");
}
}

}
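Two library calls do most of the work above: DomainValidator.getTLDEntries(ArrayType) reads the TLD tables known to commons-validator, and java.net.IDN.toASCII converts internationalized TLDs to their Punycode form. A minimal standalone sketch of the conversion step follows; it is not part of this commit, and the sample TLD is simply one of those that ends up in the generated country-code list.

import java.net.IDN;

public class PunycodeDemo {
    public static void main(String[] args) {
        // ".бг" is Bulgaria's internationalized ccTLD; its Punycode (ASCII) form is "xn--90ae",
        // which appears in the recentCountryCodeTLDs array generated by this tool.
        System.out.println(IDN.toASCII("бг"));
    }
}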
78 changes: 72 additions & 6 deletions src/main/java/focusedCrawler/util/Urls.java
@@ -3,11 +3,13 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.apache.commons.validator.routines.DomainValidator;
import org.apache.commons.validator.routines.DomainValidator.ArrayType;
import org.apache.commons.validator.routines.UrlValidator;

import com.fasterxml.jackson.core.JsonParser;
@@ -21,14 +23,78 @@

public class Urls {

static {
// This static block updates the list of top-level domain names used by the commons-validator
// library to verify whether URLs are valid.
// To update these arrays, the GenerateTLDLists tool can be used to automatically find
// new TLDs that are not yet included in the default list.

// Recent Country Code TLDs absent in commons-validator
String[] recentCountryCodeTLDs = new String[]{
"an",
"bl",
"bq",
"eh",
"mf",
"ss",
"tp",
"um",
"xn--2scrj9c",
"xn--3hcrj9c",
"xn--45br5cyl",
"xn--90ae",
"xn--h2breg3eve",
"xn--h2brj9c8c",
"xn--mgbah1a3hjkrd",
"xn--mgbai9azgqp6j",
"xn--mgbbh1a",
"xn--mgbgu82a",
"xn--rvc1e0am3e"};

// Recent Generic TLDs absent in commons-validator
String[] recentGenericTLDs = new String[]{
"africa",
"arab",
"charity",
"doosan",
"etisalat",
"flsmidth",
"grocery",
"hotels",
"iinet",
"inc",
"llc",
"map",
"merckmsd",
"mutuelle",
"phd",
"rugby",
"search",
"sport",
"xn--mgbaakc7dvf",
"xn--ngbrx",
"xn--otu796d"};

// Recent Sponsored TLDs absent in commons-validator
String[] recentSponsoredTLDs = new String[]{};

//# END

// Create TLD arrays
List<String> newGenericDomains = new ArrayList<>();
newGenericDomains.addAll(Arrays.asList(recentGenericTLDs));
newGenericDomains.addAll(Arrays.asList(recentSponsoredTLDs));
newGenericDomains.add("onion"); // we also want accept links from to TOR network
String[] genericPlusArray = newGenericDomains.toArray(new String[newGenericDomains.size()]);
// Finally, update commons-validator
DomainValidator.updateTLDOverride(ArrayType.GENERIC_PLUS, genericPlusArray);
DomainValidator.updateTLDOverride(ArrayType.COUNTRY_CODE_PLUS, recentCountryCodeTLDs);
}

private static final String[] ALLOWED_SCHEMES = {"http", "https"};

private static final UrlValidator VALIDATOR = new UrlValidator(ALLOWED_SCHEMES);

// .onion links aren't accepted by the validator
// Regex ".[^.]+" --> any string of at least 1 char without dot
private static final Pattern ONION_PATTERN = Pattern.compile("https?://.[^.]+\\.onion.*");

private static final List<String> INVALID_QUERY_PARAMETERS = Arrays.asList(
"sid",
"phpsessid",
@@ -40,7 +106,7 @@ public class Urls {


public static boolean isValid(String url) {
return VALIDATOR.isValid(url) || ONION_PATTERN.matcher(url).matches();
return VALIDATOR.isValid(url);
}

public static String normalize(String url) {
@@ -5,13 +5,13 @@
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;

import focusedCrawler.util.Urls;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.validator.routines.UrlValidator;
import org.junit.Test;

import focusedCrawler.config.Configuration;
@@ -62,9 +62,8 @@ public void backlinksShouldBeDownloadedFromGoogle() throws MalformedURLException
}

public boolean backLinkSetIsValid(BackLinkNeighborhood[] backlinks) {
UrlValidator validator = new UrlValidator();
for (BackLinkNeighborhood backlink : backlinks) {
if (validator.isValid(backlink.getLink()))
if (Urls.isValid(backlink.getLink()))
return true;
}
return false;
22 changes: 22 additions & 0 deletions src/test/java/focusedCrawler/util/UrlsTest.java
@@ -0,0 +1,22 @@
package focusedCrawler.util;

import static org.junit.Assert.assertTrue;

import org.junit.Test;

public class UrlsTest {

/**
* See issue https://github.com/VIDA-NYU/ache/issues/177
*/
@Test
public void recentTLDsShouldBeValid() {
assertTrue(Urls.isValid("http://registry.africa"));
}

@Test
public void onionLinksShouldBeValid() {
assertTrue(Urls.isValid("http://3g2upl4pq6kufc4m.onion/"));
}

}
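The two tests above exercise the override installed by the static initializer in Urls.java. For readers unfamiliar with the commons-validator mechanism, here is a standalone sketch of how updateTLDOverride and UrlValidator interact; it is not part of this commit, the class name and URLs are illustrative, and the behavior described is that of commons-validator 1.6. Note that updateTLDOverride can only be called before the first DomainValidator instance is obtained, which is why the commit performs the override in a static initializer.

import org.apache.commons.validator.routines.DomainValidator;
import org.apache.commons.validator.routines.DomainValidator.ArrayType;
import org.apache.commons.validator.routines.UrlValidator;

public class TldOverrideDemo {
    public static void main(String[] args) {
        // Register extra generic TLDs before any DomainValidator instance is created;
        // later calls fail with an IllegalStateException in commons-validator 1.6.
        DomainValidator.updateTLDOverride(ArrayType.GENERIC_PLUS, new String[]{"africa", "onion"});

        UrlValidator validator = new UrlValidator(new String[]{"http", "https"});
        System.out.println(validator.isValid("http://registry.africa"));          // true
        System.out.println(validator.isValid("http://3g2upl4pq6kufc4m.onion/"));  // true
    }
}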
24 changes: 24 additions & 0 deletions src/test/java/focusedCrawler/util/parser/PaginaURLTest.java
@@ -191,6 +191,30 @@ public void shouldNormalizeLinks() throws MalformedURLException {
assertThat(links[2].toString(), is("http://example.com/"));
}

@Test
public void shouldExtractLinksWithRecentTldsAndOnionLinks() throws MalformedURLException {
// given
URL url = new URL("http://example.com/test.html");
StringBuilder testPage = new StringBuilder();
testPage.append("<!DOCTYPE html>");
testPage.append("<html>");
testPage.append("<body>");
testPage.append("<h1>My First Heading</h1>");
testPage.append("<a href = \"http://registry.africa/\">Link with recent TLDs</a>");
testPage.append("<a href = \"http://3g2upl4pq6kufc4m.onion/\">Onion Link</a>");
testPage.append("</body>");
testPage.append("</html>");

// when
PaginaURL paginaURL = new PaginaURL(url, testPage.toString());
URL[] links = paginaURL.links();

// then
assertThat(links.length, is(2));
assertThat(links[0].toString(), is("http://registry.africa/"));
assertThat(links[1].toString(), is("http://3g2upl4pq6kufc4m.onion/"));
}

private String createTestPage() {
StringBuilder testPage = new StringBuilder();
testPage.append("<!DOCTYPE html>");
