From f8c34958c69ffa3c6853dea55bc2aa5c5fa8bc8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Mon, 1 Apr 2019 21:51:54 -0400 Subject: [PATCH] Fix #177 by adding recent TLDs list to URL validator utils --- CHANGELOG.md | 1 + .../focusedCrawler/seedfinder/BingSearch.java | 5 +- .../tools/GenerateTLDLists.java | 144 ++++++++++++++++++ src/main/java/focusedCrawler/util/Urls.java | 78 +++++++++- .../builder/BacklinkSurferTest.java | 5 +- .../java/focusedCrawler/util/UrlsTest.java | 22 +++ .../util/parser/PaginaURLTest.java | 24 +++ 7 files changed, 267 insertions(+), 12 deletions(-) create mode 100644 src/main/java/focusedCrawler/tools/GenerateTLDLists.java create mode 100644 src/test/java/focusedCrawler/util/UrlsTest.java diff --git a/CHANGELOG.md b/CHANGELOG.md index fd0037fcc..9a6fce915 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ - Upgrade `crawler-commons` library to version 1.0 - Upgrade `commons-validator` library to version 1.6 - Upgrade `okhttp3` library to version 3.14.0 +- Fix issue #177: Links from recent TLDs are considered invalid ## Version 0.11.0 diff --git a/src/main/java/focusedCrawler/seedfinder/BingSearch.java b/src/main/java/focusedCrawler/seedfinder/BingSearch.java index c0ec0b84e..aaab5fc0d 100644 --- a/src/main/java/focusedCrawler/seedfinder/BingSearch.java +++ b/src/main/java/focusedCrawler/seedfinder/BingSearch.java @@ -1,12 +1,12 @@ package focusedCrawler.seedfinder; +import focusedCrawler.util.Urls; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; -import org.apache.commons.validator.routines.UrlValidator; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -23,7 +23,6 @@ public class BingSearch implements SearchEngineApi { private final SimpleHttpFetcher fetcher; private int docsPerPage = 10; - private UrlValidator urlValidator = new UrlValidator(); private TimeDelay 
timer = new TimeDelay(5000); @@ -52,7 +51,7 @@ public List submitQuery(String query, int page) throws IOE List links = new ArrayList<>(); for (Element link : linksUrl) { String linkStr = link.attr("href"); - if(urlValidator.isValid(linkStr)) { + if(Urls.isValid(linkStr)) { BackLinkNeighborhood bl = new BackLinkNeighborhood(); bl.setLink(linkStr); bl.setTitle(link.text()); diff --git a/src/main/java/focusedCrawler/tools/GenerateTLDLists.java b/src/main/java/focusedCrawler/tools/GenerateTLDLists.java new file mode 100644 index 000000000..f9af6bfce --- /dev/null +++ b/src/main/java/focusedCrawler/tools/GenerateTLDLists.java @@ -0,0 +1,144 @@ +package focusedCrawler.tools; + +import focusedCrawler.crawler.async.fetcher.OkHttpFetcher; +import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult; +import focusedCrawler.crawler.crawlercommons.fetcher.http.UserAgent; +import focusedCrawler.crawler.crawlercommons.fetcher.http.UserAgent.Builder; +import focusedCrawler.util.CliTool; +import io.airlift.airline.Command; +import java.net.IDN; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.StringJoiner; +import java.util.TreeSet; +import org.apache.commons.validator.routines.DomainValidator; +import org.apache.commons.validator.routines.DomainValidator.ArrayType; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +@Command(name = "GenerateTLDLists", description = "") +public class GenerateTLDLists extends CliTool { + + public static void main(String[] args) throws Exception { + CliTool.run(args, new GenerateTLDLists()); + } + + @Override + public void execute() throws Exception { + + UserAgent userAgent = new Builder().setAgentName("ACHE").build(); + OkHttpFetcher fetcher = new OkHttpFetcher(userAgent); + fetcher.setDefaultMaxContentSize(10 * 1024 * 1024); + FetchedResult result = 
fetcher.get("https://www.iana.org/domains/root/db"); + String html = new String(result.getContent()); + + Map> tlds = extractTLDsFromHTML(result, html); + + checkCategoryExists(tlds, "country-code"); + checkCategoryExists(tlds, "infrastructure"); + checkCategoryExists(tlds, "test"); + checkCategoryExists(tlds, "sponsored"); + checkCategoryExists(tlds, "generic"); + + System.out.println(); + + System.out.println("// Recent Country Code TLDs absent in commons-validator"); + printMissing("recentCountryCodeTLDs", tlds.get("country-code"), ArrayType.COUNTRY_CODE_RO); + + System.out.println("// Recent Generic TLDs absent in commons-validator"); + printMissing("recentGenericTLDs", tlds.get("generic"), ArrayType.GENERIC_RO); + + System.out.println("// Recent Sponsored TLDs absent in commons-validator"); + // sponsored category is stored in array type GENERIC_RO in commons-validator + printMissing("recentSponsoredTLDs", tlds.get("sponsored"), ArrayType.GENERIC_RO); + } + + private Map> extractTLDsFromHTML(FetchedResult result, String html) { + Document dom = Jsoup.parse(html, result.getFetchedUrl()); + + Map> tlds = new HashMap<>(); + Elements tr = dom.select("#tld-table tbody > tr"); + for (Element element : tr) { + Elements children = element.children(); + if (children.size() != 3) { + System.err.println( + "WARN: Found a table row (tr) not with 3 children. 
The HTML template may have changed."); + continue; + } + String tld = children.get(0).text(); + String type = children.get(1).text(); + + tld = normalizeTld(tld); + if (tld == null) { + continue; + } + + TreeSet tldList = tlds.get(type); + if (tldList == null) { + tldList = new TreeSet<>(); + tlds.put(type, tldList); + } + + tldList.add(tld); + } + + System.out.println("Found TLDs per category:"); + for (Map.Entry> entry : tlds.entrySet()) { + String type = entry.getKey(); + System.out.print(type + ": "); + System.out.println(entry.getValue().size()); + } + return tlds; + } + + private String normalizeTld(final String tld) { + int lastChar = tld.length() - 1; + if (!( + (tld.charAt(0) == '.') || + (tld.charAt(0) == '\u200F' && tld.charAt(1) == '.' + && tld.charAt(lastChar) == '\u200E') + ) + ) { + System.err.printf("WARN: Found a TLD without leading dot: [%s]." + + " The HTML template may have changed.\n", tld); + } + String normalized = null; + if (tld.charAt(0) == '\u200F' && tld.charAt(1) == '.' 
&& tld.charAt(lastChar) == '\u200E') { + normalized = tld.substring(2, tld.length() - 1); + } + if (tld.charAt(0) == '.') { + normalized = tld.substring(1); + } + try { + normalized = IDN.toASCII(normalized); + } catch (Exception e) { + System.err.printf("WARN: Failed to convert normalized string [%s]" + + " from TLD [%s] to punnycode.\n", normalized, tld); + return null; + } + return normalized; + } + + private void printMissing(String variableName, TreeSet tlds, ArrayType tldType) { + List countryCode = Arrays.asList(DomainValidator.getTLDEntries(tldType)); + StringJoiner str = new StringJoiner(","); + for (String tld : tlds) { + if (!countryCode.contains(tld)) { + str.add("\n\"" + tld + "\""); + } + } + System.out.printf("String[] " + variableName + " = new String[]{" + str.toString() + "};\n\n"); + } + + private void checkCategoryExists(Map> tlds, String tldCategory) { + if (!tlds.containsKey(tldCategory)) { + System.out.println("WARN: TLD category not found: " + tldCategory + + ". Site template may have changed."); + } + } + +} diff --git a/src/main/java/focusedCrawler/util/Urls.java b/src/main/java/focusedCrawler/util/Urls.java index 66188df8a..e6c089b8f 100644 --- a/src/main/java/focusedCrawler/util/Urls.java +++ b/src/main/java/focusedCrawler/util/Urls.java @@ -3,11 +3,13 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.TreeSet; -import java.util.regex.Pattern; +import org.apache.commons.validator.routines.DomainValidator; +import org.apache.commons.validator.routines.DomainValidator.ArrayType; import org.apache.commons.validator.routines.UrlValidator; import com.fasterxml.jackson.core.JsonParser; @@ -21,14 +23,78 @@ public class Urls { + static { + // This static block updates the list of top-level domain names from the commons-validator + // library, which is used to verify if URLs are valid. 
+ // To update these arrays, the tool GenerateTLDLists can be used to automatically find the + new TLDs not included in default list. + + // Recent Country Code TLDs absent in commons-validator + String[] recentCountryCodeTLDs = new String[]{ + "an", + "bl", + "bq", + "eh", + "mf", + "ss", + "tp", + "um", + "xn--2scrj9c", + "xn--3hcrj9c", + "xn--45br5cyl", + "xn--90ae", + "xn--h2breg3eve", + "xn--h2brj9c8c", + "xn--mgbah1a3hjkrd", + "xn--mgbai9azgqp6j", + "xn--mgbbh1a", + "xn--mgbgu82a", + "xn--rvc1e0am3e"}; + + // Recent Generic TLDs absent in commons-validator + String[] recentGenericTLDs = new String[]{ + "africa", + "arab", + "charity", + "doosan", + "etisalat", + "flsmidth", + "grocery", + "hotels", + "iinet", + "inc", + "llc", + "map", + "merckmsd", + "mutuelle", + "phd", + "rugby", + "search", + "sport", + "xn--mgbaakc7dvf", + "xn--ngbrx", + "xn--otu796d"}; + + // Recent Sponsored TLDs absent in commons-validator + String[] recentSponsoredTLDs = new String[]{}; + + //# END + + // Create TLD arrays + List newGenericDomains = new ArrayList<>(); + newGenericDomains.addAll(Arrays.asList(recentGenericTLDs)); + newGenericDomains.addAll(Arrays.asList(recentSponsoredTLDs)); + newGenericDomains.add("onion"); // we also want to accept links to the TOR network + String[] gericPlusArray = newGenericDomains.toArray(new String[newGenericDomains.size()]); + // Finally, update commons-validator + DomainValidator.updateTLDOverride(ArrayType.GENERIC_PLUS, gericPlusArray); + DomainValidator.updateTLDOverride(ArrayType.COUNTRY_CODE_PLUS, recentCountryCodeTLDs); + } + private static final String[] ALLOWED_SCHEMES = {"http", "https"}; private static final UrlValidator VALIDATOR = new UrlValidator(ALLOWED_SCHEMES); - // .onion links aren't accepted by the validator - // Regex ".[^.]+" --> any string of at least 1 char without dot - private static final Pattern ONION_PATTERN = Pattern.compile("https?://.[^.]+\\.onion.*"); - private static final List INVALID_QUERY_PARAMETERS = 
Arrays.asList( "sid", "phpsessid", @@ -40,7 +106,7 @@ public class Urls { public static boolean isValid(String url) { - return VALIDATOR.isValid(url) || ONION_PATTERN.matcher(url).matches(); + return VALIDATOR.isValid(url); } public static String normalize(String url) { diff --git a/src/test/java/focusedCrawler/link/classifier/builder/BacklinkSurferTest.java b/src/test/java/focusedCrawler/link/classifier/builder/BacklinkSurferTest.java index a516b3066..32dead21b 100644 --- a/src/test/java/focusedCrawler/link/classifier/builder/BacklinkSurferTest.java +++ b/src/test/java/focusedCrawler/link/classifier/builder/BacklinkSurferTest.java @@ -5,13 +5,13 @@ import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; +import focusedCrawler.util.Urls; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.HashMap; import java.util.Map; -import org.apache.commons.validator.routines.UrlValidator; import org.junit.Test; import focusedCrawler.config.Configuration; @@ -62,9 +62,8 @@ public void backlinksShouldBeDownloadedFromGoogle() throws MalformedURLException } public boolean backLinkSetIsValid(BackLinkNeighborhood[] backlinks) { - UrlValidator validator = new UrlValidator(); for (BackLinkNeighborhood backlink : backlinks) { - if (validator.isValid(backlink.getLink())) + if (Urls.isValid(backlink.getLink())) return true; } return false; diff --git a/src/test/java/focusedCrawler/util/UrlsTest.java b/src/test/java/focusedCrawler/util/UrlsTest.java new file mode 100644 index 000000000..544e7b366 --- /dev/null +++ b/src/test/java/focusedCrawler/util/UrlsTest.java @@ -0,0 +1,22 @@ +package focusedCrawler.util; + +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +public class UrlsTest { + + /** + * See issue https://github.com/VIDA-NYU/ache/issues/177 + */ + @Test + public void RecentTLDsShouldBeValid() { + assertTrue(Urls.isValid("http://registry.africa")); + } + + @Test + public void 
OnionLinksShouldBeValid() { + assertTrue(Urls.isValid("http://3g2upl4pq6kufc4m.onion/")); + } + +} \ No newline at end of file diff --git a/src/test/java/focusedCrawler/util/parser/PaginaURLTest.java b/src/test/java/focusedCrawler/util/parser/PaginaURLTest.java index 778b50339..f3cb9d19c 100644 --- a/src/test/java/focusedCrawler/util/parser/PaginaURLTest.java +++ b/src/test/java/focusedCrawler/util/parser/PaginaURLTest.java @@ -191,6 +191,30 @@ public void shouldNormalizeLinks() throws MalformedURLException { assertThat(links[2].toString(), is("http://example.com/")); } + @Test + public void shouldExtractLinksWithRecentAndOnion() throws MalformedURLException { + // given + URL url = new URL("http://example.com/test.html"); + StringBuilder testPage = new StringBuilder(); + testPage.append(""); + testPage.append(""); + testPage.append(""); + testPage.append("

My First Heading

"); + testPage.append("Link with recent TLDs"); + testPage.append("Onion Link"); + testPage.append(""); + testPage.append(""); + + // when + PaginaURL paginaURL = new PaginaURL(url, testPage.toString()); + URL[] links = paginaURL.links(); + + // then + assertThat(links.length, is(2)); + assertThat(links[0].toString(), is("http://registry.africa/")); + assertThat(links[1].toString(), is("http://3g2upl4pq6kufc4m.onion/")); + } + private String createTestPage() { StringBuilder testPage = new StringBuilder(); testPage.append("");