-
Notifications
You must be signed in to change notification settings - Fork 134
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix #177 by adding recent TLDs list to URL validator utils
- Loading branch information
Showing
7 changed files
with
267 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
144 changes: 144 additions & 0 deletions
144
src/main/java/focusedCrawler/tools/GenerateTLDLists.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
package focusedCrawler.tools; | ||
|
||
import focusedCrawler.crawler.async.fetcher.OkHttpFetcher; | ||
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult; | ||
import focusedCrawler.crawler.crawlercommons.fetcher.http.UserAgent; | ||
import focusedCrawler.crawler.crawlercommons.fetcher.http.UserAgent.Builder; | ||
import focusedCrawler.util.CliTool; | ||
import io.airlift.airline.Command; | ||
import java.net.IDN; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.StringJoiner; | ||
import java.util.TreeSet; | ||
import org.apache.commons.validator.routines.DomainValidator; | ||
import org.apache.commons.validator.routines.DomainValidator.ArrayType; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
@Command(name = "GenerateTLDLists", description = "") | ||
public class GenerateTLDLists extends CliTool { | ||
|
||
public static void main(String[] args) throws Exception { | ||
CliTool.run(args, new GenerateTLDLists()); | ||
} | ||
|
||
@Override | ||
public void execute() throws Exception { | ||
|
||
UserAgent userAgent = new Builder().setAgentName("ACHE").build(); | ||
OkHttpFetcher fetcher = new OkHttpFetcher(userAgent); | ||
fetcher.setDefaultMaxContentSize(10 * 1024 * 1024); | ||
FetchedResult result = fetcher.get("https://www.iana.org/domains/root/db"); | ||
String html = new String(result.getContent()); | ||
|
||
Map<String, TreeSet<String>> tlds = extractTLDsFromHTML(result, html); | ||
|
||
checkCategoryExists(tlds, "country-code"); | ||
checkCategoryExists(tlds, "infrastructure"); | ||
checkCategoryExists(tlds, "test"); | ||
checkCategoryExists(tlds, "sponsored"); | ||
checkCategoryExists(tlds, "generic"); | ||
|
||
System.out.println(); | ||
|
||
System.out.println("// Recent Country Code TLDs absent in commons-validator"); | ||
printMissing("recentCountryCodeTLDs", tlds.get("country-code"), ArrayType.COUNTRY_CODE_RO); | ||
|
||
System.out.println("// Recent Generic TLDs absent in commons-validator"); | ||
printMissing("recentGenericTLDs", tlds.get("generic"), ArrayType.GENERIC_RO); | ||
|
||
System.out.println("// Recent Sponsored TLDs absent in commons-validator"); | ||
// sponsored category is stored in array type GENERIC_RO in commons-validator | ||
printMissing("recentSponsoredTLDs", tlds.get("sponsored"), ArrayType.GENERIC_RO); | ||
} | ||
|
||
private Map<String, TreeSet<String>> extractTLDsFromHTML(FetchedResult result, String html) { | ||
Document dom = Jsoup.parse(html, result.getFetchedUrl()); | ||
|
||
Map<String, TreeSet<String>> tlds = new HashMap<>(); | ||
Elements tr = dom.select("#tld-table tbody > tr"); | ||
for (Element element : tr) { | ||
Elements children = element.children(); | ||
if (children.size() != 3) { | ||
System.err.println( | ||
"WARN: Found a table row (tr) not with 3 children. The HTML template may have changed."); | ||
continue; | ||
} | ||
String tld = children.get(0).text(); | ||
String type = children.get(1).text(); | ||
|
||
tld = normalizeTld(tld); | ||
if (tld == null) { | ||
continue; | ||
} | ||
|
||
TreeSet<String> tldList = tlds.get(type); | ||
if (tldList == null) { | ||
tldList = new TreeSet<>(); | ||
tlds.put(type, tldList); | ||
} | ||
|
||
tldList.add(tld); | ||
} | ||
|
||
System.out.println("Found TLDs per category:"); | ||
for (Map.Entry<String, TreeSet<String>> entry : tlds.entrySet()) { | ||
String type = entry.getKey(); | ||
System.out.print(type + ": "); | ||
System.out.println(entry.getValue().size()); | ||
} | ||
return tlds; | ||
} | ||
|
||
private String normalizeTld(final String tld) { | ||
int lastChar = tld.length() - 1; | ||
if (!( | ||
(tld.charAt(0) == '.') || | ||
(tld.charAt(0) == '\u200F' && tld.charAt(1) == '.' | ||
&& tld.charAt(lastChar) == '\u200E') | ||
) | ||
) { | ||
System.err.printf("WARN: Found a TLD without leading dot: [%s]." | ||
+ " The HTML template may have changed.\n", tld); | ||
} | ||
String normalized = null; | ||
if (tld.charAt(0) == '\u200F' && tld.charAt(1) == '.' && tld.charAt(lastChar) == '\u200E') { | ||
normalized = tld.substring(2, tld.length() - 1); | ||
} | ||
if (tld.charAt(0) == '.') { | ||
normalized = tld.substring(1); | ||
} | ||
try { | ||
normalized = IDN.toASCII(normalized); | ||
} catch (Exception e) { | ||
System.err.printf("WARN: Failed to convert normalized string [%s]" | ||
+ " from TLD [%s] to punnycode.\n", normalized, tld); | ||
return null; | ||
} | ||
return normalized; | ||
} | ||
|
||
private void printMissing(String variableName, TreeSet<String> tlds, ArrayType tldType) { | ||
List<String> countryCode = Arrays.asList(DomainValidator.getTLDEntries(tldType)); | ||
StringJoiner str = new StringJoiner(","); | ||
for (String tld : tlds) { | ||
if (!countryCode.contains(tld)) { | ||
str.add("\n\"" + tld + "\""); | ||
} | ||
} | ||
System.out.printf("String[] " + variableName + " = new String[]{" + str.toString() + "};\n\n"); | ||
} | ||
|
||
private void checkCategoryExists(Map<String, TreeSet<String>> tlds, String tldCategory) { | ||
if (!tlds.containsKey(tldCategory)) { | ||
System.out.println("WARN: TLD category not found: " + tldCategory | ||
+ ". Site template may have changed."); | ||
} | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
package focusedCrawler.util; | ||
|
||
import static org.junit.Assert.assertTrue; | ||
|
||
import org.junit.Test; | ||
|
||
public class UrlsTest { | ||
|
||
/** | ||
* See issue https://github.com/VIDA-NYU/ache/issues/177 | ||
*/ | ||
@Test | ||
public void RecentTLDsShouldBeValid() { | ||
assertTrue(Urls.isValid("http://registry.africa")); | ||
} | ||
|
||
@Test | ||
public void OnionLinksShouldBeValid() { | ||
assertTrue(Urls.isValid("http://3g2upl4pq6kufc4m.onion/")); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters