diff --git a/ache/src/main/java/achecrawler/util/Urls.java b/ache/src/main/java/achecrawler/util/Urls.java
index 0cf381701..6c82a3bdb 100644
--- a/ache/src/main/java/achecrawler/util/Urls.java
+++ b/ache/src/main/java/achecrawler/util/Urls.java
@@ -121,7 +121,7 @@ public static String removeFragmentsIfAny(String url) {
         return url;
     }
 
-    public static String resolveHttpLink(HttpUrl base, String link) {
+    public static HttpUrl resolveHttpLink(HttpUrl base, String link) {
         HttpUrl resolvedUrl;
         try {
             if (base == null) {
@@ -133,6 +133,11 @@ public static String resolveHttpLink(HttpUrl base, String link) {
             // The link is invalid or malformed
             resolvedUrl = null;
         }
+        return resolvedUrl;
+    }
+
+    public static String resolveHttpLinkAsString(HttpUrl base, String link) {
+        HttpUrl resolvedUrl = resolveHttpLink(base, link);
         if (resolvedUrl == null) {
             return null;
         } else {
diff --git a/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java b/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java
new file mode 100644
index 000000000..f458dac34
--- /dev/null
+++ b/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java
@@ -0,0 +1,410 @@
+package achecrawler.util.parser;
+
+import achecrawler.util.Urls;
+import okhttp3.HttpUrl;
+import org.apache.commons.io.input.CharSequenceReader;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.SimpleAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.cyberneko.html.parsers.SAXParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.*;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * SAX-based HTML parser that extracts the page title, the clean page text,
+ * the hyperlinks, and, for each link, its anchor text plus the tokens that
+ * surround it (its "link neighborhood").
+ */
+public class HtmlSaxParser extends SAXParser implements ContentHandler {
+
+    public static final Logger logger = LoggerFactory.getLogger(HtmlSaxParser.class);
+
+    /** Number of context tokens captured on each side of an anchor. */
+    public static final int AROUND_WORDS = 10;
+
+    private final List<Anchor> anchors = new ArrayList<>();
+    private final List<String> images = new ArrayList<>();
+    private final List<String> tokens = new ArrayList<>();
+
+    private final StringBuilder title = new StringBuilder();
+    private final StringBuilder text = new StringBuilder();
+    private final SimpleTokenizer tokenizer = new SimpleTokenizer(new CharSequenceReader(text));
+
+    private HttpUrl base;
+    private TextType textState = TextType.TEXT;
+    private String currentHref = null;
+    private int currentHrefTextStart = 0;
+    private int currentHrefTokenStart = 0;
+    private StringBuilder anchorText = new StringBuilder();
+
+    public HtmlSaxParser(URL url, String html) {
+        this(url.toString(), html);
+    }
+
+    public HtmlSaxParser(String url, String html) {
+        this.base = HttpUrl.parse(url);
+        setContentHandler(this);
+        InputSource input = new InputSource(new StringReader(html));
+        try {
+            this.parse(input);
+        } catch (SAXException | IOException e) {
+            throw new RuntimeException("Failed to parse page: " + url, e);
+        }
+    }
+
+    @Override
+    public void startElement(String uri, String tagName, String qName, Attributes atts) {
+        switch (tagName) {
+            case "BASE": {
+                handleBaseTag(atts);
+                break;
+            }
+            case "A": {
+                this.textState = TextType.ANCHOR_TEXT;
+                String href = atts.getValue("href");
+                String link = createLink(this.base, href);
+                if (link != null) {
+                    this.currentHref = link;
+                    this.currentHrefTextStart = text.length();
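+                    // Flush all text accumulated before this anchor through
+                    // the tokenizer, so that tokens.size() becomes the index
+                    // of this anchor's first token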
+                    this.tokenizer.tokenize();
+                    this.currentHrefTokenStart = this.tokens.size();
+                }
+                break;
+            }
+            case "IMG": {
+                String src = atts.getValue("src");
+                if (src != null && !src.isEmpty()) {
+                    images.add(createLink(this.base, src));
+                }
+                break;
+            }
+            case "NOSCRIPT":
+            case "SCRIPT":
+            case "STYLE":
+                this.textState = TextType.IGNORE;
+                break;
+            case "TITLE":
+                this.textState = TextType.TITLE;
+                break;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String tagName, String qName) {
+        // TODO: extract data from other tags (e.g., description, keywords, noindex, nofollow)
+        switch (tagName) {
+            case "A":
+                if (currentHref != null && !currentHref.isEmpty()) {
+                    tokenizer.tokenize();
+                    anchors.add(new Anchor(currentHref, currentHrefTextStart, text.length(),
+                            anchorText.toString().trim(), currentHrefTokenStart, tokens.size()));
+                    currentHref = null;
+                }
+                anchorText = new StringBuilder();
+                textState = TextType.TEXT;
+                break;
+            case "TITLE":
+                textState = TextType.TEXT;
+                break;
+            case "P":
+            case "H1":
+            case "H2":
+            case "H3":
+            case "H4":
+            case "H5":
+            case "H6":
+                text.append("\n\n");
+                break;
+            case "BR":
+                text.append('\n');
+                break;
+            case "NOSCRIPT":
+            case "SCRIPT":
+            case "STYLE":
+                this.textState = TextType.TEXT;
+                break;
+            default:
+                text.append(' ');
+        }
+    }
+
+    /**
+     * Handles the BASE tag, which sets the URL that should be used for
+     * resolving all relative links found in the page.
+     */
+    private void handleBaseTag(Attributes attributes) {
+        String href = attributes.getValue("href");
+        if (href != null && !href.isEmpty()) {
+            // All extracted links should be resolved against the BASE href
+            try {
+                HttpUrl newBase = Urls.resolveHttpLink(this.base, href);
+                if (newBase != null) {
+                    this.base = newBase;
+                }
+            } catch (Exception e) {
+                // ignore invalid URLs
+            }
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) {
+        switch (textState) {
+            case TEXT:
+                text.append(ch, start, length);
+                break;
+            case ANCHOR_TEXT:
+                text.append(ch, start, length);
+                anchorText.append(ch, start, length);
+                break;
+            case TITLE:
+                title.append(ch, start, length);
+                break;
+            case IGNORE:
+                break;
+        }
+    }
+
+    /** Returns all valid, absolute URLs extracted from the page. */
+    public URL[] links() {
+        List<URL> links = new ArrayList<>();
+        for (Anchor anchor : anchors) {
+            URL absoluteUrl = Urls.toJavaURL(anchor.href);
+            if (absoluteUrl != null) {
+                links.add(absoluteUrl);
+            }
+        }
+        return links.toArray(new URL[links.size()]);
+    }
+
+    /**
+     * Returns one LinkNeighborhood per extracted link, containing the link's
+     * anchor tokens and the tokens surrounding the anchor in the page text.
+     */
+    public LinkNeighborhood[] getLinkNeighborhood() {
+        List<LinkNeighborhood> links = new ArrayList<>();
+        for (Anchor anchor : anchors) {
+            URL absoluteUrl = Urls.toJavaURL(anchor.href);
+            LinkNeighborhood ln = new LinkNeighborhood(absoluteUrl);
+            ln.setAround(createAroundText(anchor));
+            ln.setAnchor(createAnchorText(anchor));
+            links.add(ln);
+        }
+        return links.toArray(new LinkNeighborhood[links.size()]);
+    }
+
+    private String[] createAnchorText(Anchor anchor) {
+        List<String> anchorTemp = new ArrayList<>();
+        for (int i = anchor.tokenStart; i < anchor.tokenEnd; i++) {
+            anchorTemp.add(tokens.get(i));
+        }
+        return anchorTemp.toArray(new String[anchorTemp.size()]);
+    }
+
+    private String[] createAroundText(Anchor anchor) {
+        List<String> aroundTemp = new ArrayList<>();
+        final int begin = Math.max(0, anchor.tokenStart - AROUND_WORDS);
+        for (int i = begin; i < anchor.tokenStart; i++) {
+            aroundTemp.add(tokens.get(i));
+        }
+        int end = Math.min(tokens.size(), anchor.tokenEnd + AROUND_WORDS);
+        for (int i = anchor.tokenEnd; i < end; i++) {
+            aroundTemp.add(tokens.get(i));
+        }
+        return aroundTemp.toArray(new String[aroundTemp.size()]);
+    }
+
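+    /**
+     * Resolves the given href against the base URL and returns a normalized,
+     * absolute URL string, or null when the link is empty, malformed, or uses
+     * a scheme that should not be crawled (javascript:, mailto:, tel:, data:).
+     */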
+    private String createLink(HttpUrl base, String href) {
+        if (href == null || href.isEmpty()) {
+            return null;
+        }
+        String url = href.trim();
+        if (url.isEmpty()) {
+            return null;
+        }
+        if (url.startsWith("javascript:") || url.startsWith("mailto:")
+                || url.startsWith("tel:") || url.startsWith("data:")) {
+            return null;
+        }
+        String absoluteUrl = Urls.resolveHttpLinkAsString(base, url);
+        if (absoluteUrl == null || absoluteUrl.isEmpty()) {
+            return null;
+        }
+        if (!Urls.isValid(absoluteUrl)) {
+            return null;
+        }
+        return Urls.normalize(absoluteUrl);
+    }
+
+    public URL getURL() {
+        return base != null ? base.url() : null;
+    }
+
+    public List<String> tokens() {
+        return this.tokens;
+    }
+
+    public String title() {
+        return this.title.toString();
+    }
+
+    public String text() {
+        return this.text.toString();
+    }
+
+    // TODO: Clean up
+    private void print() {
+        System.out.println("---");
+        System.out.println("TEXT: " + text.toString());
+        System.out.println("ANCHORS: ");
+        for (Anchor anchor : anchors) {
+            System.out.println("> " + anchor);
+        }
+    }
+
+    @Override
+    public void setDocumentLocator(Locator locator) {
+    }
+
+    @Override
+    public void startDocument() {
+    }
+
+    @Override
+    public void endDocument() {
+        // Finish tokenization of any text left over
+        this.tokenizer.tokenize();
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) {
+    }
+
+    @Override
+    public void processingInstruction(String target, String data) {
+    }
+
+    @Override
+    public void skippedEntity(String name) {
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) {
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) {
+    }
+
+    enum TextType {
+        TITLE, TEXT, ANCHOR_TEXT, IGNORE
+    }
+
+    static class Anchor {
+
+        private final String href;
+        private final int textStart;
+        private final int textEnd;
+        private final String anchorText;
+        private final int tokenStart;
+        private final int tokenEnd;
+
+        Anchor(String href, int textStart, int textEnd, String anchorText,
+                int tokenStart, int tokenEnd) {
+            this.href = href;
+            this.textStart = textStart;
+            this.textEnd = textEnd;
+            this.anchorText = anchorText;
+            this.tokenStart = tokenStart;
+            this.tokenEnd = tokenEnd;
+        }
+
+        @Override
+        public String toString() {
+            return "Anchor[href=" + href
+                    + ", textStart=" + textStart
+                    + ", textEnd=" + textEnd
+                    + ", text=" + anchorText
+                    + "]";
+        }
+    }
+
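+    /**
+     * Incrementally tokenizes the text accumulated in the parser's text
+     * buffer: each call to tokenize() consumes the characters appended since
+     * the previous call and appends the resulting tokens to the token list.
+     */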
+    public class SimpleTokenizer {
+
+        private final TokenStream ts;
+        private final CharTermAttribute cattr;
+
+        public SimpleTokenizer(CharSequenceReader cleanText) {
+            // TODO: set up a good general-purpose tokenizer
+            Analyzer analyzer = new SimpleAnalyzer();
+            ts = analyzer.tokenStream("cleanText", cleanText);
+            cattr = ts.addAttribute(CharTermAttribute.class);
+            try {
+                ts.reset();
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        public void tokenize() {
+            try {
+                while (ts.incrementToken()) {
+                    String token = cattr.toString();
+                    HtmlSaxParser.this.tokens.add(token);
+                }
+            } catch (IOException e) {
+                throw new RuntimeException("Tokenization failed", e);
+            }
+        }
+    }
+
+    // TODO: Clean up
+    public static void main(String[] args) throws Exception {
+        String url = "http://www.darpa.mil/program/memex";
+        String html =
+                "<!DOCTYPE html>"
+                + "<html>"
+                + "<body>"
+                + "<h1>My heading 1!</h1>"
+                + "<p>My Paragraph.</p>"
+                + "</body>"
+                + "</html>";
+        // Overwrite the inline example with a larger page from the test resources
+        html = new String(Files.readAllBytes(Paths.get(
+                "ache-tools/src/test/resources/achecrawler/memex/cdr/http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex")));
+        HtmlSaxParser parser = new HtmlSaxParser(url, html);
+
+        final LinkNeighborhood[] neighborhoods = parser.getLinkNeighborhood();
+        for (LinkNeighborhood n : neighborhoods) {
+            System.out.println("> Around: " + n.getLink().toString());
+            System.out.println(n.getAroundString());
+        }
+    }
+}
diff --git a/ache/src/main/java/achecrawler/util/parser/PaginaURL.java b/ache/src/main/java/achecrawler/util/parser/PaginaURL.java
index 96377f2c2..36b23beb5 100644
--- a/ache/src/main/java/achecrawler/util/parser/PaginaURL.java
+++ b/ache/src/main/java/achecrawler/util/parser/PaginaURL.java
@@ -895,17 +895,17 @@ protected void separadorTextoCodigo(String arquivo) { // arquivo equivale ao
                                 ln = new LinkNeighborhood(new URL(urlTemp));
                             }
                         } else if (tagName.equals("img")
-                                        && atributo.equals("src")) {
-                            if(ln != null){
-                                ln.setImgSource(str);
-                            }
+                                && atributo.equals("src")) {
+                            if (ln != null) {
+                                ln.setImgSource(str);
+                            }
                             try {
-                                imagens.add(Urls.resolveHttpLink(base,str).toString());
+                                imagens.add(Urls.resolveHttpLink(base, str).toString());
                             } catch (Exception e) {
                                 // TODO: handle exception
                             }
-
-                        }
+                        }
+
                         else if (tagName.equals("frame") && atributo.equals("src")) {
                             frames.add(str);
                             addLink(str, base);
@@ -1022,7 +1022,7 @@ else if (tagName.equals("frame") && atributo.equals("src")) {
             } else if (tagName.equals("base") && atributo.equals("href")) {
                 try {
                     HttpUrl oldBase = (baseUrl == null) ? null : HttpUrl.get(baseUrl);
-                    String newBase = Urls.resolveHttpLink(oldBase, str);
+                    String newBase = Urls.resolveHttpLinkAsString(oldBase, str);
                     base = (newBase == null) ? null : HttpUrl.parse(newBase);
                 } catch (Exception e) {
                     // ignore invalid URLs
@@ -1237,7 +1237,7 @@ protected String addLink(String link, HttpUrl base) {
             return "";
         }
         link = link.trim();
-        link = Urls.resolveHttpLink(base, link);
+        link = Urls.resolveHttpLinkAsString(base, link);
         if (link == null) {
             return "";
         }
diff --git a/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java b/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java
new file mode 100644
index 000000000..96fb4e665
--- /dev/null
+++ b/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java
@@ -0,0 +1,273 @@
+package achecrawler.util.parser;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.CoreMatchers.notNullValue;
+import static org.junit.Assert.assertThat;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+
+import org.junit.Test;
+
+public class HtmlSaxParserTest {
+
+    @Test
+    public void shouldExtractTitle() {
+        // given
+        String testString = new HtmlBuilder()
+                .withHeader("<title>ACHE Crawler \n \t</title>")
+                .withBody("<p>My text</p>")
+                .build();
+
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString);
+
+        // then
+        assertThat(pageParser.title().trim(), is("ACHE Crawler"));
+    }
+
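+    // The underlying SAX parser is expected to decode HTML entities before
+    // characters() is called, so the assertions below use the decoded
+    // characters (e.g., "©" rather than "&copy;")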
") + .build(); + + // when + HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString); + + // then + assertThat(pageParser.title().trim(), is("ACHE Crawler")); + } + + @Test + public void shouldCleanHtmlEntities() { + // given + String testString = new HtmlBuilder() + .withHeader("ACHE > domain specific search ©") + .withBody("

My text & me. €

") + .build(); + + // when + HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString); + + // then + assertThat(pageParser.title(), is("ACHE > domain specific search ©")); + assertThat(pageParser.text().trim(), is("My\u00A0text & me. €")); + } + + @Test + public void htmlEncodedLinksShouldBeEscaped() { + // given + String testString = new HtmlBuilder() + .withBody("Anchor text.") + .build(); + + // when + HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString); + URL[] extractedLinks = pageParser.links(); + LinkNeighborhood[] neighborhood = pageParser.getLinkNeighborhood(); + + // then + assertThat(extractedLinks[0].toString(), is("http://ex.com/index.php?p1=asdf&p2=qwer")); + assertThat(neighborhood[0].getLink().toString(), is("http://ex.com/index.php?p1=asdf&p2=qwer")); + } + + @Test + public void linksShouldNotContainFragments() throws MalformedURLException { + // given + String testString = new HtmlBuilder() + .appendToBody("
+    @Test
+    public void linksShouldNotContainFragments() throws MalformedURLException {
+        // given
+        String testString = new HtmlBuilder()
+                .appendToBody("<h1>My First Heading</h1>")
+                .appendToBody("<a href=\"https://en.wikipedia.org/wiki/Mouse_(computing)#fragment\">Mouse</a>")
+                .build();
+        URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
+
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testString);
+        URL[] extractedLinks = pageParser.links();
+
+        // then
+        assertThat(extractedLinks.length, is(1));
+        assertThat(extractedLinks[0].toString(), is("https://en.wikipedia.org/wiki/Mouse_(computing)"));
+    }
+
+    @Test
+    public void constructorsShouldWork() throws MalformedURLException {
+        // given
+        URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
+        String testPage = createTestPage();
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
+        // then
+        assertThat(pageParser.getURL(), is(notNullValue()));
+    }
+
+    @Test
+    public void shouldExtractOnionLinks() throws MalformedURLException {
+        // given
+        URL url = new URL("http://example.com/test.html");
+        String testPage = new HtmlBuilder()
+                .appendToBody("<a href=\"http://3g2asl4qw6kufc5m.onion/\">link 1</a>")
+                .appendToBody("<a href=\"http://3g2asl4qw6kufc5m.onion/test.html\">link 2</a>")
+                .build();
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
+        URL[] links = pageParser.links();
+
+        // then
+        assertThat(links.length, is(2));
+        assertThat(links[0].toString(), is("http://3g2asl4qw6kufc5m.onion/"));
+        assertThat(links[1].toString(), is("http://3g2asl4qw6kufc5m.onion/test.html"));
+    }
+
+    @Test
+    public void shouldExtractAnchorTextAndTextAroundLink() throws MalformedURLException {
+        // given
+        String url = "http://www.example.com";
+        String testPage = HtmlBuilder.newBuilder()
+                .appendToBody("<h1>My First Heading</h1>")
+                .appendToBody("<a href=\"http://example.com/page.html\">My first anchor text.</a>")
+                .appendToBody("<p>my paragraph.</p>")
+                .appendToBody("free text")
+                .build();
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
+        LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighborhood();
+
+        // then
+        assertThat(neighborhoods.length, is(1));
+
+        assertThat(neighborhoods[0].getAroundString().trim(), is("my first heading my paragraph free text"));
+        assertThat(neighborhoods[0].getAround()[0], is("my"));
+        assertThat(neighborhoods[0].getAround()[1], is("first"));
+        assertThat(neighborhoods[0].getAround()[2], is("heading"));
+
+        assertThat(neighborhoods[0].getAnchorString().trim(), is("my first anchor text"));
+        assertThat(neighborhoods[0].getAnchor()[0], is("my"));
+        assertThat(neighborhoods[0].getAnchor()[1], is("first"));
+        assertThat(neighborhoods[0].getAnchor()[2], is("anchor"));
+        assertThat(neighborhoods[0].getAnchor()[3], is("text"));
+    }
+
+    @Test
+    public void shouldNotExtractInvalidLinks() throws MalformedURLException {
+        // given
+        URL url = new URL("http://example.com/test.html");
+        String testPage = new HtmlBuilder()
+                .withBody(
+                        "<h1>My First Heading</h1>"
+                        + "<a href=\"http://example.com/asdf.html\">link 0</a>"
+                        + "<a href=\"mailto:user@example.com\">link 1</a>"
+                        + "<a href=\"javascript:void(0);\">link 2</a>"
+                )
+                .build();
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
+        URL[] links = pageParser.links();
+        LinkNeighborhood[] lns = pageParser.getLinkNeighborhood();
+
+        // then
+        assertThat(links.length, is(1));
+        assertThat(links[0].toString(), is("http://example.com/asdf.html"));
+
+        assertThat(lns.length, is(1));
+        assertThat(lns[0].getLink().toString(), is("http://example.com/asdf.html"));
+    }
+
+    @Test
+    public void shouldNormalizeLinks() throws MalformedURLException {
+        // given
+        URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
+        String testPage = HtmlBuilder.newBuilder()
+                .appendToBody("<h1>My First Heading</h1>")
+                .appendToBody("<a href=\"http://example.com/post.php?\">Link 1.</a>")
+                .appendToBody("<a href=\"http://example.com/post.php?b=2&amp;a=1\">Link 2.</a>")
+                .appendToBody("<a href=\"http://example.com\">Link 3.</a>")
+                .build();
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
+        LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighborhood();
+        URL[] links = pageParser.links();
+
+        // then
+        assertThat(neighborhoods.length, is(3));
+        assertThat(links.length, is(3));
+
+        assertThat(neighborhoods[0].getLink().toString(), is("http://example.com/post.php?"));
+        assertThat(links[0].toString(), is("http://example.com/post.php?"));
+
+        assertThat(neighborhoods[1].getLink().toString(), is("http://example.com/post.php?a=1&b=2"));
+        assertThat(links[1].toString(), is("http://example.com/post.php?a=1&b=2"));
+
+        assertThat(neighborhoods[2].getLink().toString(), is("http://example.com/"));
+        assertThat(links[2].toString(), is("http://example.com/"));
+    }
+
+    private String createTestPage() {
+        return HtmlBuilder.newBuilder()
+                .appendToBody("<h1>My First Heading</h1>")
+                .appendToBody("<p>My first paragraph.</p>")
+                .build();
+    }
+
+    public static class HtmlBuilder {
+
+        private String header = "";
+        private String body = "";
+
+        public static HtmlBuilder newBuilder() {
+            return new HtmlBuilder();
+        }
+
+        public HtmlBuilder appendToBody(String body) {
+            this.body += body;
+            return this;
+        }
+
+        public HtmlBuilder withHeader(String header) {
+            this.header = header;
+            return this;
+        }
+
+        public HtmlBuilder withBody(String body) {
+            this.body = body;
+            return this;
+        }
+
+        public String build() {
+            StringBuilder html = new StringBuilder();
+            html.append("<html>");
+            html.append("<head>");
+            if (header != null && !header.isEmpty()) {
+                html.append(header);
+            }
+            html.append("</head>");
+            html.append("<body>");
+            if (body != null && !body.isEmpty()) {
+                html.append(body);
+            }
+            html.append("</body>");
+            html.append("</html>");
+            return html.toString();
+        }
+    }
+}