-
-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a fetcher that uses a real Chrome browser to download the html.
- Loading branch information
1 parent
382f21b
commit 475065d
Showing
10 changed files
with
253 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
// Build script for the chrome-fetcher module: applies the shared buildSrc
// conventions and declares the module's dependencies.
plugins {
    buildsrc.convention.`kotlin-jvm`
    buildsrc.convention.`publish-jvm`
    buildsrc.convention.kover
}

dependencies {
    // Fetcher contract shared by all fetcher modules.
    api(projects.fetcher.baseFetcher)

    api(Deps.htmlUnit) {
        // avoid android crash; see #93
        exclude(group = "org.eclipse.jetty.websocket")
    }

    // Logging stack.
    api(Deps.logback)
    api(Deps.log4jOverSlf4j)

    // CDT client (com.github.kklisura.cdt.*) used to launch and drive Chrome.
    // NOTE(review): the only coordinate not routed through `Deps` — consider moving it there.
    api("io.fluidsonic.mirror:cdt-java-client:4.0.0-fluidsonic-1")

    testImplementation(projects.testUtils)
}
3 changes: 3 additions & 0 deletions
3
fetcher/chrome-fetcher/src/main/kotlin/it/skrape/fetcher/ChromeException.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
package it.skrape.fetcher | ||
|
||
/**
 * Signals that fetching a page through Chrome failed.
 *
 * @param msg description of the failure; exposed as the exception's message
 */
public class ChromeException(
    msg: String
) : Exception(msg)
60 changes: 60 additions & 0 deletions
60
fetcher/chrome-fetcher/src/main/kotlin/it/skrape/fetcher/ChromeFetcher.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
package it.skrape.fetcher | ||
|
||
import com.github.kklisura.cdt.launch.ChromeArguments | ||
import com.github.kklisura.cdt.launch.ChromeLauncher | ||
import com.github.kklisura.cdt.protocol.events.network.LoadingFinished | ||
import com.github.kklisura.cdt.protocol.support.types.EventHandler | ||
import com.github.kklisura.cdt.services.ChromeService | ||
import org.htmlunit.org.apache.http.HttpStatus | ||
|
||
/**
 * [BlockingFetcher] that loads the requested URL in a Chrome instance started via
 * [ChromeLauncher] and returns the DOM as it looks once the page's load event has fired,
 * i.e. after scripts have had a chance to modify it.
 *
 * Only [Request.url] is read from the request. Status, content type, headers and cookies
 * of the returned [Result] are fixed placeholder values, not data taken from the actual
 * HTTP response.
 */
public object ChromeFetcher : BlockingFetcher<Request> {
    override val requestBuilder: Request get() = Request()

    /**
     * Launches Chrome, navigates to `request.url` and returns the serialized document.
     *
     * A new Chrome instance is launched for every call and shut down again before this
     * method returns, whether it succeeds or throws.
     *
     * @param request the request whose `url` is opened in the browser
     * @return a [Result] whose `responseBody` is `document.documentElement.outerHTML`
     * @throws ChromeException if the load event handler did not produce a result
     */
    override fun fetch(request: Request): Result {
        // NOTE(review): `defaults(true)` presumably selects headless mode — confirm against cdt-java-client.
        val chromeArgs = ChromeArguments.defaults(true)
            .additionalArguments("no-sandbox", true)
            .additionalArguments("remote-allow-origins", "*")
            .build()

        // Start Chrome
        val launcher = ChromeLauncher()
        try {
            return renderPage(launcher.launch(chromeArgs), request)
        } finally {
            // Without this every fetch would leave its Chrome instance running.
            launcher.close()
        }
    }

    /**
     * Opens a tab on [chromeService], loads `request.url` in it and captures the document's
     * outer HTML once the load event fires. The tab and its DevTools session are always
     * released before returning.
     */
    private fun renderPage(chromeService: ChromeService, request: Request): Result {
        val tab = chromeService.createTab()
        try {
            val devToolsService = chromeService.createDevToolsService(tab)
            try {
                val page = devToolsService.page
                val runtime = devToolsService.runtime

                var result: Result? = null

                // Wait for on load event
                page.onLoadEventFired { _ ->
                    try {
                        val evaluation = runtime.evaluate("document.documentElement.outerHTML")

                        result = Result(
                            responseBody = evaluation.result.value.toString(),
                            responseStatus = Result.Status(HttpStatus.SC_OK, ""),
                            contentType = "",
                            headers = emptyMap(),
                            baseUri = request.url,
                            cookies = emptyList()
                        )
                    } finally {
                        // Closing the session is what releases waitUntilClosed() below, so it
                        // must happen even if the evaluation fails; otherwise fetch would hang.
                        devToolsService.close()
                    }
                }

                page.enable()

                // Navigate to the page in question
                page.navigate(request.url)

                // NOTE(review): blocks indefinitely if the load event never fires; consider
                // bounding this wait with the request's timeout.
                devToolsService.waitUntilClosed()

                return result ?: throw ChromeException("No result found")
            } finally {
                // Normally the load handler has already closed the session; this covers the
                // paths where setup or navigation threw before the handler ran.
                if (!devToolsService.isClosed) {
                    devToolsService.close()
                }
            }
        } finally {
            chromeService.closeTab(tab)
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
<configuration debug="false"> | ||
|
||
<appender name="console" class="ch.qos.logback.core.ConsoleAppender"> | ||
<withJansi>false</withJansi> | ||
|
||
<encoder> | ||
<pattern>%highlight(%.-1level) %date{HH:mm:ss.SSS} [%30.30logger] %msg%n</pattern> | ||
</encoder> | ||
</appender> | ||
|
||
<!-- turning down htmlunit logging --> | ||
<logger name="com.gargoylesoftware.htmlunit" level="OFF"/> | ||
<logger name="org.apache.http" level="ERROR"/> | ||
|
||
<root level="INFO"> | ||
<appender-ref ref="console"/> | ||
</root> | ||
</configuration> |
89 changes: 89 additions & 0 deletions
89
fetcher/chrome-fetcher/src/test/kotlin/it/skrape/fetcher/ChromeFetcherTest.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
package it.skrape.fetcher | ||
|
||
import Testcontainer | ||
import com.gargoylesoftware.htmlunit.util.NameValuePair | ||
import org.junit.jupiter.api.Test | ||
import org.junit.jupiter.api.condition.DisabledOnOs | ||
import org.junit.jupiter.api.condition.OS | ||
import org.junit.jupiter.api.parallel.Execution | ||
import org.junit.jupiter.api.parallel.ExecutionMode | ||
import setupCookiesStub | ||
import setupPostStub | ||
import setupRedirect | ||
import setupStub | ||
import strikt.api.expect | ||
import strikt.api.expectThat | ||
import strikt.api.expectThrows | ||
import strikt.assertions.contains | ||
import strikt.assertions.isEqualTo | ||
import java.net.SocketTimeoutException | ||
|
||
private val wiremock = Testcontainer.wiremock

// NOTE(review): not referenced by any test in this file; kept because its initialisation
// side effects (if any) are not visible from here.
private val httpBin = Testcontainer.httpBin

/**
 * Integration tests for [ChromeFetcher] against the WireMock test container.
 *
 * All requests go to WireMock's plain-HTTP endpoint. Tests run on a single thread and the
 * class is disabled on Windows.
 */
@Execution(ExecutionMode.SAME_THREAD)
@DisabledOnOs(OS.WINDOWS)
class ChromeFetcherTest {

    private val baseRequest by lazy { Request(url = wiremock.httpUrl) }

    @Test
    fun `will fetch localhost 8080 with defaults if no params`() {
        wiremock.setupStub()

        val fetched = ChromeFetcher.fetch(baseRequest)

        expect {
            that(fetched.status { code }).isEqualTo(200)
            that(fetched.responseBody).contains("i'm the title")
        }
    }

    @Test
    fun `can fetch url and use HTTP verb GET by default`() {
        wiremock.setupStub(path = "/example")

        val request = baseRequest.copy(
            url = "${wiremock.httpUrl}/example",
            sslRelaxed = true
        )

        val fetched = ChromeFetcher.fetch(request)

        expect {
            that(fetched.status { code }).isEqualTo(200)
            that(fetched.responseBody).contains("i'm the title")
        }
    }

    @Test
    fun `can parse js rendered elements`() {
        wiremock.setupStub(fileName = "js.html")

        val fetched = ChromeFetcher.fetch(baseRequest)

        expectThat(fetched.responseBody).contains("I have been dynamically added via Javascript")
    }

    // Previously named "... from https page", but the request targets the plain-HTTP URL;
    // the only difference from the test above is the sslRelaxed flag.
    @Test
    fun `can parse js rendered elements when sslRelaxed is set`() {
        wiremock.setupStub(fileName = "js.html")
        val request = baseRequest.copy(sslRelaxed = true)

        val fetched = ChromeFetcher.fetch(request)

        expectThat(fetched.responseBody).contains("I have been dynamically added via Javascript")
    }

    // Previously named "... from https page", but it uses the plain-HTTP base request.
    @Test
    fun `can parse es6 rendered elements`() {
        wiremock.setupStub(fileName = "es6.html")

        val fetched = ChromeFetcher.fetch(baseRequest)
        expectThat(fetched.responseBody).contains("dynamically added")
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"data":"some value"} |
16 changes: 16 additions & 0 deletions
16
fetcher/chrome-fetcher/src/test/resources/__files/es6.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<title>i'm the title</title> | ||
</head> | ||
<body> | ||
i'm the body | ||
<h1>i'm the headline</h1> | ||
<p>i'm a paragraph</p> | ||
<p>i'm a second paragraph</p> | ||
</body> | ||
<script> | ||
const getNodesOf = (selector) => document.querySelectorAll(selector); | ||
getNodesOf("p").forEach(p => p.innerHTML = "<span>dynamically added</span>") | ||
</script> | ||
</html> |
28 changes: 28 additions & 0 deletions
28
fetcher/chrome-fetcher/src/test/resources/__files/example.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<title>i'm the title</title> | ||
</head> | ||
<body> | ||
i'm the body | ||
<h1>i'm the headline</h1> | ||
<header> | ||
<h1>i'm the headers headline</h1> | ||
</header> | ||
<p class="foo bar fizz buzz" data-foo="bar">i'm a paragraph</p> | ||
<p>i'm a second paragraph</p> | ||
<div> | ||
first div | ||
<div>first divs child div</div> | ||
</div> | ||
<div> | ||
second div | ||
<div>second divs child div</div> | ||
</div> | ||
<div class="foo bar fizz buzz">div with class foo</div> | ||
<a-custom-tag>i'm a custom html5 tag</a-custom-tag> | ||
<a href="http://some.url">first link</a> | ||
<a href="http://some-other.url">second link</a> | ||
<a href="/relative-link">relative link</a> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<title>i'm the title</title> | ||
</head> | ||
<body> | ||
i'm the body | ||
<h1>i'm the headline</h1> | ||
<p>i'm a paragraph</p> | ||
<p>i'm a second paragraph</p> | ||
</body> | ||
<script> | ||
var dynamicallyAddedElement = document.createElement("div"); | ||
dynamicallyAddedElement.className = "dynamic"; | ||
var textNode = document.createTextNode("I have been dynamically added via Javascript"); | ||
dynamicallyAddedElement.appendChild(textNode); | ||
document.body.appendChild(dynamicallyAddedElement); | ||
</script> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters