Skip to content

Commit

Permalink
feat(chrome): add page bytes collecting
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 4, 2024
1 parent 2d8a8f9 commit 95436e9
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 79 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion examples/chrome_remote.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,15 @@ async fn crawl_website(url: &str) -> Result<()> {
tokio::spawn(async move {
while let Ok(page) = rx2.recv().await {
let _ = stdout
.write_all(format!("- {}\n", page.get_url()).as_bytes())
.write_all(
format!(
"- {} -- Bytes transferred {:?} -- HTML Size {:?}\n",
page.get_url(),
page.bytes_transferred.unwrap_or_default(),
page.get_html_bytes_u8().len()
)
.as_bytes(),
)
.await;
}
});
Expand Down
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.15.0"
version = "2.16.0"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
4 changes: 2 additions & 2 deletions spider/src/features/chrome.rs
Original file line number Diff line number Diff line change
Expand Up @@ -826,5 +826,5 @@ static CHROME_ARGS: [&'static str; 63] = [

/// Fingerprint handling
pub(crate) static FP_JS: &'static str = r#"const toBlob=HTMLCanvasElement.prototype.toBlob,toDataURL=HTMLCanvasElement.prototype.toDataURL,getImageData=CanvasRenderingContext2D.prototype.getImageData,noisify=function(e,t){let o={r:Math.floor(10*Math.random())-5,g:Math.floor(10*Math.random())-5,b:Math.floor(10*Math.random())-5,a:Math.floor(10*Math.random())-5},r=e.width,n=e.height,a=getImageData.apply(t,[0,0,r,n]);for(let i=0;i<n;i++)for(let f=0;f<r;f++){let l=i*(4*r)+4*f;a.data[l+0]=a.data[l+0]+o.r,a.data[l+1]=a.data[l+1]+o.g,a.data[l+2]=a.data[l+2]+o.b,a.data[l+3]=a.data[l+3]+o.a}t.putImageData(a,0,0)};Object.defineProperty(HTMLCanvasElement.prototype,"toBlob",{value:function(){return noisify(this,this.getContext("2d")),toBlob.apply(this,arguments)}}),Object.defineProperty(HTMLCanvasElement.prototype,"toDataURL",{value:function(){return noisify(this,this.getContext("2d")),toDataURL.apply(this,arguments)}}),Object.defineProperty(CanvasRenderingContext2D.prototype,"getImageData",{value:function(){return noisify(this.canvas,this),getImageData.apply(this,arguments)}});const config={random:{value:function(){return Math.random()},item:function(e){let t=e.length*config.random.value();return e[Math.floor(t)]},array:function(e){let t=config.random.item(e);return new Int32Array([t,t])},items:function(e,t){let o=e.length,r=Array(t),n=Array(o);for(t>o&&(t=o);t--;){let a=Math.floor(config.random.value()*o);r[t]=e[a in n?n[a]:a],n[a]=--o in n?n[o]:o}return r}},spoof:{webgl:{buffer:function(e){let t=e.prototype.bufferData;Object.defineProperty(e.prototype,"bufferData",{value:function(){let e=Math.floor(10*config.random.value()),o=.1*config.random.value()*arguments[1][e];return arguments[1][e]=arguments[1][e]+o,t.apply(this,arguments)}})},parameter:function(e){e.prototype.getParameter,Object.defineProperty(e.prototype,"getParameter",{value:function(){let e=new Float32Array([1,8192]);if(3415===arguments[0])return 0;if(3414===arguments[0])return 24;if(35661===arguments[0])return 
config.random.items([128,192,256]);if(3386===arguments[0])return config.random.array([8192,16384,32768]);if(36349===arguments[0]||36347===arguments[0])return config.random.item([4096,8192]);else if(34047===arguments[0]||34921===arguments[0])return config.random.items([2,4,8,16]);else if(7937===arguments[0]||33901===arguments[0]||33902===arguments[0])return e;else if(34930===arguments[0]||36348===arguments[0]||35660===arguments[0])return config.random.item([16,32,64]);else if(34076===arguments[0]||34024===arguments[0]||3379===arguments[0])return config.random.item([16384,32768]);else if(3413===arguments[0]||3412===arguments[0]||3411===arguments[0]||3410===arguments[0]||34852===arguments[0])return config.random.item([2,4,8,16]);else return config.random.item([0,2,4,8,16,32,64,128,256,512,1024,2048,4096,])}})}}}};config.spoof.webgl.buffer(WebGLRenderingContext),config.spoof.webgl.buffer(WebGL2RenderingContext),config.spoof.webgl.parameter(WebGLRenderingContext),config.spoof.webgl.parameter(WebGL2RenderingContext);const rand={noise:function(){return Math.floor(Math.random()+(Math.random()<Math.random()?-1:1)*Math.random())},sign:function(){let e=[-1,-1,-1,-1,-1,-1,1,-1,-1,-1],t=Math.floor(Math.random()*e.length);return e[t]}};Object.defineProperty(HTMLElement.prototype,"offsetHeight",{get(){let e=Math.floor(this.getBoundingClientRect().height),t=e&&1===rand.sign(),o=t?e+rand.noise():e;return o}}),Object.defineProperty(HTMLElement.prototype,"offsetWidth",{get(){let e=Math.floor(this.getBoundingClientRect().width),t=e&&1===rand.sign(),o=t?e+rand.noise():e;return o}});const context={BUFFER:null,getChannelData:function(e){let t=e.prototype.getChannelData;Object.defineProperty(e.prototype,"getChannelData",{value:function(){let e=t.apply(this,arguments);if(context.BUFFER!==e){context.BUFFER=e;for(let o=0;o<e.length;o+=100){let r=Math.floor(Math.random()*o);e[r]=e[r]+1e-7*Math.random()}}return e}})},createAnalyser:function(e){let 
t=e.prototype.__proto__.createAnalyser;Object.defineProperty(e.prototype.__proto__,"createAnalyser",{value:function(){let e=t.apply(this,arguments),o=e.__proto__.getFloatFrequencyData;return Object.defineProperty(e.__proto__,"getFloatFrequencyData",{value:function(){let e=o.apply(this,arguments);for(let t=0;t<arguments[0].length;t+=100){let r=Math.floor(Math.random()*t);arguments[0][r]=arguments[0][r]+.1*Math.random()}return e}}),e}})}};context.getChannelData(AudioBuffer),context.createAnalyser(AudioContext),context.getChannelData(OfflineAudioContext),context.createAnalyser(OfflineAudioContext),navigator.mediaDevices.getUserMedia=navigator.webkitGetUserMedia=navigator.mozGetUserMedia=navigator.getUserMedia=webkitRTCPeerConnection=RTCPeerConnection=MediaStreamTrack=void 0;const getParameter=WebGLRenderingContext.prototype.getParameter;WebGLRenderingContext.prototype.getParameter=function(e){return 37445===e?"Intel Open Source Technology Center":37446===e?"Mesa DRI Intel(R) Ivybridge Mobile ":getParameter.call(this,e)};const newProto=navigator.__proto__;delete newProto.webdriver,navigator.__proto__=newProto;"#;
/// Handle extracting links from anchors that are not found.
pub(crate) static ANCHOR_EVENTS: &'static str = r###"() => new Promise((resolve) => { const _pageRoutes = new Set(), _originalPushState = window.history.pushState; window.history.pushState = function(_state, _title, _url) { _pageRoutes.add(_url) }; function _onRouteChange() { _pageRoutes.add(window.location.href) } document.querySelectorAll("a:not([href])").forEach(_anchor => { _anchor.click() }); window.addEventListener("popstate", _onRouteChange); return resolve(Array.from(_pageRoutes)); } )"###;
// /// Handle extracting links from anchors that are not found.
// pub(crate) static ANCHOR_EVENTS: &'static str = r###"() => new Promise((resolve) => { const _pageRoutes = new Set(), _originalPushState = window.history.pushState; window.history.pushState = function(_state, _title, _url) { _pageRoutes.add(_url) }; function _onRouteChange() { _pageRoutes.add(window.location.href) } document.querySelectorAll("a:not([href])").forEach(_anchor => { _anchor.click() }); window.addEventListener("popstate", _onRouteChange); return resolve(Array.from(_pageRoutes)); } )"###;
3 changes: 3 additions & 0 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@ pub struct Page {
pub should_retry: bool,
/// A WAF was found on the page.
pub waf_check: bool,
/// The total bytes transferred for the page. Mainly used for chrome events; when using http, inspect the content to get its size in bytes instead.
pub bytes_transferred: Option<f64>,
}

/// Represent a page visited.
Expand Down Expand Up @@ -441,6 +443,7 @@ pub fn build(url: &str, res: PageResponse) -> Page {
page_links: None,
should_retry,
waf_check: res.waf_check,
bytes_transferred: res.bytes_transferred,
}
}

Expand Down
133 changes: 71 additions & 62 deletions spider/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,26 +142,23 @@ async fn cf_handle(
wait_for.page_navigations = true;
page_wait(&page, &Some(wait_for.clone())).await;

match page.content_bytes().await {
Ok(next_content) => {
let next_content = if next_content.ends_with(cf)
|| next_content.ends_with(cf2)
|| next_content.starts_with(cn) && next_content.ends_with(cnf)
{
wait_for.delay =
WaitForDelay::new(Some(core::time::Duration::from_secs(4))).into();
page_wait(&page, &Some(wait_for)).await;
match page.content_bytes().await {
Ok(nc) => nc,
_ => next_content,
}
} else {
next_content
};
if let Ok(next_content) = page.content_bytes().await {
let next_content = if next_content.ends_with(cf)
|| next_content.ends_with(cf2)
|| next_content.starts_with(cn) && next_content.ends_with(cnf)
{
wait_for.delay =
WaitForDelay::new(Some(core::time::Duration::from_secs(4))).into();
page_wait(&page, &Some(wait_for)).await;
match page.content_bytes().await {
Ok(nc) => nc,
_ => next_content,
}
} else {
next_content
};

*b = next_content;
}
_ => (),
*b = next_content;
}
})
.await;
Expand Down Expand Up @@ -212,6 +209,8 @@ pub struct PageResponse {
pub extra_ai_data: Option<Vec<crate::page::AIResults>>,
/// A WAF was found on the page.
pub waf_check: bool,
/// The total bytes transferred for the page. Mainly used for chrome events; when using http, inspect the content to get its size in bytes instead.
pub bytes_transferred: Option<f64>,
}

/// wait for event with timeout
Expand Down Expand Up @@ -502,57 +501,45 @@ pub async fn perform_chrome_http_request(
Ok(page_base) => {
match page_base {
Some(http_request) => {
match http_request.method.as_deref() {
Some(http_method) => {
method = http_method.into();
}
_ => (),
if let Some(http_method) = http_request.method.as_deref() {
method = http_method.into();
}

request_headers.clone_from(&http_request.headers);

match http_request.response {
Some(ref response) => {
match response.protocol {
Some(ref p) => {
protocol.clone_from(p);
}
_ => (),
}
if let Some(ref response) = http_request.response {
if let Some(ref p) = response.protocol {
protocol.clone_from(p);
}

match response.headers.inner().as_object() {
Some(res_headers) => {
for (k, v) in res_headers {
response_headers.insert(k.to_string(), v.to_string());
}
}
_ => (),
if let Some(res_headers) = response.headers.inner().as_object() {
for (k, v) in res_headers {
response_headers.insert(k.to_string(), v.to_string());
}
}

if !response.url.starts_with(source) {
waf_check = match response.security_details {
Some(ref security_details) => {
if security_details.subject_name
== "challenges.cloudflare.com"
{
true
} else {
false
}
}
_ => response.url.contains("/cdn-cgi/challenge-platform"),
};
if !waf_check {
waf_check = match response.protocol {
Some(ref protocol) => protocol == "blob",
_ => false,
if !response.url.starts_with(source) {
waf_check = match response.security_details {
Some(ref security_details) => {
if security_details.subject_name == "challenges.cloudflare.com"
{
true
} else {
false
}
}
_ => response.url.contains("/cdn-cgi/challenge-platform"),
};
if !waf_check {
waf_check = match response.protocol {
Some(ref protocol) => protocol == "blob",
_ => false,
}
}

status_code = StatusCode::from_u16(response.status as u16)
.unwrap_or_else(|_| StatusCode::EXPECTATION_FAILED);
}
_ => (),

status_code = StatusCode::from_u16(response.status as u16)
.unwrap_or_else(|_| StatusCode::EXPECTATION_FAILED);
}
}
_ => {
Expand Down Expand Up @@ -1010,9 +997,25 @@ pub async fn fetch_page_html_chrome_base(
request_timeout: &Option<Box<std::time::Duration>>,
) -> Result<PageResponse, chromiumoxide::error::CdpError> {
use std::ops::Div;

let mut chrome_http_req_res = ChromeHTTPReqRes::default();

let listener = page
.event_listener::<chromiumoxide::cdp::browser_protocol::network::EventLoadingFinished>()
.await;

// Listen for network loading-finished events and sum their encoded data length. TODO: capture the end time of the last event to track the transfer period.
let bytes_collected_handle = tokio::spawn(async move {
let mut total = 0.0;

if let Ok(mut listener) = listener {
while let Some(event) = listener.next().await {
total += event.encoded_data_length;
}
}

total
});

let page_navigation = async {
if !page_set {
// used for smart mode re-rendering direct assigning html
Expand Down Expand Up @@ -1061,7 +1064,9 @@ pub async fn fetch_page_html_chrome_base(
None
};

let page_response = if timeout_error.is_none() && chrome_http_req_res.status_code.is_success() {
let mut page_response = if timeout_error.is_none()
&& chrome_http_req_res.status_code.is_success()
{
// we do not need to wait for navigation if content is assigned. The method set_content already handles this.
let final_url = if wait_for_navigation {
let last_redirect = tokio::time::timeout(tokio::time::Duration::from_secs(15), async {
Expand Down Expand Up @@ -1218,6 +1223,10 @@ pub async fn fetch_page_html_chrome_base(
page.execute(chromiumoxide::cdp::browser_protocol::page::CloseParams::default()),
)
.await;
// we want to use a sync impl to get bytes when storing the page.
if let Ok(transferred) = bytes_collected_handle.await {
page_response.bytes_transferred = Some(transferred);
}
}

Ok(page_response)
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.15.0"
version = "2.16.0"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
Loading

0 comments on commit 95436e9

Please sign in to comment.