Skip to content

Commit

Permalink
feat(chrome): add with_wait_for_idle_network builder config
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Jan 20, 2024
1 parent 6dde7be commit 5314135
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 3 deletions.
6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.16"
version = "0.0.17"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"

Expand All @@ -11,8 +11,8 @@ crate-type = ["cdylib"]
[dependencies]
indexmap = "2.1.0"
num_cpus = "1.16.0"
spider = { version = "1.80.68", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache" ] }
pyo3 = { version = "0.20.0", features = ["extension-module"] }
spider = { version = "1.80.75", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache" ] }
pyo3 = { version = "0.20.2", features = ["extension-module"] }
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }

[target.x86_64-unknown-linux-gnu.dependencies]
Expand Down
16 changes: 16 additions & 0 deletions book/src/website.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,22 @@ async def main():
asyncio.run(main())
```
### Wait For Idle Network
You can wait for the network to become idle when using Chrome. This helps ensure that all data fetched by client-side scripts has loaded.
The first parameter enables or disables waiting, and the second is the maximum time to wait in milliseconds (use 0 to wait without a timeout).
```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_wait_for_idle_network(True, 12000)

asyncio.run(main())
```
### Respect Robots
Respect the robots.txt file.
Expand Down
22 changes: 22 additions & 0 deletions src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use crate::{new_page, NPage, BUFFER};
use indexmap::IndexMap;
use pyo3::prelude::*;
use spider::compact_str::CompactString;
use spider::configuration::WaitForIdleNetwork;
use spider::tokio::select;
use spider::tokio::task::JoinHandle;
use spider::utils::shutdown;
Expand Down Expand Up @@ -781,6 +782,27 @@ impl Website {
slf
}

/// Configure whether to wait for the network to become idle. This method does nothing if the
/// `chrome` feature is not enabled.
///
/// * `wait_for_idle_network` - when `false`, the setting is cleared (`None` is passed on).
/// * `timeout` - maximum wait in milliseconds; `0` means no timeout is set.
pub fn with_wait_for_idle_network(
    mut slf: PyRefMut<'_, Self>,
    wait_for_idle_network: bool,
    timeout: u64,
) -> PyRefMut<'_, Self> {
    // Build the optional configuration up front so the builder call stays flat.
    let idle_config = wait_for_idle_network.then(|| {
        // A zero timeout maps to `None`, i.e. no upper bound is handed to the config.
        let max_wait = (timeout != 0).then(|| core::time::Duration::from_millis(timeout));
        WaitForIdleNetwork::new(max_wait)
    });

    slf.inner.with_wait_for_idle_network(idle_config);
    slf
}

/// Setup cron jobs to run
pub fn with_cron(
mut slf: PyRefMut<'_, Self>,
Expand Down

0 comments on commit 5314135

Please sign in to comment.