Skip to content

Commit

Permalink
chore(website): expose chrome connection url
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Sep 24, 2024
1 parent 30e963d commit 7a9fb2d
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.52"
version = "0.0.53"
repository = "https://github.com/spider-rs/spider-py"
license = "MIT"
description = "The fastest web crawler and indexer."
Expand Down
28 changes: 28 additions & 0 deletions book/src/website.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,20 @@ async def main():
asyncio.run(main())
```

### Chrome Remote Connection

Set the Chrome remote connection URL. This can be either a JSON endpoint or a direct WebSocket (`ws`) connection.

```py
import asyncio
from spider_rs import Website

async def main():
    # Configure the website with the URL of a remote Chrome instance.
    website = Website("https://choosealicense.com").with_chrome_connection("http://localhost:9222/json/version")

asyncio.run(main())
```

### External Domains

Add external domains to include with the website.
Expand Down Expand Up @@ -338,6 +352,20 @@ async def main():
asyncio.run(main())
```
### Preserve Host
Preserve the HOST HTTP header.
```py
import asyncio
from spider_rs import Website

async def main():
    # Configure the website to preserve the HOST HTTP header.
    website = Website("https://choosealicense.com").with_preserve_host_header(True)

asyncio.run(main())
```
## Chaining
You can chain all of the configs together for simple configuration.
Expand Down
1 change: 1 addition & 0 deletions src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub struct Page {
selectors: Option<(
CompactString,
spider::smallvec::SmallVec<[CompactString; 2]>,
CompactString
)>,
/// the url for the page
pub url: String,
Expand Down
28 changes: 26 additions & 2 deletions src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -578,10 +578,12 @@ impl Website {
/// Collect the website's links as owned `String`s, drain them from the inner website, and return the collected list.
// NOTE(review): this span looks like a rendered diff hunk — the `.drain_links()` and `.get_links()` lines
// below may be the removed/added sides of one change rather than a single call chain. Confirm against the real file.
pub fn drain_links(&mut self) -> Vec<String> {
let links = self
.inner
.drain_links()
.get_links()
.iter()
// convert every link into an owned `String` for the returned `Vec`
.map(|x| x.as_ref().to_string())
.collect::<Vec<String>>();

// drain for now until clear method exposure.
self.inner.drain_links();
links
}

Expand Down Expand Up @@ -729,6 +731,28 @@ impl Website {
slf
}

/// Set the connection url used to reach the chrome instance. An empty string is forwarded as `None`.
/// This method does nothing if the `chrome` feature is not enabled.
pub fn with_chrome_connection(
    mut slf: PyRefMut<'_, Self>,
    chome_connection: String,
) -> PyRefMut<'_, Self> {
    // An empty url means "no connection url": hand `None` to the inner builder.
    let connection_url = (!chome_connection.is_empty()).then_some(chome_connection);
    slf.inner.with_chrome_connection(connection_url);
    slf
}

/// Toggle preservation of the HOST header by forwarding `preserve` to the inner website,
/// then hand the same instance back so calls can be chained.
pub fn with_preserve_host_header(
    mut slf: PyRefMut<'_, Self>,
    preserve: bool,
) -> PyRefMut<'_, Self> {
    slf.inner.with_preserve_host_header(preserve);
    slf
}

/// Wait for a delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled.
pub fn with_wait_for_delay(
mut slf: PyRefMut<'_, Self>,
Expand Down

0 comments on commit 7a9fb2d

Please sign in to comment.