diff --git a/Cargo.toml b/Cargo.toml index ab4e7c7..a510bcc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2021" name = "spider_rs" -version = "0.0.52" +version = "0.0.53" repository = "https://github.com/spider-rs/spider-py" license = "MIT" description = "The fastest web crawler and indexer." diff --git a/book/src/website.md b/book/src/website.md index 9878600..c5b9185 100644 --- a/book/src/website.md +++ b/book/src/website.md @@ -136,6 +136,20 @@ async def main(): asyncio.run(main()) ``` +### Chrome Remote Connection + +Add a Chrome remote connection URL. This can be a JSON endpoint or a direct WebSocket (ws) connection. + +```py +import asyncio +from spider_rs import Website + +async def main(): + website = Website("https://choosealicense.com").with_chrome_connection("http://localhost:9222/json/version") + +asyncio.run(main()) +``` + ### External Domains Add external domains to include with the website. @@ -338,6 +352,20 @@ async def main(): asyncio.run(main()) ``` +### Preserve Host + +Preserve the HOST HTTP header. + +```py +import asyncio +from spider_rs import Website + +async def main(): + website = Website("https://choosealicense.com").with_preserve_host_header(True) + +asyncio.run(main()) +``` + ## Chaining You can chain all of the configs together for simple configuration. diff --git a/src/page.rs b/src/page.rs index 3101049..ded45af 100644 --- a/src/page.rs +++ b/src/page.rs @@ -12,6 +12,7 @@ pub struct Page { selectors: Option<( CompactString, spider::smallvec::SmallVec<[CompactString; 2]>, + CompactString )>, /// the url for the page pub url: String, diff --git a/src/website.rs b/src/website.rs index fbcf9c2..ae0fd42 100644 --- a/src/website.rs +++ b/src/website.rs @@ -578,10 +578,12 @@ impl Website { pub fn drain_links(&mut self) -> Vec { let links = self .inner - .drain_links() + .get_links() + .iter() .map(|x| x.as_ref().to_string()) .collect::>(); - + // drain for now until clear method exposure. 
+ self.inner.drain_links(); links } @@ -729,6 +731,28 @@ impl Website { slf } + /// Set the connection url for the chrome instance. An empty string is forwarded as `None`. This method does nothing if the `chrome` feature is not enabled. + pub fn with_chrome_connection( + mut slf: PyRefMut<'_, Self>, + chome_connection: String, + ) -> PyRefMut<'_, Self> { + slf + .inner + .with_chrome_connection(if chome_connection.is_empty() { None } else { Some (chome_connection)}); + slf + } + + /// Preserve the HOST header. Forwards `preserve` to the inner website configuration and returns `slf` for chaining. + pub fn with_preserve_host_header( + mut slf: PyRefMut<'_, Self>, + preserve: bool, + ) -> PyRefMut<'_, Self> { + slf + .inner + .with_preserve_host_header(preserve); + slf + } + /// Wait for a delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled. pub fn with_wait_for_delay( mut slf: PyRefMut<'_, Self>,