chore(page): add http cookies map return
j-mendez committed Aug 20, 2024
1 parent eb73f70 commit 66af308
Showing 8 changed files with 202 additions and 103 deletions.
173 changes: 104 additions & 69 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.0.4"
version = "2.0.6"
authors = [
"j-mendez <[email protected]>"
]
24 changes: 12 additions & 12 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "2.0.4"
spider = "2.0.6"
```

And then the code:
@@ -93,7 +93,7 @@ We have the following optional feature flags.

```toml
[dependencies]
spider = { version = "2.0.4", features = ["regex", "ua_generator"] }
spider = { version = "2.0.6", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -139,7 +139,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "2.0.4", features = ["decentralized"] }
spider = { version = "2.0.6", features = ["decentralized"] }
```

```sh
@@ -170,7 +170,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "2.0.4", features = ["sync"] }
spider = { version = "2.0.6", features = ["sync"] }
```

```rust,no_run
@@ -201,7 +201,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "2.0.4", features = ["regex"] }
spider = { version = "2.0.6", features = ["regex"] }
```

```rust,no_run
@@ -228,7 +228,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "2.0.4", features = ["control"] }
spider = { version = "2.0.6", features = ["control"] }
```

```rust
@@ -298,7 +298,7 @@ Use cron jobs to run crawls continuously at anytime.

```toml
[dependencies]
spider = { version = "2.0.4", features = ["sync", "cron"] }
spider = { version = "2.0.6", features = ["sync", "cron"] }
```

```rust,no_run
@@ -337,7 +337,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network

```toml
[dependencies]
spider = { version = "2.0.4", features = ["chrome", "chrome_intercept"] }
spider = { version = "2.0.6", features = ["chrome", "chrome_intercept"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
@@ -367,7 +367,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`]

```toml
[dependencies]
spider = { version = "2.0.4", features = ["cache"] }
spider = { version = "2.0.6", features = ["cache"] }
```

You need to set `website.cache` to true to enable as well.
@@ -398,7 +398,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be

```toml
[dependencies]
spider = { version = "2.0.4", features = ["smart"] }
spider = { version = "2.0.6", features = ["smart"] }
```

```rust,no_run
@@ -424,7 +424,7 @@ Use OpenAI to generate dynamic scripts to drive the browser done with the featur

```toml
[dependencies]
spider = { version = "2.0.4", features = ["openai"] }
spider = { version = "2.0.6", features = ["openai"] }
```

```rust
@@ -450,7 +450,7 @@ Set a depth limit to prevent forwarding.

```toml
[dependencies]
spider = { version = "2.0.4" }
spider = { version = "2.0.6" }
```

```rust,no_run
19 changes: 13 additions & 6 deletions spider/src/page.rs
@@ -20,9 +20,6 @@ use tokio::time::Instant;

#[cfg(all(feature = "decentralized", feature = "headers"))]
use crate::utils::FetchPageResult;
#[cfg(feature = "headers")]
use reqwest::header::HeaderMap;

use tokio_stream::StreamExt;
use url::Url;

@@ -106,7 +103,10 @@ pub struct Page {
url: String,
#[cfg(feature = "headers")]
/// The headers of the page request response.
pub headers: Option<HeaderMap>,
pub headers: Option<reqwest::header::HeaderMap>,
#[cfg(feature = "cookies")]
/// The cookies of the page request response.
pub cookies: Option<reqwest::header::HeaderMap>,
/// The status code of the page request.
pub status_code: StatusCode,
/// The error of the request if any.
@@ -140,7 +140,10 @@ pub struct Page {
html: Option<Bytes>,
#[cfg(feature = "headers")]
/// The headers of the page request response.
pub headers: Option<HeaderMap>,
pub headers: Option<reqwest::header::HeaderMap>,
#[cfg(feature = "cookies")]
/// The cookies of the page request response.
pub cookies: Option<reqwest::header::HeaderMap>,
/// The status code of the page request.
pub status_code: StatusCode,
/// The error of the request if any.
@@ -259,6 +262,8 @@ pub fn build(url: &str, res: PageResponse) -> Page {
},
#[cfg(feature = "headers")]
headers: res.headers,
#[cfg(feature = "cookies")]
cookies: res.cookies,
base: match Url::parse(url) {
Ok(u) => Some(u),
_ => None,
@@ -298,6 +303,8 @@ pub fn build(_: &str, res: PageResponse) -> Page {
},
#[cfg(feature = "headers")]
headers: res.headers,
#[cfg(feature = "cookies")]
cookies: res.cookies,
final_redirect_destination: res.final_url,
status_code: res.status_code,
error_status: match res.error_for_status {
@@ -1407,7 +1414,7 @@ pub fn get_html_encoded(html: &Option<Bytes>, label: &str) -> String {
}

#[cfg(test)]
#[cfg(all(not(feature = "decentralized"), not(feature = "cache")))]
#[cfg(all(not(feature = "decentralized")))]
pub const TEST_AGENT_NAME: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"));

#[cfg(all(
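With this change `Page` exposes a `cookies` map alongside `headers`. A minimal sketch (not part of the diff) of reading it from a subscriber, assuming the `sync` and `cookies` feature flags are enabled and using the subscribe API shown in the README above:

```rust
// Sketch only: assumes spider's `sync` + `cookies` features and the
// subscribe API from the README diff above.
extern crate spider;

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            // `cookies` is populated from the response's Set-Cookie headers.
            if let Some(cookies) = &page.cookies {
                for (name, value) in cookies.iter() {
                    println!("{}: {}={}", page.get_url(), name, value.to_str().unwrap_or_default());
                }
            }
        }
    });

    website.crawl().await;
}
```

The field is `None` when the response set no cookies; without the `cookies` flag the field is compiled out entirely.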
75 changes: 66 additions & 9 deletions spider/src/utils/mod.rs
@@ -3,6 +3,8 @@ pub mod header_utils;
/// A trie struct.
pub mod trie;

use std::str::FromStr;

#[cfg(feature = "chrome")]
use crate::features::chrome_common::{AutomationScripts, ExecutionScripts};
use crate::tokio_stream::StreamExt;
@@ -11,9 +13,11 @@ use crate::Client;
use http_cache_semantics::{RequestLike, ResponseLike};

use log::{info, log_enabled, Level};
#[cfg(feature = "headers")]
use reqwest::header::HeaderMap;
use reqwest::{Error, Response, StatusCode};

use reqwest::{
header::{HeaderName, HeaderValue},
Error, Response, StatusCode,
};

#[cfg(feature = "fs")]
lazy_static! {
@@ -130,7 +134,10 @@ pub struct PageResponse {
pub content: Option<bytes::Bytes>,
#[cfg(feature = "headers")]
/// The headers of the response. (Always None if a webdriver protocol is used for fetching.).
pub headers: Option<HeaderMap>,
pub headers: Option<reqwest::header::HeaderMap>,
#[cfg(feature = "cookies")]
/// The cookies of the response.
pub cookies: Option<reqwest::header::HeaderMap>,
/// The status code of the request.
pub status_code: StatusCode,
/// The final url destination after any redirects.
@@ -1053,6 +1060,36 @@ pub fn get_last_redirect(
}
}

/// The response cookies mapped. This does nothing without the cookies feature flag enabled.
#[cfg(feature = "cookies")]
pub fn get_cookies(res: &Response) -> Option<reqwest::header::HeaderMap> {
let mut headers = reqwest::header::HeaderMap::new();

for cookie in res.cookies() {
match HeaderValue::from_str(cookie.value()) {
Ok(h) => match HeaderName::from_str(cookie.name()) {
Ok(n) => {
headers.insert(n, h);
}
_ => (),
},
_ => (),
}
}

if !headers.is_empty() {
Some(headers)
} else {
None
}
}

#[cfg(not(feature = "cookies"))]
/// The response cookies mapped. This does nothing without the cookies feature flag enabled.
pub fn get_cookies(_res: &Response) -> Option<reqwest::header::HeaderMap> {
None
}

/// Perform a network request to a resource extracting all content streaming.
pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageResponse {
use crate::bytes::BufMut;
@@ -1067,9 +1104,11 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageRespo
} else {
None
};
let status_code = res.status();
let status_code: StatusCode = res.status();
#[cfg(feature = "headers")]
let headers = res.headers().clone();
let cookies = get_cookies(&res);

let mut stream = res.bytes_stream();
let mut data: BytesMut = BytesMut::new();

@@ -1091,6 +1130,8 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageRespo
PageResponse {
#[cfg(feature = "headers")]
headers: Some(headers),
#[cfg(feature = "cookies")]
cookies,
content: Some(data.into()),
final_url: rd,
status_code,
@@ -1100,6 +1141,7 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageRespo
Ok(res) => PageResponse {
#[cfg(feature = "headers")]
headers: Some(res.headers().clone()),
#[cfg(feature = "cookies")]
cookies: get_cookies(&res),
status_code: res.status(),
..Default::default()
},
@@ -1133,9 +1175,9 @@ pub async fn fetch_page(target_url: &str, client: &Client) -> Option<bytes::Byte
/// Fetch a page with the headers returned.
pub enum FetchPageResult {
/// Success extracting contents of the page
Success(HeaderMap, Option<bytes::Bytes>),
Success(reqwest::header::HeaderMap, Option<bytes::Bytes>),
/// No success extracting content
NoSuccess(HeaderMap),
NoSuccess(reqwest::header::HeaderMap),
/// A network error occurred.
FetchError,
}
@@ -1192,6 +1234,7 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse
};

let status_code = res.status();
let cookies = get_cookies(&res);
#[cfg(feature = "headers")]
let headers = res.headers().clone();
let mut stream = res.bytes_stream();
@@ -1243,6 +1286,8 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse
PageResponse {
#[cfg(feature = "headers")]
headers: Some(headers),
#[cfg(feature = "cookies")]
cookies,
content: Some(if file.is_some() {
let mut buffer = vec![];

@@ -1269,6 +1314,8 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse
Ok(res) => PageResponse {
#[cfg(feature = "headers")]
headers: Some(res.headers().clone()),
#[cfg(feature = "cookies")]
cookies: get_cookies(&res),
status_code: res.status(),
..Default::default()
},
Expand All @@ -1290,8 +1337,8 @@ pub async fn fetch_page_html(
screenshot: &Option<crate::configuration::ScreenShotConfig>,
page_set: bool,
openai_config: &Option<crate::configuration::GPTConfigs>,
execution_scripts: &ExecutionScripts,
automation_scripts: &AutomationScripts,
execution_scripts: &Option<ExecutionScripts>,
automation_scripts: &Option<AutomationScripts>,
) -> PageResponse {
use crate::tokio::io::{AsyncReadExt, AsyncWriteExt};
use percent_encoding::utf8_percent_encode;
@@ -1329,6 +1376,7 @@ pub async fn fetch_page_html(
Ok(res) if res.status().is_success() => {
#[cfg(feature = "headers")]
let headers = res.headers().clone();
let cookies = get_cookies(&res);
let status_code = res.status();
let mut stream = res.bytes_stream();
let mut data: BytesMut = BytesMut::new();
@@ -1386,6 +1434,8 @@ pub async fn fetch_page_html(
PageResponse {
#[cfg(feature = "headers")]
headers: Some(headers),
#[cfg(feature = "cookies")]
cookies,
content: Some(if file.is_some() {
let mut buffer = vec![];

@@ -1412,6 +1462,8 @@ pub async fn fetch_page_html(
Ok(res) => PageResponse {
#[cfg(feature = "headers")]
headers: Some(res.headers().clone()),
#[cfg(feature = "cookies")]
cookies: get_cookies(&res),
status_code: res.status(),
..Default::default()
},
@@ -1506,6 +1558,7 @@ pub async fn fetch_page_html_chrome(
Ok(res) if res.status().is_success() => {
#[cfg(feature = "headers")]
let headers = res.headers().clone();
let cookies = get_cookies(&res);
let status_code = res.status();
let mut stream = res.bytes_stream();
let mut data: BytesMut = BytesMut::new();
@@ -1527,6 +1580,8 @@ pub async fn fetch_page_html_chrome(
PageResponse {
#[cfg(feature = "headers")]
headers: Some(headers),
#[cfg(feature = "cookies")]
cookies,
content: Some(data.into()),
status_code,
..Default::default()
@@ -1535,6 +1590,8 @@ pub async fn fetch_page_html_chrome(
Ok(res) => PageResponse {
#[cfg(feature = "headers")]
headers: Some(res.headers().clone()),
#[cfg(feature = "cookies")]
cookies: get_cookies(&res),
status_code: res.status(),
..Default::default()
},
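`get_cookies` flattens each response cookie into a `HeaderMap` entry keyed by the cookie name, silently dropping any pair that is not a valid header name/value. A standalone sketch of the same mapping, written with `if let` instead of the nested `match` (the name `map_cookies` and the sample pairs are hypothetical, not part of the crate):

```rust
use std::str::FromStr;

use reqwest::header::{HeaderMap, HeaderName, HeaderValue};

/// Hypothetical helper mirroring the mapping `get_cookies` performs:
/// cookie name -> header name, cookie value -> header value.
fn map_cookies<'a>(pairs: impl Iterator<Item = (&'a str, &'a str)>) -> Option<HeaderMap> {
    let mut headers = HeaderMap::new();

    for (name, value) in pairs {
        // Pairs that do not form a valid header name/value are skipped.
        if let (Ok(n), Ok(v)) = (HeaderName::from_str(name), HeaderValue::from_str(value)) {
            headers.insert(n, v);
        }
    }

    if headers.is_empty() {
        None
    } else {
        Some(headers)
    }
}

fn main() {
    // e.g. a response that set two cookies:
    let cookies = map_cookies([("session", "abc123"), ("theme", "dark")].into_iter());
    assert_eq!(cookies.map(|c| c.len()), Some(2));
}
```

Note that `HeaderName` lowercases names and `insert` replaces duplicates, so the resulting map is a normalized view of the raw cookies rather than a byte-for-byte copy.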
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.0.4"
version = "2.0.6"
authors = [
"j-mendez <[email protected]>"
]
@@ -28,7 +28,7 @@ quote = "1"
failure_derive = "0.1.8"

[dependencies.spider]
version = "2.0.4"
version = "2.0.6"
path = "../spider"

[[bin]]