chore(page): add http cookies map return
j-mendez committed Aug 20, 2024
1 parent eb73f70 commit 66af308
Showing 8 changed files with 202 additions and 103 deletions.
173 changes: 104 additions & 69 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.0.4"
version = "2.0.6"
authors = [
"j-mendez <[email protected]>"
]
24 changes: 12 additions & 12 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "2.0.4"
spider = "2.0.6"
```

And then the code:
@@ -93,7 +93,7 @@ We have the following optional feature flags.

```toml
[dependencies]
spider = { version = "2.0.4", features = ["regex", "ua_generator"] }
spider = { version = "2.0.6", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -139,7 +139,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "2.0.4", features = ["decentralized"] }
spider = { version = "2.0.6", features = ["decentralized"] }
```

```sh
@@ -170,7 +170,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "2.0.4", features = ["sync"] }
spider = { version = "2.0.6", features = ["sync"] }
```

```rust,no_run
@@ -201,7 +201,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "2.0.4", features = ["regex"] }
spider = { version = "2.0.6", features = ["regex"] }
```

```rust,no_run
@@ -228,7 +228,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "2.0.4", features = ["control"] }
spider = { version = "2.0.6", features = ["control"] }
```

```rust
@@ -298,7 +298,7 @@ Use cron jobs to run crawls continuously at anytime.

```toml
[dependencies]
spider = { version = "2.0.4", features = ["sync", "cron"] }
spider = { version = "2.0.6", features = ["sync", "cron"] }
```

```rust,no_run
@@ -337,7 +337,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network

```toml
[dependencies]
spider = { version = "2.0.4", features = ["chrome", "chrome_intercept"] }
spider = { version = "2.0.6", features = ["chrome", "chrome_intercept"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
@@ -367,7 +367,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`]

```toml
[dependencies]
spider = { version = "2.0.4", features = ["cache"] }
spider = { version = "2.0.6", features = ["cache"] }
```

You need to set `website.cache` to true to enable as well.
@@ -398,7 +398,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be

```toml
[dependencies]
spider = { version = "2.0.4", features = ["smart"] }
spider = { version = "2.0.6", features = ["smart"] }
```

```rust,no_run
@@ -424,7 +424,7 @@ Use OpenAI to generate dynamic scripts to drive the browser done with the featur

```toml
[dependencies]
spider = { version = "2.0.4", features = ["openai"] }
spider = { version = "2.0.6", features = ["openai"] }
```

```rust
@@ -450,7 +450,7 @@ Set a depth limit to prevent forwarding.

```toml
[dependencies]
spider = { version = "2.0.4" }
spider = { version = "2.0.6" }
```

```rust,no_run
19 changes: 13 additions & 6 deletions spider/src/page.rs
@@ -20,9 +20,6 @@ use tokio::time::Instant;

#[cfg(all(feature = "decentralized", feature = "headers"))]
use crate::utils::FetchPageResult;
#[cfg(feature = "headers")]
use reqwest::header::HeaderMap;

use tokio_stream::StreamExt;
use url::Url;

@@ -106,7 +103,10 @@ pub struct Page {
url: String,
#[cfg(feature = "headers")]
/// The headers of the page request response.
pub headers: Option<HeaderMap>,
pub headers: Option<reqwest::header::HeaderMap>,
#[cfg(feature = "cookies")]
/// The cookies of the page request response.
pub cookies: Option<reqwest::header::HeaderMap>,
/// The status code of the page request.
pub status_code: StatusCode,
/// The error of the request if any.
@@ -140,7 +140,10 @@ pub struct Page {
html: Option<Bytes>,
#[cfg(feature = "headers")]
/// The headers of the page request response.
pub headers: Option<HeaderMap>,
pub headers: Option<reqwest::header::HeaderMap>,
#[cfg(feature = "cookies")]
/// The cookies of the page request response.
pub cookies: Option<reqwest::header::HeaderMap>,
/// The status code of the page request.
pub status_code: StatusCode,
/// The error of the request if any.
@@ -259,6 +262,8 @@ pub fn build(url: &str, res: PageResponse) -> Page {
},
#[cfg(feature = "headers")]
headers: res.headers,
#[cfg(feature = "cookies")]
cookies: res.cookies,
base: match Url::parse(url) {
Ok(u) => Some(u),
_ => None,
@@ -298,6 +303,8 @@ pub fn build(_: &str, res: PageResponse) -> Page {
},
#[cfg(feature = "headers")]
headers: res.headers,
#[cfg(feature = "cookies")]
cookies: res.cookies,
final_redirect_destination: res.final_url,
status_code: res.status_code,
error_status: match res.error_for_status {
@@ -1407,7 +1414,7 @@ pub fn get_html_encoded(html: &Option<Bytes>, label: &str) -> String {
}

#[cfg(test)]
#[cfg(all(not(feature = "decentralized"), not(feature = "cache")))]
#[cfg(all(not(feature = "decentralized")))]
pub const TEST_AGENT_NAME: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"));

#[cfg(all(
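With this change `Page` exposes a `cookies` map alongside `headers`. A minimal sketch (not part of the diff) of reading it from a subscriber, assuming the `sync` and `cookies` feature flags are enabled and using the subscribe API shown in the README above:

```rust
// Sketch only: assumes spider's `sync` + `cookies` features and the
// subscribe API from the README diff above.
extern crate spider;

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            // `cookies` is populated from the response's Set-Cookie headers.
            if let Some(cookies) = &page.cookies {
                for (name, value) in cookies.iter() {
                    println!("{}: {}={}", page.get_url(), name, value.to_str().unwrap_or_default());
                }
            }
        }
    });

    website.crawl().await;
}
```

The field is `None` when the response set no cookies; without the `cookies` flag the field is compiled out entirely.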
75 changes: 66 additions & 9 deletions spider/src/utils/mod.rs
@@ -3,6 +3,8 @@ pub mod header_utils;
/// A trie struct.
pub mod trie;

use std::str::FromStr;

#[cfg(feature = "chrome")]
use crate::features::chrome_common::{AutomationScripts, ExecutionScripts};
use crate::tokio_stream::StreamExt;
@@ -11,9 +13,11 @@ use crate::Client;
use http_cache_semantics::{RequestLike, ResponseLike};

use log::{info, log_enabled, Level};
#[cfg(feature = "headers")]
use reqwest::header::HeaderMap;
use reqwest::{Error, Response, StatusCode};

use reqwest::{
header::{HeaderName, HeaderValue},
Error, Response, StatusCode,
};

#[cfg(feature = "fs")]
lazy_static! {
@@ -130,7 +134,10 @@ pub struct PageResponse {
pub content: Option<bytes::Bytes>,
#[cfg(feature = "headers")]
/// The headers of the response. (Always None if a webdriver protocol is used for fetching.).
pub headers: Option<HeaderMap>,
pub headers: Option<reqwest::header::HeaderMap>,
#[cfg(feature = "cookies")]
/// The cookies of the response.
pub cookies: Option<reqwest::header::HeaderMap>,
/// The status code of the request.
pub status_code: StatusCode,
/// The final url destination after any redirects.
@@ -1053,6 +1060,36 @@ pub fn get_last_redirect(
}
}

/// The response cookies mapped. This does nothing without the cookies feature flag enabled.
#[cfg(feature = "cookies")]
pub fn get_cookies(res: &Response) -> Option<reqwest::header::HeaderMap> {
let mut headers = reqwest::header::HeaderMap::new();

for cookie in res.cookies() {
match HeaderValue::from_str(cookie.value()) {
Ok(h) => match HeaderName::from_str(cookie.name()) {
Ok(n) => {
headers.insert(n, h);
}
_ => (),
},
_ => (),
}
}

if !headers.is_empty() {
Some(headers)
} else {
None
}
}

#[cfg(not(feature = "cookies"))]
/// The response cookies mapped. This does nothing without the cookies feature flag enabled.
pub fn get_cookies(_res: &Response) -> Option<reqwest::header::HeaderMap> {
None
}

/// Perform a network request to a resource extracting all content streaming.
pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageResponse {
use crate::bytes::BufMut;
@@ -1067,9 +1104,11 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageRespo
} else {
None
};
let status_code = res.status();
let status_code: StatusCode = res.status();
#[cfg(feature = "headers")]
let headers = res.headers().clone();
let cookies = get_cookies(&res);

let mut stream = res.bytes_stream();
let mut data: BytesMut = BytesMut::new();

@@ -1091,6 +1130,8 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageRespo
PageResponse {
#[cfg(feature = "headers")]
headers: Some(headers),
#[cfg(feature = "cookies")]
cookies,
content: Some(data.into()),
final_url: rd,
status_code,
@@ -1100,6 +1141,7 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageRespo
Ok(res) => PageResponse {
#[cfg(feature = "headers")]
headers: Some(res.headers().clone()),
#[cfg(feature = "cookies")]
cookies: get_cookies(&res),
status_code: res.status(),
..Default::default()
},
@@ -1133,9 +1175,9 @@ pub async fn fetch_page(target_url: &str, client: &Client) -> Option<bytes::Byte
/// Fetch a page with the headers returned.
pub enum FetchPageResult {
/// Success extracting contents of the page
Success(HeaderMap, Option<bytes::Bytes>),
Success(reqwest::header::HeaderMap, Option<bytes::Bytes>),
/// No success extracting content
NoSuccess(HeaderMap),
NoSuccess(reqwest::header::HeaderMap),
/// A network error occurred.
FetchError,
}
@@ -1192,6 +1234,7 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse
};

let status_code = res.status();
let cookies = get_cookies(&res);
#[cfg(feature = "headers")]
let headers = res.headers().clone();
let mut stream = res.bytes_stream();
@@ -1243,6 +1286,8 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse
PageResponse {
#[cfg(feature = "headers")]
headers: Some(headers),
#[cfg(feature = "cookies")]
cookies,
content: Some(if file.is_some() {
let mut buffer = vec![];

@@ -1269,6 +1314,8 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse
Ok(res) => PageResponse {
#[cfg(feature = "headers")]
headers: Some(res.headers().clone()),
#[cfg(feature = "cookies")]
cookies: get_cookies(&res),
status_code: res.status(),
..Default::default()
},
Expand All @@ -1290,8 +1337,8 @@ pub async fn fetch_page_html(
screenshot: &Option<crate::configuration::ScreenShotConfig>,
page_set: bool,
openai_config: &Option<crate::configuration::GPTConfigs>,
execution_scripts: &ExecutionScripts,
automation_scripts: &AutomationScripts,
execution_scripts: &Option<ExecutionScripts>,
automation_scripts: &Option<AutomationScripts>,
) -> PageResponse {
use crate::tokio::io::{AsyncReadExt, AsyncWriteExt};
use percent_encoding::utf8_percent_encode;
@@ -1329,6 +1376,7 @@ pub async fn fetch_page_html(
Ok(res) if res.status().is_success() => {
#[cfg(feature = "headers")]
let headers = res.headers().clone();
let cookies = get_cookies(&res);
let status_code = res.status();
let mut stream = res.bytes_stream();
let mut data: BytesMut = BytesMut::new();
@@ -1386,6 +1434,8 @@ pub async fn fetch_page_html(
PageResponse {
#[cfg(feature = "headers")]
headers: Some(headers),
#[cfg(feature = "cookies")]
cookies,
content: Some(if file.is_some() {
let mut buffer = vec![];

@@ -1412,6 +1462,8 @@ pub async fn fetch_page_html(
Ok(res) => PageResponse {
#[cfg(feature = "headers")]
headers: Some(res.headers().clone()),
#[cfg(feature = "cookies")]
cookies: get_cookies(&res),
status_code: res.status(),
..Default::default()
},
@@ -1506,6 +1558,7 @@ pub async fn fetch_page_html_chrome(
Ok(res) if res.status().is_success() => {
#[cfg(feature = "headers")]
let headers = res.headers().clone();
let cookies = get_cookies(&res);
let status_code = res.status();
let mut stream = res.bytes_stream();
let mut data: BytesMut = BytesMut::new();
@@ -1527,6 +1580,8 @@ pub async fn fetch_page_html_chrome(
PageResponse {
#[cfg(feature = "headers")]
headers: Some(headers),
#[cfg(feature = "cookies")]
cookies,
content: Some(data.into()),
status_code,
..Default::default()
@@ -1535,6 +1590,8 @@ pub async fn fetch_page_html_chrome(
Ok(res) => PageResponse {
#[cfg(feature = "headers")]
headers: Some(res.headers().clone()),
#[cfg(feature = "cookies")]
cookies: get_cookies(&res),
status_code: res.status(),
..Default::default()
},
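`get_cookies` flattens each response cookie into a `HeaderMap` entry keyed by the cookie name, silently dropping any pair that is not a valid header name/value. A standalone sketch of the same mapping, written with `if let` instead of the nested `match` (the name `map_cookies` and the sample pairs are hypothetical, not part of the crate):

```rust
use std::str::FromStr;

use reqwest::header::{HeaderMap, HeaderName, HeaderValue};

/// Hypothetical helper mirroring the mapping `get_cookies` performs:
/// cookie name -> header name, cookie value -> header value.
fn map_cookies<'a>(pairs: impl Iterator<Item = (&'a str, &'a str)>) -> Option<HeaderMap> {
    let mut headers = HeaderMap::new();

    for (name, value) in pairs {
        // Pairs that do not form a valid header name/value are skipped.
        if let (Ok(n), Ok(v)) = (HeaderName::from_str(name), HeaderValue::from_str(value)) {
            headers.insert(n, v);
        }
    }

    if headers.is_empty() {
        None
    } else {
        Some(headers)
    }
}

fn main() {
    // e.g. a response that set two cookies:
    let cookies = map_cookies([("session", "abc123"), ("theme", "dark")].into_iter());
    assert_eq!(cookies.map(|c| c.len()), Some(2));
}
```

Note that `HeaderName` lowercases names and `insert` replaces duplicates, so the resulting map is a normalized view of the raw cookies rather than a byte-for-byte copy.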
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.0.4"
version = "2.0.6"
authors = [
"j-mendez <[email protected]>"
]
@@ -28,7 +28,7 @@ quote = "1"
failure_derive = "0.1.8"

[dependencies.spider]
version = "2.0.4"
version = "2.0.6"
path = "../spider"

[[bin]]