From 66af3089f72403129059027c5e48b808971d35b9 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Tue, 20 Aug 2024 16:49:32 -0400 Subject: [PATCH] chore(page): add http cookies map return --- Cargo.lock | 173 +++++++++++++++++++++++---------------- spider/Cargo.toml | 2 +- spider/README.md | 24 +++--- spider/src/page.rs | 19 +++-- spider/src/utils/mod.rs | 75 +++++++++++++++-- spider_cli/Cargo.toml | 4 +- spider_utils/Cargo.toml | 4 +- spider_worker/Cargo.toml | 4 +- 8 files changed, 202 insertions(+), 103 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cd562131a2..a4bfc502d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -49,6 +49,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "ahash" version = "0.8.11" @@ -237,7 +243,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -304,7 +310,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.4", "object", "rustc-demangle", ] @@ -487,9 +493,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.11" +version = "1.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fb8dd288a69fc53a1996d7ecfbf4a20d59065bff137ce7e56bbd620de191189" +checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48" dependencies = [ "jobserver", "libc", @@ -621,9 +627,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.15" +version = "4.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d8838454fda655dafd3accb2b6e2bea645b9e4078abe84a22ceb947235c5cc" +checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019" dependencies = [ "clap_builder", "clap_derive", @@ -650,7 +656,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -928,7 +934,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -952,7 +958,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -963,7 +969,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -999,7 +1005,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1009,7 +1015,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" dependencies = [ "derive_builder_core", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1022,7 +1028,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1086,7 +1092,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1221,12 +1227,12 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.31" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" +checksum = "9c0596c1eac1f9e04ed902702e9878208b336edc9d6fddc8a48387349bab3666" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.8.0", ] [[package]] @@ -1338,7 +1344,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1447,9 +1453,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" dependencies = [ "atomic-waker", "bytes", @@ -1542,6 +1548,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "hex" version = "0.4.3" @@ -1786,7 +1798,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "http 1.1.0", "http-body 1.0.1", "httparse", @@ -1972,11 +1984,11 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is-terminal" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" dependencies = [ - "hermit-abi", + "hermit-abi 0.4.0", "libc", "windows-sys 0.52.0", ] @@ -2049,9 +2061,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.155" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "linked-hash-map" @@ -2162,7 +2174,7 @@ checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -2196,13 +2208,22 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", @@ -2310,7 +2331,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", ] @@ -2379,7 +2400,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -2565,7 +2586,7 @@ dependencies = [ "phf_shared 0.11.2", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -2612,7 +2633,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -2982,9 +3003,9 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "reqwest" -version = "0.12.5" +version = "0.12.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" dependencies = [ "async-compression", "base64 0.22.1", @@ -2994,7 +3015,7 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "hickory-resolver", "http 1.1.0", "http-body 1.0.1", @@ -3034,7 +3055,7 @@ dependencies = [ "wasm-streams", "web-sys", "webpki-roots", - "winreg 0.52.0", + "windows-registry", ] [[package]] @@ -3186,9 +3207,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" +checksum = "04182dffc9091a404e0fc069ea5cd60e5b866c3adf881eff99a32d048242dffa" dependencies = [ "openssl-probe", "rustls-pemfile", @@ -3352,29 +3373,29 @@ checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "serde" -version = "1.0.207" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5665e14a49a4ea1b91029ba7d3bca9f299e1f7cfa194388ccc20f14743e784f2" +checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.207" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6aea2634c86b0e8ef2cfdc0c340baede54ec27b1e46febd7f80dffb2aa44a00e" +checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] name = "serde_json" -version = "1.0.124" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66ad62847a56b3dba58cc891acd13884b9c61138d330c0d7b6181713d4fce38d" +checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" dependencies = [ "itoa 1.0.11", "memchr", @@ -3506,7 +3527,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.0.4" +version = "2.0.6" dependencies = [ "adblock", "ahash", @@ -3561,7 +3582,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.0.4" +version = "2.0.6" dependencies = [ "clap", "env_logger", @@ -3584,7 +3605,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "0.2.1" +version = "0.2.3" dependencies = [ "indexmap 1.9.3", "spider", @@ -3592,7 +3613,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.0.4" +version = "2.0.6" dependencies = [ "env_logger", "lazy_static", @@ -3692,7 +3713,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3714,9 +3735,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.74" +version = "2.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" +checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" dependencies = [ "proc-macro2", "quote", @@ -3728,6 +3749,9 @@ name = "sync_wrapper" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -3743,20 +3767,20 @@ dependencies = [ [[package]] name = "system-configuration" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +checksum = "658bc6ee10a9b4fcf576e9b0819d95ec16f4d2c02d39fd83ac1c8789785c4a42" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "core-foundation", "system-configuration-sys", ] [[package]] name = "system-configuration-sys" -version = "0.5.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" dependencies = [ "core-foundation-sys", "libc", @@ -3815,7 +3839,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3911,9 +3935,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.2" +version = "1.39.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" +checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" dependencies = [ "backtrace", "bytes", @@ -3935,7 +3959,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -4082,7 +4106,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -4187,9 +4211,9 @@ checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" [[package]] name = "unicode-xid" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" +checksum = "229730647fbc343e3a80e463c1db7f78f3855d3f3739bee0dda773c9a037c90a" [[package]] name = "untrusted" @@ -4346,7 +4370,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", "wasm-bindgen-shared", ] @@ -4380,7 +4404,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4512,7 +4536,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -4523,7 +4547,18 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", +] + +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result", + "windows-strings", + "windows-targets 0.52.6", ] [[package]] @@ -4752,7 +4787,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 8a20ac7e0b..e8b16705b6 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.0.4" +version = "2.0.6" authors = [ "j-mendez " ] diff --git a/spider/README.md b/spider/README.md index 6b5a023a67..a94e1bade7 100644 --- a/spider/README.md +++ b/spider/README.md @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom ```toml [dependencies] -spider = "2.0.4" +spider = "2.0.6" ``` And then the code: @@ -93,7 +93,7 @@ We have the following optional feature flags. ```toml [dependencies] -spider = { version = "2.0.4", features = ["regex", "ua_generator"] } +spider = { version = "2.0.6", features = ["regex", "ua_generator"] } ``` 1. `ua_generator`: Enables auto generating a random real User-Agent. @@ -139,7 +139,7 @@ Move processing to a worker, drastically increases performance even if worker is ```toml [dependencies] -spider = { version = "2.0.4", features = ["decentralized"] } +spider = { version = "2.0.6", features = ["decentralized"] } ``` ```sh @@ -170,7 +170,7 @@ Use the subscribe method to get a broadcast channel. ```toml [dependencies] -spider = { version = "2.0.4", features = ["sync"] } +spider = { version = "2.0.6", features = ["sync"] } ``` ```rust,no_run @@ -201,7 +201,7 @@ Allow regex for blacklisting routes ```toml [dependencies] -spider = { version = "2.0.4", features = ["regex"] } +spider = { version = "2.0.6", features = ["regex"] } ``` ```rust,no_run @@ -228,7 +228,7 @@ If you are performing large workloads you may need to control the crawler by ena ```toml [dependencies] -spider = { version = "2.0.4", features = ["control"] } +spider = { version = "2.0.6", features = ["control"] } ``` ```rust @@ -298,7 +298,7 @@ Use cron jobs to run crawls continuously at anytime. ```toml [dependencies] -spider = { version = "2.0.4", features = ["sync", "cron"] } +spider = { version = "2.0.6", features = ["sync", "cron"] } ``` ```rust,no_run @@ -337,7 +337,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network ```toml [dependencies] -spider = { version = "2.0.4", features = ["chrome", "chrome_intercept"] } +spider = { version = "2.0.6", features = ["chrome", "chrome_intercept"] } ``` You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug. @@ -367,7 +367,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`] ```toml [dependencies] -spider = { version = "2.0.4", features = ["cache"] } +spider = { version = "2.0.6", features = ["cache"] } ``` You need to set `website.cache` to true to enable as well. @@ -398,7 +398,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be ```toml [dependencies] -spider = { version = "2.0.4", features = ["smart"] } +spider = { version = "2.0.6", features = ["smart"] } ``` ```rust,no_run @@ -424,7 +424,7 @@ Use OpenAI to generate dynamic scripts to drive the browser done with the featur ```toml [dependencies] -spider = { version = "2.0.4", features = ["openai"] } +spider = { version = "2.0.6", features = ["openai"] } ``` ```rust @@ -450,7 +450,7 @@ Set a depth limit to prevent forwarding. ```toml [dependencies] -spider = { version = "2.0.4" } +spider = { version = "2.0.6" } ``` ```rust,no_run diff --git a/spider/src/page.rs b/spider/src/page.rs index 4b2aeb4c63..869b34428d 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -20,9 +20,6 @@ use tokio::time::Instant; #[cfg(all(feature = "decentralized", feature = "headers"))] use crate::utils::FetchPageResult; -#[cfg(feature = "headers")] -use reqwest::header::HeaderMap; - use tokio_stream::StreamExt; use url::Url; @@ -106,7 +103,10 @@ pub struct Page { url: String, #[cfg(feature = "headers")] /// The headers of the page request response. - pub headers: Option, + pub headers: Option, + #[cfg(feature = "cookies")] + /// The cookies of the page request response. + pub cookies: Option, /// The status code of the page request. pub status_code: StatusCode, /// The error of the request if any. @@ -140,7 +140,10 @@ pub struct Page { html: Option, #[cfg(feature = "headers")] /// The headers of the page request response. - pub headers: Option, + pub headers: Option, + #[cfg(feature = "cookies")] + /// The cookies of the page request response. + pub cookies: Option, /// The status code of the page request. pub status_code: StatusCode, /// The error of the request if any. @@ -259,6 +262,8 @@ pub fn build(url: &str, res: PageResponse) -> Page { }, #[cfg(feature = "headers")] headers: res.headers, + #[cfg(feature = "cookies")] + cookies: res.cookies, base: match Url::parse(url) { Ok(u) => Some(u), _ => None, @@ -298,6 +303,8 @@ pub fn build(_: &str, res: PageResponse) -> Page { }, #[cfg(feature = "headers")] headers: res.headers, + #[cfg(feature = "cookies")] + cookies: res.cookies, final_redirect_destination: res.final_url, status_code: res.status_code, error_status: match res.error_for_status { @@ -1407,7 +1414,7 @@ pub fn get_html_encoded(html: &Option, label: &str) -> String { } #[cfg(test)] -#[cfg(all(not(feature = "decentralized"), not(feature = "cache")))] +#[cfg(all(not(feature = "decentralized")))] pub const TEST_AGENT_NAME: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION")); #[cfg(all( diff --git a/spider/src/utils/mod.rs b/spider/src/utils/mod.rs index e7cfe97be8..5f5ae6c6b2 100644 --- a/spider/src/utils/mod.rs +++ b/spider/src/utils/mod.rs @@ -3,6 +3,8 @@ pub mod header_utils; /// A trie struct. pub mod trie; +use std::str::FromStr; + #[cfg(feature = "chrome")] use crate::features::chrome_common::{AutomationScripts, ExecutionScripts}; use crate::tokio_stream::StreamExt; @@ -11,9 +13,11 @@ use crate::Client; use http_cache_semantics::{RequestLike, ResponseLike}; use log::{info, log_enabled, Level}; -#[cfg(feature = "headers")] -use reqwest::header::HeaderMap; -use reqwest::{Error, Response, StatusCode}; + +use reqwest::{ + header::{HeaderName, HeaderValue}, + Error, Response, StatusCode, +}; #[cfg(feature = "fs")] lazy_static! { @@ -130,7 +134,10 @@ pub struct PageResponse { pub content: Option, #[cfg(feature = "headers")] /// The headers of the response. (Always None if a webdriver protocol is used for fetching.). - pub headers: Option, + pub headers: Option, + #[cfg(feature = "cookies")] + /// The cookies of the response. + pub cookies: Option, /// The status code of the request. pub status_code: StatusCode, /// The final url destination after any redirects. @@ -1053,6 +1060,36 @@ pub fn get_last_redirect( } } +/// The response cookies mapped. This does nothing without the cookies feature flag enabled. +#[cfg(feature = "cookies")] +pub fn get_cookies(res: &Response) -> Option { + let mut headers = reqwest::header::HeaderMap::new(); + + for cookie in res.cookies() { + match HeaderValue::from_str(cookie.value()) { + Ok(h) => match HeaderName::from_str(cookie.name()) { + Ok(n) => { + headers.insert(n, h); + } + _ => (), + }, + _ => (), + } + } + + if !headers.is_empty() { + Some(headers) + } else { + None + } +} + +#[cfg(not(feature = "cookies"))] +/// The response cookies mapped. This does nothing without the cookies feature flag enabled. +pub fn get_cookies(res: &Response) -> Option { + None +} + /// Perform a network request to a resource extracting all content streaming. pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageResponse { use crate::bytes::BufMut; @@ -1067,9 +1104,11 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageRespo } else { None }; - let status_code = res.status(); + let status_code: StatusCode = res.status(); #[cfg(feature = "headers")] let headers = res.headers().clone(); + let cookies = get_cookies(&res); + let mut stream = res.bytes_stream(); let mut data: BytesMut = BytesMut::new(); @@ -1091,6 +1130,8 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageRespo PageResponse { #[cfg(feature = "headers")] headers: Some(headers), + #[cfg(feature = "cookies")] + cookies, content: Some(data.into()), final_url: rd, status_code, @@ -1100,6 +1141,7 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageRespo Ok(res) => PageResponse { #[cfg(feature = "headers")] headers: Some(res.headers().clone()), + cookies: get_cookies(&res), status_code: res.status(), ..Default::default() }, @@ -1133,9 +1175,9 @@ pub async fn fetch_page(target_url: &str, client: &Client) -> Option), + Success(reqwest::header::HeaderMap, Option), /// No success extracting content - NoSuccess(HeaderMap), + NoSuccess(reqwest::header::HeaderMap), /// A network error occured. FetchError, } @@ -1192,6 +1234,7 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse }; let status_code = res.status(); + let cookies = get_cookies(&res); #[cfg(feature = "headers")] let headers = res.headers().clone(); let mut stream = res.bytes_stream(); @@ -1243,6 +1286,8 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse PageResponse { #[cfg(feature = "headers")] headers: Some(headers), + #[cfg(feature = "cookies")] + cookies, content: Some(if file.is_some() { let mut buffer = vec![]; @@ -1269,6 +1314,8 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse Ok(res) => PageResponse { #[cfg(feature = "headers")] headers: Some(res.headers().clone()), + #[cfg(feature = "cookies")] + cookies: get_cookies(&res), status_code: res.status(), ..Default::default() }, @@ -1290,8 +1337,8 @@ pub async fn fetch_page_html( screenshot: &Option, page_set: bool, openai_config: &Option, - execution_scripts: &ExecutionScripts, - automation_scripts: &AutomationScripts, + execution_scripts: &Option, + automation_scripts: &Option, ) -> PageResponse { use crate::tokio::io::{AsyncReadExt, AsyncWriteExt}; use percent_encoding::utf8_percent_encode; @@ -1329,6 +1376,7 @@ pub async fn fetch_page_html( Ok(res) if res.status().is_success() => { #[cfg(feature = "headers")] let headers = res.headers().clone(); + let cookies = get_cookies(&res); let status_code = res.status(); let mut stream = res.bytes_stream(); let mut data: BytesMut = BytesMut::new(); @@ -1386,6 +1434,8 @@ pub async fn fetch_page_html( PageResponse { #[cfg(feature = "headers")] headers: Some(headers), + #[cfg(feature = "cookies")] + cookies, content: Some(if file.is_some() { let mut buffer = vec![]; @@ -1412,6 +1462,8 @@ pub async fn fetch_page_html( Ok(res) => PageResponse { #[cfg(feature = "headers")] headers: Some(res.headers().clone()), + #[cfg(feature = "cookies")] + cookies: get_cookies(&res), status_code: res.status(), ..Default::default() }, @@ -1506,6 +1558,7 @@ pub async fn fetch_page_html_chrome( Ok(res) if res.status().is_success() => { #[cfg(feature = "headers")] let headers = res.headers().clone(); + let cookies = get_cookies(&res); let status_code = res.status(); let mut stream = res.bytes_stream(); let mut data: BytesMut = BytesMut::new(); @@ -1527,6 +1580,8 @@ pub async fn fetch_page_html_chrome( PageResponse { #[cfg(feature = "headers")] headers: Some(headers), + #[cfg(feature = "cookies")] + cookies, content: Some(data.into()), status_code, ..Default::default() @@ -1535,6 +1590,8 @@ pub async fn fetch_page_html_chrome( Ok(res) => PageResponse { #[cfg(feature = "headers")] headers: Some(res.headers().clone()), + #[cfg(feature = "cookies")] + cookies: get_cookies(&res), status_code: res.status(), ..Default::default() }, diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 5312e792da..65b0eb1c99 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.0.4" +version = "2.0.6" authors = [ "j-mendez " ] @@ -28,7 +28,7 @@ quote = "1" failure_derive = "0.1.8" [dependencies.spider] -version = "2.0.4" +version = "2.0.6" path = "../spider" [[bin]] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index bcc4624341..c1c20034b9 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "0.2.1" +version = "0.2.3" authors = [ "j-mendez " ] @@ -17,7 +17,7 @@ edition = "2018" indexmap = { version = "1", optional = true } [dependencies.spider] -version = "2.0.4" +version = "2.0.6" path = "../spider" [features] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index a1d3131cf3..e57f968200 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.0.4" +version = "2.0.6" authors = [ "j-mendez " ] @@ -24,7 +24,7 @@ lazy_static = "1.4.0" env_logger = "0.11.3" [dependencies.spider] -version = "2.0.4" +version = "2.0.6" path = "../spider" features = ["serde", "flexbuffers"]