diff --git a/Cargo.lock b/Cargo.lock index f7a056e..9210030 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,14 +19,15 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ "cfg-if", - "getrandom 0.2.10", + "getrandom", "once_cell", "version_check", + "zerocopy", ] [[package]] @@ -53,6 +54,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + [[package]] name = "android-tzdata" version = "0.1.1" @@ -96,7 +103,21 @@ checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.48", +] + +[[package]] +name = "async-tungstenite" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1e9efbe14612da0a19fb983059a0b621e9cf6225d7018ecab4f9988215540dc" +dependencies = [ + "futures-io", + "futures-util", + "log", + "pin-project-lite", + "tokio", + "tungstenite", ] [[package]] @@ -124,7 +145,7 @@ checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39" dependencies = [ "async-trait", "axum-core", - "bitflags", + "bitflags 1.3.2", "bytes", "futures-util", "http", @@ -188,6 +209,21 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "brotli" version = "3.3.4" @@ -223,15 +259,18 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +dependencies = [ + "serde", +] [[package]] name = "case_insensitive_string" -version = "0.1.6" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5252d96f01ec50f29b85408fbc0e77d6707f72c80dbe21324610372a6811a031" +checksum = "fc229be27b394115abdc89e09500d5030407734d21a143a833eae5f136821bcd" dependencies = [ "compact_str", "serde", @@ -262,6 +301,73 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chromiumoxide" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2abb1e644b7fcaa711fe4d3fca06c52bf18ebc5398711b2b11e8281c6107fbe" +dependencies = [ + "async-tungstenite", + "base64", + "bytes", + "cfg-if", + "chromiumoxide_cdp", + "chromiumoxide_types", + "dunce", + "fnv", + "futures", + "futures-timer", + "pin-project-lite", + "reqwest", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "url", + "which", + "winreg 0.51.0", +] + +[[package]] +name = "chromiumoxide_cdp" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72641e4931b5372361160346d88605bbd6252b54c89581b71ac06b3b68b8b7c0" +dependencies = [ + "chromiumoxide_pdl", + "chromiumoxide_types", + "serde", + "serde_json", +] + +[[package]] +name = "chromiumoxide_pdl" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ecbd2d949e41b72575e61f9a38a5e2d6207ffb98544448e1eb7b9310d48bbbe" +dependencies = [ + "chromiumoxide_types", + "either", + "heck", + "once_cell", + "proc-macro2", + "quote", + "regex", + "serde", + "serde_json", +] + +[[package]] +name = "chromiumoxide_types" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5fc717f4c899a7a02a50f7698a90acade50a1a65fe7804e57c747739fa23e25" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "chrono" version = "0.4.26" @@ -299,12 +405,6 @@ dependencies = [ "static_assertions", ] -[[package]] -name = "convert_case" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" - [[package]] name = "core-foundation" version = "0.9.3" @@ -321,6 +421,15 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "cpufeatures" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.3.2" @@ -330,21 +439,27 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "cssparser" -version = "0.29.6" +version = "0.31.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93d03419cb5950ccfd3daf3ff1c7a36ace64609a1a8746d493df1ca0afde0fa" +checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" dependencies = [ "cssparser-macros", "dtoa-short", "itoa", - "matches", - "phf 0.10.1", - "proc-macro2", - "quote", + "phf 0.11.2", "smallvec", - "syn 1.0.109", ] [[package]] @@ -354,22 +469,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ "quote", - "syn 2.0.23", + "syn 2.0.48", ] +[[package]] +name = "data-encoding" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e962a19be5cfc3f3bf6dd8f61eb50107f356ad6270fbb3ed41476571db78be5" + [[package]] name = "derive_more" version = "0.99.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" dependencies = [ - "convert_case", "proc-macro2", "quote", - "rustc_version", "syn 1.0.109", ] +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "dtoa" version = "1.0.8" @@ -385,6 +514,12 @@ dependencies = [ "dtoa", ] +[[package]] +name = "dunce" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56ce8c6da7551ec6c462cbaf3bfbc75131ebbfa1c944aeaa9dab51ca1c5f0c3b" + [[package]] name = "ego-tree" version = "0.6.2" @@ -528,9 +663,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] @@ -551,6 +686,21 @@ dependencies = [ "new_debug_unreachable", ] +[[package]] +name = "futures" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + [[package]] name = "futures-channel" version = "0.3.28" @@ -558,6 +708,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" dependencies = [ "futures-core", + "futures-sink", ] [[package]] @@ -566,6 +717,17 @@ version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +[[package]] +name = "futures-executor" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-io" version = "0.3.28" @@ -580,7 +742,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.48", ] [[package]] @@ -595,12 +757,19 @@ version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" + [[package]] name = "futures-util" version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ + "futures-channel", "futures-core", "futures-io", "futures-macro", @@ -622,14 +791,13 @@ dependencies = [ ] [[package]] -name = "getrandom" -version = "0.1.16" +name = "generic-array" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ - "cfg-if", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", + "typenum", + "version_check", ] [[package]] @@ -676,11 +844,12 @@ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" [[package]] name = "hashbrown" -version = "0.13.2" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" dependencies = [ "ahash", + "allocator-api2", ] [[package]] @@ -818,9 +987,9 @@ dependencies = [ [[package]] name = "idna" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -923,9 +1092,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.147" +version = "0.2.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" [[package]] name = "linux-raw-sys" @@ -945,9 +1114,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.19" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "mac" @@ -955,12 +1124,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" -[[package]] -name = "matches" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" - [[package]] name = "matchit" version = "0.7.0" @@ -969,9 +1132,9 @@ checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" [[package]] name = "memchr" -version = "2.5.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "mime" @@ -990,9 +1153,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.8" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" dependencies = [ "libc", "wasi 0.11.0+wasi-snapshot-preview1", @@ -1029,12 +1192,6 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" -[[package]] -name = "nodrop" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" - [[package]] name = "num-traits" version = "0.2.15" @@ -1075,7 +1232,7 @@ version = "0.10.55" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cfg-if", "foreign-types", "libc", @@ -1092,7 +1249,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.48", ] [[package]] @@ -1149,9 +1306,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "petgraph" @@ -1163,24 +1320,13 @@ dependencies = [ "indexmap", ] -[[package]] -name = "phf" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" -dependencies = [ - "phf_shared 0.8.0", -] - [[package]] name = "phf" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" dependencies = [ - "phf_macros", "phf_shared 0.10.0", - "proc-macro-hack", ] [[package]] @@ -1189,17 +1335,18 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" dependencies = [ + "phf_macros", "phf_shared 0.11.2", ] [[package]] name = "phf_codegen" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" dependencies = [ - "phf_generator 0.8.0", - "phf_shared 0.8.0", + "phf_generator 0.10.0", + "phf_shared 0.10.0", ] [[package]] @@ -1212,16 +1359,6 @@ dependencies = [ "phf_shared 0.11.2", ] -[[package]] -name = "phf_generator" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" -dependencies = [ - "phf_shared 0.8.0", - "rand 0.7.3", -] - [[package]] name = "phf_generator" version = "0.10.0" @@ -1229,7 +1366,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" dependencies = [ "phf_shared 0.10.0", - "rand 0.8.5", + "rand", ] [[package]] @@ -1239,30 +1376,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" dependencies = [ "phf_shared 0.11.2", - "rand 0.8.5", + "rand", ] [[package]] name = "phf_macros" -version = "0.10.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0" +checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", - "proc-macro-hack", + "phf_generator 0.11.2", + "phf_shared 0.11.2", "proc-macro2", "quote", - "syn 1.0.109", -] - -[[package]] -name = "phf_shared" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" -dependencies = [ - "siphasher", + "syn 2.0.48", ] [[package]] @@ -1300,7 +1427,7 @@ checksum = "ec2e072ecce94ec471b13398d5402c188e76ac03cf74dd1a975161b23a3f6d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.48", ] [[package]] @@ -1343,17 +1470,11 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.20+deprecated" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" - [[package]] name = "proc-macro2" -version = "1.0.63" +version = "1.0.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" +checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" dependencies = [ "unicode-ident", ] @@ -1414,27 +1535,13 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.29" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom 0.1.16", - "libc", - "rand_chacha 0.2.2", - "rand_core 0.5.1", - "rand_hc", - "rand_pcg", -] - [[package]] name = "rand" version = "0.8.5" @@ -1442,18 +1549,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core 0.5.1", + "rand_chacha", + "rand_core", ] [[package]] @@ -1463,16 +1560,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom 0.1.16", + "rand_core", ] [[package]] @@ -1481,25 +1569,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.10", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "rand_pcg" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" -dependencies = [ - "rand_core 0.5.1", + "getrandom", ] [[package]] @@ -1508,14 +1578,14 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] name = "regex" -version = "1.9.0" +version = "1.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89089e897c013b3deb627116ae56a6955a72b8bed395c9526af31c9fe528b484" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" dependencies = [ "aho-corasick", "memchr", @@ -1525,9 +1595,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.0" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa250384981ea14565685dea16a9ccc4d1c541a13f82b9c168572264d1df8c56" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" dependencies = [ "aho-corasick", "memchr", @@ -1536,15 +1606,15 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.3" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "reqwest" -version = "0.11.18" +version = "0.11.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" +checksum = "37b1ae8d9ac08420c66222fb9096fc5de435c3c48542bc5336c51892cffafb41" dependencies = [ "async-compression", "base64", @@ -1568,9 +1638,9 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", + "system-configuration", "tokio", "tokio-native-tls", - "tokio-socks", "tokio-util", "tower-service", "url", @@ -1578,22 +1648,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "winreg", -] - -[[package]] -name = "ring" -version = "0.16.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin", - "untrusted", - "web-sys", - "winapi", + "winreg 0.50.0", ] [[package]] @@ -1617,7 +1672,7 @@ version = "0.37.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06" dependencies = [ - "bitflags", + "bitflags 1.3.2", "errno", "io-lifetimes", "libc", @@ -1625,38 +1680,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "rustls" -version = "0.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b19faa85ecb5197342b54f987b142fb3e30d0c90da40f80ef4fa9a726e6676ed" -dependencies = [ - "log", - "ring", - "rustls-webpki 0.101.1", - "sct", -] - -[[package]] -name = "rustls-webpki" -version = "0.100.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "rustls-webpki" -version = "0.101.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15f36a6828982f422756984e47912a7a51dcbc2a197aa791158f8ca61cd8204e" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "rustversion" version = "1.0.13" @@ -1684,23 +1707,13 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "sct" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "security-framework" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc758eb7bffce5b308734e9b0c1468893cae9ff70ebf13e7090be8dcbcc83a8" dependencies = [ - "bitflags", + "bitflags 1.3.2", "core-foundation", "core-foundation-sys", "libc", @@ -1719,17 +1732,18 @@ dependencies = [ [[package]] name = "selectors" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416" +checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" dependencies = [ - "bitflags", + "bitflags 2.4.1", "cssparser", "derive_more", "fxhash", "log", - "phf 0.8.0", - "phf_codegen 0.8.0", + "new_debug_unreachable", + "phf 0.10.1", + "phf_codegen 0.10.0", "precomputed-hash", "servo_arc", "smallvec", @@ -1743,22 +1757,22 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "serde" -version = "1.0.166" +version = "1.0.195" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d01b7404f9d441d3ad40e6a636a7782c377d2abdbe4fa2440e2edcc2f4f10db8" +checksum = "63261df402c67811e9ac6def069e4786148c4563f4b50fd4bf30aa370d626b02" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.166" +version = "1.0.195" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd83d6dde2b6b2d466e14d9d1acce8816dedee94f735eac6395808b3483c6d6" +checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.48", ] [[package]] @@ -1786,14 +1800,33 @@ dependencies = [ [[package]] name = "servo_arc" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52aa42f8fdf0fed91e5ce7f23d8138441002fa31dca008acf47e6fd4721f741" +checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" dependencies = [ - "nodrop", "stable_deref_trait", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + [[package]] name = "siphasher" version = "0.3.10" @@ -1823,9 +1856,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970" [[package]] name = "socket2" @@ -1839,19 +1872,45 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" +checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" dependencies = [ "libc", "windows-sys", ] [[package]] -name = "spin" -version = "0.5.2" +name = "spider" +version = "1.80.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +checksum = "c21ae5ee676dcab8fa4e4c0ae17a0b19ab15842a5dc57604ea5138de45e4dac8" +dependencies = [ + "ahash", + "bytes", + "case_insensitive_string", + "chromiumoxide", + "compact_str", + "cssparser", + "ego-tree", + "fast_html5ever", + "hashbrown 0.14.3", + "lazy_static", + "log", + "num_cpus", + "percent-encoding", + "regex", + "reqwest", + "selectors", + "sitemap", + "smallvec", + "string_concat", + "strum", + "tendril", + "tokio", + "tokio-stream", + "url", +] [[package]] name = "stable_deref_trait" @@ -1897,6 +1956,28 @@ version = "0.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3c3ee6129eec20fed59acf2e9cfb3ffd20d0bbe39fe334c22af0edc56dfe752" +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.48", +] + [[package]] name = "syn" version = "1.0.109" @@ -1910,9 +1991,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.23" +version = "2.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" dependencies = [ "proc-macro2", "quote", @@ -1937,6 +2018,27 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tempfile" version = "3.6.0" @@ -1973,22 +2075,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.41" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c16a64ba9387ef3fdae4f9c1a7f07a0997fce91985c0336f1ddc1822b3b37802" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.41" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d14928354b01c4d6a4f0e549069adef399a284e7995c7ccca94e8a07a5346c59" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.48", ] [[package]] @@ -2019,9 +2121,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.33.0" +version = "1.35.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" +checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" dependencies = [ "backtrace", "bytes", @@ -2030,7 +2132,8 @@ dependencies = [ "num_cpus", "parking_lot", "pin-project-lite", - "socket2 0.5.4", + "signal-hook-registry", + "socket2 0.5.5", "tokio-macros", "windows-sys", ] @@ -2047,13 +2150,13 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.48", ] [[package]] @@ -2066,18 +2169,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-socks" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51165dfa029d2a65969413a6cc96f354b86b464498702f174a4efa13608fd8c0" -dependencies = [ - "either", - "futures-util", - "thiserror", - "tokio", -] - [[package]] name = "tokio-stream" version = "0.1.14" @@ -2155,7 +2246,7 @@ dependencies = [ "indexmap", "pin-project", "pin-project-lite", - "rand 0.8.5", + "rand", "slab", "tokio", "tokio-util", @@ -2196,7 +2287,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.48", ] [[package]] @@ -2215,16 +2306,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" [[package]] -name = "ua_generator" -version = "0.3.5" -source = "git+https://github.com/a11ywatch/ua_generator.git#fdc9657db663b0d542cfc8e6842eedbb79728e2e" +name = "tungstenite" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9" dependencies = [ - "fastrand", - "serde", - "serde_json", - "ureq", + "byteorder", + "bytes", + "data-encoding", + "http", + "httparse", + "log", + "rand", + "sha1", + "thiserror", + "url", + "utf-8", ] +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + [[package]] name = "unicode-bidi" version = "0.3.13" @@ -2252,37 +2357,11 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - -[[package]] -name = "ureq" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9" -dependencies = [ - "base64", - "brotli-decompressor", - "encoding_rs", - "flate2", - "log", - "once_cell", - "rustls", - "rustls-webpki 0.100.1", - "serde", - "serde_json", - "url", - "webpki-roots", -] - [[package]] name = "url" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", "idna", @@ -2316,12 +2395,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - [[package]] name = "wasi" version = "0.10.0+wasi-snapshot-preview1" @@ -2355,7 +2428,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.48", "wasm-bindgen-shared", ] @@ -2389,7 +2462,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.48", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -2402,9 +2475,9 @@ checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "wasm-streams" -version = "0.2.3" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bbae3363c08332cadccd13b67db371814cd214c2524020932f0804b8cf7c078" +checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7" dependencies = [ "futures-util", "js-sys", @@ -2423,56 +2496,27 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki-roots" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338" -dependencies = [ - "rustls-webpki 0.100.1", -] - [[package]] name = "website_crawler" -version = "0.8.12" +version = "0.9.0" dependencies = [ - "ahash", - "case_insensitive_string", "cc", - "compact_str", - "cssparser", - "ego-tree", "env_logger", "failure_derive", - "fast_html5ever", "fs_extra", - "hashbrown 0.13.2", "jemalloc-sys", "jemallocator", "lazy_static", - "log", - "matches", - "num_cpus", "os_info", - "percent-encoding", "prost", "prost-types", "quote", - "regex", - "reqwest", "rustc_version", - "selectors", - "sitemap", - "smallvec", + "spider", "string_concat", - "tendril", "tokio", - "tokio-stream", "tonic", "tonic-build", - "ua_generator", - "url", - "xml-rs", ] [[package]] @@ -2594,11 +2638,22 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "winreg" -version = "0.10.1" +version = "0.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" dependencies = [ - "winapi", + "cfg-if", + "windows-sys", +] + +[[package]] +name = "winreg" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "937f3df7948156640f46aacef17a70db0de5917bda9c92b0f751f3a955b588fc" +dependencies = [ + "cfg-if", + "windows-sys", ] [[package]] @@ -2606,3 +2661,23 @@ name = "xml-rs" version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a56c84a8ccd4258aed21c92f70c0f6dea75356b6892ae27c24139da456f9336" + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] diff --git a/Cargo.toml b/Cargo.toml index b24c49f..c273a58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "website_crawler" -version = "0.8.12" +version = "0.9.0" authors = ["Jeff Mendez "] edition = "2021" description = "gRPC tokio based web crawler" @@ -12,34 +12,14 @@ categories = ["accessibility", "asynchronous"] include = ["src/*", "build.rs", "proto/*", "LICENSE", "README.md"] [dependencies] -tokio = { version = "1.33.0", features = [ "rt-multi-thread", "macros", "sync", "time", "parking_lot" ] } -tokio-stream = "0.1.14" +tokio = { version = "1.35.1", features = [ "rt-multi-thread", "macros", "sync", "time", "parking_lot" ] } tonic = { version = "0.9.2" } prost = "0.11.3" prost-types = "0.11.2" -reqwest = { version = "0.11.18", features = ["deflate", "brotli", "gzip", "native-tls-alpn", "socks", "stream" ] } -url = "2.4.0" -regex = { version = "^1.5.0", optional = true } -hashbrown = { version = "0.13.2" } -log = "0.4.16" lazy_static = "1.4.0" -ua_generator = { git = "https://github.com/a11ywatch/ua_generator.git", version = "0.3.5", optional = true } -percent-encoding = "2.1.0" env_logger = "0.9.0" string_concat = "0.0.1" -sitemap = "0.4.1" -xml-rs = "0.8.4" -compact_str = "0.7.1" -selectors = "0.24.0" -tendril = "0.4.3" -ahash = "0.8.3" -matches = "0.1.10" -cssparser = "0.29.6" -smallvec = "1.10.0" -ego-tree = "0.6.2" -fast_html5ever = "0.26.1" -num_cpus = "1.15.0" -case_insensitive_string = { version = "0.1.6", features = ["compact"] } +spider = { version = "1.80.68", features = ["sync", "control", "sitemap"]} [target.'cfg(all(not(target_os = "android"), not(target_os = "freebsd")))'.dependencies] jemallocator = { version = "0.5.0", optional = true } @@ -60,5 +40,5 @@ os_info = "3" [features] jemalloc = ["jemallocator", "jemalloc-sys"] -regex = ["dep:regex"] -ua_generator = ["dep:ua_generator"] +regex = ["spider/regex"] +chrome = ["spider/chrome"] \ No newline at end of file diff --git a/examples/client.rs b/examples/client.rs index 8d9c051..8d4d22a 100644 --- a/examples/client.rs +++ b/examples/client.rs @@ -10,7 +10,7 @@ pub mod crawler { extern crate lazy_static; use crate::tokio::macros::support::Pin; use tokio::sync::mpsc; -use tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt}; +use spider::tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt}; use tonic::{Request, Response, Status}; pub use website::website_service_server::{WebsiteService, WebsiteServiceServer}; pub use website::{Empty, ScanInitParams, ScanParams, ScanStreamResponse}; @@ -73,7 +73,7 @@ impl WebsiteService for MyWebsiteService { message: req.domain, }); - let mut stream = Box::pin(tokio_stream::iter(repeat)); + let mut stream = Box::pin(spider::tokio_stream::iter(repeat)); let (tx, rx) = mpsc::channel(1); match stream.next().await { diff --git a/src/lib.rs b/src/lib.rs index 05c7b81..c04cf9e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,27 +1,16 @@ -extern crate sitemap; extern crate tokio; extern crate tonic; #[cfg(feature = "ua_generator")] extern crate ua_generator; -// packages mainly for spider -extern crate hashbrown; -extern crate log; -extern crate reqwest; -extern crate url; #[macro_use] extern crate lazy_static; -pub extern crate compact_str; -pub use packages::spider; -#[macro_use] -extern crate fast_html5ever; #[macro_use] extern crate string_concat; // internal packages. pub mod interface; -pub mod packages; pub mod rpc; pub mod scanner; pub use rpc::handlers::grpc_start; diff --git a/src/packages/mod.rs b/src/packages/mod.rs deleted file mode 100644 index b45320c..0000000 --- a/src/packages/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -pub mod robotparser; -pub mod scraper; -pub mod spider; diff --git a/src/packages/robotparser/mod.rs b/src/packages/robotparser/mod.rs deleted file mode 100644 index 67c567f..0000000 --- a/src/packages/robotparser/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub mod parser; diff --git a/src/packages/robotparser/parser.rs b/src/packages/robotparser/parser.rs deleted file mode 100644 index 06dbfce..0000000 --- a/src/packages/robotparser/parser.rs +++ /dev/null @@ -1,457 +0,0 @@ -//! robots.txt parser for Rust. -//! -//! This package initially started from a fork of -//! that has improvements that help our case for speed. -//! -//! The robots.txt Exclusion Protocol is implemented as specified in -//! -//! -//! -//! Add ``extern crate robotparser`` to your crate root and your're good to go! -//! -//! # Examples -//! -//! ```rust,ignore -//! extern crate spider; -//! -//! use spider::packages::robotparser::RobotFileParser; -//! use reqwest::blocking::Client; -//! -//! fn main() { -//! let parser = RobotFileParser::new(); -//! let client = Client::new(); -//! parser.read(&client, &"http://www.python.org/robots.txt"); -//! assert!(parser.can_fetch("*", "http://www.python.org/robots.txt")); -//! } -//! ``` - -use compact_str::CompactString; -use reqwest::Client; -use reqwest::Response; -use reqwest::StatusCode; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; - -/// A rule line is a single "Allow:" (allowance==True) or "Disallow:" -/// (allowance==False) followed by a path.""" -#[derive(Debug, Eq, PartialEq, Clone)] -struct RuleLine { - /// Path of the rule - path: String, - /// Is the rule allowed? - allowance: bool, -} - -#[derive(Debug, Eq, PartialEq, Clone)] -/// Determine the amount of request allowed between navigation or crawls. -pub struct RequestRate { - /// Amount of request allowed within duration - pub requests: usize, - /// Duration in seconds between request - pub seconds: usize, -} - -/// An entry has one or more user-agents and zero or more rulelines -#[derive(Debug, Eq, PartialEq, Clone)] -struct Entry { - /// Multiple user agents to use - useragents: Vec, - /// Rules that should be ignored - rulelines: Vec, - /// Time to wait in between crawls - crawl_delay: Option, - /// The request rate to respect - req_rate: Option, -} - -/// robots.txt file parser -#[derive(Debug, Eq, PartialEq, Clone)] -pub struct RobotFileParser { - /// Entire robots.txt list of urls - entries: Vec, - /// Base entry to list - default_entry: Entry, - /// Dis-allow links reguardless of robots.txt - disallow_all: bool, - /// Allow links reguardless of robots.txt - allow_all: bool, - /// Time last checked robots.txt file - last_checked: i64, -} - -impl RuleLine { - fn new(path: &str, allowance: bool) -> RuleLine { - RuleLine { - path: path.into(), - allowance: path == "" && !allowance || allowance, - } - } - - fn applies_to(&self, filename: &str) -> bool { - self.path == "*" || filename.starts_with(&self.path) - } -} - -impl Entry { - /// Base collection to manage robot.txt data - fn new() -> Entry { - Entry { - useragents: vec![], - rulelines: vec![], - crawl_delay: None, - req_rate: None, - } - } - - /// check if this entry applies to the specified agent - fn applies_to(&self, useragent: &str) -> bool { - let ua = useragent - .split('/') - .nth(0) - .unwrap_or_default() - .to_lowercase(); - for agent in &self.useragents { - if agent == "*" { - return true; - } - if ua.contains(agent) { - return true; - } - } - false - } - - /// Preconditions: - /// - our agent applies to this entry - /// - filename is URL decoded - fn allowance(&self, filename: &str) -> bool { - for line in &self.rulelines { - if line.applies_to(filename) { - return line.allowance; - } - } - true - } - - /// Add to user agent list - fn push_useragent(&mut self, useragent: &str) { - self.useragents.push(useragent.to_lowercase()); - } - - /// Add rule to list - fn push_ruleline(&mut self, ruleline: RuleLine) { - self.rulelines.push(ruleline); - } - - /// Determine if user agent exist - fn has_useragent(&self) -> bool { - self.useragents.iter().any(|a| a == "*") - } - - /// Is the user-agent list empty? - fn is_empty(&self) -> bool { - self.useragents.is_empty() && self.rulelines.is_empty() - } - - /// Set the crawl delay for the website - fn set_crawl_delay(&mut self, delay: Duration) { - self.crawl_delay = Some(delay); - } - - /// Determine the crawl delay for the website - fn get_crawl_delay(&self) -> Option { - self.crawl_delay - } - - /// Establish request rates between robots.txt crawling sitemaps - fn set_req_rate(&mut self, req_rate: RequestRate) { - self.req_rate = Some(req_rate); - } - - /// Determine the limit allowed between request before being limited. - fn get_req_rate(&self) -> Option { - self.req_rate.clone() - } -} - -impl Default for Entry { - fn default() -> Entry { - Entry::new() - } -} - -impl RobotFileParser { - /// Establish a new robotparser for a website domain - pub fn new() -> Box { - RobotFileParser { - entries: vec![], - default_entry: Entry::new(), - disallow_all: false, - allow_all: false, - last_checked: 0i64, - } - .into() - } - - /// Returns the time the robots.txt file was last fetched. - /// - /// This is useful for long-running web spiders that need to - /// check for new robots.txt files periodically. - pub fn mtime(&self) -> i64 { - self.last_checked - } - - /// Sets the time the robots.txt file was last fetched to the - /// current time. - pub fn modified(&mut self) { - let now = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_secs() as i64; - self.last_checked = now; - } - - /// Reads the robots.txt URL and feeds it to the parser. - pub async fn read(&mut self, client: &Client, url: &str) { - self.modified(); - - let request = client.get(&string_concat!(url, "robots.txt")); - - let res = match request.send().await { - Ok(res) => res, - Err(_) => { - return; - } - }; - let status = res.status(); - match status { - StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { - self.disallow_all = true; - } - status - if status >= StatusCode::BAD_REQUEST - && status < StatusCode::INTERNAL_SERVER_ERROR => - { - self.allow_all = true; - } - StatusCode::OK => self.from_response(res).await, - _ => {} - } - } - - /// Reads the HTTP response and feeds it to the parser. - pub async fn from_response(&mut self, response: Response) { - let buf = response.text().await.unwrap(); - let lines: Vec<&str> = buf.split('\n').collect(); - self.parse(&lines); - } - - fn _add_entry(&mut self, entry: Entry) { - if entry.has_useragent() { - // the default entry is considered last - if self.default_entry.is_empty() { - // the first default entry wins - self.default_entry = entry; - } - } else { - self.entries.push(entry); - } - } - - /// - /// Parse the input lines from a robots.txt file - /// - /// We allow that a user-agent: line is not preceded by - /// one or more blank lines. - /// - pub fn parse>(&mut self, lines: &[T]) { - use percent_encoding::percent_decode; - - // states: - // 0: start state - // 1: saw user-agent line - // 2: saw an allow or disallow line - let mut state = 0; - let mut entry = Entry::new(); - - for line in lines { - let mut ln = line.as_ref(); - if ln.is_empty() { - match state { - 1 => { - entry = Entry::new(); - state = 0; - } - 2 => { - self._add_entry(entry); - entry = Entry::new(); - state = 0; - } - _ => {} - } - } - // remove optional comment and strip line - if let Some(i) = ln.find('#') { - ln = &ln[0..i]; - } - ln = ln.trim(); - if ln.is_empty() { - continue; - } - let parts: Vec<&str> = ln.splitn(2, ':').collect(); - if parts.len() == 2 { - let part0 = parts[0].trim().to_lowercase(); - let part1 = String::from_utf8(percent_decode(parts[1].trim().as_bytes()).collect()) - .unwrap_or_default(); - match part0 { - ref x if x.to_lowercase() == "user-agent" => { - if state == 2 { - self._add_entry(entry); - entry = Entry::new(); - } - entry.push_useragent(&part1); - state = 1; - } - ref x if x.to_lowercase() == "disallow" => { - if state != 0 { - entry.push_ruleline(RuleLine::new(&part1, false)); - state = 2; - } - } - ref x if x.to_lowercase() == "allow" => { - if state != 0 { - entry.push_ruleline(RuleLine::new(&part1, true)); - state = 2; - } - } - ref x if x.to_lowercase() == "crawl-delay" => { - if state != 0 { - if let Ok(delay) = part1.parse::() { - let delay_seconds = delay.trunc(); - let delay_nanoseconds = delay.fract() * 10f64.powi(9); - let delay = - Duration::new(delay_seconds as u64, delay_nanoseconds as u32); - - entry.set_crawl_delay(delay); - } - state = 2; - } - } - ref x if x.to_lowercase() == "sitemap" => { - if state != 0 { - state = 2; - } - } - ref x if x.to_lowercase() == "request-rate" => { - if state != 0 { - let numbers: Vec> = - part1.split('/').map(|x| x.parse::()).collect(); - if numbers.len() == 2 && numbers[0].is_ok() && numbers[1].is_ok() { - let req_rate = RequestRate { - requests: numbers[0].clone().unwrap(), - seconds: numbers[1].clone().unwrap(), - }; - entry.set_req_rate(req_rate); - } - state = 2; - } - } - _ => {} - } - } - } - if state == 2 { - self._add_entry(entry); - } - } - - /// Using the parsed robots.txt decide if useragent can fetch url - pub fn can_fetch>(&self, useragent: T, url: &str) -> bool { - use percent_encoding::percent_decode; - - let useragent = useragent.as_ref(); - - if self.disallow_all { - return false; - } - if self.allow_all { - return true; - } - // Until the robots.txt file has been read or found not - // to exist, we must assume that no url is allowable. - // This prevents false positives when a user erronenously - // calls can_fetch() before calling read(). - if self.last_checked == 0 { - return false; - } - // search for given user agent matches - // the first match counts - let decoded_url = - String::from_utf8(percent_decode(url.trim().as_bytes()).collect()).unwrap_or_default(); - - let url_str = match decoded_url { - ref u if !u.is_empty() => u, - _ => "/", - }; - - for entry in &self.entries { - if entry.applies_to(useragent) { - return entry.allowance(&url_str); - } - } - - // try the default entry last - let default_entry = &self.default_entry; - - if !default_entry.is_empty() { - return default_entry.allowance(&url_str); - } - // agent not found ==> access granted - true - } - - /// Returns the crawl delay for this user agent as a `Duration`, or None if no crawl delay is defined. - pub fn get_crawl_delay(&self, useragent: &Option>) -> Option { - if self.last_checked == 0 { - None - } else { - let useragent = useragent.as_ref(); - let crawl_delay: Option = match useragent { - Some(ua) => { - for entry in &self.entries { - if entry.applies_to(ua) { - return entry.get_crawl_delay(); - } - } - None - } - _ => None, - }; - - if crawl_delay.is_some() { - crawl_delay - } else { - let default_entry = &self.default_entry; - - if !default_entry.is_empty() { - return default_entry.get_crawl_delay(); - } - - None - } - } - } - - /// Returns the request rate for this user agent as a `RequestRate`, or None if not request rate is defined - pub fn get_req_rate>(&self, useragent: T) -> Option { - let useragent = useragent.as_ref(); - if self.last_checked == 0 { - return None; - } - for entry in &self.entries { - if entry.applies_to(useragent) { - return entry.get_req_rate(); - } - } - None - } -} diff --git a/src/packages/scraper/element_ref/element.rs b/src/packages/scraper/element_ref/element.rs deleted file mode 100644 index ae141e0..0000000 --- a/src/packages/scraper/element_ref/element.rs +++ /dev/null @@ -1,206 +0,0 @@ -use fast_html5ever::Namespace; -use selectors::attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint}; -use selectors::matching; -use selectors::{Element, OpaqueElement}; - -use super::super::selector::{CssLocalName, CssString, NonTSPseudoClass, PseudoElement, Simple}; -use super::ElementRef; - -/// Note: will never match against non-tree-structure pseudo-classes. -impl<'a> Element for ElementRef<'a> { - type Impl = Simple; - - fn opaque(&self) -> OpaqueElement { - OpaqueElement::new(self.node.value()) - } - - fn parent_element(&self) -> Option { - self.parent().and_then(ElementRef::wrap) - } - - fn parent_node_is_shadow_root(&self) -> bool { - false - } - - fn containing_shadow_host(&self) -> Option { - None - } - - fn is_pseudo_element(&self) -> bool { - false - } - - fn is_part(&self, _name: &CssLocalName) -> bool { - false - } - - fn is_same_type(&self, other: &Self) -> bool { - self.value().name == other.value().name - } - - fn imported_part(&self, _: &CssLocalName) -> Option { - None - } - - fn prev_sibling_element(&self) -> Option { - self.prev_siblings() - .find(|sibling| sibling.value().is_element()) - .map(ElementRef::new) - } - - fn next_sibling_element(&self) -> Option { - self.next_siblings() - .find(|sibling| sibling.value().is_element()) - .map(ElementRef::new) - } - - fn is_html_element_in_html_document(&self) -> bool { - // FIXME: Is there more to this? - self.value().name.ns == ns!(html) - } - - fn has_local_name(&self, name: &CssLocalName) -> bool { - self.value().name.local == name.0 - } - - fn has_namespace(&self, namespace: &Namespace) -> bool { - &self.value().name.ns == namespace - } - - fn attr_matches( - &self, - ns: &NamespaceConstraint<&Namespace>, - local_name: &CssLocalName, - operation: &AttrSelectorOperation<&CssString>, - ) -> bool { - self.value().attrs.iter().any(|(key, value)| { - !matches!(*ns, NamespaceConstraint::Specific(url) if *url != key.ns) - && local_name.0 == key.local - && operation.eval_str(value) - }) - } - - fn match_non_ts_pseudo_class( - &self, - _pc: &NonTSPseudoClass, - _context: &mut matching::MatchingContext, - _flags_setter: &mut F, - ) -> bool { - false - } - - fn match_pseudo_element( - &self, - _pe: &PseudoElement, - _context: &mut matching::MatchingContext, - ) -> bool { - false - } - - fn is_link(&self) -> bool { - self.value().name() == "link" - } - - fn is_html_slot_element(&self) -> bool { - true - } - - fn has_id(&self, id: &CssLocalName, case_sensitivity: CaseSensitivity) -> bool { - match self.value().id { - Some(ref val) => case_sensitivity.eq(id.0.as_bytes(), val.as_bytes()), - None => false, - } - } - - fn has_class(&self, name: &CssLocalName, case_sensitivity: CaseSensitivity) -> bool { - self.value().has_class(&name.0, case_sensitivity) - } - - fn is_empty(&self) -> bool { - !self - .children() - .any(|child| child.value().is_element() || child.value().is_text()) - } - - fn is_root(&self) -> bool { - self.parent() - .map_or(false, |parent| parent.value().is_document()) - } -} - -#[cfg(test)] -mod tests { - use crate::packages::scraper::html::Html; - use crate::packages::scraper::selector::{CssLocalName, Selector}; - use selectors::attr::CaseSensitivity; - use selectors::Element; - - #[test] - fn test_has_id() { - let html = ""; - let fragment = Html::parse_fragment(html); - let sel = Selector::parse("p").unwrap(); - - let element = fragment.select(&sel).next().unwrap(); - assert_eq!( - true, - element.has_id( - &CssLocalName::from("link_id_456"), - CaseSensitivity::CaseSensitive - ) - ); - - let html = "

hey there

"; - let fragment = Html::parse_fragment(html); - let element = fragment.select(&sel).next().unwrap(); - assert_eq!( - false, - element.has_id( - &CssLocalName::from("any_link_id"), - CaseSensitivity::CaseSensitive - ) - ); - } - - #[test] - fn test_is_link() { - let html = ""; - let fragment = Html::parse_fragment(html); - let sel = Selector::parse("link").unwrap(); - let element = fragment.select(&sel).next().unwrap(); - assert_eq!(true, element.is_link()); - - let html = "

hey there

"; - let fragment = Html::parse_fragment(html); - let sel = Selector::parse("p").unwrap(); - let element = fragment.select(&sel).next().unwrap(); - assert_eq!(false, element.is_link()); - } - - #[test] - fn test_has_class() { - let html = "

hey there

"; - let fragment = Html::parse_fragment(html); - let sel = Selector::parse("p").unwrap(); - let element = fragment.select(&sel).next().unwrap(); - assert_eq!( - true, - element.has_class( - &CssLocalName::from("my_class"), - CaseSensitivity::CaseSensitive - ) - ); - - let html = "

hey there

"; - let fragment = Html::parse_fragment(html); - let sel = Selector::parse("p").unwrap(); - let element = fragment.select(&sel).next().unwrap(); - assert_eq!( - false, - element.has_class( - &CssLocalName::from("my_class"), - CaseSensitivity::CaseSensitive - ) - ); - } -} diff --git a/src/packages/scraper/element_ref/mod.rs b/src/packages/scraper/element_ref/mod.rs deleted file mode 100644 index 76819dc..0000000 --- a/src/packages/scraper/element_ref/mod.rs +++ /dev/null @@ -1,162 +0,0 @@ -//! Element references. - -use std::ops::Deref; - -use ego_tree::iter::{Edge, Traverse}; -use ego_tree::NodeRef; -use fast_html5ever::serialize::{serialize, SerializeOpts, TraversalScope}; - -use crate::packages::scraper::node::Element; -use crate::packages::scraper::node::Node; -use crate::packages::scraper::selector::Selector; - -/// Wrapper around a reference to an element node. -/// -/// This wrapper implements the `Element` trait from the `selectors` crate, which allows it to be -/// matched against CSS selectors. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct ElementRef<'a> { - node: NodeRef<'a, Node>, -} - -impl<'a> ElementRef<'a> { - fn new(node: NodeRef<'a, Node>) -> Self { - ElementRef { node } - } - - /// Wraps a `NodeRef` only if it references a `Node::Element`. - pub fn wrap(node: NodeRef<'a, Node>) -> Option { - if node.value().is_element() { - Some(ElementRef::new(node)) - } else { - None - } - } - - /// Returns the `Element` referenced by `self`. - pub fn value(&self) -> &'a Element { - self.node.value().as_element().unwrap() - } - - /// Returns an iterator over descendent elements matching a selector. - pub fn select<'b>(&self, selector: &'b Selector) -> Select<'a, 'b> { - let mut inner = self.traverse(); - inner.next(); // Skip Edge::Open(self). - - Select { - scope: *self, - inner, - selector, - } - } - - fn serialize(&self, traversal_scope: TraversalScope) -> String { - let opts = SerializeOpts { - scripting_enabled: false, // It's not clear what this does. - traversal_scope, - create_missing_parent: false, - }; - let mut buf = Vec::new(); - serialize(&mut buf, self, opts).unwrap(); - String::from_utf8(buf).unwrap() - } - - /// Returns the HTML of this element. - pub fn html(&self) -> String { - self.serialize(TraversalScope::IncludeNode) - } - - /// Returns the inner HTML of this element. - pub fn inner_html(&self) -> String { - self.serialize(TraversalScope::ChildrenOnly(None)) - } - - /// Returns an iterator over descendent text nodes. - pub fn text(&self) -> Text<'a> { - Text { - inner: self.traverse(), - } - } -} - -impl<'a> Deref for ElementRef<'a> { - type Target = NodeRef<'a, Node>; - fn deref(&self) -> &NodeRef<'a, Node> { - &self.node - } -} - -/// Iterator over descendent elements matching a selector. -#[derive(Debug, Clone)] -pub struct Select<'a, 'b> { - scope: ElementRef<'a>, - inner: Traverse<'a, Node>, - selector: &'b Selector, -} - -impl<'a, 'b> Iterator for Select<'a, 'b> { - type Item = ElementRef<'a>; - - fn next(&mut self) -> Option> { - for edge in &mut self.inner { - if let Edge::Open(node) = edge { - if let Some(element) = ElementRef::wrap(node) { - if self.selector.matches_with_scope(&element, Some(self.scope)) { - return Some(element); - } - } - } - } - None - } -} - -/// Iterator over descendent text nodes. -#[derive(Debug, Clone)] -pub struct Text<'a> { - inner: Traverse<'a, Node>, -} - -impl<'a> Iterator for Text<'a> { - type Item = &'a str; - - fn next(&mut self) -> Option<&'a str> { - for edge in &mut self.inner { - if let Edge::Open(node) = edge { - if let Node::Text(ref text) = node.value() { - return Some(&**text); - } - } - } - None - } -} - -mod element; -mod serializable; - -#[cfg(test)] -mod tests { - use crate::packages::scraper::html::Html; - use crate::packages::scraper::selector::Selector; - - #[test] - fn test_scope() { - let html = r" -
- 1 - - 2 - 3 - -
- "; - let fragment = Html::parse_fragment(html); - let sel1 = Selector::parse("div > span").unwrap(); - let sel2 = Selector::parse(":scope > b").unwrap(); - - let element1 = fragment.select(&sel1).next().unwrap(); - let element2 = element1.select(&sel2).next().unwrap(); - assert_eq!(element2.inner_html(), "3"); - } -} diff --git a/src/packages/scraper/element_ref/serializable.rs b/src/packages/scraper/element_ref/serializable.rs deleted file mode 100644 index fc5d37d..0000000 --- a/src/packages/scraper/element_ref/serializable.rs +++ /dev/null @@ -1,15 +0,0 @@ -use std::io::Error; - -use fast_html5ever::serialize::{Serialize, Serializer, TraversalScope}; - -use super::ElementRef; - -impl<'a> Serialize for ElementRef<'a> { - fn serialize( - &self, - serializer: &mut S, - traversal_scope: TraversalScope, - ) -> Result<(), Error> { - super::super::node::serializable::serialize(**self, serializer, traversal_scope) - } -} diff --git a/src/packages/scraper/error.rs b/src/packages/scraper/error.rs deleted file mode 100644 index 2bdbd9f..0000000 --- a/src/packages/scraper/error.rs +++ /dev/null @@ -1,70 +0,0 @@ -//! Custom error types for diagnostics -//! Includes re-exported error types from dependencies - -use cssparser::{BasicParseErrorKind, ParseErrorKind, Token}; -use selectors::parser::SelectorParseErrorKind; - -/// Error type that is returned when calling `Selector::parse` -#[derive(Debug, Clone)] -pub enum SelectorErrorKind<'a> { - /// A `Token` was not expected - UnexpectedToken(Token<'a>), - - /// End-Of-Line was unexpected - EndOfLine, - - /// `@` rule is invalid - InvalidAtRule(String), - - /// The body of an `@` rule is invalid - InvalidAtRuleBody, - - /// The qualified rule is invalid - QualRuleInvalid, - - /// Expected a `::` for a pseudoelement - ExpectedColonOnPseudoElement(Token<'a>), - - /// Expected an identity for a pseudoelement - ExpectedIdentityOnPseudoElement(Token<'a>), - - /// A `SelectorParseErrorKind` error that isn't really supposed to happen did - UnexpectedSelectorParseError(SelectorParseErrorKind<'a>), -} - -impl<'a> From>> for SelectorErrorKind<'a> { - fn from(original: cssparser::ParseError<'a, SelectorParseErrorKind<'a>>) -> Self { - // NOTE: This could be improved, but I dont - // exactly know how - match original.kind { - ParseErrorKind::Basic(err) => SelectorErrorKind::from(err), - ParseErrorKind::Custom(err) => SelectorErrorKind::from(err), - } - } -} - -impl<'a> From> for SelectorErrorKind<'a> { - fn from(err: BasicParseErrorKind<'a>) -> Self { - match err { - BasicParseErrorKind::UnexpectedToken(token) => Self::UnexpectedToken(token), - BasicParseErrorKind::EndOfInput => Self::EndOfLine, - BasicParseErrorKind::AtRuleInvalid(rule) => Self::InvalidAtRule(rule.to_string()), - BasicParseErrorKind::AtRuleBodyInvalid => Self::InvalidAtRuleBody, - BasicParseErrorKind::QualifiedRuleInvalid => Self::QualRuleInvalid, - } - } -} - -impl<'a> From> for SelectorErrorKind<'a> { - fn from(err: SelectorParseErrorKind<'a>) -> Self { - match err { - SelectorParseErrorKind::PseudoElementExpectedColon(token) => { - Self::ExpectedColonOnPseudoElement(token) - } - SelectorParseErrorKind::PseudoElementExpectedIdent(token) => { - Self::ExpectedIdentityOnPseudoElement(token) - } - other => Self::UnexpectedSelectorParseError(other), - } - } -} diff --git a/src/packages/scraper/html/mod.rs b/src/packages/scraper/html/mod.rs deleted file mode 100644 index 0ffc133..0000000 --- a/src/packages/scraper/html/mod.rs +++ /dev/null @@ -1,207 +0,0 @@ -//! HTML documents and fragments. - -use std::borrow::Cow; - -use ego_tree::iter::Nodes; -use ego_tree::Tree; -use fast_html5ever::serialize::SerializeOpts; -use fast_html5ever::tree_builder::QuirksMode; -use fast_html5ever::QualName; -use fast_html5ever::{driver, serialize}; -use tendril::TendrilSink; - -use crate::packages::scraper::element_ref::ElementRef; -use crate::packages::scraper::node::Node; -use crate::packages::scraper::selector::Selector; - -/// An HTML tree. -/// -/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the -/// `errors` field. The `tree` will still be populated as best as possible. -/// -/// Implements the `TreeSink` trait from the `fast_html5ever` crate, which allows HTML to be parsed. -#[derive(Debug, Clone)] -pub struct Html { - /// Parse errors. - pub errors: Vec>, - - /// The quirks mode. - pub quirks_mode: QuirksMode, - - /// The node tree. - pub tree: Tree, -} - -impl Html { - /// Creates an empty HTML document. - pub fn new_document() -> Self { - Html { - errors: Vec::new(), - quirks_mode: QuirksMode::NoQuirks, - tree: Tree::new(Node::Document), - } - } - - /// Creates an empty HTML fragment. - pub fn new_fragment() -> Self { - Html { - errors: Vec::new(), - quirks_mode: QuirksMode::NoQuirks, - tree: Tree::new(Node::Fragment), - } - } - - /// Parses a string of HTML as a document. - /// - /// This is a convenience method for the following: - /// - /// ``` - /// # extern crate fast_html5ever; - /// # extern crate tendril; - /// # fn main() { - /// # let document = ""; - /// use fast_html5ever::driver::{self, ParseOpts}; - /// use crate::website_crawler::packages::scraper::Html; - /// use tendril::TendrilSink; - /// - /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default()); - /// let html = parser.one(document); - /// # } - /// ``` - pub fn parse_document(document: &str) -> Self { - let parser = driver::parse_document(Self::new_document(), Default::default()); - parser.one(document) - } - - /// Parses a string of HTML as a fragment. - pub fn parse_fragment(fragment: &str) -> Self { - let parser = driver::parse_fragment( - Self::new_fragment(), - Default::default(), - QualName::new(None, ns!(html), local_name!("body")), - Vec::new(), - ); - parser.one(fragment) - } - - /// Returns an iterator over elements matching a selector. - pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> { - Select { - inner: self.tree.nodes(), - selector, - } - } - - /// Returns the root `` element. - pub fn root_element(&self) -> ElementRef { - let root_node = self - .tree - .root() - .children() - .find(|child| child.value().is_element()) - .expect("html node missing"); - ElementRef::wrap(root_node).unwrap() - } - - /// Serialize entire document into HTML. - pub fn html(&self) -> String { - let opts = SerializeOpts { - scripting_enabled: false, // It's not clear what this does. - traversal_scope: fast_html5ever::serialize::TraversalScope::IncludeNode, - create_missing_parent: false, - }; - let mut buf = Vec::new(); - serialize(&mut buf, self, opts).unwrap(); - String::from_utf8(buf).unwrap() - } -} - -/// Iterator over elements matching a selector. -#[derive(Debug)] -pub struct Select<'a, 'b> { - inner: Nodes<'a, Node>, - selector: &'b Selector, -} - -impl<'a, 'b> Iterator for Select<'a, 'b> { - type Item = ElementRef<'a>; - - fn next(&mut self) -> Option> { - for node in self.inner.by_ref() { - if let Some(element) = ElementRef::wrap(node) { - if element.parent().is_some() && self.selector.matches(&element) { - return Some(element); - } - } - } - None - } -} - -impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> { - fn next_back(&mut self) -> Option { - for node in self.inner.by_ref().rev() { - if let Some(element) = ElementRef::wrap(node) { - if element.parent().is_some() && self.selector.matches(&element) { - return Some(element); - } - } - } - None - } -} - -mod serializable; -mod tree_sink; - -#[cfg(test)] -mod tests { - use super::Html; - use super::Selector; - - #[test] - fn root_element_fragment() { - let html = Html::parse_fragment(r#"1"#); - let root_ref = html.root_element(); - let href = root_ref - .select(&Selector::parse("a").unwrap()) - .next() - .unwrap(); - assert_eq!(href.inner_html(), "1"); - assert_eq!(href.value().attr("href").unwrap(), "http://github.com"); - } - - #[test] - fn root_element_document_doctype() { - let html = Html::parse_document("\nabc"); - let root_ref = html.root_element(); - let title = root_ref - .select(&Selector::parse("title").unwrap()) - .next() - .unwrap(); - assert_eq!(title.inner_html(), "abc"); - } - - #[test] - fn root_element_document_comment() { - let html = Html::parse_document("abc"); - let root_ref = html.root_element(); - let title = root_ref - .select(&Selector::parse("title").unwrap()) - .next() - .unwrap(); - assert_eq!(title.inner_html(), "abc"); - } - - #[test] - fn select_is_reversible() { - let html = Html::parse_document("

element1

element2

element3

"); - let selector = Selector::parse("p").unwrap(); - let result: Vec<_> = html - .select(&selector) - .rev() - .map(|e| e.inner_html()) - .collect(); - assert_eq!(result, vec!["element3", "element2", "element1"]); - } -} diff --git a/src/packages/scraper/html/serializable.rs b/src/packages/scraper/html/serializable.rs deleted file mode 100644 index 2cd60f7..0000000 --- a/src/packages/scraper/html/serializable.rs +++ /dev/null @@ -1,27 +0,0 @@ -use std::io::Error; - -use fast_html5ever::serialize::{Serialize, Serializer, TraversalScope}; - -use super::Html; - -impl Serialize for Html { - fn serialize( - &self, - serializer: &mut S, - traversal_scope: TraversalScope, - ) -> Result<(), Error> { - super::super::node::serializable::serialize(self.tree.root(), serializer, traversal_scope) - } -} - -#[cfg(test)] -mod tests { - use super::Html; - - #[test] - fn test_serialize() { - let src = r#"

Hello world!

"#; - let html = Html::parse_document(src); - assert_eq!(html.html(), src); - } -} diff --git a/src/packages/scraper/html/tree_sink.rs b/src/packages/scraper/html/tree_sink.rs deleted file mode 100644 index 9e64b79..0000000 --- a/src/packages/scraper/html/tree_sink.rs +++ /dev/null @@ -1,229 +0,0 @@ -use super::Html; -use crate::packages::scraper::node::{Doctype, Element, Node, ProcessingInstruction, Text}; -use ego_tree::NodeId; -use fast_html5ever::tendril::StrTendril; -use fast_html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; -use fast_html5ever::Attribute; -use fast_html5ever::{ExpandedName, QualName}; -use std::borrow::Cow; - -/// Note: does not support the `