From 7f68fc144aecfe088783f7bc73dde6a6c55f4ce7 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 27 Feb 2024 09:36:07 -0800 Subject: [PATCH 1/5] Update tokenizers version to match python --- Cargo.lock | 536 ++++++++++++++-------------------------------- router/Cargo.toml | 2 +- 2 files changed, 160 insertions(+), 378 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index df6f5efc0..9c7c0bdb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,17 +17,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "aes" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac1f845298e95f983ff1944b728ae08b8cebab80d684f0a832ed0fc74dfa27e2" -dependencies = [ - "cfg-if", - "cipher", - "cpufeatures", -] - [[package]] name = "ahash" version = "0.8.3" @@ -41,34 +30,24 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "0.7.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" -dependencies = [ - "memchr", -] - -[[package]] -name = "aho-corasick" -version = "1.0.2" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "96b09b5178381e0874812a9b157f7fe84982617e48f71f4e3235482775e5b540" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", "utf8parse", ] @@ -98,12 +77,12 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" dependencies = [ "anstyle", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -148,7 +127,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -159,7 +138,7 @@ checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -272,12 +251,6 @@ version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" -[[package]] -name = "base64ct" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" - [[package]] name = "bitflags" version = "1.3.2" @@ -317,57 +290,11 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" -[[package]] -name = "bzip2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" -dependencies = [ - "bzip2-sys", - "libc", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.11+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - -[[package]] -name = "cached-path" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "097968e38f1319207f057d0f4d76452e4f4f847a5de61c5215379f297fa034f3" -dependencies = [ - "flate2", - "fs2", - "glob", - "indicatif 0.16.2", - "log", - "rand", - "reqwest", - "serde", - "serde_json", - "sha2", - "tar", - "tempfile", - "thiserror", - "zip", -] - [[package]] name = "cc" version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" -dependencies = [ - "jobserver", -] [[package]] name = "cfg-if" @@ -375,56 +302,45 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" -[[package]] -name = "cipher" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" -dependencies = [ - "crypto-common", - "inout", -] - [[package]] name = "clap" -version = "4.3.11" +version = "4.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d" +checksum = "c918d541ef2913577a0f9566e9ce27cb35b6df072075769e0b26cb5a554520da" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.11" +version = "4.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b" +checksum = "9f3e7391dad68afb0c2ede1bf619f579a3dc9c2ec67f089baa397123a2f3d1eb" dependencies = [ "anstream", "anstyle", "clap_lex", - "strsim", + "strsim 0.11.0", ] [[package]] name = "clap_derive" -version = "4.3.2" +version = "4.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" +checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] name = "clap_lex" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "colorchoice" @@ -445,12 +361,6 @@ dependencies = [ "windows-sys 0.45.0", ] -[[package]] -name = "constant_time_eq" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" - [[package]] name = "core-foundation" version = "0.9.3" @@ -568,7 +478,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.10.0", "syn 1.0.109", ] @@ -635,7 +545,6 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", - "subtle", ] [[package]] @@ -708,9 +617,9 @@ dependencies = [ [[package]] name = "esaxx-rs" -version = "0.1.8" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f748b253ceca9fed5f42f8b5ceb3851e93102199bc25b64b65369f76e5c0a35" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" dependencies = [ "cc", ] @@ -724,18 +633,6 @@ dependencies = [ "instant", ] -[[package]] -name = "filetime" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall 0.2.16", - "windows-sys 0.48.0", -] - [[package]] name = "fixedbitset" version = "0.4.2" @@ -801,16 +698,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "futures" version = "0.3.28" @@ -867,7 +754,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -938,12 +825,6 @@ version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" -[[package]] -name = "glob" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" - [[package]] name = "grpc-metadata" version = "0.1.0" @@ -1006,15 +887,6 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" -[[package]] -name = "hmac" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" -dependencies = [ - "digest", -] - [[package]] name = "hostname" version = "0.3.1" @@ -1154,35 +1026,15 @@ dependencies = [ [[package]] name = "indicatif" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4" -dependencies = [ - "console", - "lazy_static", - "number_prefix 0.3.0", - "regex", -] - -[[package]] -name = "indicatif" -version = "0.16.2" +version = "0.17.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b" +checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" dependencies = [ "console", - "lazy_static", - "number_prefix 0.4.0", - "regex", -] - -[[package]] -name = "inout" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5" -dependencies = [ - "generic-array", + "instant", + "number_prefix", + "portable-atomic", + "unicode-width", ] [[package]] @@ -1211,40 +1063,29 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" -[[package]] -name = "is-terminal" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" -dependencies = [ - "hermit-abi", - "rustix 0.38.4", - "windows-sys 0.48.0", -] - [[package]] name = "itertools" -version = "0.8.2" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" dependencies = [ "either", ] [[package]] name = "itertools" -version = "0.9.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" dependencies = [ "either", ] [[package]] name = "itertools" -version = "0.10.5" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" dependencies = [ "either", ] @@ -1255,15 +1096,6 @@ version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62b02a5381cc465bd3041d84623d0fa3b66738b52b8e2fc3bab8ad63ab032f4a" -[[package]] -name = "jobserver" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" -dependencies = [ - "libc", -] - [[package]] name = "js-sys" version = "0.3.64" @@ -1291,12 +1123,6 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" -[[package]] -name = "linux-raw-sys" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" - [[package]] name = "lock_api" version = "0.4.10" @@ -1390,9 +1216,9 @@ dependencies = [ [[package]] name = "macro_rules_attribute" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf0c9b980bf4f3a37fd7b1c066941dd1b1d0152ce6ee6e8fe8c49b9f6810d862" +checksum = "8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13" dependencies = [ "macro_rules_attribute-proc_macro", "paste", @@ -1400,9 +1226,9 @@ dependencies = [ [[package]] name = "macro_rules_attribute-proc_macro" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d" +checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" [[package]] name = "match_cfg" @@ -1486,7 +1312,7 @@ checksum = "ddece26afd34c31585c74a4db0630c376df271c285d682d1e55012197830b6df" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -1548,9 +1374,9 @@ dependencies = [ [[package]] name = "monostate" -version = "0.1.8" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f3f57a8802842f648026a33c3d2e3bb41bb309a35b1609bd7ef2b060b8b6b1b" +checksum = "878c2a1f1c70e5724fa28f101ca787b6a7e8ad5c5e4ae4ca3b0fa4a419fa9075" dependencies = [ "monostate-impl", "serde", @@ -1558,13 +1384,13 @@ dependencies = [ [[package]] name = "monostate-impl" -version = "0.1.8" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e72f4d2e10fde62a0f2fcb4b44ccbf4f9899dcc30c9193449f8dfb9123d71377" +checksum = "f686d68a09079e63b1d2c64aa305095887ce50565f00a922ebfaeeee0d9ba6ce" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -1718,12 +1544,6 @@ dependencies = [ "libc", ] -[[package]] -name = "number_prefix" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a" - [[package]] name = "number_prefix" version = "0.4.0" @@ -1790,7 +1610,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -1967,34 +1787,11 @@ dependencies = [ "windows-targets 0.48.1", ] -[[package]] -name = "password-hash" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" -dependencies = [ - "base64ct", - "rand_core", - "subtle", -] - [[package]] name = "paste" -version = "1.0.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4b27ab7be369122c218afc2079489cdcb4b517c0a3fc386ff11e1fedfcc2b35" - -[[package]] -name = "pbkdf2" -version = "0.11.0" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" -dependencies = [ - "digest", - "hmac", - "password-hash", - "sha2", -] +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "percent-encoding" @@ -2029,7 +1826,7 @@ checksum = "ec2e072ecce94ec471b13398d5402c188e76ac03cf74dd1a975161b23a3f6d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -2098,9 +1895,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.64" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -2177,9 +1974,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.29" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -2225,9 +2022,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" dependencies = [ "either", "rayon-core", @@ -2235,25 +2032,23 @@ dependencies = [ [[package]] name = "rayon-cond" -version = "0.1.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd1259362c9065e5ea39a789ef40b1e3fd934c94beb7b5ab3ac6629d3b5e7cb7" +checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" dependencies = [ "either", - "itertools 0.8.2", + "itertools 0.11.0", "rayon", ] [[package]] name = "rayon-core" -version = "1.11.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ - "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "num_cpus", ] [[package]] @@ -2291,7 +2086,7 @@ version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575" dependencies = [ - "aho-corasick 1.0.2", + "aho-corasick", "memchr", "regex-automata 0.3.3", "regex-syntax 0.7.4", @@ -2312,7 +2107,7 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310" dependencies = [ - "aho-corasick 1.0.2", + "aho-corasick", "memchr", "regex-syntax 0.7.4", ] @@ -2329,6 +2124,12 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + [[package]] name = "reqwest" version = "0.11.18" @@ -2402,7 +2203,7 @@ dependencies = [ "quote", "rust-embed-utils", "shellexpand", - "syn 2.0.25", + "syn 2.0.51", "walkdir", ] @@ -2441,20 +2242,7 @@ dependencies = [ "errno", "io-lifetimes", "libc", - "linux-raw-sys 0.3.8", - "windows-sys 0.48.0", -] - -[[package]] -name = "rustix" -version = "0.38.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5" -dependencies = [ - "bitflags 2.3.3", - "errno", - "libc", - "linux-raw-sys 0.4.3", + "linux-raw-sys", "windows-sys 0.48.0", ] @@ -2571,7 +2359,7 @@ checksum = "389894603bd18c46fa56231694f8d827779c0951a667087194cf9de94ed24682" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -2608,17 +2396,6 @@ dependencies = [ "serde", ] -[[package]] -name = "sha1" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - [[package]] name = "sha2" version = "0.10.7" @@ -2737,10 +2514,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] -name = "subtle" -version = "2.5.0" +name = "strsim" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" [[package]] name = "syn" @@ -2755,9 +2532,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.25" +version = "2.0.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15e3fc8c0c74267e2df136e5e5fb656a464158aa57624053375eb9c8c6e25ae2" +checksum = "6ab617d94515e94ae53b8406c628598680aa0c9587474ecbe58188f7b345d66c" dependencies = [ "proc-macro2", "quote", @@ -2784,17 +2561,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "tar" -version = "0.4.39" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec96d2ffad078296368d46ff1cb309be1c23c513b4ab0e22a45de0185275ac96" -dependencies = [ - "filetime", - "libc", - "xattr", -] - [[package]] name = "tempfile" version = "3.6.0" @@ -2805,28 +2571,28 @@ dependencies = [ "cfg-if", "fastrand", "redox_syscall 0.3.5", - "rustix 0.37.23", + "rustix", "windows-sys 0.48.0", ] [[package]] name = "thiserror" -version = "1.0.43" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a35fc5b8971143ca348fa6df4f024d4d55264f3468c71ad1c2f365b0a4d58c42" +checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.43" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f" +checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -2885,19 +2651,17 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokenizers" -version = "0.13.4" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aea68938177975ab09da68552b720eac941779ff386baceaf77e0f5f9cea645f" +checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d" dependencies = [ - "aho-corasick 0.7.20", - "cached-path", + "aho-corasick", "clap", "derive_builder", - "dirs", "esaxx-rs", "getrandom", - "indicatif 0.15.0", - "itertools 0.9.0", + "indicatif", + "itertools 0.12.1", "lazy_static", "log", "macro_rules_attribute", @@ -2908,8 +2672,7 @@ dependencies = [ "rayon", "rayon-cond", "regex", - "regex-syntax 0.7.4", - "reqwest", + "regex-syntax 0.8.2", "serde", "serde_json", "spm_precompiled", @@ -2957,7 +2720,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -3170,7 +2933,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -3383,7 +3146,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.25", + "syn 2.0.51", ] [[package]] @@ -3479,7 +3242,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", "wasm-bindgen-shared", ] @@ -3513,7 +3276,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.51", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3604,6 +3367,15 @@ dependencies = [ "windows-targets 0.48.1", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.3", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -3634,6 +3406,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d380ba1dc7187569a8a9e91ed34b8ccfc33123bbacb8c0aed2d1ad7f3ef2dc5f" +dependencies = [ + "windows_aarch64_gnullvm 0.52.3", + "windows_aarch64_msvc 0.52.3", + "windows_i686_gnu 0.52.3", + "windows_i686_msvc 0.52.3", + "windows_x86_64_gnu 0.52.3", + "windows_x86_64_gnullvm 0.52.3", + "windows_x86_64_msvc 0.52.3", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -3646,6 +3433,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68e5dcfb9413f53afd9c8f86e56a7b4d86d9a2fa26090ea2dc9e40fba56c6ec6" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -3658,6 +3451,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8dab469ebbc45798319e69eebf92308e541ce46760b49b18c6b3fe5e8965b30f" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -3670,6 +3469,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a4e9b6a7cac734a8b4138a4e1044eac3404d8326b6c0f939276560687a033fb" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -3682,6 +3487,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28b0ec9c422ca95ff34a78755cfa6ad4a51371da2a5ace67500cf7ca5f232c58" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -3694,6 +3505,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704131571ba93e89d7cd43482277d6632589b18ecf4468f591fbae0a8b101614" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -3706,6 +3523,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42079295511643151e98d61c38c0acc444e52dd42ab456f7ccfd5152e8ecf21c" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -3718,6 +3541,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0770833d60a970638e989b3fa9fd2bb1aaadcf88963d1659fd7d9990196ed2d6" + [[package]] name = "winreg" version = "0.10.1" @@ -3727,61 +3556,14 @@ dependencies = [ "winapi", ] -[[package]] -name = "xattr" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" -dependencies = [ - "libc", -] - [[package]] name = "zip" version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" dependencies = [ - "aes", "byteorder", - "bzip2", - "constant_time_eq", "crc32fast", "crossbeam-utils", "flate2", - "hmac", - "pbkdf2", - "sha1", - "time", - "zstd", -] - -[[package]] -name = "zstd" -version = "0.11.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" -dependencies = [ - "libc", - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" -dependencies = [ - "cc", - "libc", - "pkg-config", ] diff --git a/router/Cargo.toml b/router/Cargo.toml index de69f34b4..1fb609fd3 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -32,7 +32,7 @@ reqwest = { version = "0.11.14", features = [] } serde = "1.0.152" serde_json = { version = "1.0.93", features = ["preserve_order"] } thiserror = "1.0.38" -tokenizers = "0.13.4" +tokenizers = "0.15.0" tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } tower-http = { version = "0.4.0", features = ["cors"] } tracing = "0.1.37" From 5fcda100f07e8697cef21bc0e4d1c36747e5056c Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 27 Feb 2024 10:49:41 -0800 Subject: [PATCH 2/5] Updated tokenizers --- Cargo.lock | 181 ++++++++++++++++++++++++++++++++++++++++----- router/Cargo.toml | 3 +- router/src/main.rs | 79 +++++++++++++++++--- 3 files changed, 232 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9c7c0bdb2..6b5013b28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -104,7 +104,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93b21a03b7c21702a0110f9f8d228763a533570deb376119042dabf33c37a01a" dependencies = [ "futures-io", - "rustls", + "rustls 0.20.8", "webpki", ] @@ -292,9 +292,9 @@ checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" [[package]] name = "cc" -version = "1.0.79" +version = "1.0.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "02f341c093d19155a6e41631ce5971aac4e9a868262212153124c15fa22d1cdc" [[package]] name = "cfg-if" @@ -553,7 +553,16 @@ version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" dependencies = [ - "dirs-sys", + "dirs-sys 0.3.7", +] + +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys 0.4.1", ] [[package]] @@ -567,6 +576,18 @@ dependencies = [ "winapi", ] +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + [[package]] name = "either" version = "1.8.1" @@ -887,6 +908,27 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +[[package]] +name = "hf-hub" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" +dependencies = [ + "dirs 5.0.1", + "futures", + "indicatif", + "log", + "native-tls", + "num_cpus", + "rand", + "reqwest", + "serde", + "serde_json", + "thiserror", + "tokio", + "ureq", +] + [[package]] name = "hostname" version = "0.3.1" @@ -1113,9 +1155,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.147" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "linux-raw-sys" @@ -1181,6 +1223,7 @@ dependencies = [ "clap", "flume", "futures", + "hf-hub", "lorax-client", "metrics", "metrics-exporter-prometheus", @@ -1758,6 +1801,12 @@ dependencies = [ "tokio-stream", ] +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "overload" version = "0.1.1" @@ -2177,11 +2226,26 @@ dependencies = [ "libc", "once_cell", "spin 0.5.2", - "untrusted", + "untrusted 0.7.1", "web-sys", "winapi", ] +[[package]] +name = "ring" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +dependencies = [ + "cc", + "cfg-if", + "getrandom", + "libc", + "spin 0.9.8", + "untrusted 0.9.0", + "windows-sys 0.52.0", +] + [[package]] name = "rust-embed" version = "6.8.1" @@ -2253,11 +2317,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" dependencies = [ "log", - "ring", + "ring 0.16.20", "sct", "webpki", ] +[[package]] +name = "rustls" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +dependencies = [ + "log", + "ring 0.17.8", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + [[package]] name = "rustls-pemfile" version = "1.0.3" @@ -2267,6 +2345,23 @@ dependencies = [ "base64 0.21.2", ] +[[package]] +name = "rustls-pki-types" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" + +[[package]] +name = "rustls-webpki" +version = "0.102.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +dependencies = [ + "ring 0.17.8", + "rustls-pki-types", + "untrusted 0.9.0", +] + [[package]] name = "rustversion" version = "1.0.13" @@ -2309,8 +2404,8 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" dependencies = [ - "ring", - "untrusted", + "ring 0.16.20", + "untrusted 0.7.1", ] [[package]] @@ -2344,18 +2439,18 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "serde" -version = "1.0.171" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30e27d1e4fd7659406c492fd6cfaf2066ba8773de45ca75e855590f856dc34a9" +checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.171" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "389894603bd18c46fa56231694f8d827779c0951a667087194cf9de94ed24682" +checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", @@ -2364,9 +2459,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.102" +version = "1.0.112" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5062a995d481b2308b6064e9af76011f2921c35f97b0468811ed9f6cd91dfed" +checksum = "4d1bd37ce2324cf3bf85e5a25f96eb4baf0d5aa6eba43e7ae8958870c4ec48ed" dependencies = [ "indexmap 2.0.0", "itoa", @@ -2422,7 +2517,7 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ccc8076840c4da029af4f87e4e8daeb0fca6b87bbb02e10cb60b791450e11e4" dependencies = [ - "dirs", + "dirs 4.0.0", ] [[package]] @@ -2519,6 +2614,12 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + [[package]] name = "syn" version = "1.0.109" @@ -2660,6 +2761,7 @@ dependencies = [ "derive_builder", "esaxx-rs", "getrandom", + "hf-hub", "indicatif", "itertools 0.12.1", "lazy_static", @@ -3101,6 +3203,32 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "2.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11f214ce18d8b2cbe84ed3aa6486ed3f5b285cf8d8fbdbce9f3f767a724adc35" +dependencies = [ + "base64 0.21.2", + "flate2", + "log", + "native-tls", + "once_cell", + "rustls 0.22.2", + "rustls-pki-types", + "rustls-webpki", + "serde", + "serde_json", + "url", + "webpki-roots", +] + [[package]] name = "url" version = "2.4.0" @@ -3303,8 +3431,17 @@ version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd" dependencies = [ - "ring", - "untrusted", + "ring 0.16.20", + "untrusted 0.7.1", +] + +[[package]] +name = "webpki-roots" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +dependencies = [ + "rustls-pki-types", ] [[package]] @@ -3556,6 +3693,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "zeroize" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" + [[package]] name = "zip" version = "0.6.6" diff --git a/router/Cargo.toml b/router/Cargo.toml index 1fb609fd3..079582ada 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -22,6 +22,7 @@ lorax-client = { path = "client" } clap = { version = "4.1.4", features = ["derive", "env"] } flume = "0.10.14" futures = "0.3.26" +hf-hub = { version = "0.3.0", features = ["tokio"] } metrics = "0.21.0" metrics-exporter-prometheus = { version = "0.12.1", features = [] } nohash-hasher = "0.2.0" @@ -32,7 +33,7 @@ reqwest = { version = "0.11.14", features = [] } serde = "1.0.152" serde_json = { version = "1.0.93", features = ["preserve_order"] } thiserror = "1.0.38" -tokenizers = "0.15.0" +tokenizers = { version = "0.15.1", features = ["http"] } tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } tower-http = { version = "0.4.0", features = ["cors"] } tracing = "0.1.37" diff --git a/router/src/main.rs b/router/src/main.rs index 7980c0560..9135db22d 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -1,6 +1,8 @@ /// LoRAX webserver entrypoint use axum::http::{HeaderName, HeaderValue}; use clap::Parser; +use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo}; +use hf_hub::{Repo, RepoType}; use lorax_client::{ClientError, ShardedClient}; use lorax_router::{server, HubModelInfo}; use opentelemetry::sdk::propagation::TraceContextPropagator; @@ -9,11 +11,13 @@ use opentelemetry::sdk::trace::Sampler; use opentelemetry::sdk::Resource; use opentelemetry::{global, KeyValue}; use opentelemetry_otlp::WithExportConfig; +use std::fs::File; +use std::io::BufReader; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use std::path::Path; use std::time::Duration; use thiserror::Error; -use tokenizers::{FromPretrainedParameters, Tokenizer}; +use tokenizers::tokenizer::Tokenizer; use tower_http::cors::{AllowCredentials, AllowHeaders, AllowMethods, AllowOrigin, ExposeHeaders}; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::util::SubscriberInitExt; @@ -191,22 +195,50 @@ fn main() -> Result<(), RouterError> { // Parse Huggingface hub token let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok(); - // Tokenizer instance - // This will only be used to validate payloads + // Load tokenizer let local_path = Path::new(&tokenizer_name); let local_model = local_path.exists() && local_path.is_dir(); let tokenizer = if local_model { - // Load local tokenizer Tokenizer::from_file(local_path.join("tokenizer.json")).ok() } else { - // Download and instantiate tokenizer - // We need to download it outside of the Tokio runtime - let params = FromPretrainedParameters { - revision: revision.clone().unwrap_or("main".to_string()), - auth_token: authorization_token.clone(), - ..Default::default() + // Shared API builder initialization + let api_builder = || { + let mut builder = ApiBuilder::new() + .with_progress(false) + .with_token(authorization_token); + + if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") { + builder = builder.with_cache_dir(cache_dir.into()); + } + + builder + }; + + tracing::info!("Using the Hugging Face API"); + let api = match api_builder().build() { + Ok(api) => Some(api), + Err(_) => { + tracing::warn!("Unable to build the Hugging Face API"); + None + } }; - Tokenizer::from_pretrained(tokenizer_name.clone(), Some(params)).ok() + + if api.is_none() { + return Err(RouterError::ArgumentValidation( + "Unable to build the Hugging Face API".to_string(), + )); + } + + let api_repo = api.unwrap().repo(Repo::with_revision( + tokenizer_name.to_string(), + RepoType::Model, + revision.clone().unwrap_or_else(|| "main".to_string()), + )); + + match api_repo.get("tokenizer.json") { + Ok(tokenizer_filename) => Tokenizer::from_file(tokenizer_filename).ok(), + Err(_) => get_base_tokenizer(&api, &api_repo), + } }; // Launch Tokio runtime @@ -442,6 +474,31 @@ pub async fn get_model_info( } } +/// get base tokenizer +pub fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option { + let config_filename = api_repo.get("config.json").ok()?; + + // Open the file in read-only mode with buffer. + let file = File::open(config_filename).ok()?; + let reader = BufReader::new(file); + + // Read the JSON contents of the file as an instance of `User`. + let config: serde_json::Value = serde_json::from_reader(reader).ok()?; + + if let Some(serde_json::Value::String(base_model_id)) = config.get("base_model_name_or_path") { + let api_base_repo = api.repo(Repo::with_revision( + base_model_id.to_string(), + RepoType::Model, + "main".to_string(), + )); + + let tokenizer_filename = api_base_repo.get("tokenizer.json").ok()?; + Tokenizer::from_file(tokenizer_filename).ok() + } else { + None + } +} + #[derive(Debug, Error)] enum RouterError { #[error("Argument validation error: {0}")] From b5725e49a4823965c93dd206306bf836a6cebbf2 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 27 Feb 2024 11:00:10 -0800 Subject: [PATCH 3/5] Refactor async --- router/src/main.rs | 273 ++++++++++++++++++++++----------------------- 1 file changed, 135 insertions(+), 138 deletions(-) diff --git a/router/src/main.rs b/router/src/main.rs index 9135db22d..441cbfc8c 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -85,7 +85,8 @@ struct Args { adapter_source: String, } -fn main() -> Result<(), RouterError> { +#[tokio::main] +async fn main() -> Result<(), RouterError> { // Get args let args = Args::parse(); // Pattern match configuration @@ -205,7 +206,7 @@ fn main() -> Result<(), RouterError> { let api_builder = || { let mut builder = ApiBuilder::new() .with_progress(false) - .with_token(authorization_token); + .with_token(authorization_token.clone()); if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") { builder = builder.with_cache_dir(cache_dir.into()); @@ -215,7 +216,7 @@ fn main() -> Result<(), RouterError> { }; tracing::info!("Using the Hugging Face API"); - let api = match api_builder().build() { + let api_opt = match api_builder().build() { Ok(api) => Some(api), Err(_) => { tracing::warn!("Unable to build the Hugging Face API"); @@ -223,159 +224,155 @@ fn main() -> Result<(), RouterError> { } }; - if api.is_none() { + if api_opt.is_none() { return Err(RouterError::ArgumentValidation( "Unable to build the Hugging Face API".to_string(), )); } - let api_repo = api.unwrap().repo(Repo::with_revision( + let api = api_opt.unwrap(); + let api_repo = api.repo(Repo::with_revision( tokenizer_name.to_string(), RepoType::Model, revision.clone().unwrap_or_else(|| "main".to_string()), )); - match api_repo.get("tokenizer.json") { + match api_repo.get("tokenizer.json").await { Ok(tokenizer_filename) => Tokenizer::from_file(tokenizer_filename).ok(), - Err(_) => get_base_tokenizer(&api, &api_repo), + Err(_) => get_base_tokenizer(&api, &api_repo).await, } }; // Launch Tokio runtime - tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build()? - .block_on(async { - init_logging(otlp_endpoint, json_output); - - if tokenizer.is_none() { - tracing::warn!( - "Could not find a fast tokenizer implementation for {tokenizer_name}" - ); - tracing::warn!("Rust input length validation and truncation is disabled"); - } + init_logging(otlp_endpoint, json_output); + + if tokenizer.is_none() { + tracing::warn!( + "Could not find a fast tokenizer implementation for {tokenizer_name}" + ); + tracing::warn!("Rust input length validation and truncation is disabled"); + } - // Get Model info - let model_info = match local_model { - true => HubModelInfo { - model_id: tokenizer_name.clone(), + // Get Model info + let model_info = match local_model { + true => HubModelInfo { + model_id: tokenizer_name.clone(), + sha: None, + pipeline_tag: None, + }, + false => get_model_info(&tokenizer_name, revision, authorization_token) + .await + .unwrap_or_else(|| { + tracing::warn!("Could not retrieve model info from the Hugging Face hub."); + HubModelInfo { + model_id: tokenizer_name.to_string(), sha: None, pipeline_tag: None, - }, - false => get_model_info(&tokenizer_name, revision, authorization_token) - .await - .unwrap_or_else(|| { - tracing::warn!("Could not retrieve model info from the Hugging Face hub."); - HubModelInfo { - model_id: tokenizer_name.to_string(), - sha: None, - pipeline_tag: None, - } - }), - }; - - // if pipeline-tag == lorax we default to return_full_text = true - let compat_return_full_text = match &model_info.pipeline_tag { - None => { - tracing::warn!("no pipeline tag found for model {tokenizer_name}"); - false } - Some(pipeline_tag) => pipeline_tag.as_str() == "lorax", - }; - - // Instantiate sharded client from the master unix socket - let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path) - .await - .map_err(RouterError::Connection)?; - // Clear the cache; useful if the webserver rebooted - sharded_client - .clear_cache(None) - .await - .map_err(RouterError::Cache)?; - // Get info from the shard - let shard_info = sharded_client.info().await.map_err(RouterError::Info)?; - - // Warmup model - tracing::info!("Warming up model"); - let max_supported_batch_total_tokens = match sharded_client - .warmup(max_input_length as u32, max_batch_prefill_tokens) - .await - .map_err(RouterError::Warmup)? - { - // Older models do not support automatic max-batch-total-tokens - None => { - let max_batch_total_tokens = max_batch_total_tokens.unwrap_or( - 16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)), + }), + }; + + // if pipeline-tag == lorax we default to return_full_text = true + let compat_return_full_text = match &model_info.pipeline_tag { + None => { + tracing::warn!("no pipeline tag found for model {tokenizer_name}"); + false + } + Some(pipeline_tag) => pipeline_tag.as_str() == "lorax", + }; + + // Instantiate sharded client from the master unix socket + let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path) + .await + .map_err(RouterError::Connection)?; + // Clear the cache; useful if the webserver rebooted + sharded_client + .clear_cache(None) + .await + .map_err(RouterError::Cache)?; + // Get info from the shard + let shard_info = sharded_client.info().await.map_err(RouterError::Info)?; + + // Warmup model + tracing::info!("Warming up model"); + let max_supported_batch_total_tokens = match sharded_client + .warmup(max_input_length as u32, max_batch_prefill_tokens) + .await + .map_err(RouterError::Warmup)? + { + // Older models do not support automatic max-batch-total-tokens + None => { + let max_batch_total_tokens = max_batch_total_tokens.unwrap_or( + 16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)), + ); + tracing::warn!("Model does not support automatic max batch total tokens"); + max_batch_total_tokens + } + // Flash attention models return their max supported total tokens + Some(max_supported_batch_total_tokens) => { + // Warn if user added his own max-batch-total-tokens as we will ignore it + let mut max_effective_batch_total_tokens = max_supported_batch_total_tokens; + if max_batch_total_tokens.is_some() { + // Check if manual value is lower than inferred value + let max_batch_total_tokens = max_batch_total_tokens.unwrap(); + if max_batch_total_tokens < max_supported_batch_total_tokens { + max_effective_batch_total_tokens = max_batch_total_tokens; + } else { + tracing::warn!( + "`--max-batch-total-tokens` is deprecated for Flash \ + Attention models." + ); + tracing::warn!( + "Inferred max batch total tokens: {max_supported_batch_total_tokens}" ); - tracing::warn!("Model does not support automatic max batch total tokens"); - max_batch_total_tokens - } - // Flash attention models return their max supported total tokens - Some(max_supported_batch_total_tokens) => { - // Warn if user added his own max-batch-total-tokens as we will ignore it - let mut max_effective_batch_total_tokens = max_supported_batch_total_tokens; - if max_batch_total_tokens.is_some() { - // Check if manual value is lower than inferred value - let max_batch_total_tokens = max_batch_total_tokens.unwrap(); - if max_batch_total_tokens < max_supported_batch_total_tokens { - max_effective_batch_total_tokens = max_batch_total_tokens; - } else { - tracing::warn!( - "`--max-batch-total-tokens` is deprecated for Flash \ - Attention models." - ); - tracing::warn!( - "Inferred max batch total tokens: {max_supported_batch_total_tokens}" - ); - } - } - max_effective_batch_total_tokens - } - }; - tracing::info!("Setting max batch total tokens to {max_supported_batch_total_tokens}"); - tracing::info!("Connected"); - - let addr = match hostname.parse() { - Ok(ip) => SocketAddr::new(ip, port), - Err(_) => { - tracing::warn!("Invalid hostname, defaulting to 0.0.0.0"); - SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), port) } - }; - - // Run server - server::run( - model_info, - shard_info, - compat_return_full_text, - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_input_length, - max_total_tokens, - waiting_served_ratio, - max_batch_prefill_tokens, - max_supported_batch_total_tokens, - max_waiting_tokens, - max_active_adapters, - adapter_cycle_time_s, - sharded_client, - tokenizer, - validation_workers, - addr, - cors_allow_origin, - cors_allow_method, - cors_allow_credentials, - cors_allow_header, - cors_expose_header, - ngrok, - ngrok_authtoken, - ngrok_edge, - adapter_source, - ) - .await?; - Ok(()) - }) + } + max_effective_batch_total_tokens + } + }; + tracing::info!("Setting max batch total tokens to {max_supported_batch_total_tokens}"); + tracing::info!("Connected"); + + let addr = match hostname.parse() { + Ok(ip) => SocketAddr::new(ip, port), + Err(_) => { + tracing::warn!("Invalid hostname, defaulting to 0.0.0.0"); + SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), port) + } + }; + + // Run server + server::run( + model_info, + shard_info, + compat_return_full_text, + max_concurrent_requests, + max_best_of, + max_stop_sequences, + max_input_length, + max_total_tokens, + waiting_served_ratio, + max_batch_prefill_tokens, + max_supported_batch_total_tokens, + max_waiting_tokens, + max_active_adapters, + adapter_cycle_time_s, + sharded_client, + tokenizer, + validation_workers, + addr, + cors_allow_origin, + cors_allow_method, + cors_allow_credentials, + cors_allow_header, + cors_expose_header, + ngrok, + ngrok_authtoken, + ngrok_edge, + adapter_source, + ) + .await?; + Ok(()) } /// Init logging using env variables LOG_LEVEL and LOG_FORMAT: @@ -475,8 +472,8 @@ pub async fn get_model_info( } /// get base tokenizer -pub fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option { - let config_filename = api_repo.get("config.json").ok()?; +pub async fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option { + let config_filename = api_repo.get("config.json").await.ok()?; // Open the file in read-only mode with buffer. let file = File::open(config_filename).ok()?; @@ -492,7 +489,7 @@ pub fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option { "main".to_string(), )); - let tokenizer_filename = api_base_repo.get("tokenizer.json").ok()?; + let tokenizer_filename = api_base_repo.get("tokenizer.json").await.ok()?; Tokenizer::from_file(tokenizer_filename).ok() } else { None From 319f08a3bd0a0add5f0c0788b883aa266b394ff4 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 27 Feb 2024 14:58:10 -0800 Subject: [PATCH 4/5] Fixed logging --- router/src/main.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/router/src/main.rs b/router/src/main.rs index 441cbfc8c..78aae330c 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -121,6 +121,8 @@ async fn main() -> Result<(), RouterError> { adapter_source, } = args; + init_logging(otlp_endpoint, json_output); + // Validate args if max_input_length >= max_total_tokens { return Err(RouterError::ArgumentValidation( @@ -197,9 +199,11 @@ async fn main() -> Result<(), RouterError> { let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok(); // Load tokenizer + tracing::info!("Loading tokenizer {tokenizer_name}"); let local_path = Path::new(&tokenizer_name); let local_model = local_path.exists() && local_path.is_dir(); let tokenizer = if local_model { + tracing::info!("Using local tokenizer: {0}", local_path.to_str().unwrap()); Tokenizer::from_file(local_path.join("tokenizer.json")).ok() } else { // Shared API builder initialization @@ -243,9 +247,6 @@ async fn main() -> Result<(), RouterError> { } }; - // Launch Tokio runtime - init_logging(otlp_endpoint, json_output); - if tokenizer.is_none() { tracing::warn!( "Could not find a fast tokenizer implementation for {tokenizer_name}" From 64e61eb52b8aa72f303030bf7c6f5bf15bb38d11 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 27 Feb 2024 15:05:11 -0800 Subject: [PATCH 5/5] cargo fmt --- router/src/main.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/router/src/main.rs b/router/src/main.rs index 78aae330c..cffafe631 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -248,9 +248,7 @@ async fn main() -> Result<(), RouterError> { }; if tokenizer.is_none() { - tracing::warn!( - "Could not find a fast tokenizer implementation for {tokenizer_name}" - ); + tracing::warn!("Could not find a fast tokenizer implementation for {tokenizer_name}"); tracing::warn!("Rust input length validation and truncation is disabled"); } @@ -303,9 +301,8 @@ async fn main() -> Result<(), RouterError> { { // Older models do not support automatic max-batch-total-tokens None => { - let max_batch_total_tokens = max_batch_total_tokens.unwrap_or( - 16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)), - ); + let max_batch_total_tokens = max_batch_total_tokens + .unwrap_or(16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens))); tracing::warn!("Model does not support automatic max batch total tokens"); max_batch_total_tokens }