Skip to content

Commit

Permalink
Split arch wasm filter code into prompt and llm gateway filters (#190)
Browse files Browse the repository at this point in the history
  • Loading branch information
adilhafeez authored Oct 17, 2024
1 parent 8e54ac2 commit 21e7fe2
Show file tree
Hide file tree
Showing 13 changed files with 684 additions and 2,789 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ repos:
name: cargo-fmt
language: system
types: [file, rust]
entry: bash -c "cd crates/llm_gateway && cargo fmt -- --check"
entry: bash -c "cd crates/llm_gateway && cargo fmt"

- id: cargo-clippy
name: cargo-clippy
Expand Down
26 changes: 2 additions & 24 deletions arch/envoy.template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,34 +45,12 @@ static_resources:
domains:
- "*"
routes:
{% for provider in arch_llm_providers %}
- match:
prefix: "/"
headers:
- name: "x-arch-llm-provider"
string_match:
exact: {{ provider.name }}
route:
auto_host_rewrite: true
cluster: {{ provider.provider }}
timeout: 60s
{% endfor %}
- match:
prefix: "/"
headers:
- name: "x-arch-upstream"
string_match:
exact: arch_llm_listener
route:
auto_host_rewrite: true
cluster: arch_llm_listener
timeout: 60s
- match:
prefix: "/"
direct_response:
status: 400
body:
inline_string: "x-arch-llm-provider or x-arch-upstream header not set, cannot perform routing\n"
http_filters:
- name: envoy.filters.http.wasm
typed_config:
Expand Down Expand Up @@ -232,7 +210,7 @@ static_resources:
direct_response:
status: 400
body:
inline_string: "x-arch-llm-provider header not set, cannot perform routing\n"
inline_string: "x-arch-llm-provider header not set, llm gateway cannot perform routing\n"
http_filters:
- name: envoy.filters.http.wasm
typed_config:
Expand All @@ -250,7 +228,7 @@ static_resources:
runtime: "envoy.wasm.runtime.v8"
code:
local:
filename: "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm"
filename: "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm"
- name: envoy.filters.http.router
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
Expand Down
8 changes: 4 additions & 4 deletions crates/llm_gateway/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
use filter_context::FilterContext;
use llm_filter_context::LlmGatewayFilterContext;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;

mod filter_context;
mod stream_context;
mod llm_filter_context;
mod llm_stream_context;

proxy_wasm::main! {{
proxy_wasm::set_log_level(LogLevel::Trace);
proxy_wasm::set_root_context(|_| -> Box<dyn RootContext> {
Box::new(FilterContext::new())
Box::new(LlmGatewayFilterContext::new())
});
}}
108 changes: 108 additions & 0 deletions crates/llm_gateway/src/llm_filter_context.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
use crate::llm_stream_context::LlmGatewayStreamContext;
use common::configuration::Configuration;
use common::http::Client;
use common::llm_providers::LlmProviders;
use common::ratelimit;
use common::stats::Counter;
use common::stats::Gauge;
use log::debug;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;
use std::cell::RefCell;
use std::collections::HashMap;
use std::rc::Rc;

// Stats exported by the LLM gateway filter. Both handles are registered with
// the Envoy host's stats system through the common::stats wrappers, so copies
// of this struct all refer to the same underlying host-side counters.
#[derive(Copy, Clone, Debug)]
pub struct WasmMetrics {
    // Number of outbound HTTP callouts currently in flight (incremented on
    // dispatch, decremented on response — see the Client impl below).
    pub active_http_calls: Gauge,
    // Total requests rejected by rate limiting.
    pub ratelimited_rq: Counter,
}

impl WasmMetrics {
    /// Register the filter's stats with the host and return the handle set.
    fn new() -> Self {
        let active_http_calls = Gauge::new("active_http_calls".to_string());
        let ratelimited_rq = Counter::new("ratelimited_rq".to_string());
        Self {
            active_http_calls,
            ratelimited_rq,
        }
    }
}

// Per-callout context stored in the callouts map while an HTTP callout is in
// flight. Currently carries no data — it exists so the common::http::Client
// trait has a concrete CallContext type for this filter.
#[derive(Debug)]
pub struct FilterCallContext {}

// Root (per-VM) context for the LLM gateway wasm filter. Owns state shared by
// every per-request stream context: metrics and the configured LLM providers.
#[derive(Debug)]
pub struct LlmGatewayFilterContext {
    // Shared with each LlmGatewayStreamContext via Rc::clone.
    metrics: Rc<WasmMetrics>,
    // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request.
    callouts: RefCell<HashMap<u32, FilterCallContext>>,
    // Populated in on_configure from the plugin config; None until then.
    llm_providers: Option<Rc<LlmProviders>>,
}

impl LlmGatewayFilterContext {
    /// Build a root context with freshly registered metrics, an empty callout
    /// table, and no providers yet (providers arrive in `on_configure`).
    pub fn new() -> Self {
        Self {
            metrics: Rc::new(WasmMetrics::new()),
            callouts: RefCell::new(HashMap::new()),
            llm_providers: None,
        }
    }
}

// Wires the root context into the shared HTTP-callout helper: the Client trait
// (from common::http) drives outbound callouts and bookkeeping through these
// two accessors.
impl Client for LlmGatewayFilterContext {
    type CallContext = FilterCallContext;

    // In-flight callout table keyed by the host-assigned token id.
    fn callouts(&self) -> &RefCell<HashMap<u32, Self::CallContext>> {
        &self.callouts
    }

    // Gauge the Client helper updates as callouts start and finish.
    fn active_http_calls(&self) -> &Gauge {
        &self.metrics.active_http_calls
    }
}

impl Context for LlmGatewayFilterContext {}

// RootContext allows the Rust code to reach into the Envoy Config
impl RootContext for LlmGatewayFilterContext {
fn on_configure(&mut self, _: usize) -> bool {
let config_bytes = self
.get_plugin_configuration()
.expect("Arch config cannot be empty");

let config: Configuration = match serde_yaml::from_slice(&config_bytes) {
Ok(config) => config,
Err(err) => panic!("Invalid arch config \"{:?}\"", err),
};

ratelimit::ratelimits(Some(config.ratelimits.unwrap_or_default()));

match config.llm_providers.try_into() {
Ok(llm_providers) => self.llm_providers = Some(Rc::new(llm_providers)),
Err(err) => panic!("{err}"),
}

true
}

fn create_http_context(&self, context_id: u32) -> Option<Box<dyn HttpContext>> {
debug!(
"||| create_http_context called with context_id: {:?} |||",
context_id
);

// No StreamContext can be created until the Embedding Store is fully initialized.
Some(Box::new(LlmGatewayStreamContext::new(
context_id,
Rc::clone(&self.metrics),
Rc::clone(
self.llm_providers
.as_ref()
.expect("LLM Providers must exist when Streams are being created"),
),
)))
}

fn get_type(&self) -> Option<ContextType> {
Some(ContextType::HttpContext)
}
}
Loading

0 comments on commit 21e7fe2

Please sign in to comment.