diff --git a/.changesets/feat_geal_v8_heap_statistics.md b/.changesets/feat_geal_v8_heap_statistics.md new file mode 100644 index 0000000000..c091b108a8 --- /dev/null +++ b/.changesets/feat_geal_v8_heap_statistics.md @@ -0,0 +1,7 @@ +### Add V8 heap usage metrics ([PR #5781](https://github.com/apollographql/router/pull/5781)) + +The router supports new gauge metrics for tracking heap memory usage of the V8 JavaScript engine: +- `apollo.router.v8.heap.used`: heap memory used by V8, in bytes +- `apollo.router.v8.heap.total`: total heap allocated by V8, in bytes + +By [@Geal](https://github.com/Geal) in https://github.com/apollographql/router/pull/5781 \ No newline at end of file diff --git a/.changesets/feat_update_federation.md b/.changesets/feat_update_federation.md new file mode 100644 index 0000000000..b3c0670daa --- /dev/null +++ b/.changesets/feat_update_federation.md @@ -0,0 +1,8 @@ +### Update federation to 2.8.3 ([PR #5781](https://github.com/apollographql/router/pull/5781)) + +> [!IMPORTANT] +> If you have enabled [Distributed query plan caching](https://www.apollographql.com/docs/router/configuration/distributed-caching/#distributed-query-plan-caching), this release changes the hashing algorithm used for the cache keys. On account of this, you should anticipate additional cache regeneration cost when updating between these versions while the new hashing algorithm comes into service. + +This updates the router from federation version 2.8.1 to 2.8.3, with a [fix for fragment generation](https://github.com/apollographql/federation/pull/3043). 
+ +By [@Geal](https://github.com/Geal) in https://github.com/apollographql/router/pull/5781 diff --git a/Cargo.lock b/Cargo.lock index c20249f7e7..11a516f5dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6092,9 +6092,9 @@ dependencies = [ [[package]] name = "router-bridge" -version = "0.5.27+v2.8.1" +version = "0.5.30+v2.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "288fa40fc4e0a76fb911410e05d4525e8bf7558622bd02403f89f871c4d0785b" +checksum = "9b2b67ccfc13842df12e473cbb93fe306a8dc3d120cfa2be57e3537c71bf0e63" dependencies = [ "anyhow", "async-channel 1.9.0", diff --git a/apollo-router/Cargo.toml b/apollo-router/Cargo.toml index 5716453c60..dbf481bf5a 100644 --- a/apollo-router/Cargo.toml +++ b/apollo-router/Cargo.toml @@ -198,7 +198,7 @@ regex = "1.10.5" reqwest.workspace = true # note: this dependency should _always_ be pinned, prefix the version with an `=` -router-bridge = "=0.5.27+v2.8.1" +router-bridge = "=0.5.30+v2.8.3" rust-embed = { version = "8.4.0", features = ["include-exclude"] } rustls = "0.21.12" diff --git a/apollo-router/src/query_planner/bridge_query_planner_pool.rs b/apollo-router/src/query_planner/bridge_query_planner_pool.rs index a306f19b6b..bb75124df1 100644 --- a/apollo-router/src/query_planner/bridge_query_planner_pool.rs +++ b/apollo-router/src/query_planner/bridge_query_planner_pool.rs @@ -1,5 +1,7 @@ use std::collections::HashMap; use std::num::NonZeroUsize; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Instant; @@ -8,6 +10,9 @@ use async_channel::bounded; use async_channel::Sender; use futures::future::BoxFuture; use opentelemetry::metrics::MeterProvider; +use opentelemetry::metrics::ObservableGauge; +use opentelemetry::metrics::Unit; +use opentelemetry_api::metrics::Meter; use router_bridge::planner::Planner; use tokio::sync::oneshot; use tokio::task::JoinSet; @@ -37,6 +42,10 @@ pub(crate) struct BridgeQueryPlannerPool { schema: Arc, 
subgraph_schemas: Arc>>>, _pool_size_gauge: opentelemetry::metrics::ObservableGauge, + v8_heap_used: Arc, + _v8_heap_used_gauge: ObservableGauge, + v8_heap_total: Arc, + _v8_heap_total_gauge: ObservableGauge, } impl BridgeQueryPlannerPool { @@ -93,7 +102,7 @@ impl BridgeQueryPlannerPool { })? .subgraph_schemas(); - let planners = bridge_query_planners + let planners: Vec<_> = bridge_query_planners .iter() .map(|p| p.planner().clone()) .collect(); @@ -119,21 +128,68 @@ impl BridgeQueryPlannerPool { }); } let sender_for_gauge = sender.clone(); - let pool_size_gauge = meter_provider() - .meter("apollo/router") + let meter = meter_provider().meter("apollo/router"); + let pool_size_gauge = meter .u64_observable_gauge("apollo.router.query_planning.queued") + .with_description("Number of queries waiting to be planned") + .with_unit(Unit::new("query")) .with_callback(move |m| m.observe(sender_for_gauge.len() as u64, &[])) .init(); + let (v8_heap_used, _v8_heap_used_gauge) = Self::create_heap_used_gauge(&meter); + let (v8_heap_total, _v8_heap_total_gauge) = Self::create_heap_total_gauge(&meter); + + // initialize v8 metrics + if let Some(bridge_query_planner) = planners.first().cloned() { + Self::get_v8_metrics( + bridge_query_planner, + v8_heap_used.clone(), + v8_heap_total.clone(), + ) + .await; + } + Ok(Self { js_planners: planners, sender, schema, subgraph_schemas, _pool_size_gauge: pool_size_gauge, + v8_heap_used, + _v8_heap_used_gauge, + v8_heap_total, + _v8_heap_total_gauge, }) } + fn create_heap_used_gauge(meter: &Meter) -> (Arc, ObservableGauge) { + let current_heap_used = Arc::new(AtomicU64::new(0)); + let current_heap_used_for_gauge = current_heap_used.clone(); + let heap_used_gauge = meter + .u64_observable_gauge("apollo.router.v8.heap.used") + .with_description("V8 heap used, in bytes") + .with_unit(Unit::new("By")) + .with_callback(move |i| { + i.observe(current_heap_used_for_gauge.load(Ordering::SeqCst), &[]) + }) + .init(); + (current_heap_used, 
heap_used_gauge) + } + + fn create_heap_total_gauge(meter: &Meter) -> (Arc, ObservableGauge) { + let current_heap_total = Arc::new(AtomicU64::new(0)); + let current_heap_total_for_gauge = current_heap_total.clone(); + let heap_total_gauge = meter + .u64_observable_gauge("apollo.router.v8.heap.total") + .with_description("V8 heap total, in bytes") + .with_unit(Unit::new("By")) + .with_callback(move |i| { + i.observe(current_heap_total_for_gauge.load(Ordering::SeqCst), &[]) + }) + .init(); + (current_heap_total, heap_total_gauge) + } + pub(crate) fn planners(&self) -> Vec>> { self.js_planners.clone() } @@ -147,6 +203,18 @@ impl BridgeQueryPlannerPool { ) -> Arc>>> { self.subgraph_schemas.clone() } + + async fn get_v8_metrics( + planner: Arc>, + v8_heap_used: Arc, + v8_heap_total: Arc, + ) { + let metrics = planner.get_heap_statistics().await; + if let Ok(metrics) = metrics { + v8_heap_used.store(metrics.heap_used, Ordering::SeqCst); + v8_heap_total.store(metrics.heap_total, Ordering::SeqCst); + } + } } impl tower::Service for BridgeQueryPlannerPool { @@ -173,6 +241,20 @@ impl tower::Service for BridgeQueryPlannerPool { let (response_sender, response_receiver) = oneshot::channel(); let sender = self.sender.clone(); + let get_metrics_future = + if let Some(bridge_query_planner) = self.js_planners.first().cloned() { + let v8_heap_used = self.v8_heap_used.clone(); + let v8_heap_total = self.v8_heap_total.clone(); + + Some(Self::get_v8_metrics( + bridge_query_planner, + v8_heap_used, + v8_heap_total, + )) + } else { + None + }; + Box::pin(async move { let start = Instant::now(); let _ = sender.send((req, response_sender)).await; @@ -187,7 +269,73 @@ impl tower::Service for BridgeQueryPlannerPool { start.elapsed().as_secs_f64() ); + if let Some(f) = get_metrics_future { + // execute in a separate task to avoid blocking the request + tokio::task::spawn(f); + } + res }) } } + +#[cfg(test)] + +mod tests { + use opentelemetry_sdk::metrics::data::Gauge; + + use super::*; + use 
crate::metrics::FutureMetricsExt; + use crate::spec::Query; + use crate::Context; + + #[tokio::test] + async fn test_v8_metrics() { + let sdl = include_str!("../testdata/supergraph.graphql"); + let config = Arc::default(); + let schema = Arc::new(Schema::parse(sdl, &config).unwrap()); + + async move { + let mut pool = BridgeQueryPlannerPool::new( + schema.clone(), + config.clone(), + NonZeroUsize::new(2).unwrap(), + ) + .await + .unwrap(); + let query = "query { me { name } }".to_string(); + + let doc = Query::parse_document(&query, None, &schema, &config).unwrap(); + let context = Context::new(); + context.extensions().with_lock(|mut lock| lock.insert(doc)); + + pool.call(QueryPlannerRequest::new(query, None, context)) + .await + .unwrap(); + + let metrics = crate::metrics::collect_metrics(); + let heap_used = metrics.find("apollo.router.v8.heap.used").unwrap(); + let heap_total = metrics.find("apollo.router.v8.heap.total").unwrap(); + + println!( + "got heap_used: {:?}, heap_total: {:?}", + heap_used + .data + .as_any() + .downcast_ref::>() + .unwrap() + .data_points[0] + .value, + heap_total + .data + .as_any() + .downcast_ref::>() + .unwrap() + .data_points[0] + .value + ); + } + .with_metrics() + .await; + } +} diff --git a/apollo-router/tests/integration/redis.rs b/apollo-router/tests/integration/redis.rs index cb8b79959e..6b0ff6b404 100644 --- a/apollo-router/tests/integration/redis.rs +++ b/apollo-router/tests/integration/redis.rs @@ -26,7 +26,7 @@ async fn query_planner_cache() -> Result<(), BoxError> { // 2. run `docker compose up -d` and connect to the redis container by running `docker-compose exec redis /bin/bash`. // 3. Run the `redis-cli` command from the shell and start the redis `monitor` command. // 4. Run this test and yank the updated cache key from the redis logs. 
- let known_cache_key = "plan:0:v2.8.1:16385ebef77959fcdc520ad507eb1f7f7df28f1d54a0569e3adabcb4cd00d7ce:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:3106dfc3339d8c3f3020434024bff0f566a8be5995199954db5a7525a7d7e67a"; + let known_cache_key = "plan:0:v2.8.3:16385ebef77959fcdc520ad507eb1f7f7df28f1d54a0569e3adabcb4cd00d7ce:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:3106dfc3339d8c3f3020434024bff0f566a8be5995199954db5a7525a7d7e67a"; let config = RedisConfig::from_url("redis://127.0.0.1:6379").unwrap(); let client = RedisClient::new(config, None, None, None); @@ -921,7 +921,7 @@ async fn connection_failure_blocks_startup() { async fn query_planner_redis_update_query_fragments() { test_redis_query_plan_config_update( include_str!("fixtures/query_planner_redis_config_update_query_fragments.router.yaml"), - "plan:0:v2.8.1:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:9054d19854e1d9e282ac7645c612bc70b8a7143d43b73d44dade4a5ec43938b4", + "plan:0:v2.8.3:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:9054d19854e1d9e282ac7645c612bc70b8a7143d43b73d44dade4a5ec43938b4", ) .await; } @@ -940,7 +940,7 @@ async fn query_planner_redis_update_planner_mode() { async fn query_planner_redis_update_introspection() { test_redis_query_plan_config_update( include_str!("fixtures/query_planner_redis_config_update_introspection.router.yaml"), - "plan:0:v2.8.1:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:04b3051125b5994fba6b0a22b2d8b4246cadc145be030c491a3431655d2ba07a", + "plan:0:v2.8.3:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:04b3051125b5994fba6b0a22b2d8b4246cadc145be030c491a3431655d2ba07a", ) .await; } @@ -949,7 +949,7 @@ 
async fn query_planner_redis_update_introspection() { async fn query_planner_redis_update_defer() { test_redis_query_plan_config_update( include_str!("fixtures/query_planner_redis_config_update_defer.router.yaml"), - "plan:0:v2.8.1:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:3b7241b0db2cd878b79c0810121953ba544543f3cb2692aaf1a59184470747b0", + "plan:0:v2.8.3:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:3b7241b0db2cd878b79c0810121953ba544543f3cb2692aaf1a59184470747b0", ) .await; } @@ -960,7 +960,7 @@ async fn query_planner_redis_update_type_conditional_fetching() { include_str!( "fixtures/query_planner_redis_config_update_type_conditional_fetching.router.yaml" ), - "plan:0:v2.8.1:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:0ca695a8c4c448b65fa04229c663f44150af53b184ebdcbb0ad6862290efed76", + "plan:0:v2.8.3:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:0ca695a8c4c448b65fa04229c663f44150af53b184ebdcbb0ad6862290efed76", ) .await; } @@ -971,7 +971,7 @@ async fn query_planner_redis_update_reuse_query_fragments() { include_str!( "fixtures/query_planner_redis_config_update_reuse_query_fragments.router.yaml" ), - "plan:0:v2.8.1:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:f7c04319556397ec4b550aa5aaa96c73689cee09026b661b6a9fc20b49e6fa77", + "plan:0:v2.8.3:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:f7c04319556397ec4b550aa5aaa96c73689cee09026b661b6a9fc20b49e6fa77", ) .await; } @@ -994,7 +994,7 @@ async fn test_redis_query_plan_config_update(updated_config: &str, new_cache_key 
router.assert_started().await; router.clear_redis_cache().await; - let starting_key = "plan:0:v2.8.1:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:4a5827854a6d2efc85045f0d5bede402e15958390f1073d2e77df56188338e5a"; + let starting_key = "plan:0:v2.8.3:a9e605fa09adc5a4b824e690b4de6f160d47d84ede5956b58a7d300cca1f7204:3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112:4a5827854a6d2efc85045f0d5bede402e15958390f1073d2e77df56188338e5a"; router.execute_default_query().await; router.assert_redis_cache_contains(starting_key, None).await; router.update_config(updated_config).await; diff --git a/docs/source/configuration/telemetry/instrumentation/standard-instruments.mdx b/docs/source/configuration/telemetry/instrumentation/standard-instruments.mdx index 37c63e8b57..d29cbf1fca 100644 --- a/docs/source/configuration/telemetry/instrumentation/standard-instruments.mdx +++ b/docs/source/configuration/telemetry/instrumentation/standard-instruments.mdx @@ -66,6 +66,8 @@ The coprocessor operations metric has the following attributes: - `apollo.router.query_planning.plan.duration` - Histogram of plan durations isolated to query planning time only. - `apollo.router.query_planning.total.duration` - Histogram of plan durations including queue time. - `apollo.router.query_planning.queued` - A gauge of the number of queued plans requests. +- `apollo.router.v8.heap.used` - heap memory used by V8, in bytes. +- `apollo.router.v8.heap.total` - total heap allocated by V8, in bytes. 
### Uplink diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 451ea09375..781b40cc11 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -20,7 +20,7 @@ reqwest = { workspace = true, features = ["json", "blocking"] } serde_json.workspace = true tokio.workspace = true # note: this dependency should _always_ be pinned, prefix the version with an `=` -router-bridge = "=0.5.27+v2.8.1" +router-bridge = "=0.5.30+v2.8.3" [dev-dependencies] anyhow = "1"