From cb9ab7463c27f30532f944ff9d3adb0636e42364 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 14 Oct 2024 12:25:55 +0200 Subject: [PATCH 01/38] proxy: split out the console-redirect backend flow (#9270) removes the ConsoleRedirect backend from the main auth::Backends enum, copy-paste the existing crate::proxy::task_main structure to use the ConsoleRedirectBackend exclusively. This makes the logic a bit simpler at the cost of some fairly trivial code duplication. --- proxy/src/auth/backend/console_redirect.rs | 37 +++- proxy/src/auth/backend/mod.rs | 72 ++----- proxy/src/bin/local_proxy.rs | 2 +- proxy/src/bin/proxy.rs | 97 +++++---- proxy/src/console_redirect_proxy.rs | 217 +++++++++++++++++++++ proxy/src/lib.rs | 1 + proxy/src/proxy/mod.rs | 6 +- proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/serverless/backend.rs | 7 +- proxy/src/serverless/mod.rs | 2 +- proxy/src/serverless/websocket.rs | 2 +- 11 files changed, 333 insertions(+), 112 deletions(-) create mode 100644 proxy/src/console_redirect_proxy.rs diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 127be545e1d8..457410ec8cec 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,18 +1,24 @@ use crate::{ - auth, compute, + auth, + cache::Cached, + compute, config::AuthenticationConfig, context::RequestMonitoring, - control_plane::{self, provider::NodeInfo}, + control_plane::{self, provider::NodeInfo, CachedNodeInfo}, error::{ReportableError, UserFacingError}, + proxy::connect_compute::ComputeConnectBackend, stream::PqStream, waiters, }; +use async_trait::async_trait; use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::SslMode; use tracing::{info, info_span}; +use super::ComputeCredentialKeys; + #[derive(Debug, Error)] pub(crate) enum WebAuthError { #[error(transparent)] @@ -25,6 +31,7 @@ pub(crate) enum WebAuthError { Io(#[from] std::io::Error), } +#[derive(Debug)] pub struct ConsoleRedirectBackend { console_uri: reqwest::Url, } @@ -66,17 +73,31 @@ impl ConsoleRedirectBackend { Self { console_uri } } - pub(super) fn url(&self) -> &reqwest::Url { - &self.console_uri - } - pub(crate) async fn authenticate( &self, ctx: &RequestMonitoring, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result { - authenticate(ctx, auth_config, &self.console_uri, client).await + ) -> auth::Result { + authenticate(ctx, auth_config, &self.console_uri, client) + .await + .map(ConsoleRedirectNodeInfo) + } +} + +pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo); + +#[async_trait] +impl ComputeConnectBackend for ConsoleRedirectNodeInfo { + async fn wake_compute( + &self, + _ctx: &RequestMonitoring, + ) -> Result { + Ok(Cached::new_uncached(self.0.clone())) + } + + fn get_keys(&self) -> &ComputeCredentialKeys { + &ComputeCredentialKeys::None } } diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 27c9f1876eac..96e1a787ed1b 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -22,7 +22,7 @@ use crate::cache::Cached; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::provider::{CachedRoleSecret, ControlPlaneBackend}; -use crate::control_plane::{AuthSecret, NodeInfo}; +use crate::control_plane::AuthSecret; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use 
crate::proxy::connect_compute::ComputeConnectBackend; @@ -66,11 +66,9 @@ impl std::ops::Deref for MaybeOwned<'_, T> { /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum Backend<'a, T, D> { +pub enum Backend<'a, T> { /// Cloud API (V2). ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T), - /// Authentication via a web browser. - ConsoleRedirect(MaybeOwned<'a, ConsoleRedirectBackend>, D), /// Local proxy uses configured auth credentials and does not wake compute Local(MaybeOwned<'a, LocalBackend>), } @@ -91,7 +89,7 @@ impl Clone for Box { } } -impl std::fmt::Display for Backend<'_, (), ()> { +impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { @@ -107,46 +105,39 @@ impl std::fmt::Display for Backend<'_, (), ()> { #[cfg(test)] ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), }, - Self::ConsoleRedirect(backend, ()) => fmt - .debug_tuple("ConsoleRedirect") - .field(&backend.url().as_str()) - .finish(), Self::Local(_) => fmt.debug_tuple("Local").finish(), } } } -impl Backend<'_, T, D> { +impl Backend<'_, T> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. - pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> { + pub(crate) fn as_ref(&self) -> Backend<'_, &T> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(MaybeOwned::Borrowed(c), x), - Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(MaybeOwned::Borrowed(c), x), Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } } -impl<'a, T, D> Backend<'a, T, D> { +impl<'a, T> Backend<'a, T> { /// Very similar to [`std::option::Option::map`]. /// Maps [`Backend`] to [`Backend`] by applying /// a function to a contained value. - pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> { + pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(c, f(x)), - Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(c, x), Self::Local(l) => Backend::Local(l), } } } -impl<'a, T, D, E> Backend<'a, Result, D> { +impl<'a, T, E> Backend<'a, Result> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. - pub(crate) fn transpose(self) -> Result, E> { + pub(crate) fn transpose(self) -> Result, E> { match self { Self::ControlPlane(c, x) => x.map(|x| Backend::ControlPlane(c, x)), - Self::ConsoleRedirect(c, x) => Ok(Backend::ConsoleRedirect(c, x)), Self::Local(l) => Ok(Backend::Local(l)), } } @@ -414,12 +405,11 @@ async fn authenticate_with_secret( classic::authenticate(ctx, info, client, config, secret).await } -impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { +impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { /// Get username from the credentials. 
pub(crate) fn get_user(&self) -> &str { match self { Self::ControlPlane(_, user_info) => &user_info.user, - Self::ConsoleRedirect(_, ()) => "web", Self::Local(_) => "local", } } @@ -433,7 +423,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, - ) -> auth::Result> { + ) -> auth::Result> { let res = match self { Self::ControlPlane(api, user_info) => { info!( @@ -454,14 +444,6 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { .await?; Backend::ControlPlane(api, credentials) } - // NOTE: this auth backend doesn't use client credentials. - Self::ConsoleRedirect(backend, ()) => { - info!("performing web authentication"); - - let info = backend.authenticate(ctx, config, client).await?; - - Backend::ConsoleRedirect(backend, info) - } Self::Local(_) => { return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) } @@ -472,14 +454,13 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { } } -impl Backend<'_, ComputeUserInfo, &()> { +impl Backend<'_, ComputeUserInfo> { pub(crate) async fn get_role_secret( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await, - Self::ConsoleRedirect(_, ()) => Ok(Cached::new_uncached(None)), Self::Local(_) => Ok(Cached::new_uncached(None)), } } @@ -492,45 +473,19 @@ impl Backend<'_, ComputeUserInfo, &()> { Self::ControlPlane(api, user_info) => { api.get_allowed_ips_and_secret(ctx, user_info).await } - Self::ConsoleRedirect(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } #[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { +impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { async fn wake_compute( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::ConsoleRedirect(_, info) => Ok(Cached::new_uncached(info.clone())), - Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), - } - } - - fn get_keys(&self) -> &ComputeCredentialKeys { - match self { - Self::ControlPlane(_, creds) => &creds.keys, - Self::ConsoleRedirect(_, _) => &ComputeCredentialKeys::None, - Self::Local(_) => &ComputeCredentialKeys::None, - } - } -} - -#[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { - async fn wake_compute( - &self, - ctx: &RequestMonitoring, - ) -> Result { - match self { - Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::ConsoleRedirect(_, ()) => { - unreachable!("web auth flow doesn't support waking the compute") - } Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } @@ -538,7 +493,6 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { fn get_keys(&self) -> &ComputeCredentialKeys { match self { Self::ControlPlane(_, creds) => &creds.keys, - Self::ConsoleRedirect(_, ()) => &ComputeCredentialKeys::None, Self::Local(_) => &ComputeCredentialKeys::None, } } diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index c781af846a2e..c92ebbc51f3c 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -291,7 +291,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig /// auth::Backend is created at proxy startup, and 
lives forever. fn build_auth_backend( args: &LocalProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, (), ()>> { +) -> anyhow::Result<&'static auth::Backend<'static, ()>> { let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( LocalBackend::new(args.compute), )); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3f4c2df80954..3c0e66dec3f1 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -314,7 +314,10 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; let auth_backend = build_auth_backend(&args)?; - info!("Authentication backend: {}", auth_backend); + match auth_backend { + Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), + Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), + }; info!("Using region: {}", args.aws_region); let region_provider = @@ -461,26 +464,41 @@ async fn main() -> anyhow::Result<()> { // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } + match auth_backend { + Either::Left(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } - if let Some(serverless_listener) = serverless_listener { - client_tasks.spawn(serverless::task_main( - config, - auth_backend, - serverless_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); + if let Some(serverless_listener) = serverless_listener { + client_tasks.spawn(serverless::task_main( + config, + auth_backend, + serverless_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + } + Either::Right(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::console_redirect_proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + )); + } + } } client_tasks.spawn(proxy::context::parquet::worker( @@ -510,7 +528,7 @@ async fn main() -> anyhow::Result<()> { )); } - if let auth::Backend::ControlPlane(api, _) = auth_backend { + if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {} @@ -663,7 +681,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { webauth_confirmation_timeout: args.webauth_confirmation_timeout, }; - let config = Box::leak(Box::new(ProxyConfig { + let config = ProxyConfig { tls_config, metric_collection, allow_self_signed_compute: args.allow_self_signed_compute, @@ -677,7 +695,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { connect_to_compute_retry_config: config::RetryConfig::parse( &args.connect_to_compute_retry, )?, - })); + }; + + let config = Box::leak(Box::new(config)); tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); @@ -687,8 
+707,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { /// auth::Backend is created at proxy startup, and lives forever. fn build_auth_backend( args: &ProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, (), ()>> { - let auth_backend = match &args.auth_backend { +) -> anyhow::Result, &'static ConsoleRedirectBackend>> { + match &args.auth_backend { AuthBackendType::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = @@ -738,12 +758,11 @@ fn build_auth_backend( wake_compute_endpoint_rate_limiter, ); let api = control_plane::provider::ControlPlaneBackend::Management(api); - auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()) - } + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - AuthBackendType::Web => { - let url = args.uri.parse()?; - auth::Backend::ConsoleRedirect(MaybeOwned::Owned(ConsoleRedirectBackend::new(url)), ()) + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) } #[cfg(feature = "testing")] @@ -751,11 +770,23 @@ fn build_auth_backend( let url = args.auth_endpoint.parse()?; let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy); let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api); - auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()) + + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) } - }; - Ok(Box::leak(Box::new(auth_backend))) + AuthBackendType::Web => { + let url = args.uri.parse()?; + let backend = ConsoleRedirectBackend::new(url); + + let config = Box::leak(Box::new(backend)); + + Ok(Either::Right(config)) + } + } } #[cfg(test)] diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs new file mode 100644 index 000000000000..9e1797672021 --- /dev/null +++ b/proxy/src/console_redirect_proxy.rs @@ -0,0 +1,217 @@ +use crate::auth::backend::ConsoleRedirectBackend; +use crate::config::{ProxyConfig, ProxyProtocolV2}; +use crate::proxy::{ + prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, +}; +use crate::{ + cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}, + context::RequestMonitoring, + error::ReportableError, + metrics::{Metrics, NumClientConnectionsGuard}, + protocol2::read_proxy_protocol, + proxy::handshake::{handshake, HandshakeData}, +}; +use futures::TryFutureExt; +use std::sync::Arc; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, Instrument}; + +use crate::proxy::{ + connect_compute::{connect_to_compute, TcpMechanism}, + passthrough::ProxyPassthrough, +}; + +pub async fn task_main( + config: &'static ProxyConfig, + backend: &'static ConsoleRedirectBackend, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, + cancellation_handler: Arc, +) -> anyhow::Result<()> { + scopeguard::defer! { + info!("proxy has shut down"); + } + + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. 
+ socket2::SockRef::from(&listener).set_keepalive(true)?; + + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; + + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); + + let session_id = uuid::Uuid::new_v4(); + let cancellation_handler = Arc::clone(&cancellation_handler); + + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + + connections.spawn(async move { + let (socket, peer_addr) = match read_proxy_protocol(socket).await { + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; + } + Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + error!("missing required proxy protocol header"); + return; + } + Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + error!("proxy protocol header not supported"); + return; + } + Ok((socket, Some(addr))) => (socket, addr.ip()), + Ok((socket, None)) => (socket, peer_addr.ip()), + }; + + match socket.inner.set_nodelay(true) { + Ok(()) => {} + Err(e) => { + error!("per-client task finished with an error: failed to set socket option: {e:#}"); + return; + } + }; + + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); + let span = ctx.span(); + + let startup = Box::pin( + handle_client( + config, + backend, + &ctx, + cancellation_handler, + socket, + conn_gauge, + ) + .instrument(span.clone()), + ); + let res = startup.await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } + Ok(None) => { + ctx.set_success(); + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log_connect(); + match p.proxy_pass().instrument(span.clone()).await { + Ok(()) => {} + Err(ErrorSource::Client(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + } + Err(ErrorSource::Compute(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); + } + } + } + } + }); + } + + connections.close(); + drop(listener); + + // Drain connections + connections.wait().await; + + Ok(()) +} + +pub(crate) async fn handle_client( + config: &'static ProxyConfig, + backend: &'static ConsoleRedirectBackend, + ctx: &RequestMonitoring, + cancellation_handler: Arc, + stream: S, + conn_gauge: NumClientConnectionsGuard<'static>, +) -> Result>, ClientRequestError> { + info!( + protocol = %ctx.protocol(), + "handling interactive connection from client" + ); + + let metrics = &Metrics::get().proxy; + let proto = ctx.protocol(); + let request_gauge = metrics.connection_requests.guard(proto); + + let tls = config.tls_config.as_ref(); + + let record_handshake_error = !ctx.has_private_peer_addr(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, tls, record_handshake_error); + let (mut stream, params) = + match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(cancel_key_data) => { + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id()) + .await + .map(|()| None)?) 
+ } + }; + drop(pause); + + ctx.set_db_options(params.clone()); + + let user_info = match backend + .authenticate(ctx, &config.authentication_config, &mut stream) + .await + { + Ok(auth_result) => auth_result, + Err(e) => { + return stream.throw_error(e).await?; + } + }; + + let mut node = connect_to_compute( + ctx, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, + &user_info, + config.allow_self_signed_compute, + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, + ) + .or_else(|e| stream.throw_error(e)) + .await?; + + let session = cancellation_handler.get_session(); + prepare_client_connection(&node, &session, &mut stream).await?; + + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + + Ok(Some(ProxyPassthrough { + client: stream, + aux: node.aux.clone(), + compute: node, + _req: request_gauge, + _conn: conn_gauge, + _cancel: session, + })) +} diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 8d274baa10b9..74bc778a36c4 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -95,6 +95,7 @@ pub mod cache; pub mod cancellation; pub mod compute; pub mod config; +pub mod console_redirect_proxy; pub mod context; pub mod control_plane; pub mod error; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 3a43ccb74a6f..b2b5a7f43d6c 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -61,7 +61,7 @@ pub async fn run_until_cancelled( pub async fn task_main( config: &'static ProxyConfig, - auth_backend: &'static auth::Backend<'static, (), ()>, + auth_backend: &'static auth::Backend<'static, ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, @@ -248,7 +248,7 @@ impl ReportableError for ClientRequestError { #[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, - auth_backend: &'static auth::Backend<'static, (), ()>, + auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, @@ -356,7 +356,7 @@ pub(crate) async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -async fn prepare_client_connection

<P>( +pub(crate) async fn prepare_client_connection<P>( node: &compute::PostgresConnection, session: &cancellation::Session<P>
, stream: &mut PqStream, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3861ddc8edff..58fb36dba754 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -552,7 +552,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> auth::Backend<'static, ComputeCredentials, &()> { +) -> auth::Backend<'static, ComputeCredentials> { let user_info = auth::Backend::ControlPlane( MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))), ComputeCredentials { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 9e49478cf3ec..2b060af9e1e3 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -42,7 +42,7 @@ pub(crate) struct PoolingBackend { pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, pub(crate) config: &'static ProxyConfig, - pub(crate) auth_backend: &'static crate::auth::Backend<'static, (), ()>, + pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, pub(crate) endpoint_rate_limiter: Arc, } @@ -135,9 +135,6 @@ impl PoolingBackend { keys: crate::auth::backend::ComputeCredentialKeys::None, }) } - crate::auth::Backend::ConsoleRedirect(_, ()) => Err(AuthError::auth_failed( - "JWT login over web auth proxy is not supported", - )), crate::auth::Backend::Local(_) => { let keys = self .config @@ -264,7 +261,7 @@ impl PoolingBackend { info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); let mut node_info = match &self.auth_backend { - auth::Backend::ControlPlane(_, ()) | auth::Backend::ConsoleRedirect(_, ()) => { + auth::Backend::ControlPlane(_, ()) => { unreachable!("only local_proxy can connect to local postgres") } auth::Backend::Local(local) => local.node_info.clone(), diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 95f64e972c07..3131adada4cc 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -55,7 +55,7 @@ pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( config: &'static ProxyConfig, - auth_backend: &'static crate::auth::Backend<'static, (), ()>, + auth_backend: &'static crate::auth::Backend<'static, ()>, ws_listener: TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index fd0f0cac7f34..f5a692cf404e 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -129,7 +129,7 @@ impl AsyncBufRead for WebSocketRw { pub(crate) async fn serve_websocket( config: &'static ProxyConfig, - auth_backend: &'static crate::auth::Backend<'static, (), ()>, + auth_backend: &'static crate::auth::Backend<'static, ()>, ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, From d056ae9be5844b22378f961dd3ae730d96ef996e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 14 Oct 2024 13:45:20 +0300 Subject: [PATCH 02/38] Ignore pg_dynshmem fiel when comparing directories (#9374) ## Problem At MacOS `pg_dynshmem` file is create in PGDATADIR which cause mismatch in directories comparison ## Summary of changes Add this files to the ignore list. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? 
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- test_runner/fixtures/neon_fixtures.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7789855fe4f5..059707c8ed42 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4280,6 +4280,7 @@ def pytest_addoption(parser: Parser): "postmaster.opts", "postmaster.pid", "pg_control", + "pg_dynshmem", ) ) From 31b7703fa87fa7fdc4d3a9f8b8f223cfddc0cd1a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 14 Oct 2024 11:51:01 +0100 Subject: [PATCH 03/38] CI(build-build-tools): fix unexpected cancellations (#9357) ## Problem When `Dockerfile.build-tools` gets changed, several PRs catch up with it and some might get unexpectedly cancelled workflows because of GitHub's concurrency model for workflows. See the comment in the code for more details. It should be possible to revert it after https://github.com/orgs/community/discussions/41518 (I don't expect it anytime soon, but I subscribed) ## Summary of changes - Do not queue `build-build-tools-image` workflows in the concurrency group --- .github/workflows/build-build-tools-image.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index ca5ff573e19e..130753833dd6 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -19,9 +19,16 @@ defaults: run: shell: bash -euo pipefail {0} -concurrency: - group: build-build-tools-image-${{ inputs.image-tag }} - cancel-in-progress: false +# The initial idea was to prevent the waste of resources by not re-building the `build-tools` image +# for the same tag in parallel workflow runs, and queue them to be skipped once we have +# the first image pushed to Docker registry, but GitHub's concurrency mechanism is not working as expected. +# GitHub can't have more than 1 job in a queue and removes the previous one, it causes failures if the dependent jobs. +# +# Ref https://github.com/orgs/community/discussions/41518 +# +# concurrency: +# group: build-build-tools-image-${{ inputs.image-tag }} +# cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} From d92ff578c4a738a52bdcb0a6f44af7691a64882c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 14 Oct 2024 14:34:57 +0200 Subject: [PATCH 04/38] Add test for fixed storage broker issue (#9311) Adds a test for the (now fixed) storage broker limit issue, see #9268 for the description and #9299 for the fix. Also fix a race condition with endpoint creation/starts running in parallel, leading to file not found errors. 
--- control_plane/src/endpoint.rs | 16 +++++++++++++- test_runner/regress/test_tenants.py | 34 ++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 7cdf6217373b..71514daa7cc5 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -97,7 +97,21 @@ impl ComputeControlPlane { for endpoint_dir in std::fs::read_dir(env.endpoints_path()) .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? { - let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?; + let ep_res = Endpoint::from_dir_entry(endpoint_dir?, &env); + let ep = match ep_res { + Ok(ep) => ep, + Err(e) => match e.downcast::() { + Ok(e) => { + // A parallel task could delete an endpoint while we have just scanned the directory + if e.kind() == std::io::ErrorKind::NotFound { + continue; + } else { + Err(e)? + } + } + Err(e) => Err(e)?, + }, + }; endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); } diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 95dc0fec78ba..4a165359410e 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -2,6 +2,7 @@ import concurrent.futures import os +import threading import time from contextlib import closing from datetime import datetime @@ -10,7 +11,7 @@ import pytest import requests -from fixtures.common_types import Lsn, TenantId +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -476,3 +477,34 @@ def only_int(samples: list[Sample]) -> int: assert counts log.info(f"directory counts: {counts}") assert counts[2] > COUNT_AT_LEAST_EXPECTED + + +def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): + """ + (Relaxed) regression test for issue that led to https://github.com/neondatabase/neon/pull/9268 + Create many endpoints in parallel and then restart them + """ + env = neon_simple_env + + # This param needs to be 200+ to reproduce the limit issue + n_threads = 16 + barrier = threading.Barrier(n_threads) + + def test_timeline(branch_name: str, timeline_id: TimelineId): + endpoint = env.endpoints.create_start(branch_name) + endpoint.stop() + # Use a barrier to make sure we restart endpoints at the same time + barrier.wait() + endpoint.start() + + workers = [] + + for i in range(0, n_threads): + branch_name = f"branch_{i}" + timeline_id = env.create_branch(branch_name) + w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id]) + workers.append(w) + w.start() + + for w in workers: + w.join() From f4f7ea247c05a56a90e4a7f99249133c58c8c443 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 14 Oct 2024 16:50:12 +0100 Subject: [PATCH 05/38] tests: make size comparisons more lenient (#9388) The empirically determined threshold doesn't hold for PG 17. Bump the limit to stabilise ci. --- test_runner/regress/test_tenant_size.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 9ea09d10d725..b41f1709bd07 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -479,9 +479,9 @@ def assert_size_approx_equal(size_a, size_b): """ # Determined empirically from examples of equality failures: they differ - # by page multiples of 8272, and usually by 1-3 pages. 
Tolerate 4 to avoid + # by page multiples of 8272, and usually by 1-3 pages. Tolerate 6 to avoid # failing on outliers from that observed range. - threshold = 4 * 8272 + threshold = 6 * 8272 assert size_a == pytest.approx(size_b, abs=threshold) From f54e3e9147bf1dd341e22a0fc01cf5c5d71843e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 14 Oct 2024 17:54:03 +0200 Subject: [PATCH 06/38] Also consider offloaded timelines for obtaining retain_lsn (#9308) Also consider offloaded timelines for obtaining `retain_lsn`. This is required for correctness for all timelines that have not been flattened yet: otherwise we GC data that might still be required for reading. This somewhat counteracts the original purpose of timeline offloading of not having to iterate over offloaded timelines, but sadly it's required. In the future, we can improve the way the offloaded timelines are stored. We also make the `retain_lsn` optional so that in the future, when we implement flattening, we can make it None. This also applies to full timeline objects by the way, where it would probably make most sense to add a bool flag whether the timeline is successfully flattened, and if it is, one can exclude it from `retain_lsn` as well. Also, track whether a timeline was offloaded or not in `retain_lsn` so that the `retain_lsn` can be excluded from visibility and size calculation. Part of #8088 --- pageserver/src/tenant.rs | 56 ++++++++++++++++---- pageserver/src/tenant/size.rs | 8 +-- pageserver/src/tenant/timeline.rs | 21 +++++--- pageserver/src/tenant/timeline/compaction.rs | 9 ++-- 4 files changed, 71 insertions(+), 23 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d2818d04dc69..397778d4c834 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -493,6 +493,8 @@ pub struct OffloadedTimeline { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, + /// Whether to retain the branch lsn at the ancestor or not + pub ancestor_retain_lsn: Option, // TODO: once we persist offloaded state, make this lazily constructed pub remote_client: Arc, @@ -504,10 +506,14 @@ pub struct OffloadedTimeline { impl OffloadedTimeline { fn from_timeline(timeline: &Timeline) -> Self { + let ancestor_retain_lsn = timeline + .get_ancestor_timeline_id() + .map(|_timeline_id| timeline.get_ancestor_lsn()); Self { tenant_shard_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + ancestor_retain_lsn, remote_client: timeline.remote_client.clone(), delete_progress: timeline.delete_progress.clone(), @@ -515,6 +521,12 @@ impl OffloadedTimeline { } } +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub enum MaybeOffloaded { + Yes, + No, +} + #[derive(Clone)] pub enum TimelineOrOffloaded { Timeline(Arc), @@ -2253,12 +2265,13 @@ impl Tenant { if activating { let timelines_accessor = self.timelines.lock().unwrap(); + let timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap(); let timelines_to_activate = timelines_accessor .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); // Before activation, populate each Timeline's GcInfo with information about its children - self.initialize_gc_info(&timelines_accessor); + self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor); // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. 
@@ -3298,6 +3311,7 @@ impl Tenant { fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, + timelines_offloaded: &std::sync::MutexGuard>>, ) { // This function must be called before activation: after activation timeline create/delete operations // might happen, and this function is not safe to run concurrently with those. @@ -3305,20 +3319,37 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let mut all_branchpoints: BTreeMap> = BTreeMap::new(); + let mut all_branchpoints: BTreeMap> = + BTreeMap::new(); timelines.iter().for_each(|(timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); - ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + ancestor_children.push(( + timeline_entry.get_ancestor_lsn(), + *timeline_id, + MaybeOffloaded::No, + )); } }); + timelines_offloaded + .iter() + .for_each(|(timeline_id, timeline_entry)| { + let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id else { + return; + }; + let Some(retain_lsn) = timeline_entry.ancestor_retain_lsn else { + return; + }; + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((retain_lsn, *timeline_id, MaybeOffloaded::Yes)); + }); // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines let horizon = self.get_gc_horizon(); // Populate each timeline's GcInfo with information about its child branches for timeline in timelines.values() { - let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + let mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints .remove(&timeline.timeline_id) .unwrap_or_default(); @@ -4878,7 +4909,10 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); + assert_eq!( + branchpoints[0], + (Lsn(0x40), NEW_TIMELINE_ID, MaybeOffloaded::No) + ); } // You can read the key from the child branch even though the parent is @@ -8261,8 +8295,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8489,8 +8523,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8723,7 +8757,7 @@ mod tests { // Update GC info let mut guard = parent_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x10), space: Lsn(0x10), @@ -8737,7 +8771,7 @@ mod tests { // Update GC info let mut guard = branch_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x50), space: 
Lsn(0x50), diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 41d558d3f68a..4a4c698b5655 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -12,7 +12,7 @@ use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::Timeline; +use crate::tenant::{MaybeOffloaded, Timeline}; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -264,10 +264,12 @@ pub(super) async fn gather_inputs( let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() - .filter(|(lsn, _child_id)| lsn > &ancestor_lsn) + .filter(|(lsn, _child_id, is_offloaded)| { + lsn > &ancestor_lsn && *is_offloaded == MaybeOffloaded::No + }) .copied() // this assumes there are no other retain_lsns than the branchpoints - .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint)) + .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint)) .collect::>(); lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2fd4e699cfbf..8f098d0e8299 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -139,8 +139,10 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::{ - config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint, + config::TenantConf, + storage_layer::{inmemory_layer, LayerVisibilityHint}, upload_queue::NotInitialized, + MaybeOffloaded, }; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; @@ -450,7 +452,7 @@ pub(crate) struct GcInfo { /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>, + pub(crate) retain_lsns: Vec<(Lsn, TimelineId, MaybeOffloaded)>, /// The cutoff coordinates, which are combined by selecting the minimum. pub(crate) cutoffs: GcCutoffs, @@ -467,8 +469,13 @@ impl GcInfo { self.cutoffs.select_min() } - pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { - self.retain_lsns.push((child_lsn, child_id)); + pub(super) fn insert_child( + &mut self, + child_id: TimelineId, + child_lsn: Lsn, + is_offloaded: MaybeOffloaded, + ) { + self.retain_lsns.push((child_lsn, child_id, is_offloaded)); self.retain_lsns.sort_by_key(|i| i.0); } @@ -2164,7 +2171,9 @@ impl Timeline { if let Some(ancestor) = &ancestor { let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); - ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + // If we construct an explicit timeline object, it's obviously not offloaded + let is_offloaded = MaybeOffloaded::No; + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded); } Arc::new_cyclic(|myself| { @@ -4875,7 +4884,7 @@ impl Timeline { let retain_lsns = gc_info .retain_lsns .iter() - .map(|(lsn, _child_id)| *lsn) + .map(|(lsn, _child_id, _is_offloaded)| *lsn) .collect(); // Gets the maximum LSN that holds the valid lease. 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9f64471432e3..8b9ace1e5bbf 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -42,7 +42,7 @@ use crate::tenant::storage_layer::{ use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::DeltaLayer; +use crate::tenant::{DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use pageserver_api::config::tenant_conf_defaults::{ DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, @@ -639,7 +639,10 @@ impl Timeline { let children = self.gc_info.read().unwrap().retain_lsns.clone(); let mut readable_points = Vec::with_capacity(children.len() + 1); - for (child_lsn, _child_timeline_id) in &children { + for (child_lsn, _child_timeline_id, is_offloaded) in &children { + if *is_offloaded == MaybeOffloaded::Yes { + continue; + } readable_points.push(*child_lsn); } readable_points.push(head_lsn); @@ -1741,7 +1744,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); - for (lsn, _timeline_id) in &gc_info.retain_lsns { + for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); } From dab96a6eb159ffa34ff98f8dfc3b2a6862441e02 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 14 Oct 2024 20:30:21 +0200 Subject: [PATCH 07/38] Add more timing histogram and gauge metrics to the Neon extension (#9116) We now also track: - Number of PS IOs in-flight - Number of pages cached by smgr prefetch implementation - IO timing histograms for LFC reads and writes, per IO issued ## Problem There's little insight into the timing metrics of LFC, and what the prefetch state of each backend is. This changes that, by measuring (and subsequently exposing) these data points. ## Summary of changes - Extract IOHistogram as separate type, rather than a collection of fields on NeonMetrics - others, see items above. 
Part of https://github.com/neondatabase/neon/issues/8926 --- pgxn/neon/file_cache.c | 27 ++++- pgxn/neon/neon_perf_counters.c | 174 +++++++++++++++++++++------------ pgxn/neon/neon_perf_counters.h | 42 ++++++-- pgxn/neon/pagestore_smgr.c | 35 +++++++ 4 files changed, 205 insertions(+), 73 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index d789526050e8..bbea5a8b0d0c 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -43,6 +43,7 @@ #include "hll.h" #include "bitmap.h" #include "neon.h" +#include "neon_perf_counters.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -114,7 +115,9 @@ typedef struct FileCacheControl uint32 limit; /* shared copy of lfc_size_limit */ uint64 hits; uint64 misses; - uint64 writes; + uint64 writes; /* number of writes issued */ + uint64 time_read; /* time spent reading (us) */ + uint64 time_write; /* time spent writing (us) */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ @@ -270,6 +273,8 @@ lfc_shmem_startup(void) lfc_ctl->hits = 0; lfc_ctl->misses = 0; lfc_ctl->writes = 0; + lfc_ctl->time_read = 0; + lfc_ctl->time_write = 0; dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -701,6 +706,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); int iteration_hits = 0; int iteration_misses = 0; + uint64 io_time_us = 0; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -795,6 +801,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, lfc_ctl->misses += iteration_misses; pgBufferUsage.file_cache.hits += iteration_hits; pgBufferUsage.file_cache.misses += iteration_misses; + + if (iteration_hits) + { + lfc_ctl->time_read += io_time_us; + inc_page_cache_read_wait(io_time_us); + } + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); @@ -859,6 +872,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, struct iovec iov[PG_IOV_MAX]; int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + instr_time io_start, io_end; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -947,12 +961,13 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); + INSTR_TIME_SET_CURRENT(io_start); rc = pwritev(lfc_desc, iov, blocks_in_chunk, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); if (rc != BLCKSZ * blocks_in_chunk) @@ -965,9 +980,17 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_ctl->generation == generation) { + uint64 time_spent_us; CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ CriticalAssert(entry->access_count > 0); + + lfc_ctl->writes += blocks_in_chunk; + INSTR_TIME_SUBTRACT(io_start, io_end); + time_spent_us = INSTR_TIME_GET_MICROSEC(io_start); + lfc_ctl->time_write += time_spent_us; + inc_page_cache_write_wait(time_spent_us); + if (--entry->access_count == 0) 
dlist_push_tail(&lfc_ctl->lru, &entry->list_node); diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index a497d387c851..05db18707609 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -50,28 +50,52 @@ NeonPerfCountersShmemInit(void) } } -/* - * Count a GetPage wait operation. - */ -void -inc_getpage_wait(uint64 latency_us) +static inline void +inc_iohist(IOHistogram hist, uint64 latency_us) { int lo = 0; - int hi = NUM_GETPAGE_WAIT_BUCKETS - 1; + int hi = NUM_IO_WAIT_BUCKETS - 1; /* Find the right bucket with binary search */ while (lo < hi) { int mid = (lo + hi) / 2; - if (latency_us < getpage_wait_bucket_thresholds[mid]) + if (latency_us < io_wait_bucket_thresholds[mid]) hi = mid; else lo = mid + 1; } - MyNeonCounters->getpage_wait_us_bucket[lo]++; - MyNeonCounters->getpage_wait_us_sum += latency_us; - MyNeonCounters->getpage_wait_us_count++; + hist->wait_us_bucket[lo]++; + hist->wait_us_sum += latency_us; + hist->wait_us_count++; +} + +/* + * Count a GetPage wait operation. + */ +void +inc_getpage_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->getpage_hist, latency); +} + +/* + * Count an LFC read wait operation. + */ +void +inc_page_cache_read_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_read_hist, latency); +} + +/* + * Count an LFC write wait operation. + */ +void +inc_page_cache_write_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_write_hist, latency); } /* @@ -81,77 +105,91 @@ inc_getpage_wait(uint64 latency_us) typedef struct { - char *name; + const char *name; bool is_bucket; double bucket_le; double value; } metric_t; -static metric_t * -neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +static int +histogram_to_metrics(IOHistogram histogram, + metric_t *metrics, + const char *count, + const char *sum, + const char *bucket) { -#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8) - metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); - uint64 bucket_accum; - int i = 0; + int i = 0; + uint64 bucket_accum = 0; - metrics[i].name = "getpage_wait_seconds_count"; + metrics[i].name = count; metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_wait_us_count; + metrics[i].value = (double) histogram->wait_us_count; i++; - metrics[i].name = "getpage_wait_seconds_sum"; + metrics[i].name = sum; metrics[i].is_bucket = false; - metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0; + metrics[i].value = (double) histogram->wait_us_sum / 1000000.0; i++; - - bucket_accum = 0; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) { - uint64 threshold = getpage_wait_bucket_thresholds[bucketno]; + uint64 threshold = io_wait_bucket_thresholds[bucketno]; - bucket_accum += counters->getpage_wait_us_bucket[bucketno]; + bucket_accum += histogram->wait_us_bucket[bucketno]; - metrics[i].name = "getpage_wait_seconds_bucket"; + metrics[i].name = bucket; metrics[i].is_bucket = true; metrics[i].bucket_le = (threshold == UINT64_MAX) ? 
INFINITY : ((double) threshold) / 1000000.0; metrics[i].value = (double) bucket_accum; i++; } - metrics[i].name = "getpage_prefetch_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_requests_total; - i++; - metrics[i].name = "getpage_sync_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_sync_requests_total; - i++; - metrics[i].name = "getpage_prefetch_misses_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_misses_total; - i++; - metrics[i].name = "getpage_prefetch_discards_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_discards_total; - i++; - metrics[i].name = "pageserver_requests_sent_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_requests_sent_total; - i++; - metrics[i].name = "pageserver_disconnects_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_disconnects_total; - i++; - metrics[i].name = "pageserver_send_flushes_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_send_flushes_total; - i++; - metrics[i].name = "file_cache_hits_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->file_cache_hits_total; - i++; + + return i; +} + +static metric_t * +neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +{ +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) + metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); + int i = 0; + +#define APPEND_METRIC(_name) do { \ + metrics[i].name = #_name; \ + metrics[i].is_bucket = false; \ + metrics[i].value = (double) counters->_name; \ + i++; \ + } while (false) + + i += histogram_to_metrics(&counters->getpage_hist, &metrics[i], + "getpage_wait_seconds_count", + "getpage_wait_seconds_sum", + "getpage_wait_seconds_bucket"); + + APPEND_METRIC(getpage_prefetch_requests_total); + APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(getpage_prefetch_misses_total); + APPEND_METRIC(getpage_prefetch_discards_total); + APPEND_METRIC(pageserver_requests_sent_total); + APPEND_METRIC(pageserver_disconnects_total); + APPEND_METRIC(pageserver_send_flushes_total); + APPEND_METRIC(pageserver_open_requests); + APPEND_METRIC(getpage_prefetches_buffered); + + APPEND_METRIC(file_cache_hits_total); + + i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], + "file_cache_read_wait_seconds_count", + "file_cache_read_wait_seconds_sum", + "file_cache_read_wait_seconds_bucket"); + i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], + "file_cache_write_wait_seconds_count", + "file_cache_write_wait_seconds_sum", + "file_cache_write_wait_seconds_bucket"); Assert(i == NUM_METRICS); +#undef APPEND_METRIC +#undef NUM_METRICS + /* NULL entry marks end of array */ metrics[i].name = NULL; metrics[i].value = 0; @@ -216,6 +254,15 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS) return (Datum) 0; } +static inline void +histogram_merge_into(IOHistogram into, IOHistogram from) +{ + into->wait_us_count += from->wait_us_count; + into->wait_us_sum += from->wait_us_sum; + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) + into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno]; +} + PG_FUNCTION_INFO_V1(neon_get_perf_counters); Datum neon_get_perf_counters(PG_FUNCTION_ARGS) @@ -234,10 +281,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) { 
neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; - totals.getpage_wait_us_count += counters->getpage_wait_us_count; - totals.getpage_wait_us_sum += counters->getpage_wait_us_sum; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) - totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno]; + histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist); totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total; totals.getpage_sync_requests_total += counters->getpage_sync_requests_total; totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total; @@ -245,7 +289,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total; totals.pageserver_disconnects_total += counters->pageserver_disconnects_total; totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total; + totals.pageserver_open_requests += counters->pageserver_open_requests; + totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered; totals.file_cache_hits_total += counters->file_cache_hits_total; + histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); + histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 49d477c4f8ad..8edc658a3025 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -15,17 +15,26 @@ #include "storage/proc.h" #endif -static const uint64 getpage_wait_bucket_thresholds[] = { - 20, 30, 60, 100, /* 0 - 100 us */ +static const uint64 io_wait_bucket_thresholds[] = { + 2, 3, 6, 10, /* 0 us - 10 us */ + 20, 30, 60, 100, /* 10 us - 100 us */ 200, 300, 600, 1000, /* 100 us - 1 ms */ 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ - 20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */ UINT64_MAX, }; -#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds)) +#define NUM_IO_WAIT_BUCKETS (lengthof(io_wait_bucket_thresholds)) + +typedef struct IOHistogramData +{ + uint64 wait_us_count; + uint64 wait_us_sum; + uint64 wait_us_bucket[NUM_IO_WAIT_BUCKETS]; +} IOHistogramData; + +typedef IOHistogramData *IOHistogram; typedef struct { @@ -39,9 +48,7 @@ typedef struct * the backend, but the 'neon_backend_perf_counters' view will convert * them to seconds, to make them more idiomatic as prometheus metrics. */ - uint64 getpage_wait_us_count; - uint64 getpage_wait_us_sum; - uint64 getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS]; + IOHistogramData getpage_hist; /* * Total number of speculative prefetch Getpage requests and synchronous @@ -50,7 +57,11 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; - /* XXX: It's not clear to me when these misses happen. */ + /* + * Total number of readahead misses; consisting of either prefetches that + * don't satisfy the LSN bounds, or cases where no readahead was issued + * for the read. + */ uint64 getpage_prefetch_misses_total; /* @@ -80,6 +91,16 @@ typedef struct * this can be smaller than pageserver_requests_sent_total. */ uint64 pageserver_send_flushes_total; + + /* + * Number of open requests to PageServer. 
+ */ + uint64 pageserver_open_requests; + + /* + * Number of unused prefetches currently cached in this backend. + */ + uint64 getpage_prefetches_buffered; /* * Number of requests satisfied from the LFC. @@ -91,6 +112,9 @@ typedef struct */ uint64 file_cache_hits_total; + /* LFC I/O time buckets */ + IOHistogramData file_cache_read_hist; + IOHistogramData file_cache_write_hist; } neon_per_backend_counters; /* Pointer to the shared memory array of neon_per_backend_counters structs */ @@ -111,6 +135,8 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared; #endif extern void inc_getpage_wait(uint64 latency); +extern void inc_page_cache_read_wait(uint64 latency); +extern void inc_page_cache_write_wait(uint64 latency); extern Size NeonPerfCountersShmemSize(void); extern void NeonPerfCountersShmemInit(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3d9d9285dfc6..f46df7f70ac4 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -488,6 +488,11 @@ readahead_buffer_resize(int newsize, void *extra) newPState->n_unused -= 1; } + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) { prefetch_set_unused(end); @@ -621,6 +626,8 @@ prefetch_read(PrefetchRequest *slot) MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; /* update slot state */ slot->status = PRFS_RECEIVED; @@ -674,6 +681,15 @@ prefetch_on_ps_disconnect(void) prefetch_set_unused(ring_index); } + + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; } /* @@ -706,6 +722,9 @@ prefetch_set_unused(uint64 ring_index) MyPState->n_responses_buffered -= 1; MyPState->n_unused += 1; + + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; } else { @@ -820,6 +839,15 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, hashkey.buftag = tag; Retry: + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + min_ring_index = UINT64_MAX; for (int i = 0; i < nblocks; i++) { @@ -1001,6 +1029,9 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, prefetch_do_request(slot, lsns); } + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + Assert(any_hits); Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || @@ -1076,8 +1107,10 @@ page_server_request(void const *req) { /* do nothing */ } + MyNeonCounters->pageserver_open_requests++; consume_prefetch_responses(); resp = page_server->receive(shard_no); + MyNeonCounters->pageserver_open_requests--; } PG_CATCH(); { @@ -1086,6 +1119,8 @@ page_server_request(void const *req) * point, but this currently seems fine for now. 
*/ page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + PG_RE_THROW(); } PG_END_TRY(); From 0fc4ada3ca9b1eb264bff9c6407ad050722578ae Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 14 Oct 2024 21:12:43 +0100 Subject: [PATCH 08/38] Switch CI, Storage and Proxy to Debian 12 (Bookworm) (#9170) ## Problem This PR switches CI and Storage to Debain 12 (Bookworm) based images. ## Summary of changes - Add Debian codename (`bookworm`/`bullseye`) to most of docker tags, create un-codenamed images to be used by default - `vm-compute-node-image`: create a separate spec for `bookworm` (we don't need to build cgroups in the future) - `neon-image`: Switch to `bookworm`-based `build-tools` image - Storage components and Proxy use it - CI: run lints and tests on `bookworm`-based `build-tools` image --- .../actions/allure-report-generate/action.yml | 2 +- .../actions/run-python-test-set/action.yml | 2 +- .github/workflows/_build-and-test-locally.yml | 8 +- .github/workflows/build-build-tools-image.yml | 31 ++-- .github/workflows/build_and_test.yml | 136 ++++++++++-------- .github/workflows/neon_extra_builds.yml | 2 +- .github/workflows/pg-clients.yml | 4 +- .github/workflows/pin-build-tools-image.yml | 23 ++- Dockerfile | 4 +- Dockerfile.build-tools | 19 +-- compute/Dockerfile.compute-node | 27 ++-- compute/vm-image-spec-bookworm.yaml | 126 ++++++++++++++++ ...-spec.yaml => vm-image-spec-bullseye.yaml} | 0 13 files changed, 280 insertions(+), 104 deletions(-) create mode 100644 compute/vm-image-spec-bookworm.yaml rename compute/{vm-image-spec.yaml => vm-image-spec-bullseye.yaml} (100%) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 11adc8df86ec..2bdb7277194e 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -183,7 +183,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 330e875d566a..037b9aeb1ec4 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -88,7 +88,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 5fc6aa247a3e..3aa671fab103 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -124,28 +124,28 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 
'Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v17 build id: cache_pg_17 uses: actions/cache@v4 with: path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 130753833dd6..0f0527657974 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -43,6 +43,7 @@ jobs: strategy: matrix: + debian-version: [ bullseye, bookworm ] arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -81,22 +82,22 @@ jobs: - uses: docker/build-push-action@v6 with: + file: Dockerfile.build-tools context: . 
provenance: false push: true pull: true - file: Dockerfile.build-tools - cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} - tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} + build-args: | + DEBIAN_VERSION=${{ matrix.debian-version }} + cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }} + tags: | + neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.debian-version }}-${{ matrix.arch }} merge-images: needs: [ build-image ] runs-on: ubuntu-22.04 - env: - IMAGE_TAG: ${{ inputs.image-tag }} - steps: - uses: docker/login-action@v3 with: @@ -104,7 +105,17 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Create multi-arch image + env: + DEFAULT_DEBIAN_VERSION: bullseye + IMAGE_TAG: ${{ inputs.image-tag }} run: | - docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \ - neondatabase/build-tools:${IMAGE_TAG}-x64 \ - neondatabase/build-tools:${IMAGE_TAG}-arm64 + for debian_version in bullseye bookworm; do + tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}") + if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}") + fi + + docker buildx imagetools create "${tags[@]}" \ + neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-x64 \ + neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-arm64 + done diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e7193cfe1967..51f6975e63c4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -92,7 +92,7 @@ jobs: needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -106,7 +106,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -181,7 +181,7 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -261,7 +261,7 @@ jobs: uses: ./.github/workflows/_build-and-test-locally.yml with: arch: ${{ matrix.arch }} - build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds @@ -276,7 +276,7 @@ jobs: needs: [ 
check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -289,7 +289,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -309,7 +309,7 @@ jobs: needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -367,7 +367,7 @@ jobs: runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -415,7 +415,7 @@ jobs: needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -559,15 +559,16 @@ jobs: ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm + DEBIAN_VERSION=bookworm provenance: false push: true pull: true file: Dockerfile - cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: needs: [ neon-image-arch, tag ] @@ -582,8 +583,9 @@ jobs: - name: Create multi-arch image run: | docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64 + -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 - uses: docker/login-action@v3 with: @@ -604,17 +606,16 @@ jobs: version: # Much data was already generated on old PG versions with bullseye's # libraries, the locales of 
which can cause data incompatibilities. - # However, new PG versions should check if they can be built on newer - # images, as that reduces the support burden of old and ancient - # distros. + # However, new PG versions should be build on newer images, + # as that reduces the support burden of old and ancient distros. - pg: v14 - debian: bullseye-slim + debian: bullseye - pg: v15 - debian: bullseye-slim + debian: bullseye - pg: v16 - debian: bullseye-slim + debian: bullseye - pg: v17 - debian: bookworm-slim + debian: bookworm arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -659,16 +660,16 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg == 'v16' @@ -679,17 +680,17 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build 
compute-tools image # compute-tools are Postgres independent, so build it only once @@ -704,14 +705,16 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: needs: [ compute-node-image-arch, tag ] @@ -719,7 +722,16 @@ jobs: strategy: matrix: - version: [ v14, v15, v16, v17 ] + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + debian: bullseye + - pg: v15 + debian: bullseye + - pg: v16 + debian: bullseye + - pg: v17 + debian: bookworm steps: - uses: docker/login-action@v3 @@ -729,23 +741,26 @@ jobs: - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image - if: matrix.version == 'v16' + if: matrix.version.pg == 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch compute-tools image - if: matrix.version == 'v17' + if: matrix.version.pg == 'v16' run: | docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ - 
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64 + -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - uses: docker/login-action@v3 with: @@ -753,13 +768,13 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR + - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Push multi-arch compute-tools image to ECR - if: matrix.version == 'v17' + if: matrix.version.pg == 'v16' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} @@ -770,7 +785,16 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16, v17 ] + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + debian: bullseye + - pg: v15 + debian: bullseye + - pg: v16 + debian: bullseye + - pg: v17 + debian: bookworm env: VM_BUILDER_VERSION: v0.35.0 @@ -792,18 +816,18 @@ jobs: # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ - -spec=compute/vm-image-spec.yaml \ - -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ + -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 140aac032a99..287c9ea281e4 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -155,7 +155,7 @@ jobs: github.ref_name == 'main' runs-on: [ self-hosted, large ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ 
secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 23a2e3876c6c..df40b5bedab3 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -55,7 +55,7 @@ jobs: runs-on: ubuntu-22.04 container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -150,7 +150,7 @@ jobs: runs-on: ubuntu-22.04 container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 2e79498fc441..c196d07d3e51 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -71,7 +71,6 @@ jobs: steps: - uses: docker/login-action@v3 - with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -94,8 +93,22 @@ jobs: az acr login --name=neoneastus2 - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR + env: + DEFAULT_DEBIAN_VERSION: bullseye run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ - -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ - -t neondatabase/build-tools:${TO_TAG} \ - neondatabase/build-tools:${FROM_TAG} + for debian_version in bullseye bookworm; do + tags=() + + tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") + + if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + tags+=("-t" "neondatabase/build-tools:${TO_TAG}") + tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") + tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") + fi + + docker buildx imagetools create "${tags[@]}" \ + neondatabase/build-tools:${FROM_TAG}-${debian_version} + done diff --git a/Dockerfile b/Dockerfile index bdb76a4f4fa9..785dd4598e0a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,8 @@ ARG IMAGE=build-tools ARG TAG=pinned ARG DEFAULT_PG_VERSION=17 ARG STABLE_PG_VERSION=16 +ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build @@ -57,7 +59,7 @@ RUN set -e \ # Build final image # -FROM debian:bullseye-slim +FROM debian:${DEBIAN_FLAVOR} ARG DEFAULT_PG_VERSION WORKDIR /data diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d8bcacf22867..54e913425772 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,12 +1,7 @@ -FROM debian:bullseye-slim +ARG DEBIAN_VERSION=bullseye -# Use ARG as a build-time environment variable here to allow. -# It's not supposed to be set outside. -# Alternatively it can be obtained using the following command -# ``` -# . 
/etc/os-release && echo "${VERSION_CODENAME}" -# ``` -ARG DEBIAN_VERSION_CODENAME=bullseye +FROM debian:${DEBIAN_VERSION}-slim +ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home @@ -42,14 +37,14 @@ RUN set -e \ libseccomp-dev \ libsqlite3-dev \ libssl-dev \ - libstdc++-10-dev \ + $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && libstdc++-10-dev || libstdc++-11-dev) \ libtool \ libxml2-dev \ libxmlsec1-dev \ libxxhash-dev \ lsof \ make \ - netcat \ + netcat-openbsd \ net-tools \ openssh-client \ parallel \ @@ -78,7 +73,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ # LLVM ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ @@ -86,7 +81,7 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ # Install docker RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ - && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \ && apt update \ && apt install -y docker-ce docker-ce-cli \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 15afb9897fe6..91528618dad8 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -3,7 +3,8 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG -ARG DEBIAN_FLAVOR=bullseye-slim +ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ######################################################################################### # @@ -11,20 +12,23 @@ ARG DEBIAN_FLAVOR=bullseye-slim # ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS build-deps -ARG DEBIAN_FLAVOR +ARG DEBIAN_VERSION -RUN case $DEBIAN_FLAVOR in \ +RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. # Install newer version (3.25) from backports. 
- bullseye*) \ + bullseye) \ echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ ;; \ # Version-specific installs for Bookworm (PG17): - bookworm*) \ + bookworm) \ VERSION_INSTALLS="cmake"; \ ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ + ;; \ esac && \ apt update && \ apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \ @@ -1091,7 +1095,6 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS compute-tools-image -ARG DEBIAN_FLAVOR COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl @@ -1102,7 +1105,6 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compu ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS pgbouncer -ARG DEBIAN_FLAVOR RUN set -e \ && apt-get update \ && apt-get install --no-install-recommends -y \ @@ -1257,7 +1259,7 @@ ENV PGDATABASE=postgres # ######################################################################################### FROM debian:$DEBIAN_FLAVOR -ARG DEBIAN_FLAVOR +ARG DEBIAN_VERSION # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ @@ -1305,19 +1307,22 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca RUN apt update && \ - case $DEBIAN_FLAVOR in \ + case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # libicu67, locales for collations (including ICU and plpgsql_check) # libgdal28, libproj19 for PostGIS - bullseye*) \ + bullseye) \ VERSION_INSTALLS="libicu67 libgdal28 libproj19"; \ ;; \ # Version-specific installs for Bookworm (PG17): # libicu72, locales for collations (including ICU and plpgsql_check) # libgdal32, libproj25 for PostGIS - bookworm*) \ + bookworm) \ VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \ ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ + ;; \ esac && \ apt install --no-install-recommends -y \ gdb \ diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml new file mode 100644 index 000000000000..51a55b513f07 --- /dev/null +++ b/compute/vm-image-spec-bookworm.yaml @@ -0,0 +1,126 @@ +# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image. +--- +commands: + - name: cgconfigparser + user: root + sysvInitAction: sysinit + shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. 
+ - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' + - name: chmod-set-disk-quota + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/set-disk-quota' + - name: pgbouncer + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + - name: local_proxy + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + - name: postgres-exporter + user: nobody + sysvInitAction: respawn + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + - name: sql-exporter + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' +shutdownHook: | + su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' +files: + - filename: compute_ctl-sudoers + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), + # regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + - filename: cgconfig.conf + content: | + # Configuration for cgroups in VM compute nodes + group neon-postgres { + perm { + admin { + uid = postgres; + } + task { + gid = users; + } + } + memory {} + } +build: | + # Build cgroup-tools + # + # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically + # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor + # requires cgroup v2, so we'll build cgroup-tools ourselves. + # + # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2, + # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset + # for debian version migration. + # + FROM debian:bookworm-slim as libcgroup-builder + ENV LIBCGROUP_VERSION=v2.0.3 + + RUN set -exu \ + && apt update \ + && apt install --no-install-recommends -y \ + git \ + ca-certificates \ + automake \ + cmake \ + make \ + gcc \ + byacc \ + flex \ + libtool \ + libpam0g-dev \ + && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \ + && INSTALL_DIR="/libcgroup-install" \ + && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \ + && cd libcgroup \ + # extracted from bootstrap.sh, with modified flags: + && (test -d m4 || mkdir m4) \ + && autoreconf -fi \ + && rm -rf autom4te.cache \ + && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ + # actually build the thing... + && make install +merge: | + # tweak nofile limits + RUN set -e \ + && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \ + && test ! -e /etc/security || ( \ + echo '* - nofile 1048576' >>/etc/security/limits.conf \ + && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ + ) + + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. 
+ # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. + RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers + + COPY cgconfig.conf /etc/cgconfig.conf + + RUN set -e \ + && chmod 0644 /etc/cgconfig.conf + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ + COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ + COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec-bullseye.yaml similarity index 100% rename from compute/vm-image-spec.yaml rename to compute/vm-image-spec-bullseye.yaml From 73c6626b381bd013064d72332c3a0a372c555877 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 15 Oct 2024 09:31:18 +0100 Subject: [PATCH 09/38] pageserver: stabilize & refine controller scale test (#8971) ## Problem We were seeing timeouts on migrations in this test. The test unfortunately tends to saturate local storage, which is shared between the pageservers and the control plane database, which makes the test kind of unrealistic. We will also want to increase the scale of this test, so it's worth fixing that. ## Summary of changes - Instead of randomly creating timelines at the same time as the other background operations, explicitly identify a subset of tenant which will have timelines, and create them at the start. This avoids pageservers putting a lot of load on the test node during the main body of the test. - Adjust the tenants created to create some number of 8 shard tenants and the rest 1 shard tenants, instead of just creating a lot of 2 shard tenants. - Use archival_config to exercise tenant-mutating operations, instead of using timeline creation for this. - Adjust reconcile_until_idle calls to avoid waiting 5 seconds between calls, which causes timelines with large shard count tenants. 
- Fix a pageserver bug where calls to archival_config during activation get 404 --- libs/utils/src/http/error.rs | 7 + pageserver/src/http/routes.rs | 2 + proxy/src/serverless/http_util.rs | 4 + storage_controller/src/service.rs | 5 + test_runner/fixtures/neon_fixtures.py | 6 +- .../test_storage_controller_scale.py | 225 ++++++++++++++---- 6 files changed, 204 insertions(+), 45 deletions(-) diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 5e05e4e713d0..02fc9e3b9972 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -28,6 +28,9 @@ pub enum ApiError { #[error("Resource temporarily unavailable: {0}")] ResourceUnavailable(Cow<'static, str>), + #[error("Too many requests: {0}")] + TooManyRequests(Cow<'static, str>), + #[error("Shutting down")] ShuttingDown, @@ -73,6 +76,10 @@ impl ApiError { err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, + ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2985ab1efb68..1079d8df29fb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -715,6 +715,8 @@ async fn timeline_archival_config_handler( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant .apply_timeline_archival_config(timeline_id, request_data.state, ctx) .await?; diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index 87a72ec5f04a..c1c5764d1780 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -41,6 +41,10 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, + ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cc735dc27e80..cedee545347e 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -246,6 +246,11 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { // storage controller's auth configuration. ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) } + mgmt_api::Error::ApiError(status @ StatusCode::TOO_MANY_REQUESTS, msg) => { + // Pass through 429 errors: if pageserver is asking us to wait + retry, we in + // turn ask our clients to wait + retry + ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } mgmt_api::Error::ApiError(status, msg) => { // Presume general case of pageserver API errors is that we tried to do something // that can't be done right now. 
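For context: the `neon_fixtures.py` hunk that follows only shows the lines of `reconcile_until_idle` that changed. A minimal sketch of what the whole polling loop plausibly looks like after this change is below; everything past the visible `if n == 0:` check, including the exact timeout error, is an assumption based on the new parameters, not code from this patch.

```python
import time

def reconcile_until_idle(self, timeout_secs=30, max_interval=5):
    start_at = time.time()
    n = 1
    delay_sec = 0.1           # poll quickly at first...
    delay_max = max_interval  # ...but never sleep longer than the caller allows
    while n > 0:
        n = self.reconcile_all()  # number of shards that still had pending work
        if n == 0:
            break
        if time.time() - start_at > timeout_secs:
            raise RuntimeError("reconcile_until_idle: timed out waiting for quiescence")
        time.sleep(delay_sec)
        delay_sec = min(delay_sec * 2, delay_max)  # capped exponential back-off
```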
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 059707c8ed42..a313ac2ed360 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1986,11 +1986,11 @@ def reconcile_all(self): log.info(f"reconcile_all waited for {n} shards") return n - def reconcile_until_idle(self, timeout_secs=30): + def reconcile_until_idle(self, timeout_secs=30, max_interval=5): start_at = time.time() n = 1 - delay_sec = 0.5 - delay_max = 5 + delay_sec = 0.1 + delay_max = max_interval while n > 0: n = self.reconcile_all() if n == 0: diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 452a85671409..d2eba751f8f2 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -4,9 +4,10 @@ import random import time from collections import defaultdict +from enum import Enum import pytest -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -34,6 +35,7 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[ if tenant_placement[tid]["intent"]["attached"] == tenant_placement[tid]["observed"]["attached"] } + assert len(matching) == total_shards attached_per_node: defaultdict[str, int] = defaultdict(int) @@ -107,15 +109,48 @@ def test_storage_controller_many_tenants( ps.allowed_errors.append(".*request was dropped before completing.*") # Total tenants - tenant_count = 4000 + small_tenant_count = 7800 + large_tenant_count = 200 + tenant_count = small_tenant_count + large_tenant_count + large_tenant_shard_count = 8 + total_shards = small_tenant_count + large_tenant_count * large_tenant_shard_count + + # A small stripe size to encourage all shards to get some data + stripe_size = 1 + + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. + rng = random.Random(1234) + + class Tenant: + def __init__(self): + # Tenants may optionally contain a timeline + self.timeline_id = None - # Shards per tenant - shard_count = 2 - stripe_size = 1024 + # Tenants may be marked as 'large' to get multiple shard during creation phase + self.large = False - total_shards = tenant_count * shard_count + tenant_ids = list(TenantId.generate() for _i in range(0, tenant_count)) + tenants = dict((tid, Tenant()) for tid in tenant_ids) - tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + # We will create timelines in only a subset of tenants, because creating timelines + # does many megabytes of IO, and we want to densely simulate huge tenant counts on + # a single test node. 
+ tenant_timelines_count = 100 + + # These lists are maintained for use with rng.choice + tenants_with_timelines = list(rng.sample(tenants.keys(), tenant_timelines_count)) + tenants_without_timelines = list( + tenant_id for tenant_id in tenants if tenant_id not in tenants_with_timelines + ) + + # For our sharded tenants, we will make half of them with timelines and half without + assert large_tenant_count >= tenant_timelines_count / 2 + for tenant_id in tenants_with_timelines[0 : large_tenant_count // 2]: + tenants[tenant_id].large = True + + for tenant_id in tenants_without_timelines[0 : large_tenant_count // 2]: + tenants[tenant_id].large = True virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -125,23 +160,39 @@ def check_memory(): rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") assert rss is not None - log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") - assert rss < expect_memory_per_shard * shard_count * tenant_count - - # We use a fixed seed to make the test somewhat reproducible: we want a randomly - # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. - rng = random.Random(1234) + log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") + assert rss < expect_memory_per_shard * total_shards # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. api_concurrency = 135 - # We will create tenants directly via API, not via neon_local, to avoid any false - # serialization of operations in neon_local (it e.g. loads/saves a config file on each call) - with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: - futs = [] + # A different concurrency limit for bulk tenant+timeline creations: these do I/O and will + # start timing on test nodes if we aren't a bit careful. + create_concurrency = 16 + + class Operation(str, Enum): + TIMELINE_OPS = "timeline_ops" + SHARD_MIGRATE = "shard_migrate" + TENANT_PASSTHROUGH = "tenant_passthrough" + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + + # Creation phase: make a lot of tenants, and create timelines in a subset of them + # This executor has concurrency set modestly, to avoid overloading pageservers with timeline creations. + with concurrent.futures.ThreadPoolExecutor(max_workers=create_concurrency) as executor: + tenant_create_futs = [] t1 = time.time() - for tenant_id in tenants: + + for tenant_id, tenant in tenants.items(): + if tenant.large: + shard_count = large_tenant_shard_count + else: + shard_count = 1 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. 
loads/saves a config file on each call) f = executor.submit( env.storage_controller.tenant_create, tenant_id, @@ -152,44 +203,106 @@ def check_memory(): tenant_config={"heatmap_period": "10s"}, placement_policy={"Attached": 1}, ) - futs.append(f) + tenant_create_futs.append(f) - # Wait for creations to finish - for f in futs: + # Wait for tenant creations to finish + for f in tenant_create_futs: f.result() log.info( f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" ) - run_ops = api_concurrency * 4 - assert run_ops < len(tenants) - op_tenants = list(tenants)[0:run_ops] + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Create timelines in those tenants which are going to get one + t1 = time.time() + timeline_create_futs = [] + for tenant_id in tenants_with_timelines: + timeline_id = TimelineId.generate() + tenants[tenant_id].timeline_id = timeline_id + f = executor.submit( + env.storage_controller.pageserver_api().timeline_create, + PgVersion.NOT_SET, + tenant_id, + timeline_id, + ) + timeline_create_futs.append(f) + + for f in timeline_create_futs: + f.result() + log.info( + f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" + ) + + # Plan operations: ensure each tenant with a timeline gets at least + # one of each operation type. Then add other tenants to make up the + # numbers. + ops_plan = [] + for tenant_id in tenants_with_timelines: + ops_plan.append((tenant_id, Operation.TIMELINE_OPS)) + ops_plan.append((tenant_id, Operation.SHARD_MIGRATE)) + ops_plan.append((tenant_id, Operation.TENANT_PASSTHROUGH)) + + # Fill up remaining run_ops with migrations of tenants without timelines + other_migrate_tenants = rng.sample(tenants_without_timelines, run_ops - len(ops_plan)) + + for tenant_id in other_migrate_tenants: + ops_plan.append( + ( + tenant_id, + rng.choice([Operation.SHARD_MIGRATE, Operation.TENANT_PASSTHROUGH]), + ) + ) + + # Exercise phase: pick pseudo-random operations to do on the tenants + timelines + # This executor has concurrency high enough to stress the storage controller API. + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + + def exercise_timeline_ops(tenant_id, timeline_id): + # A read operation: this requires looking up shard zero and routing there + detail = virtual_ps_http.timeline_detail(tenant_id, timeline_id) + assert detail["timeline_id"] == str(timeline_id) + + # A fan-out write operation to all shards in a tenant. + # - We use a metadata operation rather than something like a timeline create, because + # timeline creations are I/O intensive and this test isn't meant to be a stress test for + # doing lots of concurrent timeline creations. 
+ archival_state = rng.choice( + [TimelineArchivalState.ARCHIVED, TimelineArchivalState.UNARCHIVED] + ) + virtual_ps_http.timeline_archival_config(tenant_id, timeline_id, archival_state) # Generate a mixture of operations and dispatch them all concurrently futs = [] - for tenant_id in op_tenants: - op = rng.choice([0, 1, 2]) - if op == 0: - # A fan-out write operation to all shards in a tenant (timeline creation) + for tenant_id, op in ops_plan: + if op == Operation.TIMELINE_OPS: + op_timeline_id = tenants[tenant_id].timeline_id + assert op_timeline_id is not None + + # Exercise operations that modify tenant scheduling state but require traversing + # the fan-out-to-all-shards functionality. f = executor.submit( - virtual_ps_http.timeline_create, - PgVersion.NOT_SET, + exercise_timeline_ops, tenant_id, - TimelineId.generate(), + op_timeline_id, ) - elif op == 1: + elif op == Operation.SHARD_MIGRATE: # A reconciler operation: migrate a shard. - shard_number = rng.randint(0, shard_count - 1) - tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + desc = env.storage_controller.tenant_describe(tenant_id) + + shard_number = rng.randint(0, len(desc["shards"]) - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, len(desc["shards"])) # Migrate it to its secondary location - desc = env.storage_controller.tenant_describe(tenant_id) dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) - elif op == 2: + elif op == Operation.TENANT_PASSTHROUGH: # A passthrough read to shard zero f = executor.submit(virtual_ps_http.tenant_status, tenant_id) @@ -199,10 +312,18 @@ def check_memory(): for f in futs: f.result() + log.info("Completed mixed operations phase") + # Some of the operations above (notably migrations) might leave the controller in a state where it has # some work to do, for example optimizing shard placement after we do a random migration. Wait for the system # to reach a quiescent state before doing following checks. - env.storage_controller.reconcile_until_idle() + # + # - Set max_interval low because we probably have a significant number of optimizations to complete and would like + # the test to run quickly. + # - Set timeout high because we might be waiting for optimizations that reuqire a secondary + # to warm up, and if we just started a secondary in the previous step, it might wait some time + # before downloading its heatmap + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() check_memory() @@ -213,6 +334,7 @@ def check_memory(): # # We do not require that the system is quiescent already here, although at present in this point in the test # that may be the case. 
+ log.info("Reconciling all & timing") while True: t1 = time.time() reconcilers = env.storage_controller.reconcile_all() @@ -225,6 +347,7 @@ def check_memory(): break # Restart the storage controller + log.info("Restarting controller") env.storage_controller.stop() env.storage_controller.start() @@ -246,7 +369,16 @@ def check_memory(): # Restart pageservers gracefully: this exercises the /re-attach pageserver API # and the storage controller drain and fill API + log.info("Restarting pageservers...") + + # Parameters for how long we expect it to take to migrate all of the tenants from/to + # a node during a drain/fill operation + DRAIN_FILL_TIMEOUT = 240 + DRAIN_FILL_BACKOFF = 5 + for ps in env.pageservers: + log.info(f"Draining pageserver {ps.id}") + t1 = time.time() env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -255,9 +387,10 @@ def check_memory(): ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.PAUSE_FOR_RESTART, - max_attempts=24, - backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Drained pageserver {ps.id} in {time.time() - t1}s") shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -275,6 +408,7 @@ def check_memory(): backoff=1, ) + log.info(f"Filling pageserver {ps.id}") env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -282,16 +416,23 @@ def check_memory(): ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.ACTIVE, - max_attempts=24, - backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Filled pageserver {ps.id} in {time.time() - t1}s") + + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") assert_consistent_balanced_attachments(env, total_shards) - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, From ec4cc30de9bc1140761a7f8b7e4a5886c4d3b4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 15 Oct 2024 11:46:51 +0200 Subject: [PATCH 10/38] Shut down timelines during offload and add offload tests (#9289) Add a test for timeline offloading, and subsequent unoffloading. Also adds a manual endpoint, and issues a proper timeline shutdown during offloading which prevents a pageserver hang at shutdown. Part of #8088. 
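A rough sketch of how a test can drive the new manual endpoint end to end. The `timeline_archival_config` and `timeline_detail` helpers already appear elsewhere in this series; the `timeline_offload` helper is hypothetical, and the ordering (archive first so the timeline is offload-eligible, un-archive to trigger unoffloading) is an assumption based on the handler's preconditions rather than the actual test added in this patch.

```python
from fixtures.common_types import TimelineArchivalState

def exercise_manual_offload(ps_http, tenant_id, timeline_id):
    # Archive first: the handler only offloads timelines for which can_offload()
    # is true and which have no attached children.
    ps_http.timeline_archival_config(tenant_id, timeline_id, TimelineArchivalState.ARCHIVED)

    # PUT /v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload
    # (testing-only route; `timeline_offload` is a hypothetical fixture helper)
    ps_http.timeline_offload(tenant_id, timeline_id)

    # Un-archiving an offloaded timeline is expected to load it back
    # ("unoffloading"), after which it is reachable through the normal APIs again.
    ps_http.timeline_archival_config(tenant_id, timeline_id, TimelineArchivalState.UNARCHIVED)
    assert ps_http.timeline_detail(tenant_id, timeline_id)["timeline_id"] == str(timeline_id)
```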
--- pageserver/src/http/routes.rs | 49 ++++++++++++ pageserver/src/tenant.rs | 29 +++++++ pageserver/src/tenant/timeline/offload.rs | 3 + test_runner/fixtures/pageserver/http.py | 16 ++++ test_runner/regress/test_timeline_archive.py | 84 ++++++++++++++++++++ 5 files changed, 181 insertions(+) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1079d8df29fb..dd403c1cefb8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -77,6 +77,7 @@ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; @@ -325,6 +326,7 @@ impl From for ApiError { match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + Cancelled => ApiError::ShuttingDown, e @ HasArchivedParent(_) => { ApiError::PreconditionFailed(e.to_string().into_boxed_str()) } @@ -1785,6 +1787,49 @@ async fn timeline_compact_handler( .await } +// Run offload immediately on given timeline. +async fn timeline_offload_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + if tenant.get_offloaded_timeline(timeline_id).is_ok() { + return json_response(StatusCode::OK, ()); + } + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + if !tenant.timeline_has_no_attached_children(timeline_id) { + return Err(ApiError::PreconditionFailed( + "timeline has attached children".into(), + )); + } + if !timeline.can_offload() { + return Err(ApiError::PreconditionFailed( + "Timeline::can_offload() returned false".into(), + )); + } + offload_timeline(&tenant, &timeline) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) + } + .instrument(info_span!("manual_timeline_offload", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run checkpoint immediately on given timeline. 
async fn timeline_checkpoint_handler( request: Request, @@ -3008,6 +3053,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload", + |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 397778d4c834..44d1bb74ca34 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -619,6 +619,9 @@ pub enum TimelineArchivalError { #[error("Timeout")] Timeout, + #[error("Cancelled")] + Cancelled, + #[error("ancestor is archived: {}", .0)] HasArchivedParent(TimelineId), @@ -637,6 +640,7 @@ impl Debug for TimelineArchivalError { match self { Self::NotFound => write!(f, "NotFound"), Self::Timeout => write!(f, "Timeout"), + Self::Cancelled => write!(f, "Cancelled"), Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), Self::HasUnarchivedChildren(c) => { f.debug_tuple("HasUnarchivedChildren").field(c).finish() @@ -1552,6 +1556,7 @@ impl Tenant { timeline_id: TimelineId, ctx: RequestContext, ) -> Result, TimelineArchivalError> { + info!("unoffloading timeline"); let cancel = self.cancel.clone(); let timeline_preload = self .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) @@ -1566,6 +1571,7 @@ impl Tenant { error!(%timeline_id, "index_part not found on remote"); return Err(TimelineArchivalError::NotFound); } + Err(DownloadError::Cancelled) => return Err(TimelineArchivalError::Cancelled), Err(e) => { // Some (possibly ephemeral) error happened during index_part download. warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); @@ -1603,6 +1609,7 @@ impl Tenant { if offloaded_timelines.remove(&timeline_id).is_none() { warn!("timeline already removed from offloaded timelines"); } + info!("timeline unoffloading complete"); Ok(Arc::clone(timeline)) } else { warn!("timeline not available directly after attach"); @@ -1683,6 +1690,21 @@ impl Tenant { Ok(()) } + pub fn get_offloaded_timeline( + &self, + timeline_id: TimelineId, + ) -> Result, GetTimelineError> { + self.timelines_offloaded + .lock() + .unwrap() + .get(&timeline_id) + .map(Arc::clone) + .ok_or(GetTimelineError::NotFound { + tenant_id: self.tenant_shard_id, + timeline_id, + }) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -2218,6 +2240,13 @@ impl Tenant { } } + pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { + let timelines = self.timelines.lock().unwrap(); + !timelines + .iter() + .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(timeline_id)) + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index fb906d906b41..7e6084baaf70 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -19,6 +19,9 @@ pub(crate) async fn offload_timeline( return Ok(()); }; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
+ timeline.shutdown(super::ShutdownMode::Hard).await; + // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index aa4435af4e9d..18d65cb7de4a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -583,6 +583,22 @@ def timeline_unblock_gc( log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) + def timeline_offload( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + ): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting offload: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/offload", + ) + log.info(f"Got offload request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 841707d32e6b..971cc57a1cfa 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -6,6 +6,7 @@ NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until @pytest.mark.parametrize("shard_count", [0, 4]) @@ -114,3 +115,86 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): leaf_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) + + +@pytest.mark.parametrize("manual_offload", [False, True]) +def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, initial_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s" if manual_offload else "1s", + } + ) + + # Create two branches and archive them + parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id) + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + ) + + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded(timeline_id: TimelineId) -> bool: + return ( + env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") + is not None + ) + + if manual_offload: + with pytest.raises( + PageserverApiException, + match="timeline has attached children", + ): + # This only tests the (made for testing only) http handler, + # but still demonstrates the constraints we have. 
+ ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + + def parent_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + assert timeline_offloaded(parent_timeline_id) + + def leaf_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) + assert timeline_offloaded(leaf_timeline_id) + + wait_until(30, 1, leaf_offloaded) + wait_until(30, 1, parent_offloaded) + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is False + + assert not timeline_offloaded(initial_timeline_id) From d92d36a315f955cd39bc6f6b0948bae25ed195ad Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 15 Oct 2024 13:13:57 +0100 Subject: [PATCH 11/38] [local_proxy] update api for pg_session_jwt (#9359) pg_session_jwt now: 1. Sets the JWK in a PGU_BACKEND session guc, no longer in the init() function. 2. JWK no longer needs the kid. --- Cargo.lock | 7 +- Cargo.toml | 1 + compute/Dockerfile.compute-node | 4 +- proxy/Cargo.toml | 3 +- proxy/src/serverless/backend.rs | 49 ++++---- proxy/src/serverless/local_conn_pool.rs | 143 ++++++++++++++++-------- workspace_hack/Cargo.toml | 6 +- 7 files changed, 139 insertions(+), 74 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5edf5cf7b4d5..7e772814ec57 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2695,6 +2695,7 @@ checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", "hashbrown 0.14.5", + "serde", ] [[package]] @@ -2794,9 +2795,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" @@ -4296,6 +4297,7 @@ dependencies = [ "indexmap 2.0.1", "ipnet", "itertools 0.10.5", + "itoa", "jose-jwa", "jose-jwk", "lasso", @@ -7307,6 +7309,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "indexmap 1.9.3", + "indexmap 2.0.1", "itertools 0.12.1", "lazy_static", "libc", diff --git a/Cargo.toml b/Cargo.toml index dde80f502087..a1a974b33b48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,6 +107,7 @@ indexmap = "2" indoc = "2" ipnet = "2.9.0" itertools = "0.10" +itoa = "1.0.11" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 91528618dad8..412c64eda497 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -929,8 +929,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \ - echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \ + echo 
"5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 963fb94a7de9..e25d2fcbab03 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -42,9 +42,10 @@ hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } -indexmap.workspace = true +indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true +itoa.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 2b060af9e1e3..927854897f24 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -2,8 +2,9 @@ use std::{io, sync::Arc, time::Duration}; use async_trait::async_trait; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; +use p256::{ecdsa::SigningKey, elliptic_curve::JwkEcKey}; +use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; -use tokio_postgres::types::ToSql; use tracing::{debug, field::display, info}; use crate::{ @@ -267,50 +268,58 @@ impl PoolingBackend { auth::Backend::Local(local) => local.node_info.clone(), }; + let (key, jwk) = create_random_jwk(); + let config = node_info .config .user(&conn_info.user_info.user) - .dbname(&conn_info.dbname); + .dbname(&conn_info.dbname) + .options(&format!( + "-c pg_session_jwt.jwk={}", + serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") + )); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; drop(pause); - tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + let pid = client.get_process_id(); + tracing::Span::current().record("pid", pid); - let handle = local_conn_pool::poll_client( + let mut handle = local_conn_pool::poll_client( self.local_pool.clone(), ctx, conn_info, client, connection, + key, conn_id, node_info.aux.clone(), ); - let kid = handle.get_client().get_process_id() as i64; - let jwk = p256::PublicKey::from(handle.key().verifying_key()).to_jwk(); - - debug!(kid, ?jwk, "setting up backend session state"); + { + let (client, mut discard) = handle.inner(); + debug!("setting up backend session state"); - // initiates the auth session - handle - .get_client() - .query( - "select auth.init($1, $2);", - &[ - &kid as &(dyn ToSql + Sync), - &tokio_postgres::types::Json(jwk), - ], - ) - .await?; + // initiates the auth session + if let Err(e) = client.query("select auth.init()", &[]).await { + discard.discard(); + return Err(e.into()); + } - info!(?kid, "backend session state init"); + info!("backend session state initialized"); + } Ok(handle) } } +fn create_random_jwk() -> (SigningKey, JwkEcKey) { + let key = SigningKey::random(&mut OsRng); + let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk(); + (key, jwk) +} + #[derive(Debug, thiserror::Error)] pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent 
state")] diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1dde5952e103..4ab14ad35f89 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,9 +1,9 @@ use futures::{future::poll_fn, Future}; +use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; -use rand::rngs::OsRng; -use serde_json::Value; +use serde_json::value::RawValue; use signature::Signer; use std::task::{ready, Poll}; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; @@ -12,14 +12,13 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; -use typed_json::json; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{context::RequestMonitoring, DbName, RoleName}; -use tracing::{debug, error, warn, Span}; +use tracing::{error, warn, Span}; use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; @@ -245,12 +244,14 @@ impl LocalConnPool { } } +#[allow(clippy::too_many_arguments)] pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, conn_info: ConnInfo, client: tokio_postgres::Client, mut connection: tokio_postgres::Connection, + key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> LocalClient { @@ -346,8 +347,6 @@ pub(crate) fn poll_client( } .instrument(span)); - let key = SigningKey::random(&mut OsRng); - let inner = ClientInner { inner: client, session: tx, @@ -430,13 +429,6 @@ impl LocalClient { let inner = inner.as_mut().expect("client inner should not be removed"); (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn key(&self) -> &SigningKey { - let inner = &self - .inner - .as_ref() - .expect("client inner should not be removed"); - &inner.key - } } impl LocalClient { @@ -445,25 +437,9 @@ impl LocalClient { .inner .as_mut() .expect("client inner should not be removed"); - inner.jti += 1; - - let kid = inner.inner.get_process_id(); - let header = json!({"kid":kid}).to_string(); - - let mut payload = serde_json::from_slice::>(payload) - .map_err(HttpConnError::JwtPayloadError)?; - payload.insert("jti".to_string(), Value::Number(inner.jti.into())); - let payload = Value::Object(payload).to_string(); - debug!( - kid, - jti = inner.jti, - ?header, - ?payload, - "signing new ephemeral JWT" - ); - - let token = sign_jwt(&inner.key, header, payload); + inner.jti += 1; + let token = resign_jwt(&inner.key, payload, inner.jti)?; // initiates the auth session inner.inner.simple_query("discard all").await?; @@ -475,20 +451,74 @@ impl LocalClient { ) .await?; - info!(kid, jti = inner.jti, "user session state init"); + let pid = inner.inner.get_process_id(); + info!(pid, jti = inner.jti, "user session state init"); Ok(()) } } -fn sign_jwt(sk: &SigningKey, header: String, payload: String) -> String { - let header = Base64UrlUnpadded::encode_string(header.as_bytes()); - let payload = Base64UrlUnpadded::encode_string(payload.as_bytes()); +/// implements relatively efficient in-place json object key upserting +/// +/// only supports top-level keys +fn upsert_json_object( + payload: &[u8], + key: &str, + value: &RawValue, +) -> Result { + let mut payload = serde_json::from_slice::>(payload)?; + payload.insert(key, 
value); + serde_json::to_string(&payload) +} + +fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result { + let mut buffer = itoa::Buffer::new(); + + // encode the jti integer to a json rawvalue + let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)).unwrap(); + + // update the jti in-place + let payload = + upsert_json_object(payload, "jti", jti).map_err(HttpConnError::JwtPayloadError)?; + + // sign the jwt + let token = sign_jwt(sk, payload.as_bytes()); + + Ok(token) +} + +fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { + let header_len = 20; + let payload_len = Base64UrlUnpadded::encoded_len(payload); + let signature_len = Base64UrlUnpadded::encoded_len(&[0; 64]); + let total_len = header_len + payload_len + signature_len + 2; + + let mut jwt = String::with_capacity(total_len); + let cap = jwt.capacity(); - let message = format!("{header}.{payload}"); - let sig: Signature = sk.sign(message.as_bytes()); - let base64_sig = Base64UrlUnpadded::encode_string(&sig.to_bytes()); - format!("{message}.{base64_sig}") + // we only need an empty header with the alg specified. + // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" + jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + + // encode the jwt payload in-place + base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); + + // create the signature from the encoded header || payload + let sig: Signature = sk.sign(jwt.as_bytes()); + + jwt.push('.'); + + // encode the jwt signature in-place + base64::encode_config_buf(sig.to_bytes(), base64::URL_SAFE_NO_PAD, &mut jwt); + + debug_assert_eq!( + jwt.len(), + total_len, + "the jwt len should match our expected len" + ); + debug_assert_eq!(jwt.capacity(), cap, "the jwt capacity should not change"); + + jwt } impl Discard<'_, C> { @@ -509,14 +539,6 @@ impl Discard<'_, C> { } impl LocalClient { - pub fn get_client(&self) -> &C { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner - } - fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self @@ -542,3 +564,30 @@ impl Drop for LocalClient { } } } + +#[cfg(test)] +mod tests { + use p256::ecdsa::SigningKey; + use typed_json::json; + + use super::resign_jwt; + + #[test] + fn jwt_token_snapshot() { + let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let data = + json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); + + let jwt = resign_jwt(&key, data.as_bytes(), 2).unwrap(); + + // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. 
+ // In the public-key box, paste the following jwk public key + // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + + // let pub_key = p256::ecdsa::VerifyingKey::from(&key); + // let pub_key = p256::PublicKey::from(pub_key); + // println!("{}", pub_key.to_jwk_string()); + + assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0a90b6b6f763..1347d6ddff64 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -46,7 +46,8 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } -indexmap = { version = "1", default-features = false, features = ["std"] } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } @@ -101,7 +102,8 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } -indexmap = { version = "1", default-features = false, features = ["std"] } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } From fb74c21e8cae23831b7728232772315297463e63 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 15 Oct 2024 15:24:56 +0200 Subject: [PATCH 12/38] proxy: Migrate jwt module away from anyhow (#9361) --- proxy/src/auth/backend/jwt.rs | 188 +++++++++++++++++------ proxy/src/auth/backend/local.rs | 6 +- proxy/src/auth/backend/mod.rs | 3 +- proxy/src/control_plane/provider/mock.rs | 10 +- proxy/src/control_plane/provider/mod.rs | 43 +++++- proxy/src/control_plane/provider/neon.rs | 27 ++-- proxy/src/proxy/tests/mod.rs | 42 ++--- proxy/src/proxy/wake_compute.rs | 2 +- 8 files changed, 228 insertions(+), 93 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 17ab7eda2245..402e59fdb399 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -4,21 +4,20 @@ use std::{ time::{Duration, SystemTime}, }; -use anyhow::{bail, ensure, Context}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; use serde::{de::Visitor, Deserialize, Deserializer}; use signature::Verifier; +use thiserror::Error; use tokio::time::Instant; use crate::{ - context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId, - RoleName, + auth::backend::ComputeCredentialKeys, 
context::RequestMonitoring, + control_plane::errors::GetEndpointJwksError, http::parse_json_body_with_limit, + intern::RoleNameInt, EndpointId, RoleName, }; -use super::ComputeCredentialKeys; - // TODO(conrad): make these configurable. const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); const MIN_RENEW: Duration = Duration::from_secs(30); @@ -32,7 +31,16 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> impl Future>> + Send; + ) -> impl Future, FetchAuthRulesError>> + Send; +} + +#[derive(Error, Debug)] +pub(crate) enum FetchAuthRulesError { + #[error(transparent)] + GetEndpointJwks(#[from] GetEndpointJwksError), + + #[error("JWKs settings for this role were not configured")] + RoleJwksNotConfigured, } pub(crate) struct AuthRule { @@ -122,7 +130,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, auth_rules: &F, - ) -> anyhow::Result> { + ) -> Result, JwtError> { // double check that no one beat us to updating the cache. let now = Instant::now(); let guard = self.cached.load_full(); @@ -188,7 +196,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, fetch: &F, - ) -> Result, anyhow::Error> { + ) -> Result, JwtError> { let now = Instant::now(); let guard = self.cached.load_full(); @@ -243,27 +251,24 @@ impl JwkCacheEntryLock { endpoint: EndpointId, role_name: &RoleName, fetch: &F, - ) -> Result { + ) -> Result { // JWT compact form is defined to be // || . || || . || // where Signature = alg( || . || ); let (header_payload, signature) = jwt .rsplit_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; let (header, payload) = header_payload .split_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; - let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; - let header = serde_json::from_slice::>(&header) - .context("Provided authentication token is not a valid JWT encoding")?; + let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; + let header = serde_json::from_slice::>(&header)?; - let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; - let kid = header.key_id.context("missing key id")?; + let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; let mut guard = self .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch) @@ -281,16 +286,13 @@ impl JwkCacheEntryLock { .renew_jwks(permit, ctx, client, endpoint.clone(), fetch) .await?; } - _ => { - bail!("jwk not found"); - } + _ => return Err(JwtError::JwkNotFound), } }; - ensure!( - jwk.is_supported(&header.algorithm), - "signature algorithm not supported" - ); + if !jwk.is_supported(&header.algorithm) { + return Err(JwtError::SignatureAlgorithmNotSupported); + } match &jwk.key { jose_jwk::Key::Ec(key) => { @@ -299,34 +301,32 @@ impl JwkCacheEntryLock { jose_jwk::Key::Rsa(key) => { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } - key => bail!("unsupported key type {key:?}"), + key => return Err(JwtError::UnsupportedKeyType(key.into())), }; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD) - .context("Provided authentication 
token is not a valid JWT encoding")?; - let payload = serde_json::from_slice::>(&payloadb) - .context("Provided authentication token is not a valid JWT encoding")?; + let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payload = serde_json::from_slice::>(&payloadb)?; tracing::debug!(?payload, "JWT signature valid with claims"); if let Some(aud) = expected_audience { - ensure!( - payload.audience.0.iter().any(|s| s == aud), - "invalid JWT token audience" - ); + if payload.audience.0.iter().all(|s| s != aud) { + return Err(JwtError::InvalidJwtTokenAudience); + } } let now = SystemTime::now(); if let Some(exp) = payload.expiration { - ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired"); + if now >= exp + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenHasExpired); + } } if let Some(nbf) = payload.not_before { - ensure!( - nbf < now + CLOCK_SKEW_LEEWAY, - "JWT token is not yet ready to use" - ); + if nbf >= now + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenNotYetReadyToUse); + } } Ok(ComputeCredentialKeys::JwtPayload(payloadb)) @@ -341,7 +341,7 @@ impl JwkCache { role_name: &RoleName, fetch: &F, jwt: &str, - ) -> Result { + ) -> Result { // try with just a read lock first let key = (endpoint.clone(), role_name.clone()); let entry = self.map.get(&key).as_deref().map(Arc::clone); @@ -357,19 +357,18 @@ impl JwkCache { } } -fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> { +fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> { use ecdsa::Signature; use signature::Verifier; match key.crv { jose_jwk::EcCurves::P256 => { - let pk = - p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?; + let pk = p256::PublicKey::try_from(key).map_err(JwtError::InvalidP256Key)?; let key = p256::ecdsa::VerifyingKey::from(&pk); let sig = Signature::from_slice(sig)?; key.verify(data, &sig)?; } - key => bail!("unsupported ec key type {key:?}"), + key => return Err(JwtError::UnsupportedEcKeyType(key)), } Ok(()) @@ -380,14 +379,14 @@ fn verify_rsa_signature( sig: &[u8], key: &jose_jwk::Rsa, alg: &jose_jwa::Algorithm, -) -> anyhow::Result<()> { +) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; use rsa::{ pkcs1v15::{Signature, VerifyingKey}, RsaPublicKey, }; - let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; + let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; match alg { Algorithm::Signing(Signing::Rs256) => { @@ -395,7 +394,7 @@ fn verify_rsa_signature( let sig = Signature::try_from(sig)?; key.verify(data, &sig)?; } - _ => bail!("invalid RSA signing algorithm"), + _ => return Err(JwtError::InvalidRsaSigningAlgorithm), }; Ok(()) @@ -561,6 +560,99 @@ impl Drop for JwkRenewalPermit<'_> { } } +#[derive(Error, Debug)] +#[non_exhaustive] +pub(crate) enum JwtError { + #[error("jwk not found")] + JwkNotFound, + + #[error("missing key id")] + MissingKeyId, + + #[error("Provided authentication token is not a valid JWT encoding")] + JwtEncoding(#[from] JwtEncodingError), + + #[error("invalid JWT token audience")] + InvalidJwtTokenAudience, + + #[error("JWT token has expired")] + JwtTokenHasExpired, + + #[error("JWT token is not yet ready to use")] + JwtTokenNotYetReadyToUse, + + #[error("invalid P256 key")] + InvalidP256Key(jose_jwk::crypto::Error), + + #[error("invalid RSA key")] + InvalidRsaKey(jose_jwk::crypto::Error), + + #[error("invalid RSA signing algorithm")] + InvalidRsaSigningAlgorithm, + + 
#[error("unsupported EC key type {0:?}")] + UnsupportedEcKeyType(jose_jwk::EcCurves), + + #[error("unsupported key type {0:?}")] + UnsupportedKeyType(KeyType), + + #[error("signature algorithm not supported")] + SignatureAlgorithmNotSupported, + + #[error("signature error: {0}")] + Signature(#[from] signature::Error), + + #[error("failed to fetch auth rules: {0}")] + FetchAuthRules(#[from] FetchAuthRulesError), +} + +impl From for JwtError { + fn from(err: base64::DecodeError) -> Self { + JwtEncodingError::Base64Decode(err).into() + } +} + +impl From for JwtError { + fn from(err: serde_json::Error) -> Self { + JwtEncodingError::SerdeJson(err).into() + } +} + +#[derive(Error, Debug)] +#[non_exhaustive] +pub enum JwtEncodingError { + #[error(transparent)] + Base64Decode(#[from] base64::DecodeError), + + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + + #[error("invalid compact form")] + InvalidCompactForm, +} + +#[allow(dead_code, reason = "Debug use only")] +#[derive(Debug)] +pub(crate) enum KeyType { + Ec(jose_jwk::EcCurves), + Rsa, + Oct, + Okp(jose_jwk::OkpCurves), + Unknown, +} + +impl From<&jose_jwk::Key> for KeyType { + fn from(key: &jose_jwk::Key) -> Self { + match key { + jose_jwk::Key::Ec(ec) => Self::Ec(ec.crv), + jose_jwk::Key::Rsa(_rsa) => Self::Rsa, + jose_jwk::Key::Oct(_oct) => Self::Oct, + jose_jwk::Key::Okp(okp) => Self::Okp(okp.crv), + _ => Self::Unknown, + } + } +} + #[cfg(test)] mod tests { use crate::RoleName; @@ -758,7 +850,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { Ok(vec![ AuthRule { id: "foo".to_owned(), diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 12451847b1c6..1dea4d2d73c3 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,9 +1,9 @@ use std::net::SocketAddr; -use anyhow::Context; use arc_swap::ArcSwapOption; use crate::{ + auth::backend::jwt::FetchAuthRulesError, compute::ConnCfg, context::RequestMonitoring, control_plane::{ @@ -53,11 +53,11 @@ impl FetchAuthRules for StaticAuthRules { &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { let mappings = JWKS_ROLE_MAP.load(); let role_mappings = mappings .as_deref() - .context("JWKs settings for this role were not configured")?; + .ok_or(FetchAuthRulesError::RoleJwksNotConfigured)?; let mut rules = vec![]; for setting in &role_mappings.jwks { rules.push(AuthRule { diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 96e1a787ed1b..7cf158bcd90d 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -561,7 +561,8 @@ mod tests { &self, _ctx: &RequestMonitoring, _endpoint: crate::EndpointId, - ) -> anyhow::Result> { + ) -> Result, control_plane::errors::GetEndpointJwksError> + { unimplemented!() } diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index ea2eb79e2a8c..51cddec67248 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -5,7 +5,8 @@ use super::{ AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; use crate::{ - auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName, + auth::backend::jwt::AuthRule, context::RequestMonitoring, + control_plane::errors::GetEndpointJwksError, intern::RoleNameInt, RoleName, }; use 
crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; @@ -120,7 +121,10 @@ impl Api { }) } - async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result> { + async fn do_get_endpoint_jwks( + &self, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { let (client, connection) = tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; @@ -224,7 +228,7 @@ impl super::Api for Api { &self, _ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(endpoint).await } diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 6cc525a3244f..0a196fe2a35a 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -6,7 +6,7 @@ use super::messages::{ControlPlaneError, MetricsAuxInfo}; use crate::{ auth::{ backend::{ - jwt::{AuthRule, FetchAuthRules}, + jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}, ComputeCredentialKeys, ComputeUserInfo, }, IpPattern, @@ -44,7 +44,7 @@ pub(crate) mod errors { pub(crate) enum ApiError { /// Error returned by the console itself. #[error("{REQUEST_FAILED} with {0}")] - ControlPlane(ControlPlaneError), + ControlPlane(Box), /// Various IO errors like broken pipe or malformed payload. #[error("{REQUEST_FAILED}: {0}")] @@ -90,7 +90,7 @@ pub(crate) mod errors { Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, Reason::LockAlreadyTaken => ErrorKind::ControlPlane, Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::Unknown => match &e { + Reason::Unknown => match &**e { ControlPlaneError { http_status_code: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, @@ -246,6 +246,33 @@ pub(crate) mod errors { } } } + + #[derive(Debug, Error)] + pub enum GetEndpointJwksError { + #[error("endpoint not found")] + EndpointNotFound, + + #[error("failed to build control plane request: {0}")] + RequestBuild(#[source] reqwest::Error), + + #[error("failed to send control plane request: {0}")] + RequestExecute(#[source] reqwest_middleware::Error), + + #[error(transparent)] + ControlPlane(#[from] ApiError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TokioPostgres(#[from] tokio_postgres::Error), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + ParseUrl(#[from] url::ParseError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TaskJoin(#[from] tokio::task::JoinError), + } } /// Auth secret which is managed by the cloud. @@ -342,7 +369,7 @@ pub(crate) trait Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result>; + ) -> Result, errors::GetEndpointJwksError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( @@ -401,7 +428,7 @@ impl Api for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, errors::GetEndpointJwksError> { match self { Self::Management(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] @@ -583,7 +610,9 @@ impl FetchAuthRules for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { - self.get_endpoint_jwks(ctx, endpoint).await + ) -> Result, FetchAuthRulesError> { + self.get_endpoint_jwks(ctx, endpoint) + .await + .map_err(FetchAuthRulesError::GetEndpointJwks) } } diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index d01878741c1d..2487ce0e3f40 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -9,7 +9,10 @@ use super::{ use crate::{ auth::backend::{jwt::AuthRule, ComputeUserInfo}, compute, - control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}, + control_plane::{ + errors::GetEndpointJwksError, + messages::{ColdStartInfo, EndpointJwksResponse, Reason}, + }, http, metrics::{CacheOutcome, Metrics}, rate_limiter::WakeComputeRateLimiter, @@ -17,7 +20,6 @@ use crate::{ }; use crate::{cache::Cached, context::RequestMonitoring}; use ::http::{header::AUTHORIZATION, HeaderName}; -use anyhow::bail; use futures::TryFutureExt; use std::{sync::Arc, time::Duration}; use tokio::time::Instant; @@ -137,14 +139,14 @@ impl Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { if !self .caches .endpoints_cache .is_valid(ctx, &endpoint.normalize()) .await { - bail!("endpoint not found"); + return Err(GetEndpointJwksError::EndpointNotFound); } let request_id = ctx.session_id().to_string(); async { @@ -159,12 +161,17 @@ impl Api { .header(X_REQUEST_ID, &request_id) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) .query(&[("session_id", ctx.session_id())]) - .build()?; + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -330,7 +337,7 @@ impl super::Api for Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(ctx, endpoint).await } @@ -348,7 +355,7 @@ impl super::Api for Api { let (cached, info) = cached.take_value(); let info = info.map_err(|c| { info!(key = &*key, "found cached wake_compute error"); - WakeComputeError::ApiError(ApiError::ControlPlane(*c)) + WakeComputeError::ApiError(ApiError::ControlPlane(Box::new(*c))) })?; debug!(key = &*key, "found cached compute node info"); @@ -418,7 +425,7 @@ impl super::Api for Api { self.caches.node_info.insert_ttl( key, - Err(Box::new(err.clone())), + Err(err.clone()), Duration::from_secs(30), ); @@ -457,7 +464,7 @@ async fn parse_body serde::Deserialize<'a>>( body.http_status_code = status; warn!("console responded with an error ({status}): {body:?}"); - Err(ApiError::ControlPlane(body)) + Err(ApiError::ControlPlane(Box::new(body))) } fn parse_host_port(input: &str) -> Option<(&str, u16)> 
{ diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 58fb36dba754..deb4d4a63f61 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -492,30 +492,32 @@ impl TestBackend for TestConnectMechanism { match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: None, - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: None, + })); assert!(!err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } ConnectAction::WakeRetry => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: Some(Status { - code: "error".into(), - message: "error".into(), - details: Details { - error_info: None, - retry_info: Some(control_plane::messages::RetryInfo { - retry_delay_ms: 1, - }), - user_facing_message: None, - }, - }), - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: Some(Status { + code: "error".into(), + message: "error".into(), + details: Details { + error_info: None, + retry_info: Some(control_plane::messages::RetryInfo { + retry_delay_ms: 1, + }), + user_facing_message: None, + }, + }), + })); assert!(err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index ba674f5d0d96..0d1527a2c118 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -79,7 +79,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, - Reason::Unknown => match e { + Reason::Unknown => match **e { ControlPlaneError { http_status_code: StatusCode::LOCKED, ref error, From 614c3aef72ed595190801e8d77fe188e3cb13605 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 15 Oct 2024 17:18:52 +0300 Subject: [PATCH 13/38] Remove redundant code (#9373) ## Problem There is double update of resize cache in `put_rel_truncation` Also `page_server_request` contains check that fork is MAIN_FORKNUM which 1. is incorrect (because Vm/FSM pages are shreded in the same way as MAIN fork pages and 2. is redundant because `page_server_request` is never called for `get page` request so first part to OR condition is always true. ## Summary of changes Remove redundant code ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pageserver/src/pgdatadir_mapping.rs | 3 --- pgxn/neon/pagestore_smgr.c | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7aa313f03143..900da5beabe5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1545,9 +1545,6 @@ impl<'a> DatadirModification<'a> { // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update relation size cache - self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update logical database size. self.pending_nblocks -= old_size as i64 - nblocks as i64; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f46df7f70ac4..cbb0e2ae6d0b 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1092,8 +1092,7 @@ page_server_request(void const *req) * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || - ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) { shard_no = 0; } From cf7a596a151487c1b3afafbe1eb2efab895326ea Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 11:18:38 -0500 Subject: [PATCH 14/38] Generate sql_exporter config files with Jsonnet There are quite a few benefits to this approach: - Reduce config duplication - The two sql_exporter configs were super similar with just a few differences - Pull SQL queries into standalone files - That means we could run a SQL formatter on the file in the future - It also means access to syntax highlighting - In the future, run different queries for different PG versions - This is relevant because right now, we have queries that are failing on PG 17 due to catalog updates Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 19 + Dockerfile.build-tools | 1 + Makefile | 1 + compute/.gitignore | 5 + compute/Dockerfile.compute-node | 22 +- compute/Makefile | 35 ++ compute/etc/README.md | 17 + compute/etc/neon_collector.jsonnet | 43 +++ compute/etc/neon_collector.yml | 331 ------------------ .../etc/neon_collector_autoscaling.jsonnet | 11 + compute/etc/neon_collector_autoscaling.yml | 55 --- compute/etc/sql_exporter.jsonnet | 40 +++ compute/etc/sql_exporter.yml | 33 -- .../sql_exporter/checkpoints_req.libsonnet | 10 + compute/etc/sql_exporter/checkpoints_req.sql | 1 + .../sql_exporter/checkpoints_timed.libsonnet | 10 + .../etc/sql_exporter/checkpoints_timed.sql | 1 + .../compute_current_lsn.libsonnet | 10 + .../etc/sql_exporter/compute_current_lsn.sql | 4 + .../compute_logical_snapshot_files.libsonnet | 12 + .../compute_logical_snapshot_files.sql | 7 + .../compute_receive_lsn.libsonnet | 10 + .../etc/sql_exporter/compute_receive_lsn.sql | 4 + .../compute_subscriptions_count.libsonnet | 12 + .../compute_subscriptions_count.sql | 1 + .../sql_exporter/connection_counts.libsonnet | 13 + .../etc/sql_exporter/connection_counts.sql | 1 + .../etc/sql_exporter/db_total_size.libsonnet | 10 + compute/etc/sql_exporter/db_total_size.sql | 1 + .../getpage_prefetch_discards_total.libsonnet | 9 + .../getpage_prefetch_misses_total.libsonnet | 9 + .../getpage_prefetch_requests_total.libsonnet | 9 + 
.../getpage_sync_requests_total.libsonnet | 9 + .../getpage_wait_seconds_bucket.libsonnet | 12 + .../getpage_wait_seconds_bucket.sql | 1 + .../getpage_wait_seconds_count.libsonnet | 9 + .../getpage_wait_seconds_sum.libsonnet | 9 + ...lfc_approximate_working_set_size.libsonnet | 12 + .../lfc_approximate_working_set_size.sql | 1 + ...ing_set_size_windows.autoscaling.libsonnet | 12 + ...e_working_set_size_windows.autoscaling.sql | 8 + ...oximate_working_set_size_windows.libsonnet | 12 + ...c_approximate_working_set_size_windows.sql | 8 + .../lfc_cache_size_limit.libsonnet | 10 + .../etc/sql_exporter/lfc_cache_size_limit.sql | 1 + compute/etc/sql_exporter/lfc_hits.libsonnet | 10 + compute/etc/sql_exporter/lfc_hits.sql | 1 + compute/etc/sql_exporter/lfc_misses.libsonnet | 10 + compute/etc/sql_exporter/lfc_misses.sql | 1 + compute/etc/sql_exporter/lfc_used.libsonnet | 10 + compute/etc/sql_exporter/lfc_used.sql | 1 + compute/etc/sql_exporter/lfc_writes.libsonnet | 10 + compute/etc/sql_exporter/lfc_writes.sql | 1 + .../logical_slot_restart_lsn.libsonnet | 15 + .../sql_exporter/logical_slot_restart_lsn.sql | 3 + .../sql_exporter/max_cluster_size.libsonnet | 10 + compute/etc/sql_exporter/max_cluster_size.sql | 1 + .../etc/sql_exporter/neon_perf_counters.sql | 13 + .../pageserver_disconnects_total.libsonnet | 9 + .../pageserver_requests_sent_total.libsonnet | 9 + .../pageserver_send_flushes_total.libsonnet | 9 + .../sql_exporter/pg_stats_userdb.libsonnet | 18 + compute/etc/sql_exporter/pg_stats_userdb.sql | 10 + .../replication_delay_bytes.libsonnet | 10 + .../sql_exporter/replication_delay_bytes.sql | 6 + .../replication_delay_seconds.libsonnet | 10 + .../replication_delay_seconds.sql | 5 + .../etc/sql_exporter/retained_wal.libsonnet | 12 + compute/etc/sql_exporter/retained_wal.sql | 5 + .../etc/sql_exporter/wal_is_lost.libsonnet | 12 + compute/etc/sql_exporter/wal_is_lost.sql | 7 + compute/etc/sql_exporter_autoscaling.yml | 33 -- 72 files changed, 635 insertions(+), 457 deletions(-) create mode 100644 compute/.gitignore create mode 100644 compute/Makefile create mode 100644 compute/etc/README.md create mode 100644 compute/etc/neon_collector.jsonnet delete mode 100644 compute/etc/neon_collector.yml create mode 100644 compute/etc/neon_collector_autoscaling.jsonnet delete mode 100644 compute/etc/neon_collector_autoscaling.yml create mode 100644 compute/etc/sql_exporter.jsonnet delete mode 100644 compute/etc/sql_exporter.yml create mode 100644 compute/etc/sql_exporter/checkpoints_req.libsonnet create mode 100644 compute/etc/sql_exporter/checkpoints_req.sql create mode 100644 compute/etc/sql_exporter/checkpoints_timed.libsonnet create mode 100644 compute/etc/sql_exporter/checkpoints_timed.sql create mode 100644 compute/etc/sql_exporter/compute_current_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/compute_current_lsn.sql create mode 100644 compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet create mode 100644 compute/etc/sql_exporter/compute_logical_snapshot_files.sql create mode 100644 compute/etc/sql_exporter/compute_receive_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/compute_receive_lsn.sql create mode 100644 compute/etc/sql_exporter/compute_subscriptions_count.libsonnet create mode 100644 compute/etc/sql_exporter/compute_subscriptions_count.sql create mode 100644 compute/etc/sql_exporter/connection_counts.libsonnet create mode 100644 compute/etc/sql_exporter/connection_counts.sql create mode 100644 compute/etc/sql_exporter/db_total_size.libsonnet create mode 
100644 compute/etc/sql_exporter/db_total_size.sql create mode 100644 compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size.sql create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql create mode 100644 compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_cache_size_limit.sql create mode 100644 compute/etc/sql_exporter/lfc_hits.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_hits.sql create mode 100644 compute/etc/sql_exporter/lfc_misses.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_misses.sql create mode 100644 compute/etc/sql_exporter/lfc_used.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_used.sql create mode 100644 compute/etc/sql_exporter/lfc_writes.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_writes.sql create mode 100644 compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/logical_slot_restart_lsn.sql create mode 100644 compute/etc/sql_exporter/max_cluster_size.libsonnet create mode 100644 compute/etc/sql_exporter/max_cluster_size.sql create mode 100644 compute/etc/sql_exporter/neon_perf_counters.sql create mode 100644 compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet create mode 100644 compute/etc/sql_exporter/pg_stats_userdb.libsonnet create mode 100644 compute/etc/sql_exporter/pg_stats_userdb.sql create mode 100644 compute/etc/sql_exporter/replication_delay_bytes.libsonnet create mode 100644 compute/etc/sql_exporter/replication_delay_bytes.sql create mode 100644 compute/etc/sql_exporter/replication_delay_seconds.libsonnet create mode 100644 compute/etc/sql_exporter/replication_delay_seconds.sql create mode 100644 compute/etc/sql_exporter/retained_wal.libsonnet create mode 100644 compute/etc/sql_exporter/retained_wal.sql create mode 100644 compute/etc/sql_exporter/wal_is_lost.libsonnet create mode 100644 compute/etc/sql_exporter/wal_is_lost.sql delete mode 100644 compute/etc/sql_exporter_autoscaling.yml diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 51f6975e63c4..c9a447626f19 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -120,6 +120,25 @@ jobs: - name: Run mypy to check types run: poetry run mypy . 
+ check-codestyle-jsonnet: + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Check Jsonnet code formatting + run: | + jsonnetfmt --test \ + $(find . -type f -name '*.jsonnet' -o -name '*.libsonnet') + # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. check-submodules: diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 54e913425772..7cba1c863599 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -27,6 +27,7 @@ RUN set -e \ gnupg \ gzip \ jq \ + jsonnet \ libcurl4-openssl-dev \ libbz2-dev \ libffi-dev \ diff --git a/Makefile b/Makefile index 5e227ed3f5ca..33cfda2661df 100644 --- a/Makefile +++ b/Makefile @@ -291,6 +291,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'. .PHONY: clean clean: postgres-clean neon-pg-clean-ext + $(MAKE) -C compute clean $(CARGO_CMD_PREFIX) cargo clean # This removes everything diff --git a/compute/.gitignore b/compute/.gitignore new file mode 100644 index 000000000000..70980d335afb --- /dev/null +++ b/compute/.gitignore @@ -0,0 +1,5 @@ +# sql_exporter config files generated from Jsonnet +etc/neon_collector.yml +etc/neon_collector_autoscaling.yml +etc/sql_exporter.yml +etc/sql_exporter_autoscaling.yml diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 412c64eda497..13381b29013d 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -349,7 +349,7 @@ ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific -# doesn't use releases, last commit f3d82fd - Mar 2, 2023 +# doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ @@ -1169,6 +1169,18 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. 
RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### +# +# Preprocess the sql_exporter configuration files +# +######################################################################################### +FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor + +USER nonroot + +COPY --chown=nonroot compute compute + +RUN make -C compute ######################################################################################### # @@ -1287,10 +1299,10 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter -COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Create remote extension download directory RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions diff --git a/compute/Makefile b/compute/Makefile new file mode 100644 index 000000000000..45fbfa6d5e58 --- /dev/null +++ b/compute/Makefile @@ -0,0 +1,35 @@ +jsonnet_files = $(wildcard etc/*.jsonnet etc/*.libsonnet) + +.PHONY: all +all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml + +neon_collector.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + etc/neon_collector.jsonnet + +neon_collector_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + etc/neon_collector_autoscaling.jsonnet + +sql_exporter.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector.yml \ + etc/sql_exporter.jsonnet + +sql_exporter_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector_autoscaling.yml \ + --tla-str application_name=sql_exporter_autoscaling \ + etc/sql_exporter.jsonnet + +.PHONY: clean +clean: + rm --force \ + etc/neon_collector.yml \ + etc/neon_collector_autoscaling.yml \ + etc/sql_exporter.yml \ + etc/sql_exporter_autoscaling.yml diff --git a/compute/etc/README.md b/compute/etc/README.md new file mode 100644 index 000000000000..70b108146cf8 --- /dev/null +++ b/compute/etc/README.md @@ -0,0 +1,17 @@ +# Compute Configuration + +These files are the configuration files for various other pieces of software +that will be running in the compute alongside Postgres. + +## `sql_exporter` + +### Adding a `sql_exporter` Metric + +We use `sql_exporter` to export various metrics from Postgres. In order to add +a metric, you will need to create two files: a `libsonnet` and a `sql` file. 
You +will then import the `libsonnet` file in one of the collector files, and the +`sql` file will be imported in the `libsonnet` file. + +In the event your statistic is an LSN, you may want to cast it to a `float8` +because Prometheus only supports floats. It's probably fine because `float8` can +store integers from `-2^53` to `+2^53` exactly. diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet new file mode 100644 index 000000000000..2031eb8c8568 --- /dev/null +++ b/compute/etc/neon_collector.jsonnet @@ -0,0 +1,43 @@ +{ + collector_name: 'neon_collector', + metrics: [ + import 'sql_exporter/checkpoints_req.libsonnet', + import 'sql_exporter/checkpoints_timed.libsonnet', + import 'sql_exporter/compute_current_lsn.libsonnet', + import 'sql_exporter/compute_logical_snapshot_files.libsonnet', + import 'sql_exporter/compute_receive_lsn.libsonnet', + import 'sql_exporter/compute_subscriptions_count.libsonnet', + import 'sql_exporter/connection_counts.libsonnet', + import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', + import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', + import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', + import 'sql_exporter/getpage_wait_seconds_count.libsonnet', + import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + import 'sql_exporter/logical_slot_restart_lsn.libsonnet', + import 'sql_exporter/max_cluster_size.libsonnet', + import 'sql_exporter/pageserver_disconnects_total.libsonnet', + import 'sql_exporter/pageserver_requests_sent_total.libsonnet', + import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pg_stats_userdb.libsonnet', + import 'sql_exporter/replication_delay_bytes.libsonnet', + import 'sql_exporter/replication_delay_seconds.libsonnet', + import 'sql_exporter/retained_wal.libsonnet', + import 'sql_exporter/wal_is_lost.libsonnet', + ], + queries: [ + { + query_name: 'neon_perf_counters', + query: importstr 'sql_exporter/neon_perf_counters.sql', + }, + ], +} diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml deleted file mode 100644 index 92da0cdbdd72..000000000000 --- a/compute/etc/neon_collector.yml +++ /dev/null @@ -1,331 +0,0 @@ -collector_name: neon_collector -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: 
| - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: connection_counts - type: gauge - help: 'Connection counts' - key_labels: - - datname - - state - values: [count] - query: | - select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; - -- metric_name: pg_stats_userdb - type: gauge - help: 'Stats for several oldest non-system dbs' - key_labels: - - datname - value_label: kind - values: - - db_size - - deadlocks - # Rows - - inserted - - updated - - deleted - # We export stats for 10 non-system database. Without this limit - # it is too easy to abuse the system by creating lots of databases. - query: | - select pg_database_size(datname) as db_size, deadlocks, - tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, - datname - from pg_stat_database - where datname IN ( - select datname - from pg_database - where datname <> 'postgres' and not datistemplate - order by oid - limit 10 - ); - -- metric_name: max_cluster_size - type: gauge - help: 'neon.max_cluster_size setting' - key_labels: - values: [max_cluster_size] - query: | - select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; - -- metric_name: db_total_size - type: gauge - help: 'Size of all databases' - key_labels: - values: [total] - query: | - select sum(pg_database_size(datname)) as total from pg_database; - -- metric_name: getpage_wait_seconds_count - type: counter - help: 'Number of getpage requests' - values: [getpage_wait_seconds_count] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_sum - type: counter - help: 'Time spent in getpage requests' - values: [getpage_wait_seconds_sum] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_requests_total - type: counter - help: 'Number of getpage issued for prefetching' - values: [getpage_prefetch_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_sync_requests_total - type: counter - help: 'Number of synchronous getpage issued' - values: [getpage_sync_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_misses_total - type: counter - help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read' - values: [getpage_prefetch_misses_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_discards_total - type: counter - help: 'Number of prefetch responses issued but not used' - values: [getpage_prefetch_discards_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_requests_sent_total - type: counter - help: 'Number of all requests sent to the pageserver (not just GetPage requests)' - values: [pageserver_requests_sent_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_disconnects_total - type: counter - help: 'Number of times that the connection to the pageserver was lost' - values: [pageserver_disconnects_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_send_flushes_total - type: counter - help: 'Number of flushes to the pageserver connection' - values: 
[pageserver_send_flushes_total] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_bucket - type: counter - help: 'Histogram buckets of getpage request latency' - key_labels: - - bucket_le - values: [value] - query_ref: getpage_wait_seconds_buckets - -# DEPRECATED -- metric_name: lfc_approximate_working_set_size - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] - query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; - -- metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration] - values: [size] - # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection - # of durations in a pretty-printed form. - query: | - select - x as duration, - neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size - from - (values ('5m'),('15m'),('1h')) as t (x); - -- metric_name: compute_current_lsn - type: gauge - help: 'Current LSN of the database' - key_labels: - values: [lsn] - query: | - select - case - when pg_catalog.pg_is_in_recovery() - then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - else (pg_current_wal_lsn() - '0/0')::FLOAT8 - end as lsn; - -- metric_name: compute_receive_lsn - type: gauge - help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' - key_labels: - values: [lsn] - query: | - SELECT - CASE - WHEN pg_catalog.pg_is_in_recovery() - THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 - ELSE 0 - END AS lsn; - -- metric_name: replication_delay_bytes - type: gauge - help: 'Bytes between received and replayed LSN' - key_labels: - values: [replication_delay_bytes] - # We use a GREATEST call here because this calculation can be negative. - # The calculation is not atomic, meaning after we've gotten the receive - # LSN, the replay LSN may have advanced past the receive LSN we - # are using for the calculation. - query: | - SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - -- metric_name: replication_delay_seconds - type: gauge - help: 'Time since last LSN was replayed' - key_labels: - values: [replication_delay_seconds] - query: | - SELECT - CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) - END AS replication_delay_seconds; - -- metric_name: checkpoints_req - type: gauge - help: 'Number of requested checkpoints' - key_labels: - values: [checkpoints_req] - query: | - SELECT checkpoints_req FROM pg_stat_bgwriter; - -- metric_name: checkpoints_timed - type: gauge - help: 'Number of scheduled checkpoints' - key_labels: - values: [checkpoints_timed] - query: | - SELECT checkpoints_timed FROM pg_stat_bgwriter; - -- metric_name: compute_logical_snapshot_files - type: gauge - help: 'Number of snapshot files in pg_logical/snapshot' - key_labels: - - timeline_id - values: [num_logical_snapshot_files] - query: | - SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, - -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These - -- temporary snapshot files are renamed to the actual snapshot files after they are - -- completely built. We only WAL-log the completely built snapshot files. 
- (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; - -# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. -# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. - -# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. -- metric_name: logical_slot_restart_lsn - type: gauge - help: 'restart_lsn of logical slots' - key_labels: - - slot_name - values: [restart_lsn] - query: | - select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn - from pg_replication_slots - where slot_type = 'logical'; - -- metric_name: compute_subscriptions_count - type: gauge - help: 'Number of logical replication subscriptions grouped by enabled/disabled' - key_labels: - - enabled - values: [subscriptions_count] - query: | - select subenabled::text as enabled, count(*) as subscriptions_count - from pg_subscription - group by subenabled; - -- metric_name: retained_wal - type: gauge - help: 'Retained WAL in inactive replication slots' - key_labels: - - slot_name - values: [retained_wal] - query: | - SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal - FROM pg_replication_slots - WHERE active = false; - -- metric_name: wal_is_lost - type: gauge - help: 'Whether or not the replication slot wal_status is lost' - key_labels: - - slot_name - values: [wal_is_lost] - query: | - SELECT slot_name, - CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost - FROM pg_replication_slots; - -queries: - - query_name: neon_perf_counters - query: | - WITH c AS ( - SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters - ) - SELECT d.* - FROM pg_catalog.jsonb_to_record((select jb from c)) as d( - getpage_wait_seconds_count numeric, - getpage_wait_seconds_sum numeric, - getpage_prefetch_requests_total numeric, - getpage_sync_requests_total numeric, - getpage_prefetch_misses_total numeric, - getpage_prefetch_discards_total numeric, - pageserver_requests_sent_total numeric, - pageserver_disconnects_total numeric, - pageserver_send_flushes_total numeric - ); - - - query_name: getpage_wait_seconds_buckets - query: | - SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/etc/neon_collector_autoscaling.jsonnet b/compute/etc/neon_collector_autoscaling.jsonnet new file mode 100644 index 000000000000..e248172a3d8b --- /dev/null +++ b/compute/etc/neon_collector_autoscaling.jsonnet @@ -0,0 +1,11 @@ +{ + collector_name: 'neon_collector_autoscaling', + metrics: [ + import 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + ], +} diff --git a/compute/etc/neon_collector_autoscaling.yml b/compute/etc/neon_collector_autoscaling.yml deleted file mode 100644 index 5616264eba1b..000000000000 --- a/compute/etc/neon_collector_autoscaling.yml +++ /dev/null @@ -1,55 +0,0 @@ -collector_name: neon_collector_autoscaling -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: 
gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration_seconds] - values: [size] - # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set - # size looking back 1..60 minutes, labeled with the number of minutes. - query: | - select - x::text as duration_seconds, - neon.approximate_working_set_size_seconds(x) as size - from - (select generate_series * 60 as x from generate_series(1, 60)) as t (x); diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet new file mode 100644 index 000000000000..1e3665ac4727 --- /dev/null +++ b/compute/etc/sql_exporter.jsonnet @@ -0,0 +1,40 @@ +function(collector_file, application_name='sql_exporter') { + // Configuration for sql_exporter for autoscaling-agent + // Global defaults. + global: { + // If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: '10s', + // Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: '500ms', + // Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: '0s', + // Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + // as will concurrent scrapes. + max_connections: 1, + // Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + // always be the same as max_connections. + max_idle_connections: 1, + // Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + // If 0, connections are not closed due to a connection's age. + max_connection_lifetime: '5m', + }, + + // The target to monitor and the collectors to execute on it. + target: { + // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + // the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]), + + // Collectors (referenced by name) to execute on the target. + // Glob patterns are supported (see for syntax). + collectors: [ + 'neon_collector_autoscaling', + ], + }, + + // Collector files specifies a list of globs. One collector definition is read from each matching file. + // Glob patterns are supported (see for syntax). 
+ collector_files: [ + collector_file, + ], +} diff --git a/compute/etc/sql_exporter.yml b/compute/etc/sql_exporter.yml deleted file mode 100644 index 139d04468ab6..000000000000 --- a/compute/etc/sql_exporter.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). 
-collector_files: - - "neon_collector.yml" diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet new file mode 100644 index 000000000000..8697f8af3b99 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'checkpoints_req', + type: 'gauge', + help: 'Number of requested checkpoints', + key_labels: null, + values: [ + 'checkpoints_req', + ], + query: importstr 'sql_exporter/checkpoints_req.sql', +} diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql new file mode 100644 index 000000000000..eb8427c8832f --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -0,0 +1 @@ +SELECT checkpoints_req FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet new file mode 100644 index 000000000000..9f0b742400f9 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'checkpoints_timed', + type: 'gauge', + help: 'Number of scheduled checkpoints', + key_labels: null, + values: [ + 'checkpoints_timed', + ], + query: importstr 'sql_exporter/checkpoints_timed.sql', +} diff --git a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql new file mode 100644 index 000000000000..c50853134cdb --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -0,0 +1 @@ +SELECT checkpoints_timed FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_current_lsn.libsonnet b/compute/etc/sql_exporter/compute_current_lsn.libsonnet new file mode 100644 index 000000000000..ccff1613580a --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_current_lsn', + type: 'gauge', + help: 'Current LSN of the database', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_current_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql new file mode 100644 index 000000000000..be02b8a09444 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet new file mode 100644 index 000000000000..212f079ccf3d --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_logical_snapshot_files', + type: 'gauge', + help: 'Number of snapshot files in pg_logical/snapshot', + key_labels: [ + 'timeline_id', + ], + values: [ + 'num_logical_snapshot_files', + ], + query: importstr 'sql_exporter/compute_logical_snapshot_files.sql', +} diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql new file mode 100644 index 000000000000..f2454235b70c --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -0,0 +1,7 @@ +SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. 
+ -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. We only WAL-log the completely built + -- snapshot files + (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.libsonnet b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet new file mode 100644 index 000000000000..eb68a77ec2c8 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_receive_lsn', + type: 'gauge', + help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_receive_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql new file mode 100644 index 000000000000..318b31ab41c7 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + ELSE 0 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet new file mode 100644 index 000000000000..e1575da397d1 --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_subscriptions_count', + type: 'gauge', + help: 'Number of logical replication subscriptions grouped by enabled/disabled', + key_labels: [ + 'enabled', + ], + values: [ + 'subscriptions_count', + ], + query: importstr 'sql_exporter/compute_subscriptions_count.sql', +} diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql b/compute/etc/sql_exporter/compute_subscriptions_count.sql new file mode 100644 index 000000000000..50740cb5dfc5 --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -0,0 +1 @@ +SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.libsonnet b/compute/etc/sql_exporter/connection_counts.libsonnet new file mode 100644 index 000000000000..9f94db67a90d --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'connection_counts', + type: 'gauge', + help: 'Connection counts', + key_labels: [ + 'datname', + 'state', + ], + values: [ + 'count', + ], + query: importstr 'sql_exporter/connection_counts.sql', +} diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql new file mode 100644 index 000000000000..6824480fdbcb --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -0,0 +1 @@ +SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git a/compute/etc/sql_exporter/db_total_size.libsonnet b/compute/etc/sql_exporter/db_total_size.libsonnet new file mode 100644 index 000000000000..6e08d5fb87b9 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'db_total_size', + type: 'gauge', + help: 'Size of all databases', + key_labels: null, + values: [ + 'total', + ], + query: importstr 'sql_exporter/db_total_size.sql', +} diff --git a/compute/etc/sql_exporter/db_total_size.sql 
b/compute/etc/sql_exporter/db_total_size.sql new file mode 100644 index 000000000000..9cbbdfd8a33a --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -0,0 +1 @@ +SELECT sum(pg_database_size(datname)) AS total FROM pg_database; diff --git a/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet new file mode 100644 index 000000000000..935e35d2e49d --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_discards_total', + type: 'counter', + help: 'Number of prefetch responses issued but not used', + values: [ + 'getpage_prefetch_discards_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet new file mode 100644 index 000000000000..b9a96321055a --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_misses_total', + type: 'counter', + help: "Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read", + values: [ + 'getpage_prefetch_misses_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet new file mode 100644 index 000000000000..75fdb6717b44 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_requests_total', + type: 'counter', + help: 'Number of getpage issued for prefetching', + values: [ + 'getpage_prefetch_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet new file mode 100644 index 000000000000..f3a1e6b33914 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_sync_requests_total', + type: 'counter', + help: 'Number of synchronous getpage issued', + values: [ + 'getpage_sync_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet new file mode 100644 index 000000000000..2adda2ad0348 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'getpage_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of getpage request latency', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/getpage_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql new file mode 100644 index 000000000000..b4a6bc15604a --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet new file mode 100644 
index 000000000000..d2326974fcb8 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_count', + type: 'counter', + help: 'Number of getpage requests', + values: [ + 'getpage_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet new file mode 100644 index 000000000000..844c8419ff88 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_sum', + type: 'counter', + help: 'Time spent in getpage requests', + values: [ + 'getpage_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet new file mode 100644 index 000000000000..78859ce60df3 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet @@ -0,0 +1,12 @@ +// DEPRECATED + +{ + metric_name: 'lfc_approximate_working_set_size', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: null, + values: [ + 'approximate_working_set_size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql new file mode 100644 index 000000000000..de509ebb4712 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql @@ -0,0 +1 @@ +SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet new file mode 100644 index 000000000000..a54deca4677a --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration_seconds', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql new file mode 100644 index 000000000000..35fa42c34cac --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "internal" / "machine-readable" version. This outputs the +-- working set size looking back 1..60 minutes, labeled with the number of +-- minutes. 
+ +SELECT + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) AS size +FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet new file mode 100644 index 000000000000..4970bd2c7fd1 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql new file mode 100644 index 000000000000..46c7d1610c7f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "public" / "human-readable" version. Here, we supply a +-- small selection of durations in a pretty-printed form. + +SELECT + x AS duration, + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + VALUES ('5m'), ('15m'), ('1h') + ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet new file mode 100644 index 000000000000..4cbbd766210a --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_cache_size_limit', + type: 'gauge', + help: 'LFC cache size limit in bytes', + key_labels: null, + values: [ + 'lfc_cache_size_limit', + ], + query: importstr 'sql_exporter/lfc_cache_size_limit.sql', +} diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql new file mode 100644 index 000000000000..378904c1fe46 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -0,0 +1 @@ +SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/lfc_hits.libsonnet b/compute/etc/sql_exporter/lfc_hits.libsonnet new file mode 100644 index 000000000000..4a0b7671bf3d --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_hits', + type: 'gauge', + help: 'lfc_hits', + key_labels: null, + values: [ + 'lfc_hits', + ], + query: importstr 'sql_exporter/lfc_hits.sql', +} diff --git a/compute/etc/sql_exporter/lfc_hits.sql b/compute/etc/sql_exporter/lfc_hits.sql new file mode 100644 index 000000000000..2e14f5c73c69 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits'; diff --git a/compute/etc/sql_exporter/lfc_misses.libsonnet b/compute/etc/sql_exporter/lfc_misses.libsonnet new file mode 100644 index 000000000000..302998d04f1c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_misses', + type: 'gauge', + help: 'lfc_misses', + key_labels: null, + values: [ + 'lfc_misses', + ], + query: importstr 'sql_exporter/lfc_misses.sql', +} diff --git a/compute/etc/sql_exporter/lfc_misses.sql b/compute/etc/sql_exporter/lfc_misses.sql new file mode 100644 index 000000000000..27ed4ecf86be --- 
/dev/null +++ b/compute/etc/sql_exporter/lfc_misses.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_misses FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_misses'; diff --git a/compute/etc/sql_exporter/lfc_used.libsonnet b/compute/etc/sql_exporter/lfc_used.libsonnet new file mode 100644 index 000000000000..23891dadaf24 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_used', + type: 'gauge', + help: 'LFC chunks used (chunk = 1MB)', + key_labels: null, + values: [ + 'lfc_used', + ], + query: importstr 'sql_exporter/lfc_used.sql', +} diff --git a/compute/etc/sql_exporter/lfc_used.sql b/compute/etc/sql_exporter/lfc_used.sql new file mode 100644 index 000000000000..4f01545f307d --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_used FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used'; diff --git a/compute/etc/sql_exporter/lfc_writes.libsonnet b/compute/etc/sql_exporter/lfc_writes.libsonnet new file mode 100644 index 000000000000..6a22ee1dd932 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_writes', + type: 'gauge', + help: 'lfc_writes', + key_labels: null, + values: [ + 'lfc_writes', + ], + query: importstr 'sql_exporter/lfc_writes.sql', +} diff --git a/compute/etc/sql_exporter/lfc_writes.sql b/compute/etc/sql_exporter/lfc_writes.sql new file mode 100644 index 000000000000..37c9abc9cfc7 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_writes FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_writes'; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet new file mode 100644 index 000000000000..8ef31b5d8d14 --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet @@ -0,0 +1,15 @@ +// Number of slots is limited by max_replication_slots, so collecting position +// for all of them shouldn't be bad. 
+ +{ + metric_name: 'logical_slot_restart_lsn', + type: 'gauge', + help: 'restart_lsn of logical slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'restart_lsn', + ], + query: importstr 'sql_exporter/logical_slot_restart_lsn.sql', +} diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql new file mode 100644 index 000000000000..1b1c038501e5 --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -0,0 +1,3 @@ +SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn +FROM pg_replication_slots +WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.libsonnet b/compute/etc/sql_exporter/max_cluster_size.libsonnet new file mode 100644 index 000000000000..1352fb77ee7a --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'max_cluster_size', + type: 'gauge', + help: 'neon.max_cluster_size setting', + key_labels: null, + values: [ + 'max_cluster_size', + ], + query: importstr 'sql_exporter/max_cluster_size.sql', +} diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql new file mode 100644 index 000000000000..2d2355a9a77a --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -0,0 +1 @@ +SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql new file mode 100644 index 000000000000..58998907a098 --- /dev/null +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -0,0 +1,13 @@ +WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) + +SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + getpage_wait_seconds_count numeric, + getpage_wait_seconds_sum numeric, + getpage_prefetch_requests_total numeric, + getpage_sync_requests_total numeric, + getpage_prefetch_misses_total numeric, + getpage_prefetch_discards_total numeric, + pageserver_requests_sent_total numeric, + pageserver_disconnects_total numeric, + pageserver_send_flushes_total numeric +); diff --git a/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet new file mode 100644 index 000000000000..5ad9ba078e13 --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_disconnects_total', + type: 'counter', + help: 'Number of times that the connection to the pageserver was lost', + values: [ + 'pageserver_disconnects_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet new file mode 100644 index 000000000000..c191e2467fa1 --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_requests_sent_total', + type: 'counter', + help: 'Number of all requests sent to the pageserver (not just GetPage requests)', + values: [ + 'pageserver_requests_sent_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet new file mode 100644 index 000000000000..9fa5f77758af --- /dev/null +++ 
b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_send_flushes_total', + type: 'counter', + help: 'Number of flushes to the pageserver connection', + values: [ + 'pageserver_send_flushes_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.libsonnet b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet new file mode 100644 index 000000000000..46ea2f41926a --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet @@ -0,0 +1,18 @@ +{ + metric_name: 'pg_stats_userdb', + type: 'gauge', + help: 'Stats for several oldest non-system dbs', + key_labels: [ + 'datname', + ], + value_label: 'kind', + values: [ + 'db_size', + 'deadlocks', + // Rows + 'inserted', + 'updated', + 'deleted', + ], + query: importstr 'sql_exporter/pg_stats_userdb.sql', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql new file mode 100644 index 000000000000..00ada87370d1 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -0,0 +1,10 @@ +-- We export stats for 10 non-system databases. Without this limit it is too +-- easy to abuse the system by creating lots of databases. + +SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, + tup_updated AS updated, tup_deleted AS deleted, datname +FROM pg_stat_database +WHERE datname IN ( + SELECT datname FROM pg_database + WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 +); diff --git a/compute/etc/sql_exporter/replication_delay_bytes.libsonnet b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet new file mode 100644 index 000000000000..3e5bb6af1fdf --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_bytes', + type: 'gauge', + help: 'Bytes between received and replayed LSN', + key_labels: null, + values: [ + 'replication_delay_bytes', + ], + query: importstr 'sql_exporter/replication_delay_bytes.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql new file mode 100644 index 000000000000..60a6981acd2d --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -0,0 +1,6 @@ +-- We use a GREATEST call here because this calculation can be negative. The +-- calculation is not atomic, meaning after we've gotten the receive LSN, the +-- replay LSN may have advanced past the receive LSN we are using for the +-- calculation. 
+ +SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.libsonnet b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet new file mode 100644 index 000000000000..d3f2c21b544e --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_seconds', + type: 'gauge', + help: 'Time since last LSN was replayed', + key_labels: null, + values: [ + 'replication_delay_seconds', + ], + query: importstr 'sql_exporter/replication_delay_seconds.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql new file mode 100644 index 000000000000..a76809ad747d --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -0,0 +1,5 @@ +SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.libsonnet b/compute/etc/sql_exporter/retained_wal.libsonnet new file mode 100644 index 000000000000..f9eff5faa576 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'retained_wal', + type: 'gauge', + help: 'Retained WAL in inactive replication slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'retained_wal', + ], + query: importstr 'sql_exporter/retained_wal.sql', +} diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql new file mode 100644 index 000000000000..6c5835946164 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -0,0 +1,5 @@ +SELECT + slot_name, + pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal +FROM pg_replication_slots +WHERE active = false; diff --git a/compute/etc/sql_exporter/wal_is_lost.libsonnet b/compute/etc/sql_exporter/wal_is_lost.libsonnet new file mode 100644 index 000000000000..3cd25f4b399c --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'wal_is_lost', + type: 'gauge', + help: 'Whether or not the replication slot wal_status is lost', + key_labels: [ + 'slot_name', + ], + values: [ + 'wal_is_lost', + ], + query: importstr 'sql_exporter/wal_is_lost.sql', +} diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql new file mode 100644 index 000000000000..55212708517b --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -0,0 +1,7 @@ +SELECT + slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_is_lost +FROM pg_replication_slots; diff --git a/compute/etc/sql_exporter_autoscaling.yml b/compute/etc/sql_exporter_autoscaling.yml deleted file mode 100644 index 044557233ee3..000000000000 --- a/compute/etc/sql_exporter_autoscaling.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter for autoscaling-agent -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. 
- min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector_autoscaling] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector_autoscaling.yml" From f1eb7032569c35ec47806c5e736486508d559439 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:35:21 -0400 Subject: [PATCH 15/38] fix(pageserver): use a buffer for basebackup; add aux basebackup metrics log (#9401) Our replication bench project is stuck because it is too slow to generate basebackup and it caused compute to disconnect. https://neondb.slack.com/archives/C03438W3FLZ/p1728330685012419 The compute timeout for waiting for basebackup is 10m (is it true?). Generating basebackup directly on pageserver takes ~3min. Therefore, I suspect it's because there are too many wasted round-trip time for writing the 10000+ snapshot aux files. Also, it is possible that the basebackup process takes too long time retrieving all aux files that it did not write anything over the wire protocol, causing a read timeout. Basebackup size is 800KB gzipped for that project and was 55MB tar before compression. ## Summary of changes * Potentially fix the issue by placing a write buffer for basebackup. * Log how many aux files did we read + the time spent on it. Signed-off-by: Alex Chi Z --- pageserver/src/basebackup.rs | 21 +++++++++++++++++---- pageserver/src/page_service.rs | 10 +++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index a32d09f3b3bb..975318419f65 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -16,7 +16,7 @@ use fail::fail_point; use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; -use std::time::SystemTime; +use std::time::{Instant, SystemTime}; use tokio::io; use tokio::io::AsyncWrite; use tracing::*; @@ -352,12 +352,25 @@ where } } - for (path, content) in self + let start_time = Instant::now(); + let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx) .await - .map_err(|e| BasebackupError::Server(e.into()))? 
- { + .map_err(|e| BasebackupError::Server(e.into()))?; + let aux_scan_time = start_time.elapsed(); + let aux_estimated_size = aux_files + .values() + .map(|content| content.len()) + .sum::(); + info!( + "Scanned {} aux files in {}ms, aux file content size = {}", + aux_files.len(), + aux_scan_time.as_millis(), + aux_estimated_size + ); + + for (path, content) in aux_files { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8fa6b9a7f0d4..afb2f92ff80d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -26,8 +26,8 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; -use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -1137,10 +1137,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } else { - let mut writer = pgb.copyout_writer(); + let mut writer = BufWriter::new(pgb.copyout_writer()); if gzip { let mut encoder = GzipEncoder::with_quality( - writer, + &mut writer, // NOTE using fast compression because it's on the critical path // for compute startup. For an empty database, we get // <100KB with this method. The Level::Best compression method @@ -1175,6 +1175,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } + writer + .flush() + .await + .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; } pgb.write_message_noflush(&BeMessage::CopyDone) From 18f4e5f10cd1eeaa5a5949f9a6130983691311d6 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 15 Oct 2024 23:13:31 +0200 Subject: [PATCH 16/38] Add newly added metrics from neondatabase/neon#9116 to exports (#9402) They weren't added in that PR, but should be available immediately on rollout as the neon extension already defaults to 1.5. 
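For reviewers who want to sanity-check the data source, the counters added below are read from `neon.neon_perf_counters` using the same `metric`/`bucket_le`/`value` columns the collector SQL relies on. A minimal spot check on a compute (assuming the `neon` extension is at 1.5 or newer) could look like:

```sql
-- List the newly exported counters, including the histogram buckets.
SELECT metric, bucket_le, value
  FROM neon.neon_perf_counters
 WHERE metric LIKE 'file_cache_%'
    OR metric IN ('getpage_prefetches_buffered', 'pageserver_open_requests')
 ORDER BY metric, bucket_le;
```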
--- compute/etc/neon_collector.jsonnet | 8 ++++++++ .../file_cache_read_wait_seconds_bucket.libsonnet | 12 ++++++++++++ .../file_cache_read_wait_seconds_bucket.sql | 1 + .../file_cache_read_wait_seconds_count.libsonnet | 9 +++++++++ .../file_cache_read_wait_seconds_sum.libsonnet | 9 +++++++++ .../file_cache_write_wait_seconds_bucket.libsonnet | 12 ++++++++++++ .../file_cache_write_wait_seconds_bucket.sql | 1 + .../file_cache_write_wait_seconds_count.libsonnet | 9 +++++++++ .../file_cache_write_wait_seconds_sum.libsonnet | 9 +++++++++ .../getpage_prefetches_buffered.libsonnet | 9 +++++++++ compute/etc/sql_exporter/neon_perf_counters.sql | 8 +++++++- .../sql_exporter/pageserver_open_requests.libsonnet | 9 +++++++++ 12 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_open_requests.libsonnet diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 2031eb8c8568..8b43ebe7a388 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -9,9 +9,16 @@ import 'sql_exporter/compute_subscriptions_count.libsonnet', import 'sql_exporter/connection_counts.libsonnet', import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_sum.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_sum.libsonnet', import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_prefetches_buffered.libsonnet', import 'sql_exporter/getpage_sync_requests_total.libsonnet', import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', import 'sql_exporter/getpage_wait_seconds_count.libsonnet', @@ -28,6 +35,7 @@ import 'sql_exporter/pageserver_disconnects_total.libsonnet', import 'sql_exporter/pageserver_requests_sent_total.libsonnet', import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pageserver_open_requests.libsonnet', import 'sql_exporter/pg_stats_userdb.libsonnet', import 'sql_exporter/replication_delay_bytes.libsonnet', import 'sql_exporter/replication_delay_seconds.libsonnet', diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet new file mode 100644 index 000000000000..d13f657a7f71 --- /dev/null +++ 
b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'file_cache_read_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC read operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_read_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql new file mode 100644 index 000000000000..09047bf0c409 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_read_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet new file mode 100644 index 000000000000..aa028b0f5ea9 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_count', + type: 'counter', + help: 'Number of read operations in LFC', + values: [ + 'file_cache_read_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet new file mode 100644 index 000000000000..2547aabf3d64 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC read operations', + values: [ + 'file_cache_read_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet new file mode 100644 index 000000000000..13dbc77f7662 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'file_cache_write_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC write operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_write_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql new file mode 100644 index 000000000000..d03613cf913c --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_write_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet new file mode 100644 index 000000000000..6227d3193aea --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_count', + type: 'counter', + help: 'Number of write operations in LFC', + values: [ + 'file_cache_write_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet new file mode 100644 index 
000000000000..2acfe7f608e5 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC write operations', + values: [ + 'file_cache_write_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet new file mode 100644 index 000000000000..8926d867c983 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetches_buffered', + type: 'gauge', + help: 'Number of prefetched pages buffered in neon', + values: [ + 'getpage_prefetches_buffered', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql index 58998907a098..4a36f3bf2fb2 100644 --- a/compute/etc/sql_exporter/neon_perf_counters.sql +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -1,13 +1,19 @@ WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + file_cache_read_wait_seconds_count numeric, + file_cache_read_wait_seconds_sum numeric, + file_cache_write_wait_seconds_count numeric, + file_cache_write_wait_seconds_sum numeric, getpage_wait_seconds_count numeric, getpage_wait_seconds_sum numeric, getpage_prefetch_requests_total numeric, getpage_sync_requests_total numeric, getpage_prefetch_misses_total numeric, getpage_prefetch_discards_total numeric, + getpage_prefetches_buffered numeric, pageserver_requests_sent_total numeric, pageserver_disconnects_total numeric, - pageserver_send_flushes_total numeric + pageserver_send_flushes_total numeric, + pageserver_open_requests numeric ); diff --git a/compute/etc/sql_exporter/pageserver_open_requests.libsonnet b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet new file mode 100644 index 000000000000..dca89ea64a24 --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_open_requests', + type: 'gauge', + help: 'Number of open requests to PageServer', + values: [ + 'pageserver_open_requests', + ], + query_ref: 'neon_perf_counters', +} From be5d6a69dc6a05d339235d00958eb9fea7b0e9f5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 16:30:31 -0500 Subject: [PATCH 17/38] Fix jsonnet_files wildcard Just a typo in a path. Signed-off-by: Tristan Partin --- compute/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compute/Makefile b/compute/Makefile index 45fbfa6d5e58..b407fc60beb2 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -1,4 +1,6 @@ -jsonnet_files = $(wildcard etc/*.jsonnet etc/*.libsonnet) +jsonnet_files = $(wildcard \ + etc/*.jsonnet \ + etc/sql_exporter/*.libsonnet) .PHONY: all all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml From 061ea0de7a9768716d941e2e3472f19e075a5ce5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 20:01:13 -0500 Subject: [PATCH 18/38] Add jsonnetfmt targets This should make it a little bit easier for people wanting to check if their files are formatted correctly. Has the added bonus of making the CI check simpler as well.
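For reference, the new targets can be run locally from the repository root (assuming `jsonnetfmt` is installed), for example:

```shell
# Check formatting the same way CI does:
make -C compute jsonnetfmt-test

# Rewrite any mis-formatted files in place:
make -C compute jsonnetfmt-format
```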
Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 3 +-- compute/Makefile | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c9a447626f19..faee1d89e16c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -136,8 +136,7 @@ jobs: - name: Check Jsonnet code formatting run: | - jsonnetfmt --test \ - $(find . -type f -name '*.jsonnet' -o -name '*.libsonnet') + make -C compute jsonnetfmt-test # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. diff --git a/compute/Makefile b/compute/Makefile index b407fc60beb2..f8faa882eedb 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -35,3 +35,11 @@ clean: etc/neon_collector_autoscaling.yml \ etc/sql_exporter.yml \ etc/sql_exporter_autoscaling.yml + +.PHONY: jsonnetfmt-test +jsonnetfmt-test: + jsonnetfmt --test $(jsonnet_files) + +.PHONY: jsonnetfmt-format +jsonnetfmt-format: + jsonnetfmt --in-place $(jsonnet_files) From bc6b8cee01cc4055332fef052c048856612bcbab Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Wed, 16 Oct 2024 10:43:48 +0100 Subject: [PATCH 19/38] don't trigger workflows in two repos (#9340) https://github.com/neondatabase/cloud/issues/16723 --- .github/workflows/build_and_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index faee1d89e16c..b669eaeb1152 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1100,7 +1100,6 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ From 89a65a9e5a30c7525d165d1a9c2675d05811bfcb Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 16 Oct 2024 13:39:58 +0100 Subject: [PATCH 20/38] pageserver: improve handling of archival_config calls during Timeline shutdown (#9415) ## Problem In test `test_timeline_offloading`, we see failures like: ``` PageserverApiException: queue is in state Stopped ``` Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/main/11356917668/index.html#testresult/ff0e348a78a974ee/retries ## Summary of changes - Amend code paths that handle errors from RemoteTimelineClient to check for cancellation and emit the Cancelled error variant in these cases (will give clients a 503 to retry) - Remove the implicit `#[from]` for the Other error case, to make it harder to add code that accidentally squashes errors into this (500-equivalent) error variant. This would be neater if we made RemoteTimelineClient return a structured error instead of anyhow::Error, but that's a bigger refactor. I'm not sure if the test really intends to hit this path, but the error handling fix makes sense either way. 
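To make the intended behaviour easier to follow before reading the diff, here is a minimal, self-contained sketch of the error-mapping pattern described above. The types are simplified stand-ins (the real `TimelineArchivalError` and call sites live in `pageserver/src/tenant.rs` below); the point is only that a failure observed while the timeline is shutting down should surface as a retryable `Cancelled` (503) rather than a generic `Other` (500).

```rust
use anyhow::anyhow;
use tokio_util::sync::CancellationToken;

// Simplified stand-in for the pageserver's error type.
#[derive(Debug)]
enum TimelineArchivalError {
    Cancelled,            // reported as 503 so clients retry
    Other(anyhow::Error), // reported as 500
}

// If the remote-client call failed while the timeline is shutting down,
// blame the shutdown (retryable) rather than the request (server error).
fn map_remote_client_error(e: anyhow::Error, cancel: &CancellationToken) -> TimelineArchivalError {
    if cancel.is_cancelled() {
        TimelineArchivalError::Cancelled
    } else {
        TimelineArchivalError::Other(e)
    }
}

fn main() {
    let cancel = CancellationToken::new();
    cancel.cancel(); // simulate timeline shutdown racing with the request
    let err = map_remote_client_error(anyhow!("queue is in state Stopped"), &cancel);
    println!("{err:?}"); // Cancelled
}
```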
--- pageserver/src/tenant.rs | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 44d1bb74ca34..20925c7fd61a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -67,7 +67,7 @@ use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::remote_timeline_client::upload::upload_index_part; -use self::remote_timeline_client::RemoteTimelineClient; +use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; @@ -632,7 +632,7 @@ pub enum TimelineArchivalError { AlreadyInProgress, #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl Debug for TimelineArchivalError { @@ -1602,7 +1602,8 @@ impl Tenant { "failed to load remote timeline {} for tenant {}", timeline_id, self.tenant_shard_id ) - })?; + }) + .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); if let Some(timeline) = timelines.get(&timeline_id) { let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); @@ -1672,9 +1673,19 @@ impl Tenant { }; // Third part: upload new timeline archival state and block until it is present in S3 - let upload_needed = timeline + let upload_needed = match timeline .remote_client - .schedule_index_upload_for_timeline_archival_state(new_state)?; + .schedule_index_upload_for_timeline_archival_state(new_state) + { + Ok(upload_needed) => upload_needed, + Err(e) => { + if timeline.cancel.is_cancelled() { + return Err(TimelineArchivalError::Cancelled); + } else { + return Err(TimelineArchivalError::Other(e)); + } + } + }; if upload_needed { info!("Uploading new state"); @@ -1685,7 +1696,14 @@ impl Tenant { tracing::warn!("reached timeout for waiting on upload queue"); return Err(TimelineArchivalError::Timeout); }; - v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; + v.map_err(|e| match e { + WaitCompletionError::NotInitialized(e) => { + TimelineArchivalError::Other(anyhow::anyhow!(e)) + } + WaitCompletionError::UploadQueueShutDownOrStopped => { + TimelineArchivalError::Cancelled + } + })?; } Ok(()) } From f14e45f0cee38bfbbbf1141d486fdd8edfbcc2f2 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 16 Oct 2024 15:01:56 +0200 Subject: [PATCH 21/38] proxy: format imports with nightly rustfmt (#9414) ```shell cargo +nightly fmt -p proxy -- -l --config imports_granularity=Module,group_imports=StdExternalCrate,reorder_imports=true ``` These rust-analyzer settings for VSCode should help retain this style: ```json "rust-analyzer.imports.group.enable": true, "rust-analyzer.imports.prefix": "crate", "rust-analyzer.imports.merge.glob": false, "rust-analyzer.imports.granularity.group": "module", "rust-analyzer.imports.granularity.enforce": true, ``` --- proxy/src/auth/backend/classic.rs | 19 +++-- proxy/src/auth/backend/console_redirect.rs | 21 +++-- proxy/src/auth/backend/hacks.rs | 19 +++-- proxy/src/auth/backend/jwt.rs | 39 +++++----- proxy/src/auth/backend/local.rs | 19 ++--- proxy/src/auth/backend/mod.rs | 61 +++++++-------- proxy/src/auth/credentials.rs | 25 +++--- proxy/src/auth/flow.rs | 25 +++--- proxy/src/auth/mod.rs | 12 +-- proxy/src/bin/local_proxy.rs | 50 ++++++------ proxy/src/bin/pg_sni_router.rs | 16 ++-- proxy/src/bin/proxy.rs | 51 +++++------- 
proxy/src/cache/endpoints.rs | 34 ++++---- proxy/src/cache/project_info.rs | 27 +++---- proxy/src/cache/timed_lru.rs | 13 ++-- proxy/src/cancellation.rs | 14 ++-- proxy/src/compute.rs | 30 ++++--- proxy/src/config.rs | 37 ++++----- proxy/src/console_redirect_proxy.rs | 29 ++++--- proxy/src/context/mod.rs | 21 ++--- proxy/src/context/parquet.rs | 49 ++++++------ proxy/src/control_plane/messages.rs | 7 +- proxy/src/control_plane/mgmt.rs | 10 +-- proxy/src/control_plane/provider/mock.rs | 39 +++++----- proxy/src/control_plane/provider/mod.rs | 47 ++++++----- proxy/src/control_plane/provider/neon.rs | 42 +++++----- proxy/src/error.rs | 3 +- proxy/src/http/health_server.rs | 25 +++--- proxy/src/http/mod.rs | 17 ++-- proxy/src/intern.rs | 14 ++-- proxy/src/jemalloc.rs | 16 ++-- proxy/src/logging.rs | 16 ++-- proxy/src/metrics.rs | 8 +- proxy/src/protocol2.rs | 10 +-- proxy/src/proxy/connect_compute.rs | 29 ++++--- proxy/src/proxy/copy_bidirectional.rs | 9 ++- proxy/src/proxy/handshake.rs | 20 +++-- proxy/src/proxy/mod.rs | 38 ++++----- proxy/src/proxy/passthrough.rs | 14 ++-- proxy/src/proxy/retry.rs | 8 +- proxy/src/proxy/tests/mitm.rs | 3 +- proxy/src/proxy/tests/mod.rs | 22 +++--- proxy/src/proxy/wake_compute.rs | 11 +-- proxy/src/rate_limiter/leaky_bucket.rs | 6 +- proxy/src/rate_limiter/limit_algorithm.rs | 12 +-- .../src/rate_limiter/limit_algorithm/aimd.rs | 3 +- proxy/src/rate_limiter/limiter.rs | 24 +++--- proxy/src/rate_limiter/mod.rs | 4 +- proxy/src/redis/cancellation_publisher.rs | 7 +- .../connection_with_credentials_provider.rs | 9 +-- proxy/src/redis/notifications.rs | 17 ++-- proxy/src/sasl/messages.rs | 3 +- proxy/src/sasl/mod.rs | 5 +- proxy/src/sasl/stream.rs | 7 +- proxy/src/scram/countmin.rs | 4 +- proxy/src/scram/exchange.rs | 3 +- proxy/src/scram/messages.rs | 5 +- proxy/src/scram/mod.rs | 15 ++-- proxy/src/scram/pbkdf2.rs | 10 +-- proxy/src/scram/threadpool.rs | 32 +++----- proxy/src/serverless/backend.rs | 60 +++++++------- proxy/src/serverless/cancel_set.rs | 8 +- proxy/src/serverless/conn_pool.rs | 44 +++++------ proxy/src/serverless/http_conn_pool.rs | 17 ++-- proxy/src/serverless/http_util.rs | 7 +- proxy/src/serverless/json.rs | 9 +-- proxy/src/serverless/local_conn_pool.rs | 23 +++--- proxy/src/serverless/mod.rs | 19 +++-- proxy/src/serverless/sql_over_http.rs | 78 ++++++------------- proxy/src/serverless/websocket.rs | 41 ++++------ proxy/src/stream.rs | 15 ++-- proxy/src/usage_metrics.rs | 41 +++++----- proxy/src/waiters.rs | 8 +- 73 files changed, 723 insertions(+), 832 deletions(-) diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 94b84b6f0034..de32a06e9e60 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo}; -use crate::{ - auth::{self, backend::ComputeCredentialKeys, AuthFlow}, - compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - sasl, - stream::{PqStream, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::stream::{PqStream, Stream}; +use crate::{compute, sasl}; + pub(super) async fn authenticate( ctx: &RequestMonitoring, creds: ComputeUserInfo, diff --git 
a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 457410ec8cec..255e1fed54fb 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,15 +1,3 @@ -use crate::{ - auth, - cache::Cached, - compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{self, provider::NodeInfo, CachedNodeInfo}, - error::{ReportableError, UserFacingError}, - proxy::connect_compute::ComputeConnectBackend, - stream::PqStream, - waiters, -}; use async_trait::async_trait; use pq_proto::BeMessage as Be; use thiserror::Error; @@ -18,6 +6,15 @@ use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; +use crate::cache::Cached; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::provider::NodeInfo; +use crate::control_plane::{self, CachedNodeInfo}; +use crate::error::{ReportableError, UserFacingError}; +use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::stream::PqStream; +use crate::{auth, compute, waiters}; #[derive(Debug, Error)] pub(crate) enum WebAuthError { diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 749218d260d4..8ab8d5d37f8a 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; -use crate::{ - auth::{self, AuthFlow}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - stream::{self, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::stream::{self, Stream}; + /// Compared to [SCRAM](crate::scram), cleartext password auth saves /// one round trip and *expensive* computations (>= 4096 HMAC iterations). /// These properties are benefical for serverless JS workers, so we diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 402e59fdb399..3f53ee24c39c 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,22 +1,22 @@ -use std::{ - future::Future, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; -use serde::{de::Visitor, Deserialize, Deserializer}; +use serde::de::Visitor; +use serde::{Deserialize, Deserializer}; use signature::Verifier; use thiserror::Error; use tokio::time::Instant; -use crate::{ - auth::backend::ComputeCredentialKeys, context::RequestMonitoring, - control_plane::errors::GetEndpointJwksError, http::parse_json_body_with_limit, - intern::RoleNameInt, EndpointId, RoleName, -}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::http::parse_json_body_with_limit; +use crate::intern::RoleNameInt; +use crate::{EndpointId, RoleName}; // TODO(conrad): make these configurable. 
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); @@ -381,10 +381,8 @@ fn verify_rsa_signature( alg: &jose_jwa::Algorithm, ) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; - use rsa::{ - pkcs1v15::{Signature, VerifyingKey}, - RsaPublicKey, - }; + use rsa::pkcs1v15::{Signature, VerifyingKey}; + use rsa::RsaPublicKey; let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; @@ -655,11 +653,9 @@ impl From<&jose_jwk::Key> for KeyType { #[cfg(test)] mod tests { - use crate::RoleName; - - use super::*; - - use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; + use std::future::IntoFuture; + use std::net::SocketAddr; + use std::time::SystemTime; use base64::URL_SAFE_NO_PAD; use bytes::Bytes; @@ -672,6 +668,9 @@ mod tests { use signature::Signer; use tokio::net::TcpListener; + use super::*; + use crate::RoleName; + fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { let sk = p256::SecretKey::random(&mut OsRng); let pk = sk.public_key().into(); diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 1dea4d2d73c3..e3995ac6c0cc 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -2,19 +2,14 @@ use std::net::SocketAddr; use arc_swap::ArcSwapOption; -use crate::{ - auth::backend::jwt::FetchAuthRulesError, - compute::ConnCfg, - context::RequestMonitoring, - control_plane::{ - messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}, - NodeInfo, - }, - intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}, - EndpointId, -}; - use super::jwt::{AuthRule, FetchAuthRules}; +use crate::auth::backend::jwt::FetchAuthRulesError; +use crate::compute::ConnCfg; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; +use crate::control_plane::NodeInfo; +use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; +use crate::EndpointId; pub struct LocalBackend { pub(crate) node_info: NodeInfo, diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7cf158bcd90d..a4db130b618b 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -17,29 +17,22 @@ use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::{validate_password_and_exchange, AuthError}; +use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; use crate::cache::Cached; +use crate::config::AuthenticationConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetAuthInfoError; -use crate::control_plane::provider::{CachedRoleSecret, ControlPlaneBackend}; -use crate::control_plane::AuthSecret; +use crate::control_plane::provider::{ + CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend, +}; +use crate::control_plane::{self, Api, AuthSecret}; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; -use crate::{ - auth::{self, ComputeUserInfoMaybeEndpoint}, - config::AuthenticationConfig, - control_plane::{ - self, - provider::{CachedAllowedIps, CachedNodeInfo}, - Api, - }, - stream, -}; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, stream, EndpointCacheKey, EndpointId, RoleName}; 
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -500,34 +493,32 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { #[cfg(test)] mod tests { - use std::{net::IpAddr, sync::Arc, time::Duration}; + use std::net::IpAddr; + use std::sync::Arc; + use std::time::Duration; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use once_cell::sync::Lazy; - use postgres_protocol::{ - authentication::sasl::{ChannelBinding, ScramSha256}, - message::{backend::Message as PgMessage, frontend}, - }; + use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; + use postgres_protocol::message::backend::Message as PgMessage; + use postgres_protocol::message::frontend; use provider::AuthSecret; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; - use crate::{ - auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{ - self, - provider::{self, CachedAllowedIps, CachedRoleSecret}, - CachedNodeInfo, - }, - proxy::NeonOptions, - rate_limiter::{EndpointRateLimiter, RateBucketInfo}, - scram::{threadpool::ThreadPool, ServerSecret}, - stream::{PqStream, Stream}, - }; - - use super::{auth_quirks, jwt::JwkCache, AuthRateLimiter}; + use super::jwt::JwkCache; + use super::{auth_quirks, AuthRateLimiter}; + use crate::auth::backend::MaskedIp; + use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; + use crate::config::AuthenticationConfig; + use crate::context::RequestMonitoring; + use crate::control_plane::provider::{self, CachedAllowedIps, CachedRoleSecret}; + use crate::control_plane::{self, CachedNodeInfo}; + use crate::proxy::NeonOptions; + use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; + use crate::scram::threadpool::ThreadPool; + use crate::scram::ServerSecret; + use crate::stream::{PqStream, Stream}; struct Auth { ips: Vec, diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index cba8601d143b..fa6bc4c6f5b7 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,20 +1,22 @@ //! User credentials used in authentication. -use crate::{ - auth::password_hack::parse_endpoint_param, - context::RequestMonitoring, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, SniKind}, - proxy::NeonOptions, - serverless::SERVERLESS_DRIVER_SNI, - EndpointId, RoleName, -}; +use std::collections::HashSet; +use std::net::IpAddr; +use std::str::FromStr; + use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; +use crate::auth::password_hack::parse_endpoint_param; +use crate::context::RequestMonitoring; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, SniKind}; +use crate::proxy::NeonOptions; +use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::{EndpointId, RoleName}; + #[derive(Debug, Error, PartialEq, Eq, Clone)] pub(crate) enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] @@ -249,10 +251,11 @@ fn project_name_valid(name: &str) -> bool { #[cfg(test)] mod tests { - use super::*; use serde_json::json; use ComputeUserInfoParseError::*; + use super::*; + #[test] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. 
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 9a5139dfb845..ccb17b66b9f3 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,21 +1,24 @@ //! Main authentication flow. -use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; -use crate::{ - config::TlsServerEndPoint, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - scram::{self, threadpool::ThreadPool}, - stream::{PqStream, Stream}, -}; +use std::io; +use std::sync::Arc; + use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::backend::ComputeCredentialKeys; +use super::{AuthErrorImpl, PasswordHackPayload}; +use crate::config::TlsServerEndPoint; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::scram::threadpool::ThreadPool; +use crate::scram::{self}; +use crate::stream::{PqStream, Stream}; + /// Every authentication selector is supposed to implement this trait. pub(crate) trait AuthMethod { /// Any authentication selector should provide initial backend message diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0c8686add2a5..ff97e6c35d70 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -14,15 +14,15 @@ pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; +use std::io; +use std::net::IpAddr; + pub(crate) use flow::*; +use thiserror::Error; use tokio::time::error::Elapsed; -use crate::{ - control_plane, - error::{ReportableError, UserFacingError}, -}; -use std::{io, net::IpAddr}; -use thiserror::Error; +use crate::control_plane; +use crate::error::{ReportableError, UserFacingError}; /// Convenience wrapper for the authentication error. 
pub(crate) type Result = std::result::Result; diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index c92ebbc51f3c..e6bc369d9a7c 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,41 +1,43 @@ -use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration}; +use std::net::SocketAddr; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; use anyhow::{bail, ensure, Context}; use camino::{Utf8Path, Utf8PathBuf}; use compute_api::spec::LocalProxySpec; use dashmap::DashMap; use futures::future::Either; -use proxy::{ - auth::{ - self, - backend::{ - jwt::JwkCache, - local::{LocalBackend, JWKS_ROLE_MAP}, - }, - }, - cancellation::CancellationHandlerMain, - config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, - control_plane::{ - locks::ApiLocks, - messages::{EndpointJwksResponse, JwksSettings}, - }, - http::health_server::AppMetrics, - intern::RoleNameInt, - metrics::{Metrics, ThreadPoolMetrics}, - rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, - scram::threadpool::ThreadPool, - serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, - RoleName, +use proxy::auth::backend::jwt::JwkCache; +use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use proxy::auth::{self}; +use proxy::cancellation::CancellationHandlerMain; +use proxy::config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}; +use proxy::control_plane::locks::ApiLocks; +use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use proxy::http::health_server::AppMetrics; +use proxy::intern::RoleNameInt; +use proxy::metrics::{Metrics, ThreadPoolMetrics}; +use proxy::rate_limiter::{ + BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, }; +use proxy::scram::threadpool::ThreadPool; +use proxy::serverless::cancel_set::CancelSet; +use proxy::serverless::{self, GlobalConnPoolOptions}; +use proxy::RoleName; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); use clap::Parser; -use tokio::{net::TcpListener, sync::Notify, task::JoinSet}; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 53f1586abe17..00eb830d98a6 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -5,25 +5,23 @@ /// the outside. Similar to an ingress controller for HTTPS. 
use std::{net::SocketAddr, sync::Arc}; +use anyhow::{anyhow, bail, ensure, Context}; +use clap::Arg; use futures::future::Either; +use futures::TryFutureExt; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use rustls::pki_types::PrivateKeyDer; -use tokio::net::TcpListener; - -use anyhow::{anyhow, bail, ensure, Context}; -use clap::Arg; -use futures::TryFutureExt; use proxy::stream::{PqStream, Stream}; - +use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use utils::{project_git_version, sentry_init::init_sentry}; - use tracing::{error, info, Instrument}; +use utils::project_git_version; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3c0e66dec3f1..96a71e69c6ec 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,3 +1,8 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::sync::Arc; + +use anyhow::bail; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -7,52 +12,34 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_config::Region; use futures::future::Either; -use proxy::auth; use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::AuthRateLimiter; -use proxy::auth::backend::ConsoleRedirectBackend; -use proxy::auth::backend::MaybeOwned; -use proxy::cancellation::CancelMap; -use proxy::cancellation::CancellationHandler; -use proxy::config::remote_storage_from_toml; -use proxy::config::AuthenticationConfig; -use proxy::config::CacheOptions; -use proxy::config::HttpConfig; -use proxy::config::ProjectInfoCacheOptions; -use proxy::config::ProxyProtocolV2; +use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; +use proxy::cancellation::{CancelMap, CancellationHandler}; +use proxy::config::{ + self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig, + ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, +}; use proxy::context::parquet::ParquetUploadArgs; -use proxy::control_plane; -use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; -use proxy::rate_limiter::EndpointRateLimiter; -use proxy::rate_limiter::LeakyBucketConfig; -use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::WakeComputeRateLimiter; +use proxy::rate_limiter::{ + EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, +}; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use proxy::redis::elasticache; -use proxy::redis::notifications; +use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; -use proxy::usage_metrics; - -use anyhow::bail; -use proxy::config::{self, ProxyConfig}; -use proxy::serverless; +use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; -use std::net::SocketAddr; -use 
std::pin::pin; -use std::sync::Arc; use tokio::net::TcpListener; use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use tracing::Instrument; -use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 27121ce89e84..82f3247fa7b8 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -1,31 +1,23 @@ -use std::{ - convert::Infallible, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::convert::Infallible; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::Duration; use dashmap::DashSet; -use redis::{ - streams::{StreamReadOptions, StreamReadReply}, - AsyncCommands, FromRedisValue, Value, -}; +use redis::streams::{StreamReadOptions, StreamReadReply}; +use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; use tracing::info; -use crate::{ - config::EndpointCacheConfig, - context::RequestMonitoring, - intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, - rate_limiter::GlobalRateLimiter, - redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, -}; +use crate::config::EndpointCacheConfig; +use crate::context::RequestMonitoring; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; +use crate::rate_limiter::GlobalRateLimiter; +use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::EndpointId; #[derive(Deserialize, Debug, Clone)] pub(crate) struct ControlPlaneEventKey { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index b92cedb04381..31d1dc96e701 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,9 +1,8 @@ -use std::{ - collections::HashSet, - convert::Infallible, - sync::{atomic::AtomicU64, Arc}, - time::Duration, -}; +use std::collections::HashSet; +use std::convert::Infallible; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use dashmap::DashMap; @@ -13,15 +12,12 @@ use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; -use crate::{ - auth::IpPattern, - config::ProjectInfoCacheOptions, - control_plane::AuthSecret, - intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, - EndpointId, RoleName, -}; - use super::{Cache, Cached}; +use crate::auth::IpPattern; +use crate::config::ProjectInfoCacheOptions; +use crate::control_plane::AuthSecret; +use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { @@ -371,7 +367,8 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::{scram::ServerSecret, ProjectId}; + use crate::scram::ServerSecret; + use crate::ProjectId; #[tokio::test] async fn test_project_info_cache_settings() { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 5b08d74696c6..06eaeb9a30ab 100644 --- 
a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -1,9 +1,6 @@ -use std::{ - borrow::Borrow, - hash::Hash, - time::{Duration, Instant}, -}; -use tracing::debug; +use std::borrow::Borrow; +use std::hash::Hash; +use std::time::{Duration, Instant}; // This seems to make more sense than `lru` or `cached`: // @@ -15,8 +12,10 @@ use tracing::debug; // // On the other hand, `hashlink` has good download stats and appears to be maintained. use hashlink::{linked_hash_map::RawEntryMut, LruCache}; +use tracing::debug; -use super::{common::Cached, timed_lru, Cache}; +use super::common::Cached; +use super::{timed_lru, Cache}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 71a2a16af876..db0970adcbc3 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,6 +1,8 @@ +use std::net::SocketAddr; +use std::sync::Arc; + use dashmap::DashMap; use pq_proto::CancelKeyData; -use std::{net::SocketAddr, sync::Arc}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; @@ -8,12 +10,10 @@ use tokio_postgres::{CancelToken, NoTls}; use tracing::info; use uuid::Uuid; -use crate::{ - error::ReportableError, - metrics::{CancellationRequest, CancellationSource, Metrics}, - redis::cancellation_publisher::{ - CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, - }, +use crate::error::ReportableError; +use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; pub type CancelMap = Arc>>; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 006804fcd4ca..212e82497f48 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,25 +1,31 @@ -use crate::{ - auth::parse_endpoint_param, - cancellation::CancelClosure, - context::RequestMonitoring, - control_plane::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, NumDbConnectionsGuard}, - proxy::neon_option, - Host, -}; +use std::io; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; -use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}; -use std::{io, net::SocketAddr, sync::Arc, time::Duration}; +use rustls::client::danger::ServerCertVerifier; +use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; +use crate::auth::parse_endpoint_param; +use crate::cancellation::CancelClosure; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::ApiLockError; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, NumDbConnectionsGuard}; +use crate::proxy::neon_option; +use crate::Host; + pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] diff --git a/proxy/src/config.rs b/proxy/src/config.rs index c068fc50fb81..2ec8c7adda9a 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,29 +1,27 @@ -use crate::{ - 
auth::backend::{jwt::JwkCache, AuthRateLimiter}, - control_plane::locks::ApiLocks, - rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, - scram::threadpool::ThreadPool, - serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, - Host, -}; +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + use anyhow::{bail, ensure, Context, Ok}; use clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::{ - crypto::ring::sign, - pki_types::{CertificateDer, PrivateKeyDer}, -}; +use rustls::crypto::ring::sign; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; -use std::{ - collections::{HashMap, HashSet}, - str::FromStr, - sync::Arc, - time::Duration, -}; use tracing::{error, info}; use x509_parser::oid_registry; +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::AuthRateLimiter; +use crate::control_plane::locks::ApiLocks; +use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::GlobalConnPoolOptions; +use crate::Host; + pub struct ProxyConfig { pub tls_config: Option, pub metric_collection: Option, @@ -692,9 +690,8 @@ impl FromStr for ConcurrencyLockOptions { #[cfg(test)] mod tests { - use crate::rate_limiter::Aimd; - use super::*; + use crate::rate_limiter::Aimd; #[test] fn test_parse_cache_options() -> anyhow::Result<()> { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 9e1797672021..81d1d70958d2 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -1,25 +1,22 @@ -use crate::auth::backend::ConsoleRedirectBackend; -use crate::config::{ProxyConfig, ProxyProtocolV2}; -use crate::proxy::{ - prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, -}; -use crate::{ - cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}, - context::RequestMonitoring, - error::ReportableError, - metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::read_proxy_protocol, - proxy::handshake::{handshake, HandshakeData}, -}; -use futures::TryFutureExt; use std::sync::Arc; + +use futures::TryFutureExt; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{error, info, Instrument}; +use crate::auth::backend::ConsoleRedirectBackend; +use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::proxy::passthrough::ProxyPassthrough; use crate::proxy::{ - connect_compute::{connect_to_compute, TcpMechanism}, - passthrough::ProxyPassthrough, + prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, }; pub async fn task_main( diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 7fb4e7c69835..e2d2c1b7668a 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -1,24 +1,25 @@ //! 
Connection request monitoring contexts +use std::net::IpAddr; + use chrono::Utc; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use smol_str::SmolStr; -use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{debug, field::display, info, info_span, Span}; +use tracing::field::display; +use tracing::{debug, info, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; -use crate::{ - control_plane::messages::{ColdStartInfo, MetricsAuxInfo}, - error::ErrorKind, - intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, - DbName, EndpointId, RoleName, -}; - use self::parquet::RequestData; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::error::ErrorKind; +use crate::intern::{BranchIdInt, ProjectIdInt}; +use crate::metrics::{ + ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, +}; +use crate::{DbName, EndpointId, RoleName}; pub mod parquet; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 9f6f83022ed6..b0ad0e45662b 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -1,29 +1,28 @@ -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use bytes::{buf::Writer, BufMut, BytesMut}; +use bytes::buf::Writer; +use bytes::{BufMut, BytesMut}; use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; -use parquet::{ - basic::Compression, - file::{ - metadata::RowGroupMetaDataPtr, - properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}, - writer::SerializedFileWriter, - }, - record::RecordWriter, -}; +use parquet::basic::Compression; +use parquet::file::metadata::RowGroupMetaDataPtr; +use parquet::file::properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}; +use parquet::file::writer::SerializedFileWriter; +use parquet::record::RecordWriter; use pq_proto::StartupMessageParams; use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; -use tokio::{sync::mpsc, time}; +use tokio::sync::mpsc; +use tokio::time; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; - use super::{RequestMonitoringInner, LOG_CHAN}; +use crate::config::remote_storage_from_toml; +use crate::context::LOG_CHAN_DISCONNECT; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -407,26 +406,26 @@ async fn upload_parquet( #[cfg(test)] mod tests { - use std::{net::Ipv4Addr, num::NonZeroUsize, sync::Arc}; + use std::net::Ipv4Addr; + use std::num::NonZeroUsize; + use std::sync::Arc; use camino::Utf8Path; use clap::Parser; use futures::{Stream, StreamExt}; use itertools::Itertools; - use parquet::{ - basic::{Compression, ZstdLevel}, - file::{ - properties::{WriterProperties, DEFAULT_PAGE_SIZE}, - reader::FileReader, - serialized_reader::SerializedFileReader, - }, - }; - use rand::{rngs::StdRng, Rng, SeedableRng}; + use parquet::basic::{Compression, ZstdLevel}; + use parquet::file::properties::{WriterProperties, DEFAULT_PAGE_SIZE}; + use parquet::file::reader::FileReader; + use parquet::file::serialized_reader::SerializedFileReader; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use remote_storage::{ GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, 
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; - use tokio::{sync::mpsc, time}; + use tokio::sync::mpsc; + use tokio::time; use walkdir::WalkDir; use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 960bb5bc2126..dae23f7c5311 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -1,9 +1,9 @@ +use std::fmt::{self, Display}; + use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; -use std::fmt::{self, Display}; use crate::auth::IpPattern; - use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; @@ -362,9 +362,10 @@ pub struct JwksSettings { #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + fn dummy_aux() -> serde_json::Value { json!({ "endpoint_id": "endpoint", diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 2c4b5a9b94e5..5ac3acd28a79 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -1,16 +1,16 @@ -use crate::{ - control_plane::messages::{DatabaseInfo, KickSession}, - waiters::{self, Waiter, Waiters}, -}; +use std::convert::Infallible; + use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; +use crate::control_plane::messages::{DatabaseInfo, KickSession}; +use crate::waiters::{self, Waiter, Waiters}; + static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's reply. diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index 51cddec67248..fb061376e7a9 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -1,28 +1,29 @@ //! Mock console backend which relies on a user-provided postgres instance. 
-use super::{ - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, -}; -use crate::{ - auth::backend::jwt::AuthRule, context::RequestMonitoring, - control_plane::errors::GetEndpointJwksError, intern::RoleNameInt, RoleName, -}; -use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; -use crate::{auth::IpPattern, cache::Cached}; -use crate::{ - control_plane::{ - messages::MetricsAuxInfo, - provider::{CachedAllowedIps, CachedRoleSecret}, - }, - BranchId, EndpointId, ProjectId, -}; +use std::str::FromStr; +use std::sync::Arc; + use futures::TryFutureExt; -use std::{str::FromStr, sync::Arc}; use thiserror::Error; -use tokio_postgres::{config::SslMode, Client}; +use tokio_postgres::config::SslMode; +use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::IpPattern; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret}; +use crate::error::io_error; +use crate::intern::RoleNameInt; +use crate::url::ApiUrl; +use crate::{compute, scram, BranchId, EndpointId, ProjectId, RoleName}; + #[derive(Debug, Error)] enum MockApiError { #[error("Failed to read password: {0}")] diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 0a196fe2a35a..a4a330cd5fec 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -2,39 +2,36 @@ pub mod mock; pub mod neon; -use super::messages::{ControlPlaneError, MetricsAuxInfo}; -use crate::{ - auth::{ - backend::{ - jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}, - ComputeCredentialKeys, ComputeUserInfo, - }, - IpPattern, - }, - cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, - compute, - config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, - context::RequestMonitoring, - error::ReportableError, - intern::ProjectIdInt, - metrics::ApiLockMetrics, - rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, - scram, EndpointCacheKey, EndpointId, -}; +use std::hash::Hash; +use std::sync::Arc; +use std::time::Duration; + use dashmap::DashMap; -use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::time::Instant; use tracing::info; +use super::messages::{ControlPlaneError, MetricsAuxInfo}; +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::IpPattern; +use crate::cache::endpoints::EndpointsCache; +use crate::cache::project_info::ProjectInfoCacheImpl; +use crate::cache::{Cached, TimedLru}; +use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::intern::ProjectIdInt; +use crate::metrics::ApiLockMetrics; +use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; +use crate::{compute, scram, EndpointCacheKey, EndpointId}; + pub(crate) mod errors { - use crate::{ - control_plane::messages::{self, ControlPlaneError, Reason}, - 
error::{io_error, ErrorKind, ReportableError, UserFacingError}, - proxy::retry::CouldRetry, - }; use thiserror::Error; use super::ApiLockError; + use crate::control_plane::messages::{self, ControlPlaneError, Reason}; + use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; + use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. pub(crate) const REQUEST_FAILED: &str = "Console request failed"; diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index 2487ce0e3f40..5d0692c7ca21 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -1,31 +1,31 @@ //! Production console backend. -use super::{ - super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}, - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, - NodeInfo, -}; -use crate::{ - auth::backend::{jwt::AuthRule, ComputeUserInfo}, - compute, - control_plane::{ - errors::GetEndpointJwksError, - messages::{ColdStartInfo, EndpointJwksResponse, Reason}, - }, - http, - metrics::{CacheOutcome, Metrics}, - rate_limiter::WakeComputeRateLimiter, - scram, EndpointCacheKey, EndpointId, -}; -use crate::{cache::Cached, context::RequestMonitoring}; -use ::http::{header::AUTHORIZATION, HeaderName}; +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; use futures::TryFutureExt; -use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{debug, info, info_span, warn, Instrument}; +use super::super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{ + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, + NodeInfo, +}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use crate::{compute, http, scram, EndpointCacheKey, EndpointId}; + const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); #[derive(Clone)] diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 1cd4dc2c221c..e71ed0c048f0 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,4 +1,5 @@ -use std::{error::Error as StdError, fmt, io}; +use std::error::Error as StdError; +use std::{fmt, io}; use measured::FixedCardinalityLabel; diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index d0352351d55f..978ad9f76131 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,19 +1,18 @@ +use std::convert::Infallible; +use std::net::TcpListener; +use std::sync::{Arc, Mutex}; + use anyhow::{anyhow, bail}; -use hyper0::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; -use measured::{text::BufferedTextEncoder, MetricGroup}; +use hyper0::header::CONTENT_TYPE; +use hyper0::{Body, Request, Response, StatusCode}; +use measured::text::BufferedTextEncoder; +use measured::MetricGroup; use metrics::NeonMetrics; -use std::{ - convert::Infallible, - net::TcpListener, - sync::{Arc, 
Mutex}, -}; use tracing::{info, info_span}; -use utils::http::{ - endpoint::{self, request_span}, - error::ApiError, - json::json_response, - RouterBuilder, RouterService, -}; +use utils::http::endpoint::{self, request_span}; +use utils::http::error::ApiError; +use utils::http::json::json_response; +use utils::http::{RouterBuilder, RouterService}; use crate::jemalloc; diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index d8676d5b5008..fd587e8f01f5 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -10,17 +10,15 @@ use anyhow::bail; use bytes::Bytes; use http_body_util::BodyExt; use hyper::body::Body; -use serde::de::DeserializeOwned; - pub(crate) use reqwest::{Request, Response}; +use reqwest_middleware::RequestBuilder; pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; -pub(crate) use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +pub(crate) use reqwest_retry::policies::ExponentialBackoff; +pub(crate) use reqwest_retry::RetryTransientMiddleware; +use serde::de::DeserializeOwned; -use crate::{ - metrics::{ConsoleRequest, Metrics}, - url::ApiUrl, -}; -use reqwest_middleware::RequestBuilder; +use crate::metrics::{ConsoleRequest, Metrics}; +use crate::url::ApiUrl; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). @@ -142,9 +140,10 @@ pub(crate) async fn parse_json_body_with_limit( #[cfg(test)] mod tests { - use super::*; use reqwest::Client; + use super::*; + #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 108420d7d739..09fd9657d076 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -1,6 +1,8 @@ -use std::{ - hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock, -}; +use std::hash::BuildHasherDefault; +use std::marker::PhantomData; +use std::num::NonZeroUsize; +use std::ops::Index; +use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; @@ -208,9 +210,8 @@ impl From for ProjectIdInt { mod tests { use std::sync::OnceLock; - use crate::intern::StringInterner; - use super::InternId; + use crate::intern::StringInterner; struct MyId; impl InternId for MyId { @@ -222,7 +223,8 @@ mod tests { #[test] fn push_many_strings() { - use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use rand_distr::Zipf; let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index d307d80f4af9..0fae78b60cd6 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,14 +1,12 @@ use std::marker::PhantomData; -use measured::{ - label::NoLabels, - metric::{ - gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, - MetricFamilyEncoding, MetricType, - }, - text::TextEncoder, - LabelGroup, MetricGroup, -}; +use measured::label::NoLabels; +use measured::metric::gauge::GaugeState; +use measured::metric::group::Encoding; +use measured::metric::name::MetricNameEncoder; +use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType}; +use measured::text::TextEncoder; +use measured::{LabelGroup, MetricGroup}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index a34eb820f81b..11921867e44b 100644 --- a/proxy/src/logging.rs +++ 
b/proxy/src/logging.rs @@ -1,14 +1,10 @@ use tracing::Subscriber; -use tracing_subscriber::{ - filter::{EnvFilter, LevelFilter}, - fmt::{ - format::{Format, Full}, - time::SystemTime, - FormatEvent, FormatFields, - }, - prelude::*, - registry::LookupSpan, -}; +use tracing_subscriber::filter::{EnvFilter, LevelFilter}; +use tracing_subscriber::fmt::format::{Format, Full}; +use tracing_subscriber::fmt::time::SystemTime; +use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::registry::LookupSpan; /// Initialize logging and OpenTelemetry tracing and exporter. /// diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 272723a1bccc..542826e83375 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,14 +1,16 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; +use measured::label::{ + FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet, +}; +use measured::metric::histogram::Thresholds; +use measured::metric::name::MetricName; use measured::{ - label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, - metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; - use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 17764f78d15d..ef2391cdd805 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,11 +1,9 @@ //! Proxy Protocol V2 implementation -use std::{ - io, - net::SocketAddr, - pin::Pin, - task::{Context, Poll}, -}; +use std::io; +use std::net::SocketAddr; +use std::pin::Pin; +use std::task::{Context, Poll}; use bytes::BytesMut; use pin_project_lite::pin_project; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index aac77208900a..8e9663626a51 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,24 +1,23 @@ -use crate::{ - auth::backend::ComputeCredentialKeys, - compute::COULD_NOT_CONNECT, - compute::{self, PostgresConnection}, - config::RetryConfig, - context::RequestMonitoring, - control_plane::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, - error::ReportableError, - metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, - proxy::{ - retry::{retry_after, should_retry, CouldRetry}, - wake_compute::wake_compute, - }, - Host, -}; use async_trait::async_trait; use pq_proto::StartupMessageParams; use tokio::time; use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; +use crate::auth::backend::ComputeCredentialKeys; +use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; +use crate::config::RetryConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ReportableError; +use crate::metrics::{ + ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, +}; +use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; +use crate::proxy::wake_compute::wake_compute; +use crate::Host; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); diff --git 
a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4ebda013ac3c..91a3ceff75a7 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,11 +1,11 @@ -use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; -use tracing::info; - use std::future::poll_fn; use std::io; use std::pin::Pin; use std::task::{ready, Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tracing::info; + #[derive(Debug)] enum TransferState { Running(CopyBuffer), @@ -256,9 +256,10 @@ impl CopyBuffer { #[cfg(test)] mod tests { - use super::*; use tokio::io::AsyncWriteExt; + use super::*; + #[tokio::test] async fn test_client_to_compute() { let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 5996b11c11db..a67f1b811264 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,21 +1,19 @@ use bytes::Buf; +use pq_proto::framed::Framed; use pq_proto::{ - framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, - StartupMessageParams, + BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams, }; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; -use crate::{ - auth::endpoint_sni, - config::{TlsConfig, PG_ALPN_PROTOCOL}, - context::RequestMonitoring, - error::ReportableError, - metrics::Metrics, - proxy::ERR_INSECURE_CONNECTION, - stream::{PqStream, Stream, StreamUpgradeError}, -}; +use crate::auth::endpoint_sni; +use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::Metrics; +use crate::proxy::ERR_INSECURE_CONNECTION; +use crate::stream::{PqStream, Stream, StreamUpgradeError}; #[derive(Error, Debug)] pub(crate) enum HandshakeError { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index b2b5a7f43d6c..f646862caadd 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -7,40 +7,32 @@ pub(crate) mod handshake; pub(crate) mod passthrough; pub(crate) mod retry; pub(crate) mod wake_compute; -pub use copy_bidirectional::copy_bidirectional_client_compute; -pub use copy_bidirectional::ErrorSource; - -use crate::config::ProxyProtocolV2; -use crate::{ - auth, - cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, - compute, - config::{ProxyConfig, TlsConfig}, - context::RequestMonitoring, - error::ReportableError, - metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::read_proxy_protocol, - proxy::handshake::{handshake, HandshakeData}, - rate_limiter::EndpointRateLimiter, - stream::{PqStream, Stream}, - EndpointCacheKey, -}; +use std::sync::Arc; + +pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; use futures::TryFutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; use smol_str::{format_smolstr, SmolStr}; -use std::sync::Arc; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use self::{ - connect_compute::{connect_to_compute, TcpMechanism}, - passthrough::ProxyPassthrough, -}; +use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::passthrough::ProxyPassthrough; +use crate::cancellation::{self, 
CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::stream::{PqStream, Stream}; +use crate::{auth, compute, EndpointCacheKey}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 497cf4bfd523..e3b473098276 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,16 +1,14 @@ -use crate::{ - cancellation, - compute::PostgresConnection, - control_plane::messages::MetricsAuxInfo, - metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, - stream::Stream, - usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; +use crate::cancellation; +use crate::compute::PostgresConnection; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; +use crate::stream::Stream; +use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 15895d37e629..d3f0c3e7d471 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,7 +1,11 @@ -use crate::{compute, config::RetryConfig}; -use std::{error::Error, io}; +use std::error::Error; +use std::io; + use tokio::time; +use crate::compute; +use crate::config::RetryConfig; + pub(crate) trait CouldRetry { /// Returns true if the error could be retried fn could_retry(&self) -> bool; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 33a2162bc714..df9f79a7e39f 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -6,7 +6,6 @@ use std::fmt::Debug; -use super::*; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; @@ -14,6 +13,8 @@ use tokio::io::{AsyncReadExt, DuplexStream}; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; +use super::*; + enum Intercept { None, Methods, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index deb4d4a63f61..e50ae4bc93c5 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -4,6 +4,16 @@ mod mitm; use std::time::Duration; +use anyhow::{bail, Context}; +use async_trait::async_trait; +use http::StatusCode; +use retry::{retry_after, ShouldRetryWakeCompute}; +use rstest::rstest; +use rustls::pki_types; +use tokio_postgres::config::SslMode; +use tokio_postgres::tls::{MakeTlsConnect, NoTls}; +use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; + use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; use super::*; @@ -18,15 +28,6 @@ use crate::control_plane::provider::{ use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; -use anyhow::{bail, Context}; -use 
async_trait::async_trait; -use http::StatusCode; -use retry::{retry_after, ShouldRetryWakeCompute}; -use rstest::rstest; -use rustls::pki_types; -use tokio_postgres::config::SslMode; -use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; /// Generate a set of TLS certificates: CA + server. fn generate_certs( @@ -336,7 +337,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); - use rand::{distributions::Alphanumeric, Rng}; + use rand::distributions::Alphanumeric; + use rand::Rng; let password: String = rand::thread_rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 0d1527a2c118..9dfa485fa48d 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,16 +1,17 @@ +use hyper::StatusCode; +use tracing::{error, info, warn}; + +use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::{ControlPlaneError, Reason}; -use crate::control_plane::{errors::WakeComputeError, provider::CachedNodeInfo}; +use crate::control_plane::provider::CachedNodeInfo; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, WakeupFailureKind, }; use crate::proxy::retry::{retry_after, should_retry}; -use hyper::StatusCode; -use tracing::{error, info, warn}; - -use super::connect_compute::ComputeConnectBackend; pub(crate) async fn wake_compute( num_retries: &mut u32, diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index bf4d85f2e42e..45f9630dde0f 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -1,7 +1,5 @@ -use std::{ - hash::Hash, - sync::atomic::{AtomicUsize, Ordering}, -}; +use std::hash::Hash; +use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use dashmap::DashMap; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 25607b7e10fd..16c398f303f4 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -1,10 +1,12 @@ //! Algorithms for controlling concurrency limits. 
+use std::pin::pin; +use std::sync::Arc; +use std::time::Duration; + use parking_lot::Mutex; -use std::{pin::pin, sync::Arc, time::Duration}; -use tokio::{ - sync::Notify, - time::{error::Elapsed, Instant}, -}; +use tokio::sync::Notify; +use tokio::time::error::Elapsed; +use tokio::time::Instant; use self::aimd::Aimd; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 86b56e38fbc4..5332a5184f99 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -60,12 +60,11 @@ impl LimitAlgorithm for Aimd { mod tests { use std::time::Duration; + use super::*; use crate::rate_limiter::limit_algorithm::{ DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, }; - use super::*; - #[tokio::test(start_paused = true)] async fn increase_decrease() { let config = RateLimiterConfig { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index be529f174d5d..5de64c22544e 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,17 +1,14 @@ -use std::{ - borrow::Cow, - collections::hash_map::RandomState, - hash::{BuildHasher, Hash}, - sync::{ - atomic::{AtomicUsize, Ordering}, - Mutex, - }, -}; +use std::borrow::Cow; +use std::collections::hash_map::RandomState; +use std::hash::{BuildHasher, Hash}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Mutex; use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; -use rand::{rngs::StdRng, Rng, SeedableRng}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; @@ -243,14 +240,17 @@ impl BucketRateLimiter { #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, time::Duration}; + use std::hash::BuildHasherDefault; + use std::time::Duration; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; use super::{BucketRateLimiter, WakeComputeRateLimiter}; - use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; + use crate::intern::EndpointIdInt; + use crate::rate_limiter::RateBucketInfo; + use crate::EndpointId; #[test] fn rate_bucket_rpi() { diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs index 6e38f8945899..3ae2ecaf8f96 100644 --- a/proxy/src/rate_limiter/mod.rs +++ b/proxy/src/rate_limiter/mod.rs @@ -2,13 +2,11 @@ mod leaky_bucket; mod limit_algorithm; mod limiter; +pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; #[cfg(test)] pub(crate) use limit_algorithm::aimd::Aimd; - pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; pub(crate) use limiter::GlobalRateLimiter; - -pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 95bdfc096534..000024697159 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,13 +5,10 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; -use super::{ - 
connection_with_credentials_provider::ConnectionWithCredentialsProvider, - notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, -}; - pub trait CancellationPublisherMut: Send + Sync + 'static { #[allow(async_fn_in_trait)] async fn try_publish( diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index ccd48f148156..82139ea1d5e5 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,10 +1,9 @@ -use std::{sync::Arc, time::Duration}; +use std::sync::Arc; +use std::time::Duration; use futures::FutureExt; -use redis::{ - aio::{ConnectionLike, MultiplexedConnection}, - ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, -}; +use redis::aio::{ConnectionLike, MultiplexedConnection}; +use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult}; use tokio::task::JoinHandle; use tracing::{debug, error, info, warn}; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index c3af6740cb53..e56c5a341433 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -1,4 +1,5 @@ -use std::{convert::Infallible, sync::Arc}; +use std::convert::Infallible; +use std::sync::Arc; use futures::StreamExt; use pq_proto::CancelKeyData; @@ -8,12 +9,10 @@ use tokio_util::sync::CancellationToken; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::{ - cache::project_info::ProjectInfoCache, - cancellation::{CancelMap, CancellationHandler}, - intern::{ProjectIdInt, RoleNameInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, -}; +use crate::cache::project_info::ProjectInfoCache; +use crate::cancellation::{CancelMap, CancellationHandler}; +use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; @@ -269,10 +268,10 @@ where #[cfg(test)] mod tests { - use crate::{ProjectId, RoleName}; + use serde_json::json; use super::*; - use serde_json::json; + use crate::{ProjectId, RoleName}; #[test] fn parse_allowed_ips() -> anyhow::Result<()> { diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 6c9a42b2db74..1373dfba3d9a 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -1,8 +1,9 @@ //! Definitions for SASL messages. -use crate::parse::{split_at_const, split_cstr}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; +use crate::parse::{split_at_const, split_cstr}; + /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] pub(crate) struct FirstMessage<'a> { diff --git a/proxy/src/sasl/mod.rs b/proxy/src/sasl/mod.rs index 0a3669435981..f0181b404f5e 100644 --- a/proxy/src/sasl/mod.rs +++ b/proxy/src/sasl/mod.rs @@ -10,13 +10,14 @@ mod channel_binding; mod messages; mod stream; -use crate::error::{ReportableError, UserFacingError}; use std::io; -use thiserror::Error; pub(crate) use channel_binding::ChannelBinding; pub(crate) use messages::FirstMessage; pub(crate) use stream::{Outcome, SaslStream}; +use thiserror::Error; + +use crate::error::{ReportableError, UserFacingError}; /// Fine-grained auth errors help in writing tests. 
#[derive(Error, Debug)] diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index b6becd28e1a8..f1c916daa2b7 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -1,11 +1,14 @@ //! Abstraction for the string-oriented SASL protocols. -use super::{messages::ServerMessage, Mechanism}; -use crate::stream::PqStream; use std::io; + use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::messages::ServerMessage; +use super::Mechanism; +use crate::stream::PqStream; + /// Abstracts away all peculiarities of the libpq's protocol. pub(crate) struct SaslStream<'a, S> { /// The underlying stream. diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 64ee0135e17f..87ab6e0d5f5b 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -69,7 +69,9 @@ impl CountMinSketch { #[cfg(test)] mod tests { - use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::seq::SliceRandom; + use rand::{Rng, SeedableRng}; use super::CountMinSketch; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index afb560466683..493295c938f6 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -209,7 +209,8 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use {sasl::Step, ExchangeState}; + use sasl::Step; + use ExchangeState; match &self.state { ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? { diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index fd9e77764cb1..5ee3a513527d 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -1,11 +1,12 @@ //! Definitions for SCRAM messages. +use std::fmt; +use std::ops::Range; + use super::base64_decode_array; use super::key::{ScramKey, SCRAM_KEY_LEN}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; -use std::fmt; -use std::ops::Range; /// Faithfully taken from PostgreSQL. 
pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index d058f1c3f851..97644b62822c 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -16,10 +16,9 @@ mod signature; pub mod threadpool; pub(crate) use exchange::{exchange, Exchange}; +use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; - -use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; @@ -59,13 +58,11 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::{ - intern::EndpointIdInt, - sasl::{Mechanism, Step}, - EndpointId, - }; - - use super::{threadpool::ThreadPool, Exchange, ServerSecret}; + use super::threadpool::ThreadPool; + use super::{Exchange, ServerSecret}; + use crate::intern::EndpointIdInt; + use crate::sasl::{Mechanism, Step}; + use crate::EndpointId; #[test] fn snapshot() { diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index 4cf76c845263..9c559e908227 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -1,7 +1,6 @@ -use hmac::{ - digest::{consts::U32, generic_array::GenericArray}, - Hmac, Mac, -}; +use hmac::digest::consts::U32; +use hmac::digest::generic_array::GenericArray; +use hmac::{Hmac, Mac}; use sha2::Sha256; pub(crate) struct Pbkdf2 { @@ -66,10 +65,11 @@ impl Pbkdf2 { #[cfg(test)] mod tests { - use super::Pbkdf2; use pbkdf2::pbkdf2_hmac_array; use sha2::Sha256; + use super::Pbkdf2; + #[test] fn works() { let salt = b"sodium chloride"; diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index c027a0cd200a..cc1b69fcf94b 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -4,28 +4,21 @@ //! 1. Fairness per endpoint. //! 2. Yield support for high iteration counts. 
-use std::{ - cell::RefCell, - future::Future, - pin::Pin, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Weak, - }, - task::{Context, Poll}, -}; +use std::cell::RefCell; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::task::{Context, Poll}; use futures::FutureExt; -use rand::Rng; -use rand::{rngs::SmallRng, SeedableRng}; - -use crate::{ - intern::EndpointIdInt, - metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, - scram::countmin::CountMinSketch, -}; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; use super::pbkdf2::Pbkdf2; +use crate::intern::EndpointIdInt; +use crate::metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}; +use crate::scram::countmin::CountMinSketch; pub struct ThreadPool { runtime: Option, @@ -195,9 +188,8 @@ impl Drop for JobHandle { #[cfg(test)] mod tests { - use crate::EndpointId; - use super::*; + use crate::EndpointId; #[tokio::test] async fn hash_is_correct() { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 927854897f24..a180c4c2ed09 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,42 +1,34 @@ -use std::{io, sync::Arc, time::Duration}; +use std::io; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use p256::{ecdsa::SigningKey, elliptic_curve::JwkEcKey}; +use p256::ecdsa::SigningKey; +use p256::elliptic_curve::JwkEcKey; use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; -use tracing::{debug, field::display, info}; - -use crate::{ - auth::{ - self, - backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo}, - check_peer_addr_is_in_list, AuthError, - }, - compute, - config::ProxyConfig, - context::RequestMonitoring, - control_plane::{ - errors::{GetAuthInfoError, WakeComputeError}, - locks::ApiLocks, - provider::ApiLockError, - CachedNodeInfo, - }, - error::{ErrorKind, ReportableError, UserFacingError}, - intern::EndpointIdInt, - proxy::{ - connect_compute::ConnectMechanism, - retry::{CouldRetry, ShouldRetryWakeCompute}, - }, - rate_limiter::EndpointRateLimiter, - EndpointId, Host, -}; - -use super::{ - conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}, - http_conn_pool::{self, poll_http2_client}, - local_conn_pool::{self, LocalClient, LocalConnPool}, -}; +use tracing::field::display; +use tracing::{debug, info}; + +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client}; +use super::local_conn_pool::{self, LocalClient, LocalConnPool}; +use crate::auth::backend::local::StaticAuthRules; +use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::provider::ApiLockError; +use crate::control_plane::CachedNodeInfo; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::intern::EndpointIdInt; +use crate::proxy::connect_compute::ConnectMechanism; +use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::{compute, EndpointId, Host}; pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc, diff --git a/proxy/src/serverless/cancel_set.rs 
b/proxy/src/serverless/cancel_set.rs index 7659745473c0..6db986f1f74e 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -1,10 +1,8 @@ //! A set for cancelling random http connections -use std::{ - hash::{BuildHasher, BuildHasherDefault}, - num::NonZeroUsize, - time::Duration, -}; +use std::hash::{BuildHasher, BuildHasherDefault}; +use std::num::NonZeroUsize; +use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 2e576e0ded5c..aa869ff1c0a5 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,33 +1,31 @@ +use std::collections::HashMap; +use std::fmt; +use std::ops::Deref; +use std::pin::pin; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + use dashmap::DashMap; -use futures::{future::poll_fn, Future}; +use futures::future::poll_fn; +use futures::Future; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; -use std::{ - fmt, - task::{ready, Poll}, -}; -use std::{ - ops::Deref, - sync::atomic::{self, AtomicUsize}, -}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, info_span, warn, Instrument, Span}; +use super::backend::HttpConnError; +use crate::auth::backend::ComputeUserInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, -}; - -use tracing::{debug, error, warn, Span}; -use tracing::{info, info_span, Instrument}; - -use super::backend::HttpConnError; +use crate::{DbName, EndpointCacheKey, RoleName}; #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { @@ -724,13 +722,13 @@ impl Drop for Client { #[cfg(test)] mod tests { - use std::{mem, sync::atomic::AtomicBool}; - - use crate::{ - proxy::NeonOptions, serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId, - }; + use std::mem; + use std::sync::atomic::AtomicBool; use super::*; + use crate::proxy::NeonOptions; + use crate::serverless::cancel_set::CancelSet; + use crate::{BranchId, EndpointId, ProjectId}; struct MockClient(Arc); impl MockClient { diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 6d61536f1a77..9b6bc98557a5 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -1,22 +1,21 @@ +use std::collections::VecDeque; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; + use dashmap::DashMap; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use rand::Rng; -use std::collections::VecDeque; -use std::sync::atomic::{self, AtomicUsize}; -use std::{sync::Arc, sync::Weak}; use tokio::net::TcpStream; +use tracing::{debug, error, info, info_span, Instrument}; +use super::conn_pool::ConnInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use 
crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, EndpointCacheKey}; - -use tracing::{debug, error}; -use tracing::{info, info_span, Instrument}; - -use super::conn_pool::ConnInfo; +use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index c1c5764d1780..c0208d4f68f1 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -1,12 +1,11 @@ //! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility //! Will merge back in at some point in the future. -use bytes::Bytes; - use anyhow::Context; +use bytes::Bytes; use http::{Response, StatusCode}; -use http_body_util::{combinators::BoxBody, BodyExt, Full}; - +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; use serde::Serialize; use utils::http::error::ApiError; diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 9f328a0e1d29..8c56d317ccc4 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,7 +1,5 @@ -use serde_json::Map; -use serde_json::Value; -use tokio_postgres::types::Kind; -use tokio_postgres::types::Type; +use serde_json::{Map, Value}; +use tokio_postgres::types::{Kind, Type}; use tokio_postgres::Row; // @@ -256,9 +254,10 @@ fn _pg_array_parse( #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + #[test] fn test_atomic_types_to_pg_params() { let json = vec![Value::Bool(true), Value::Bool(false)]; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 4ab14ad35f89..5df37a8762ff 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,28 +1,31 @@ -use futures::{future::poll_fn, Future}; +use std::collections::HashMap; +use std::pin::pin; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + +use futures::future::poll_fn; +use futures::Future; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; use serde_json::value::RawValue; use signature::Signer; -use std::task::{ready, Poll}; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; +use tracing::{error, info, info_span, warn, Instrument, Span}; +use super::backend::HttpConnError; +use super::conn_pool::{ClientInnerExt, ConnInfo}; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, DbName, RoleName}; - -use tracing::{error, warn, Span}; -use tracing::{info, info_span, Instrument}; - -use super::backend::HttpConnError; -use super::conn_pool::{ClientInnerExt, ConnInfo}; +use crate::{DbName, RoleName}; struct ConnPoolEntry { conn: ClientInner, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 3131adada4cc..3ed3b6c845ce 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -12,12 +12,15 @@ mod local_conn_pool; mod sql_over_http; mod websocket; +use std::net::{IpAddr, 
SocketAddr}; +use std::pin::{pin, Pin}; +use std::sync::Arc; + +use anyhow::Context; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; - -use anyhow::Context; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; @@ -29,9 +32,13 @@ use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; +use tracing::{info, warn, Instrument}; +use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; @@ -43,14 +50,6 @@ use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::{IpAddr, SocketAddr}; -use std::pin::{pin, Pin}; -use std::sync::Arc; -use tokio::net::{TcpListener, TcpStream}; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::http::error::ApiError; - pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index cf3324926c0c..3d8a2adef198 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -2,77 +2,43 @@ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; -use futures::future::select; -use futures::future::try_join; -use futures::future::Either; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::future::{select, try_join, Either}; +use futures::{StreamExt, TryFutureExt}; use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; -use http_body_util::BodyExt; -use http_body_util::Full; -use hyper::body::Body; -use hyper::body::Incoming; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{HeaderMap, Request}; +use http_body_util::{BodyExt, Full}; +use hyper::body::{Body, Incoming}; +use hyper::http::{HeaderName, HeaderValue}; +use hyper::{header, HeaderMap, Request, Response, StatusCode}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; use tokio::time; -use tokio_postgres::error::DbError; -use tokio_postgres::error::ErrorPosition; -use tokio_postgres::error::SqlState; -use tokio_postgres::GenericClient; -use tokio_postgres::IsolationLevel; -use tokio_postgres::NoTls; -use tokio_postgres::ReadyForQueryStatus; -use tokio_postgres::Transaction; +use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; +use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; -use tracing::error; -use tracing::info; +use tracing::{error, info}; use typed_json::json; use url::Url; use urlencoding; use utils::http::error::ApiError; -use crate::auth::backend::ComputeCredentialKeys; -use crate::auth::backend::ComputeUserInfo; -use crate::auth::endpoint_sni; -use crate::auth::ComputeUserInfoParseError; -use crate::config::AuthenticationConfig; -use crate::config::HttpConfig; -use crate::config::ProxyConfig; -use crate::config::TlsConfig; +use super::backend::{LocalProxyConnError, PoolingBackend}; +use 
super::conn_pool::{AuthData, ConnInfo, ConnInfoWithAuth}; +use super::http_util::json_response; +use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; +use super::{conn_pool, local_conn_pool}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; +use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestMonitoring; -use crate::error::ErrorKind; -use crate::error::ReportableError; -use crate::error::UserFacingError; -use crate::metrics::HttpDirection; -use crate::metrics::Metrics; -use crate::proxy::run_until_cancelled; -use crate::proxy::NeonOptions; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::{HttpDirection, Metrics}; +use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; -use crate::usage_metrics::MetricCounter; -use crate::usage_metrics::MetricCounterRecorder; -use crate::DbName; -use crate::RoleName; - -use super::backend::LocalProxyConnError; -use super::backend::PoolingBackend; -use super::conn_pool; -use super::conn_pool::AuthData; -use super::conn_pool::ConnInfo; -use super::conn_pool::ConnInfoWithAuth; -use super::http_util::json_response; -use super::json::json_to_pg_text; -use super::json::pg_text_row_to_json; -use super::json::JsonConversionError; -use super::local_conn_pool; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::{DbName, RoleName}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index f5a692cf404e..ba36116c2c7d 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,13 +1,7 @@ -use crate::proxy::ErrorSource; -use crate::{ - cancellation::CancellationHandlerMain, - config::ProxyConfig, - context::RequestMonitoring, - error::{io_error, ReportableError}, - metrics::Metrics, - proxy::{handle_client, ClientMode}, - rate_limiter::EndpointRateLimiter, -}; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{ready, Context, Poll}; + use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; use framed_websockets::{Frame, OpCode, WebSocketServer}; @@ -15,15 +9,17 @@ use futures::{Sink, Stream}; use hyper::upgrade::OnUpgrade; use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; - -use std::{ - pin::Pin, - sync::Arc, - task::{ready, Context, Poll}, -}; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; +use crate::cancellation::CancellationHandlerMain; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::error::{io_error, ReportableError}; +use crate::metrics::Metrics; +use crate::proxy::{handle_client, ClientMode, ErrorSource}; +use crate::rate_limiter::EndpointRateLimiter; + pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. 
@@ -184,14 +180,11 @@ mod tests { use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use tokio::{ - io::{duplex, AsyncReadExt, AsyncWriteExt}, - task::JoinSet, - }; - use tokio_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; + use tokio::io::{duplex, AsyncReadExt, AsyncWriteExt}; + use tokio::task::JoinSet; + use tokio_tungstenite::tungstenite::protocol::Role; + use tokio_tungstenite::tungstenite::Message; + use tokio_tungstenite::WebSocketStream; use super::WebSocketRw; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index e2fc73235ee8..89df48c5d3e9 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,19 +1,20 @@ -use crate::config::TlsServerEndPoint; -use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::Metrics; -use bytes::BytesMut; +use std::pin::Pin; +use std::sync::Arc; +use std::{io, task}; +use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; use rustls::ServerConfig; -use std::pin::Pin; -use std::sync::Arc; -use std::{io, task}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; use tracing::debug; +use crate::config::TlsServerEndPoint; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::Metrics; + /// Stream wrapper which implements libpq's protocol. /// /// NOTE: This object deliberately doesn't implement [`AsyncRead`] diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index ee36ed462d94..c5384c0b0ec0 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,36 +1,33 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. 
-use crate::{ - config::{MetricBackupCollectionConfig, MetricCollectionConfig}, - context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - http, - intern::{BranchIdInt, EndpointIdInt}, -}; +use std::convert::Infallible; +use std::pin::pin; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::{mapref::entry::Entry, DashMap}; +use dashmap::mapref::entry::Entry; +use dashmap::DashMap; use futures::future::select; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; -use std::{ - convert::Infallible, - pin::pin, - sync::{ - atomic::{AtomicU64, AtomicUsize, Ordering}, - Arc, - }, - time::Duration, -}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; +use crate::config::{MetricBackupCollectionConfig, MetricCollectionConfig}; +use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}; +use crate::http; +use crate::intern::{BranchIdInt, EndpointIdInt}; + const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); @@ -485,19 +482,23 @@ async fn upload_events_chunk( #[cfg(test)] mod tests { - use super::*; + use std::sync::{Arc, Mutex}; - use crate::{http, BranchId, EndpointId}; use anyhow::Error; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; use http_body_util::BodyExt; - use hyper::{body::Incoming, server::conn::http1, service::service_fn, Request, Response}; + use hyper::body::Incoming; + use hyper::server::conn::http1; + use hyper::service::service_fn; + use hyper::{Request, Response}; use hyper_util::rt::TokioIo; - use std::sync::{Arc, Mutex}; use tokio::net::TcpListener; use url::Url; + use super::*; + use crate::{http, BranchId, EndpointId}; + #[tokio::test] async fn metrics() { type Report = EventChunk<'static, Event>; diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 86d0f9e8b29b..7e07f6a2affe 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,8 +1,9 @@ +use std::pin::Pin; +use std::task; + use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; -use std::pin::Pin; -use std::task; use thiserror::Error; use tokio::sync::oneshot; @@ -99,9 +100,10 @@ impl std::future::Future for Waiter<'_, T> { #[cfg(test)] mod tests { - use super::*; use std::sync::Arc; + use super::*; + #[tokio::test] async fn test_waiter() -> anyhow::Result<()> { let waiters = Arc::new(Waiters::default()); From d490ad23e0948b7c49098638ffc669774c61049e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 16 Oct 2024 14:04:17 +0100 Subject: [PATCH 22/38] storcon: use the same trace fields for reconciler and results (#9410) ## Problem The reconciler use `seq`, but processing of results uses `sequence`. Order is different too. It makes it annoying to read logs. 
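As a rough illustration of why this matters (a minimal sketch using placeholder types, not the storage controller's real `ReconcileResult`, and assuming the `tracing` and `tracing-subscriber` crates): when both sides declare the same `#[instrument]` fields in the same order, the reconciler span and the result-processing span emit identical `seq`/`tenant_id`/`shard_id` keys, so one log filter matches both.

```rust
// Hypothetical sketch only: both spans record the same field names, in the
// same order, so their log lines line up when grepping or filtering.
use tracing::{info, instrument};

struct ReconcileResult {
    sequence: u64,
    tenant_id: String,
    shard_id: String,
}

#[instrument(skip_all, fields(seq = %seq, tenant_id = %tenant_id, shard_id = %shard_id))]
fn spawn_reconciler(seq: u64, tenant_id: &str, shard_id: &str) {
    info!("spawning reconciler");
}

#[instrument(skip_all, fields(
    seq = %result.sequence,
    tenant_id = %result.tenant_id,
    shard_id = %result.shard_id,
))]
fn process_result(result: &ReconcileResult) {
    info!("processing reconcile result");
}

fn main() {
    tracing_subscriber::fmt().init();
    spawn_reconciler(1, "tenant-a", "shard-0");
    process_result(&ReconcileResult {
        sequence: 1,
        tenant_id: "tenant-a".to_string(),
        shard_id: "shard-0".to_string(),
    });
}
```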
## Summary of Changes Use the same tracing fields in both --- storage_controller/src/service.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cedee545347e..25e1fb5e1f80 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1074,8 +1074,9 @@ impl Service { /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] /// will indicate that reconciliation is not needed. #[instrument(skip_all, fields( - tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), - sequence=%result.sequence + seq=%result.sequence, + tenant_id=%result.tenant_shard_id.tenant_id, + shard_id=%result.tenant_shard_id.shard_slug(), ))] fn process_result(&self, result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); From d6281cbe65db6959e83c6d8abb44c0a3184e8b97 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 16 Oct 2024 15:27:46 +0100 Subject: [PATCH 23/38] tests: stabilize test_timelines_parallel_endpoints (#9413) ## Problem This test would get failures like `command failed: Found no timeline id for branch name 'branch_8'` It's because neon_local is being invoked concurrently for branch creation, which is unsafe (they'll step on each others' JSON writes) Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9410/11363051979/index.html#testresult/5ddc56c640f5422b/retries ## Summary of changes - Don't do branch creation concurrently with endpoint creation via neon_local --- test_runner/regress/test_tenants.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4a165359410e..03cb79fc1d6e 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -19,6 +19,7 @@ parse_metrics, ) from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn, @@ -490,8 +491,8 @@ def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): n_threads = 16 barrier = threading.Barrier(n_threads) - def test_timeline(branch_name: str, timeline_id: TimelineId): - endpoint = env.endpoints.create_start(branch_name) + def test_timeline(branch_name: str, timeline_id: TimelineId, endpoint: Endpoint): + endpoint.start() endpoint.stop() # Use a barrier to make sure we restart endpoints at the same time barrier.wait() @@ -502,8 +503,12 @@ def test_timeline(branch_name: str, timeline_id: TimelineId): for i in range(0, n_threads): branch_name = f"branch_{i}" timeline_id = env.create_branch(branch_name) - w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id]) + endpoint = env.endpoints.create(branch_name) + w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id, endpoint]) workers.append(w) + + # Only start the restarts once we're done creating all timelines & endpoints + for w in workers: w.start() for w in workers: From 3140c14d608e79d792518d9d9144460b6ff01b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 16:28:55 +0200 Subject: [PATCH 24/38] Remove allow(clippy::unknown_lints) (#9416) the lint stabilized in 1.80. 
--- pageserver/src/tenant/timeline.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8f098d0e8299..1992dee93038 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3092,7 +3092,6 @@ impl Timeline { } impl Timeline { - #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// From 9668601f4666bd82cee653800433ce66a4d9fb21 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 16 Oct 2024 15:29:23 +0100 Subject: [PATCH 25/38] Add support of extensions for v17 (part 2) (#9389) - plv8 3.2.3 - HypoPG 1.4.1 - pgtap 1.3.3 - timescaledb 2.17.0 - pg_hint_plan 17_1_7_0 - rdkit Release_2024_09_1 - pg_uuidv7 1.6.0 - wal2json 2.6 - pg_ivm 1.9 - pg_partman 5.1.0 update support of extensions for v14-v16: - HypoPG 1.4.0 -> 1.4.1 - pgtap 1.2.0 -> 1.3.3 - plpgsql_check 2.5.3 -> 2.7.11 - pg_uuidv7 1.0.1 -> 1.6.0 - wal2json 2.5 -> 2.6 - pg_ivm 1.7 -> 1.9 - pg_partman 5.0.1 -> 5.1.0 --- compute/Dockerfile.compute-node | 182 ++++++++++++++++++++------------ 1 file changed, 114 insertions(+), 68 deletions(-) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 13381b29013d..f05039f8b77e 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -18,13 +18,14 @@ RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. # Install newer version (3.25) from backports. + # libstdc++-10-dev is required for plv8 bullseye) \ echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ - VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ + VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports libstdc++-10-dev"; \ ;; \ # Version-specific installs for Bookworm (PG17): bookworm) \ - VERSION_INSTALLS="cmake"; \ + VERSION_INSTALLS="cmake libstdc++-12-dev"; \ ;; \ *) \ echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ @@ -227,18 +228,33 @@ FROM build-deps AS plv8-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt update && \ +RUN apt update && \ apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ +# plv8 3.2.3 supports v17 +# last release v3.2.3 - Sep 7, 2024 +# +# clone the repo instead of downloading the release tarball because plv8 has submodule dependencies +# and the release tarball doesn't include them +# +# Use new version only for v17 +# because since v3.2, plv8 doesn't include plcoffee and plls extensions +ENV PLV8_TAG=v3.2.3 + +RUN case "${PG_VERSION}" in \ + "v17") \ + export PLV8_TAG=v3.2.3 \ + ;; \ + "v14" | "v15" | "v16") \ + export PLV8_TAG=v3.1.10 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ - echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ - mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . 
&& \ + git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ + tar -czf plv8.tar.gz --exclude .git plv8-src && \ + cd plv8-src && \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ @@ -248,8 +264,17 @@ RUN case "${PG_VERSION}" in "v17") \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ - ln -s plv8-3.1.10.so plv8-3.1.5.so && \ - ln -s plv8-3.1.10.so plv8-3.1.8.so && \ + case "${PG_VERSION}" in \ + "v17") \ + ln -s plv8-3.2.3.so plv8-3.1.8.so && \ + ln -s plv8-3.2.3.so plv8-3.1.5.so && \ + ln -s plv8-3.2.3.so plv8-3.1.10.so \ + ;; \ + "v14" | "v15" | "v16") \ + ln -s plv8-3.1.10.so plv8-3.1.5.so && \ + ln -s plv8-3.1.10.so plv8-3.1.8.so \ + ;; \ + esac && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control @@ -327,6 +352,9 @@ COPY compute/patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. +# +# v17 is not supported yet because of upstream issue +# https://github.com/pgvector/pgvector/issues/669 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ @@ -366,11 +394,10 @@ FROM build-deps AS hypopg-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ - echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ +# HypoPG 1.4.1 supports v17 +# last release 1.4.1 - Apr 28, 2024 +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ + echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -407,6 +434,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch +# maybe version-specific +# support for v17 is unknown +# last release 1.3.13 - Sep 19, 2022 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ @@ -428,11 +458,10 @@ FROM build-deps AS pgtap-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ - echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ +# pgtap 1.3.3 supports v17 +# last release v1.3.3 - Apr 8, 2024 +RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ + echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -505,11 +534,10 @@ FROM build-deps AS plpgsql-check-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ - echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ +# plpgsql_check v2.7.11 supports v17 +# last release v2.7.11 - Sep 16, 2024 +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ + echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -527,18 +555,19 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ ;; \ - *) \ + "v16") \ export TIMESCALEDB_VERSION=2.13.0 \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ + "v17") \ + export TIMESCALEDB_VERSION=2.17.0 \ + export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \ + ;; \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ @@ -561,10 +590,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +# version-specific, has separate releases for each version +RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -578,7 +605,8 @@ RUN case "${PG_VERSION}" in "v17") \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ "v17") \ - echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + export PG_HINT_PLAN_VERSION=17_1_7_0 \ + export PG_HINT_PLAN_CHECKSUM=06dd306328c67a4248f48403c50444f30959fb61ebe963248dbc2afb396fe600 \ ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ @@ -602,6 +630,10 @@ FROM build-deps AS pg-cron-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# 1.6.4 available, supports v17 +# This is an experimental extension that we do not support on prod yet. +# !Do not remove! +# We set it in shared_preload_libraries and computes will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ @@ -623,23 +655,37 @@ FROM build-deps AS rdkit-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt-get update && \ +RUN apt-get update && \ apt-get install --no-install-recommends -y \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ - libeigen3-dev + libeigen3-dev \ + libboost-all-dev +# rdkit Release_2024_09_1 supports v17 +# last release Release_2024_09_1 - Sep 27, 2024 +# +# Use new version only for v17 +# because Release_2024_09_1 has some backward incompatible changes +# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ +RUN case "${PG_VERSION}" in \ + "v17") \ + export RDKIT_VERSION=Release_2024_09_1 \ + export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \ + ;; \ + "v14" | "v15" | "v16") \ + export RDKIT_VERSION=Release_2023_03_3 \ + export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ - echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ + echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ @@ -678,12 +724,11 @@ FROM build-deps AS pg-uuidv7-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# not version-specific +# last release v1.6.0 - Oct 9, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ - echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ + echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -754,6 +799,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is our extension, support stopped in favor of pgvector +# TODO: deprecate it ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ @@ -780,6 +827,8 @@ FROM build-deps AS pg-anon-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ @@ -946,13 +995,12 @@ FROM build-deps AS wal2json-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# wal2json wal2json_2_6 supports v17 +# last release wal2json_2_6 - Apr 25, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ - echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ +RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ + echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -966,12 +1014,11 @@ FROM build-deps AS pg-ivm-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# pg_ivm v1.9 supports v17 +# last release v1.9 - Jul 31 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ - echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ + echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -987,12 +1034,11 @@ FROM build-deps AS pg-partman-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# should support v17 https://github.com/pgpartman/pg_partman/discussions/693 +# last release 5.1.0 Apr 2, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_partman doesn't support PG17 yet" && exit 0;; \ - esac && \ - wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ - echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ + echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ From 55b246085ea30341f2479ecfadff374a5487e74d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 16:47:17 +0200 Subject: [PATCH 26/38] Activate timelines during unoffload (#9399) The current code has forgotten to activate timelines during unoffload, leading to inability to receive the basebackup, due to the timeline still being in loading state. ``` stderr: command failed: compute startup failed: failed to get basebackup@0/0 from pageserver postgresql://no_user@localhost:15014 Caused by: 0: db error: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading 1: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading ``` Therefore, also activate the timeline during unoffloading. 
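For context, a simplified sketch (not the actual pageserver types) of why the basebackup request was rejected: the handler only serves timelines that have reached the Active state, so a timeline left in Loading after unoffload fails exactly as in the log above.

```rust
// Simplified illustration: the real state machine lives on pageserver's
// Timeline and has more states and transitions than shown here.
#[derive(Debug)]
enum TimelineState {
    Loading,
    Active,
    Stopping,
    Broken,
}

fn check_active(state: &TimelineState) -> Result<(), String> {
    match state {
        TimelineState::Active => Ok(()),
        other => Err(format!("Timeline is not active, state: {other:?}")),
    }
}
```

The fix below calls `timeline.activate(...)` as part of unoffloading, guarded so that Broken or Stopping timelines are left alone.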
Part of #8088 --- pageserver/src/http/routes.rs | 7 +++- pageserver/src/tenant.rs | 40 +++++++++++++------- test_runner/regress/test_timeline_archive.py | 17 +++++++++ 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index dd403c1cefb8..36a6ed427b9b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -720,7 +720,12 @@ async fn timeline_archival_config_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; tenant - .apply_timeline_archival_config(timeline_id, request_data.state, ctx) + .apply_timeline_archival_config( + timeline_id, + request_data.state, + state.broker_client.clone(), + ctx, + ) .await?; Ok::<_, ApiError>(()) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 20925c7fd61a..689982ddd4e3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1554,6 +1554,7 @@ impl Tenant { async fn unoffload_timeline( self: &Arc, timeline_id: TimelineId, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); @@ -1605,25 +1606,37 @@ impl Tenant { }) .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); - if let Some(timeline) = timelines.get(&timeline_id) { - let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); - if offloaded_timelines.remove(&timeline_id).is_none() { - warn!("timeline already removed from offloaded timelines"); - } - info!("timeline unoffloading complete"); - Ok(Arc::clone(timeline)) - } else { + let Some(timeline) = timelines.get(&timeline_id) else { warn!("timeline not available directly after attach"); - Err(TimelineArchivalError::Other(anyhow::anyhow!( + return Err(TimelineArchivalError::Other(anyhow::anyhow!( "timeline not available directly after attach" - ))) + ))); + }; + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); + } + + // Activate the timeline (if it makes sense) + if !(timeline.is_broken() || timeline.is_stopping()) { + let background_jobs_can_start = None; + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + &ctx, + ); } + + info!("timeline unoffloading complete"); + Ok(Arc::clone(timeline)) } pub(crate) async fn apply_timeline_archival_config( self: &Arc, timeline_id: TimelineId, new_state: TimelineArchivalState, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); @@ -1664,12 +1677,13 @@ impl Tenant { Some(Arc::clone(timeline)) }; - // Second part: unarchive timeline (if needed) + // Second part: unoffload timeline (if needed) let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { timeline } else { // Turn offloaded timeline into a non-offloaded one - self.unoffload_timeline(timeline_id, ctx).await? + self.unoffload_timeline(timeline_id, broker_client, ctx) + .await? }; // Third part: upload new timeline archival state and block until it is present in S3 @@ -3354,7 +3368,7 @@ impl Tenant { /// Populate all Timelines' `GcInfo` with information about their children. 
We do not set the /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] /// - /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + /// Subsequently, parent-child relationships are updated incrementally inside [`Timeline::new`] and [`Timeline::drop`]. fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 971cc57a1cfa..ffaed5e1307e 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -136,6 +136,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" ) + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1000)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + ps_http.timeline_archival_config( tenant_id, leaf_timeline_id, @@ -197,4 +208,10 @@ def leaf_offloaded(): ) assert leaf_detail["is_archived"] is False + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + assert sum == sum_again + assert not timeline_offloaded(initial_timeline_id) From 8a114e3aeda7a2e321fa4524335c1748448cae07 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:19:45 -0400 Subject: [PATCH 27/38] refactor(pageserver): upgrade remote_storage to use hyper1 (#9405) part of https://github.com/neondatabase/neon/issues/9255 ## Summary of changes Upgrade remote_storage crate to use hyper1. Hyper0 is used when providing the streaming HTTP body to the s3 SDK, and it is refactored to use hyper1. 
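The heart of the refactor is the body-wrapping idiom: hyper 1.x has no `Body::wrap_stream`, so the stream of byte chunks is mapped into data `Frame`s, wrapped in `http_body_util::StreamBody`, and only then handed to `SdkBody::from_body_1_x`. A standalone sketch of that idiom (assumed crates: hyper 1.x, http-body-util, futures-util, bytes; `to_hyper1_body` is an illustrative helper, not part of remote_storage):

```rust
use bytes::Bytes;
use futures_util::{Stream, StreamExt};
use http_body_util::StreamBody;
use hyper::body::Frame;

/// Wrap a fallible stream of byte chunks into a hyper 1.x body, playing the
/// role that `hyper0::Body::wrap_stream` played before this patch.
fn to_hyper1_body<S>(
    chunks: S,
) -> impl hyper::body::Body<Data = Bytes, Error = std::io::Error>
where
    S: Stream<Item = Result<Bytes, std::io::Error>>,
{
    StreamBody::new(chunks.map(|chunk| chunk.map(Frame::data)))
}
```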
Signed-off-by: Alex Chi Z --- Cargo.lock | 3 ++- libs/remote_storage/Cargo.toml | 3 ++- libs/remote_storage/src/s3_bucket.rs | 8 +++++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e772814ec57..6b212bac2eea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4648,9 +4648,10 @@ dependencies = [ "camino-tempfile", "futures", "futures-util", + "http-body-util", "http-types", "humantime-serde", - "hyper 0.14.30", + "hyper 1.4.1", "itertools 0.10.5", "metrics", "once_cell", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index be4d61f00925..1816825bda7a 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -16,7 +16,7 @@ aws-sdk-s3.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true -hyper0 = { workspace = true, features = ["stream"] } +hyper = { workspace = true, features = ["client"] } futures.workspace = true serde.workspace = true serde_json.workspace = true @@ -36,6 +36,7 @@ azure_storage.workspace = true azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true +http-body-util.workspace = true itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index f950f2886ca2..cde32df402a1 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -28,13 +28,15 @@ use aws_sdk_s3::{ Client, }; use aws_smithy_async::rt::sleep::TokioSleep; +use http_body_util::StreamBody; use http_types::StatusCode; use aws_smithy_types::{body::SdkBody, DateTime}; use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; -use hyper0::Body; +use futures_util::StreamExt; +use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -710,8 +712,8 @@ impl RemoteStorage for S3Bucket { let started_at = start_measuring_requests(kind); - let body = Body::wrap_stream(from); - let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); + let body = StreamBody::new(from.map(|x| x.map(Frame::data))); + let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body)); let upload = self .client From ed694732e707b15592991902c89f5078935ec177 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 16 Oct 2024 19:10:49 +0200 Subject: [PATCH 28/38] proxy: merge AuthError and AuthErrorImpl (#9418) Since GetAuthInfoError now boxes the ControlPlaneError message the variant is not big anymore and AuthError is 32 bytes. 
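The reasoning is the standard Rust enum-size rule: an enum is as large as its largest variant, and every `Result` carrying the error pays for that, so bulky variants get boxed. Once the bulky payload is boxed inside `GetAuthInfoError`, a second `Box` around the whole enum no longer buys anything. A self-contained illustration (toy types, not the proxy's own):

```rust
use std::mem::size_of;

// One oversized variant inflates the whole enum...
#[allow(dead_code)]
enum Unboxed {
    Small(u8),
    Large([u8; 256]),
}

// ...while boxing just that payload keeps the enum small, which is why the
// `struct AuthError(Box<AuthErrorImpl>)` wrapper could be dropped once the
// large variant was boxed at its source.
#[allow(dead_code)]
enum Boxed {
    Small(u8),
    Large(Box<[u8; 256]>),
}

fn main() {
    // On a typical 64-bit target this prints something like "unboxed: 257, boxed: 16".
    println!("unboxed: {}, boxed: {}", size_of::<Unboxed>(), size_of::<Boxed>());
}
```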
--- proxy/src/auth/flow.rs | 10 +++--- proxy/src/auth/mod.rs | 78 ++++++++++++++++++------------------------ 2 files changed, 39 insertions(+), 49 deletions(-) diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index ccb17b66b9f3..6294549ff6a6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -9,7 +9,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use super::backend::ComputeCredentialKeys; -use super::{AuthErrorImpl, PasswordHackPayload}; +use super::{AuthError, PasswordHackPayload}; use crate::config::TlsServerEndPoint; use crate::context::RequestMonitoring; use crate::control_plane::AuthSecret; @@ -117,14 +117,14 @@ impl AuthFlow<'_, S, PasswordHack> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let payload = PasswordHackPayload::parse(password) // If we ended up here and the payload is malformed, it means that // the user neither enabled SNI nor resorted to any other method // for passing the project name we rely on. We should show them // the most helpful error message and point to the documentation. - .ok_or(AuthErrorImpl::MissingEndpointName)?; + .ok_or(AuthError::MissingEndpointName)?; Ok(payload) } @@ -136,7 +136,7 @@ impl AuthFlow<'_, S, CleartextPassword> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let outcome = validate_password_and_exchange( &self.state.pool, @@ -166,7 +166,7 @@ impl AuthFlow<'_, S, Scram<'_>> { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) - .ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?; + .ok_or(AuthError::MalformedPassword("bad sasl message"))?; // Currently, the only supported SASL method is SCRAM. if !scram::METHODS.contains(&sasl.method) { diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index ff97e6c35d70..7a373dd8251a 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -29,7 +29,7 @@ pub(crate) type Result = std::result::Result; /// Common authentication error. 
#[derive(Debug, Error)] -pub(crate) enum AuthErrorImpl { +pub(crate) enum AuthError { #[error(transparent)] Web(#[from] backend::WebAuthError), @@ -78,80 +78,70 @@ pub(crate) enum AuthErrorImpl { ConfirmationTimeout(humantime::Duration), } -#[derive(Debug, Error)] -#[error(transparent)] -pub(crate) struct AuthError(Box); - impl AuthError { pub(crate) fn bad_auth_method(name: impl Into>) -> Self { - AuthErrorImpl::BadAuthMethod(name.into()).into() + AuthError::BadAuthMethod(name.into()) } pub(crate) fn auth_failed(user: impl Into>) -> Self { - AuthErrorImpl::AuthFailed(user.into()).into() + AuthError::AuthFailed(user.into()) } pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { - AuthErrorImpl::IpAddressNotAllowed(ip).into() + AuthError::IpAddressNotAllowed(ip) } pub(crate) fn too_many_connections() -> Self { - AuthErrorImpl::TooManyConnections.into() + AuthError::TooManyConnections } pub(crate) fn is_auth_failed(&self) -> bool { - matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) + matches!(self, AuthError::AuthFailed(_)) } pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { - AuthErrorImpl::UserTimeout(elapsed).into() + AuthError::UserTimeout(elapsed) } pub(crate) fn confirmation_timeout(timeout: humantime::Duration) -> Self { - AuthErrorImpl::ConfirmationTimeout(timeout).into() - } -} - -impl> From for AuthError { - fn from(e: E) -> Self { - Self(Box::new(e.into())) + AuthError::ConfirmationTimeout(timeout) } } impl UserFacingError for AuthError { fn to_string_client(&self) -> String { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.to_string_client(), - AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), - AuthErrorImpl::Sasl(e) => e.to_string_client(), - AuthErrorImpl::AuthFailed(_) => self.to_string(), - AuthErrorImpl::BadAuthMethod(_) => self.to_string(), - AuthErrorImpl::MalformedPassword(_) => self.to_string(), - AuthErrorImpl::MissingEndpointName => self.to_string(), - AuthErrorImpl::Io(_) => "Internal error".to_string(), - AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), - AuthErrorImpl::TooManyConnections => self.to_string(), - AuthErrorImpl::UserTimeout(_) => self.to_string(), - AuthErrorImpl::ConfirmationTimeout(_) => self.to_string(), + match self { + Self::Web(e) => e.to_string_client(), + Self::GetAuthInfo(e) => e.to_string_client(), + Self::Sasl(e) => e.to_string_client(), + Self::AuthFailed(_) => self.to_string(), + Self::BadAuthMethod(_) => self.to_string(), + Self::MalformedPassword(_) => self.to_string(), + Self::MissingEndpointName => self.to_string(), + Self::Io(_) => "Internal error".to_string(), + Self::IpAddressNotAllowed(_) => self.to_string(), + Self::TooManyConnections => self.to_string(), + Self::UserTimeout(_) => self.to_string(), + Self::ConfirmationTimeout(_) => self.to_string(), } } } impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.get_error_kind(), - AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), - AuthErrorImpl::Sasl(e) => e.get_error_kind(), - AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User, - AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect, - AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::TooManyConnections => 
crate::error::ErrorKind::RateLimit, - AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, - AuthErrorImpl::ConfirmationTimeout(_) => crate::error::ErrorKind::User, + match self { + Self::Web(e) => e.get_error_kind(), + Self::GetAuthInfo(e) => e.get_error_kind(), + Self::Sasl(e) => e.get_error_kind(), + Self::AuthFailed(_) => crate::error::ErrorKind::User, + Self::BadAuthMethod(_) => crate::error::ErrorKind::User, + Self::MalformedPassword(_) => crate::error::ErrorKind::User, + Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, + Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::TooManyConnections => crate::error::ErrorKind::RateLimit, + Self::UserTimeout(_) => crate::error::ErrorKind::User, + Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, } } } From 0551cfb6a74258537255af18428b0345f24f2702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 20:04:56 +0200 Subject: [PATCH 29/38] Fix beta clippy warnings (#9419) ``` warning: first doc comment paragraph is too long --> compute_tools/src/installed_extensions.rs:35:1 | 35 | / /// Connect to every database (see list_dbs above) and get the list of installed extensions. 36 | | /// Same extension can be installed in multiple databases with different versions, 37 | | /// we only keep the highest and lowest version across all databases. | |_ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#too_long_first_doc_paragraph = note: `#[warn(clippy::too_long_first_doc_paragraph)]` on by default help: add an empty line | 35 ~ /// Connect to every database (see list_dbs above) and get the list of installed extensions. 36 + /// | ``` --- compute_tools/src/installed_extensions.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 3d8b22a8a39d..72578b1f342a 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -33,6 +33,7 @@ fn list_dbs(client: &mut Client) -> Result> { } /// Connect to every database (see list_dbs above) and get the list of installed extensions. +/// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. pub async fn get_installed_extensions(connstr: Url) -> Result { From 409a286eaa6f030494c8914fcaa36dcc7d6496d1 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Oct 2024 13:08:40 -0500 Subject: [PATCH 30/38] Fix typo in sql_exporter generator Bad copy-paste seemingly. This manifested itself as a failure to start for the sql_exporter, and was just dying on loop in staging. A future PR will have E2E testing of sql_exporter. Signed-off-by: Tristan Partin --- compute/etc/sql_exporter.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet index 1e3665ac4727..640e2ac38df9 100644 --- a/compute/etc/sql_exporter.jsonnet +++ b/compute/etc/sql_exporter.jsonnet @@ -28,7 +28,7 @@ function(collector_file, application_name='sql_exporter') { // Collectors (referenced by name) to execute on the target. // Glob patterns are supported (see for syntax). 
collectors: [ - 'neon_collector_autoscaling', + 'neon_collector', ], }, From e0fa6bcf1a9a33929cfcfd0cefada739a8fe6fea Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Oct 2024 14:46:33 -0500 Subject: [PATCH 31/38] Fix some sql_exporter metrics for PG 17 Checkpointer related statistics moved from pg_stat_bgwriter to pg_stat_checkpointer, so we need to adjust our queries accordingly. Signed-off-by: Tristan Partin --- compute/Dockerfile.compute-node | 3 ++- compute/Makefile | 6 ++++-- compute/etc/sql_exporter/checkpoints_req.17.sql | 1 + .../etc/sql_exporter/checkpoints_req.libsonnet | 7 ++++++- .../etc/sql_exporter/checkpoints_timed.17.sql | 1 + .../etc/sql_exporter/checkpoints_timed.libsonnet | 7 ++++++- compute/jsonnet/neon.libsonnet | 16 ++++++++++++++++ 7 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 compute/etc/sql_exporter/checkpoints_req.17.sql create mode 100644 compute/etc/sql_exporter/checkpoints_timed.17.sql create mode 100644 compute/jsonnet/neon.libsonnet diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index f05039f8b77e..b0ce7c171869 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -1221,12 +1221,13 @@ RUN rm /usr/local/pgsql/lib/lib*.a # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor +ARG PG_VERSION USER nonroot COPY --chown=nonroot compute compute -RUN make -C compute +RUN make PG_VERSION="${PG_VERSION}" -C compute ######################################################################################### # diff --git a/compute/Makefile b/compute/Makefile index f8faa882eedb..e4f08a223c24 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -6,13 +6,15 @@ jsonnet_files = $(wildcard \ all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml neon_collector.yml: $(jsonnet_files) - JSONNET_PATH=etc jsonnet \ + JSONNET_PATH=jsonnet:etc jsonnet \ --output-file etc/$@ \ + --ext-str pg_version=$(PG_VERSION) \ etc/neon_collector.jsonnet neon_collector_autoscaling.yml: $(jsonnet_files) - JSONNET_PATH=etc jsonnet \ + JSONNET_PATH=jsonnet:etc jsonnet \ --output-file etc/$@ \ + --ext-str pg_version=$(PG_VERSION) \ etc/neon_collector_autoscaling.jsonnet sql_exporter.yml: $(jsonnet_files) diff --git a/compute/etc/sql_exporter/checkpoints_req.17.sql b/compute/etc/sql_exporter/checkpoints_req.17.sql new file mode 100644 index 000000000000..a4b946e8e240 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.17.sql @@ -0,0 +1 @@ +SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet index 8697f8af3b99..e5d97535074f 100644 --- a/compute/etc/sql_exporter/checkpoints_req.libsonnet +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -1,3 +1,8 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql'; + { metric_name: 'checkpoints_req', type: 'gauge', @@ -6,5 +11,5 @@ values: [ 'checkpoints_req', ], - query: importstr 'sql_exporter/checkpoints_req.sql', + query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, } diff --git a/compute/etc/sql_exporter/checkpoints_timed.17.sql b/compute/etc/sql_exporter/checkpoints_timed.17.sql new file mode 100644 index 
000000000000..0d86ddb3ea41 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.17.sql @@ -0,0 +1 @@ +SELECT num_timed AS checkpoints_timed FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet index 9f0b742400f9..0ba008018874 100644 --- a/compute/etc/sql_exporter/checkpoints_timed.libsonnet +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -1,3 +1,8 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql'; + { metric_name: 'checkpoints_timed', type: 'gauge', @@ -6,5 +11,5 @@ values: [ 'checkpoints_timed', ], - query: importstr 'sql_exporter/checkpoints_timed.sql', + query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, } diff --git a/compute/jsonnet/neon.libsonnet b/compute/jsonnet/neon.libsonnet new file mode 100644 index 000000000000..583b631c5818 --- /dev/null +++ b/compute/jsonnet/neon.libsonnet @@ -0,0 +1,16 @@ +local MIN_SUPPORTED_VERSION = 14; +local MAX_SUPPORTED_VERSION = 17; +local SUPPORTED_VERSIONS = std.range(MIN_SUPPORTED_VERSION, MAX_SUPPORTED_VERSION); + +# If we receive the pg_version with a leading "v", ditch it. +local pg_version = std.strReplace(std.extVar('pg_version'), 'v', ''); +local pg_version_num = std.parseInt(pg_version); + +assert std.setMember(pg_version_num, SUPPORTED_VERSIONS) : + std.format('%s is an unsupported Postgres version: %s', + [pg_version, std.toString(SUPPORTED_VERSIONS)]); + +{ + PG_MAJORVERSION: pg_version, + PG_MAJORVERSION_NUM: pg_version_num, +} From 67d5d98b1960c7f7b88d1f9860cd9672411cb815 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 16 Oct 2024 21:47:53 +0200 Subject: [PATCH 32/38] readme: fix build instructions for debian 12 (#9371) We need libprotobuf-dev for some of the `/usr/include/google/protobuf/...*.proto` referenced by our protobuf decls. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cfc63b47087c..e68ef70bdfa2 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev +libprotobuf-dev libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash From 934dbb61f557477512b3cf5c98e9930e5745d87e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 17 Oct 2024 08:04:57 +0300 Subject: [PATCH 33/38] Check access_count in lfc_evict (#9407) ## Problem See https://neondb.slack.com/archives/C033A2WE6BZ/p1729007738526309?thread_ts=1722942856.987979&cid=C033A2WE6BZ When replica receives WAL record which target page is not present in shared buffer, we evict this page from LFC. If all pages from the LFC chunk are evicted, then chunk is moved to the beginning of LRU least to force it reuse. Unfortunately access_count is not checked and if the entry is access at this moment then this operation can cause LRU list corruption. ## Summary of changes Check `access_count` in `lfc_evict` ## Checklist before requesting a review - [ ] I have performed a self-review of my code. 
- [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 43 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index bbea5a8b0d0c..70b250d3945d 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -617,31 +617,34 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) /* remove the page from the cache */ entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); - /* - * If the chunk has no live entries, we can position the chunk to be - * recycled first. - */ - if (entry->bitmap[chunk_offs >> 5] == 0) + if (entry->access_count == 0) { - bool has_remaining_pages = false; - - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) + /* + * If the chunk has no live entries, we can position the chunk to be + * recycled first. + */ + if (entry->bitmap[chunk_offs >> 5] == 0) { - if (entry->bitmap[i] != 0) + bool has_remaining_pages = false; + + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) { - has_remaining_pages = true; - break; + if (entry->bitmap[i] != 0) + { + has_remaining_pages = true; + break; + } } - } - /* - * Put the entry at the position that is first to be reclaimed when we - * have no cached pages remaining in the chunk - */ - if (!has_remaining_pages) - { - dlist_delete(&entry->list_node); - dlist_push_head(&lfc_ctl->lru, &entry->list_node); + /* + * Put the entry at the position that is first to be reclaimed when we + * have no cached pages remaining in the chunk + */ + if (!has_remaining_pages) + { + dlist_delete(&entry->list_node); + dlist_push_head(&lfc_ctl->lru, &entry->list_node); + } } } From db68e822355a4ef8ac9e3363d90bb9a2bd0e6dad Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Oct 2024 10:06:02 +0100 Subject: [PATCH 34/38] storage_scrubber: fixes to garbage commands (#9409) ## Problem While running `find-garbage` and `purge-garbage`, I encountered two things that needed updating: - Console API may omit `user_id` since org accounts were added - When we cut over to using GenericRemoteStorage, the object listings we do during purge did not get proper retry handling, so could easily fail on usual S3 errors, and make the whole process drop out. ...and one bug: - We had a `.unwrap` which expects that after finding an object in a tenant path, a listing in that path will always return objects. This is not true, because a pageserver might be deleting the path at the same time as we scan it. ## Summary of changes - When listing objects during purge, use backoff::retry - Make `user_id` an `Option` - Handle the case where a tenant's objects go away during find-garbage. 
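The last point is the subtle one: a pageserver can delete a tenant's objects between the moment the scrubber finds the prefix and the moment it lists it, so the old `keys.first().unwrap()` could panic on an empty listing. A minimal sketch of the shape of that handling (illustrative helper with simplified types, not the scrubber's own code); the listing calls themselves are additionally wrapped in `backoff::retry` in the diff below, so transient S3 errors no longer abort the whole run.

```rust
// Illustrative only: classify a tenant prefix from its listed keys without
// assuming the listing is non-empty, since deletion can race with the scan.
fn classify_tenant(keys: &[String]) -> &'static str {
    match keys.first() {
        Some(key) if key.ends_with("heatmap-v1.json") => {
            "only a heatmap left (known historic deletion bug)"
        }
        Some(_) => "missing in console but still has objects",
        None => "appears to have been deleted while we ran",
    }
}
```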
--- storage_scrubber/src/cloud_admin_api.rs | 2 +- storage_scrubber/src/garbage.rs | 65 ++++++++++++++++--------- 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index 70b108cf2326..7b82a0b11604 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -138,7 +138,7 @@ pub struct ProjectData { pub name: String, pub region_id: String, pub platform_id: String, - pub user_id: String, + pub user_id: Option, pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index d53611ed6e91..a0040ada08ef 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -16,13 +16,13 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; -use utils::id::TenantId; +use utils::{backoff, id::TenantId}; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, init_remote, list_objects_with_retries, metadata_stream::{stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, + BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; #[derive(Serialize, Deserialize, Debug)] @@ -250,13 +250,16 @@ async fn find_garbage_inner( &target.tenant_root(&tenant_shard_id), ) .await?; - let object = tenant_objects.keys.first().unwrap(); - if object.key.get_path().as_str().ends_with("heatmap-v1.json") { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); - garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); - continue; + if let Some(object) = tenant_objects.keys.first() { + if object.key.get_path().as_str().ends_with("heatmap-v1.json") { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } else { + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + } } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran"); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -406,14 +409,17 @@ pub async fn get_tenant_objects( // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. 
- let list = s3_client - .list( - Some(&tenant_root), - ListingMode::NoDelimiter, - None, - &CancellationToken::new(), - ) - .await?; + let cancel = CancellationToken::new(); + let list = backoff::retry( + || s3_client.list(Some(&tenant_root), ListingMode::NoDelimiter, None, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "get_tenant_objects", + &cancel, + ) + .await + .expect("dummy cancellation token")?; Ok(list.keys) } @@ -424,14 +430,25 @@ pub async fn get_timeline_objects( tracing::debug!("Listing objects in timeline {ttid}"); let timeline_root = super::remote_timeline_path_id(&ttid); - let list = s3_client - .list( - Some(&timeline_root), - ListingMode::NoDelimiter, - None, - &CancellationToken::new(), - ) - .await?; + let cancel = CancellationToken::new(); + let list = backoff::retry( + || { + s3_client.list( + Some(&timeline_root), + ListingMode::NoDelimiter, + None, + &cancel, + ) + }, + |_| false, + 3, + MAX_RETRIES as u32, + "get_timeline_objects", + &cancel, + ) + .await + .expect("dummy cancellation token")?; + Ok(list.keys) } From 22d8834474d1f619b6ed351fd80033b4a064bb21 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 17 Oct 2024 13:38:24 +0300 Subject: [PATCH 35/38] proxy: move the connection pools to separate file (#9398) First PR for #9284 Start unification of the client and connection pool interfaces: - Exclude the 'global_connections_count' out from the get_conn_entry() - Move remote connection pools to the conn_pool_lib as a reference - Unify clients among all the conn pools --- proxy/src/serverless/backend.rs | 13 +- proxy/src/serverless/conn_pool.rs | 585 ++---------------------- proxy/src/serverless/conn_pool_lib.rs | 562 +++++++++++++++++++++++ proxy/src/serverless/http_conn_pool.rs | 50 +- proxy/src/serverless/local_conn_pool.rs | 109 ++--- proxy/src/serverless/mod.rs | 5 +- proxy/src/serverless/sql_over_http.rs | 15 +- 7 files changed, 704 insertions(+), 635 deletions(-) create mode 100644 proxy/src/serverless/conn_pool_lib.rs diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index a180c4c2ed09..82e81dbcfef6 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -11,8 +11,9 @@ use tokio::net::{lookup_host, TcpStream}; use tracing::field::display; use tracing::{debug, info}; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; -use super::http_conn_pool::{self, poll_http2_client}; +use super::conn_pool::poll_client; +use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client, Send}; use super::local_conn_pool::{self, LocalClient, LocalConnPool}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; @@ -31,7 +32,7 @@ use crate::rate_limiter::EndpointRateLimiter; use crate::{compute, EndpointId, Host}; pub(crate) struct PoolingBackend { - pub(crate) http_conn_pool: Arc, + pub(crate) http_conn_pool: Arc>, pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, pub(crate) config: &'static ProxyConfig, @@ -199,7 +200,7 @@ impl PoolingBackend { &self, ctx: &RequestMonitoring, conn_info: ConnInfo, - ) -> Result { + ) -> Result, HttpConnError> { info!("pool: looking for an existing connection"); if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) { return Ok(client); @@ -481,7 +482,7 @@ impl ConnectMechanism for TokioMechanism { } struct HyperMechanism { - pool: Arc, + pool: Arc>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -491,7 +492,7 @@ struct HyperMechanism 
{ #[async_trait] impl ConnectMechanism for HyperMechanism { - type Connection = http_conn_pool::Client; + type Connection = http_conn_pool::Client; type ConnectError = HttpConnError; type Error = HttpConnError; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index aa869ff1c0a5..b97c6565101e 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,31 +1,29 @@ -use std::collections::HashMap; use std::fmt; -use std::ops::Deref; use std::pin::pin; -use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use std::task::{ready, Poll}; -use std::time::Duration; -use dashmap::DashMap; use futures::future::poll_fn; use futures::Future; -use parking_lot::RwLock; -use rand::Rng; use smallvec::SmallVec; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, warn, Instrument, Span}; +use tracing::{error, info, info_span, warn, Instrument}; -use super::backend::HttpConnError; -use crate::auth::backend::ComputeUserInfo; use crate::context::RequestMonitoring; -use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{DbName, EndpointCacheKey, RoleName}; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::Metrics; + +use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool}; + +#[cfg(test)] +use { + super::conn_pool_lib::GlobalConnPoolOptions, + crate::auth::backend::ComputeUserInfo, + std::{sync::atomic, time::Duration}, +}; #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { @@ -33,34 +31,12 @@ pub(crate) struct ConnInfoWithAuth { pub(crate) auth: AuthData, } -#[derive(Debug, Clone)] -pub(crate) struct ConnInfo { - pub(crate) user_info: ComputeUserInfo, - pub(crate) dbname: DbName, -} - #[derive(Debug, Clone)] pub(crate) enum AuthData { Password(SmallVec<[u8; 16]>), Jwt(String), } -impl ConnInfo { - // hm, change to hasher to avoid cloning? - pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { - (self.dbname.clone(), self.user_info.user.clone()) - } - - pub(crate) fn endpoint_cache_key(&self) -> Option { - // We don't want to cache http connections for ephemeral endpoints. - if self.user_info.options.is_ephemeral() { - None - } else { - Some(self.user_info.endpoint_cache_key()) - } - } -} - impl fmt::Display for ConnInfo { // use custom display to avoid logging password fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -75,402 +51,6 @@ impl fmt::Display for ConnInfo { } } -struct ConnPoolEntry { - conn: ClientInner, - _last_access: std::time::Instant, -} - -// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool -// Number of open connections is limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { - pools: HashMap<(DbName, RoleName), DbUserConnPool>, - total_conns: usize, - max_conns: usize, - _guard: HttpEndpointPoolsGuard<'static>, - global_connections_count: Arc, - global_pool_size_max_conns: usize, -} - -impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { - let Self { - pools, - total_conns, - global_connections_count, - .. 
- } = self; - pools.get_mut(&db_user).and_then(|pool_entries| { - pool_entries.get_conn_entry(total_conns, global_connections_count.clone()) - }) - } - - fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { - let Self { - pools, - total_conns, - global_connections_count, - .. - } = self; - if let Some(pool) = pools.get_mut(&db_user) { - let old_len = pool.conns.len(); - pool.conns.retain(|conn| conn.conn.conn_id != conn_id); - let new_len = pool.conns.len(); - let removed = old_len - new_len; - if removed > 0 { - global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - } - *total_conns -= removed; - removed > 0 - } else { - false - } - } - - fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) { - let conn_id = client.conn_id; - - if client.is_closed() { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return; - } - let global_max_conn = pool.read().global_pool_size_max_conns; - if pool - .read() - .global_connections_count - .load(atomic::Ordering::Relaxed) - >= global_max_conn - { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); - return; - } - - // return connection to the pool - let mut returned = false; - let mut per_db_size = 0; - let total_conns = { - let mut pool = pool.write(); - - if pool.total_conns < pool.max_conns { - let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); - - returned = true; - per_db_size = pool_entries.conns.len(); - - pool.total_conns += 1; - pool.global_connections_count - .fetch_add(1, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .inc(); - } - - pool.total_conns - }; - - // do logging outside of the mutex - if returned { - info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); - } else { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); - } - } -} - -impl Drop for EndpointConnPool { - fn drop(&mut self) { - if self.total_conns > 0 { - self.global_connections_count - .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(self.total_conns as i64); - } - } -} - -pub(crate) struct DbUserConnPool { - conns: Vec>, -} - -impl Default for DbUserConnPool { - fn default() -> Self { - Self { conns: Vec::new() } - } -} - -impl DbUserConnPool { - fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { - let old_len = self.conns.len(); - - self.conns.retain(|conn| !conn.conn.is_closed()); - - let new_len = self.conns.len(); - let removed = old_len - new_len; - *conns -= removed; - removed - } - - fn get_conn_entry( - &mut self, - conns: &mut usize, - global_connections_count: Arc, - ) -> Option> { - let mut removed = self.clear_closed_clients(conns); - let conn = self.conns.pop(); - if conn.is_some() { - *conns -= 1; - removed += 1; - } - global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - conn - } -} - -pub(crate) struct GlobalConnPool { - // endpoint -> per-endpoint 
connection pool - // - // That should be a fairly conteded map, so return reference to the per-endpoint - // pool as early as possible and release the lock. - global_pool: DashMap>>>, - - /// Number of endpoint-connection pools - /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. - /// That seems like far too much effort, so we're using a relaxed increment counter instead. - /// It's only used for diagnostics. - global_pool_size: AtomicUsize, - - /// Total number of connections in the pool - global_connections_count: Arc, - - config: &'static crate::config::HttpConfig, -} - -#[derive(Debug, Clone, Copy)] -pub struct GlobalConnPoolOptions { - // Maximum number of connections per one endpoint. - // Can mix different (dbname, username) connections. - // When running out of free slots for a particular endpoint, - // falls back to opening a new connection for each request. - pub max_conns_per_endpoint: usize, - - pub gc_epoch: Duration, - - pub pool_shards: usize, - - pub idle_timeout: Duration, - - pub opt_in: bool, - - // Total number of connections in the pool. - pub max_total_conns: usize, -} - -impl GlobalConnPool { - pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { - let shards = config.pool_options.pool_shards; - Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), - global_pool_size: AtomicUsize::new(0), - config, - global_connections_count: Arc::new(AtomicUsize::new(0)), - }) - } - - #[cfg(test)] - pub(crate) fn get_global_connections_count(&self) -> usize { - self.global_connections_count - .load(atomic::Ordering::Relaxed) - } - - pub(crate) fn get_idle_timeout(&self) -> Duration { - self.config.pool_options.idle_timeout - } - - pub(crate) fn shutdown(&self) { - // drops all strong references to endpoint-pools - self.global_pool.clear(); - } - - pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.config.pool_options.gc_epoch; - let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); - loop { - interval.tick().await; - - let shard = rng.gen_range(0..self.global_pool.shards().len()); - self.gc(shard); - } - } - - fn gc(&self, shard: usize) { - debug!(shard, "pool: performing epoch reclamation"); - - // acquire a random shard lock - let mut shard = self.global_pool.shards()[shard].write(); - - let timer = Metrics::get() - .proxy - .http_pool_reclaimation_lag_seconds - .start_timer(); - let current_len = shard.len(); - let mut clients_removed = 0; - shard.retain(|endpoint, x| { - // if the current endpoint pool is unique (no other strong or weak references) - // then it is currently not in use by any connections. - if let Some(pool) = Arc::get_mut(x.get_mut()) { - let EndpointConnPool { - pools, total_conns, .. - } = pool.get_mut(); - - // ensure that closed clients are removed - for db_pool in pools.values_mut() { - clients_removed += db_pool.clear_closed_clients(total_conns); - } - - // we only remove this pool if it has no active connections - if *total_conns == 0 { - info!("pool: discarding pool for endpoint {endpoint}"); - return false; - } - } - - true - }); - - let new_len = shard.len(); - drop(shard); - timer.observe(); - - // Do logging outside of the lock. 
- if clients_removed > 0 { - let size = self - .global_connections_count - .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - - clients_removed; - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); - } - let removed = current_len - new_len; - - if removed > 0 { - let global_pool_size = self - .global_pool_size - .fetch_sub(removed, atomic::Ordering::Relaxed) - - removed; - info!("pool: performed global pool gc. size now {global_pool_size}"); - } - } - - pub(crate) fn get( - self: &Arc, - ctx: &RequestMonitoring, - conn_info: &ConnInfo, - ) -> Result>, HttpConnError> { - let mut client: Option> = None; - let Some(endpoint) = conn_info.endpoint_cache_key() else { - return Ok(None); - }; - - let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); - if let Some(entry) = endpoint_pool - .write() - .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } - let endpoint_pool = Arc::downgrade(&endpoint_pool); - - // ok return cached connection if found and establish a new one otherwise - if let Some(client) = client { - if client.is_closed() { - info!("pool: cached connection '{conn_info}' is closed, opening a new one"); - return Ok(None); - } - tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); - tracing::Span::current().record( - "pid", - tracing::field::display(client.inner.get_process_id()), - ); - info!( - cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), - "pool: reusing connection '{conn_info}'" - ); - client.session.send(ctx.session_id())?; - ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.success(); - return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); - } - Ok(None) - } - - fn get_or_create_endpoint_pool( - self: &Arc, - endpoint: &EndpointCacheKey, - ) -> Arc>> { - // fast path - if let Some(pool) = self.global_pool.get(endpoint) { - return pool.clone(); - } - - // slow path - let new_pool = Arc::new(RwLock::new(EndpointConnPool { - pools: HashMap::new(), - total_conns: 0, - max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: Metrics::get().proxy.http_endpoint_pools.guard(), - global_connections_count: self.global_connections_count.clone(), - global_pool_size_max_conns: self.config.pool_options.max_total_conns, - })); - - // find or create a pool for this endpoint - let mut created = false; - let pool = self - .global_pool - .entry(endpoint.clone()) - .or_insert_with(|| { - created = true; - new_pool - }) - .clone(); - - // log new global pool size - if created { - let global_pool_size = self - .global_pool_size - .fetch_add(1, atomic::Ordering::Relaxed) - + 1; - info!( - "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" - ); - } - - pool - } -} - pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, @@ -574,7 +154,7 @@ pub(crate) fn poll_client( } .instrument(span)); - let inner = ClientInner { + let inner = ClientInnerRemote { inner: client, session: tx, cancel, @@ -584,7 +164,7 @@ pub(crate) fn poll_client( Client::new(inner, conn_info, pool_clone) } -struct ClientInner { +pub(crate) struct ClientInnerRemote { inner: C, session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -592,131 +172,36 @@ struct ClientInner { conn_id: uuid::Uuid, } -impl Drop for ClientInner { - fn drop(&mut self) { - // on client drop, tell the conn to 
shut down - self.cancel.cancel(); +impl ClientInnerRemote { + pub(crate) fn inner_mut(&mut self) -> &mut C { + &mut self.inner } -} - -pub(crate) trait ClientInnerExt: Sync + Send + 'static { - fn is_closed(&self) -> bool; - fn get_process_id(&self) -> i32; -} -impl ClientInnerExt for tokio_postgres::Client { - fn is_closed(&self) -> bool { - self.is_closed() + pub(crate) fn inner(&self) -> &C { + &self.inner } - fn get_process_id(&self) -> i32 { - self.get_process_id() - } -} -impl ClientInner { - pub(crate) fn is_closed(&self) -> bool { - self.inner.is_closed() + pub(crate) fn session(&mut self) -> &mut tokio::sync::watch::Sender { + &mut self.session } -} -impl Client { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) + pub(crate) fn aux(&self) -> &MetricsAuxInfo { + &self.aux } -} - -pub(crate) struct Client { - span: Span, - inner: Option>, - conn_info: ConnInfo, - pool: Weak>>, -} -pub(crate) struct Discard<'a, C: ClientInnerExt> { - conn_info: &'a ConnInfo, - pool: &'a mut Weak>>, -} - -impl Client { - pub(self) fn new( - inner: ClientInner, - conn_info: ConnInfo, - pool: Weak>>, - ) -> Self { - Self { - inner: Some(inner), - span: Span::current(), - conn_info, - pool, - } - } - pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { - let Self { - inner, - pool, - conn_info, - span: _, - } = self; - let inner = inner.as_mut().expect("client inner should not be removed"); - (&mut inner.inner, Discard { conn_info, pool }) + pub(crate) fn get_conn_id(&self) -> uuid::Uuid { + self.conn_id } -} -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is not idle"); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); - } - } -} - -impl Deref for Client { - type Target = C; - - fn deref(&self) -> &Self::Target { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner - } -} - -impl Client { - fn do_drop(&mut self) -> Option { - let conn_info = self.conn_info.clone(); - let client = self - .inner - .take() - .expect("client inner should not be removed"); - if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { - let current_span = self.span.clone(); - // return connection to the pool - return Some(move || { - let _span = current_span.enter(); - EndpointConnPool::put(&conn_pool, &conn_info, client); - }); - } - None + pub(crate) fn is_closed(&self) -> bool { + self.inner.is_closed() } } -impl Drop for Client { +impl Drop for ClientInnerRemote { fn drop(&mut self) { - if let Some(drop) = self.do_drop() { - tokio::task::spawn_blocking(drop); - } + // on client drop, tell the conn to shut down + self.cancel.cancel(); } } @@ -745,12 +230,12 @@ mod tests { } } - fn create_inner() -> ClientInner { + fn create_inner() -> ClientInnerRemote { create_inner_with(MockClient::new(false)) } - fn create_inner_with(client: MockClient) -> ClientInner { - ClientInner { + fn create_inner_with(client: MockClient) -> ClientInnerRemote { + ClientInnerRemote { inner: client, session: 
tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), cancel: CancellationToken::new(), @@ -797,7 +282,7 @@ mod tests { { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); - client.inner().1.discard(); + client.inner_mut().1.discard(); // Discard should not add the connection from the pool. assert_eq!(0, pool.get_global_connections_count()); } diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs new file mode 100644 index 000000000000..6e964ce8789f --- /dev/null +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -0,0 +1,562 @@ +use dashmap::DashMap; +use parking_lot::RwLock; +use rand::Rng; +use std::{collections::HashMap, sync::Arc, sync::Weak, time::Duration}; +use std::{ + ops::Deref, + sync::atomic::{self, AtomicUsize}, +}; +use tokio_postgres::ReadyForQueryStatus; + +use crate::control_plane::messages::ColdStartInfo; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::{ + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, +}; + +use super::conn_pool::ClientInnerRemote; +use tracing::info; +use tracing::{debug, Span}; + +use super::backend::HttpConnError; + +#[derive(Debug, Clone)] +pub(crate) struct ConnInfo { + pub(crate) user_info: ComputeUserInfo, + pub(crate) dbname: DbName, +} + +impl ConnInfo { + // hm, change to hasher to avoid cloning? + pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { + (self.dbname.clone(), self.user_info.user.clone()) + } + + pub(crate) fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. + if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } + } +} + +pub(crate) struct ConnPoolEntry { + pub(crate) conn: ClientInnerRemote, + pub(crate) _last_access: std::time::Instant, +} + +// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool +// Number of open connections is limited by the `max_conns_per_endpoint`. +pub(crate) struct EndpointConnPool { + pools: HashMap<(DbName, RoleName), DbUserConnPool>, + total_conns: usize, + max_conns: usize, + _guard: HttpEndpointPoolsGuard<'static>, + global_connections_count: Arc, + global_pool_size_max_conns: usize, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { + let Self { + pools, + total_conns, + global_connections_count, + .. + } = self; + pools.get_mut(&db_user).and_then(|pool_entries| { + let (entry, removed) = pool_entries.get_conn_entry(total_conns); + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + entry + }) + } + + pub(crate) fn remove_client( + &mut self, + db_user: (DbName, RoleName), + conn_id: uuid::Uuid, + ) -> bool { + let Self { + pools, + total_conns, + global_connections_count, + .. 
+ } = self; + if let Some(pool) = pools.get_mut(&db_user) { + let old_len = pool.conns.len(); + pool.conns.retain(|conn| conn.conn.get_conn_id() != conn_id); + let new_len = pool.conns.len(); + let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + } + *total_conns -= removed; + removed > 0 + } else { + false + } + } + + pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerRemote) { + let conn_id = client.get_conn_id(); + + if client.is_closed() { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); + return; + } + + let global_max_conn = pool.read().global_pool_size_max_conns; + if pool + .read() + .global_connections_count + .load(atomic::Ordering::Relaxed) + >= global_max_conn + { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); + return; + } + + // return connection to the pool + let mut returned = false; + let mut per_db_size = 0; + let total_conns = { + let mut pool = pool.write(); + + if pool.total_conns < pool.max_conns { + let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + returned = true; + per_db_size = pool_entries.conns.len(); + + pool.total_conns += 1; + pool.global_connections_count + .fetch_add(1, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); + } + + pool.total_conns + }; + + // do logging outside of the mutex + if returned { + info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + } + } +} + +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if self.total_conns > 0 { + self.global_connections_count + .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); + } + } +} + +pub(crate) struct DbUserConnPool { + pub(crate) conns: Vec>, +} + +impl Default for DbUserConnPool { + fn default() -> Self { + Self { conns: Vec::new() } + } +} + +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { + let old_len = self.conns.len(); + + self.conns.retain(|conn| !conn.conn.is_closed()); + + let new_len = self.conns.len(); + let removed = old_len - new_len; + *conns -= removed; + removed + } + + pub(crate) fn get_conn_entry( + &mut self, + conns: &mut usize, + ) -> (Option>, usize) { + let mut removed = self.clear_closed_clients(conns); + let conn = self.conns.pop(); + if conn.is_some() { + *conns -= 1; + removed += 1; + } + + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + + (conn, removed) + } +} + +pub(crate) struct GlobalConnPool { + // endpoint -> per-endpoint connection pool + // + // That should be a fairly conteded map, so return reference to the per-endpoint + // pool as early as possible and release the lock. + global_pool: DashMap>>>, + + /// Number of endpoint-connection pools + /// + /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. 
+ /// That seems like far too much effort, so we're using a relaxed increment counter instead. + /// It's only used for diagnostics. + global_pool_size: AtomicUsize, + + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, +} + +#[derive(Debug, Clone, Copy)] +pub struct GlobalConnPoolOptions { + // Maximum number of connections per one endpoint. + // Can mix different (dbname, username) connections. + // When running out of free slots for a particular endpoint, + // falls back to opening a new connection for each request. + pub max_conns_per_endpoint: usize, + + pub gc_epoch: Duration, + + pub pool_shards: usize, + + pub idle_timeout: Duration, + + pub opt_in: bool, + + // Total number of connections in the pool. + pub max_total_conns: usize, +} + +impl GlobalConnPool { + pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; + Arc::new(Self { + global_pool: DashMap::with_shard_amount(shards), + global_pool_size: AtomicUsize::new(0), + config, + global_connections_count: Arc::new(AtomicUsize::new(0)), + }) + } + + #[cfg(test)] + pub(crate) fn get_global_connections_count(&self) -> usize { + self.global_connections_count + .load(atomic::Ordering::Relaxed) + } + + pub(crate) fn get_idle_timeout(&self) -> Duration { + self.config.pool_options.idle_timeout + } + + pub(crate) fn shutdown(&self) { + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } + + pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; + + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + pub(crate) fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); + let current_len = shard.len(); + let mut clients_removed = 0; + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { + pools, total_conns, .. + } = pool.get_mut(); + + // ensure that closed clients are removed + for db_pool in pools.values_mut() { + clients_removed += db_pool.clear_closed_clients(total_conns); + } + + // we only remove this pool if it has no active connections + if *total_conns == 0 { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true + }); + + let new_len = shard.len(); + drop(shard); + timer.observe(); + + // Do logging outside of the lock. + if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); + info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + } + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. 
size now {global_pool_size}"); + } + } + + pub(crate) fn get_or_create_endpoint_pool( + self: &Arc, + endpoint: &EndpointCacheKey, + ) -> Arc>> { + // fast path + if let Some(pool) = self.global_pool.get(endpoint) { + return pool.clone(); + } + + // slow path + let new_pool = Arc::new(RwLock::new(EndpointConnPool { + pools: HashMap::new(), + total_conns: 0, + max_conns: self.config.pool_options.max_conns_per_endpoint, + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), + global_connections_count: self.global_connections_count.clone(), + global_pool_size_max_conns: self.config.pool_options.max_total_conns, + })); + + // find or create a pool for this endpoint + let mut created = false; + let pool = self + .global_pool + .entry(endpoint.clone()) + .or_insert_with(|| { + created = true; + new_pool + }) + .clone(); + + // log new global pool size + if created { + let global_pool_size = self + .global_pool_size + .fetch_add(1, atomic::Ordering::Relaxed) + + 1; + info!( + "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" + ); + } + + pool + } + + pub(crate) fn get( + self: &Arc, + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; + + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn); + } + let endpoint_pool = Arc::downgrade(&endpoint_pool); + + // ok return cached connection if found and establish a new one otherwise + if let Some(mut client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); + } + tracing::Span::current() + .record("conn_id", tracing::field::display(client.get_conn_id())); + tracing::Span::current().record( + "pid", + tracing::field::display(client.inner().get_process_id()), + ); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + + client.session().send(ctx.session_id())?; + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); + } + Ok(None) + } +} + +impl Client { + pub(crate) fn new( + inner: ClientInnerRemote, + conn_info: ConnInfo, + pool: Weak>>, + ) -> Self { + Self { + inner: Some(inner), + span: Span::current(), + conn_info, + pool, + } + } + + pub(crate) fn inner_mut(&mut self) -> (&mut C, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner = inner.as_mut().expect("client inner should not be removed"); + let inner_ref = inner.inner_mut(); + (inner_ref, Discard { conn_info, pool }) + } + + pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux(); + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) + } + + pub(crate) fn do_drop(&mut self) -> Option { + let conn_info = self.conn_info.clone(); + let client = self + .inner + .take() + .expect("client inner should not be removed"); + if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { + let current_span = self.span.clone(); + // return connection to the pool + return Some(move || { + let _span = current_span.enter(); + EndpointConnPool::put(&conn_pool, &conn_info, client); + }); + } + None + } +} + +pub(crate) struct Client { + span: 
Span, + inner: Option>, + conn_info: ConnInfo, + pool: Weak>>, +} + +impl Drop for Client { + fn drop(&mut self) { + if let Some(drop) = self.do_drop() { + tokio::task::spawn_blocking(drop); + } + } +} + +impl Deref for Client { + type Target = C; + + fn deref(&self) -> &Self::Target { + self.inner + .as_ref() + .expect("client inner should not be removed") + .inner() + } +} + +pub(crate) trait ClientInnerExt: Sync + Send + 'static { + fn is_closed(&self) -> bool; + fn get_process_id(&self) -> i32; +} + +impl ClientInnerExt for tokio_postgres::Client { + fn is_closed(&self) -> bool { + self.is_closed() + } + + fn get_process_id(&self) -> i32 { + self.get_process_id() + } +} + +pub(crate) struct Discard<'a, C: ClientInnerExt> { + conn_info: &'a ConnInfo, + pool: &'a mut Weak>>, +} + +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!("pool: throwing away connection '{conn_info}' because connection is not idle"); + } + } + pub(crate) fn discard(&mut self) { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + } + } +} diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 9b6bc98557a5..79bb19328ffb 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -10,11 +10,12 @@ use rand::Rng; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; -use super::conn_pool::ConnInfo; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; + +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; @@ -22,15 +23,15 @@ pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] -struct ConnPoolEntry { - conn: Send, +pub(crate) struct ConnPoolEntry { + conn: C, conn_id: uuid::Uuid, aux: MetricsAuxInfo, } // Per-endpoint connection pool // Number of open connections is limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { +pub(crate) struct EndpointConnPool { // TODO(conrad): // either we should open more connections depending on stream count // (not exposed by hyper, need our own counter) @@ -40,13 +41,13 @@ pub(crate) struct EndpointConnPool { // seems somewhat redundant though. // // Probably we should run a semaphore and just the single conn. TBD. - conns: VecDeque, + conns: VecDeque>, _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, } -impl EndpointConnPool { - fn get_conn_entry(&mut self) -> Option { +impl EndpointConnPool { + fn get_conn_entry(&mut self) -> Option> { let Self { conns, .. 
} = self; loop { @@ -81,7 +82,7 @@ impl EndpointConnPool { } } -impl Drop for EndpointConnPool { +impl Drop for EndpointConnPool { fn drop(&mut self) { if !self.conns.is_empty() { self.global_connections_count @@ -95,12 +96,12 @@ impl Drop for EndpointConnPool { } } -pub(crate) struct GlobalConnPool { +pub(crate) struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>, + global_pool: DashMap>>>, /// Number of endpoint-connection pools /// @@ -115,7 +116,7 @@ pub(crate) struct GlobalConnPool { config: &'static crate::config::HttpConfig, } -impl GlobalConnPool { +impl GlobalConnPool { pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { @@ -210,7 +211,7 @@ impl GlobalConnPool { self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, - ) -> Option { + ) -> Option> { let endpoint = conn_info.endpoint_cache_key()?; let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); let client = endpoint_pool.write().get_conn_entry()?; @@ -228,7 +229,7 @@ impl GlobalConnPool { fn get_or_create_endpoint_pool( self: &Arc, endpoint: &EndpointCacheKey, - ) -> Arc> { + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -268,14 +269,14 @@ impl GlobalConnPool { } pub(crate) fn poll_http2_client( - global_pool: Arc, + global_pool: Arc>, ctx: &RequestMonitoring, conn_info: &ConnInfo, client: Send, connection: Connect, conn_id: uuid::Uuid, aux: MetricsAuxInfo, -) -> Client { +) -> Client { let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); let session_id = ctx.session_id(); @@ -322,13 +323,13 @@ pub(crate) fn poll_http2_client( Client::new(client, aux) } -pub(crate) struct Client { - pub(crate) inner: Send, +pub(crate) struct Client { + pub(crate) inner: C, aux: MetricsAuxInfo, } -impl Client { - pub(self) fn new(inner: Send, aux: MetricsAuxInfo) -> Self { +impl Client { + pub(self) fn new(inner: C, aux: MetricsAuxInfo) -> Self { Self { inner, aux } } @@ -339,3 +340,14 @@ impl Client { }) } } + +impl ClientInnerExt for Send { + fn is_closed(&self) -> bool { + self.is_closed() + } + + fn get_process_id(&self) -> i32 { + // ideally throw something meaningful + -1 + } +} diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 5df37a8762ff..c4fdd00f7859 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -20,11 +20,12 @@ use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument, Span}; use super::backend::HttpConnError; -use super::conn_pool::{ClientInnerExt, ConnInfo}; +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; + use crate::{DbName, RoleName}; struct ConnPoolEntry { @@ -362,7 +363,7 @@ pub(crate) fn poll_client( LocalClient::new(inner, conn_info, pool_clone) } -struct ClientInner { +pub(crate) struct ClientInner { inner: C, session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -387,13 +388,24 @@ impl ClientInner { } } -impl LocalClient { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - 
USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) +impl ClientInner { + pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { + self.jti += 1; + let token = resign_jwt(&self.key, payload, self.jti)?; + + // initiates the auth session + self.inner.simple_query("discard all").await?; + self.inner + .query( + "select auth.jwt_session_init($1)", + &[&token as &(dyn ToSql + Sync)], + ) + .await?; + + let pid = self.inner.get_process_id(); + info!(pid, jti = self.jti, "user session state init"); + + Ok(()) } } @@ -422,6 +434,18 @@ impl LocalClient { pool, } } + + pub(crate) fn client_inner(&mut self) -> (&mut ClientInner, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner_m = inner.as_mut().expect("client inner should not be removed"); + (inner_m, Discard { conn_info, pool }) + } + pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, @@ -434,33 +458,6 @@ impl LocalClient { } } -impl LocalClient { - pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { - let inner = self - .inner - .as_mut() - .expect("client inner should not be removed"); - - inner.jti += 1; - let token = resign_jwt(&inner.key, payload, inner.jti)?; - - // initiates the auth session - inner.inner.simple_query("discard all").await?; - inner - .inner - .query( - "select auth.jwt_session_init($1)", - &[&token as &(dyn ToSql + Sync)], - ) - .await?; - - let pid = inner.inner.get_process_id(); - info!(pid, jti = inner.jti, "user session state init"); - - Ok(()) - } -} - /// implements relatively efficient in-place json object key upserting /// /// only supports top-level keys @@ -524,24 +521,15 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { jwt } -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!( - "local_pool: throwing away connection '{conn_info}' because connection is not idle" - ); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); - } +impl LocalClient { + pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux; + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) } -} -impl LocalClient { fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self @@ -568,6 +556,23 @@ impl Drop for LocalClient { } } +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!( + "local_pool: throwing away connection '{conn_info}' because connection is not idle" + ); + } + } + pub(crate) fn discard(&mut self) { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { + info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + } + } +} + #[cfg(test)] mod tests { use p256::ecdsa::SigningKey; diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 3ed3b6c845ce..29ff7b9d91c4 100644 --- 
a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -5,6 +5,7 @@ mod backend; pub mod cancel_set; mod conn_pool; +mod conn_pool_lib; mod http_conn_pool; mod http_util; mod json; @@ -20,7 +21,7 @@ use anyhow::Context; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; -pub use conn_pool::GlobalConnPoolOptions; +pub use conn_pool_lib::GlobalConnPoolOptions; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; @@ -65,7 +66,7 @@ pub async fn task_main( } let local_pool = local_conn_pool::LocalConnPool::new(&config.http_config); - let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config); + let conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config); { let conn_pool = Arc::clone(&conn_pool); tokio::spawn(async move { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 3d8a2adef198..bb5eb390a6bc 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -25,10 +25,11 @@ use urlencoding; use utils::http::error::ApiError; use super::backend::{LocalProxyConnError, PoolingBackend}; -use super::conn_pool::{AuthData, ConnInfo, ConnInfoWithAuth}; +use super::conn_pool::{AuthData, ConnInfoWithAuth}; +use super::conn_pool_lib::{self, ConnInfo}; use super::http_util::json_response; use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; -use super::{conn_pool, local_conn_pool}; +use super::local_conn_pool; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; @@ -37,6 +38,7 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; + use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; use crate::{DbName, RoleName}; @@ -607,7 +609,8 @@ async fn handle_db_inner( let client = match keys.keys { ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => { let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; - client.set_jwt_session(&payload).await?; + let (cli_inner, _dsc) = client.client_inner(); + cli_inner.set_jwt_session(&payload).await?; Client::Local(client) } _ => { @@ -1021,12 +1024,12 @@ async fn query_to_json( } enum Client { - Remote(conn_pool::Client), + Remote(conn_pool_lib::Client), Local(local_conn_pool::LocalClient), } enum Discard<'a> { - Remote(conn_pool::Discard<'a, tokio_postgres::Client>), + Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>), Local(local_conn_pool::Discard<'a, tokio_postgres::Client>), } @@ -1041,7 +1044,7 @@ impl Client { fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { match self { Client::Remote(client) => { - let (c, d) = client.inner(); + let (c, d) = client.inner_mut(); (c, Discard::Remote(d)) } Client::Local(local_client) => { From 35e7d91bc9eb07c8ef70acef5e224c9b9e78a0ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 17 Oct 2024 14:07:58 +0200 Subject: [PATCH 36/38] Add config variable for timeline offloading (#9421) Adds a configuration variable for timeline offloading support. The added pageserver-global config option controls whether the pageserver automatically offloads timelines during compaction. 
Therefore, already offloaded timelines are not affected by this, nor is the manual testing endpoint. This allows the rollout of timeline offloading to be driven by the storage team. Part of #8088 --- libs/pageserver_api/src/config.rs | 2 ++ pageserver/src/config.rs | 5 +++++ pageserver/src/tenant.rs | 3 ++- pageserver/src/tenant/timeline.rs | 1 + test_runner/regress/test_timeline_archive.py | 4 ++++ 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 24474d48405e..896a5d8069b2 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -102,6 +102,7 @@ pub struct ConfigToml { pub ingest_batch_size: u64, pub max_vectored_read_bytes: MaxVectoredReadBytes, pub image_compression: ImageCompressionAlgorithm, + pub timeline_offloading: bool, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, @@ -385,6 +386,7 @@ impl Default for ConfigToml { NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: (DEFAULT_IMAGE_COMPRESSION), + timeline_offloading: false, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8db78285e476..06d432645919 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -164,6 +164,9 @@ pub struct PageServerConf { pub image_compression: ImageCompressionAlgorithm, + /// Whether to offload archived timelines automatically + pub timeline_offloading: bool, + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. @@ -321,6 +324,7 @@ impl PageServerConf { ingest_batch_size, max_vectored_read_bytes, image_compression, + timeline_offloading, ephemeral_bytes_per_memory_kb, l0_flush, virtual_file_io_mode, @@ -364,6 +368,7 @@ impl PageServerConf { ingest_batch_size, max_vectored_read_bytes, image_compression, + timeline_offloading, ephemeral_bytes_per_memory_kb, // ------------------------------------------------------------ diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 689982ddd4e3..baa236565810 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2187,7 +2187,8 @@ impl Tenant { .iter() .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) }; - let can_offload = can_offload && has_no_unoffloaded_children; + let can_offload = + can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading; if (is_active, can_offload) == (false, false) { None } else { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1992dee93038..2b4f949c76da 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1565,6 +1565,7 @@ impl Timeline { } /// Checks if the internal state of the timeline is consistent with it being able to be offloaded. + /// /// This is neccessary but not sufficient for offloading of the timeline as it might have /// child timelines that are not offloaded yet. 
pub(crate) fn can_offload(&self) -> bool { diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index ffaed5e1307e..85e1077fd5bf 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -119,6 +119,10 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): @pytest.mark.parametrize("manual_offload", [False, True]) def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): + if not manual_offload: + # (automatic) timeline offloading defaults to false for now + neon_env_builder.pageserver_config_override = "timeline_offloading = true" + env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() From 8b479381403cd2be8f7bc7eba69d5074735d8924 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 17 Oct 2024 13:37:21 +0100 Subject: [PATCH 37/38] Add support of extensions for v17 (part 3) (#9430) - pgvector 7.4 update support of extensions for v14-v16: - pgvector 7.2 -> 7.4 --- compute/Dockerfile.compute-node | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index b0ce7c171869..45c1fd9f3871 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -353,13 +353,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. # -# v17 is not supported yet because of upstream issue -# https://github.com/pgvector/pgvector/issues/669 -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ - echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ +# vector 0.7.4 supports v17 +# last release v0.7.4 - Aug 5, 2024 +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \ + echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ From a7c05686ccbebc856b0ce389a9fa60d2bddbeea6 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 17 Oct 2024 17:20:42 +0300 Subject: [PATCH 38/38] test_runner: Update the README.md to build neon with 'testing' (#9437) Without having the '--features testing' in the cargo build the proxy won't start causing tests to fail. --- test_runner/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/README.md b/test_runner/README.md index e087241c1f05..55d8d2faa9e4 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,7 +6,7 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions - If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands. + To run tests you need to add `--features testing` to Rust code build commands. 
For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags. Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release` - Tests can be run from the git tree; or see the environment variables