diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bfdf3be2b9cf..81a9fd99ae09 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -602,7 +602,20 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16, v17 ] + version: + # Much data was already generated on old PG versions with bullseye's + # libraries, the locales of which can cause data incompatibilities. + # However, new PG versions should check if they can be built on newer + # images, as that reduces the support burden of old and ancient + # distros. + - pg: v14 + debian: bullseye-slim + - pg: v15 + debian: bullseye-slim + - pg: v16 + debian: bullseye-slim + - pg: v17 + debian: bookworm-slim arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -645,41 +658,46 @@ jobs: context: . build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - PG_VERSION=${{ matrix.version }} + PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + DEBIAN_FLAVOR=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Build neon extensions test image - if: matrix.version == 'v16' + if: matrix.version.pg == 'v16' uses: docker/build-push-action@v6 with: context: . build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - PG_VERSION=${{ matrix.version }} + PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + DEBIAN_FLAVOR=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once - if: matrix.version == 'v17' + # We pick 16, because that builds on debian 11 with older glibc (and is + # thus compatible with newer glibc), rather than 17 on Debian 12, as + # that isn't guaranteed to be compatible with Debian 11 + if: matrix.version.pg == 'v16' uses: docker/build-push-action@v6 with: target: compute-tools-image @@ -688,6 +706,7 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + DEBIAN_FLAVOR=${{ matrix.version.debian }} provenance: false push: true pull: true diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index de6babdde39a..ecafe183f8a8 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -42,7 +42,7 @@ jobs: - name: Patch the test run: | cd "vendor/postgres-v${DEFAULT_PG_VERSION}" - patch -p1 < "../../patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch" + patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch" - name: Generate a random password id: pwgen diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index f25c1051cd98..cad97645327b 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -102,7 +102,7 @@ jobs: # Default set of platforms to run e2e tests on platforms='["docker", "k8s"]' - # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms. + # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or compute/Dockerfile.compute-node, add k8s-neonvm to the list of platforms. # If the workflow run is not a pull request, add k8s-neonvm to the list. if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do diff --git a/Cargo.lock b/Cargo.lock index e4dbd8b33398..d0702e09d412 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1321,7 +1321,6 @@ dependencies = [ "clap", "comfy-table", "compute_api", - "git-version", "humantime", "humantime-serde", "hyper 0.14.30", @@ -3578,7 +3577,6 @@ dependencies = [ "anyhow", "camino", "clap", - "git-version", "humantime", "pageserver", "pageserver_api", @@ -3617,7 +3615,6 @@ dependencies = [ "enumset", "fail", "futures", - "git-version", "hex", "hex-literal", "humantime", @@ -3737,7 +3734,6 @@ dependencies = [ "clap", "criterion", "futures", - "git-version", "hex-literal", "itertools 0.10.5", "once_cell", @@ -4307,7 +4303,6 @@ dependencies = [ "fallible-iterator", "framed-websockets", "futures", - "git-version", "hashbrown 0.14.5", "hashlink", "hex", @@ -5139,7 +5134,6 @@ dependencies = [ "desim", "fail", "futures", - "git-version", "hex", "humantime", "hyper 0.14.30", @@ -5702,7 +5696,6 @@ dependencies = [ "futures", "futures-core", "futures-util", - "git-version", "humantime", "hyper 0.14.30", "metrics", @@ -5730,7 +5723,6 @@ dependencies = [ "diesel_migrations", "fail", "futures", - "git-version", "hex", "humantime", "hyper 0.14.30", @@ -5783,7 +5775,6 @@ dependencies = [ "either", "futures", "futures-util", - "git-version", "hex", "humantime", "itertools 0.10.5", @@ -6715,6 +6706,7 @@ dependencies = [ "criterion", "fail", "futures", + "git-version", "hex", "hex-literal", "humantime", diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 18c68c116a94..2c647a669c28 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -3,13 +3,15 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG +ARG DEBIAN_FLAVOR=bullseye-slim ######################################################################################### # # Layer "build-deps" # ######################################################################################### -FROM debian:bullseye-slim AS build-deps +FROM debian:$DEBIAN_FLAVOR AS build-deps +ARG DEBIAN_FLAVOR RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ @@ -1027,7 +1029,8 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de # ######################################################################################### -FROM debian:bullseye-slim AS compute-tools-image +FROM debian:$DEBIAN_FLAVOR AS compute-tools-image +ARG DEBIAN_FLAVOR COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl @@ -1037,7 +1040,8 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compu # ######################################################################################### -FROM debian:bullseye-slim AS pgbouncer +FROM debian:$DEBIAN_FLAVOR AS pgbouncer +ARG DEBIAN_FLAVOR RUN set -e \ && apt-get update \ && apt-get install -y \ @@ -1179,7 +1183,9 @@ ENV PGDATABASE=postgres # Put it all together into the final image # ######################################################################################### -FROM debian:bullseye-slim +FROM debian:$DEBIAN_FLAVOR +ARG DEBIAN_FLAVOR +ENV DEBIAN_FLAVOR=$DEBIAN_FLAVOR # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ @@ -1211,21 +1217,34 @@ COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector # Create remote extension download directory RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions - # Install: # libreadline8 for psql -# libicu67, locales for collations (including ICU and plpgsql_check) # liblz4-1 for lz4 # libossp-uuid16 for extension ossp-uuid -# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS +# libgeos, libsfcgal1, and libprotobuf-c1 for PostGIS # libxml2, libxslt1.1 for xml2 # libzstd1 for zstd # libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl -RUN apt update && \ + + +RUN apt update && \ + case $DEBIAN_FLAVOR in \ + # Version-specific installs for Bullseye (PG14-PG16): + # libicu67, locales for collations (including ICU and plpgsql_check) + # libgdal28, libproj19 for PostGIS + bullseye*) \ + VERSION_INSTALLS="libicu67 libgdal28 libproj19"; \ + ;; \ + # Version-specific installs for Bookworm (PG17): + # libicu72, locales for collations (including ICU and plpgsql_check) + # libgdal32, libproj25 for PostGIS + bookworm*) \ + VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \ + ;; \ + esac && \ apt install --no-install-recommends -y \ gdb \ - libicu67 \ liblz4-1 \ libreadline8 \ libboost-iostreams1.74.0 \ @@ -1234,8 +1253,6 @@ RUN apt update && \ libboost-system1.74.0 \ libossp-uuid16 \ libgeos-c1v5 \ - libgdal28 \ - libproj19 \ libprotobuf-c1 \ libsfcgal1 \ libxml2 \ @@ -1244,7 +1261,8 @@ RUN apt update && \ libcurl4-openssl-dev \ locales \ procps \ - ca-certificates && \ + ca-certificates \ + $VERSION_INSTALLS && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git a/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch similarity index 100% rename from patches/cloud_regress_pg16.patch rename to compute/patches/cloud_regress_pg16.patch diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index c185d20484a4..df87c181bf47 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -9,7 +9,6 @@ anyhow.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true -git-version.workspace = true humantime.workspace = true nix.workspace = true once_cell.workspace = true diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 2b714fbfbf10..0c0e67dff057 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -346,7 +346,14 @@ impl StorageController { let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? { - let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()]; + let initdb_args = [ + "-D", + pg_data_path.as_ref(), + "--username", + &username(), + "--no-sync", + "--no-instructions", + ]; tracing::info!( "Initializing storage controller database with args: {:?}", initdb_args diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 651fcda8db52..73d89699edb6 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ - NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy, - TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, + AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, + ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -339,7 +339,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, - availability_zone_id, + availability_zone_id: AvailabilityZone(availability_zone_id), }), ) .await?; diff --git a/docker-compose/README.md b/docker-compose/README.md index bd47805a6791..648e4ca030c6 100644 --- a/docker-compose/README.md +++ b/docker-compose/README.md @@ -2,8 +2,8 @@ # Example docker compose configuration The configuration in this directory is used for testing Neon docker images: it is -not intended for deploying a usable system. To run a development environment where -you can experiment with a minature Neon system, use `cargo neon` rather than container images. +not intended for deploying a usable system. To run a development environment where +you can experiment with a miniature Neon system, use `cargo neon` rather than container images. This configuration does not start the storage controller, because the controller needs a way to reconfigure running computes, and no such thing exists in this setup. diff --git a/docs/rfcs/038-independent-compute-release.md b/docs/rfcs/038-independent-compute-release.md new file mode 100644 index 000000000000..3deaf1e6fdfb --- /dev/null +++ b/docs/rfcs/038-independent-compute-release.md @@ -0,0 +1,343 @@ +# Independent compute release + +Created at: 2024-08-30. Author: Alexey Kondratov (@ololobus) + +## Summary + +This document proposes an approach to fully independent compute release flow. It attempts to +cover the following features: + +- Process is automated as much as possible to minimize human errors. +- Compute<->storage protocol compatibility is ensured. +- A transparent release history is available with an easy rollback strategy. +- Although not in the scope of this document, there is a viable way to extend the proposed release + flow to achieve the canary and/or blue-green deployment strategies. + +## Motivation + +Previously, the compute release was tightly coupled to the storage release. This meant that once +some storage nodes got restarted with a newer version, all new compute starts using these nodes +automatically got a new version. Thus, two releases happen in parallel, which increases the blast +radius and makes ownership fuzzy. + +Now, we practice a manual v0 independent compute release flow -- after getting a new compute release +image and tag, we pin it region by region using Admin UI. It's better, but it still has its own flaws: + +1. It's a simple but fairly manual process, as you need to click through a few pages. +2. It's prone to human errors, e.g., you could mistype or copy the wrong compute tag. +3. We now require an additional approval in the Admin UI, which partially solves the 2., + but also makes the whole process pretty annoying, as you constantly need to go back + and forth between two people. + +## Non-goals + +It's not the goal of this document to propose a design for some general-purpose release tool like Helm. +The document considers how the current compute fleet is orchestrated at Neon. Even if we later +decide to split the control plane further (e.g., introduce a separate compute controller), the proposed +release process shouldn't change much, i.e., the releases table and API will reside in +one of the parts. + +Achieving the canary and/or blue-green deploy strategies is out of the scope of this document. They +were kept in mind, though, so it's expected that the proposed approach will lay down the foundation +for implementing them in future iterations. + +## Impacted components + +Compute, control plane, CI, observability (some Grafana dashboards may require changes). + +## Prior art + +One of the very close examples is how Helm tracks [releases history](https://helm.sh/docs/helm/helm_history/). + +In the code: + +- [Release](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/release.go#L20-L43) +- [Release info](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/info.go#L24-L40) +- [Release status](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/status.go#L18-L42) + +TL;DR it has several important attributes: + +- Revision -- unique release ID/primary key. It is not the same as the application version, + because the same version can be deployed several times, e.g., after a newer version rollback. +- App version -- version of the application chart/code. +- Config -- set of overrides to the default config of the application. +- Status -- current status of the release in the history. +- Timestamps -- tracks when a release was created and deployed. + +## Proposed implementation + +### Separate release branch + +We will use a separate release branch, `release-compute`, to have a clean history for releases and commits. +In order to avoid confusion with storage releases, we will use a different prefix for compute [git release +tags](https://github.com/neondatabase/neon/releases) -- `release-compute-XXXX`. We will use the same tag for +Docker images as well. The `neondatabase/compute-node-v16:release-compute-XXXX` looks longer and a bit redundant, +but it's better to have image and git tags in sync. + +Currently, control plane relies on the numeric compute and storage release versions to decide on compute->storage +compatibility. Once we implement this proposal, we should drop this code as release numbers will be completely +independent. The only constraint we want is that it must monotonically increase within the same release branch. + +### Compute config/settings manifest + +We will create a new sub-directory `compute` and file `compute/manifest.yaml` with a structure: + +```yaml +pg_settings: + # Common settings for primaries and secondaries of all versions. + common: + wal_log_hints: "off" + max_wal_size: "1024" + + per_version: + 14: + # Common settings for both replica and primary of version PG 14 + common: + shared_preload_libraries: "neon,pg_stat_statements,extension_x" + 15: + common: + shared_preload_libraries: "neon,pg_stat_statements,extension_x" + # Settings that should be applied only to + replica: + # Available only starting Postgres 15th + recovery_prefetch: "off" + # ... + 17: + common: + # For example, if third-party `extension_x` is not yet available for PG 17 + shared_preload_libraries: "neon,pg_stat_statements" + replica: + recovery_prefetch: "off" +``` + +**N.B.** Setting value should be a string with `on|off` for booleans and a number (as a string) +without units for all numeric settings. That's how the control plane currently operates. + +The priority of settings will be (a higher number is a higher priority): + +1. Any static and hard-coded settings in the control plane +2. `pg_settings->common` +3. Per-version `common` +4. Per-version `replica` +5. Any per-user/project/endpoint overrides in the control plane +6. Any dynamic setting calculated based on the compute size + +**N.B.** For simplicity, we do not do any custom logic for `shared_preload_libraries`, so it's completely +overridden if specified on some level. Make sure that you include all necessary extensions in it when you +do any overrides. + +**N.B.** There is a tricky question about what to do with custom compute image pinning we sometimes +do for particular projects and customers. That's usually some ad-hoc work and images are based on +the latest compute image, so it's relatively safe to assume that we could use settings from the latest compute +release. If for some reason that's not true, and further overrides are needed, it's also possible to do +on the project level together with pinning the image, so it's on-call/engineer/support responsibility to +ensure that compute starts with the specified custom image. The only real risk is that compute image will get +stale and settings from new releases will drift away, so eventually it will get something incompatible, +but i) this is some operational issue, as we do not want stale images anyway, and ii) base settings +receive something really new so rarely that the chance of this happening is very low. If we want to solve it completely, +then together with pinning the image we could also pin the matching release revision in the control plane. + +The compute team will own the content of `compute/manifest.yaml`. + +### Control plane: releases table + +In order to store information about releases, the control plane will use a table `compute_releases` with the following +schema: + +```sql +CREATE TABLE compute_releases ( + -- Unique release ID + -- N.B. Revision won't by synchronized across all regions, because all control planes are technically independent + -- services. We have the same situation with Helm releases as well because they could be deployed and rolled back + -- independently in different clusters. + revision BIGSERIAL PRIMARY KEY, + -- Numeric version of the compute image, e.g. 9057 + version BIGINT NOT NULL, + -- Compute image tag, e.g. `release-9057` + tag TEXT NOT NULL, + -- Current release status. Currently, it will be a simple enum + -- * `deployed` -- release is deployed and used for new compute starts. + -- Exactly one release can have this status at a time. + -- * `superseded` -- release has been replaced by a newer one. + -- But we can always extend it in the future when we need more statuses + -- for more complex deployment strategies. + status TEXT NOT NULL, + -- Any additional metadata for compute in the corresponding release + manifest JSONB NOT NULL, + -- Timestamp when release record was created in the control plane database + created_at TIMESTAMP NOT NULL DEFAULT now(), + -- Timestamp when release deployment was finished + deployed_at TIMESTAMP +); +``` + +We keep track of the old releases not only for the sake of audit, but also because we usually have ~30% of +old computes started using the image from one of the previous releases. Yet, when users want to reconfigure +them without restarting, the control plane needs to know what settings are applicable to them, so we also need +information about the previous releases that are readily available. There could be some other auxiliary info +needed as well: supported extensions, compute flags, etc. + +**N.B.** Here, we can end up in an ambiguous situation when the same compute image is deployed twice, e.g., +it was deployed once, then rolled back, and then deployed again, potentially with a different manifest. Yet, +we could've started some computes with the first deployment and some with the second. Thus, when we need to +look up the manifest for the compute by its image tag, we will see two records in the table with the same tag, +but different revision numbers. We can assume that this could happen only in case of rollbacks, so we +can just take the latest revision for the given tag. + +### Control plane: management API + +The control plane will implement new API methods to manage releases: + +1. `POST /management/api/v2/compute_releases` to create a new release. With payload + + ```json + { + "version": 9057, + "tag": "release-9057", + "manifest": {} + } + ``` + + and response + + ```json + { + "revision": 53, + "version": 9057, + "tag": "release-9057", + "status": "deployed", + "manifest": {}, + "created_at": "2024-08-15T15:52:01.0000Z", + "deployed_at": "2024-08-15T15:52:01.0000Z", + } + ``` + + Here, we can actually mix-in custom (remote) extensions metadata into the `manifest`, so that the control plane + will get information about all available extensions not bundled into compute image. The corresponding + workflow in `neondatabase/build-custom-extensions` should produce it as an artifact and make + it accessible to the workflow in the `neondatabase/infra`. See the complete release flow below. Doing that, + we put a constraint that new custom extension requires new compute release, which is good for the safety, + but is not exactly what we want operational-wise (we want to be able to deploy new extensions without new + images). Yet, it can be solved incrementally: v0 -- do not do anything with extensions at all; + v1 -- put them into the same manifest; v2 -- make them separate entities with their own lifecycle. + + **N.B.** This method is intended to be used in CI workflows, and CI/network can be flaky. It's reasonable + to assume that we could retry the request several times, even though it's already succeeded. Although it's + not a big deal to create several identical releases one-by-one, it's better to avoid it, so the control plane + should check if the latest release is identical and just return `304 Not Modified` in this case. + +2. `POST /management/api/v2/compute_releases/rollback` to rollback to any previously deployed release. With payload + including the revision of the release to rollback to: + + ```json + { + "revision": 52 + } + ``` + + Rollback marks the current release as `superseded` and creates a new release with all the same data as the + requested revision, but with a new revision number. + + This rollback API is not strictly needed, as we can just use `infra` repo workflow to deploy any + available tag. It's still nice to have for on-call and any urgent matters, for example, if we need + to rollback and GitHub is down. It's much easier to specify only the revision number vs. crafting + all the necessary data for the new release payload. + +### Compute->storage compatibility tests + +In order to safely release new compute versions independently from storage, we need to ensure that the currently +deployed storage is compatible with the new compute version. Currently, we maintain backward compatibility +in storage, but newer computes may require a newer storage version. + +Remote end-to-end (e2e) tests [already accept](https://github.com/neondatabase/cloud/blob/e3468d433e0d73d02b7d7e738d027f509b522408/.github/workflows/testing.yml#L43-L48) +`storage_image_tag` and `compute_image_tag` as separate inputs. That means that we could reuse e2e tests to ensure +compatibility between storage and compute: + +1. Pick the latest storage release tag and use it as `storage_image_tag`. +2. Pick a new compute tag built in the current compute release PR and use it as `compute_image_tag`. + Here, we should use a temporary ECR image tag, because the final tag will be known only after the release PR is merged. +3. Trigger e2e tests as usual. + +### Release flow + +```mermaid + sequenceDiagram + + actor oncall as Compute on-call person + participant neon as neondatabase/neon + + box private + participant cloud as neondatabase/cloud + participant exts as neondatabase/build-custom-extensions + participant infra as neondatabase/infra + end + + box cloud + participant preprod as Pre-prod control plane + participant prod as Production control plane + participant k8s as Compute k8s + end + + oncall ->> neon: Open release PR into release-compute + + activate neon + neon ->> cloud: CI: trigger e2e compatibility tests + activate cloud + cloud -->> neon: CI: e2e tests pass + deactivate cloud + neon ->> neon: CI: pass PR checks, get approvals + deactivate neon + + oncall ->> neon: Merge release PR into release-compute + + activate neon + neon ->> neon: CI: pass checks, build and push images + neon ->> exts: CI: trigger extensions build + activate exts + exts -->> neon: CI: extensions are ready + deactivate exts + neon ->> neon: CI: create release tag + neon ->> infra: Trigger release workflow using the produced tag + deactivate neon + + activate infra + infra ->> infra: CI: pass checks + infra ->> preprod: Release new compute image to pre-prod automatically
POST /management/api/v2/compute_releases + activate preprod + preprod -->> infra: 200 OK + deactivate preprod + + infra ->> infra: CI: wait for per-region production deploy approvals + oncall ->> infra: CI: approve deploys region by region + infra ->> k8s: Prewarm new compute image + infra ->> prod: POST /management/api/v2/compute_releases + activate prod + prod -->> infra: 200 OK + deactivate prod + deactivate infra +``` + +## Further work + +As briefly mentioned in other sections, eventually, we would like to use more complex deployment strategies. +For example, we can pass a fraction of the total compute starts that should use the new release. Then we can +mark the release as `partial` or `canary` and monitor its performance. If everything is fine, we can promote it +to `deployed` status. If not, we can roll back to the previous one. + +## Alternatives + +In theory, we can try using Helm as-is: + +1. Write a compute Helm chart. That will actually have only some config map, which the control plane can access and read. + N.B. We could reuse the control plane chart as well, but then it's not a fully independent release again and even more fuzzy. +2. The control plane will read it and start using the new compute version for new starts. + +Drawbacks: + +1. Helm releases work best if the workload is controlled by the Helm chart itself. Then you can have different + deployment strategies like rolling update or canary or blue/green deployments. At Neon, the compute starts are controlled + by control plane, so it makes it much more tricky. +2. Releases visibility will suffer, i.e. instead of a nice table in the control plane and Admin UI, we would need to use + `helm` cli and/or K8s UIs like K8sLens. +3. We do not restart all computes shortly after the new version release. This means that for some features and compatibility + purpose (see above) control plane may need some auxiliary info from the previous releases. diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 40b7dbbbc2af..0ea30ce54f78 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,4 +1,5 @@ use std::collections::{HashMap, HashSet}; +use std::fmt::Display; use std::str::FromStr; use std::time::{Duration, Instant}; @@ -57,7 +58,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, - pub availability_zone_id: String, + pub availability_zone_id: AvailabilityZone, } #[derive(Serialize, Deserialize)] @@ -74,10 +75,19 @@ pub struct TenantPolicyRequest { pub scheduling: Option, } +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +pub struct AvailabilityZone(pub String); + +impl Display for AvailabilityZone { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + #[derive(Serialize, Deserialize)] pub struct ShardsPreferredAzsRequest { #[serde(flatten)] - pub preferred_az_ids: HashMap, + pub preferred_az_ids: HashMap, } #[derive(Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index c9be53f0b0c0..45abda0ad85d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -37,14 +37,11 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; /// ```mermaid /// stateDiagram-v2 /// -/// [*] --> Loading: spawn_load() /// [*] --> Attaching: spawn_attach() /// -/// Loading --> Activating: activate() /// Attaching --> Activating: activate() /// Activating --> Active: infallible /// -/// Loading --> Broken: load() failure /// Attaching --> Broken: attach() failure /// /// Active --> Stopping: set_stopping(), part of shutdown & detach @@ -68,10 +65,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; )] #[serde(tag = "slug", content = "data")] pub enum TenantState { - /// This tenant is being loaded from local disk. - /// - /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. - Loading, /// This tenant is being attached to the pageserver. /// /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. @@ -121,8 +114,6 @@ impl TenantState { // But, our attach task might still be fetching the remote timelines, etc. // So, return `Maybe` while Attaching, making Console wait for the attach task to finish. Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe, - // tenant mgr startup distinguishes attaching from loading via marker file. - Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached, // We only reach Active after successful load / attach. // So, call atttachment status Attached. Self::Active => Attached, @@ -191,10 +182,11 @@ impl LsnLease { } /// The only [`TenantState`] variants we could be `TenantState::Activating` from. +/// +/// XXX: We used to have more variants here, but now it's just one, which makes this rather +/// useless. Remove, once we've checked that there's no client code left that looks at this. #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum ActivatingFrom { - /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`] - Loading, /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`] Attaching, } @@ -1562,11 +1554,8 @@ mod tests { #[test] fn tenantstatus_activating_serde() { - let states = [ - TenantState::Activating(ActivatingFrom::Loading), - TenantState::Activating(ActivatingFrom::Attaching), - ]; - let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]"; + let states = [TenantState::Activating(ActivatingFrom::Attaching)]; + let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]"; let actual = serde_json::to_string(&states).unwrap(); @@ -1581,13 +1570,7 @@ mod tests { fn tenantstatus_activating_strum() { // tests added, because we use these for metrics let examples = [ - (line!(), TenantState::Loading, "Loading"), (line!(), TenantState::Attaching, "Attaching"), - ( - line!(), - TenantState::Activating(ActivatingFrom::Loading), - "Activating", - ), ( line!(), TenantState::Activating(ActivatingFrom::Attaching), diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index f199b155540f..7d284a6fc567 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -19,6 +19,7 @@ bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true +git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper = { workspace = true, features = ["full"] } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 03fb36caf8b6..aacc1e1dd5e8 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -92,6 +92,10 @@ pub mod toml_edit_ext; pub mod circuit_breaker; +// Re-export used in macro. Avoids adding git-version as dep in target crates. +#[doc(hidden)] +pub use git_version; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: @@ -131,7 +135,7 @@ macro_rules! project_git_version { ($const_identifier:ident) => { // this should try GIT_VERSION first only then git_version::git_version! const $const_identifier: &::core::primitive::str = { - const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! { + const __COMMIT_FROM_GIT: &::core::primitive::str = $crate::git_version::git_version! { prefix = "", fallback = "unknown", args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0eb48d6823b8..f1fc3a86fe4b 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -27,7 +27,6 @@ crc32c.workspace = true either.workspace = true fail.workspace = true futures.workspace = true -git-version.workspace = true hex.workspace = true humantime.workspace = true humantime-serde.workspace = true diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 52b58fc298ce..d4f89ac38a19 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -12,7 +12,6 @@ anyhow.workspace = true async-stream.workspace = true clap = { workspace = true, features = ["string"] } futures.workspace = true -git-version.workspace = true itertools.workspace = true once_cell.workspace = true pageserver_api.workspace = true diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 9592002de131..a753f806a076 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -10,7 +10,6 @@ license.workspace = true anyhow.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } -git-version.workspace = true humantime.workspace = true pageserver = { path = ".." } pageserver_api.workspace = true diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index f6d1c35a8ce1..d0a967b9207e 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use futures::Future; use pageserver_api::{ - controller_api::NodeRegisterRequest, + controller_api::{AvailabilityZone, NodeRegisterRequest}, shard::TenantShardId, upcall_api::{ ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, @@ -148,10 +148,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .and_then(|jv| jv.as_str().map(|str| str.to_owned())); match az_id_from_metadata { - Some(az_id) => Some(az_id), + Some(az_id) => Some(AvailabilityZone(az_id)), None => { tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); - conf.availability_zone.clone() + conf.availability_zone.clone().map(AvailabilityZone) } } }; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 162e8d1836ff..366bd8290340 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3208,45 +3208,38 @@ pub(crate) mod tenant_throttling { impl TimelineGet { pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self { + let per_tenant_label_values = &[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]; TimelineGet { count_accounted_start: { GlobalAndPerTenantIntCounter { global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]), - per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]), + per_tenant: COUNT_ACCOUNTED_START_PER_TENANT + .with_label_values(per_tenant_label_values), } }, count_accounted_finish: { GlobalAndPerTenantIntCounter { global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]), - per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]), + per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT + .with_label_values(per_tenant_label_values), } }, wait_time: { GlobalAndPerTenantIntCounter { global: WAIT_USECS.with_label_values(&[KIND]), - per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]), + per_tenant: WAIT_USECS_PER_TENANT + .with_label_values(per_tenant_label_values), } }, count_throttled: { GlobalAndPerTenantIntCounter { global: WAIT_COUNT.with_label_values(&[KIND]), - per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]), + per_tenant: WAIT_COUNT_PER_TENANT + .with_label_values(per_tenant_label_values), } }, } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5ed63734f494..53cbaea621eb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1968,9 +1968,6 @@ impl Tenant { TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => { panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state); } - TenantState::Loading => { - *current_state = TenantState::Activating(ActivatingFrom::Loading); - } TenantState::Attaching => { *current_state = TenantState::Activating(ActivatingFrom::Attaching); } @@ -2151,7 +2148,7 @@ impl Tenant { async fn set_stopping( &self, progress: completion::Barrier, - allow_transition_from_loading: bool, + _allow_transition_from_loading: bool, allow_transition_from_attaching: bool, ) -> Result<(), SetStoppingError> { let mut rx = self.state.subscribe(); @@ -2166,7 +2163,6 @@ impl Tenant { ); false } - TenantState::Loading => allow_transition_from_loading, TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true, }) .await @@ -2185,13 +2181,6 @@ impl Tenant { *current_state = TenantState::Stopping { progress }; true } - TenantState::Loading => { - if !allow_transition_from_loading { - unreachable!("3we ensured above that we're done with activation, and, there is no re-activation") - }; - *current_state = TenantState::Stopping { progress }; - true - } TenantState::Active => { // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines // are created after the transition to Stopping. That's harmless, as the Timelines @@ -2247,7 +2236,7 @@ impl Tenant { // The load & attach routines own the tenant state until it has reached `Active`. // So, wait until it's done. rx.wait_for(|state| match state { - TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + TenantState::Activating(_) | TenantState::Attaching => { info!( "waiting for {} to turn Active|Broken|Stopping", <&'static str>::from(state) @@ -2267,7 +2256,7 @@ impl Tenant { let reason = reason.to_string(); self.state.send_modify(|current_state| { match *current_state { - TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + TenantState::Activating(_) | TenantState::Attaching => { unreachable!("we ensured above that we're done with activation, and, there is no re-activation") } TenantState::Active => { @@ -2311,7 +2300,7 @@ impl Tenant { loop { let current_state = receiver.borrow_and_update().clone(); match current_state { - TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => { + TenantState::Attaching | TenantState::Activating(_) => { // in these states, there's a chance that we can reach ::Active self.activate_now(); match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await { @@ -4144,7 +4133,7 @@ pub(crate) mod harness { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); let tenant = Arc::new(Tenant::new( - TenantState::Loading, + TenantState::Attaching, self.conf, AttachedTenantConf::try_from(LocationConf::attached_single( TenantConfOpt::from(self.tenant_conf.clone()), diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 34f1b15138ec..2b212cfed5d7 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadCoalesceMode, VectoredReadPlanner, }; use crate::tenant::PageReconstructError; @@ -1021,13 +1021,30 @@ impl DeltaLayerInner { continue; } }; - + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter().rev() { if Some(meta.meta.key) == ignore_key_with_err { continue; } + let blob_read = meta.read(&view).await; + let blob_read = match blob_read { + Ok(buf) => buf, + Err(e) => { + reconstruct_state.on_key_error( + meta.meta.key, + PageReconstructError::Other(anyhow!(e).context(format!( + "Failed to decompress blob from virtual file {}", + self.file.path, + ))), + ); + + ignore_key_with_err = Some(meta.meta.key); + continue; + } + }; + + let value = Value::des(&blob_read); - let value = Value::des(&blobs_buf.buf[meta.start..meta.end]); let value = match value { Ok(v) => v, Err(e) => { @@ -1243,21 +1260,21 @@ impl DeltaLayerInner { buf.reserve(read.size()); let res = reader.read_blobs(&read, buf, ctx).await?; + let view = BufView::new_slice(&res.buf); + for blob in res.blobs { let key = blob.meta.key; let lsn = blob.meta.lsn; - let data = &res.buf[blob.start..blob.end]; + + let data = blob.read(&view).await?; #[cfg(debug_assertions)] - Value::des(data) + Value::des(&data) .with_context(|| { format!( - "blob failed to deserialize for {}@{}, {}..{}: {:?}", - blob.meta.key, - blob.meta.lsn, - blob.start, - blob.end, - utils::Hex(data) + "blob failed to deserialize for {}: {:?}", + blob, + utils::Hex(&data) ) }) .unwrap(); @@ -1265,15 +1282,15 @@ impl DeltaLayerInner { // is it an image or will_init walrecord? // FIXME: this could be handled by threading the BlobRef to the // VectoredReadBuilder - let will_init = crate::repository::ValueBytes::will_init(data) + let will_init = crate::repository::ValueBytes::will_init(&data) .inspect_err(|_e| { #[cfg(feature = "testing")] - tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); + tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); }) .unwrap_or(false); per_blob_copy.clear(); - per_blob_copy.extend_from_slice(data); + per_blob_copy.extend_from_slice(&data); let (tmp, res) = writer .put_value_bytes( @@ -1538,8 +1555,11 @@ impl<'a> DeltaLayerIterator<'a> { .read_blobs(&plan, buf, self.ctx) .await?; let frozen_buf = blobs_buf.buf.freeze(); + let view = BufView::new_bytes(frozen_buf); for meta in blobs_buf.blobs.iter() { - let value = Value::des(&frozen_buf[meta.start..meta.end])?; + let blob_read = meta.read(&view).await?; + let value = Value::des(&blob_read)?; + next_batch.push_back((meta.meta.key, meta.meta.lsn, value)); } self.key_values_batch = next_batch; @@ -1916,9 +1936,13 @@ pub(crate) mod test { let blobs_buf = vectored_blob_reader .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx) .await?; + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { - let value = &blobs_buf.buf[meta.start..meta.end]; - assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]); + let value = meta.read(&view).await?; + assert_eq!( + &value[..], + &entries_meta.index[&(meta.meta.key, meta.meta.lsn)] + ); } buf = Some(blobs_buf.buf); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 5de2582ab79f..940d169db096 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -36,7 +36,8 @@ use crate::tenant::disk_btree::{ }; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; @@ -547,15 +548,15 @@ impl ImageLayerInner { let buf = BytesMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; - let frozen_buf = blobs_buf.buf.freeze(); + let view = BufView::new_bytes(frozen_buf); for meta in blobs_buf.blobs.iter() { - let img_buf = frozen_buf.slice(meta.start..meta.end); + let img_buf = meta.read(&view).await?; key_count += 1; writer - .put_image(meta.meta.key, img_buf, ctx) + .put_image(meta.meta.key, img_buf.into_bytes(), ctx) .await .context(format!("Storing key {}", meta.meta.key))?; } @@ -602,13 +603,28 @@ impl ImageLayerInner { match res { Ok(blobs_buf) => { let frozen_buf = blobs_buf.buf.freeze(); - + let view = BufView::new_bytes(frozen_buf); for meta in blobs_buf.blobs.iter() { - let img_buf = frozen_buf.slice(meta.start..meta.end); + let img_buf = meta.read(&view).await; + + let img_buf = match img_buf { + Ok(img_buf) => img_buf, + Err(e) => { + reconstruct_state.on_key_error( + meta.meta.key, + PageReconstructError::Other(anyhow!(e).context(format!( + "Failed to decompress blob from virtual file {}", + self.file.path, + ))), + ); + + continue; + } + }; reconstruct_state.update_key( &meta.meta.key, self.lsn, - Value::Image(img_buf), + Value::Image(img_buf.into_bytes()), ); } } @@ -1025,10 +1041,15 @@ impl<'a> ImageLayerIterator<'a> { let blobs_buf = vectored_blob_reader .read_blobs(&plan, buf, self.ctx) .await?; - let frozen_buf: Bytes = blobs_buf.buf.freeze(); + let frozen_buf = blobs_buf.buf.freeze(); + let view = BufView::new_bytes(frozen_buf); for meta in blobs_buf.blobs.iter() { - let img_buf = frozen_buf.slice(meta.start..meta.end); - next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf))); + let img_buf = meta.read(&view).await?; + next_batch.push_back(( + meta.meta.key, + self.image_layer.lsn, + Value::Image(img_buf.into_bytes()), + )); } self.key_values_batch = next_batch; Ok(()) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 341febb30ab9..3f0f8a21c8a5 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -481,8 +481,7 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken let allowed_rps = tenant.timeline_get_throttle.steady_rps(); let delta = now - prev; info!( - n_seconds=%format_args!("{:.3}", - delta.as_secs_f64()), + n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), count_accounted = count_accounted_finish, // don't break existing log scraping count_throttled, sum_throttled_usecs, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c98efd5f7184..d301ba23eafa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -112,7 +112,7 @@ use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; use postgres_connection::PgConnectionConfig; -use postgres_ffi::to_pg_timestamp; +use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE}; use utils::{ completion, generation::Generation, @@ -1337,6 +1337,10 @@ impl Timeline { _ctx: &RequestContext, ) -> anyhow::Result { let lease = { + // Normalize the requested LSN to be aligned, and move to the first record + // if it points to the beginning of the page (header). + let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE); + let mut gc_info = self.gc_info.write().unwrap(); let valid_until = SystemTime::now() + length; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 2f6cb4d73a69..26c2861b9308 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,8 +30,8 @@ use crate::{ pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError, - LogicalSizeCalculationCause, Tenant, + size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint, + tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, }; @@ -557,6 +557,8 @@ impl Timeline { gather_result = gather => { match gather_result { Ok(_) => {}, + // It can happen sometimes that we hit this instead of the cancellation token firing above + Err(CalculateSyntheticSizeError::Cancelled) => {} Err(e) => { // We don't care about the result, but, if it failed, we should log it, // since consumption metric might be hitting the cached value and diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 553edf6d8b34..aa37a45898bd 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -16,8 +16,9 @@ //! Note that the vectored blob api does *not* go through the page cache. use std::collections::BTreeMap; +use std::ops::Deref; -use bytes::BytesMut; +use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::BoundedBuf; @@ -35,11 +36,123 @@ pub struct BlobMeta { pub lsn: Lsn, } -/// Blob offsets into [`VectoredBlobsBuf::buf`] +/// A view into the vectored blobs read buffer. +#[derive(Clone, Debug)] +pub(crate) enum BufView<'a> { + Slice(&'a [u8]), + Bytes(bytes::Bytes), +} + +impl<'a> BufView<'a> { + /// Creates a new slice-based view on the blob. + pub fn new_slice(slice: &'a [u8]) -> Self { + Self::Slice(slice) + } + + /// Creates a new [`bytes::Bytes`]-based view on the blob. + pub fn new_bytes(bytes: bytes::Bytes) -> Self { + Self::Bytes(bytes) + } + + /// Convert the view into `Bytes`. + /// + /// If using slice as the underlying storage, the copy will be an O(n) operation. + pub fn into_bytes(self) -> Bytes { + match self { + BufView::Slice(slice) => Bytes::copy_from_slice(slice), + BufView::Bytes(bytes) => bytes, + } + } + + /// Creates a sub-view of the blob based on the range. + fn view(&self, range: std::ops::Range) -> Self { + match self { + BufView::Slice(slice) => BufView::Slice(&slice[range]), + BufView::Bytes(bytes) => BufView::Bytes(bytes.slice(range)), + } + } +} + +impl<'a> Deref for BufView<'a> { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + match self { + BufView::Slice(slice) => slice, + BufView::Bytes(bytes) => bytes, + } + } +} + +impl<'a> AsRef<[u8]> for BufView<'a> { + fn as_ref(&self) -> &[u8] { + match self { + BufView::Slice(slice) => slice, + BufView::Bytes(bytes) => bytes.as_ref(), + } + } +} + +impl<'a> From<&'a [u8]> for BufView<'a> { + fn from(value: &'a [u8]) -> Self { + Self::new_slice(value) + } +} + +impl From for BufView<'_> { + fn from(value: Bytes) -> Self { + Self::new_bytes(value) + } +} + +/// Blob offsets into [`VectoredBlobsBuf::buf`]. The byte ranges is potentially compressed, +/// subject to [`VectoredBlob::compression_bits`]. pub struct VectoredBlob { - pub start: usize, - pub end: usize, + /// Blob metadata. pub meta: BlobMeta, + /// Start offset. + start: usize, + /// End offset. + end: usize, + /// Compression used on the the blob. + compression_bits: u8, +} + +impl VectoredBlob { + /// Reads a decompressed view of the blob. + pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result, std::io::Error> { + let view = buf.view(self.start..self.end); + + match self.compression_bits { + BYTE_UNCOMPRESSED => Ok(view), + BYTE_ZSTD => { + let mut decompressed_vec = Vec::new(); + let mut decoder = + async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec); + decoder.write_all(&view).await?; + decoder.flush().await?; + // Zero-copy conversion from `Vec` to `Bytes` + Ok(BufView::new_bytes(Bytes::from(decompressed_vec))) + } + bits => { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end), + ); + Err(error) + } + } + } +} + +impl std::fmt::Display for VectoredBlob { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}@{}, {}..{}", + self.meta.key, self.meta.lsn, self.start, self.end + ) + } } /// Return type of [`VectoredBlobReader::read_blobs`] @@ -514,7 +627,7 @@ impl<'a> VectoredBlobReader<'a> { ); } - let mut buf = self + let buf = self .file .read_exact_at(buf.slice(0..read.size()), read.start, ctx) .await? @@ -529,9 +642,6 @@ impl<'a> VectoredBlobReader<'a> { // of a blob is implicit: the start of the next blob if one exists // or the end of the read. - // Some scratch space, put here for reusing the allocation - let mut decompressed_vec = Vec::new(); - for (blob_start, meta) in blobs_at { let blob_start_in_buf = blob_start - start_offset; let first_len_byte = buf[blob_start_in_buf as usize]; @@ -557,35 +667,14 @@ impl<'a> VectoredBlobReader<'a> { ) }; - let start_raw = blob_start_in_buf + size_length; - let end_raw = start_raw + blob_size; - let (start, end); - if compression_bits == BYTE_UNCOMPRESSED { - start = start_raw as usize; - end = end_raw as usize; - } else if compression_bits == BYTE_ZSTD { - let mut decoder = - async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec); - decoder - .write_all(&buf[start_raw as usize..end_raw as usize]) - .await?; - decoder.flush().await?; - start = buf.len(); - buf.extend_from_slice(&decompressed_vec); - end = buf.len(); - decompressed_vec.clear(); - } else { - let error = std::io::Error::new( - std::io::ErrorKind::InvalidData, - format!("invalid compression byte {compression_bits:x}"), - ); - return Err(error); - } + let start = (blob_start_in_buf + size_length) as usize; + let end = start + blob_size as usize; metas.push(VectoredBlob { start, end, meta: *meta, + compression_bits, }); } @@ -1020,8 +1109,13 @@ mod tests { let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); let read_blob = &result.blobs[0]; - let read_buf = &result.buf[read_blob.start..read_blob.end]; - assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}"); + let view = BufView::new_slice(&result.buf); + let read_buf = read_blob.read(&view).await?; + assert_eq!( + &blob[..], + &read_buf[..], + "mismatch for idx={idx} at offset={offset}" + ); buf = result.buf; } Ok(()) diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index 3e86d5b26276..de653826c019 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -32,7 +32,7 @@ NeonPerfCountersShmemSize(void) return size; } -bool +void NeonPerfCountersShmemInit(void) { bool found; diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index ae35e8c3a515..02163ada5571 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -105,7 +105,7 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared; extern void inc_getpage_wait(uint64 latency); extern Size NeonPerfCountersShmemSize(void); -extern bool NeonPerfCountersShmemInit(void); +extern void NeonPerfCountersShmemInit(void); #endif /* NEON_PERF_COUNTERS_H */ diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 6703eb06eb29..501ce050e071 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -29,7 +29,6 @@ dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true futures.workspace = true -git-version.workspace = true hashbrown.workspace = true hashlink.workspace = true hex.workspace = true diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index daf21c70b045..67f32b3cc08b 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -21,7 +21,6 @@ chrono.workspace = true clap = { workspace = true, features = ["derive"] } crc32c.workspace = true fail.workspace = true -git-version.workspace = true hex.workspace = true humantime.workspace = true hyper.workspace = true diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 82ec0aa272e3..5359f586e49d 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -15,7 +15,6 @@ const_format.workspace = true futures.workspace = true futures-core.workspace = true futures-util.workspace = true -git-version.workspace = true humantime.workspace = true hyper = { workspace = true, features = ["full"] } once_cell.workspace = true diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index a96d64e09670..9ed0501026dc 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -20,7 +20,6 @@ chrono.workspace = true clap.workspace = true fail.workspace = true futures.workspace = true -git-version.workspace = true hex.workspace = true hyper.workspace = true humantime.workspace = true diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 95e4a469ac95..4dd8badd0391 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -515,7 +515,7 @@ async fn handle_tenant_timeline_passthrough( tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); // Find the node that holds shard zero - let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?; + let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?; // Callers will always pass an unsharded tenant ID. Before proxying, we must // rewrite this to a shard-aware shard zero ID. @@ -545,10 +545,10 @@ async fn handle_tenant_timeline_passthrough( let _timer = latency.start_timer(labels.clone()); let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref()); - let resp = client.get_raw(path).await.map_err(|_e| - // FIXME: give APiError a proper Unavailable variant. We return 503 here because - // if we can't successfully send a request to the pageserver, we aren't available. - ApiError::ShuttingDown)?; + let resp = client.get_raw(path).await.map_err(|e| + // We return 503 here because if we can't successfully send a request to the pageserver, + // either we aren't available or the pageserver is unavailable. + ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?; if !resp.status().is_success() { let error_counter = &METRICS_REGISTRY @@ -557,6 +557,19 @@ async fn handle_tenant_timeline_passthrough( error_counter.inc(labels); } + // Transform 404 into 503 if we raced with a migration + if resp.status() == reqwest::StatusCode::NOT_FOUND { + // Look up node again: if we migrated it will be different + let (new_node, _tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?; + if new_node.get_id() != node.get_id() { + // Rather than retry here, send the client a 503 to prompt a retry: this matches + // the pageserver's use of 503, and all clients calling this API should retry on 503. + return Err(ApiError::ResourceUnavailable( + format!("Pageserver {node} returned 404, was migrated to {new_node}").into(), + )); + } + } + // We have a reqest::Response, would like a http::Response let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?); for (k, v) in resp.headers() { diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index cb9ce10d230a..4cc9b0070dc7 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -2,8 +2,8 @@ use std::{str::FromStr, time::Duration}; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, - TenantLocateResponseShard, + AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, + NodeSchedulingPolicy, TenantLocateResponseShard, }, shard::TenantShardId, }; @@ -36,7 +36,7 @@ pub(crate) struct Node { listen_pg_addr: String, listen_pg_port: u16, - availability_zone_id: String, + availability_zone_id: AvailabilityZone, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. @@ -64,8 +64,8 @@ impl Node { } #[allow(unused)] - pub(crate) fn get_availability_zone_id(&self) -> &str { - self.availability_zone_id.as_str() + pub(crate) fn get_availability_zone_id(&self) -> &AvailabilityZone { + &self.availability_zone_id } pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { @@ -181,7 +181,7 @@ impl Node { listen_http_port: u16, listen_pg_addr: String, listen_pg_port: u16, - availability_zone_id: String, + availability_zone_id: AvailabilityZone, ) -> Self { Self { id, @@ -204,7 +204,7 @@ impl Node { listen_http_port: self.listen_http_port as i32, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, - availability_zone_id: self.availability_zone_id.clone(), + availability_zone_id: self.availability_zone_id.0.clone(), } } @@ -219,7 +219,7 @@ impl Node { listen_http_port: np.listen_http_port as u16, listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, - availability_zone_id: np.availability_zone_id, + availability_zone_id: AvailabilityZone(np.availability_zone_id), cancel: CancellationToken::new(), } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 1dc1040d9637..14cc51240d10 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -9,6 +9,7 @@ use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; use itertools::Itertools; +use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; @@ -667,8 +668,8 @@ impl Persistence { pub(crate) async fn set_tenant_shard_preferred_azs( &self, - preferred_azs: Vec<(TenantShardId, String)>, - ) -> DatabaseResult> { + preferred_azs: Vec<(TenantShardId, AvailabilityZone)>, + ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { @@ -679,7 +680,7 @@ impl Persistence { .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az)) + .set(preferred_az_id.eq(preferred_az.0.clone())) .execute(conn)?; if updated == 1 { diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 750bcd7c0138..2c42da404355 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -463,7 +463,7 @@ impl Reconciler { for (timeline_id, baseline_lsn) in &baseline { match latest.get(timeline_id) { Some(latest_lsn) => { - tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); + tracing::info!(timeline_id = %timeline_id, "🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); if latest_lsn < baseline_lsn { any_behind = true; } @@ -541,6 +541,8 @@ impl Reconciler { } } + pausable_failpoint!("reconciler-live-migrate-pre-generation-inc"); + // Increment generation before attaching to new pageserver self.generation = Some( self.persistence @@ -617,6 +619,8 @@ impl Reconciler { }, ); + pausable_failpoint!("reconciler-live-migrate-post-detach"); + tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",); let dest_final_conf = build_location_config( &self.shard, diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 1cb1fb104d60..2414d95eb89b 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,6 +1,6 @@ use crate::{node::Node, tenant_shard::TenantShard}; use itertools::Itertools; -use pageserver_api::models::PageserverUtilization; +use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; use serde::Serialize; use std::{collections::HashMap, fmt::Debug}; use utils::{http::error::ApiError, id::NodeId}; @@ -32,6 +32,8 @@ pub(crate) struct SchedulerNode { shard_count: usize, /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. attached_shard_count: usize, + /// Availability zone id in which the node resides + az: AvailabilityZone, /// Whether this node is currently elegible to have new shards scheduled (this is derived /// from a node's availability state and scheduling policy). @@ -42,6 +44,7 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized { fn generate( node_id: &NodeId, node: &mut SchedulerNode, + preferred_az: &Option, context: &ScheduleContext, ) -> Option; fn is_overloaded(&self) -> bool; @@ -62,6 +65,72 @@ impl ShardTag for SecondaryShardTag { type Score = NodeSecondarySchedulingScore; } +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +enum AzMatch { + Yes, + No, + Unknown, +} + +impl AzMatch { + fn new(node_az: &AvailabilityZone, shard_preferred_az: Option<&AvailabilityZone>) -> Self { + match shard_preferred_az { + Some(preferred_az) if preferred_az == node_az => Self::Yes, + Some(_preferred_az) => Self::No, + None => Self::Unknown, + } + } +} + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +struct AttachmentAzMatch(AzMatch); + +impl Ord for AttachmentAzMatch { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Lower scores indicate a more suitable node. + // Note that we prefer a node for which we don't have + // info to a node which we are certain doesn't match the + // preferred AZ of the shard. + let az_match_score = |az_match: &AzMatch| match az_match { + AzMatch::Yes => 0, + AzMatch::Unknown => 1, + AzMatch::No => 2, + }; + + az_match_score(&self.0).cmp(&az_match_score(&other.0)) + } +} + +impl PartialOrd for AttachmentAzMatch { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +struct SecondaryAzMatch(AzMatch); + +impl Ord for SecondaryAzMatch { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Lower scores indicate a more suitable node. + // For secondary locations we wish to avoid the preferred AZ + // of the shard. + let az_match_score = |az_match: &AzMatch| match az_match { + AzMatch::No => 0, + AzMatch::Unknown => 1, + AzMatch::Yes => 2, + }; + + az_match_score(&self.0).cmp(&az_match_score(&other.0)) + } +} + +impl PartialOrd for SecondaryAzMatch { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + /// Scheduling score of a given node for shard attachments. /// Lower scores indicate more suitable nodes. /// Ordering is given by member declaration order (top to bottom). @@ -70,6 +139,10 @@ pub(crate) struct NodeAttachmentSchedulingScore { /// The number of shards belonging to the tenant currently being /// scheduled that are attached to this node. affinity_score: AffinityScore, + /// Flag indicating whether this node matches the preferred AZ + /// of the shard. For equal affinity scores, nodes in the matching AZ + /// are considered first. + az_match: AttachmentAzMatch, /// Size of [`ScheduleContext::attached_nodes`] for the current node. /// This normally tracks the number of attached shards belonging to the /// tenant being scheduled that are already on this node. @@ -87,6 +160,7 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { fn generate( node_id: &NodeId, node: &mut SchedulerNode, + preferred_az: &Option, context: &ScheduleContext, ) -> Option { let utilization = match &mut node.may_schedule { @@ -102,6 +176,7 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { .get(node_id) .copied() .unwrap_or(AffinityScore::FREE), + az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())), attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0), utilization_score: utilization.cached_score(), total_attached_shard_count: node.attached_shard_count, @@ -123,6 +198,11 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { /// Ordering is given by member declaration order (top to bottom). #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] pub(crate) struct NodeSecondarySchedulingScore { + /// Flag indicating whether this node matches the preferred AZ + /// of the shard. For secondary locations we wish to avoid nodes in. + /// the preferred AZ of the shard, since that's where the attached location + /// should be scheduled and having the secondary in the same AZ is bad for HA. + az_match: SecondaryAzMatch, /// The number of shards belonging to the tenant currently being /// scheduled that are attached to this node. affinity_score: AffinityScore, @@ -139,6 +219,7 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore { fn generate( node_id: &NodeId, node: &mut SchedulerNode, + preferred_az: &Option, context: &ScheduleContext, ) -> Option { let utilization = match &mut node.may_schedule { @@ -149,6 +230,7 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore { }; Some(Self { + az_match: SecondaryAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())), affinity_score: context .nodes .get(node_id) @@ -179,6 +261,7 @@ impl PartialEq for SchedulerNode { may_schedule_matches && self.shard_count == other.shard_count && self.attached_shard_count == other.attached_shard_count + && self.az == other.az } } @@ -293,6 +376,7 @@ impl Scheduler { shard_count: 0, attached_shard_count: 0, may_schedule: node.may_schedule(), + az: node.get_availability_zone_id().clone(), }, ); } @@ -319,6 +403,7 @@ impl Scheduler { shard_count: 0, attached_shard_count: 0, may_schedule: node.may_schedule(), + az: node.get_availability_zone_id().clone(), }, ); } @@ -497,6 +582,7 @@ impl Scheduler { shard_count: 0, attached_shard_count: 0, may_schedule: node.may_schedule(), + az: node.get_availability_zone_id().clone(), }); } } @@ -542,6 +628,7 @@ impl Scheduler { fn compute_node_scores( &mut self, hard_exclude: &[NodeId], + preferred_az: &Option, context: &ScheduleContext, ) -> Vec where @@ -553,7 +640,7 @@ impl Scheduler { if hard_exclude.contains(k) { None } else { - Score::generate(k, v, context) + Score::generate(k, v, preferred_az, context) } }) .collect() @@ -571,13 +658,15 @@ impl Scheduler { pub(crate) fn schedule_shard( &mut self, hard_exclude: &[NodeId], + preferred_az: &Option, context: &ScheduleContext, ) -> Result { if self.nodes.is_empty() { return Err(ScheduleError::NoPageservers); } - let mut scores = self.compute_node_scores::(hard_exclude, context); + let mut scores = + self.compute_node_scores::(hard_exclude, preferred_az, context); // Exclude nodes whose utilization is critically high, if there are alternatives available. This will // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example @@ -634,6 +723,12 @@ impl Scheduler { Ok(node_id) } + /// Selects any available node. This is suitable for performing background work (e.g. S3 + /// deletions). + pub(crate) fn any_available_node(&mut self) -> Result { + self.schedule_shard::(&[], &None, &ScheduleContext::default()) + } + /// Unit test access to internal state #[cfg(test)] pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize { @@ -650,13 +745,22 @@ impl Scheduler { pub(crate) mod test_utils { use crate::node::Node; - use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; + use pageserver_api::{ + controller_api::{AvailabilityZone, NodeAvailability}, + models::utilization::test_utilization, + }; use std::collections::HashMap; use utils::id::NodeId; + /// Test helper: synthesize the requested number of nodes, all in active state. /// /// Node IDs start at one. - pub(crate) fn make_test_nodes(n: u64) -> HashMap { + /// + /// The `azs` argument specifies the list of availability zones which will be assigned + /// to nodes in round-robin fashion. If empy, a default AZ is assigned. + pub(crate) fn make_test_nodes(n: u64, azs: &[AvailabilityZone]) -> HashMap { + let mut az_iter = azs.iter().cycle(); + (1..n + 1) .map(|i| { (NodeId(i), { @@ -666,7 +770,10 @@ pub(crate) mod test_utils { 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, - "test-az".to_string(), + az_iter + .next() + .cloned() + .unwrap_or(AvailabilityZone("test-az".to_string())), ); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); @@ -686,7 +793,7 @@ mod tests { use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { - let nodes = test_utils::make_test_nodes(2); + let nodes = test_utils::make_test_nodes(2, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut t1_intent = IntentState::new(); @@ -694,9 +801,9 @@ mod tests { let context = ScheduleContext::default(); - let scheduled = scheduler.schedule_shard::(&[], &context)?; + let scheduled = scheduler.schedule_shard::(&[], &None, &context)?; t1_intent.set_attached(&mut scheduler, Some(scheduled)); - let scheduled = scheduler.schedule_shard::(&[], &context)?; + let scheduled = scheduler.schedule_shard::(&[], &None, &context)?; t2_intent.set_attached(&mut scheduler, Some(scheduled)); assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); @@ -705,8 +812,11 @@ mod tests { assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); - let scheduled = - scheduler.schedule_shard::(&t1_intent.all_pageservers(), &context)?; + let scheduled = scheduler.schedule_shard::( + &t1_intent.all_pageservers(), + &None, + &context, + )?; t1_intent.push_secondary(&mut scheduler, scheduled); assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); @@ -746,7 +856,7 @@ mod tests { #[test] /// Test the PageserverUtilization's contribution to scheduling algorithm fn scheduler_utilization() { - let mut nodes = test_utils::make_test_nodes(3); + let mut nodes = test_utils::make_test_nodes(3, &[]); let mut scheduler = Scheduler::new(nodes.values()); // Need to keep these alive because they contribute to shard counts via RAII @@ -761,7 +871,7 @@ mod tests { context: &ScheduleContext, ) { let scheduled = scheduler - .schedule_shard::(&[], context) + .schedule_shard::(&[], &None, context) .unwrap(); let mut intent = IntentState::new(); intent.set_attached(scheduler, Some(scheduled)); @@ -870,4 +980,98 @@ mod tests { intent.clear(&mut scheduler); } } + + #[test] + /// A simple test that showcases AZ-aware scheduling and its interaction with + /// affinity scores. + fn az_scheduling() { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + + let nodes = test_utils::make_test_nodes(3, &[az_a_tag.clone(), az_b_tag.clone()]); + let mut scheduler = Scheduler::new(nodes.values()); + + // Need to keep these alive because they contribute to shard counts via RAII + let mut scheduled_intents = Vec::new(); + + let mut context = ScheduleContext::default(); + + fn assert_scheduler_chooses( + expect_node: NodeId, + preferred_az: Option, + scheduled_intents: &mut Vec, + scheduler: &mut Scheduler, + context: &mut ScheduleContext, + ) { + let scheduled = scheduler + .schedule_shard::(&[], &preferred_az, context) + .unwrap(); + let mut intent = IntentState::new(); + intent.set_attached(scheduler, Some(scheduled)); + scheduled_intents.push(intent); + assert_eq!(scheduled, expect_node); + + context.avoid(&[scheduled]); + } + + assert_scheduler_chooses::( + NodeId(1), + Some(az_a_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + // Node 2 and 3 have affinity score equal to 0, but node 3 + // is in "az-a" so we prefer that. + assert_scheduler_chooses::( + NodeId(3), + Some(az_a_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + // Node 2 is not in "az-a", but it has the lowest affinity so we prefer that. + assert_scheduler_chooses::( + NodeId(2), + Some(az_a_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + // Avoid nodes in "az-a" for the secondary location. + assert_scheduler_chooses::( + NodeId(2), + Some(az_a_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + // Avoid nodes in "az-b" for the secondary location. + // Nodes 1 and 3 are identically loaded, so prefer the lowest node id. + assert_scheduler_chooses::( + NodeId(1), + Some(az_b_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + // Avoid nodes in "az-b" for the secondary location. + // Node 3 has lower affinity score than 1, so prefer that. + assert_scheduler_chooses::( + NodeId(3), + Some(az_b_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + for mut intent in scheduled_intents { + intent.clear(&mut scheduler); + } + } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 5555505b81d9..a5e012968475 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -26,7 +26,7 @@ use crate::{ ShardGenerationState, TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, - scheduler::{AttachedShardTag, MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, + scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, ScheduleOptimizationAction, @@ -1265,6 +1265,8 @@ impl Service { #[cfg(feature = "testing")] { + use pageserver_api::controller_api::AvailabilityZone; + // Hack: insert scheduler state for all nodes referenced by shards, as compatibility // tests only store the shards, not the nodes. The nodes will be loaded shortly // after when pageservers start up and register. @@ -1282,7 +1284,7 @@ impl Service { 123, "".to_string(), 123, - "test_az".to_string(), + AvailabilityZone("test_az".to_string()), ); scheduler.node_upsert(&node); @@ -2099,7 +2101,7 @@ impl Service { let az_id = locked .nodes .get(&resp.node_id) - .map(|n| n.get_availability_zone_id().to_string())?; + .map(|n| n.get_availability_zone_id().clone())?; Some((resp.shard_id, az_id)) }) @@ -2629,8 +2631,7 @@ impl Service { let scheduler = &mut locked.scheduler; // Right now we only perform the operation on a single node without parallelization // TODO fan out the operation to multiple nodes for better performance - let node_id = - scheduler.schedule_shard::(&[], &ScheduleContext::default())?; + let node_id = scheduler.any_available_node()?; let node = locked .nodes .get(&node_id) @@ -2816,8 +2817,7 @@ impl Service { // Pick an arbitrary node to use for remote deletions (does not have to be where the tenant // was attached, just has to be able to see the S3 content) - let node_id = - scheduler.schedule_shard::(&[], &ScheduleContext::default())?; + let node_id = scheduler.any_available_node()?; let node = nodes .get(&node_id) .expect("Pageservers may not be deleted while lock is active"); @@ -3508,34 +3508,66 @@ impl Service { /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound) - pub(crate) fn tenant_shard0_node( + pub(crate) async fn tenant_shard0_node( &self, tenant_id: TenantId, ) -> Result<(Node, TenantShardId), ApiError> { - let locked = self.inner.read().unwrap(); - let Some((tenant_shard_id, shard)) = locked - .tenants - .range(TenantShardId::tenant_range(tenant_id)) - .next() - else { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant {tenant_id} not found").into(), - )); + // Look up in-memory state and maybe use the node from there. + { + let locked = self.inner.read().unwrap(); + let Some((tenant_shard_id, shard)) = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {tenant_id} not found").into(), + )); + }; + + let Some(intent_node_id) = shard.intent.get_attached() else { + tracing::warn!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Shard not scheduled (policy {:?}), cannot generate pass-through URL", + shard.policy + ); + return Err(ApiError::Conflict( + "Cannot call timeline API on non-attached tenant".to_string(), + )); + }; + + if shard.reconciler.is_none() { + // Optimization: while no reconcile is in flight, we may trust our in-memory state + // to tell us which pageserver to use. Otherwise we will fall through and hit the database + let Some(node) = locked.nodes.get(intent_node_id) else { + // This should never happen + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard refers to nonexistent node" + ))); + }; + return Ok((node.clone(), *tenant_shard_id)); + } }; - // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might - // point to somewhere we haven't attached yet. - let Some(node_id) = shard.intent.get_attached() else { - tracing::warn!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Shard not scheduled (policy {:?}), cannot generate pass-through URL", - shard.policy - ); - return Err(ApiError::Conflict( - "Cannot call timeline API on non-attached tenant".to_string(), + // Look up the latest attached pageserver location from the database + // generation state: this will reflect the progress of any ongoing migration. + // Note that it is not guaranteed to _stay_ here, our caller must still handle + // the case where they call through to the pageserver and get a 404. + let db_result = self.persistence.tenant_generations(tenant_id).await?; + let Some(ShardGenerationState { + tenant_shard_id, + generation: _, + generation_pageserver: Some(node_id), + }) = db_result.first() + else { + // This can happen if we raced with a tenant deletion or a shard split. On a retry + // the caller will either succeed (shard split case), get a proper 404 (deletion case), + // or a conflict response (case where tenant was detached in background) + return Err(ApiError::ResourceUnavailable( + "Shard {} not found in database, or is not attached".into(), )); }; - + let locked = self.inner.read().unwrap(); let Some(node) = locked.nodes.get(node_id) else { // This should never happen return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -4481,7 +4513,7 @@ impl Service { let az_id = locked .nodes .get(node_id) - .map(|n| n.get_availability_zone_id().to_string())?; + .map(|n| n.get_availability_zone_id().clone())?; Some((*tid, az_id)) }) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 1f5eb423be8b..afc89eae0073 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -15,7 +15,7 @@ use crate::{ service::ReconcileResultRequest, }; use pageserver_api::controller_api::{ - NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, + AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, }; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, @@ -146,7 +146,7 @@ pub(crate) struct TenantShard { // We should attempt to schedule this shard in the provided AZ to // decrease chances of cross-AZ compute. - preferred_az_id: Option, + preferred_az_id: Option, } #[derive(Default, Clone, Debug, Serialize)] @@ -540,14 +540,22 @@ impl TenantShard { Ok((true, promote_secondary)) } else { // Pick a fresh node: either we had no secondaries or none were schedulable - let node_id = - scheduler.schedule_shard::(&self.intent.secondary, context)?; + let node_id = scheduler.schedule_shard::( + &self.intent.secondary, + &self.preferred_az_id, + context, + )?; tracing::debug!("Selected {} as attached", node_id); self.intent.set_attached(scheduler, Some(node_id)); Ok((true, node_id)) } } + #[instrument(skip_all, fields( + tenant_id=%self.tenant_shard_id.tenant_id, + shard_id=%self.tenant_shard_id.shard_slug(), + sequence=%self.sequence + ))] pub(crate) fn schedule( &mut self, scheduler: &mut Scheduler, @@ -617,8 +625,11 @@ impl TenantShard { let mut used_pageservers = vec![attached_node_id]; while self.intent.secondary.len() < secondary_count { - let node_id = scheduler - .schedule_shard::(&used_pageservers, context)?; + let node_id = scheduler.schedule_shard::( + &used_pageservers, + &self.preferred_az_id, + context, + )?; self.intent.push_secondary(scheduler, node_id); used_pageservers.push(node_id); modified = true; @@ -631,7 +642,11 @@ impl TenantShard { modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard::(&[], context)?; + let node_id = scheduler.schedule_shard::( + &[], + &self.preferred_az_id, + context, + )?; self.intent.push_secondary(scheduler, node_id); modified = true; } @@ -810,6 +825,7 @@ impl TenantShard { // with lower utilization. let Ok(candidate_node) = scheduler.schedule_shard::( &self.intent.all_pageservers(), + &self.preferred_az_id, schedule_context, ) else { // A scheduling error means we have no possible candidate replacements @@ -1308,7 +1324,7 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), - preferred_az_id: tsp.preferred_az_id, + preferred_az_id: tsp.preferred_az_id.map(AvailabilityZone), }) } @@ -1324,15 +1340,15 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), - preferred_az_id: self.preferred_az_id.clone(), + preferred_az_id: self.preferred_az_id.as_ref().map(|az| az.0.clone()), } } - pub(crate) fn preferred_az(&self) -> Option<&str> { - self.preferred_az_id.as_deref() + pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> { + self.preferred_az_id.as_ref() } - pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) { + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) { self.preferred_az_id = Some(preferred_az_id); } } @@ -1345,6 +1361,7 @@ pub(crate) mod tests { controller_api::NodeAvailability, shard::{ShardCount, ShardNumber}, }; + use rand::{rngs::StdRng, SeedableRng}; use utils::id::TenantId; use crate::scheduler::test_utils::make_test_nodes; @@ -1373,7 +1390,11 @@ pub(crate) mod tests { ) } - fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec { + fn make_test_tenant( + policy: PlacementPolicy, + shard_count: ShardCount, + preferred_az: Option, + ) -> Vec { let tenant_id = TenantId::generate(); (0..shard_count.count()) @@ -1385,7 +1406,7 @@ pub(crate) mod tests { shard_number, shard_count, }; - TenantShard::new( + let mut ts = TenantShard::new( tenant_shard_id, ShardIdentity::new( shard_number, @@ -1394,7 +1415,13 @@ pub(crate) mod tests { ) .unwrap(), policy.clone(), - ) + ); + + if let Some(az) = &preferred_az { + ts.set_preferred_az(az.clone()); + } + + ts }) .collect() } @@ -1405,7 +1432,7 @@ pub(crate) mod tests { fn tenant_ha_scheduling() -> anyhow::Result<()> { // Start with three nodes. Our tenant will only use two. The third one is // expected to remain unused. - let mut nodes = make_test_nodes(3); + let mut nodes = make_test_nodes(3, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut context = ScheduleContext::default(); @@ -1457,7 +1484,7 @@ pub(crate) mod tests { #[test] fn intent_from_observed() -> anyhow::Result<()> { - let nodes = make_test_nodes(3); + let nodes = make_test_nodes(3, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); @@ -1507,7 +1534,7 @@ pub(crate) mod tests { #[test] fn scheduling_mode() -> anyhow::Result<()> { - let nodes = make_test_nodes(3); + let nodes = make_test_nodes(3, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); @@ -1532,7 +1559,7 @@ pub(crate) mod tests { #[test] fn optimize_attachment() -> anyhow::Result<()> { - let nodes = make_test_nodes(3); + let nodes = make_test_nodes(3, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); @@ -1599,7 +1626,7 @@ pub(crate) mod tests { #[test] fn optimize_secondary() -> anyhow::Result<()> { - let nodes = make_test_nodes(4); + let nodes = make_test_nodes(4, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); @@ -1698,14 +1725,14 @@ pub(crate) mod tests { /// that it converges. #[test] fn optimize_add_nodes() -> anyhow::Result<()> { - let nodes = make_test_nodes(4); + let nodes = make_test_nodes(4, &[]); // Only show the scheduler a couple of nodes let mut scheduler = Scheduler::new([].iter()); scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); - let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); let mut schedule_context = ScheduleContext::default(); for shard in &mut shards { assert!(shard @@ -1754,16 +1781,16 @@ pub(crate) mod tests { fn initial_scheduling_is_optimal() -> anyhow::Result<()> { use itertools::Itertools; - let nodes = make_test_nodes(2); + let nodes = make_test_nodes(2, &[]); let mut scheduler = Scheduler::new([].iter()); scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); - let mut a = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let mut a = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); let a_context = Rc::new(RefCell::new(ScheduleContext::default())); - let mut b = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let mut b = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); let b_context = Rc::new(RefCell::new(ScheduleContext::default())); let a_shards_with_context = a.iter_mut().map(|shard| (shard, a_context.clone())); @@ -1788,4 +1815,147 @@ pub(crate) mod tests { Ok(()) } + + #[test] + fn random_az_shard_scheduling() -> anyhow::Result<()> { + use rand::seq::SliceRandom; + + for seed in 0..50 { + eprintln!("Running test with seed {seed}"); + let mut rng = StdRng::seed_from_u64(seed); + + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let azs = [az_a_tag, az_b_tag]; + let nodes = make_test_nodes(4, &azs); + let mut shards_per_az: HashMap = HashMap::new(); + + let mut scheduler = Scheduler::new([].iter()); + for node in nodes.values() { + scheduler.node_upsert(node); + } + + let mut shards = Vec::default(); + let mut contexts = Vec::default(); + let mut az_picker = azs.iter().cycle().cloned(); + for i in 0..100 { + let az = az_picker.next().unwrap(); + let shard_count = i % 4 + 1; + *shards_per_az.entry(az.clone()).or_default() += shard_count; + + let tenant_shards = make_test_tenant( + PlacementPolicy::Attached(1), + ShardCount::new(shard_count.try_into().unwrap()), + Some(az), + ); + let context = Rc::new(RefCell::new(ScheduleContext::default())); + + contexts.push(context.clone()); + let with_ctx = tenant_shards + .into_iter() + .map(|shard| (shard, context.clone())); + for shard_with_ctx in with_ctx { + shards.push(shard_with_ctx); + } + } + + shards.shuffle(&mut rng); + + #[derive(Default, Debug)] + struct NodeStats { + attachments: u32, + secondaries: u32, + } + + let mut node_stats: HashMap = HashMap::default(); + let mut attachments_in_wrong_az = 0; + let mut secondaries_in_wrong_az = 0; + + for (shard, context) in &mut shards { + let context = &mut *context.borrow_mut(); + shard.schedule(&mut scheduler, context).unwrap(); + + let attached_node = shard.intent.get_attached().unwrap(); + let stats = node_stats.entry(attached_node).or_default(); + stats.attachments += 1; + + let secondary_node = *shard.intent.get_secondary().first().unwrap(); + let stats = node_stats.entry(secondary_node).or_default(); + stats.secondaries += 1; + + let attached_node_az = nodes + .get(&attached_node) + .unwrap() + .get_availability_zone_id(); + let secondary_node_az = nodes + .get(&secondary_node) + .unwrap() + .get_availability_zone_id(); + let preferred_az = shard.preferred_az().unwrap(); + + if attached_node_az != preferred_az { + eprintln!( + "{} attachment was scheduled in AZ {} but preferred AZ {}", + shard.tenant_shard_id, attached_node_az, preferred_az + ); + attachments_in_wrong_az += 1; + } + + if secondary_node_az == preferred_az { + eprintln!( + "{} secondary was scheduled in AZ {} which matches preference", + shard.tenant_shard_id, attached_node_az + ); + secondaries_in_wrong_az += 1; + } + } + + let mut violations = Vec::default(); + + if attachments_in_wrong_az > 0 { + violations.push(format!( + "{} attachments scheduled to the incorrect AZ", + attachments_in_wrong_az + )); + } + + if secondaries_in_wrong_az > 0 { + violations.push(format!( + "{} secondaries scheduled to the incorrect AZ", + secondaries_in_wrong_az + )); + } + + eprintln!( + "attachments_in_wrong_az={} secondaries_in_wrong_az={}", + attachments_in_wrong_az, secondaries_in_wrong_az + ); + + for (node_id, stats) in &node_stats { + let node_az = nodes.get(node_id).unwrap().get_availability_zone_id(); + let ideal_attachment_load = shards_per_az.get(node_az).unwrap() / 2; + let allowed_attachment_load = + (ideal_attachment_load - 1)..(ideal_attachment_load + 2); + + if !allowed_attachment_load.contains(&stats.attachments) { + violations.push(format!( + "Found {} attachments on node {}, but expected {}", + stats.attachments, node_id, ideal_attachment_load + )); + } + + eprintln!( + "{}: attachments={} secondaries={} ideal_attachment_load={}", + node_id, stats.attachments, stats.secondaries, ideal_attachment_load + ); + } + + assert!(violations.is_empty(), "{violations:?}"); + + for (mut shard, _ctx) in shards { + shard.intent.clear(&mut scheduler); + } + } + Ok(()) + } } diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index f9987662b9f5..a1b5b0b12f19 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true aws-sdk-s3.workspace = true either.workspace = true anyhow.workspace = true -git-version.workspace = true hex.workspace = true humantime.workspace = true serde.workspace = true diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 55c1423ed0d5..201eb1087ded 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2553,7 +2553,7 @@ def poll_node_status( desired_availability: Optional[PageserverAvailability], desired_scheduling_policy: Optional[PageserverSchedulingPolicy], max_attempts: int, - backoff: int, + backoff: float, ): """ Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability' @@ -2948,7 +2948,7 @@ def start( self.id ): self.env.storage_controller.poll_node_status( - self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1 + self.id, PageserverAvailability.ACTIVE, None, max_attempts=200, backoff=0.1 ) return self @@ -4617,7 +4617,8 @@ def scrubber_cli( "REGION": s3_storage.bucket_region, "BUCKET": s3_storage.bucket_name, "BUCKET_PREFIX": s3_storage.prefix_in_bucket, - "RUST_LOG": "DEBUG", + "RUST_LOG": "INFO", + "PAGESERVER_DISABLE_FILE_LOGGING": "1", } env.update(s3_storage.access_env_vars()) @@ -4637,10 +4638,8 @@ def scrubber_cli( (output_path, stdout, status_code) = subprocess_capture( self.log_dir, args, - echo_stderr=True, - echo_stdout=True, env=env, - check=False, + check=True, capture_stdout=True, timeout=timeout, ) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 35e0c0decb26..be8f70bb7076 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -198,9 +198,6 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): def run_pgbench(connstr: str, pg_bin: PgBin): log.info(f"Start a pgbench workload on pg {connstr}") - # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. - pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr]) - log.info("pgbench init done") pg_bin.run_capture(["pgbench", "-T60", connstr]) @@ -247,9 +244,15 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info( f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}" ) + + # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", primary.connstr()]) + log.info("pgbench init done in primary") + t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin)) t.start() - # Wait until pgbench_accounts is created + filled on replica *and* + + # Wait until we see that the pgbench_accounts is created + filled on replica *and* # index is created. Otherwise index creation would conflict with # read queries and hs feedback won't save us. wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index cb0b30d9c6e8..0f791e924707 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -10,11 +10,11 @@ from fixtures.neon_fixtures import NeonEnv, PgBin -# -# Test branching, when a transaction is in prepared state -# @pytest.mark.timeout(600) def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): + """ + Test resizing the Local File Cache + """ env = neon_simple_env endpoint = env.endpoints.create_start( "main", @@ -32,27 +32,48 @@ def run_pgbench(connstr: str): pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr]) - thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) + # Initializing the pgbench database can be very slow, especially on debug builds. + connstr = endpoint.connstr(options="-cstatement_timeout=300s") + + thread = threading.Thread(target=run_pgbench, args=(connstr,), daemon=True) thread.start() conn = endpoint.connect() cur = conn.cursor() - for _ in range(n_resize): + # For as long as pgbench is running, twiddle the LFC size once a second. + # Note that we launch this immediately, already while the "pgbench -i" + # initialization step is still running. That's quite a different workload + # than the actual pgbench benchamark run, so this gives us coverage of both. + while thread.is_alive(): size = random.randint(1, 512) cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) + thread.join() + # At the end, set it at 100 MB, and perform a final check that the disk usage + # of the file is in that ballbark. + # + # We retry the check a few times, because it might take a while for the + # system to react to changing the setting and shrinking the file. cur.execute("alter system set neon.file_cache_size_limit='100MB'") cur.execute("select pg_reload_conf()") + nretries = 10 + while True: + lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" + lfc_file_size = os.path.getsize(lfc_file_path) + res = subprocess.run( + ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True + ) + lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + assert lfc_file_size <= 512 * 1024 * 1024 - thread.join() + if int(lfc_file_blocks) <= 128 * 1024 or nretries == 0: + break + + nretries = nretries - 1 + time.sleep(1) - lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" - lfc_file_size = os.path.getsize(lfc_file_path) - res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True) - lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] - log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") - assert lfc_file_size <= 512 * 1024 * 1024 assert int(lfc_file_blocks) <= 128 * 1024 diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 347fc3a04ddb..5e8b8d38f7e2 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -122,6 +122,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): Test static endpoint is protected from GC by acquiring and renewing lsn leases. """ + LSN_LEASE_LENGTH = 8 neon_env_builder.num_pageservers = 2 # GC is manual triggered. env = neon_env_builder.init_start( @@ -139,7 +140,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): "image_creation_threshold": "1", "image_layer_creation_check_threshold": "0", # Short lease length to fit test. - "lsn_lease_length": "3s", + "lsn_lease_length": f"{LSN_LEASE_LENGTH}s", }, initial_tenant_shard_count=2, ) @@ -170,10 +171,14 @@ def generate_updates_on_main( with env.endpoints.create_start("main") as ep_main: with ep_main.cursor() as cur: cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)") - lsn = None + lsn = Lsn(0) for i in range(2): lsn = generate_updates_on_main(env, ep_main, i) + # Round down to the closest LSN on page boundary (unnormalized). + XLOG_BLCKSZ = 8192 + lsn = Lsn((int(lsn) // XLOG_BLCKSZ) * XLOG_BLCKSZ) + with env.endpoints.create_start( branch_name="main", endpoint_id="static", @@ -183,7 +188,8 @@ def generate_updates_on_main( cur.execute("SELECT count(*) FROM t0") assert cur.fetchone() == (ROW_COUNT,) - time.sleep(3) + # Wait for static compute to renew lease at least once. + time.sleep(LSN_LEASE_LENGTH / 2) generate_updates_on_main(env, ep_main, i, end=100) @@ -204,8 +210,9 @@ def generate_updates_on_main( # Do some update so we can increment latest_gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) + # Wait for the existing lease to expire. + time.sleep(LSN_LEASE_LENGTH) # Now trigger GC again, layers should be removed. - time.sleep(4) for shard, ps in tenant_get_shards(env, env.initial_tenant): client = ps.http_client() gc_result = client.timeline_gc(shard, env.initial_timeline, 0) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 4106efd4f9cc..3861f0b82274 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -4,6 +4,7 @@ import time from collections import defaultdict from datetime import datetime, timezone +from enum import Enum from typing import Any, Dict, List, Optional, Set, Tuple, Union import pytest @@ -2466,6 +2467,87 @@ def has_hit_migration_failpoint(): raise +class MigrationFailpoints(Enum): + # While only the origin is attached + PRE_GENERATION_INC = "reconciler-live-migrate-pre-generation-inc" + # While both locations are attached + POST_NOTIFY = "reconciler-live-migrate-post-notify" + # While only the destination is attached + POST_DETACH = "reconciler-live-migrate-post-detach" + + +@pytest.mark.parametrize( + "migration_failpoint", + [ + MigrationFailpoints.PRE_GENERATION_INC, + MigrationFailpoints.POST_NOTIFY, + MigrationFailpoints.POST_DETACH, + ], +) +def test_storage_controller_proxy_during_migration( + neon_env_builder: NeonEnvBuilder, migration_failpoint: MigrationFailpoints +): + """ + If we send a proxied GET request to the controller during a migration, it should route + the request to whichever pageserver was most recently issued a generation. + + Reproducer for https://github.com/neondatabase/neon/issues/9062 + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + env = neon_env_builder.init_configs() + env.start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + env.neon_cli.create_tenant(tenant_id, timeline_id) + + # Activate a failpoint that will cause live migration to get stuck _after_ the generation has been issued + # to the new pageserver: this should result in requests routed to the new pageserver. + env.storage_controller.configure_failpoints((migration_failpoint.value, "pause")) + + origin_pageserver = env.get_tenant_pageserver(tenant_id) + dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, + TenantShardId(tenant_id, 0, 0), + dest_ps_id, + ) + + def has_hit_migration_failpoint(): + expr = f"at failpoint {str(migration_failpoint.value)}" + log.info(expr) + assert env.storage_controller.log_contains(expr) + + wait_until(10, 1, has_hit_migration_failpoint) + + # This request should be routed to whichever pageserver holds the highest generation + tenant_info = env.storage_controller.pageserver_api().tenant_status( + tenant_id, + ) + + if migration_failpoint in ( + MigrationFailpoints.POST_NOTIFY, + MigrationFailpoints.POST_DETACH, + ): + # We expect request to land on the destination + assert tenant_info["generation"] == 2 + elif migration_failpoint == MigrationFailpoints.PRE_GENERATION_INC: + # We expect request to land on the origin + assert tenant_info["generation"] == 1 + + # Eventually migration completes + env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) + migrate_fut.result() + except: + # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown + env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) + raise + + @run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs()