From f0023479cccf6ebc0eee42075816436f15eec1f7 Mon Sep 17 00:00:00 2001 From: yawangwang Date: Thu, 16 Nov 2023 20:32:02 +0000 Subject: [PATCH] Adding image tests for memory monitoring --- cloudbuild.yaml | 14 +++++++ launcher/container_runner.go | 14 ++----- launcher/image/entrypoint.sh | 8 ++++ ...boot-disk-size-consistency-monitor-cs.json | 12 ++++++ .../docker-monitor-cs.json | 12 ++++++ .../kernel-monitor-cs.json | 10 +++++ .../system-stats-monitor-cs.json | 10 +++++ launcher/image/preload.sh | 10 +++++ .../scripts/test_memory_monitoring_enabled.sh | 21 ++++++++++ .../test/test_launchpolicy_cloudbuild.yaml | 2 +- .../image/test/test_memory_monitoring.yaml | 42 +++++++++++++++++++ .../testworkloads/memorymonitoring/Dockerfile | 14 +++++++ launcher/internal/systemctl/systemctl.go | 2 +- launcher/spec/launch_policy.go | 2 +- 14 files changed, 159 insertions(+), 14 deletions(-) create mode 100644 launcher/image/nodeproblemdetector/boot-disk-size-consistency-monitor-cs.json create mode 100644 launcher/image/nodeproblemdetector/docker-monitor-cs.json create mode 100644 launcher/image/nodeproblemdetector/kernel-monitor-cs.json create mode 100644 launcher/image/nodeproblemdetector/system-stats-monitor-cs.json create mode 100644 launcher/image/test/scripts/test_memory_monitoring_enabled.sh create mode 100644 launcher/image/test/test_memory_monitoring.yaml create mode 100644 launcher/image/testworkloads/memorymonitoring/Dockerfile diff --git a/cloudbuild.yaml b/cloudbuild.yaml index c4e229911..cf0823893 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -198,6 +198,20 @@ steps: gcloud builds submit --config=test_discover_signatures.yaml --region us-west1 \ --substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-debug-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID},_SIGNATURE_REPO=us-docker.pkg.dev/confidential-space-images-dev/cs-cosign-tests/debug exit +- name: 'gcr.io/cloud-builders/gcloud' + id: MemoryMonitoringTests + waitFor: ['HardenedImageBuild'] + env: + - 
'OUTPUT_IMAGE_PREFIX=$_OUTPUT_IMAGE_PREFIX' + - 'OUTPUT_IMAGE_SUFFIX=$_OUTPUT_IMAGE_SUFFIX' + - 'PROJECT_ID=$PROJECT_ID' + script: | + #!/usr/bin/env bash + cd launcher/image/test + echo "running memory monitoring tests on ${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX}" + gcloud builds submit --config=test_memory_monitoring.yaml --region us-west1 \ + --substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID} + exit options: pool: diff --git a/launcher/container_runner.go b/launcher/container_runner.go index 999ca4ce8..eefc01f21 100644 --- a/launcher/container_runner.go +++ b/launcher/container_runner.go @@ -29,7 +29,6 @@ import ( "github.com/google/go-tpm-tools/cel" "github.com/google/go-tpm-tools/client" "github.com/google/go-tpm-tools/launcher/agent" - npd "github.com/google/go-tpm-tools/launcher/internal/healthmonitoring/nodeproblemdetector" "github.com/google/go-tpm-tools/launcher/internal/signaturediscovery" "github.com/google/go-tpm-tools/launcher/internal/systemctl" "github.com/google/go-tpm-tools/launcher/launcherfile" @@ -57,8 +56,6 @@ const tokenFileTmp = ".token.tmp" const teeServerSocket = "teeserver.sock" -const systemStatsConfigFilePath = "/etc/node_problem_detector/system-stats-monitor.json" - // Since we only allow one container on a VM, using a deterministic id is probably fine const ( containerID = "tee-container" @@ -511,16 +508,9 @@ func (r *ContainerRunner) Run(ctx context.Context) error { defer teeServer.Shutdown(ctx) } - // customize node-problem-detector.service and start it. + // start node-problem-detector.service to collect memory related metrics. if r.launchSpec.MemoryMonitoringEnabled { r.logger.Println("MemoryMonitoring is enabled") - config := npd.NewSystemStatsConfig() - // collects "memory/bytes_used" metrics only when memory monitoring enabled. - config.EnableMemoryBytesUsed() - // override the default config file. 
- if err := config.WriteFile(systemStatsConfigFilePath); err != nil { - return fmt.Errorf("failed to override the default config file [%s] for node-problem-detector: %v", systemStatsConfigFilePath, err) - } s, err := systemctl.New() if err != nil { return fmt.Errorf("failed to create systemctl client: %v", err) @@ -531,6 +521,8 @@ func (r *ContainerRunner) Run(ctx context.Context) error { return fmt.Errorf("failed to start node-problem-detector.service: %v", err) } r.logger.Println("node-problem-detector.service successfully started.") + } else { + r.logger.Println("MemoryMonitoring is disabled.") } var streamOpt cio.Opt diff --git a/launcher/image/entrypoint.sh b/launcher/image/entrypoint.sh index 4ac3fef0a..b089f0e41 100644 --- a/launcher/image/entrypoint.sh +++ b/launcher/image/entrypoint.sh @@ -6,6 +6,14 @@ main() { # Override default fluent-bit config. cp /usr/share/oem/confidential_space/fluent-bit-cs.conf /etc/fluent-bit/fluent-bit.conf + # Override default system-stats-monitor.json for node-problem-detector. + cp /usr/share/oem/confidential_space/system-stats-monitor-cs.json /etc/node_problem_detector/system-stats-monitor.json + # Override default boot-disk-size-consistency-monitor.json for node-problem-detector. + cp /usr/share/oem/confidential_space/boot-disk-size-consistency-monitor-cs.json /etc/node_problem_detector/boot-disk-size-consistency-monitor.json + # Override default docker-monitor.json for node-problem-detector. + cp /usr/share/oem/confidential_space/docker-monitor-cs.json /etc/node_problem_detector/docker-monitor.json + # Override default kernel-monitor.json for node-problem-detector. 
+ cp /usr/share/oem/confidential_space/kernel-monitor-cs.json /etc/node_problem_detector/kernel-monitor.json systemctl daemon-reload systemctl enable container-runner.service systemctl start container-runner.service diff --git a/launcher/image/nodeproblemdetector/boot-disk-size-consistency-monitor-cs.json b/launcher/image/nodeproblemdetector/boot-disk-size-consistency-monitor-cs.json new file mode 100644 index 000000000..8f741843b --- /dev/null +++ b/launcher/image/nodeproblemdetector/boot-disk-size-consistency-monitor-cs.json @@ -0,0 +1,12 @@ +{ + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "30m", + "timeout": "7s", + "max_output_length": 80, + "enable_message_change_based_condition_update": false + }, + "source": "boot-disk-size-consistency-monitor", + "metricsReporting": false, + "rules": [] + } \ No newline at end of file diff --git a/launcher/image/nodeproblemdetector/docker-monitor-cs.json b/launcher/image/nodeproblemdetector/docker-monitor-cs.json new file mode 100644 index 000000000..c28fc9e0e --- /dev/null +++ b/launcher/image/nodeproblemdetector/docker-monitor-cs.json @@ -0,0 +1,12 @@ +{ + "plugin": "journald", + "pluginConfig": { + "source": "dockerd" + }, + "logPath": "/var/log/journal", + "lookback": "5m", + "bufferSize": 10, + "source": "docker-monitor", + "metricsReporting": false, + "conditions": [] +} \ No newline at end of file diff --git a/launcher/image/nodeproblemdetector/kernel-monitor-cs.json b/launcher/image/nodeproblemdetector/kernel-monitor-cs.json new file mode 100644 index 000000000..8a2ee9d4d --- /dev/null +++ b/launcher/image/nodeproblemdetector/kernel-monitor-cs.json @@ -0,0 +1,10 @@ +{ + "plugin": "kmsg", + "logPath": "/dev/kmsg", + "lookback": "5m", + "bufferSize": 10, + "source": "kernel-monitor", + "metricsReporting": false, + "conditions": [], + "rules": [] +} \ No newline at end of file diff --git a/launcher/image/nodeproblemdetector/system-stats-monitor-cs.json 
b/launcher/image/nodeproblemdetector/system-stats-monitor-cs.json new file mode 100644 index 000000000..30f8cccea --- /dev/null +++ b/launcher/image/nodeproblemdetector/system-stats-monitor-cs.json @@ -0,0 +1,10 @@ +{ + "memory": { + "metricsConfigs": { + "memory/bytes_used": { + "displayName": "memory/bytes_used" + } + } + }, + "invokeInterval": "60s" +} diff --git a/launcher/image/preload.sh b/launcher/image/preload.sh index 2fba9a323..53e478434 100644 --- a/launcher/image/preload.sh +++ b/launcher/image/preload.sh @@ -66,12 +66,22 @@ configure_cloud_logging() { cp fluent-bit-cs.conf "${CS_PATH}" } +configure_node_problem_detector() { + # Copy CS-specific node-problem-detector configs to OEM partition. + cp nodeproblemdetector/system-stats-monitor-cs.json "${CS_PATH}" + cp nodeproblemdetector/boot-disk-size-consistency-monitor-cs.json "${CS_PATH}" + cp nodeproblemdetector/docker-monitor-cs.json "${CS_PATH}" + cp nodeproblemdetector/kernel-monitor-cs.json "${CS_PATH}" +} + configure_systemd_units_for_debug() { configure_cloud_logging + configure_node_problem_detector } configure_systemd_units_for_hardened() { configure_necessary_systemd_units configure_cloud_logging + configure_node_problem_detector # Make entrypoint (via cloud-init) the default unit. set_default_boot_target "cloud-final.service" diff --git a/launcher/image/test/scripts/test_memory_monitoring_enabled.sh b/launcher/image/test/scripts/test_memory_monitoring_enabled.sh new file mode 100644 index 000000000..4ea09765a --- /dev/null +++ b/launcher/image/test/scripts/test_memory_monitoring_enabled.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -euxo pipefail +source util/read_serial.sh + +# This test requires the workload to run and print
+SERIAL_OUTPUT=$(read_serial $1 $2) +print_serial=false + +if echo $SERIAL_OUTPUT | grep -q 'node-problem-detector.service successfully started' +then + echo "- memory monitoring enabled" +else + echo "FAILED: memory monitoring disabled" + echo 'TEST FAILED.' > /workspace/status.txt + print_serial=true +fi + +if $print_serial; then + echo $SERIAL_OUTPUT +fi diff --git a/launcher/image/test/test_launchpolicy_cloudbuild.yaml b/launcher/image/test/test_launchpolicy_cloudbuild.yaml index f741c3158..ed1742c21 100644 --- a/launcher/image/test/test_launchpolicy_cloudbuild.yaml +++ b/launcher/image/test/test_launchpolicy_cloudbuild.yaml @@ -5,7 +5,7 @@ substitutions: '_METADATA_FILE': 'startup-script=data/echo_startupscript.sh,user-data=data/cloud-init-config.yaml' '_CLEANUP': 'true' '_VM_NAME_PREFIX': 'cs-launchpolicy-test' - '_ZONE': 'us-central1-a' + '_ZONE': 'us-east4-a' '_WORKLOAD_IMAGE_LOG_NEVER': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/launchpolicylognever:latest' '_WORKLOAD_IMAGE_LOG_DEBUG': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/launchpolicylogdebug:latest' '_WORKLOAD_IMAGE_ENV': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/basic-test:latest' diff --git a/launcher/image/test/test_memory_monitoring.yaml b/launcher/image/test/test_memory_monitoring.yaml new file mode 100644 index 000000000..58d2117ab --- /dev/null +++ b/launcher/image/test/test_memory_monitoring.yaml @@ -0,0 +1,42 @@ +substitutions: + '_IMAGE_NAME': '' + '_IMAGE_PROJECT': '' + '_CLEANUP': 'true' + '_VM_NAME_PREFIX': 'memory-monitoring' + '_ZONE': 'us-east1-b' + '_WORKLOAD_IMAGE': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/memorymonitoring:latest' + +steps: +- name: 'gcr.io/cloud-builders/gcloud' + id: CreateVM + entrypoint: 'bash' + env: + - 'BUILD_ID=$BUILD_ID' + args: ['create_vm.sh','-i', '${_IMAGE_NAME}', + '-p', '${_IMAGE_PROJECT}', + '-m', 
'tee-image-reference=${_WORKLOAD_IMAGE},tee-monitoring-memory-enable=true', + '-n', '${_VM_NAME_PREFIX}-${BUILD_ID}', + '-z', '${_ZONE}', + ] +- name: 'gcr.io/cloud-builders/gcloud' + id: CheckMemoryMonitoringEnabled + entrypoint: 'bash' + args: ['scripts/test_memory_monitoring_enabled.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}'] + +- name: 'gcr.io/cloud-builders/gcloud' + id: CleanUp + entrypoint: 'bash' + env: + - 'CLEANUP=$_CLEANUP' + args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}'] +# Must come after cleanup. +- name: 'gcr.io/cloud-builders/gcloud' + id: CheckFailure + entrypoint: 'bash' + env: + - 'BUILD_ID=$BUILD_ID' + args: ['check_failure.sh'] + +options: + pool: + name: 'projects/confidential-space-images-dev/locations/us-west1/workerPools/cs-image-build-vpc' diff --git a/launcher/image/testworkloads/memorymonitoring/Dockerfile b/launcher/image/testworkloads/memorymonitoring/Dockerfile new file mode 100644 index 000000000..7f8fca0ed --- /dev/null +++ b/launcher/image/testworkloads/memorymonitoring/Dockerfile @@ -0,0 +1,14 @@ +# From current directory: +# GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o main ../basic +# gcloud builds submit --tag us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/memorymonitoring:latest --project confidential-space-images-dev +FROM alpine + +COPY main / + +ENV env_bar="val_bar" + +LABEL "tee.launch_policy.monitoring_memory_allow"="always" + +ENTRYPOINT ["/main"] + +CMD ["arg_foo"] diff --git a/launcher/internal/systemctl/systemctl.go b/launcher/internal/systemctl/systemctl.go index 2aa81e7a4..752d6ed6d 100644 --- a/launcher/internal/systemctl/systemctl.go +++ b/launcher/internal/systemctl/systemctl.go @@ -48,7 +48,7 @@ func runSystemdCmd(cmdFunc func(string, string, chan<- string) (int, error), cmd progress := make(chan string, 1) // Run systemd command in "replace" mode to start the unit and its dependencies, - // possibly replacing already queued jobs that conflict w∏ith 
this. + // possibly replacing already queued jobs that conflict with this. if _, err := cmdFunc(unit, "replace", progress); err != nil { return fmt.Errorf("failed to run systemctl [%s] for unit [%s]: %v", cmd, unit, err) } diff --git a/launcher/spec/launch_policy.go b/launcher/spec/launch_policy.go index d46ea4aca..9c129e3b2 100644 --- a/launcher/spec/launch_policy.go +++ b/launcher/spec/launch_policy.go @@ -43,7 +43,7 @@ const ( envOverride = "tee.launch_policy.allow_env_override" cmdOverride = "tee.launch_policy.allow_cmd_override" logRedirect = "tee.launch_policy.log_redirect" - memoryMonitoring = "tee.launch_policy.monitoring.memory.allow" + memoryMonitoring = "tee.launch_policy.monitoring_memory_allow" ) // GetLaunchPolicy takes in a map[string] string which should come from image labels,