Skip to content

Commit

Permalink
Merge pull request #201 from ruiwen-zhao/checksum
Browse files Browse the repository at this point in the history
Add functionality to validate disk image
  • Loading branch information
ruiwen-zhao authored Feb 23, 2024
2 parents 0f880d7 + c31a2c6 commit 9016cde
Show file tree
Hide file tree
Showing 4 changed files with 279 additions and 38 deletions.
45 changes: 28 additions & 17 deletions gke-disk-image-builder/cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ func main() {
timeout := flag.String("timeout", "20m", "Default timout for each step, defaults to 20m")
network := flag.String("network", "default", "VPC network to be used by GCE resources used for disk image creation.")
subnet := flag.String("subnet", "default", "subnet to be used by GCE resources used for disk image creation.")
storeSnapshotCheckSum := flag.Bool("store-snapshot-checksum", true, "calculate and store checksums of every snapshot directory.")
verifyOnly := flag.Bool("verify-only", false, "Only verifies the disk image provided in image-name, and does not generate any image.")
flag.Var(&imageLabels, "image-labels", "labels tagged to the disk image. This flag can be specified multiple times. The accepted format is `--image-labels=key=val`.")
flag.Var(&containerImages, "container-image", "container image to include in the disk image. This flag can be specified multiple times")

Expand Down Expand Up @@ -98,23 +100,32 @@ func main() {
}

req := builder.Request{
ImageName: *imageName,
ImageFamilyName: *imageFamilyName,
ProjectName: *projectName,
JobName: *jobName,
Zone: *zone,
GCSPath: *gcsPath,
MachineType: *machineType,
ServiceAccount: *serviceAccount,
DiskType: *diskType,
DiskSizeGB: *diskSizeGb,
GCPOAuth: *gcpOAuth,
Network: fmt.Sprintf("projects/%s/global/networks/%s", *projectName, *network),
Subnet: fmt.Sprintf("projects/%s/regions/%s/subnetworks/%s", *projectName, regionForZone(*zone), *subnet),
ContainerImages: containerImages,
Timeout: td,
ImagePullAuth: auth,
ImageLabels: imageLabels,
ImageName: *imageName,
ImageFamilyName: *imageFamilyName,
ProjectName: *projectName,
JobName: *jobName,
Zone: *zone,
GCSPath: *gcsPath,
MachineType: *machineType,
ServiceAccount: *serviceAccount,
DiskType: *diskType,
DiskSizeGB: *diskSizeGb,
GCPOAuth: *gcpOAuth,
Network: fmt.Sprintf("projects/%s/global/networks/%s", *projectName, *network),
Subnet: fmt.Sprintf("projects/%s/regions/%s/subnetworks/%s", *projectName, regionForZone(*zone), *subnet),
ContainerImages: containerImages,
Timeout: td,
ImagePullAuth: auth,
ImageLabels: imageLabels,
StoreSnapshotCheckSum: *storeSnapshotCheckSum,
}

if *verifyOnly {
if err = builder.VerifyDiskImage(ctx, req); err != nil {
log.Panicf("Image verification fails. The images/snapshots preloaded might be broken: %v", err)
}
fmt.Printf("Image at projects/%s/global/images/%s\n has been verified and all container images and snapshots are valid.", req.ProjectName, req.ImageName)
return
}

if err = builder.GenerateDiskImage(ctx, req); err != nil {
Expand Down
176 changes: 156 additions & 20 deletions gke-disk-image-builder/imager.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,26 +47,27 @@ const (

// Request contains the required input for the disk image generation.
type Request struct {
ImageName string
ImageFamilyName string
ProjectName string
JobName string
Zone string
GCSPath string
MachineType string
DiskType string
DiskSizeGB int64
GCPOAuth string
Network string
Subnet string
ContainerImages []string
Timeout time.Duration
ImagePullAuth ImagePullAuthMechanism
ImageLabels []string
ServiceAccount string
ImageName string
ImageFamilyName string
ProjectName string
JobName string
Zone string
GCSPath string
MachineType string
DiskType string
DiskSizeGB int64
GCPOAuth string
Network string
Subnet string
ContainerImages []string
Timeout time.Duration
ImagePullAuth ImagePullAuthMechanism
ImageLabels []string
ServiceAccount string
StoreSnapshotCheckSum bool
}

func generateStartupScript(req Request) (*os.File, error) {
func buildDiskStartupScript(req Request) (*os.File, error) {
concreteStartupScript, err := os.CreateTemp("", fmt.Sprintf("%s-startup-script-", req.JobName))
if err != nil {
return nil, fmt.Errorf("unable to create a tmp file, err: %v", err)
Expand All @@ -80,13 +81,35 @@ func generateStartupScript(req Request) (*os.File, error) {
return nil, fmt.Errorf("unable to create the concrete startup file suceesfully, err: %v", err)
}
images := strings.Join(req.ContainerImages, " ")
flags := fmt.Sprintf("\n\nunpack %s %s", req.ImagePullAuth, images)
flags := fmt.Sprintf("\n\nunpack %t %s %s", req.StoreSnapshotCheckSum, req.ImagePullAuth, images)
if _, err = concreteStartupScript.Write([]byte(flags)); err != nil {
return nil, fmt.Errorf("umable to create concrete startup script: %v", err)
}
return concreteStartupScript, nil
}

// verifyDiskStartupScript builds the startup script for the verification
// instance: a temp file containing a copy of script/verify.sh with a trailing
// call to its verify_snapshots function appended. The caller is responsible
// for closing and removing the returned file.
func verifyDiskStartupScript(req Request) (*os.File, error) {
	// Renamed the local from the original (which shadowed this function's own
	// name) to keep the body readable.
	script, err := os.CreateTemp("", fmt.Sprintf("%s-verify-startup-script-", req.JobName))
	if err != nil {
		return nil, fmt.Errorf("unable to create a tmp file, err: %v", err)
	}
	template, err := os.Open("./script/verify.sh")
	if err != nil {
		return nil, fmt.Errorf("unable to open the startup template file, err: %v", err)
	}
	defer template.Close()
	if _, err = io.Copy(script, template); err != nil {
		return nil, fmt.Errorf("unable to create the verify disk startup file successfully, err: %v", err)
	}

	// Append the entrypoint call so the functions copied from verify.sh
	// actually run. Plain string: the original used a no-verb fmt.Sprintf,
	// which go vet flags.
	if _, err = script.Write([]byte("\n\nverify_snapshots")); err != nil {
		return nil, fmt.Errorf("unable to create verify disk startup script: %v", err)
	}

	return script, nil
}

func buildImageLabels(req Request) (map[string]string, error) {
labels := make(map[string]string)
for _, label := range req.ImageLabels {
Expand All @@ -101,7 +124,7 @@ func buildImageLabels(req Request) (map[string]string, error) {

// GenerateDiskImage generates the disk image according to the given request.
func GenerateDiskImage(ctx context.Context, req Request) error {
startupScriptFile, err := generateStartupScript(req)
startupScriptFile, err := buildDiskStartupScript(req)
if err != nil {
return err
}
Expand Down Expand Up @@ -244,6 +267,119 @@ func GenerateDiskImage(ctx context.Context, req Request) error {
return run(ctx, preloadDiskWorkflow)
}

// VerifyDiskImage verifies the snapshots on the disk image by calculating the checksums and comparing them with those stored in snapshots.metadata file.
//
// It runs a Daisy workflow that (1) creates a disk from the image named in
// req.ImageName, (2) boots a temporary GCE instance with that disk attached as
// a secondary device and script/verify.sh as its startup script, (3) waits for
// a success or failure marker on the instance's serial console, and (4)
// detaches the disk. Any workflow error, or a matched failure marker, is
// surfaced through the returned error.
func VerifyDiskImage(ctx context.Context, req Request) error {
	startupScriptFile, err := verifyDiskStartupScript(req)
	if err != nil {
		return err
	}
	// The temp script only needs to live until Daisy uploads it as a source.
	defer startupScriptFile.Close()
	defer os.Remove(startupScriptFile.Name())

	verifyDiskWorkflow := daisy.New()
	verifyDiskWorkflow.Name = req.JobName
	verifyDiskWorkflow.Project = req.ProjectName
	verifyDiskWorkflow.Zone = req.Zone
	verifyDiskWorkflow.GCSPath = req.GCSPath
	verifyDiskWorkflow.OAuthPath = req.GCPOAuth
	verifyDiskWorkflow.DefaultTimeout = req.Timeout.String()
	verifyDiskWorkflow.Sources = map[string]string{
		"verify.sh": startupScriptFile.Name(),
	}
	verifyDiskWorkflow.Steps = map[string]*daisy.Step{
		// Re-create a disk from the image under verification so its contents
		// can be mounted and checksummed by the instance.
		"create-disk": {
			CreateDisks: &daisy.CreateDisks{
				&daisy.Disk{
					Resource: daisy.Resource{
						ExactName: true,
					},
					Disk: compute.Disk{
						Name:   fmt.Sprintf("%s-disk", req.JobName),
						Type:   req.DiskType,
						SizeGb: req.DiskSizeGB,
						// Use the image to be verified to create a disk.
						SourceImage: req.ImageName,
					},
				},
			},
		},
		"create-instance": {
			CreateInstances: &daisy.CreateInstances{
				Instances: []*daisy.Instance{
					&daisy.Instance{
						InstanceBase: daisy.InstanceBase{
							Resource: daisy.Resource{
								ExactName: true,
							},
							StartupScript: "verify.sh",
						},
						Instance: compute.Instance{
							Name:        fmt.Sprintf("%s-instance", req.JobName),
							MachineType: fmt.Sprintf("zones/%s/machineTypes/%s", req.Zone, req.MachineType),
							NetworkInterfaces: []*compute.NetworkInterface{
								{
									Network:    req.Network,
									Subnetwork: req.Subnet,
								},
							},
							Disks: []*compute.AttachedDisk{
								// Boot disk: a stock Debian image whose startup
								// script is verify.sh (supplied above).
								&compute.AttachedDisk{
									AutoDelete: true,
									Boot:       true,
									Type:       "PERSISTENT",
									DeviceName: fmt.Sprintf("%s-bootable-disk", req.JobName),
									Mode:       "READ_WRITE",
									InitializeParams: &compute.AttachedDiskInitializeParams{
										DiskSizeGb:  req.DiskSizeGB,
										DiskType:    fmt.Sprintf("projects/%s/zones/%s/diskTypes/%s", req.ProjectName, req.Zone, req.DiskType),
										SourceImage: "projects/debian-cloud/global/images/debian-11-bullseye-v20230912",
									},
								},
								// Secondary disk: the disk created by
								// "create-disk" from the image under test;
								// verify.sh locates it via this device name.
								&compute.AttachedDisk{
									AutoDelete: true,
									Boot:       false,
									DiskSizeGb: req.DiskSizeGB,
									DeviceName: deviceName,
									Source:     fmt.Sprintf("%s-disk", req.JobName),
								},
							},
						},
					},
				},
			},
		},
		// Watch the serial console for the markers printed by verify.sh.
		"wait-on-image-verification": {
			WaitForInstancesSignal: &daisy.WaitForInstancesSignal{
				&daisy.InstanceSignal{
					Name: fmt.Sprintf("%s-instance", req.JobName),
					SerialOutput: &daisy.SerialOutput{
						Port:         1,
						SuccessMatch: "Disk image verification succeeds",
						// NOTE: "verfication" is misspelled, but it
						// deliberately matches the string emitted by
						// script/verify.sh — keep the two in sync.
						FailureMatch: []string{
							"Image verfication failure",
						},
					},
				},
			},
		},
		"detach-disk": {
			DetachDisks: &daisy.DetachDisks{
				&daisy.DetachDisk{
					Instance:   fmt.Sprintf("%s-instance", req.JobName),
					DeviceName: deviceName,
				},
			},
		},
	}
	// Force a strict sequence: disk -> instance -> wait -> detach.
	verifyDiskWorkflow.Dependencies = map[string][]string{
		"create-instance":            {"create-disk"},
		"wait-on-image-verification": {"create-instance"},
		"detach-disk":                {"wait-on-image-verification"},
	}

	return run(ctx, verifyDiskWorkflow)
}

func run(ctx context.Context, w *daisy.Workflow) error {
c := make(chan os.Signal, 1)
signal.Notify(c, os.Interrupt)
Expand Down
15 changes: 14 additions & 1 deletion gke-disk-image-builder/script/startup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,15 @@ function process_snapshots() {
echo Failed to copy the snapshot files for $snapshot from $original_path to /mnt/disks/container_layers/${new_path}. Please rerun the tool to try it again.
exit 1
fi

mapping="$snapshot $new_path"

if [ "$STORE_SNAPSHOT_CHECKSUMS" = "true" ]; then
echo "Calculating checksum for snapshot $snapshot"
checksum="$(find /mnt/disks/container_layers/${new_path} -type f -exec md5sum {} + | cut -d' ' -f1 | LC_ALL=C sort | md5sum | cut -d' ' -f1)"
mapping="$snapshot $new_path $checksum"
fi

echo "Appending $mapping to Metadata file"
sudo echo "$mapping" >> "/mnt/disks/container_layers/snapshots.metadata"
if [ $? -ne 0 ]; then
Expand Down Expand Up @@ -163,9 +171,14 @@ function unpack() {
# Remove the previously created snapshot views.
remove_snapshot_views

# Store the first parameter in OAUTH_MECHANISM and shift.
# Store the first parameter in STORE_SNAPSHOT_CHECKSUMS and shift
STORE_SNAPSHOT_CHECKSUMS=$(echo "$1")
shift

# Store the second parameter in OAUTH_MECHANISM and shift.
OAUTH_MECHANISM=$(echo "$1" | tr '[:upper:]' '[:lower:]')
shift

# Pull all the given images.
pull_images $@

Expand Down
81 changes: 81 additions & 0 deletions gke-disk-image-builder/script/verify.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Refresh the package index once before installing dependencies. The original
# ran both `apt update` and `apt-get update`, which refresh the same cache;
# one apt-get invocation (the stable scripting CLI) is sufficient.
sudo apt-get update

# Install containerd.
# NOTE(review): containerd/ctr does not appear to be used by verify_snapshots
# below (it only mounts and checksums files) — possibly carried over from
# startup.sh; TODO confirm whether this install can be dropped.
sudo apt-get install --yes containerd
# Start containerd.
sudo systemctl start containerd
# Check to see if containerd is up and we can use ctr.
if sudo ctr version | grep "Server:"; then
  echo containerd is ready to use
else
  echo containerd is not running. Please rerun the tool to try it again.
  exit 1
fi

# Check if disk is partitioned and update device node file path accordingly if so.
# The device name that maps to the `google-<device_name>` path is defined here: https://github.com/GoogleCloudPlatform/ai-on-gke/blob/4b73f02abd71e3c2a836d4d0ce29de054a605bc6/gke-disk-image-builder/imager.go#L32
# The disk name prefix is constructed here: https://github.com/GoogleCloudPlatform/ai-on-gke/blob/4b73f02abd71e3c2a836d4d0ce29de054a605bc6/gke-disk-image-builder/imager.go#L115
DEVICE_NODE=/dev/disk/by-id/google-secondary-disk-image-disk
if [[ -e "$DEVICE_NODE-part1" ]]; then
  DEVICE_NODE="$DEVICE_NODE-part1"
fi
echo "using device node: $DEVICE_NODE"

# Check if the device exists. The misspelled "verfication" below is
# load-bearing: it matches the FailureMatch watched by the Daisy workflow.
if ! [ -b "$DEVICE_NODE" ]; then
  echo "Image verfication failure: failed to get device: Device $DEVICE_NODE does not exist. Please rerun the tool to try it again."
  exit 1
fi

# verify_snapshots mounts the secondary disk, re-computes the checksum of every
# snapshot directory listed in snapshots.metadata, and compares each against
# the checksum recorded at image-build time. Prints a success/failure marker
# that the Daisy workflow watches on the serial console.
function verify_snapshots() {
  # Prepare the disk image directories.
  echo Preparing the disk image directories...
  sudo mkdir -p /mnt/disks/container_layers
  # NOTE(review): verification never writes to this disk; mounting read-only
  # (-o ro) would be safer — TODO confirm before changing.
  sudo mount -o discard,defaults $DEVICE_NODE /mnt/disks/container_layers

  echo verifying the snapshots...
  sudo ls /mnt/disks/container_layers
  snapshot_metadata_file="/mnt/disks/container_layers/snapshots.metadata"

  # Fail fast when the metadata file is missing entirely. Without this check
  # the read loop below would run zero times and the script would wrongly
  # print the success marker for an image carrying no checksums at all.
  # The misspelled "verfication" is load-bearing: it matches the FailureMatch
  # string watched by the Daisy workflow in imager.go.
  if [ ! -f "$snapshot_metadata_file" ]; then
    echo "Image verfication failure: snapshots.metadata not found on the disk image. Please use --store-snapshot-checksum flag when building images."
    exit 1
  fi

  # Each metadata line has the form: <chainID> <path> [<md5 checksum>].
  while IFS= read -r line
  do
    echo "$line"
    snapshot_chainID=$(echo $line | cut -d' ' -f1)
    snapshot_path=$(echo $line | cut -d' ' -f2)
    expected_checksum=$(echo $line | cut -d' ' -f3)

    if [ -z "$expected_checksum" ]; then
      echo "Image verfication failure: Expected checksums not found in snapshots.metadata. Please use --store-snapshot-checksum flag when building images."
      exit 1
    fi

    # Directory checksum = md5 over the sorted list of per-file md5s. This
    # must stay identical to the computation in script/startup.sh.
    actual_checksum="$(find /mnt/disks/container_layers/${snapshot_path} -type f -exec md5sum {} + | cut -d' ' -f1 | LC_ALL=C sort | md5sum | cut -d' ' -f1)"

    if [ "$expected_checksum" = "$actual_checksum" ]; then
      echo "Verification succeeds for snapshot $snapshot_chainID at $snapshot_path."
    else
      echo "Verification fails for snapshot $snapshot_chainID at $snapshot_path. Expected checksum: $expected_checksum, got: $actual_checksum"
      snapshot_broken="true"
    fi
  done < "$snapshot_metadata_file"

  if [ -n "$snapshot_broken" ]; then
    echo "Image verfication failure: Snapshot checksum mismatch. Please see the log to find more details."
    # Propagate the failure through the exit status as well as the serial
    # output marker above (the original fell through with status 0).
    exit 1
  else
    echo "Disk image verification succeeds."
  fi
}

0 comments on commit 9016cde

Please sign in to comment.