Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Smooth upgrade mount pod #1107

Merged
merged 37 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
20d47b3
feat: graceful shutdown & create new one & smoothly upgrade without r…
zwwhdls Sep 3, 2024
e34587a
feat: trigger upgrade in dashboard
zwwhdls Sep 4, 2024
ac82f6f
feat: trigger upgrade pod in dashboard
zwwhdls Sep 5, 2024
1eabe7d
fix unit test
zwwhdls Sep 6, 2024
c42e562
dashboard: check version when upgrade binary;
zwwhdls Sep 6, 2024
b560d1e
ci: fix mount pod recreate test
zwwhdls Sep 9, 2024
a10f2fd
ci: fix
zwwhdls Sep 9, 2024
3b27c7b
ci: fix
zwwhdls Sep 9, 2024
401b995
ci: fix
zwwhdls Sep 10, 2024
9d67706
fix
zwwhdls Sep 10, 2024
25ef2f0
update
zwwhdls Sep 12, 2024
b830dbc
update
zwwhdls Sep 12, 2024
836d510
lock mount pod when upgrade
zwwhdls Sep 12, 2024
8477c0a
make canary a job
zwwhdls Sep 12, 2024
da2d225
wait for upgrade finish
zwwhdls Sep 13, 2024
6f3a392
add version validate
zwwhdls Sep 14, 2024
ac3b3d6
fix ci
zwwhdls Sep 14, 2024
e2193cf
fix ci
zwwhdls Sep 14, 2024
0581588
dashboard: redirect after upgrade
zwwhdls Sep 18, 2024
65f51b7
delete tmp state file if terminal unexpectly & do not support upgrade…
zwwhdls Sep 18, 2024
0010006
fix unit test
zwwhdls Sep 19, 2024
fd14bd0
set version in event
zwwhdls Sep 19, 2024
8b5903f
fix log
zwwhdls Sep 19, 2024
91c2761
fix
zwwhdls Sep 19, 2024
b2c311f
fix ci
zwwhdls Sep 19, 2024
0936d96
improve log
zwwhdls Sep 20, 2024
39e63f2
use metaurl to check ce or ee
zwwhdls Sep 23, 2024
be6572d
fix
zwwhdls Sep 23, 2024
1fd19d3
Merge branch 'master' of github.com:juicedata/juicefs-csi-driver into…
zwwhdls Sep 24, 2024
7ce0dc7
fix umount hang and ignore fuse fd if get fd error
zwwhdls Sep 24, 2024
4808231
fix unit test
zwwhdls Sep 24, 2024
23f1662
set timeout in timeout
zwwhdls Sep 24, 2024
7d56c3d
fix fd leak
zwwhdls Sep 25, 2024
33fdec0
del useless code
zwwhdls Sep 25, 2024
aecdef8
fix version compare
zwwhdls Sep 26, 2024
0e37cf9
fix ci
zwwhdls Sep 26, 2024
5323f3f
check mount pod if get fuse fd
zwwhdls Sep 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/scripts/e2e-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,11 @@
test_dynamic_expand()
test_multi_pvc()
test_mountpod_recreated()
test_config()
test_recreate_mountpod_reload_config()
test_secret_has_owner_reference()
if without_kubelet:
test_pod_resource_err()
test_config()
test_recreate_mountpod_reload_config()

elif test_mode == "pod-mount-share":
if not IS_CE:
Expand Down Expand Up @@ -144,10 +144,10 @@
test_quota_using_storage_rw()
test_dynamic_expand()
test_multi_pvc()
test_config()
test_recreate_mountpod_reload_config()
if without_kubelet:
test_pod_resource_err()
test_config()
test_recreate_mountpod_reload_config()

elif test_mode == "webhook":
test_deployment_use_pv_rw()
Expand Down
2 changes: 1 addition & 1 deletion .github/scripts/k8s-deps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ function die() {

function install_deps() {
sudo apt-get update && sudo apt-get install -y snapd curl netcat-openbsd bc dnsutils redis-tools librados2 python3
sudo pip install kubernetes==18.20.0
sudo apt install -y python3-kubernetes
curl -fsSL -o /tmp/kustomize.tar.gz "$KUSTOMIZE_URL" \
&& tar -xf /tmp/kustomize.tar.gz -C /usr/local/bin \
&& chmod a+x /usr/local/bin/kustomize \
Expand Down
53 changes: 39 additions & 14 deletions .github/scripts/test_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from model import PVC, PV, Pod, StorageClass, Deployment, Job, Secret
from util import check_mount_point, wait_dir_empty, wait_dir_not_empty, \
get_only_mount_pod_name, get_mount_pods, check_pod_ready, check_mount_pod_refs, gen_random_string, get_vol_uuid, \
get_voldel_job, check_quota, is_quota_supported, update_config
get_voldel_job, check_quota, is_quota_supported, update_config, wait_get_only_mount_pod_name


def test_deployment_using_storage_rw():
Expand Down Expand Up @@ -691,13 +691,15 @@ def test_dynamic_delete_pod():
is_ready = False
for i in range(0, 60):
try:
is_ready = mount_pod.is_ready()
new_mount_pod = Pod(name=get_only_mount_pod_name(unique_id), deployment_name="", replicas=1, namespace=KUBE_SYSTEM)
is_ready = new_mount_pod.is_ready()
if is_ready:
break
time.sleep(5)
except Exception as e:
LOG.info(e)
raise e
time.sleep(5)
continue
if not is_ready:
raise Exception("Mount pod {} didn't recovery within 5 min.".format(mount_pod.name))

Expand Down Expand Up @@ -769,13 +771,15 @@ def test_static_delete_pod():
is_ready = False
for i in range(0, 60):
try:
is_ready = mount_pod.is_ready()
new_mount_pod = Pod(name=get_only_mount_pod_name(volume_id), deployment_name="", replicas=1, namespace=KUBE_SYSTEM)
is_ready = new_mount_pod.is_ready()
if is_ready:
break
time.sleep(5)
except Exception as e:
LOG.info(e)
raise e
time.sleep(5)
continue
if not is_ready:
raise Exception("Mount pod {} didn't recovery within 5 min.".format(mount_pod.name))

Expand Down Expand Up @@ -2598,10 +2602,20 @@ def test_mountpod_recreated():

# wait for mountpod recreated
LOG.info("Wait for mountpod recreated..")
is_ready = False
for i in range(0, 60):
if mount_pod.watch_for_success():
break
time.sleep(5)
try:
new_mount_pod = Pod(name=get_only_mount_pod_name(volume_id), deployment_name="", replicas=1, namespace=KUBE_SYSTEM)
is_ready = new_mount_pod.is_ready()
if is_ready:
break
time.sleep(5)
except Exception as e:
LOG.info(e)
time.sleep(5)
continue
if not is_ready:
raise Exception("Mount pod {} didn't recovery within 5 min.".format(mount_pod.name))

# check mount point
LOG.info("Check mount point..")
Expand Down Expand Up @@ -2685,6 +2699,7 @@ def test_config():
["kubectl", "annotate", "pods", "--overwrite", "-n", KUBE_SYSTEM, "-l", "app=juicefs-csi-node",
"updatedAt=" + str(int(time.time()))])

time.sleep(2)
# deploy pvc
pvc1 = PVC(name="pvc-config-without-labels", access_mode="ReadWriteMany", storage_name=STORAGECLASS_NAME, pv="")
LOG.info("Deploy pvc {}".format(pvc1.name))
Expand Down Expand Up @@ -2863,26 +2878,36 @@ def test_recreate_mountpod_reload_config():
subprocess.check_call(
["kubectl", "annotate", "pods", "--overwrite", "-n", KUBE_SYSTEM, "-l", "app=juicefs-csi-node",
"updatedAt=" + str(int(time.time()))])
# sleep 2s to wait config update
time.sleep(2)

LOG.info("Start to delete mountpod..")
mount_pod = Pod(name=get_only_mount_pod_name(volume_id), deployment_name="", replicas=1, namespace=KUBE_SYSTEM)
mount_pod.delete()

# wait for mountpod recreated
LOG.info("Wait for mountpod recreated..")
time.sleep(20)
for i in range(0, 60):
if mount_pod.watch_for_success():
break
time.sleep(5)

result = pod.watch_for_success()
if not result:
if MOUNT_MODE == "webhook":
pods = client.CoreV1Api().list_namespaced_pod(
namespace="default",
label_selector="deployment={}".format(deployment.name)
)
for po in pods.items:
pod_name = po.metadata.name
if not check_pod_ready(po):
subprocess.check_call(["kubectl", "get", "po", pod_name, "-o", "yaml", "-n", "default"])
raise Exception("Pods of deployment {} are not ready within 10 min.".format(deployment.name))

# check mount point
LOG.info("Check mount point..")
result = check_mount_point(check_path)
if not result:
raise Exception("mount Point of /jfs/{}/out.txt are not ready within 5 min.".format(volume_id))

mount_pod = Pod(name=get_only_mount_pod_name(volume_id), deployment_name="", replicas=1, namespace=KUBE_SYSTEM)
mount_pod = Pod(name=wait_get_only_mount_pod_name(volume_id), deployment_name="", replicas=1, namespace=KUBE_SYSTEM)
if mount_pod.get_metadata().labels.get("apply") != "updated_config":
raise Exception("mountpod config labels not set")
if mount_pod.get_metadata().labels.get("volume_id") != volume_id:
Expand Down
9 changes: 9 additions & 0 deletions .github/scripts/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,15 @@ def get_only_mount_pod_name(volume_id):
return running_pods[0].metadata.name


def wait_get_only_mount_pod_name(volume_id, timeout=60):
    """Retry ``get_only_mount_pod_name(volume_id)`` until it succeeds.

    The lookup is attempted up to ``timeout`` times with a one-second pause
    after each failed attempt, so ``timeout`` is an attempt count and only
    roughly a number of seconds (the time spent in each lookup is not
    counted).

    :param volume_id: volume id forwarded to ``get_only_mount_pod_name``.
    :param timeout: maximum number of attempts.
    :return: the mount pod name returned by ``get_only_mount_pod_name``.
    :raises TimeoutError: if every attempt failed; the last lookup error is
        attached as the cause. Previously the function silently returned
        ``None`` in this case, which only surfaced later as an unrelated
        failure in the caller.
    """
    last_error = None
    for _ in range(timeout):
        try:
            return get_only_mount_pod_name(volume_id)
        except Exception as e:
            # The helper's failure modes are not visible here, so any error
            # is treated as "not ready yet" and retried.
            last_error = e
            time.sleep(1)
    raise TimeoutError(
        "mount pod of volume {} not found after {} attempts".format(volume_id, timeout)
    ) from last_error


def get_mount_pods(volume_id):
pods = client.CoreV1Api().list_namespaced_pod(
namespace=KUBE_SYSTEM,
Expand Down
2 changes: 2 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ func main() {
klog.InitFlags(goFlag)
cmd.PersistentFlags().AddGoFlagSet(goFlag)

cmd.AddCommand(upgradeCmd)

if err := cmd.Execute(); err != nil {
os.Exit(1)
}
Expand Down
10 changes: 8 additions & 2 deletions cmd/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ import (
"github.com/juicedata/juicefs-csi-driver/pkg/config"
"github.com/juicedata/juicefs-csi-driver/pkg/controller"
"github.com/juicedata/juicefs-csi-driver/pkg/driver"
"github.com/juicedata/juicefs-csi-driver/pkg/fuse"
"github.com/juicedata/juicefs-csi-driver/pkg/fuse/grace"
"github.com/juicedata/juicefs-csi-driver/pkg/fuse/passfd"
k8s "github.com/juicedata/juicefs-csi-driver/pkg/k8sclient"
"github.com/juicedata/juicefs-csi-driver/pkg/util"
)
Expand Down Expand Up @@ -125,11 +126,16 @@ func parseNodeConfig() {
os.Exit(1)
}
config.CSIPod = *pod
err = fuse.InitGlobalFds(context.TODO(), "/tmp")
err = passfd.InitGlobalFds(context.TODO(), k8sclient, "/tmp")
if err != nil {
log.Error(err, "Init global fds error")
os.Exit(1)
}
err = grace.ServeGfShutdown(config.ShutdownSockPath)
if err != nil {
log.Error(err, "Serve graceful shutdown error")
os.Exit(1)
}
}

func nodeRun(ctx context.Context) {
Expand Down
50 changes: 50 additions & 0 deletions cmd/upgrade.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
Copyright 2023 Juicedata Inc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"os"

"github.com/spf13/cobra"

"github.com/juicedata/juicefs-csi-driver/pkg/config"
"github.com/juicedata/juicefs-csi-driver/pkg/fuse/grace"
)

// restart holds the value of the upgrade command's --restart flag and is
// passed to grace.TriggerShutdown. It defaults to false.
var restart bool

// upgradeCmd is the "upgrade" subcommand. It expects the mount pod name as the
// first positional argument and calls grace.TriggerShutdown with
// config.ShutdownSockPath, that name and the restart flag. The process exits
// with status 1 when the name is missing or the trigger returns an error.
var upgradeCmd = &cobra.Command{
	Use:   "upgrade",
	Short: "upgrade mount pod smoothly",
	Run: func(_ *cobra.Command, args []string) {
		if len(args) == 0 {
			log.Info("please specify the name of the mount pod which you want to upgrade", "node", config.NodeName)
			os.Exit(1)
		}

		// Only the first positional argument is used; any extras are ignored.
		err := grace.TriggerShutdown(config.ShutdownSockPath, args[0], restart)
		if err == nil {
			return
		}
		log.Error(err, "failed to upgrade mount pod")
		os.Exit(1)
	},
}

// init registers the --restart boolean flag (default false) on upgradeCmd and
// binds it to the package-level restart variable.
func init() {
	flags := upgradeCmd.Flags()
	flags.BoolVar(&restart, "restart", false, "smoothly upgrade the mount pod with restart")
}
25 changes: 22 additions & 3 deletions dashboard-ui-v2/src/components/containers.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/

import { ProCard } from '@ant-design/pro-components'
import { Button, Space, Table, Tag, Tooltip } from 'antd'
import { Button, Space, Table, Tag, Tooltip,} from 'antd'
import { ContainerStatus } from 'kubernetes-types/core/v1'
import { FormattedMessage } from 'react-intl'
import { useParams } from 'react-router-dom'
Expand All @@ -28,12 +28,13 @@ import {
AccessLogIcon,
DebugIcon,
LogIcon,
TerminalIcon,
TerminalIcon, UpgradeIcon,
WarmupIcon,
} from '@/icons'
import { DetailParams } from '@/types'
import { Pod } from '@/types/k8s'
import { isMountPod, supportDebug } from '@/utils'
import { isMountPod, supportBinarySmoothUpgrade, supportDebug } from '@/utils'
import UpgradeModal from '@/components/upgrade-modal.tsx'

const Containers: React.FC<{
pod: Pod
Expand Down Expand Up @@ -165,6 +166,24 @@ const Containers: React.FC<{
</Tooltip>
)}
</WarmupModal>

{supportBinarySmoothUpgrade(c.image) ? (
<UpgradeModal
namespace={namespace!}
name={name!}
recreate={false}
>
{({ onClick }) => (
<Tooltip title="Binary Upgrade" zIndex={0}>
<Button
className="action-button"
onClick={onClick}
icon={<UpgradeIcon />}
/>
</Tooltip>
)}
</UpgradeModal>
) : null}
</>
) : null}
</Space>
Expand Down
31 changes: 26 additions & 5 deletions dashboard-ui-v2/src/components/pod-basic.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,26 @@

import React, { useState } from 'react'
import { ProCard, ProDescriptions } from '@ant-design/pro-components'
import { Button, Tooltip } from 'antd'
import { Button, Space, Tooltip } from 'antd'
import { Badge } from 'antd/lib'
import { FormattedMessage } from 'react-intl'
import YAML from 'yaml'

import YamlModal from './yaml-modal'
import { YamlIcon } from '@/icons'
import { UpgradeIcon, YamlIcon } from '@/icons'
import { Pod } from '@/types/k8s'
import { getPodStatusBadge, omitPod, podStatus } from '@/utils'
import { getPodStatusBadge, isMountPod, omitPod, podStatus, supportPodSmoothUpgrade } from '@/utils'
import { useMountPodImage } from '@/hooks/use-api.ts'
import UpgradeModal from '@/components/upgrade-modal.tsx'

const PodBasic: React.FC<{
pod: Pod
}> = (props) => {
const { pod } = props

const [isModalOpen, setIsModalOpen] = useState(false)
const { data } = useMountPodImage(isMountPod(pod), pod.metadata?.namespace, pod.metadata?.name)
const [image] = useState(pod.spec?.containers[0].image)

const showModal = () => {
setIsModalOpen(true)
Expand All @@ -45,7 +49,24 @@ const PodBasic: React.FC<{
<ProCard
title={<FormattedMessage id="basic" />}
extra={
<>
<Space>
{supportPodSmoothUpgrade(image || '') && supportPodSmoothUpgrade(data || '') ? (
<UpgradeModal
namespace={pod.metadata?.namespace || ''}
name={pod.metadata?.name || ''}
recreate={true}
>
{({ onClick }) => (
<Tooltip title="Upgrade" zIndex={0}>
<Button
className="action-button"
onClick={onClick}
icon={<UpgradeIcon />}
/>
</Tooltip>
)}
</UpgradeModal>
) : null}
<Tooltip title="Show Yaml">
<Button
className="action-button"
Expand All @@ -60,7 +81,7 @@ const PodBasic: React.FC<{
content={YAML.stringify(omitPod(pod))}
/>
</Tooltip>
</>
</Space>
}
>
<ProDescriptions
Expand Down
Loading
Loading