diff --git a/charts/templates/central-deploy.yaml b/charts/templates/central-deploy.yaml index 6ce8e901d5d..f8c4e6cccaf 100644 --- a/charts/templates/central-deploy.yaml +++ b/charts/templates/central-deploy.yaml @@ -44,7 +44,8 @@ spec: - name: ovn-central image: {{ .Values.global.registry.address }}/{{ .Values.global.images.kubeovn.repository }}:{{ .Values.global.images.kubeovn.tag }} imagePullPolicy: {{ .Values.image.pullPolicy }} - command: ["/kube-ovn/start-db.sh"] + command: + - /kube-ovn/start-db.sh securityContext: capabilities: add: ["SYS_NICE"] @@ -71,6 +72,10 @@ spec: fieldPath: status.podIPs - name: ENABLE_BIND_LOCAL_IP value: "{{- .Values.func.ENABLE_BIND_LOCAL_IP }}" + - name: PROBE_INTERVAL + value: "{{ .Values.networking.PROBE_INTERVAL | int64 }}" + - name: OVN_LEADER_PROBE_INTERVAL + value: "{{ .Values.networking.OVN_LEADER_PROBE_INTERVAL | int64 }}" resources: requests: cpu: {{ index .Values "ovn-central" "requests" "cpu" }} diff --git a/charts/templates/ovsovn-ds.yaml b/charts/templates/ovsovn-ds.yaml index e08efc45661..1ec51128800 100644 --- a/charts/templates/ovsovn-ds.yaml +++ b/charts/templates/ovsovn-ds.yaml @@ -44,7 +44,8 @@ spec: {{- if .Values.DPDK }} command: ["/kube-ovn/start-ovs-dpdk.sh"] {{- else }} - command: ["/kube-ovn/start-ovs.sh"] + command: + - /kube-ovn/start-ovs.sh {{- end }} securityContext: runAsUser: 0 @@ -74,6 +75,10 @@ spec: fieldPath: spec.nodeName - name: OVN_DB_IPS value: "{{ .Values.MASTER_NODES }}" + - name: OVN_REMOTE_PROBE_INTERVAL + value: "{{ .Values.networking.OVN_REMOTE_PROBE_INTERVAL | int64 }}" + - name: OVN_REMOTE_OPENFLOW_INTERVAL + value: "{{ .Values.networking.OVN_REMOTE_OPENFLOW_INTERVAL | int64 }}" volumeMounts: - mountPath: /var/run/netns name: host-ns diff --git a/charts/values.yaml b/charts/values.yaml index dc88a8e52df..cfb63577436 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -47,6 +47,10 @@ networking: ENABLE_ECMP: false ENABLE_METRICS: true NODE_LOCAL_DNS_IP: "" + PROBE_INTERVAL: 180000 +
OVN_LEADER_PROBE_INTERVAL: 5 + OVN_REMOTE_PROBE_INTERVAL: 10000 + OVN_REMOTE_OPENFLOW_INTERVAL: 180 func: ENABLE_LB: true diff --git a/dist/images/install.sh b/dist/images/install.sh index 0b7abed236c..d38e3f25cde 100755 --- a/dist/images/install.sh +++ b/dist/images/install.sh @@ -80,9 +80,7 @@ POD_NIC_TYPE="veth-pair" # veth-pair or internal-port POD_DEFAULT_FIP_TYPE="" # iptables, pod can set iptables fip automatically by enable fip annotation # VLAN Config only take effect when NETWORK_TYPE is vlan -PROVIDER_NAME="provider" VLAN_INTERFACE_NAME="" -VLAN_NAME="ovn-vlan" VLAN_ID="100" if [ "$ENABLE_VLAN" = "true" ]; then @@ -134,7 +132,7 @@ then --with-dpdk=*) DPDK=true DPDK_VERSION="${1#*=}" - if [[ ! "${DPDK_SUPPORTED_VERSIONS[@]}" = "${DPDK_VERSION}" ]] || [[ -z "${DPDK_VERSION}" ]]; then + if [[ " ${DPDK_SUPPORTED_VERSIONS[*]} " != *" ${DPDK_VERSION} "* ]] || [[ -z "${DPDK_VERSION}" ]]; then echo "Unsupported DPDK version: ${DPDK_VERSION}" echo "Supported DPDK versions: ${DPDK_SUPPORTED_VERSIONS[*]}" exit 1 @@ -201,17 +199,17 @@ fi echo "[Step 1/6] Label kube-ovn-master node and label datapath type" count=$(kubectl get no -l$LABEL --no-headers | wc -l) node_label="$LABEL" -if [ $count -eq 0 ]; then +if [ "${count}" -eq 0 ]; then count=$(kubectl get no -l$DEPRECATED_LABEL --no-headers | wc -l) node_label="$DEPRECATED_LABEL" - if [ $count -eq 0 ]; then + if [ "${count}" -eq 0 ]; then echo "ERROR: No node with label $LABEL or $DEPRECATED_LABEL found" exit 1 fi fi kubectl label no -l$node_label kube-ovn/role=master --overwrite -if [ "$DPDK" = "true" -o "$HYBRID_DPDK" = "true" ]; then +if [ "$DPDK" = "true" ] || [ "$HYBRID_DPDK" = "true" ]; then kubectl label no -lovn.kubernetes.io/ovs_dp_type!=userspace ovn.kubernetes.io/ovs_dp_type=kernel --overwrite fi @@ -3204,7 +3202,8 @@ spec: - name: ovn-central image: "$REGISTRY/kube-ovn:$VERSION" imagePullPolicy: $IMAGE_PULL_POLICY - command: ["/kube-ovn/start-db.sh"] + command: + - /kube-ovn/start-db.sh securityContext:
capabilities: add: ["SYS_NICE"] @@ -3233,6 +3232,10 @@ spec: value: "$ENABLE_BIND_LOCAL_IP" - name: DEBUG_WRAPPER value: "$DEBUG_WRAPPER" + - name: PROBE_INTERVAL + value: "180000" + - name: OVN_LEADER_PROBE_INTERVAL + value: "5" resources: requests: cpu: 300m @@ -3516,7 +3519,8 @@ spec: - name: openvswitch image: "$REGISTRY/kube-ovn:$VERSION" imagePullPolicy: $IMAGE_PULL_POLICY - command: ["/kube-ovn/start-ovs.sh"] + command: + - /kube-ovn/start-ovs.sh securityContext: runAsUser: 0 privileged: true @@ -3547,6 +3551,10 @@ spec: value: $addresses - name: DEBUG_WRAPPER value: "$DEBUG_WRAPPER" + - name: OVN_REMOTE_PROBE_INTERVAL + value: "10000" + - name: OVN_REMOTE_OPENFLOW_INTERVAL + value: "180" volumeMounts: - mountPath: /var/run/netns name: host-ns @@ -4519,7 +4527,7 @@ if ! sh -c "echo \":$PATH:\" | grep -q \":/usr/local/bin:\""; then fi echo "[Step 6/6] Run network diagnose" -kubectl cp kube-system/$(kubectl -n kube-system get pods -o wide | grep cni | awk '{print $1}' | awk 'NR==1{print}'):/kube-ovn/kubectl-ko /usr/local/bin/kubectl-ko +kubectl cp kube-system/"$(kubectl -n kube-system get pods -o wide | grep cni | awk '{print $1}' | awk 'NR==1{print}')":/kube-ovn/kubectl-ko /usr/local/bin/kubectl-ko chmod +x /usr/local/bin/kubectl-ko kubectl ko diagnose all diff --git a/dist/images/start-db.sh b/dist/images/start-db.sh index 9e368f476bd..57e4df8906a 100755 --- a/dist/images/start-db.sh +++ b/dist/images/start-db.sh @@ -4,6 +4,9 @@ set -eo pipefail DEBUG_WRAPPER=${DEBUG_WRAPPER:-} DEBUG_OPT="--ovn-northd-wrapper=$DEBUG_WRAPPER --ovsdb-nb-wrapper=$DEBUG_WRAPPER --ovsdb-sb-wrapper=$DEBUG_WRAPPER" +echo "PROBE_INTERVAL is set to ${PROBE_INTERVAL:=180000}" # := assigns the default when the variable is unset or empty +echo "OVN_LEADER_PROBE_INTERVAL is set to ${OVN_LEADER_PROBE_INTERVAL:=5}" + # https://bugs.launchpad.net/neutron/+bug/1776778 if grep -q "3.10.0-862" /proc/version then @@ -224,11 +227,12 @@ if [[ "$ENABLE_SSL" == "false" ]]; then if [[ -z "$NODE_IPS" ]]; then /usr/share/ovn/scripts/ovn-ctl restart_northd ovn-nbctl
--no-leader-only set-connection ptcp:"${NB_PORT}":["${DB_ADDR}"] - ovn-nbctl --no-leader-only set Connection . inactivity_probe=180000 + ovn-nbctl --no-leader-only set Connection . inactivity_probe="${PROBE_INTERVAL}" + ovn-nbctl --no-leader-only set NB_Global . options:northd_probe_interval="${PROBE_INTERVAL}" ovn-nbctl --no-leader-only set NB_Global . options:use_logical_dp_groups=true ovn-sbctl --no-leader-only set-connection ptcp:"${SB_PORT}":["${DB_ADDR}"] - ovn-sbctl --no-leader-only set Connection . inactivity_probe=180000 + ovn-sbctl --no-leader-only set Connection . inactivity_probe="${PROBE_INTERVAL}" else if [[ ! "$NODE_IPS" =~ "$DB_CLUSTER_ADDR" ]]; then echo "ERROR! host ip $DB_CLUSTER_ADDR not in env NODE_IPS $NODE_IPS" @@ -272,7 +276,9 @@ if [[ "$ENABLE_SSL" == "false" ]]; then /etc/ovn/ovnsb_local_config.db /usr/share/ovn/scripts/ovn-ctl $ovn_ctl_args \ --ovn-manage-ovsdb=no start_northd - ovn-nbctl --no-leader-only set NB_Global . options:northd_probe_interval=180000 + ovn-nbctl --no-leader-only set NB_Global . options:inactivity_probe="${PROBE_INTERVAL}" + ovn-sbctl --no-leader-only set SB_Global . options:inactivity_probe="${PROBE_INTERVAL}" + ovn-nbctl --no-leader-only set NB_Global . options:northd_probe_interval="${PROBE_INTERVAL}" ovn-nbctl --no-leader-only set NB_Global . options:use_logical_dp_groups=true else # known leader always first @@ -352,11 +358,11 @@ else --ovn-northd-ssl-ca-cert=/var/run/tls/cacert \ restart_northd ovn-nbctl --no-leader-only -p /var/run/tls/key -c /var/run/tls/cert -C /var/run/tls/cacert set-connection pssl:"${NB_PORT}":["${DB_ADDR}"] - ovn-nbctl --no-leader-only -p /var/run/tls/key -c /var/run/tls/cert -C /var/run/tls/cacert set Connection . inactivity_probe=180000 + ovn-nbctl --no-leader-only -p /var/run/tls/key -c /var/run/tls/cert -C /var/run/tls/cacert set Connection . inactivity_probe="${PROBE_INTERVAL}" ovn-nbctl --no-leader-only -p /var/run/tls/key -c /var/run/tls/cert -C /var/run/tls/cacert set NB_Global .
options:use_logical_dp_groups=true ovn-sbctl --no-leader-only -p /var/run/tls/key -c /var/run/tls/cert -C /var/run/tls/cacert set-connection pssl:"${SB_PORT}":["${DB_ADDR}"] - ovn-sbctl --no-leader-only -p /var/run/tls/key -c /var/run/tls/cert -C /var/run/tls/cacert set Connection . inactivity_probe=180000 + ovn-sbctl --no-leader-only -p /var/run/tls/key -c /var/run/tls/cert -C /var/run/tls/cacert set Connection . inactivity_probe="${PROBE_INTERVAL}" else if [[ ! "$NODE_IPS" =~ "$DB_CLUSTER_ADDR" ]]; then echo "ERROR! host ip $DB_CLUSTER_ADDR not in env NODE_IPS $NODE_IPS" @@ -408,7 +414,7 @@ else /etc/ovn/ovnsb_local_config.db /usr/share/ovn/scripts/ovn-ctl $ovn_ctl_args \ --ovn-manage-ovsdb=no start_northd - ovn-nbctl --no-leader-only -p /var/run/tls/key -c /var/run/tls/cert -C /var/run/tls/cacert set NB_Global . options:northd_probe_interval=180000 + ovn-nbctl --no-leader-only -p /var/run/tls/key -c /var/run/tls/cert -C /var/run/tls/cacert set NB_Global . options:northd_probe_interval="${PROBE_INTERVAL}" ovn-nbctl --no-leader-only -p /var/run/tls/key -c /var/run/tls/cert -C /var/run/tls/cacert set NB_Global .
options:use_logical_dp_groups=true else # get leader if cluster exists @@ -486,5 +492,4 @@ ovs-appctl -t /var/run/ovn/ovnnb_db.ctl ovsdb-server/memory-trim-on-compaction o ovs-appctl -t /var/run/ovn/ovnsb_db.ctl ovsdb-server/memory-trim-on-compaction on chmod 600 /etc/ovn/* -/kube-ovn/kube-ovn-leader-checker - +/kube-ovn/kube-ovn-leader-checker --probeInterval="${OVN_LEADER_PROBE_INTERVAL:-5}" diff --git a/dist/images/start-ovs.sh b/dist/images/start-ovs.sh index e788db9df9c..45b40cef869 100755 --- a/dist/images/start-ovs.sh +++ b/dist/images/start-ovs.sh @@ -1,6 +1,9 @@ #!/bin/bash set -euo pipefail +echo "OVN_REMOTE_PROBE_INTERVAL is set to ${OVN_REMOTE_PROBE_INTERVAL:=10000}" # := assigns the default so 'set -u' cannot abort here +echo "OVN_REMOTE_OPENFLOW_INTERVAL is set to ${OVN_REMOTE_OPENFLOW_INTERVAL:=180}" + HW_OFFLOAD=${HW_OFFLOAD:-false} ENABLE_SSL=${ENABLE_SSL:-false} OVN_DB_IPS=${OVN_DB_IPS:-} @@ -36,21 +39,21 @@ cat /proc/cmdline" fi function cgroup_match { - hash1=$(md5sum /proc/$1/cgroup | awk '{print $1}') - hash2=$(md5sum /proc/$2/cgroup | awk '{print $1}') + hash1=$(md5sum /proc/"$1"/cgroup | awk '{print $1}') + hash2=$(md5sum /proc/"$2"/cgroup | awk '{print $1}') test -n "$hash1" -a "x$hash1" = "x$hash2" } function quit { - gen_name=$(kubectl -n $POD_NAMESPACE get pod $POD_NAME -o jsonpath='{.metadata.generateName}') - revision_hash=$(kubectl -n $POD_NAMESPACE get pod $POD_NAME -o jsonpath='{.metadata.labels.controller-revision-hash}') - revision=$(kubectl -n $POD_NAMESPACE get controllerrevision $gen_name$revision_hash -o jsonpath='{.revision}') + gen_name=$(kubectl -n "${POD_NAMESPACE}" get pod "${POD_NAME}" -o jsonpath='{.metadata.generateName}') + revision_hash=$(kubectl -n "${POD_NAMESPACE}" get pod "${POD_NAME}" -o jsonpath='{.metadata.labels.controller-revision-hash}') + revision=$(kubectl -n "${POD_NAMESPACE}" get controllerrevision "${gen_name}${revision_hash}" -o jsonpath='{.revision}') ds_name=${gen_name%-} latest_revision=$(kubectl -n kube-system get controllerrevision
--no-headers | awk '$2 == "daemonset.apps/'$ds_name'" {print $3}' | sort -nr | head -n1) if [ "x$latest_revision" = "x$revision" ]; then # stop ovn-controller/ovs only when the processes are in the same cgroup pid=$(/usr/share/ovn/scripts/ovn-ctl status_controller | awk '{print $NF}') - if cgroup_match $pid self; then + if cgroup_match "${pid}" self; then /usr/share/ovn/scripts/grace_stop_ovn_controller /usr/share/openvswitch/scripts/ovs-ctl stop fi @@ -64,12 +67,12 @@ trap quit EXIT iptables -V # Start ovsdb -/usr/share/openvswitch/scripts/ovs-ctl restart --no-ovs-vswitchd --system-id=random --ovsdb-server-wrapper=$DEBUG_WRAPPER +/usr/share/openvswitch/scripts/ovs-ctl restart --no-ovs-vswitchd --system-id=random --ovsdb-server-wrapper="${DEBUG_WRAPPER}" # Restrict the number of pthreads ovs-vswitchd creates to reduce the # amount of RSS it uses on hosts with many cores # https://bugzilla.redhat.com/show_bug.cgi?id=1571379 # https://bugzilla.redhat.com/show_bug.cgi?id=1572797 -if [[ `nproc` -gt 12 ]]; then +if [[ $(nproc) -gt 12 ]]; then ovs-vsctl --no-wait set Open_vSwitch . other_config:n-revalidator-threads=4 ovs-vsctl --no-wait set Open_vSwitch . other_config:n-handler-threads=10 fi @@ -87,20 +90,20 @@ ovs-appctl -t "$ovsdb_server_ctl" vlog/set reconnect:file:err function handle_underlay_bridges() { bridges=($(ovs-vsctl --no-heading --columns=name find bridge external-ids:vendor=kube-ovn)) - for br in ${bridges[@]}; do - if ! ip link show $br >/dev/null; then + for br in "${bridges[@]}"; do + if ! 
ip link show "$br" >/dev/null; then # the bridge does not exist, leave it to be handled by kube-ovn-cni echo "deleting ovs bridge $br" - ovs-vsctl --no-wait del-br $br + ovs-vsctl --no-wait del-br "$br" fi done bridges=($(ovs-vsctl --no-heading --columns=name find bridge external-ids:vendor=kube-ovn external-ids:exchange-link-name=true)) - for br in ${bridges[@]}; do + for br in "${bridges[@]}"; do - if [ -z $(ip link show $br type openvswitch 2>/dev/null || true) ]; then + if [ -z "$(ip link show "$br" type openvswitch 2>/dev/null || true)" ]; then # the bridge does not exist, leave it to be handled by kube-ovn-cni echo "deleting ovs bridge $br" - ovs-vsctl --no-wait del-br $br + ovs-vsctl --no-wait del-br "$br" fi done } @@ -108,7 +111,7 @@ function handle_underlay_bridges() { handle_underlay_bridges # Start vswitchd. restart will automatically set/unset flow-restore-wait which is not what we want -/usr/share/openvswitch/scripts/ovs-ctl restart --no-ovsdb-server --system-id=random --no-mlockall --ovs-vswitchd-wrapper=$DEBUG_WRAPPER +/usr/share/openvswitch/scripts/ovs-ctl restart --no-ovsdb-server --system-id=random --no-mlockall --ovs-vswitchd-wrapper="$DEBUG_WRAPPER" /usr/share/openvswitch/scripts/ovs-ctl --protocol=udp --dport=6081 enable-protocol function gen_conn_str { @@ -137,9 +140,9 @@ ovs-vsctl set open .
external-ids:hostname="${KUBE_NODE_NAME}" # Start ovn-controller if [[ "$ENABLE_SSL" == "false" ]]; then - /usr/share/ovn/scripts/ovn-ctl --ovn-controller-wrapper=$DEBUG_WRAPPER restart_controller + /usr/share/ovn/scripts/ovn-ctl --ovn-controller-wrapper="$DEBUG_WRAPPER" restart_controller else - /usr/share/ovn/scripts/ovn-ctl --ovn-controller-ssl-key=/var/run/tls/key --ovn-controller-ssl-cert=/var/run/tls/cert --ovn-controller-ssl-ca-cert=/var/run/tls/cacert --ovn-controller-wrapper=$DEBUG_WRAPPER restart_controller + /usr/share/ovn/scripts/ovn-ctl --ovn-controller-ssl-key=/var/run/tls/key --ovn-controller-ssl-cert=/var/run/tls/cert --ovn-controller-ssl-ca-cert=/var/run/tls/cacert --ovn-controller-wrapper="$DEBUG_WRAPPER" restart_controller fi chmod 600 /etc/openvswitch/* diff --git a/yamls/ovn-dpdk.yaml b/yamls/ovn-dpdk.yaml index 6ad28168b18..85748237316 100644 --- a/yamls/ovn-dpdk.yaml +++ b/yamls/ovn-dpdk.yaml @@ -204,9 +204,10 @@ spec: hostNetwork: true containers: - name: ovn-central - image: "kubeovn/kube-ovn:v1.10.0" + image: "kubeovn/kube-ovn:v1.12.0" imagePullPolicy: IfNotPresent - command: ["/kube-ovn/start-db.sh"] + command: + - /kube-ovn/start-db.sh securityContext: capabilities: add: ["SYS_NICE"] @@ -231,6 +232,10 @@ spec: valueFrom: fieldRef: fieldPath: status.podIPs + - name: PROBE_INTERVAL + value: "180000" + - name: OVN_LEADER_PROBE_INTERVAL + value: "5" resources: requests: cpu: 500m diff --git a/yamls/ovn-ha.yaml b/yamls/ovn-ha.yaml index c1f6e2a5c61..46f30e1031e 100644 --- a/yamls/ovn-ha.yaml +++ b/yamls/ovn-ha.yaml @@ -95,7 +95,8 @@ spec: - name: ovn-central image: "kubeovn/kube-ovn:v1.12.0" imagePullPolicy: IfNotPresent - command: ["/kube-ovn/start-db.sh"] + command: + - /kube-ovn/start-db.sh securityContext: capabilities: add: ["SYS_NICE"] @@ -122,6 +123,10 @@ spec: fieldPath: status.podIPs - name: ENABLE_BIND_LOCAL_IP value: "true" + - name: PROBE_INTERVAL + value: "180000" + - name: OVN_LEADER_PROBE_INTERVAL + value: "5" resources: 
requests: cpu: 300m @@ -238,7 +243,8 @@ spec: - name: openvswitch image: "kubeovn/kube-ovn:v1.12.0" imagePullPolicy: IfNotPresent - command: ["/kube-ovn/start-ovs.sh"] + command: + - /kube-ovn/start-ovs.sh securityContext: runAsUser: 0 privileged: true @@ -265,6 +271,10 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: OVN_REMOTE_PROBE_INTERVAL + value: "10000" + - name: OVN_REMOTE_OPENFLOW_INTERVAL + value: "180" volumeMounts: - mountPath: /var/run/netns name: host-ns