From 6abfc395ad1b0b6031970b86db504b2575951e7c Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy Date: Wed, 26 Jul 2023 21:32:01 -0400 Subject: [PATCH 1/8] added rosahcp script --- .../benchmarks/hcp-small-control-plane.json | 38 + .../benchmarks/hosted-control-plane-p90.json | 17 +- .../data-plane.json} | 1 - .../p75-control-plane.json} | 3 +- .../install/rosa-hcp/small-control-plane.json | 27 + .../config/install/rosa/ovn-osd.json | 16 - dags/openshift_nightlies/dag.py | 61 +- dags/openshift_nightlies/manifest.yaml | 54 +- dags/openshift_nightlies/models/release.py | 2 +- .../scripts/install/rosa-hcp.sh | 799 ++++++++++++++++++ .../scripts/install/rosa.sh | 698 ++------------- .../tasks/benchmarks/e2e.py | 36 +- .../tasks/install/rosa/defaults.json | 7 +- .../tasks/install/rosa/rosa.py | 13 +- .../tasks/install/rosahcp/__init__.py | 0 .../tasks/install/rosahcp/defaults.json | 45 + .../tasks/install/rosahcp/rosahcp.py | 63 ++ dags/openshift_nightlies/util/manifest.py | 31 +- dags/openshift_nightlies/util/var_loader.py | 2 +- 19 files changed, 1155 insertions(+), 758 deletions(-) create mode 100644 dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json rename dags/openshift_nightlies/config/install/{rosa/rosa-hcp-ovn-data-plane.json => rosa-hcp/data-plane.json} (97%) rename dags/openshift_nightlies/config/install/{rosa/rosa-hcp-ovn.json => rosa-hcp/p75-control-plane.json} (94%) create mode 100644 dags/openshift_nightlies/config/install/rosa-hcp/small-control-plane.json delete mode 100644 dags/openshift_nightlies/config/install/rosa/ovn-osd.json create mode 100755 dags/openshift_nightlies/scripts/install/rosa-hcp.sh create mode 100644 dags/openshift_nightlies/tasks/install/rosahcp/__init__.py create mode 100644 dags/openshift_nightlies/tasks/install/rosahcp/defaults.json create mode 100644 dags/openshift_nightlies/tasks/install/rosahcp/rosahcp.py diff --git a/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json b/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json new file mode 100644 index 000000000..694c01cd7 --- /dev/null +++ b/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json @@ -0,0 +1,38 @@ +{ + "benchmarks": [ + { + "name": "node-density", + "workload": "kube-burner-ocp-wrapper", + "trigger_rule": "all_done", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density", + "LOG_LEVEL": "debug", + "CHURN": "false" + } + }, + { + "name": "node-desnity-cni", + "workload": "kube-burner-ocp-wrapper", + "trigger_rule": "all_done", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density-cni", + "LOG_LEVEL": "debug", + "CHURN": "false" + } + }, + { + "name": "cluster-density-v2", + "workload": "kube-burner-ocp-wrapper", + "trigger_rule": "all_done", + "command": "./run.sh", + "env": { + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "500", + "LOG_LEVEL": "debug", + "CHURN": "false" + } + } + ] +} diff --git a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json index 769a6699e..ed418fbdb 100644 --- a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json +++ b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json @@ -2,21 +2,14 @@ "benchmarks": [ { "name": "cluster-density-ms-p90", - "workload": "kube-burner", + "workload": "kube-burner-ocp-wrapper", + "trigger_rule": "all_done", "command": "./run.sh", "env": { "WORKLOAD": "cluster-density-ms", - "JOB_ITERATIONS": "100", - 
"JOB_TIMEOUT": "18000", - "STEP_SIZE": "2m", - "HYPERSHIFT": "true", - "METRICS_PROFILE": "metrics-profiles/hypershift-metrics.yaml", - "QPS": "20", - "BURST": "20", - "LOG_LEVEL": "info", - "PLATFORM_ALERTS": "false", - "CLEANUP_WHEN_FINISH": "true", - "CLEANUP": "true" + "ITERATIONS": "90", + "LOG_LEVEL": "debug", + "EXTRA_FLAGS": "--churn-duration=1h --churn-percent=10 --churn-delay=30s" } } ] diff --git a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn-data-plane.json b/dags/openshift_nightlies/config/install/rosa-hcp/data-plane.json similarity index 97% rename from dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn-data-plane.json rename to dags/openshift_nightlies/config/install/rosa-hcp/data-plane.json index 380b11b35..9bfef13dc 100644 --- a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn-data-plane.json +++ b/dags/openshift_nightlies/config/install/rosa-hcp/data-plane.json @@ -1,5 +1,4 @@ { - "rosa_hcp": "true", "aws_profile": "", "aws_access_key_id": "", "aws_secret_access_key": "", diff --git a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json b/dags/openshift_nightlies/config/install/rosa-hcp/p75-control-plane.json similarity index 94% rename from dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json rename to dags/openshift_nightlies/config/install/rosa-hcp/p75-control-plane.json index 9d07be452..11110dd1f 100644 --- a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json +++ b/dags/openshift_nightlies/config/install/rosa-hcp/p75-control-plane.json @@ -1,5 +1,4 @@ { - "rosa_hcp": "true", "aws_profile": "", "aws_access_key_id": "", "aws_secret_access_key": "", @@ -16,7 +15,7 @@ "openshift_worker_instance_type": "m5.2xlarge", "machineset_metadata_label_prefix": "machine.openshift.io", "staging_mgmt_provisioner_shards": "b4bb294b-a76c-11ed-91b2-0a580a831ba1", - "number_of_hostedcluster": 2, + "number_of_hostedcluster": 10, "hcp_install_interval": 60, "extra_machinepool": [{ "name": "infra", diff --git a/dags/openshift_nightlies/config/install/rosa-hcp/small-control-plane.json b/dags/openshift_nightlies/config/install/rosa-hcp/small-control-plane.json new file mode 100644 index 000000000..c107fca5c --- /dev/null +++ b/dags/openshift_nightlies/config/install/rosa-hcp/small-control-plane.json @@ -0,0 +1,27 @@ +{ + "aws_profile": "", + "aws_access_key_id": "", + "aws_secret_access_key": "", + "aws_authentication_method": "sts", + "aws_region": "us-east-2", + "rosa_environment": "staging", + "rosa_cli_version": "container", + "ocm_cli_version": "container", + "ocm_environment": "stage", + "managed_channel_group": "nightly", + "managed_ocp_version": "latest", + "openshift_worker_count": 24, + "openshift_network_type": "OVNKubernetes", + "openshift_worker_instance_type": "m5.2xlarge", + "machineset_metadata_label_prefix": "machine.openshift.io", + "staging_mgmt_provisioner_shards": "b4bb294b-a76c-11ed-91b2-0a580a831ba1", + "number_of_hostedcluster": 1, + "hcp_install_interval": 10, + "extra_machinepool": [{ + "name": "infra", + "replica": "1", + "instance_type": "r5.xlarge", + "labels": "node-role.kubernetes.io/infra=", + "taints": "node-role.kubernetes.io/infra=:NoSchedule" + }] +} diff --git a/dags/openshift_nightlies/config/install/rosa/ovn-osd.json b/dags/openshift_nightlies/config/install/rosa/ovn-osd.json deleted file mode 100644 index 68d1c4ba3..000000000 --- a/dags/openshift_nightlies/config/install/rosa/ovn-osd.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "cluster_install_method": "osd", - "aws_profile": "", - 
"aws_access_key_id": "", - "aws_secret_access_key": "", - "rosa_environment": "staging", - "rosa_cli_version": "container", - "ocm_environment": "stage", - "managed_channel_group": "nightly", - "managed_ocp_version": "latest", - "openshift_worker_count": 27, - "openshift_network_type": "OVNKubernetes", - "openshift_worker_instance_type": "m5.2xlarge", - "machineset_metadata_label_prefix": "machine.openshift.io", - "openshift_workload_node_instance_type": "m5.2xlarge" - } diff --git a/dags/openshift_nightlies/dag.py b/dags/openshift_nightlies/dag.py index 33033e470..9e6f5dd71 100644 --- a/dags/openshift_nightlies/dag.py +++ b/dags/openshift_nightlies/dag.py @@ -14,6 +14,7 @@ from openshift_nightlies.tasks.install.openstack import jetpack from openshift_nightlies.tasks.install.baremetal import jetski, webfuse from openshift_nightlies.tasks.install.rosa import rosa +from openshift_nightlies.tasks.install.rosahcp import rosahcp from openshift_nightlies.tasks.install.rogcp import rogcp from openshift_nightlies.tasks.install.hypershift import hypershift from openshift_nightlies.tasks.install.prebuilt import initialize_cluster @@ -155,30 +156,21 @@ def _get_openshift_installer(self): class RosaNightlyDAG(AbstractOpenshiftNightlyDAG): def build(self): installer = self._get_openshift_installer() - if installer.get_type() == "rosa_hcp": - install_cluster = installer.get_install_hcp_task() - hosted_installer = self._get_hypershift_openshift_installer() - wait_task = hosted_installer.wait_task() - wait_before_cleanup = hosted_installer.wait_task(id="wait_before_cleanup") - for c_id, install_hc, postinstall_hc, cleanup_hc in install_cluster: - benchmark = self._add_benchmarks(task_group=c_id) - install_hc >> postinstall_hc >> wait_task >> benchmark >> wait_before_cleanup >> cleanup_hc + install_cluster = installer.get_install_task() + final_status = final_dag_status.get_task(self.dag) + with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks: + must_gather = self._get_scale_ci_diagnosis().get_must_gather("must-gather") + benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks() + chain(*benchmark_tasks) + # Configure must_gather as downstream of all benchmark tasks + for benchmark in benchmark_tasks: + benchmark >> must_gather + rosa_post_installation = self._get_rosa_postinstall_setup()._get_rosa_postinstallation() + if self.config.cleanup_on_success: + cleanup_cluster = installer.get_cleanup_task() + install_cluster >> rosa_post_installation >> benchmarks >> cleanup_cluster >> final_status else: - install_cluster = installer.get_install_task() - final_status = final_dag_status.get_task(self.dag) - with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks: - must_gather = self._get_scale_ci_diagnosis().get_must_gather("must-gather") - benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks() - chain(*benchmark_tasks) - # Configure must_gather as downstream of all benchmark tasks - for benchmark in benchmark_tasks: - benchmark >> must_gather - rosa_post_installation = self._get_rosa_postinstall_setup()._get_rosa_postinstallation() - if self.config.cleanup_on_success: - cleanup_cluster = installer.get_cleanup_task() - install_cluster >> rosa_post_installation >> benchmarks >> cleanup_cluster >> final_status - else: - install_cluster >> rosa_post_installation >> benchmarks >> final_status + install_cluster >> rosa_post_installation >> benchmarks >> final_status def _get_openshift_installer(self): return rosa.RosaInstaller(self.dag, self.config, 
self.release) @@ -186,15 +178,28 @@ def _get_openshift_installer(self): def _get_e2e_benchmarks(self, task_group="benchmarks"): return e2e.E2EBenchmarks(self.dag, self.config, self.release, task_group) +class RosaHCPNightlyDAG(AbstractOpenshiftNightlyDAG): + def build(self): + installer = self._get_openshift_installer() + install_cluster = installer.get_install_hcp_task() + wait_task = installer.wait_task() + wait_before_cleanup = installer.wait_task(id="wait_before_cleanup") + for c_id, install_hc, postinstall_hc, cleanup_hc in install_cluster: + benchmark = self._add_benchmarks(task_group=c_id) + install_hc >> postinstall_hc >> wait_task >> benchmark >> wait_before_cleanup >> cleanup_hc + + def _get_openshift_installer(self): + return rosahcp.RosaHCPInstaller(self.dag, self.config, self.release) + + def _get_e2e_benchmarks(self, task_group="benchmarks"): + return e2e.E2EBenchmarks(self.dag, self.config, self.release, task_group) + def _add_benchmarks(self, task_group): with TaskGroup(task_group, prefix_group_id=False, dag=self.dag) as benchmarks: benchmark_tasks = self._get_e2e_benchmarks(task_group).get_benchmarks() chain(*benchmark_tasks) return benchmarks - def _get_hypershift_openshift_installer(self): - return hypershift.HypershiftInstaller(self.dag, self.config, self.release) - class RoGCPNightlyDAG(AbstractOpenshiftNightlyDAG): def build(self): @@ -242,7 +247,7 @@ def build(self): install_mgmt_cluster >> rosa_post_installation >> install_hc >> wait_task >> benchmark def _get_openshift_installer(self): - return rosa.RosaInstaller(self.dag, self.config, self.release) + return rosahcp.RosaHCPInstaller(self.dag, self.config, self.release) def _get_hypershift_openshift_installer(self): return hypershift.HypershiftInstaller(self.dag, self.config, self.release) @@ -303,6 +308,8 @@ def build_releases(): nightly = OpenstackNightlyDAG(openshift_release, dag_config) elif openshift_release.platform == "rosa": nightly = RosaNightlyDAG(openshift_release, dag_config) + elif openshift_release.platform == "rosahcp": + nightly = RosaHCPNightlyDAG(openshift_release, dag_config) elif openshift_release.platform == "rogcp": nightly = RoGCPNightlyDAG(openshift_release, dag_config) elif openshift_release.platform == "hypershift": diff --git a/dags/openshift_nightlies/manifest.yaml b/dags/openshift_nightlies/manifest.yaml index 0fda36bc6..6dfc0d24c 100644 --- a/dags/openshift_nightlies/manifest.yaml +++ b/dags/openshift_nightlies/manifest.yaml @@ -103,16 +103,6 @@ platforms: config: install: rosa/iam-ovn.json benchmarks: control-plane.json - - name: osd-ovn-control-plane - schedule: "30 12 * * 1,3,5" - config: - install: rosa/ovn-osd.json - benchmarks: control-plane.json - - name: osd-ovn-data-plane - schedule: "30 1 * * 1,3,5" # an hour gap for OSD to avoid OsdCcsAdmin key limit - config: - install: rosa/ovn-osd.json - benchmarks: data-plane-mgs.json - name: ocm-api-load schedule: "None" config: @@ -123,16 +113,25 @@ platforms: config: install: rosa/upgrade.json benchmarks: upgrade.json - - name: rosa-hcp-control-plane + + rosahcp: + versions: ["4.12", "4.13"] + variants: + - name: p75-control-plane schedule: "0 12 * * 3" config: - install: rosa/rosa-hcp-ovn.json + install: rosa-hcp/p75-control-plane.json benchmarks: hosted-control-plane-p75.json - - name: rosa-hcp-data-plane + - name: data-plane-v2 schedule: "1 12 * * 3" config: - install: rosa/rosa-hcp-ovn-data-plane.json + install: rosa-hcp/data-plane.json benchmarks: data-plane-v2.json + - name: small-control-plane + schedule: "2 12 * * 3" + config: 
+ install: rosa-hcp/small-control-plane.json + benchmarks: hcp-small-control-plane.json rogcp: versions: ["4.12", "4.13"] @@ -148,33 +147,6 @@ platforms: install: rogcp/ovn.json benchmarks: data-plane-mgs.json - hypershift: - versions: ["4.12", "4.13"] - variants: - - name: management-control-plane - schedule: "30 3 * * 1,3,5" # an hour gap for OSD to avoid OsdCcsAdmin key limit - config: - install: hypershift/none-type.json - benchmarks: management-control-plane.json - - name: ovn-control-plane-p75 - schedule: "30 4 * * 1,3,5" # an hour gap for OSD to avoid OsdCcsAdmin key limit - config: - install: hypershift/ovn-p75.json - benchmarks: hosted-control-plane-p75.json - - name: ovn-control-plane-p90 - schedule: "30 5 * * 1,3,5" # an hour gap for OSD to avoid OsdCcsAdmin key limit - config: - install: hypershift/ovn-p90.json - benchmarks: hosted-control-plane-p90.json - - name: chaos-ovn-control-plane-p75 - config: - install: hypershift/ovn-p75.json - benchmarks: hosted-control-plane-chaos-p75.json - - name: chaos-ovn-control-plane-p90 - config: - install: hypershift/ovn-p90.json - benchmarks: hosted-control-plane-chaos-p90.json - prebuilt: versions: ["4.x"] variants: diff --git a/dags/openshift_nightlies/models/release.py b/dags/openshift_nightlies/models/release.py index 67f6c250e..831deaf5f 100644 --- a/dags/openshift_nightlies/models/release.py +++ b/dags/openshift_nightlies/models/release.py @@ -41,7 +41,7 @@ def _generate_cluster_name(self): else: cluster_name = f"{git_user}-{git_branch}-{release_name}" - if self.platform == 'rosa' or self.platform == 'rogcp' or self.platform == 'hypershift': + if self.platform == 'rosa' or self.platform == 'rogcp' or self.platform == 'hypershift' or self.platform == 'rosahcp': #Only 15 chars are allowed cluster_version = str(self.version).replace(".","") return "perf-"+md5(cluster_name.encode("ascii")).hexdigest()[:3] diff --git a/dags/openshift_nightlies/scripts/install/rosa-hcp.sh b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh new file mode 100755 index 000000000..4c995e62e --- /dev/null +++ b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh @@ -0,0 +1,799 @@ +#!/bin/bash +# shellcheck disable=SC2155 +set -ex + +export INDEXDATA=() + +while getopts v:a:j:o: flag +do + case "${flag}" in + v) version=${OPTARG};; + j) json_file=${OPTARG};; + o) operation=${OPTARG};; + *) echo "ERROR: invalid parameter ${flag}" ;; + esac +done + +_get_cluster_id(){ + if [[ $INSTALL_METHOD == "osd" ]]; then + echo "$(ocm list clusters --no-headers --columns id $1)" + else + echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .id')" + fi +} + +_download_kubeconfig(){ + ocm get /api/clusters_mgmt/v1/clusters/$1/credentials | jq -r .kubeconfig > $2 +} + +_get_cluster_status(){ + if [[ $INSTALL_METHOD == "osd" ]]; then + echo "$(ocm list clusters --no-headers --columns state $1 | xargs)" + else + echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .status.state')" + fi +} + +_wait_for_nodes_ready(){ + _download_kubeconfig "$(_get_cluster_id $1)" ./kubeconfig + export KUBECONFIG=./kubeconfig + ALL_READY_ITERATIONS=0 + ITERATIONS=0 + NODES_COUNT=$2 + # 30 seconds per node, waiting for all nodes ready to finalize + while [ ${ITERATIONS} -le $((${NODES_COUNT}*5)) ] ; do + NODES_READY_COUNT=$(oc get nodes -l $3 | grep " Ready " | wc -l) + if [ ${NODES_READY_COUNT} -ne ${NODES_COUNT} ] ; then + echo "WARNING: ${ITERATIONS}/${NODES_COUNT} iterations. ${NODES_READY_COUNT}/${NODES_COUNT} $3 nodes ready. 
Waiting 30 seconds for next check" + # ALL_READY_ITERATIONS=0 + ITERATIONS=$((${ITERATIONS}+1)) + sleep 30 + else + if [ ${ALL_READY_ITERATIONS} -eq 2 ] ; then + echo "INFO: ${ALL_READY_ITERATIONS}/5. All nodes ready, continuing process" + return 0 + else + echo "INFO: ${ALL_READY_ITERATIONS}/5. All nodes ready. Waiting 60 seconds for next check" + ALL_READY_ITERATIONS=$((${ALL_READY_ITERATIONS}+1)) + sleep 60 + fi + fi + done + END_CLUSTER_STATUS="Ready. No Workers" + echo "ERROR: Not all $3 nodes (${NODES_READY_COUNT}/${NODES_COUNT}) are ready after about $((${NODES_COUNT}*3)) minutes, dumping oc get nodes..." + oc get nodes + exit 1 +} + +_aws_cmd(){ + ITR=0 + while [ $ITR -le 30 ]; do + if [[ "$(aws ec2 $1 2>&1)" == *"error"* ]]; then + echo "Failed to $1, retrying after 30 seconds" + ITR=$(($ITR+1)) + sleep 10 + else + return 0 + fi + done + echo "Failed to $1 after 10 minutes of multiple retries" + exit 1 +} + +_login_check(){ + echo "Trying to oc login with password" + ITR=1 + START_TIMER=$(date +%s) + while [ $ITR -le 100 ]; do + if [[ "$(oc login $1 --username cluster-admin --password $2 --insecure-skip-tls-verify=true --request-timeout=30s 2>&1)" == *"failed"* ]]; then + echo "Attempt $ITR: Failed to login $1, retrying after 5 seconds" + ITR=$(($ITR+1)) + sleep 5 + RECHECK=1 + else + if [[ $RECHECK -eq 10 ]]; then + CURRENT_TIMER=$(date +%s) + # Time since rosa cluster is ready until all nodes are ready + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("cluster_admin_login-${DURATION}") + _adm_logic_check $1 $2 + return 0 + else + echo "Rechecking login for $((10-$RECHECK)) more times" + RECHECK=$(($RECHECK+1)) + sleep 1 + fi + fi + done + END_CLUSTER_STATUS="Ready. Not Access" + echo "Failed to login after 100 attempts with 5 sec interval" +} + +_adm_logic_check(){ + ITR=1 + START_TIMER=$(date +%s) + while [ $ITR -le 100 ]; do + oc login $1 --username cluster-admin --password $2 --insecure-skip-tls-verify=true --request-timeout=30s + CHECK=$(oc adm top images 2>&1 > /dev/null) + if [[ $? != 0 ]]; then + echo "Attempt $ITR: Failed to login $1, retrying after 5 seconds" + ITR=$(($ITR+1)) + sleep 5 + else + CURRENT_TIMER=$(date +%s) + # Time since rosa cluster is ready until all nodes are ready + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("cluster_oc_adm-${DURATION}") + return 0 + fi + done + END_CLUSTER_STATUS="Ready. 
Not Access" + echo "Failed to execute oc adm commands after 100 attempts with 5 sec interval" +} + +_balance_infra(){ + if [[ $1 == "prometheus-k8s" ]] ; then + echo "Initiate migration of prometheus componenets to infra nodepools" + oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s + oc get sts prometheus-k8s -n openshift-monitoring + echo "Restart stateful set pods" + oc rollout restart -n openshift-monitoring statefulset/prometheus-k8s + echo "Wait till they are completely restarted" + oc rollout status -n openshift-monitoring statefulset/prometheus-k8s + echo "Check pods status again and the hosting nodes" + oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s + else + echo "Initiate migration of ingress router-default pods to infra nodepools" + echo "Add toleration to use infra nodes" + oc patch ingresscontroller -n openshift-ingress-operator default --type merge --patch '{"spec":{"nodePlacement":{"nodeSelector":{"matchLabels":{"node-role.kubernetes.io/infra":""}},"tolerations":[{"effect":"NoSchedule","key":"node-role.kubernetes.io/infra","operator":"Exists"}]}}}' + echo "Wait till it gets rolled out" + sleep 60 + oc get pods -n openshift-ingress -o wide + fi +} + +_check_infra(){ + TRY=0 + while [ $TRY -le 3 ]; do # Attempts three times to migrate pods + FLAG_ERROR="" + _balance_infra $1 + for node in $(oc get pods -n $2 -o wide | grep -i $1 | grep -i running | awk '{print$7}'); + do + if [[ $(oc get nodes | grep infra | awk '{print$1}' | grep $node) != "" ]]; then + echo "$node is an infra node" + else + echo "$1 pod on $node is not an infra node, retrying" + FLAG_ERROR=true + fi + done + if [[ $FLAG_ERROR == "" ]]; then return 0; else TRY=$((TRY+1)); fi + done + echo "Failed to move $1 pods in $2 namespace" + exit 1 +} + +_wait_for_extra_nodes_ready(){ + export NODE_LABLES=$(cat ${json_file} | jq -r .extra_machinepool[].labels) + for label in $NODE_LABLES; + do + REPLICA=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.labels == '\"$label\"')'.replica) + NODES_COUNT=$((REPLICA*3)) + if [[ $label == *"infra"* ]] ; then NODES_COUNT=$((REPLICA*2)); fi + _wait_for_nodes_ready $CLUSTER_NAME $NODES_COUNT $label + if [[ $label == *"infra"* ]] ; then + _check_infra prometheus-k8s openshift-monitoring + _check_infra router openshift-ingress + fi + done + return 0 +} + +_add_machinepool(){ + export MACHINEPOOLS=$(cat ${json_file} | jq -r .extra_machinepool[].name) + for mcp in $MACHINEPOOLS; + do + echo "Add an extra machinepool - $mcp to cluster" + ZONES="a b c" + MC_NAME=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.name) + REPLICA=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.replica) + INS_TYPE=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.instance_type) + LABELS=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.labels) + TAINTS=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.taints) + if [[ $MC_NAME == *"infra"* ]]; then ZONES="a b"; fi + for ZONE in $ZONES; + do + if [[ $(rosa list machinepool --cluster "$(_get_cluster_id ${CLUSTER_NAME})" | grep $MC_NAME-$ZONE) == "" ]]; then + rosa create machinepool --cluster "$(_get_cluster_id ${CLUSTER_NAME})" --name $MC_NAME-$ZONE --instance-type ${INS_TYPE} --replicas $REPLICA --availability-zone $AWS_REGION$ZONE --labels $LABELS --taints $TAINTS + fi + done + done + _wait_for_extra_nodes_ready + return 0 +} + 
+_wait_for_cluster_ready(){ + START_TIMER=$(date +%s) + echo "INFO: Installation starts at $(date -d @${START_TIMER})" + echo "INFO: Waiting about 180 iterations, counting only when cluster enters on installing status" + ITERATIONS=0 + PREVIOUS_STATUS="" + # 90 iterations, sleeping 60 seconds, 1.5 hours of wait + # Only increasing iterations on installing status + while [ ${ITERATIONS} -le 90 ] ; do + CLUSTER_STATUS=$(_get_cluster_status $1) + CURRENT_TIMER=$(date +%s) + if [ ${CLUSTER_STATUS} != ${PREVIOUS_STATUS} ] && [ ${PREVIOUS_STATUS} != "" ]; then + # When detected a status change, index timer and update start time for next status change + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("${PREVIOUS_STATUS}"-"${DURATION}") + START_TIMER=${CURRENT_TIMER} + echo "INFO: Cluster status changed to ${CLUSTER_STATUS}" + if [ ${CLUSTER_STATUS} == "error" ] ; then + if [[ $INSTALL_METHOD == "osd" ]]; then + echo "ERROR: Cluster $1 not installed after 1.5 hours.." + else + rosa logs install -c $1 + rosa describe cluster -c $1 + fi + return 1 + fi + fi + if [ ${CLUSTER_STATUS} == "ready" ] ; then + END_CLUSTER_STATUS="Ready" + echo "Set end time of prom scrape" + export END_TIME=$(date +"%s") + START_TIMER=$(date +%s) + _wait_for_nodes_ready $1 ${COMPUTE_WORKERS_NUMBER} "node-role.kubernetes.io/worker" + CURRENT_TIMER=$(date +%s) + # Time since rosa cluster is ready until all nodes are ready + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("day2operations-${DURATION}") + _add_machinepool $URL $PASSWORD + if [[ $INSTALL_METHOD == "osd" ]]; then + echo "INFO: Cluster and nodes on ready status.." + else + echo "INFO: Cluster and nodes on ready status at ${CURRENT_TIMER}, dumping installation logs..." + rosa logs install -c $1 + rosa describe cluster -c $1 + fi + return 0 + elif [ ${CLUSTER_STATUS} == "installing" ] ; then + echo "INFO: ${ITERATIONS}/90. Cluster on ${CLUSTER_STATUS} status, waiting 60 seconds for next check" + ITERATIONS=$((${ITERATIONS}+1)) + sleep 60 + else + # Sleep 1 to try to capture as much as posible states before installing + sleep 1 + fi + PREVIOUS_STATUS=${CLUSTER_STATUS} + done + if [[ $INSTALL_METHOD == "osd" ]]; then + echo "ERROR: Cluster $1 not installed after 3 hours.." + else + END_CLUSTER_STATUS="Not Ready" + echo "ERROR: Cluster $1 not installed after 90 iterations, dumping installation logs..." 
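+    # include the ROSA installer log and cluster description in the task output before failing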
+ rosa logs install -c $1 + rosa describe cluster -c $1 + fi + exit 1 +} + +_create_aws_vpc(){ + + echo "Create Internet Gateway" + aws ec2 create-internet-gateway --tag-specifications ResourceType=internet-gateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=igw-$CLUSTER_NAME}]" --output json + export IGW=$(aws ec2 describe-internet-gateways --filters "Name=tag:Name,Values=igw-$CLUSTER_NAME" --output json | jq -r ".InternetGateways[0].InternetGatewayId") + + echo "Create VPC and attach internet gateway" + aws ec2 create-vpc --cidr-block 10.0.0.0/16 --tag-specifications ResourceType=vpc,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=vpc-$CLUSTER_NAME}]" --output json + export VPC=$(aws ec2 describe-vpcs --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r '.Vpcs[0].VpcId') + + aws ec2 modify-vpc-attribute --vpc-id $VPC --enable-dns-support "{\"Value\":true}" + aws ec2 modify-vpc-attribute --vpc-id $VPC --enable-dns-hostnames "{\"Value\":true}" + aws ec2 attach-internet-gateway --vpc-id $VPC --internet-gateway-id $IGW + + aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-rt-table-$CLUSTER_NAME}]" --output json + export PUB_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') + aws ec2 create-route --route-table-id $PUB_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $IGW + + ITR=0 + export ALL_PRI_RT_TB="" + for ZONE in a b c; + do + ITR=$((ITR+1)) + echo "Allocate Elastic IP" + aws ec2 allocate-address --tag-specifications ResourceType=elastic-ip,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=eip-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export E_IP=$(aws ec2 describe-addresses --filters "Name=tag:Name,Values=eip-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Addresses[0].AllocationId") + + echo "Create Subnets and Route tables" + aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.$ITR.0/24 --availability-zone $AWS_REGION$ZONE --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + aws ec2 create-nat-gateway --subnet-id $PUB_SUB --allocation-id $E_IP --tag-specifications ResourceType=natgateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available" or .State == "pending")' | jq -r ".NatGatewayId") + echo "Wait until NatGateway $NGW is available" + aws ec2 wait nat-gateway-available --nat-gateway-ids $NGW + aws ec2 associate-route-table --route-table-id $PUB_RT_TB --subnet-id $PUB_SUB + + aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.$((ITR+10)).0/24 --availability-zone $AWS_REGION$ZONE --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export PRI_SUB=$(aws ec2 describe-subnets --filters 
"Name=tag:Name,Values=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].RouteTableId') + export ALL_PRI_RT_TB="${ALL_PRI_RT_TB} ${PRI_RT_TB}" + aws ec2 associate-route-table --route-table-id $PRI_RT_TB --subnet-id $PRI_SUB + aws ec2 create-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $NGW + done + + echo "Create private VPC endpoint to S3" + aws ec2 create-vpc-endpoint --vpc-id $VPC --service-name com.amazonaws.$AWS_REGION.s3 --route-table-ids $ALL_PRI_RT_TB --tag-specifications ResourceType=vpc-endpoint,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=vpce-$CLUSTER_NAME}]" +} + +_delete_aws_vpc(){ + echo "Delete Subnets, Routes, Gateways, VPC if exists" + export VPC=$(aws ec2 describe-vpcs --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r '.Vpcs[0].VpcId') + if [ $VPC != null ]; then + echo "Delete VPC Endpoint" + export VPCE=$(aws ec2 describe-vpc-endpoints --filters "Name=tag:Name,Values=vpce-$CLUSTER_NAME" --output json | jq -r '.VpcEndpoints[0].VpcEndpointId') + if [ $VPCE != null ]; then _aws_cmd "delete-vpc-endpoints --vpc-endpoint-ids $VPCE"; fi + + export ELB=$(aws elb describe-load-balancers --output json | jq -r '.LoadBalancerDescriptions[]'| jq -r 'select(.VPCId == '\"${VPC}\"')' | jq -r '.LoadBalancerName') + if [ $ELB != "" ]; then aws elb delete-load-balancer --load-balancer-name $ELB; fi + + for ZONE in a b c; + do + echo "Delete Subnets and Route tables" + export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].RouteTableId') + export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].Associations[0].RouteTableAssociationId') + export PRI_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + + if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0"; fi + if [ $RT_TB_ASSO_ID != null ]; then _aws_cmd "disassociate-route-table --association-id $RT_TB_ASSO_ID"; fi + if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PRI_RT_TB"; fi + if [ $PRI_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PRI_SUB"; fi + + export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].Associations[].RouteTableAssociationId') + export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available")' | jq -r ".NatGatewayId") + export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + export E_IP=$(aws ec2 describe-addresses 
--filters "Name=tag:Name,Values=eip-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Addresses[0].AllocationId") + + if [ $RT_TB_ASSO_ID != null ]; then for _id in $RT_TB_ASSO_ID; do _aws_cmd "disassociate-route-table --association-id $_id"; done; fi + if [ $NGW != null ]; then _aws_cmd "delete-nat-gateway --nat-gateway-id $NGW"; fi + if [ $PUB_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PUB_SUB"; fi + if [ $E_IP != null ]; then _aws_cmd "release-address --allocation-id $E_IP"; fi + done + + export PUB_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') + + if [ $PUB_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PUB_RT_TB --destination-cidr-block 0.0.0.0/0"; fi + if [ $PUB_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PUB_RT_TB"; fi + + export IGW=$(aws ec2 describe-internet-gateways --filters "Name=tag:Name,Values=igw-$CLUSTER_NAME" --output json | jq -r ".InternetGateways[0].InternetGatewayId") + if [ $IGW != null ]; then _aws_cmd "detach-internet-gateway --internet-gateway-id $IGW --vpc-id $VPC"; fi + if [ $IGW != null ]; then _aws_cmd "delete-internet-gateway --internet-gateway-id $IGW"; fi + + echo "Delete Security Group Rules" + for g in $(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC" --output json | jq -r ".SecurityGroups[].GroupId"); + do + for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == false)" | jq -r ".SecurityGroupRuleId"); + do + aws ec2 revoke-security-group-ingress --security-group-rule-ids $r --group-id $g + done + + for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == true)" | jq -r ".SecurityGroupRuleId"); + do + aws ec2 revoke-security-group-egress --security-group-rule-ids $r --group-id $g + done + done + + for g in $(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC" --output json | jq -r ".SecurityGroups[]" | jq -r 'select(.GroupName != "default")' | jq -r ".GroupId"); + do + echo "Delete Security Groups $g" + _aws_cmd "delete-security-group --group-id $g" + done + + echo "Delete VPC $VPC" + _aws_cmd "delete-vpc --vpc-id $VPC" + fi +} + +_oidc_config(){ + echo "${1} OIDC config, with prefix ${2}" + if [[ $1 == "create" ]]; then + echo "${1} OIDC config" + rosa create oidc-config --mode=auto --managed=false --prefix ${2} -y + export OIDC_CONFIG=$(rosa list oidc-config | grep ${2} | awk '{print$1}') + else + export OIDC_CONFIG=$(rosa list oidc-config | grep ${2} | awk '{print$1}') + if [ ! 
-z $OIDC_CONFIG ]; then rosa delete oidc-config --mode=auto --oidc-config-id ${OIDC_CONFIG} -y || true; fi # forcing exit 0, as this command may file if it is a shared oidc config + fi +} + +_get_sc_mc_details(){ + if [ -z $SVC_CLUSTER_NAME ]; then + echo "Find Service Cluster" + export SVC_CLUSTER_NAME=$(ocm describe cluster ${CLUSTER_NAME} | grep "Service Cluster" | awk '{print$3}') + fi + if [ -z $MGMT_CLUSTER_NAME ]; then + export MGMT_CLUSTER_NAME=$(ocm describe cluster ${CLUSTER_NAME} | grep "Management Cluster" | awk '{print$3}') + fi + echo "Read Management cluster details" + export MGMT_CLUSTER_DETAILS=$(ocm get /api/clusters_mgmt/v1/clusters | jq -r ".items[]" | jq -r 'select(.name == '\"$MGMT_CLUSTER_NAME\"')') + export NUMBER_OF_HC=$(cat ${json_file} | jq -r .number_of_hostedcluster) +} + +setup(){ + mkdir /home/airflow/workspace + cd /home/airflow/workspace + export PATH=$PATH:/usr/bin:/usr/local/go/bin + export HOME=/home/airflow + export AWS_REGION=$(cat ${json_file} | jq -r .aws_region) + export AWS_ACCOUNT_ID=$(cat ${json_file} | jq -r .aws_account_id) + export AWS_ACCESS_KEY_ID=$(cat ${json_file} | jq -r .aws_access_key_id) + export AWS_SECRET_ACCESS_KEY=$(cat ${json_file} | jq -r .aws_secret_access_key) + export AWS_AUTHENTICATION_METHOD=$(cat ${json_file} | jq -r .aws_authentication_method) + export ROSA_ENVIRONMENT=$(cat ${json_file} | jq -r .rosa_environment) + export ROSA_TOKEN=$(cat ${json_file} | jq -r .rosa_token_${ROSA_ENVIRONMENT}) + export MANAGED_OCP_VERSION=$(cat ${json_file} | jq -r .managed_ocp_version) + export MANAGED_CHANNEL_GROUP=$(cat ${json_file} | jq -r .managed_channel_group) + export CLUSTER_NAME=$(cat ${json_file} | jq -r .openshift_cluster_name) + export COMPUTE_WORKERS_NUMBER=$(cat ${json_file} | jq -r .openshift_worker_count) + export NETWORK_TYPE=$(cat ${json_file} | jq -r .openshift_network_type) + export ES_SERVER=$(cat ${json_file} | jq -r .es_server) + export STAGE_CONFIG="" + export MGMT_CLUSTER_NAME=$(cat ${json_file} | jq -r .staging_mgmt_cluster_name) + export SVC_CLUSTER_NAME=$(cat ${json_file} | jq -r .staging_svc_cluster_name) + export STAGE_PROV_SHARD=$(cat ${json_file} | jq -r .staging_mgmt_provisioner_shards) + export OIDC_PREFIX=$(cat ${json_file} | jq -r .openshift_cluster_name) + export CLUSTER_NAME="${CLUSTER_NAME}-${HOSTED_ID}" # perf-as3-hcp-1, perf-as3-hcp-2.. + export KUBECONFIG_NAME=$(echo $KUBECONFIG_NAME | awk -F-kubeconfig '{print$1}')-$HOSTED_ID-kubeconfig + export KUBEADMIN_NAME=$(echo $KUBEADMIN_NAME | awk -F-kubeadmin '{print$1}')-$HOSTED_ID-kubeadmin + UUID=$(echo $AIRFLOW_CTX_DAG_RUN_ID | base64 | cut -c 1-32 ) + export UUID=${UUID} + export OCM_CLI_VERSION=$(cat ${json_file} | jq -r .ocm_cli_version) + if [[ ${OCM_CLI_VERSION} != "container" ]]; then + OCM_CLI_FORK=$(cat ${json_file} | jq -r .ocm_cli_fork) + git clone -q --depth=1 --single-branch --branch ${OCM_CLI_VERSION} ${OCM_CLI_FORK} + pushd ocm-cli + sudo PATH=$PATH:/usr/bin:/usr/local/go/bin make + sudo mv ocm /usr/local/bin/ + popd + fi + if [[ $INSTALL_METHOD == "osd" ]]; then + echo "Clean-up existing OSD access keys.." + AWS_KEY=$(aws iam list-access-keys --user-name OsdCcsAdmin --output text --query 'AccessKeyMetadata[*].AccessKeyId') + LEN_AWS_KEY=`echo $AWS_KEY | wc -w` + if [[ ${LEN_AWS_KEY} -eq 2 ]]; then + aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id `printf ${AWS_KEY[0]}` + fi + echo "Create new OSD access key.." 
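+    # create a temporary OsdCcsAdmin access key for this run; it is revoked again in postinstall/cleanup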
+ export ADMIN_KEY=$(aws iam create-access-key --user-name OsdCcsAdmin) + export AWS_ACCESS_KEY_ID=$(echo $ADMIN_KEY | jq -r '.AccessKey.AccessKeyId') + export AWS_SECRET_ACCESS_KEY=$(echo $ADMIN_KEY | jq -r '.AccessKey.SecretAccessKey') + ocm login --url=https://api.stage.openshift.com --token="${ROSA_TOKEN}" + ocm whoami + sleep 60 # it takes a few sec for new access key + echo "Check AWS Username..." + aws iam get-user | jq -r .User.UserName + else + export ROSA_CLI_VERSION=$(cat ${json_file} | jq -r .rosa_cli_version) + if [[ ${ROSA_CLI_VERSION} != "container" ]]; then + ROSA_CLI_FORK=$(cat ${json_file} | jq -r .rosa_cli_fork) + git clone -q --depth=1 --single-branch --branch ${ROSA_CLI_VERSION} ${ROSA_CLI_FORK} + pushd rosa + make + sudo mv rosa /usr/local/bin/ + popd + fi + ocm login --url=https://api.stage.openshift.com --token="${ROSA_TOKEN}" + ocm whoami + rosa login --env=${ROSA_ENVIRONMENT} + rosa whoami + rosa verify quota + rosa verify permissions + if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -1) + elif [ "${MANAGED_OCP_VERSION}" == "prelatest" ] ; then + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -2 | tail -1) + else + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | grep ^${MANAGED_OCP_VERSION}$) + fi + [ -z "${ROSA_VERSION}" ] && echo "ERROR: Image not found for version (${version}) on ROSA ${MANAGED_CHANNEL_GROUP} channel group" && exit 1 + return 0 + fi +} + +install(){ + export COMPUTE_WORKERS_TYPE=$(cat ${json_file} | jq -r .openshift_worker_instance_type) + export CLUSTER_AUTOSCALE=$(cat ${json_file} | jq -r .cluster_autoscale) + export OIDC_CONFIG=$(cat ${json_file} | jq -r .oidc_config) + if [[ $INSTALL_METHOD == "osd" ]]; then + if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then + export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${version} | sort -rV | head -1) + elif [ "${MANAGED_OCP_VERSION}" == "prelatest" ] ; then + export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${version} | sort -rV | head -2 | tail -1) + else + export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${MANAGED_OCP_VERSION}) + fi + [ -z ${OCM_VERSION} ] && echo "ERROR: Image not found for version (${version}) on OCM ${MANAGED_CHANNEL_GROUP} channel group" && exit 1 + if [[ $CLUSTER_AUTOSCALE == "true" ]]; then + export MIN_COMPUTE_WORKERS_NUMBER=$(cat ${json_file} | jq -r .min_openshift_worker_count) + export CLUSTER_SIZE="--enable-autoscaling --min-replicas ${MIN_COMPUTE_WORKERS_NUMBER} --max-replicas ${COMPUTE_WORKERS_NUMBER}" + else + export CLUSTER_SIZE="--compute-nodes ${COMPUTE_WORKERS_NUMBER}" + fi + ocm create cluster --ccs --provider aws --region ${AWS_REGION} --aws-account-id ${AWS_ACCOUNT_ID} --aws-access-key-id ${AWS_ACCESS_KEY_ID} --aws-secret-access-key ${AWS_SECRET_ACCESS_KEY} --channel-group ${MANAGED_CHANNEL_GROUP} --version ${OCM_VERSION} --multi-az --compute-machine-type ${COMPUTE_WORKERS_TYPE} --network-type ${NETWORK_TYPE} ${CLUSTER_NAME} ${CLUSTER_SIZE} + else + export INSTALLATION_PARAMS="" + export ROSA_HCP_PARAMS="" + if [ 
$AWS_AUTHENTICATION_METHOD == "sts" ] ; then + INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --sts -m auto --yes" + fi + _create_aws_vpc + echo "Set start time of prom scrape" + export START_TIME=$(date +"%s") + if [ $STAGE_PROV_SHARD != "" ]; then + STAGE_CONFIG="--properties provision_shard_id:${STAGE_PROV_SHARD}" + fi + ALL_SUBNETS=$(aws ec2 describe-subnets --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r ".Subnets[].SubnetId") + SUBNETS_IDS="" + for _ID in ${ALL_SUBNETS}; + do + if [[ ${SUBNETS_IDS} == "" ]]; then SUBNETS_IDS=${_ID}; else SUBNETS_IDS=${SUBNETS_IDS}","${_ID}; fi + done + ROSA_HCP_PARAMS="--hosted-cp ${STAGE_CONFIG} --subnet-ids ${SUBNETS_IDS} --machine-cidr 10.0.0.0/16" + export OIDC_CONFIG=$(rosa list oidc-config | grep $OIDC_PREFIX | awk '{print$1}') + if [ -z $OIDC_CONFIG ]; then _oidc_config create $OIDC_PREFIX; fi + ROSA_HCP_PARAMS="${ROSA_HCP_PARAMS} --oidc-config-id ${OIDC_CONFIG}" + rosa create cluster --tags=User:${GITHUB_USERNAME} --cluster-name ${CLUSTER_NAME} --version "${ROSA_VERSION}" --channel-group=${MANAGED_CHANNEL_GROUP} --compute-machine-type ${COMPUTE_WORKERS_TYPE} --replicas ${COMPUTE_WORKERS_NUMBER} --network-type ${NETWORK_TYPE} ${INSTALLATION_PARAMS} ${ROSA_HCP_PARAMS} + fi + postinstall + return 0 +} + +postinstall(){ + _wait_for_cluster_ready ${CLUSTER_NAME} + # sleeping to address issue #324 + sleep 120 + export EXPIRATION_TIME=$(cat ${json_file} | jq -r .rosa_expiration_time) + _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig + _get_sc_mc_details + echo "Index Managment cluster info" + index_metadata "management" + _download_kubeconfig "$(ocm list clusters --no-headers --columns id ${MGMT_CLUSTER_NAME})" ./mgmt_kubeconfig + kubectl delete secret staging-mgmt-cluster-kubeconfig || true + kubectl create secret generic staging-mgmt-cluster-kubeconfig --from-file=config=./mgmt_kubeconfig + + unset KUBECONFIG + kubectl delete secret ${KUBECONFIG_NAME} || true + kubectl create secret generic ${KUBECONFIG_NAME} --from-file=config=./kubeconfig + if [[ $INSTALL_METHOD == "osd" ]]; then + export PASSWORD=$(echo ${CLUSTER_NAME} | md5sum | awk '{print $1}') + ocm create idp -n localauth -t htpasswd --username kubeadmin --password ${PASSWORD} -c ${CLUSTER_NAME} + ocm create user kubeadmin -c "$(_get_cluster_id ${CLUSTER_NAME})" --group=cluster-admins + # set expiration time + EXPIRATION_STRING=$(date -d "${EXPIRATION_TIME} minutes" '+{"expiration_timestamp": "%FT%TZ"}') + ocm patch /api/clusters_mgmt/v1/clusters/"$(_get_cluster_id ${CLUSTER_NAME})" <<< ${EXPIRATION_STRING} + echo "Cluster is ready, deleting OSD access keys now.." 
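+    # revoke the temporary OsdCcsAdmin key created in setup(); ignore errors if it is already gone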
+ aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true + kubectl delete secret ${KUBEADMIN_NAME} || true + kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} + else + URL=$(rosa describe cluster -c $CLUSTER_NAME --output json | jq -r ".api.url") + START_TIMER=$(date +%s) + PASSWORD=$(rosa create admin -c "$(_get_cluster_id ${CLUSTER_NAME})" -y 2>/dev/null | grep "oc login" | awk '{print $7}') + CURRENT_TIMER=$(date +%s) + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("cluster_admin_create-${DURATION}") + kubectl delete secret ${KUBEADMIN_NAME} || true + kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} + _login_check $URL $PASSWORD + # set expiration to 24h + rosa edit cluster -c "$(_get_cluster_id ${CLUSTER_NAME})" --expiration=${EXPIRATION_TIME}m + fi + index_metadata "cluster-install" + return 0 +} + +index_metadata(){ + if [[ ! "${INDEXDATA[*]}" =~ "cleanup" ]] ; then + _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig + export KUBECONFIG=./kubeconfig + fi + if [[ $INSTALL_METHOD == "osd" ]]; then + export PLATFORM="AWS-MS" + export CLUSTER_VERSION="${OCM_VERSION}" + else + export PLATFORM="ROSA" + export CLUSTER_VERSION="${ROSA_VERSION}" + fi + if [ "$1" == "management" ]; then + METADATA=$(cat << EOF +{ +"uuid" : "${UUID}", +"aws_authentication_method": "${AWS_AUTHENTICATION_METHOD}", +"version": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".openshift_version")", +"infra_id": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".infra_id")", +"cluster_name": "$MGMT_CLUSTER_NAME", +"cluster_id": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".id")", +"base_domain": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".dns.base_domain")", +"aws_region": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".region.id")", +"workers": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".nodes.autoscale_compute.max_replicas")", +"workers_type": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".nodes.compute_machine_type.id")", +"network_type": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".network.type")", +"install_method": "rosa", +"provision_shard": "$STAGE_PROV_SHARD", +"hostedclusters": "$NUMBER_OF_HC" +} +EOF +) + elif [ "$1" == "cluster-install" ]; then + METADATA=$(cat << EOF +{ +"uuid" : "${UUID}", +"aws_authentication_method": "${AWS_AUTHENTICATION_METHOD}", +"mgmt_cluster_name": "$MGMT_CLUSTER_NAME", +"workers": "$COMPUTE_WORKERS_NUMBER", +"cluster_name": "${CLUSTER_NAME}", +"cluster_id": "$(_get_cluster_id ${CLUSTER_NAME})", +"network_type": "${NETWORK_TYPE}", +"version": "${CLUSTER_VERSION}", +"operation": "install", +"install_method": "rosa", +"status": "$END_CLUSTER_STATUS", +"timestamp": "$(date +%s%3N)" +EOF +) + INSTALL_TIME=0 + TOTAL_TIME=0 + WORKER_READY_TIME=0 + for i in "${INDEXDATA[@]}" ; do IFS="-" ; set -- $i + METADATA="${METADATA}, \"$1\":\"$2\"" + if [ $1 != "day2operations" ] && [ $1 != "login" ] ; then + INSTALL_TIME=$((${INSTALL_TIME} + $2)) + elif [ $1 == "day2operations" ]; then + WORKER_READY_TIME=$2 + else + TOTAL_TIME=$2 + fi + done + IFS=" " + METADATA="${METADATA}, \"duration\":\"${INSTALL_TIME}\"" + METADATA="${METADATA}, \"workers_ready\":\"$(($INSTALL_TIME + $WORKER_READY_TIME))\"" + METADATA="${METADATA} }" + else + METADATA=$(cat << EOF +{ +"uuid" : "${UUID}", +"mgmt_cluster_name": "$MGMT_CLUSTER_NAME", +"workers": "$COMPUTE_WORKERS_NUMBER", +"cluster_name": "${CLUSTER_NAME}", +"cluster_id": "$ROSA_CLUSTER_ID", +"network_type": "${NETWORK_TYPE}", +"version": 
"${CLUSTER_VERSION}", +"operation": "destroy", +"install_method": "rosa", +"duration": "$DURATION", +"timestamp": "$(date +%s%3N)" +} +EOF +) + fi + printf "Indexing installation timings to ES" + curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/hypershift-wrapper-timers/_doc -d "${METADATA}" -o /dev/null + + unset KUBECONFIG + return 0 +} + +index_mgmt_cluster_stat(){ + echo "Indexing Management cluster stat..." + cd /home/airflow/workspace + echo "Installing kube-burner" + _download_kubeconfig "$(ocm list clusters --no-headers --columns id ${MGMT_CLUSTER_NAME})" ./mgmt_kubeconfig + export KUBE_BURNER_RELEASE=${KUBE_BURNER_RELEASE:-1.5} + curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v${KUBE_BURNER_RELEASE}/kube-burner-${KUBE_BURNER_RELEASE}-Linux-x86_64.tar.gz -o kube-burner.tar.gz + sudo tar -xvzf kube-burner.tar.gz -C /usr/local/bin/ + git clone -q -b ${E2E_BENCHMARKING_BRANCH} ${E2E_BENCHMARKING_REPO} --depth=1 --single-branch + METRIC_PROFILE=/home/airflow/workspace/e2e-benchmarking/workloads/kube-burner-ocp-wrapper/metrics-profiles/mc-metrics.yml + envsubst < /home/airflow/workspace/e2e-benchmarking/workloads/kube-burner/workloads/managed-services/baseconfig.yml > baseconfig.yml + cat baseconfig.yml + HCP_NAMESPACE="$(_get_cluster_id ${CLUSTER_NAME})-$CLUSTER_NAME" + MC_PROMETHEUS=https://$(oc --kubeconfig=./mgmt_kubeconfig get route -n openshift-monitoring prometheus-k8s -o jsonpath="{.spec.host}") + MC_PROMETHEUS_TOKEN=$(oc --kubeconfig=./mgmt_kubeconfig sa new-token -n openshift-monitoring prometheus-k8s) + Q_NODES="" + for n in $(curl -H "Authorization: Bearer ${MC_PROMETHEUS_TOKEN}" -k --silent --globoff ${MC_PROMETHEUS}/api/v1/query?query='sum(kube_node_role{role!~"master|infra|workload|obo"})by(node)&time='$(date +"%s")'' | jq -r '.data.result[].metric.node'); + do + if [[ ${Q_NODES} == "" ]]; then Q_NODES=${n}; else Q_NODES=${Q_NODES}"|"${n}; fi + done + MGMT_WORKER_NODES=${Q_NODES} + echo "Exporting required vars" + cat << EOF +MC_PROMETHEUS: ${MC_PROMETHEUS} +MC_PROMETHEUS_TOKEN: +HCP_NAMESPACE: ${HCP_NAMESPACE} +MGMT_WORKER_NODES: ${MGMT_WORKER_NODES} +elapsed: "20m:" + +EOF + export MC_PROMETHEUS MC_PROMETHEUS_TOKEN HCP_NAMESPACE MGMT_WORKER_NODES elapsed + METADATA=$(cat << EOF +{ +"uuid":"${UUID}", +"timestamp": "$(date +%s%3N)", +"hostedClusterName": "${HC_INFRASTRUCTURE_NAME}", +"clusterName": "${HC_INFRASTRUCTURE_NAME}", +"mgmtClusterName": "${MGMT_CLUSTER_NAME}" +} +EOF +) + printf "Indexing metadata to ES" + curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/${ES_INDEX}/_doc -d "${METADATA}" -o /dev/null + + echo "Running kube-burner index.." + kube-burner index --uuid=${UUID} --prometheus-url=${MC_PROMETHEUS} --token ${MC_PROMETHEUS_TOKEN} --start=$START_TIME --end=$END_TIME --step 2m --metrics-profile ${METRIC_PROFILE} --config ./baseconfig.yml --log-level debug + echo "Finished indexing results" +} + +cleanup(){ + if [[ $INSTALL_METHOD == "osd" ]]; then + ocm delete cluster "$(_get_cluster_id ${CLUSTER_NAME})" + echo "Cluster is getting Uninstalled, deleting OSD access keys now.." 
+ aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true + else + export ROSA_CLUSTER_ID=$(_get_cluster_id ${CLUSTER_NAME}) + export HC_INFRASTRUCTURE_NAME=${ROSA_CLUSTER_ID} + CLEANUP_START_TIMING=$(date +%s) + export START_TIME=$CLEANUP_START_TIMING + rosa delete cluster -c ${ROSA_CLUSTER_ID} -y + rosa logs uninstall -c ${ROSA_CLUSTER_ID} --watch + if [ $AWS_AUTHENTICATION_METHOD == "sts" ] ; then + rosa delete operator-roles -c ${ROSA_CLUSTER_ID} -m auto --yes || true + rosa delete oidc-provider -c ${ROSA_CLUSTER_ID} -m auto --yes || true + fi + DURATION=$(($(date +%s) - $CLEANUP_START_TIMING)) + INDEXDATA+=("cleanup-${DURATION}") + export END_TIME=$(date +"%s") + _delete_aws_vpc + if [ -z $OIDC_CONFIG ]; then _oidc_config delete $OIDC_PREFIX; fi + fi + return 0 +} + +export INSTALL_METHOD=$(cat ${json_file} | jq -r .cluster_install_method) +export HC_INTERVAL=$(cat ${json_file} | jq -r .hcp_install_interval) +SKEW_FACTOR=$(echo $HOSTED_ID|awk -F- '{print$2}') +sleep $(($HC_INTERVAL*$SKEW_FACTOR)) # 60*1, 60*2.. +setup + +if [[ "$operation" == "install" ]]; then + printf "INFO: Checking if cluster is already installed" + CLUSTER_STATUS=$(_get_cluster_status ${CLUSTER_NAME}) + if [ -z "${CLUSTER_STATUS}" ] ; then + printf "INFO: Cluster not found, installing..." + echo "pre-clean AWS resources" + _delete_aws_vpc + install + export HC_INFRASTRUCTURE_NAME=$(_get_cluster_id ${CLUSTER_NAME}) + index_mgmt_cluster_stat "install-metrics" + + elif [ "${CLUSTER_STATUS}" == "ready" ] ; then + printf "INFO: Cluster ${CLUSTER_NAME} already installed and ready, reusing..." + postinstall + elif [ "${CLUSTER_STATUS}" == "error" ] ; then + printf "INFO: Cluster ${CLUSTER_NAME} errored, cleaning them now..." + cleanup + printf "INFO: Fail this install to re-try a fresh install" + exit 1 + else + printf "INFO: Cluster ${CLUSTER_NAME} already installed but not ready, exiting..." 
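+    # cluster is in a transient state (neither ready nor errored); fail without touching it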
+ exit 1 + fi + +elif [[ "$operation" == "cleanup" ]]; then + printf "Running Cleanup Steps" + _get_sc_mc_details + cleanup + index_metadata + index_mgmt_cluster_stat "destroy-metrics" + rosa logout + ocm logout +fi diff --git a/dags/openshift_nightlies/scripts/install/rosa.sh b/dags/openshift_nightlies/scripts/install/rosa.sh index 8e53dea91..998e8b0cb 100755 --- a/dags/openshift_nightlies/scripts/install/rosa.sh +++ b/dags/openshift_nightlies/scripts/install/rosa.sh @@ -15,11 +15,7 @@ do done _get_cluster_id(){ - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "$(ocm list clusters --no-headers --columns id $1)" - else - echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .id')" - fi + echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .id')" } _download_kubeconfig(){ @@ -27,11 +23,7 @@ _download_kubeconfig(){ } _get_cluster_status(){ - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "$(ocm list clusters --no-headers --columns state $1 | xargs)" - else - echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .status.state')" - fi + echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .status.state')" } _wait_for_nodes_ready(){ @@ -39,13 +31,8 @@ _wait_for_nodes_ready(){ export KUBECONFIG=./kubeconfig ALL_READY_ITERATIONS=0 ITERATIONS=0 - if [ $HCP == "true" ]; then - NODES_COUNT=$2 - ALL_READY_ITERATIONS=4 #reduced extra buffers for hosted cp clusters - else - # Node count is number of workers + 3 infra - NODES_COUNT=$(($2+3)) - fi + # Node count is number of workers + 3 infra + NODES_COUNT=$(($2+3)) # 30 seconds per node, waiting for all nodes ready to finalize while [ ${ITERATIONS} -le $((${NODES_COUNT}*5)) ] ; do NODES_READY_COUNT=$(oc get nodes -l $3 | grep " Ready " | wc -l) @@ -71,152 +58,6 @@ _wait_for_nodes_ready(){ exit 1 } -_aws_cmd(){ - ITR=0 - while [ $ITR -le 30 ]; do - if [[ "$(aws ec2 $1 2>&1)" == *"error"* ]]; then - echo "Failed to $1, retrying after 30 seconds" - ITR=$(($ITR+1)) - sleep 10 - else - return 0 - fi - done - echo "Failed to $1 after 10 minutes of multiple retries" - exit 1 -} - -_login_check(){ - echo "Trying to oc login with password" - ITR=1 - START_TIMER=$(date +%s) - while [ $ITR -le 100 ]; do - if [[ "$(oc login $1 --username cluster-admin --password $2 --insecure-skip-tls-verify=true --request-timeout=30s 2>&1)" == *"failed"* ]]; then - echo "Attempt $ITR: Failed to login $1, retrying after 5 seconds" - ITR=$(($ITR+1)) - sleep 5 - RECHECK=1 - else - if [[ $RECHECK -eq 10 ]]; then - CURRENT_TIMER=$(date +%s) - # Time since rosa cluster is ready until all nodes are ready - DURATION=$(($CURRENT_TIMER - $START_TIMER)) - INDEXDATA+=("cluster_admin_login-${DURATION}") - _adm_logic_check $1 $2 - return 0 - else - echo "Rechecking login for $((10-$RECHECK)) more times" - RECHECK=$(($RECHECK+1)) - sleep 1 - fi - fi - done - END_CLUSTER_STATUS="Ready. Not Access" - echo "Failed to login after 100 attempts with 5 sec interval" -} - -_adm_logic_check(){ - ITR=1 - START_TIMER=$(date +%s) - while [ $ITR -le 100 ]; do - oc login $1 --username cluster-admin --password $2 --insecure-skip-tls-verify=true --request-timeout=30s - CHECK=$(oc adm top images 2>&1 > /dev/null) - if [[ $? 
!= 0 ]]; then - echo "Attempt $ITR: Failed to login $1, retrying after 5 seconds" - ITR=$(($ITR+1)) - sleep 5 - else - CURRENT_TIMER=$(date +%s) - # Time since rosa cluster is ready until all nodes are ready - DURATION=$(($CURRENT_TIMER - $START_TIMER)) - INDEXDATA+=("cluster_oc_adm-${DURATION}") - return 0 - fi - done - END_CLUSTER_STATUS="Ready. Not Access" - echo "Failed to execute oc adm commands after 100 attempts with 5 sec interval" -} - -_balance_infra(){ - if [[ $1 == "prometheus-k8s" ]] ; then - echo "Initiate migration of prometheus componenets to infra nodepools" - oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s - oc get sts prometheus-k8s -n openshift-monitoring - echo "Restart stateful set pods" - oc rollout restart -n openshift-monitoring statefulset/prometheus-k8s - echo "Wait till they are completely restarted" - oc rollout status -n openshift-monitoring statefulset/prometheus-k8s - echo "Check pods status again and the hosting nodes" - oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s - else - echo "Initiate migration of ingress router-default pods to infra nodepools" - echo "Add toleration to use infra nodes" - oc patch ingresscontroller -n openshift-ingress-operator default --type merge --patch '{"spec":{"nodePlacement":{"nodeSelector":{"matchLabels":{"node-role.kubernetes.io/infra":""}},"tolerations":[{"effect":"NoSchedule","key":"node-role.kubernetes.io/infra","operator":"Exists"}]}}}' - echo "Wait till it gets rolled out" - sleep 60 - oc get pods -n openshift-ingress -o wide - fi -} - -_check_infra(){ - TRY=0 - while [ $TRY -le 3 ]; do # Attempts three times to migrate pods - FLAG_ERROR="" - _balance_infra $1 - for node in $(oc get pods -n $2 -o wide | grep -i $1 | grep -i running | awk '{print$7}'); - do - if [[ $(oc get nodes | grep infra | awk '{print$1}' | grep $node) != "" ]]; then - echo "$node is an infra node" - else - echo "$1 pod on $node is not an infra node, retrying" - FLAG_ERROR=true - fi - done - if [[ $FLAG_ERROR == "" ]]; then return 0; else TRY=$((TRY+1)); fi - done - echo "Failed to move $1 pods in $2 namespace" - exit 1 -} - -_wait_for_extra_nodes_ready(){ - export NODE_LABLES=$(cat ${json_file} | jq -r .extra_machinepool[].labels) - for label in $NODE_LABLES; - do - REPLICA=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.labels == '\"$label\"')'.replica) - NODES_COUNT=$((REPLICA*3)) - if [[ $label == *"infra"* ]] ; then NODES_COUNT=$((REPLICA*2)); fi - _wait_for_nodes_ready $CLUSTER_NAME $NODES_COUNT $label - if [[ $label == *"infra"* ]] ; then - _check_infra prometheus-k8s openshift-monitoring - _check_infra router openshift-ingress - fi - done - return 0 -} - -_add_machinepool(){ - export MACHINEPOOLS=$(cat ${json_file} | jq -r .extra_machinepool[].name) - for mcp in $MACHINEPOOLS; - do - echo "Add an extra machinepool - $mcp to cluster" - ZONES="a b c" - MC_NAME=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.name) - REPLICA=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.replica) - INS_TYPE=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.instance_type) - LABELS=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.labels) - TAINTS=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.taints) - if [[ $MC_NAME == *"infra"* ]]; then ZONES="a b"; fi - for ZONE in $ZONES; - do - if [[ $(rosa list machinepool --cluster 
"$(_get_cluster_id ${CLUSTER_NAME})" | grep $MC_NAME-$ZONE) == "" ]]; then - rosa create machinepool --cluster "$(_get_cluster_id ${CLUSTER_NAME})" --name $MC_NAME-$ZONE --instance-type ${INS_TYPE} --replicas $REPLICA --availability-zone $AWS_REGION$ZONE --labels $LABELS --taints $TAINTS - fi - done - done - _wait_for_extra_nodes_ready - return 0 -} - _wait_for_cluster_ready(){ START_TIMER=$(date +%s) echo "INFO: Installation starts at $(date -d @${START_TIMER})" @@ -235,12 +76,8 @@ _wait_for_cluster_ready(){ START_TIMER=${CURRENT_TIMER} echo "INFO: Cluster status changed to ${CLUSTER_STATUS}" if [ ${CLUSTER_STATUS} == "error" ] ; then - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "ERROR: Cluster $1 not installed after 1.5 hours.." - else - rosa logs install -c $1 - rosa describe cluster -c $1 - fi + rosa logs install -c $1 + rosa describe cluster -c $1 return 1 fi fi @@ -254,14 +91,9 @@ _wait_for_cluster_ready(){ # Time since rosa cluster is ready until all nodes are ready DURATION=$(($CURRENT_TIMER - $START_TIMER)) INDEXDATA+=("day2operations-${DURATION}") - if [ $HCP == "true" ]; then _add_machinepool $URL $PASSWORD; fi - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "INFO: Cluster and nodes on ready status.." - else - echo "INFO: Cluster and nodes on ready status at ${CURRENT_TIMER}, dumping installation logs..." - rosa logs install -c $1 - rosa describe cluster -c $1 - fi + echo "INFO: Cluster and nodes on ready status at ${CURRENT_TIMER}, dumping installation logs..." + rosa logs install -c $1 + rosa describe cluster -c $1 return 0 elif [ ${CLUSTER_STATUS} == "installing" ] ; then echo "INFO: ${ITERATIONS}/90. Cluster on ${CLUSTER_STATUS} status, waiting 60 seconds for next check" @@ -273,157 +105,12 @@ _wait_for_cluster_ready(){ fi PREVIOUS_STATUS=${CLUSTER_STATUS} done - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "ERROR: Cluster $1 not installed after 3 hours.." - else - END_CLUSTER_STATUS="Not Ready" - echo "ERROR: Cluster $1 not installed after 90 iterations, dumping installation logs..." 
- rosa logs install -c $1 - rosa describe cluster -c $1 - fi - exit 1 -} - -_create_aws_vpc(){ - - echo "Create Internet Gateway" - aws ec2 create-internet-gateway --tag-specifications ResourceType=internet-gateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=igw-$CLUSTER_NAME}]" --output json - export IGW=$(aws ec2 describe-internet-gateways --filters "Name=tag:Name,Values=igw-$CLUSTER_NAME" --output json | jq -r ".InternetGateways[0].InternetGatewayId") - - echo "Create VPC and attach internet gateway" - aws ec2 create-vpc --cidr-block 10.0.0.0/16 --tag-specifications ResourceType=vpc,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=vpc-$CLUSTER_NAME}]" --output json - export VPC=$(aws ec2 describe-vpcs --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r '.Vpcs[0].VpcId') - - aws ec2 modify-vpc-attribute --vpc-id $VPC --enable-dns-support "{\"Value\":true}" - aws ec2 modify-vpc-attribute --vpc-id $VPC --enable-dns-hostnames "{\"Value\":true}" - aws ec2 attach-internet-gateway --vpc-id $VPC --internet-gateway-id $IGW - - aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-rt-table-$CLUSTER_NAME}]" --output json - export PUB_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') - aws ec2 create-route --route-table-id $PUB_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $IGW - - ITR=0 - export ALL_PRI_RT_TB="" - for ZONE in a b c; - do - ITR=$((ITR+1)) - echo "Allocate Elastic IP" - aws ec2 allocate-address --tag-specifications ResourceType=elastic-ip,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=eip-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json - export E_IP=$(aws ec2 describe-addresses --filters "Name=tag:Name,Values=eip-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Addresses[0].AllocationId") - - echo "Create Subnets and Route tables" - aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.$ITR.0/24 --availability-zone $AWS_REGION$ZONE --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json - export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") - aws ec2 create-nat-gateway --subnet-id $PUB_SUB --allocation-id $E_IP --tag-specifications ResourceType=natgateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json - export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available" or .State == "pending")' | jq -r ".NatGatewayId") - echo "Wait until NatGateway $NGW is available" - aws ec2 wait nat-gateway-available --nat-gateway-ids $NGW - aws ec2 associate-route-table --route-table-id $PUB_RT_TB --subnet-id $PUB_SUB - - aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.$((ITR+10)).0/24 --availability-zone $AWS_REGION$ZONE --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json - export PRI_SUB=$(aws ec2 describe-subnets --filters 
"Name=tag:Name,Values=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") - aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json - export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].RouteTableId') - export ALL_PRI_RT_TB="${ALL_PRI_RT_TB} ${PRI_RT_TB}" - aws ec2 associate-route-table --route-table-id $PRI_RT_TB --subnet-id $PRI_SUB - aws ec2 create-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $NGW - done - - echo "Create private VPC endpoint to S3" - aws ec2 create-vpc-endpoint --vpc-id $VPC --service-name com.amazonaws.$AWS_REGION.s3 --route-table-ids $ALL_PRI_RT_TB --tag-specifications ResourceType=vpc-endpoint,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=vpce-$CLUSTER_NAME}]" -} - -_delete_aws_vpc(){ - echo "Delete Subnets, Routes, Gateways, VPC if exists" - export VPC=$(aws ec2 describe-vpcs --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r '.Vpcs[0].VpcId') - if [ $VPC != null ]; then - echo "Delete VPC Endpoint" - export VPCE=$(aws ec2 describe-vpc-endpoints --filters "Name=tag:Name,Values=vpce-$CLUSTER_NAME" --output json | jq -r '.VpcEndpoints[0].VpcEndpointId') - if [ $VPCE != null ]; then _aws_cmd "delete-vpc-endpoints --vpc-endpoint-ids $VPCE"; fi - - export ELB=$(aws elb describe-load-balancers --output json | jq -r '.LoadBalancerDescriptions[]'| jq -r 'select(.VPCId == '\"${VPC}\"')' | jq -r '.LoadBalancerName') - if [ $ELB != "" ]; then aws elb delete-load-balancer --load-balancer-name $ELB; fi - - for ZONE in a b c; - do - echo "Delete Subnets and Route tables" - export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].RouteTableId') - export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].Associations[0].RouteTableAssociationId') - export PRI_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") - - if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0"; fi - if [ $RT_TB_ASSO_ID != null ]; then _aws_cmd "disassociate-route-table --association-id $RT_TB_ASSO_ID"; fi - if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PRI_RT_TB"; fi - if [ $PRI_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PRI_SUB"; fi - - export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].Associations[].RouteTableAssociationId') - export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available")' | jq -r ".NatGatewayId") - export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") - export E_IP=$(aws ec2 describe-addresses 
--filters "Name=tag:Name,Values=eip-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Addresses[0].AllocationId") - - if [ $RT_TB_ASSO_ID != null ]; then for _id in $RT_TB_ASSO_ID; do _aws_cmd "disassociate-route-table --association-id $_id"; done; fi - if [ $NGW != null ]; then _aws_cmd "delete-nat-gateway --nat-gateway-id $NGW"; fi - if [ $PUB_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PUB_SUB"; fi - if [ $E_IP != null ]; then _aws_cmd "release-address --allocation-id $E_IP"; fi - done - - export PUB_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') - - if [ $PUB_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PUB_RT_TB --destination-cidr-block 0.0.0.0/0"; fi - if [ $PUB_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PUB_RT_TB"; fi - - export IGW=$(aws ec2 describe-internet-gateways --filters "Name=tag:Name,Values=igw-$CLUSTER_NAME" --output json | jq -r ".InternetGateways[0].InternetGatewayId") - if [ $IGW != null ]; then _aws_cmd "detach-internet-gateway --internet-gateway-id $IGW --vpc-id $VPC"; fi - if [ $IGW != null ]; then _aws_cmd "delete-internet-gateway --internet-gateway-id $IGW"; fi + END_CLUSTER_STATUS="Not Ready" + echo "ERROR: Cluster $1 not installed after 90 iterations, dumping installation logs..." + rosa logs install -c $1 + rosa describe cluster -c $1 - echo "Delete Security Group Rules" - for g in $(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC" --output json | jq -r ".SecurityGroups[].GroupId"); - do - for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == false)" | jq -r ".SecurityGroupRuleId"); - do - aws ec2 revoke-security-group-ingress --security-group-rule-ids $r --group-id $g - done - - for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == true)" | jq -r ".SecurityGroupRuleId"); - do - aws ec2 revoke-security-group-egress --security-group-rule-ids $r --group-id $g - done - done - - for g in $(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC" --output json | jq -r ".SecurityGroups[]" | jq -r 'select(.GroupName != "default")' | jq -r ".GroupId"); - do - echo "Delete Security Groups $g" - _aws_cmd "delete-security-group --group-id $g" - done - - echo "Delete VPC $VPC" - _aws_cmd "delete-vpc --vpc-id $VPC" - fi -} - -_oidc_config(){ - echo "${1} OIDC config, with prefix ${2}" - if [[ $1 == "create" ]]; then - echo "${1} OIDC config" - rosa create oidc-config --mode=auto --managed=false --prefix ${2} -y - export OIDC_CONFIG=$(rosa list oidc-config | grep ${2} | awk '{print$1}') - else - export OIDC_CONFIG=$(rosa list oidc-config | grep ${2} | awk '{print$1}') - if [ ! 
-z $OIDC_CONFIG ]; then rosa delete oidc-config --mode=auto --oidc-config-id ${OIDC_CONFIG} -y || true; fi # forcing exit 0, as this command may file if it is a shared oidc config - fi -} - -_get_sc_mc_details(){ - if [ -z $SVC_CLUSTER_NAME ]; then - echo "Find Service Cluster" - export SVC_CLUSTER_NAME=$(ocm describe cluster ${CLUSTER_NAME} | grep "Service Cluster" | awk '{print$3}') - fi - if [ -z $MGMT_CLUSTER_NAME ]; then - export MGMT_CLUSTER_NAME=$(ocm describe cluster ${CLUSTER_NAME} | grep "Management Cluster" | awk '{print$3}') - fi - echo "Read Management cluster details" - export MGMT_CLUSTER_DETAILS=$(ocm get /api/clusters_mgmt/v1/clusters | jq -r ".items[]" | jq -r 'select(.name == '\"$MGMT_CLUSTER_NAME\"')') - export NUMBER_OF_HC=$(cat ${json_file} | jq -r .number_of_hostedcluster) + exit 1 } setup(){ @@ -444,20 +131,7 @@ setup(){ export COMPUTE_WORKERS_NUMBER=$(cat ${json_file} | jq -r .openshift_worker_count) export NETWORK_TYPE=$(cat ${json_file} | jq -r .openshift_network_type) export ES_SERVER=$(cat ${json_file} | jq -r .es_server) - export HCP=$(cat ${json_file} | jq -r .rosa_hcp) export UUID=$(uuidgen) - if [ $HCP == "true" ]; then - export STAGE_CONFIG="" - export MGMT_CLUSTER_NAME=$(cat ${json_file} | jq -r .staging_mgmt_cluster_name) - export SVC_CLUSTER_NAME=$(cat ${json_file} | jq -r .staging_svc_cluster_name) - export STAGE_PROV_SHARD=$(cat ${json_file} | jq -r .staging_mgmt_provisioner_shards) - export OIDC_PREFIX=$(cat ${json_file} | jq -r .openshift_cluster_name) - export CLUSTER_NAME="${CLUSTER_NAME}-${HOSTED_ID}" # perf-as3-hcp-1, perf-as3-hcp-2.. - export KUBECONFIG_NAME=$(echo $KUBECONFIG_NAME | awk -F-kubeconfig '{print$1}')-$HOSTED_ID-kubeconfig - export KUBEADMIN_NAME=$(echo $KUBEADMIN_NAME | awk -F-kubeadmin '{print$1}')-$HOSTED_ID-kubeadmin - UUID=$(echo $AIRFLOW_CTX_DAG_RUN_ID | base64 | cut -c 1-32 ) - export UUID=${UUID} - fi export OCM_CLI_VERSION=$(cat ${json_file} | jq -r .ocm_cli_version) if [[ ${OCM_CLI_VERSION} != "container" ]]; then OCM_CLI_FORK=$(cat ${json_file} | jq -r .ocm_cli_fork) @@ -467,98 +141,42 @@ setup(){ sudo mv ocm /usr/local/bin/ popd fi - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "Clean-up existing OSD access keys.." - AWS_KEY=$(aws iam list-access-keys --user-name OsdCcsAdmin --output text --query 'AccessKeyMetadata[*].AccessKeyId') - LEN_AWS_KEY=`echo $AWS_KEY | wc -w` - if [[ ${LEN_AWS_KEY} -eq 2 ]]; then - aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id `printf ${AWS_KEY[0]}` - fi - echo "Create new OSD access key.." - export ADMIN_KEY=$(aws iam create-access-key --user-name OsdCcsAdmin) - export AWS_ACCESS_KEY_ID=$(echo $ADMIN_KEY | jq -r '.AccessKey.AccessKeyId') - export AWS_SECRET_ACCESS_KEY=$(echo $ADMIN_KEY | jq -r '.AccessKey.SecretAccessKey') - ocm login --url=https://api.stage.openshift.com --token="${ROSA_TOKEN}" - ocm whoami - sleep 60 # it takes a few sec for new access key - echo "Check AWS Username..." 
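
# Aside: a minimal sketch of the OIDC-config reuse pattern from _oidc_config above,
# assuming the same prefix convention; the config is created only when the prefix
# lookup comes back empty, and deletion is allowed to fail for shared configs.
OIDC_PREFIX="perf-hcp"                                   # hypothetical prefix
OIDC_CONFIG=$(rosa list oidc-config | grep "${OIDC_PREFIX}" | awk '{print $1}')
if [ -z "${OIDC_CONFIG}" ]; then
  rosa create oidc-config --mode=auto --managed=false --prefix "${OIDC_PREFIX}" -y
fi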
- aws iam get-user | jq -r .User.UserName + export ROSA_CLI_VERSION=$(cat ${json_file} | jq -r .rosa_cli_version) + if [[ ${ROSA_CLI_VERSION} != "container" ]]; then + ROSA_CLI_FORK=$(cat ${json_file} | jq -r .rosa_cli_fork) + git clone -q --depth=1 --single-branch --branch ${ROSA_CLI_VERSION} ${ROSA_CLI_FORK} + pushd rosa + make + sudo mv rosa /usr/local/bin/ + popd + fi + ocm login --url=https://api.stage.openshift.com --token="${ROSA_TOKEN}" + ocm whoami + rosa login --env=${ROSA_ENVIRONMENT} + rosa whoami + rosa verify quota + rosa verify permissions + if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -1) + elif [ "${MANAGED_OCP_VERSION}" == "prelatest" ] ; then + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -2 | tail -1) else - export ROSA_CLI_VERSION=$(cat ${json_file} | jq -r .rosa_cli_version) - if [[ ${ROSA_CLI_VERSION} != "container" ]]; then - ROSA_CLI_FORK=$(cat ${json_file} | jq -r .rosa_cli_fork) - git clone -q --depth=1 --single-branch --branch ${ROSA_CLI_VERSION} ${ROSA_CLI_FORK} - pushd rosa - make - sudo mv rosa /usr/local/bin/ - popd - fi - ocm login --url=https://api.stage.openshift.com --token="${ROSA_TOKEN}" - ocm whoami - rosa login --env=${ROSA_ENVIRONMENT} - rosa whoami - rosa verify quota - rosa verify permissions - if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then - export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -1) - elif [ "${MANAGED_OCP_VERSION}" == "prelatest" ] ; then - export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -2 | tail -1) - else - export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | grep ^${MANAGED_OCP_VERSION}$) - fi - [ -z "${ROSA_VERSION}" ] && echo "ERROR: Image not found for version (${version}) on ROSA ${MANAGED_CHANNEL_GROUP} channel group" && exit 1 - return 0 + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | grep ^${MANAGED_OCP_VERSION}$) fi + [ -z "${ROSA_VERSION}" ] && echo "ERROR: Image not found for version (${version}) on ROSA ${MANAGED_CHANNEL_GROUP} channel group" && exit 1 + return 0 } install(){ export COMPUTE_WORKERS_TYPE=$(cat ${json_file} | jq -r .openshift_worker_instance_type) export CLUSTER_AUTOSCALE=$(cat ${json_file} | jq -r .cluster_autoscale) export OIDC_CONFIG=$(cat ${json_file} | jq -r .oidc_config) - if [[ $INSTALL_METHOD == "osd" ]]; then - if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then - export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${version} | sort -rV | head -1) - elif [ "${MANAGED_OCP_VERSION}" == "prelatest" ] ; then - export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${version} | sort -rV | head -2 | tail -1) - else - export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${MANAGED_OCP_VERSION}) - fi - [ -z 
${OCM_VERSION} ] && echo "ERROR: Image not found for version (${version}) on OCM ${MANAGED_CHANNEL_GROUP} channel group" && exit 1 - if [[ $CLUSTER_AUTOSCALE == "true" ]]; then - export MIN_COMPUTE_WORKERS_NUMBER=$(cat ${json_file} | jq -r .min_openshift_worker_count) - export CLUSTER_SIZE="--enable-autoscaling --min-replicas ${MIN_COMPUTE_WORKERS_NUMBER} --max-replicas ${COMPUTE_WORKERS_NUMBER}" - else - export CLUSTER_SIZE="--compute-nodes ${COMPUTE_WORKERS_NUMBER}" - fi - ocm create cluster --ccs --provider aws --region ${AWS_REGION} --aws-account-id ${AWS_ACCOUNT_ID} --aws-access-key-id ${AWS_ACCESS_KEY_ID} --aws-secret-access-key ${AWS_SECRET_ACCESS_KEY} --channel-group ${MANAGED_CHANNEL_GROUP} --version ${OCM_VERSION} --multi-az --compute-machine-type ${COMPUTE_WORKERS_TYPE} --network-type ${NETWORK_TYPE} ${CLUSTER_NAME} ${CLUSTER_SIZE} - else - export INSTALLATION_PARAMS="" - export ROSA_HCP_PARAMS="" - if [ $AWS_AUTHENTICATION_METHOD == "sts" ] ; then - INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --sts -m auto --yes" - fi - if [ $HCP == "true" ]; then - _create_aws_vpc - echo "Set start time of prom scrape" - export START_TIME=$(date +"%s") - if [ $STAGE_PROV_SHARD != "" ]; then - STAGE_CONFIG="--properties provision_shard_id:${STAGE_PROV_SHARD}" - fi - ALL_SUBNETS=$(aws ec2 describe-subnets --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r ".Subnets[].SubnetId") - SUBNETS_IDS="" - for _ID in ${ALL_SUBNETS}; - do - if [[ ${SUBNETS_IDS} == "" ]]; then SUBNETS_IDS=${_ID}; else SUBNETS_IDS=${SUBNETS_IDS}","${_ID}; fi - done - ROSA_HCP_PARAMS="--hosted-cp ${STAGE_CONFIG} --subnet-ids ${SUBNETS_IDS} --machine-cidr 10.0.0.0/16" - export OIDC_CONFIG=$(rosa list oidc-config | grep $OIDC_PREFIX | awk '{print$1}') - if [ -z $OIDC_CONFIG ]; then _oidc_config create $OIDC_PREFIX; fi - ROSA_HCP_PARAMS="${ROSA_HCP_PARAMS} --oidc-config-id ${OIDC_CONFIG}" - else - INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --multi-az" # Multi AZ is default on hosted-cp cluster - fi - rosa create cluster --tags=User:${GITHUB_USERNAME} --cluster-name ${CLUSTER_NAME} --version "${ROSA_VERSION}" --channel-group=${MANAGED_CHANNEL_GROUP} --compute-machine-type ${COMPUTE_WORKERS_TYPE} --replicas ${COMPUTE_WORKERS_NUMBER} --network-type ${NETWORK_TYPE} ${INSTALLATION_PARAMS} ${ROSA_HCP_PARAMS} + export INSTALLATION_PARAMS="" + if [ $AWS_AUTHENTICATION_METHOD == "sts" ] ; then + INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --sts -m auto --yes" fi + INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --multi-az" # Multi AZ is default on hosted-cp cluster + rosa create cluster --tags=User:${GITHUB_USERNAME} --cluster-name ${CLUSTER_NAME} --version "${ROSA_VERSION}" --channel-group=${MANAGED_CHANNEL_GROUP} --compute-machine-type ${COMPUTE_WORKERS_TYPE} --replicas ${COMPUTE_WORKERS_NUMBER} --network-type ${NETWORK_TYPE} ${INSTALLATION_PARAMS} postinstall return 0 } @@ -569,42 +187,19 @@ postinstall(){ sleep 120 export EXPIRATION_TIME=$(cat ${json_file} | jq -r .rosa_expiration_time) _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig - if [ $HCP == "true" ]; then - _get_sc_mc_details - echo "Index Managment cluster info" - index_metadata "management" - _download_kubeconfig "$(ocm list clusters --no-headers --columns id ${MGMT_CLUSTER_NAME})" ./mgmt_kubeconfig - kubectl delete secret staging-mgmt-cluster-kubeconfig || true - kubectl create secret generic staging-mgmt-cluster-kubeconfig --from-file=config=./mgmt_kubeconfig - fi unset KUBECONFIG kubectl delete secret ${KUBECONFIG_NAME} 
|| true kubectl create secret generic ${KUBECONFIG_NAME} --from-file=config=./kubeconfig - if [[ $INSTALL_METHOD == "osd" ]]; then - export PASSWORD=$(echo ${CLUSTER_NAME} | md5sum | awk '{print $1}') - ocm create idp -n localauth -t htpasswd --username kubeadmin --password ${PASSWORD} -c ${CLUSTER_NAME} - ocm create user kubeadmin -c "$(_get_cluster_id ${CLUSTER_NAME})" --group=cluster-admins - # set expiration time - EXPIRATION_STRING=$(date -d "${EXPIRATION_TIME} minutes" '+{"expiration_timestamp": "%FT%TZ"}') - ocm patch /api/clusters_mgmt/v1/clusters/"$(_get_cluster_id ${CLUSTER_NAME})" <<< ${EXPIRATION_STRING} - echo "Cluster is ready, deleting OSD access keys now.." - aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true - kubectl delete secret ${KUBEADMIN_NAME} || true - kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} - else - URL=$(rosa describe cluster -c $CLUSTER_NAME --output json | jq -r ".api.url") - START_TIMER=$(date +%s) - PASSWORD=$(rosa create admin -c "$(_get_cluster_id ${CLUSTER_NAME})" -y 2>/dev/null | grep "oc login" | awk '{print $7}') - CURRENT_TIMER=$(date +%s) - DURATION=$(($CURRENT_TIMER - $START_TIMER)) - INDEXDATA+=("cluster_admin_create-${DURATION}") - kubectl delete secret ${KUBEADMIN_NAME} || true - kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} - if [ $HCP == "true" ]; then _login_check $URL $PASSWORD; fi - # set expiration to 24h - rosa edit cluster -c "$(_get_cluster_id ${CLUSTER_NAME})" --expiration=${EXPIRATION_TIME}m - fi - if [ $HCP == "true" ]; then index_metadata "cluster-install"; fi + URL=$(rosa describe cluster -c $CLUSTER_NAME --output json | jq -r ".api.url") + START_TIMER=$(date +%s) + PASSWORD=$(rosa create admin -c "$(_get_cluster_id ${CLUSTER_NAME})" -y 2>/dev/null | grep "oc login" | awk '{print $7}') + CURRENT_TIMER=$(date +%s) + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("cluster_admin_create-${DURATION}") + kubectl delete secret ${KUBEADMIN_NAME} || true + kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} + # set expiration to 24h + rosa edit cluster -c "$(_get_cluster_id ${CLUSTER_NAME})" --expiration=${EXPIRATION_TIME}m return 0 } @@ -613,90 +208,10 @@ index_metadata(){ _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig export KUBECONFIG=./kubeconfig fi - if [[ $INSTALL_METHOD == "osd" ]]; then - export PLATFORM="AWS-MS" - export CLUSTER_VERSION="${OCM_VERSION}" - else - export PLATFORM="ROSA" - export CLUSTER_VERSION="${ROSA_VERSION}" - fi - if [ $HCP == "true" ]; then - if [ "$1" == "management" ]; then - METADATA=$(cat << EOF -{ -"uuid" : "${UUID}", -"aws_authentication_method": "${AWS_AUTHENTICATION_METHOD}", -"version": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".openshift_version")", -"infra_id": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".infra_id")", -"cluster_name": "$MGMT_CLUSTER_NAME", -"cluster_id": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".id")", -"base_domain": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".dns.base_domain")", -"aws_region": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".region.id")", -"workers": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".nodes.autoscale_compute.max_replicas")", -"workers_type": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".nodes.compute_machine_type.id")", -"network_type": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".network.type")", -"install_method": "rosa", -"provision_shard": "$STAGE_PROV_SHARD", 
-"hostedclusters": "$NUMBER_OF_HC" -} -EOF -) - elif [ "$1" == "cluster-install" ]; then - METADATA=$(cat << EOF -{ -"uuid" : "${UUID}", -"aws_authentication_method": "${AWS_AUTHENTICATION_METHOD}", -"mgmt_cluster_name": "$MGMT_CLUSTER_NAME", -"workers": "$COMPUTE_WORKERS_NUMBER", -"cluster_name": "${CLUSTER_NAME}", -"cluster_id": "$(_get_cluster_id ${CLUSTER_NAME})", -"network_type": "${NETWORK_TYPE}", -"version": "${CLUSTER_VERSION}", -"operation": "install", -"install_method": "rosa", -"status": "$END_CLUSTER_STATUS", -"timestamp": "$(date +%s%3N)" -EOF -) - INSTALL_TIME=0 - TOTAL_TIME=0 - WORKER_READY_TIME=0 - for i in "${INDEXDATA[@]}" ; do IFS="-" ; set -- $i - METADATA="${METADATA}, \"$1\":\"$2\"" - if [ $1 != "day2operations" ] && [ $1 != "login" ] ; then - INSTALL_TIME=$((${INSTALL_TIME} + $2)) - elif [ $1 == "day2operations" ]; then - WORKER_READY_TIME=$2 - else - TOTAL_TIME=$2 - fi - done - IFS=" " - METADATA="${METADATA}, \"duration\":\"${INSTALL_TIME}\"" - METADATA="${METADATA}, \"workers_ready\":\"$(($INSTALL_TIME + $WORKER_READY_TIME))\"" - METADATA="${METADATA} }" - else - METADATA=$(cat << EOF -{ -"uuid" : "${UUID}", -"mgmt_cluster_name": "$MGMT_CLUSTER_NAME", -"workers": "$COMPUTE_WORKERS_NUMBER", -"cluster_name": "${CLUSTER_NAME}", -"cluster_id": "$ROSA_CLUSTER_ID", -"network_type": "${NETWORK_TYPE}", -"version": "${CLUSTER_VERSION}", -"operation": "destroy", -"install_method": "rosa", -"duration": "$DURATION", -"timestamp": "$(date +%s%3N)" -} -EOF -) - fi - printf "Indexing installation timings to ES" - curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/hypershift-wrapper-timers/_doc -d "${METADATA}" -o /dev/null - else - METADATA=$(cat << EOF + export PLATFORM="ROSA" + export CLUSTER_VERSION="${ROSA_VERSION}" + + METADATA=$(cat << EOF { "uuid" : "${UUID}", "platform": "${PLATFORM}", @@ -713,7 +228,6 @@ EOF "timestamp": "$(date +%s%3N)" EOF ) - INSTALL_TIME=0 TOTAL_TIME=0 for i in "${INDEXDATA[@]}" ; do IFS="-" ; set -- $i @@ -731,91 +245,27 @@ EOF METADATA="${METADATA} }" printf "Indexing installation timings to ES" curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/managedservices-timings/_doc -d "${METADATA}" -o /dev/null - fi unset KUBECONFIG return 0 } -index_mgmt_cluster_stat(){ - echo "Indexing Management cluster stat..." 
- cd /home/airflow/workspace - echo "Installing kube-burner" - _download_kubeconfig "$(ocm list clusters --no-headers --columns id ${MGMT_CLUSTER_NAME})" ./mgmt_kubeconfig - export KUBE_BURNER_RELEASE=${KUBE_BURNER_RELEASE:-1.5} - curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v${KUBE_BURNER_RELEASE}/kube-burner-${KUBE_BURNER_RELEASE}-Linux-x86_64.tar.gz -o kube-burner.tar.gz - sudo tar -xvzf kube-burner.tar.gz -C /usr/local/bin/ - git clone -q -b ${E2E_BENCHMARKING_BRANCH} ${E2E_BENCHMARKING_REPO} --depth=1 --single-branch - METRIC_PROFILE=/home/airflow/workspace/e2e-benchmarking/workloads/kube-burner-ocp-wrapper/metrics-profiles/mc-metrics.yml - envsubst < /home/airflow/workspace/e2e-benchmarking/workloads/kube-burner/workloads/managed-services/baseconfig.yml > baseconfig.yml - cat baseconfig.yml - HCP_NAMESPACE="$(_get_cluster_id ${CLUSTER_NAME})-$CLUSTER_NAME" - MC_PROMETHEUS=https://$(oc --kubeconfig=./mgmt_kubeconfig get route -n openshift-monitoring prometheus-k8s -o jsonpath="{.spec.host}") - MC_PROMETHEUS_TOKEN=$(oc --kubeconfig=./mgmt_kubeconfig sa new-token -n openshift-monitoring prometheus-k8s) - Q_NODES="" - for n in $(curl -H "Authorization: Bearer ${MC_PROMETHEUS_TOKEN}" -k --silent --globoff ${MC_PROMETHEUS}/api/v1/query?query='sum(kube_node_role{role!~"master|infra|workload|obo"})by(node)&time='$(date +"%s")'' | jq -r '.data.result[].metric.node'); - do - if [[ ${Q_NODES} == "" ]]; then Q_NODES=${n}; else Q_NODES=${Q_NODES}"|"${n}; fi - done - MGMT_WORKER_NODES=${Q_NODES} - echo "Exporting required vars" - cat << EOF -MC_PROMETHEUS: ${MC_PROMETHEUS} -MC_PROMETHEUS_TOKEN: -HCP_NAMESPACE: ${HCP_NAMESPACE} -MGMT_WORKER_NODES: ${MGMT_WORKER_NODES} -elapsed: "20m:" - -EOF - export MC_PROMETHEUS MC_PROMETHEUS_TOKEN HCP_NAMESPACE MGMT_WORKER_NODES elapsed - METADATA=$(cat << EOF -{ -"uuid":"${UUID}", -"timestamp": "$(date +%s%3N)", -"hostedClusterName": "${HC_INFRASTRUCTURE_NAME}", -"clusterName": "${HC_INFRASTRUCTURE_NAME}", -"mgmtClusterName": "${MGMT_CLUSTER_NAME}" -} -EOF -) - printf "Indexing metadata to ES" - curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/${ES_INDEX}/_doc -d "${METADATA}" -o /dev/null - - echo "Running kube-burner index.." - kube-burner index --uuid=${UUID} --prometheus-url=${MC_PROMETHEUS} --token ${MC_PROMETHEUS_TOKEN} --start=$START_TIME --end=$END_TIME --step 2m --metrics-profile ${METRIC_PROFILE} --config ./baseconfig.yml --log-level debug - echo "Finished indexing results" -} - cleanup(){ - if [[ $INSTALL_METHOD == "osd" ]]; then - ocm delete cluster "$(_get_cluster_id ${CLUSTER_NAME})" - echo "Cluster is getting Uninstalled, deleting OSD access keys now.." 
- aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true - else - export ROSA_CLUSTER_ID=$(_get_cluster_id ${CLUSTER_NAME}) - export HC_INFRASTRUCTURE_NAME=${ROSA_CLUSTER_ID} - CLEANUP_START_TIMING=$(date +%s) - export START_TIME=$CLEANUP_START_TIMING - rosa delete cluster -c ${ROSA_CLUSTER_ID} -y - rosa logs uninstall -c ${ROSA_CLUSTER_ID} --watch - if [ $AWS_AUTHENTICATION_METHOD == "sts" ] ; then - rosa delete operator-roles -c ${ROSA_CLUSTER_ID} -m auto --yes || true - rosa delete oidc-provider -c ${ROSA_CLUSTER_ID} -m auto --yes || true - fi - DURATION=$(($(date +%s) - $CLEANUP_START_TIMING)) - INDEXDATA+=("cleanup-${DURATION}") - export END_TIME=$(date +"%s") - if [ $HCP == "true" ]; then - _delete_aws_vpc - if [ -z $OIDC_CONFIG ]; then _oidc_config delete $OIDC_PREFIX; fi - fi + export ROSA_CLUSTER_ID=$(_get_cluster_id ${CLUSTER_NAME}) + export HC_INFRASTRUCTURE_NAME=${ROSA_CLUSTER_ID} + CLEANUP_START_TIMING=$(date +%s) + export START_TIME=$CLEANUP_START_TIMING + rosa delete cluster -c ${ROSA_CLUSTER_ID} -y + rosa logs uninstall -c ${ROSA_CLUSTER_ID} --watch + if [ $AWS_AUTHENTICATION_METHOD == "sts" ] ; then + rosa delete operator-roles -c ${ROSA_CLUSTER_ID} -m auto --yes || true + rosa delete oidc-provider -c ${ROSA_CLUSTER_ID} -m auto --yes || true fi + DURATION=$(($(date +%s) - $CLEANUP_START_TIMING)) + INDEXDATA+=("cleanup-${DURATION}") + export END_TIME=$(date +"%s") return 0 } -export INSTALL_METHOD=$(cat ${json_file} | jq -r .cluster_install_method) -export HC_INTERVAL=$(cat ${json_file} | jq -r .hcp_install_interval) -SKEW_FACTOR=$(echo $HOSTED_ID|awk -F- '{print$2}') -sleep $(($HC_INTERVAL*$SKEW_FACTOR)) # 60*1, 60*2.. setup if [[ "$operation" == "install" ]]; then @@ -823,16 +273,8 @@ if [[ "$operation" == "install" ]]; then CLUSTER_STATUS=$(_get_cluster_status ${CLUSTER_NAME}) if [ -z "${CLUSTER_STATUS}" ] ; then printf "INFO: Cluster not found, installing..." - if [ $HCP == "true" ]; then - echo "pre-clean AWS resources" - _delete_aws_vpc - install - export HC_INFRASTRUCTURE_NAME=$(_get_cluster_id ${CLUSTER_NAME}) - index_mgmt_cluster_stat "install-metrics" - else - install - index_metadata - fi + install + index_metadata elif [ "${CLUSTER_STATUS}" == "ready" ] ; then printf "INFO: Cluster ${CLUSTER_NAME} already installed and ready, reusing..." 
postinstall @@ -848,10 +290,8 @@ if [[ "$operation" == "install" ]]; then elif [[ "$operation" == "cleanup" ]]; then printf "Running Cleanup Steps" - if [ $HCP == "true" ]; then _get_sc_mc_details; fi cleanup index_metadata - if [ $HCP == "true" ]; then index_mgmt_cluster_stat "destroy-metrics"; fi rosa logout ocm logout fi diff --git a/dags/openshift_nightlies/tasks/benchmarks/e2e.py b/dags/openshift_nightlies/tasks/benchmarks/e2e.py index b3b86e3c0..7dbaa5289 100644 --- a/dags/openshift_nightlies/tasks/benchmarks/e2e.py +++ b/dags/openshift_nightlies/tasks/benchmarks/e2e.py @@ -85,15 +85,33 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease, task_group } self.install_vars = var_loader.build_task_vars( release, task="install") - if self.install_vars['rosa_hcp'] == "true": - cluster_name = release._generate_cluster_name() - self.env = { - **self.env, - "MGMT_CLUSTER_NAME": f"{self.install_vars['staging_mgmt_cluster_name']}.*", - "SVC_CLUSTER_NAME": f"{self.install_vars['staging_svc_cluster_name']}.*", - "MGMT_KUBECONFIG_SECRET": "staging-mgmt-cluster-kubeconfig", - **self._insert_kube_env() - } + + if self.release.platform == "rosahcp": + self.rosa_creds = var_loader.get_secret("rosa_creds", deserialize_json=True) + self.aws_creds = var_loader.get_secret("aws_creds", deserialize_json=True) + self.ocm_creds = var_loader.get_secret("ocm_creds", deserialize_json=True) + self.environment = self.vars["environment"] if "environment" in self.vars else "staging" + self.env = { + **self.env, + "ROSA_CLUSTER_NAME": release._generate_cluster_name(), + "ROSA_ENVIRONMENT": self.environment, + "ROSA_TOKEN": self.rosa_creds['rosa_token_'+self.environment], + "AWS_ACCESS_KEY_ID": self.aws_creds['aws_access_key_id'], + "AWS_SECRET_ACCESS_KEY": self.aws_creds['aws_secret_access_key'], + "AWS_DEFAULT_REGION": self.aws_creds['aws_region_for_openshift'], + "AWS_ACCOUNT_ID": self.aws_creds['aws_account_id'], + "OCM_TOKEN": self.ocm_creds['ocm_token'] + } + self.install_vars = var_loader.build_task_vars( + release, task="install") + cluster_name = release._generate_cluster_name() + self.env = { + **self.env, + "MGMT_CLUSTER_NAME": f"{self.install_vars['staging_mgmt_cluster_name']}.*", + "SVC_CLUSTER_NAME": f"{self.install_vars['staging_svc_cluster_name']}.*", + "MGMT_KUBECONFIG_SECRET": "staging-mgmt-cluster-kubeconfig", + **self._insert_kube_env() + } if self.release.platform == "hypershift": mgmt_cluster_name = release._generate_cluster_name() diff --git a/dags/openshift_nightlies/tasks/install/rosa/defaults.json b/dags/openshift_nightlies/tasks/install/rosa/defaults.json index c3ee1decc..29f67d108 100644 --- a/dags/openshift_nightlies/tasks/install/rosa/defaults.json +++ b/dags/openshift_nightlies/tasks/install/rosa/defaults.json @@ -36,10 +36,5 @@ "ocm_cli_fork": "https://github.com/openshift-online/ocm-cli", "ocm_cli_version": "container", "rosa_hcp": "false", - "staging_mgmt_cluster_name": "", - "staging_svc_cluster_name": "", - "staging_mgmt_provisioner_shards": "", - "aws_region": "us-west-2", - "oidc_config": "", - "extra_machinepool": [] + "aws_region": "us-west-2" } diff --git a/dags/openshift_nightlies/tasks/install/rosa/rosa.py b/dags/openshift_nightlies/tasks/install/rosa/rosa.py index 9f277ae60..e5b2b9e02 100644 --- a/dags/openshift_nightlies/tasks/install/rosa/rosa.py +++ b/dags/openshift_nightlies/tasks/install/rosa/rosa.py @@ -25,17 +25,6 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease): self.exec_config = 
executor.get_default_executor_config(self.dag_config, executor_image="airflow-managed-services") self.rosa_postinstall_setup = rosa_post_install.Diagnosis(dag, config, release) - def get_type(self): - if self.config['rosa_hcp'] == "true": - return "rosa_hcp" - else: - return "rosa" - - def get_install_hcp_task(self): - for iteration in range(self.config['number_of_hostedcluster']): - c_id = f"{'hcp-'+str(iteration+1)}" # adding 1 to name the cluster hcp-1, hcp-2.. - yield c_id, self._get_task(operation="install", id=c_id), self.rosa_postinstall_setup._get_rosa_postinstallation(id=c_id), self._get_task(operation="cleanup", id=c_id) - # Create Airflow Task for Install/Cleanup steps def _get_task(self, operation="install", id="", trigger_rule="all_success"): self._setup_task(operation=operation) @@ -47,7 +36,7 @@ def _get_task(self, operation="install", id="", trigger_rule="all_success"): "PROM_URL": var_loader.get_secret("thanos_querier_url"), **self.env } - env = {**self.env, **{"HOSTED_ID": id}} + env = {**self.env} command=f"{constants.root_dag_dir}/scripts/install/rosa.sh -v {self.release.version} -j /tmp/{self.release_name}-{operation}-task.json -o {operation}" return BashOperator( diff --git a/dags/openshift_nightlies/tasks/install/rosahcp/__init__.py b/dags/openshift_nightlies/tasks/install/rosahcp/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dags/openshift_nightlies/tasks/install/rosahcp/defaults.json b/dags/openshift_nightlies/tasks/install/rosahcp/defaults.json new file mode 100644 index 000000000..b44d4992f --- /dev/null +++ b/dags/openshift_nightlies/tasks/install/rosahcp/defaults.json @@ -0,0 +1,45 @@ +{ + "openshift_cluster_name": "", + "openshift_install_ssh_pub_key_file": "/home/airflow/workspace/perf-dept/ssh_keys/id_rsa_pbench_ec2.pub", + "openshift_cidr": "10.128.0.0/10", + "openshift_machine_cidr": "10.0.0.0/16", + "openshift_service_network": "172.30.0.0/16", + "openshift_host_prefix": "22", + "openshift_network_type": "", + "openshift_toggle_workload_node": true, + "kubeconfig_path": "", + "watch_nodes": true, + "watch_cluster_operators": true, + "watch_namepsaces": [ + "openshift-etcd", + "openshift-apiserver", + "openshift-kube-apiserver", + "openshift-monitoring", + "openshift-kube-controller", + "openshift-machine-api", + "openshift-kube-scheduler", + "openshift-ingress", + "openshift-sdn" + ], + "inspect_components": false, + "slack_integration": false, + "slack_api_token": "", + "slack_channel": "", + "watcher_slack_id": "{Monday: , Tuesday: , Wednesday: , Thursday: , Friday: , Saturday: , Sunday: }", + "slack_team_alias": "", + "iterations": 5, + "sleep_time": 30, + "daemon_mode": true, + "fips": false, + "rosa_expiration_time": "2880", + "rosa_cli_fork": "https://github.com/openshift/rosa", + "ocm_cli_fork": "https://github.com/openshift-online/ocm-cli", + "ocm_cli_version": "container", + "rosa_hcp": "true", + "staging_mgmt_cluster_name": "", + "staging_svc_cluster_name": "", + "staging_mgmt_provisioner_shards": "", + "aws_region": "us-west-2", + "oidc_config": "", + "extra_machinepool": [] +} diff --git a/dags/openshift_nightlies/tasks/install/rosahcp/rosahcp.py b/dags/openshift_nightlies/tasks/install/rosahcp/rosahcp.py new file mode 100644 index 000000000..3cb98ba41 --- /dev/null +++ b/dags/openshift_nightlies/tasks/install/rosahcp/rosahcp.py @@ -0,0 +1,63 @@ +import sys +from os.path import abspath, dirname +from os import environ + +from openshift_nightlies.util import var_loader, kubeconfig, constants, executor +from 
openshift_nightlies.tasks.install.openshift import AbstractOpenshiftInstaller +from openshift_nightlies.tasks.utils import rosa_post_install +from common.models.dag_config import DagConfig +from openshift_nightlies.models.release import OpenshiftRelease + +import requests +import uuid + +from airflow.operators.bash import BashOperator +from airflow.models import Variable +from kubernetes.client import models as k8s + +import json + +# Defines Tasks for installation of Openshift Clusters + +class RosaHCPInstaller(AbstractOpenshiftInstaller): + def __init__(self, dag, config: DagConfig, release: OpenshiftRelease): + super().__init__(dag, config, release) + self.exec_config = executor.get_default_executor_config(self.dag_config, executor_image="airflow-managed-services") + self.rosa_postinstall_setup = rosa_post_install.Diagnosis(dag, config, release) + + def get_install_hcp_task(self): + for iteration in range(self.config['number_of_hostedcluster']): + c_id = f"{'hcp-'+str(iteration+1)}" # adding 1 to name the cluster hcp-1, hcp-2.. + yield c_id, self._get_task(operation="install", id=c_id), self.rosa_postinstall_setup._get_rosa_postinstallation(id=c_id), self._get_task(operation="cleanup", id=c_id) + + def wait_task(self, id="wait_task"): + return BashOperator(task_id=f"{id}", + depends_on_past=False, + trigger_rule="all_success", + dag=self.dag, + bash_command="sleep 60s") + + # Create Airflow Task for Install/Cleanup steps + def _get_task(self, operation="install", id="", trigger_rule="all_success"): + self._setup_task(operation=operation) + task_prefix=f"{id}-" + self.env = { + "ES_SERVER": var_loader.get_secret('elasticsearch'), + "ES_INDEX": "ripsaw-kube-burner", + "THANOS_RECEIVER_URL": var_loader.get_secret("thanos_receiver_url"), + "PROM_URL": var_loader.get_secret("thanos_querier_url"), + **self.env + } + env = {**self.env, **{"HOSTED_ID": id}} + command=f"{constants.root_dag_dir}/scripts/install/rosa-hcp.sh -v {self.release.version} -j /tmp/{self.release_name}-{operation}-task.json -o {operation}" + + return BashOperator( + task_id=f"{task_prefix if id != '' else ''}{operation}", + depends_on_past=False, + bash_command=command, + retries=3, + dag=self.dag, + trigger_rule=trigger_rule, + executor_config=self.exec_config, + env=env + ) diff --git a/dags/openshift_nightlies/util/manifest.py b/dags/openshift_nightlies/util/manifest.py index 3df0ef222..98b2aa3ee 100644 --- a/dags/openshift_nightlies/util/manifest.py +++ b/dags/openshift_nightlies/util/manifest.py @@ -163,6 +163,33 @@ def get_rosa_releases(self): } ) + def get_rosahcp_releases(self): + rosahcp = self.yaml['platforms']['rosahcp'] + for version in self.yaml['versions']: + if version['version'] in rosahcp['versions']: + version_number = version['version'] + release_stream = version['releaseStream'] + version_alias = version['alias'] + for variant in rosahcp['variants']: + release = OpenshiftRelease( + platform="rosahcp", + version=version_number, + release_stream=release_stream, + latest_release=self.latest_releases[release_stream], + variant=variant['name'], + config=variant['config'], + version_alias=version_alias + ) + schedule = self._get_schedule(variant, 'rosahcp') + dag_config = self._build_dag_config(schedule) + + self.releases.append( + { + "config": dag_config, + "release": release + } + ) + def get_rogcp_releases(self): rogcp = self.yaml['platforms']['rogcp'] for version in self.yaml['versions']: @@ -248,8 +275,10 @@ def get_releases(self): self.get_openstack_releases() if 'rosa' in self.yaml['platforms']: 
self.get_rosa_releases() + if 'rosahcp' in self.yaml['platforms']: + self.get_rosahcp_releases() if 'rogcp' in self.yaml['platforms']: - self.get_rogcp_releases() + self.get_rogcp_releases() if 'hypershift' in self.yaml['platforms']: self.get_hypershift_releases() if 'prebuilt' in self.yaml['platforms']: diff --git a/dags/openshift_nightlies/util/var_loader.py b/dags/openshift_nightlies/util/var_loader.py index 670470ef5..e8d95bb3c 100644 --- a/dags/openshift_nightlies/util/var_loader.py +++ b/dags/openshift_nightlies/util/var_loader.py @@ -25,7 +25,7 @@ def get_config_vars(release: OpenshiftRelease, task="install", config_dir=f"{con if release.platform == 'baremetal' and "bench" in task: file_path = f"{config_dir}/{release.config['benchmarks']}/{task}.json" return get_json(file_path) - elif ( release.platform == 'hypershift' or release.platform == 'rosa' ) and "hcp" in task: + elif ( release.platform == 'hypershift' or release.platform == 'rosa'or release.platform == 'rosahcp' ) and "hcp" in task: file_path = f"{config_dir}/benchmarks/{release.config['benchmarks']}" return get_json(file_path) elif task in release.config: From 0ec762fc5bcfd324735a87e4568b117338d12f26 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy Date: Thu, 27 Jul 2023 08:33:11 -0400 Subject: [PATCH 2/8] trimmed down extra char --- dags/openshift_nightlies/util/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/openshift_nightlies/util/manifest.py b/dags/openshift_nightlies/util/manifest.py index 98b2aa3ee..c99001352 100644 --- a/dags/openshift_nightlies/util/manifest.py +++ b/dags/openshift_nightlies/util/manifest.py @@ -278,7 +278,7 @@ def get_releases(self): if 'rosahcp' in self.yaml['platforms']: self.get_rosahcp_releases() if 'rogcp' in self.yaml['platforms']: - self.get_rogcp_releases() + self.get_rogcp_releases() if 'hypershift' in self.yaml['platforms']: self.get_hypershift_releases() if 'prebuilt' in self.yaml['platforms']: From 0233da325b20029ec693667bd1e01c2e9c754f8b Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy Date: Thu, 27 Jul 2023 09:59:30 -0400 Subject: [PATCH 3/8] updated workload HCP light-weight c-d-v2 iteration will be 7 iteration/node --- .../config/benchmarks/hcp-small-control-plane.json | 3 ++- .../config/benchmarks/hosted-control-plane-p75.json | 6 +++--- .../config/benchmarks/hosted-control-plane-p90.json | 6 +++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json b/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json index 694c01cd7..5cd5c7271 100644 --- a/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json @@ -31,7 +31,8 @@ "WORKLOAD": "cluster-density-v2", "ITERATIONS": "500", "LOG_LEVEL": "debug", - "CHURN": "false" + "CHURN": "true", + "EXTRA_FLAGS": "--churn-duration=1h --churn-percent=10 --churn-delay=30s" } } ] diff --git a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json index d64034267..47e7ea0dc 100644 --- a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json +++ b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json @@ -1,13 +1,13 @@ { "benchmarks": [ { - "name": "cluster-density-ms-p75", + "name": "cluster-density-v2-p75", "workload": "kube-burner-ocp-wrapper", "trigger_rule": "all_done", "command": "./run.sh", 
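
# Aside: the ITERATIONS values in the hunks below follow the 7-iterations-per-worker-node
# sizing called out in this commit message for cluster-density-v2; the worker counts used
# here are assumptions for illustration only.
ITERATIONS_PER_NODE=7
P75_WORKERS=9; P90_WORKERS=12                                  # assumed worker counts
echo "p75 iterations: $((ITERATIONS_PER_NODE * P75_WORKERS))"  # -> 63
echo "p90 iterations: $((ITERATIONS_PER_NODE * P90_WORKERS))"  # -> 84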
"env": { - "WORKLOAD": "cluster-density-ms", - "ITERATIONS": "75", + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "63", "LOG_LEVEL": "debug", "EXTRA_FLAGS": "--churn-duration=1h --churn-percent=10 --churn-delay=30s" } diff --git a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json index ed418fbdb..af6671cb3 100644 --- a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json +++ b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json @@ -1,13 +1,13 @@ { "benchmarks": [ { - "name": "cluster-density-ms-p90", + "name": "cluster-density-v2-p90", "workload": "kube-burner-ocp-wrapper", "trigger_rule": "all_done", "command": "./run.sh", "env": { - "WORKLOAD": "cluster-density-ms", - "ITERATIONS": "90", + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "84", "LOG_LEVEL": "debug", "EXTRA_FLAGS": "--churn-duration=1h --churn-percent=10 --churn-delay=30s" } From e3a378d74187f5e92e88c2d9a13ecd0f34087597 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy Date: Thu, 27 Jul 2023 10:00:53 -0400 Subject: [PATCH 4/8] removed extra env vars --- .../config/benchmarks/hcp-small-control-plane.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json b/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json index 5cd5c7271..d6eaa6bb7 100644 --- a/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json @@ -7,8 +7,7 @@ "command": "./run.sh", "env": { "WORKLOAD": "node-density", - "LOG_LEVEL": "debug", - "CHURN": "false" + "LOG_LEVEL": "debug" } }, { @@ -18,8 +17,7 @@ "command": "./run.sh", "env": { "WORKLOAD": "node-density-cni", - "LOG_LEVEL": "debug", - "CHURN": "false" + "LOG_LEVEL": "debug" } }, { From 267cdede246f20f1b94a87e35f947f2442ca5c9e Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy Date: Fri, 28 Jul 2023 09:39:24 -0400 Subject: [PATCH 5/8] increasing node ready timeout --- dags/openshift_nightlies/scripts/install/rosa-hcp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/openshift_nightlies/scripts/install/rosa-hcp.sh b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh index 4c995e62e..05b58dc5a 100755 --- a/dags/openshift_nightlies/scripts/install/rosa-hcp.sh +++ b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh @@ -41,7 +41,7 @@ _wait_for_nodes_ready(){ ITERATIONS=0 NODES_COUNT=$2 # 30 seconds per node, waiting for all nodes ready to finalize - while [ ${ITERATIONS} -le $((${NODES_COUNT}*5)) ] ; do + while [ ${ITERATIONS} -le $((NODES_COUNT*10)) ] ; do NODES_READY_COUNT=$(oc get nodes -l $3 | grep " Ready " | wc -l) if [ ${NODES_READY_COUNT} -ne ${NODES_COUNT} ] ; then echo "WARNING: ${ITERATIONS}/${NODES_COUNT} iterations. ${NODES_READY_COUNT}/${NODES_COUNT} $3 nodes ready. 
Waiting 30 seconds for next check" From 1eb81c6e2310322c89dec50dfd5e2a7c29b43723 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy Date: Fri, 28 Jul 2023 10:08:11 -0400 Subject: [PATCH 6/8] mgmt cluster env var --- dags/openshift_nightlies/scripts/install/rosa-hcp.sh | 3 ++- dags/openshift_nightlies/tasks/benchmarks/e2e.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/dags/openshift_nightlies/scripts/install/rosa-hcp.sh b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh index 05b58dc5a..000a653ae 100755 --- a/dags/openshift_nightlies/scripts/install/rosa-hcp.sh +++ b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh @@ -698,7 +698,8 @@ index_mgmt_cluster_stat(){ cat baseconfig.yml HCP_NAMESPACE="$(_get_cluster_id ${CLUSTER_NAME})-$CLUSTER_NAME" MC_PROMETHEUS=https://$(oc --kubeconfig=./mgmt_kubeconfig get route -n openshift-monitoring prometheus-k8s -o jsonpath="{.spec.host}") - MC_PROMETHEUS_TOKEN=$(oc --kubeconfig=./mgmt_kubeconfig sa new-token -n openshift-monitoring prometheus-k8s) + MC_PROMETHEUS_TOKEN=$(oc --kubeconfig=./mgmt_kubeconfig sa new-token -n openshift-monitoring prometheus-k8s) + MGMT_CLUSTER_NAME=$(oc get --kubeconfig=./mgmt_kubeconfig infrastructure.config.openshift.io cluster -o json 2>/dev/null | jq -r .status.infrastructureName) Q_NODES="" for n in $(curl -H "Authorization: Bearer ${MC_PROMETHEUS_TOKEN}" -k --silent --globoff ${MC_PROMETHEUS}/api/v1/query?query='sum(kube_node_role{role!~"master|infra|workload|obo"})by(node)&time='$(date +"%s")'' | jq -r '.data.result[].metric.node'); do diff --git a/dags/openshift_nightlies/tasks/benchmarks/e2e.py b/dags/openshift_nightlies/tasks/benchmarks/e2e.py index 7dbaa5289..9608888bc 100644 --- a/dags/openshift_nightlies/tasks/benchmarks/e2e.py +++ b/dags/openshift_nightlies/tasks/benchmarks/e2e.py @@ -107,8 +107,6 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease, task_group cluster_name = release._generate_cluster_name() self.env = { **self.env, - "MGMT_CLUSTER_NAME": f"{self.install_vars['staging_mgmt_cluster_name']}.*", - "SVC_CLUSTER_NAME": f"{self.install_vars['staging_svc_cluster_name']}.*", "MGMT_KUBECONFIG_SECRET": "staging-mgmt-cluster-kubeconfig", **self._insert_kube_env() } From 85d6ffffe5dd705e8c1c10bfeaa6729a9e9a93dc Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy Date: Fri, 28 Jul 2023 13:49:57 -0400 Subject: [PATCH 7/8] scrapping metric for longer duration --- dags/openshift_nightlies/scripts/install/rosa-hcp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/openshift_nightlies/scripts/install/rosa-hcp.sh b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh index 000a653ae..e8f035313 100755 --- a/dags/openshift_nightlies/scripts/install/rosa-hcp.sh +++ b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh @@ -730,7 +730,7 @@ EOF curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/${ES_INDEX}/_doc -d "${METADATA}" -o /dev/null echo "Running kube-burner index.." 
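
# Aside: the one-line change below widens the kube-burner scrape window. END_TIME is
# still captured when the operation finishes, but indexing now reads metrics for an
# extra 600s past it, presumably so samples recorded on the management cluster shortly
# after the run are still captured.
START_TIME=$(date +%s)                                   # captured before the operation
END_TIME=$(date +%s)                                     # captured when the operation ends
echo "indexing window: ${START_TIME} .. $((END_TIME + 600))"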
- kube-burner index --uuid=${UUID} --prometheus-url=${MC_PROMETHEUS} --token ${MC_PROMETHEUS_TOKEN} --start=$START_TIME --end=$END_TIME --step 2m --metrics-profile ${METRIC_PROFILE} --config ./baseconfig.yml --log-level debug + kube-burner index --uuid=${UUID} --prometheus-url=${MC_PROMETHEUS} --token ${MC_PROMETHEUS_TOKEN} --start=$START_TIME --end=$((END_TIME+600)) --step 2m --metrics-profile ${METRIC_PROFILE} --config ./baseconfig.yml --log-level debug echo "Finished indexing results" } From 8f74950af6d587e017a4f9ffc4b6bf1cad18d607 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy Date: Tue, 1 Aug 2023 10:43:32 -0400 Subject: [PATCH 8/8] updated query logic to index MC stat --- .../scripts/install/rosa-hcp.sh | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/dags/openshift_nightlies/scripts/install/rosa-hcp.sh b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh index e8f035313..4b5a27b87 100755 --- a/dags/openshift_nightlies/scripts/install/rosa-hcp.sh +++ b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh @@ -694,18 +694,21 @@ index_mgmt_cluster_stat(){ sudo tar -xvzf kube-burner.tar.gz -C /usr/local/bin/ git clone -q -b ${E2E_BENCHMARKING_BRANCH} ${E2E_BENCHMARKING_REPO} --depth=1 --single-branch METRIC_PROFILE=/home/airflow/workspace/e2e-benchmarking/workloads/kube-burner-ocp-wrapper/metrics-profiles/mc-metrics.yml - envsubst < /home/airflow/workspace/e2e-benchmarking/workloads/kube-burner/workloads/managed-services/baseconfig.yml > baseconfig.yml - cat baseconfig.yml + cat > baseconfig.yml << EOF +--- +global: + indexerConfig: + esServers: ["${ES_SERVER}"] + insecureSkipVerify: true + defaultIndex: ${ES_INDEX} + type: elastic +EOF + HCP_NAMESPACE="$(_get_cluster_id ${CLUSTER_NAME})-$CLUSTER_NAME" MC_PROMETHEUS=https://$(oc --kubeconfig=./mgmt_kubeconfig get route -n openshift-monitoring prometheus-k8s -o jsonpath="{.spec.host}") MC_PROMETHEUS_TOKEN=$(oc --kubeconfig=./mgmt_kubeconfig sa new-token -n openshift-monitoring prometheus-k8s) - MGMT_CLUSTER_NAME=$(oc get --kubeconfig=./mgmt_kubeconfig infrastructure.config.openshift.io cluster -o json 2>/dev/null | jq -r .status.infrastructureName) - Q_NODES="" - for n in $(curl -H "Authorization: Bearer ${MC_PROMETHEUS_TOKEN}" -k --silent --globoff ${MC_PROMETHEUS}/api/v1/query?query='sum(kube_node_role{role!~"master|infra|workload|obo"})by(node)&time='$(date +"%s")'' | jq -r '.data.result[].metric.node'); - do - if [[ ${Q_NODES} == "" ]]; then Q_NODES=${n}; else Q_NODES=${Q_NODES}"|"${n}; fi - done - MGMT_WORKER_NODES=${Q_NODES} + Q_NODES=$(curl -H "Authorization: Bearer ${MC_PROMETHEUS_TOKEN}" -k --silent --globoff ${MC_PROMETHEUS}/api/v1/query?query='sum(kube_node_role{role!~"master|infra|workload|obo"})by(node)&time='$(date +"%s")'' | jq -r '.data.result[].metric.node' | xargs) + MGMT_WORKER_NODES=${Q_NODES// /|} echo "Exporting required vars" cat << EOF MC_PROMETHEUS: ${MC_PROMETHEUS}