From 115a198c41063cb14b99850529f8fda3787641b2 Mon Sep 17 00:00:00 2001
From: Pradipta
Date: Thu, 8 Mar 2018 18:40:39 -0800
Subject: [PATCH] Updated the script to permanently avoid the race conditions

---
 nodes.txt                               |  7 +--
 scripts/delete_all_circe_deployments.py |  3 +
 scripts/delete_all_profilers.py         |  5 +-
 scripts/delete_all_waves.py             |  5 +-
 scripts/k8s_circe_scheduler.py          | 53 ++++++++++++++++
 scripts/k8s_jupiter_deploy.py           | 56 +----------------
 scripts/k8s_profiler_scheduler.py       | 57 ++++++++++++++++-
 scripts/k8s_wave_scheduler.py           | 63 ++++++++++++++++++-
 scripts/static_assignment.py            | 84 ++++++++++++-------------
 9 files changed, 227 insertions(+), 106 deletions(-)

diff --git a/nodes.txt b/nodes.txt
index 9b2299f69..385023342 100644
--- a/nodes.txt
+++ b/nodes.txt
@@ -1,4 +1,5 @@
 home ubuntu-2gb-ams2-04 root PASSWORD
+node1 ubuntu-2gb-sfo1-05 root PASSWORD
 node2 ubuntu-s-1vcpu-3gb-lon1-01 root PASSWORD
 node3 ubuntu-2gb-sfo1-03 root PASSWORD
 node4 ubuntu-2gb-ams2-03 root PASSWORD
@@ -39,7 +40,7 @@
 node38 ubuntu-s-1vcpu-3gb-lon1-02 root PASSWORD
 node39 ubuntu-s-1vcpu-3gb-lon1-03 root PASSWORD
 node40 ubuntu-s-1vcpu-3gb-lon1-04 root PASSWORD
 node41 ubuntu-s-1vcpu-3gb-lon1-05 root PASSWORD
-node42 ubuntu-s-1vcpu-3gb-lon1-06 root PASSWORD
+node42 ubuntu-2gb-sfo1-04 root PASSWORD
 node43 ubuntu-s-1vcpu-3gb-lon1-07 root PASSWORD
 node44 ubuntu-s-1vcpu-3gb-lon1-08 root PASSWORD
 node45 ubuntu-s-1vcpu-3gb-lon1-09 root PASSWORD
@@ -85,6 +86,4 @@
 node84 ubuntu-s-1vcpu-3gb-fra1-06 root PASSWORD
 node85 ubuntu-s-1vcpu-3gb-fra1-07 root PASSWORD
 node86 ubuntu-s-1vcpu-3gb-fra1-08 root PASSWORD
 node87 ubuntu-s-1vcpu-3gb-fra1-09 root PASSWORD
-node88 ubuntu-2gb-sfo1-03 root PASSWORD
-node89 ubuntu-2gb-sfo1-04 root PASSWORD
-node90 ubuntu-2gb-sfo1-05 root PASSWORD
\ No newline at end of file
+node88 ubuntu-2gb-sfo1-03 root PASSWORD
\ No newline at end of file
diff --git a/scripts/delete_all_circe_deployments.py b/scripts/delete_all_circe_deployments.py
index 55762a5ea..7bdf3383e 100644
--- a/scripts/delete_all_circe_deployments.py
+++ b/scripts/delete_all_circe_deployments.py
@@ -150,3 +150,6 @@ def delete_all_circe_deployments():
     if resp:
         del_resp_2 = core_v1_api.delete_namespaced_service('home', namespace)
         print("Service Deleted. status='%s'" % str(del_resp_2.status))
+
+if __name__ == '__main__':
+    delete_all_circe_deployments()
\ No newline at end of file
diff --git a/scripts/delete_all_profilers.py b/scripts/delete_all_profilers.py
index 85e6da9a1..282667132 100644
--- a/scripts/delete_all_profilers.py
+++ b/scripts/delete_all_profilers.py
@@ -101,4 +101,7 @@ def delete_all_profilers():
         del_resp_2 = api_2.delete_namespaced_service(key, namespace)
         print("Service Deleted. status='%s'" % str(del_resp_2.status))
 
-    # At this point you should not have any of the profiler related service, pod, or deployment running
\ No newline at end of file
+    # At this point you should not have any of the profiler related service, pod, or deployment running
+
+if __name__ == '__main__':
+    delete_all_profilers()
\ No newline at end of file
diff --git a/scripts/delete_all_waves.py b/scripts/delete_all_waves.py
index 41e4284f7..a1e1b3ba7 100644
--- a/scripts/delete_all_waves.py
+++ b/scripts/delete_all_waves.py
@@ -101,4 +101,7 @@ def delete_all_waves():
         del_resp_2 = api_2.delete_namespaced_service(key, namespace)
         print("Service Deleted. status='%s'" % str(del_resp_2.status))
 
-    # At this point you should not have any of the profiler related service, pod, or deployment running
\ No newline at end of file
+    # At this point you should not have any of the wave related services, pods, or deployments running
+
+if __name__ == '__main__':
+    delete_all_waves()
\ No newline at end of file
diff --git a/scripts/k8s_circe_scheduler.py b/scripts/k8s_circe_scheduler.py
index 247340a6d..7ad788429 100644
--- a/scripts/k8s_circe_scheduler.py
+++ b/scripts/k8s_circe_scheduler.py
@@ -24,6 +24,55 @@
 from kubernetes import client, config
 from pprint import *
 
+"""
+    This function prints out all the tasks that are not running.
+    If all the tasks are running: return True; else return False.
+"""
+def check_status_circe(dag):
+
+
+    """
+        This loads the kubernetes instance configuration.
+        In our case this is stored in admin.conf.
+        You should set the config file path in the jupiter_config.py file.
+    """
+    config.load_kube_config(config_file = jupiter_config.KUBECONFIG_PATH)
+    namespace = jupiter_config.DEPLOYMENT_NAMESPACE
+
+
+    # We have defined the namespace for deployments in jupiter_config
+
+    # Get proper handles or pointers to the Kubernetes Python client to call different functions.
+    extensions_v1_beta1_api = client.ExtensionsV1beta1Api()
+    v1_delete_options = client.V1DeleteOptions()
+    core_v1_api = client.CoreV1Api()
+
+    result = True
+    for key, value in dag.items():
+        # First check whether a deployment named `key` exists in this namespace,
+        # then check whether its replicaset is running by using the label app={key};
+        # Kubernetes labels are used to identify the replicaset associated with each task.
+        label = "app=" + key
+
+        resp = None
+
+        resp = core_v1_api.list_namespaced_pod(namespace, label_selector = label)
+        # If a pod exists for this task, check whether it has reached the Running phase
+        if resp.items:
+            a = resp.items[0]
+            if a.status.phase != "Running":
+                print("Pod Not Running", key)
+                result = False
+
+        # print("Pod Deleted. status='%s'" % str(del_resp_2.status))
+
+    if result:
+        print("All systems GOOOOO!!")
+    else:
+        print("Wait before trying again!!!!")
+
+    return result
+
 # if __name__ == '__main__':
 def k8s_circe_scheduler(dag_info , temp_info):
 
@@ -180,6 +229,10 @@ def k8s_circe_scheduler(dag_info , temp_info):
         resp = k8s_beta.create_namespaced_deployment(body = dep, namespace = namespace)
         print("Deployment created. status = '%s'" % str(resp.status))
 
+    while True:
+        if check_status_circe(dag):
+            break
+        time.sleep(30)
 
     home_dep = write_home_specs(image = jupiter_config.HOME_IMAGE,
                                 host = jupiter_config.HOME_NODE,
diff --git a/scripts/k8s_jupiter_deploy.py b/scripts/k8s_jupiter_deploy.py
index e6fb06ea3..c3df95625 100644
--- a/scripts/k8s_jupiter_deploy.py
+++ b/scripts/k8s_jupiter_deploy.py
@@ -26,65 +26,11 @@
 import requests
 import json
 from pprint import *
+from utilities import *
 
 static_mapping = False
 
-"""
-    read the dag from the file input
-"""
-def k8s_read_dag(dag_info_file):
-
-    dag_info=[]
-    config_file = open(dag_info_file,'r')
-    dag_size = int(config_file.readline())
-
-    dag={}
-    for i, line in enumerate(config_file, 1):
-        dag_line = line.strip().split(" ")
-        if i == 1:
-            dag_info.append(dag_line[0])
-        dag.setdefault(dag_line[0], [])
-        for j in range(1,len(dag_line)):
-            dag[dag_line[0]].append(dag_line[j])
-        if i == dag_size:
-            break
-
-    dag_info.append(dag)
-    return dag_info
-
-
-def k8s_get_nodes(node_info_file):
-
-    nodes = {}
-    node_file = open(node_info_file, "r")
-    for line in node_file:
-        node_line = line.strip().split(" ")
-        nodes.setdefault(node_line[0], [])
-        for i in range(1, len(node_line)):
-            nodes[node_line[0]].append(node_line[i])
-    return nodes
-
-
-def k8s_get_hosts(dag_info_file, node_info_file, mapping):
-
-    dag_info = k8s_read_dag(dag_info_file)
-    nodes = k8s_get_nodes(node_info_file)
-
-    hosts={}
-
-    for i in mapping:
-        #get task, node IP, username and password
-        hosts.setdefault(i,[])
-        hosts[i].append(i) # task
-        hosts[i].extend(nodes.get(mapping[i])) # assigned node id
-
-    hosts.setdefault('home',[])
-    hosts['home'].append('home')
-    hosts['home'].extend(nodes.get('home'))
-    dag_info.append(hosts)
-    return dag_info
-
 
 if __name__ == '__main__':
diff --git a/scripts/k8s_profiler_scheduler.py b/scripts/k8s_profiler_scheduler.py
index cba1f7ce1..71d4fd517 100644
--- a/scripts/k8s_profiler_scheduler.py
+++ b/scripts/k8s_profiler_scheduler.py
@@ -24,9 +24,56 @@
 import os
 import jupiter_config
 
+def check_status_profilers():
+
+
+    path1 = jupiter_config.HERE + 'nodes.txt'
+    nodes = read_node_list(path1)
+
+    """
+        This loads the kubernetes instance configuration.
+        In our case this is stored in admin.conf.
+        You should set the config file path in the jupiter_config.py file.
+    """
+    config.load_kube_config(config_file = jupiter_config.KUBECONFIG_PATH)
+    namespace = jupiter_config.PROFILER_NAMESPACE
+
+
+    # We have defined the namespace for deployments in jupiter_config
+
+    # Get proper handles or pointers to the Kubernetes Python client to call different functions.
+    extensions_v1_beta1_api = client.ExtensionsV1beta1Api()
+    v1_delete_options = client.V1DeleteOptions()
+    core_v1_api = client.CoreV1Api()
+
+    result = True
+    for key in nodes:
+
+        # First check whether a deployment named `key` exists in this namespace,
+        # then check whether its replicaset is running by using the label app={key}profiler;
+        # Kubernetes labels are used to identify the replicaset associated with each profiler.
+        label = "app=" + key + "profiler"
+
+        resp = None
+
+        resp = core_v1_api.list_namespaced_pod(namespace, label_selector = label)
+        # If a pod exists for this node, check whether it has reached the Running phase
+        if resp.items:
+            a = resp.items[0]
+            if a.status.phase != "Running":
+                print("Pod Not Running", key)
+                result = False
+
+        # print("Pod Deleted. status='%s'" % str(del_resp_2.status))
+
+    if result:
+        print("All systems GOOOOO!!")
+    else:
+        print("Wait before trying again!!!!")
+
+    return result
 
-# if __name__ == '__main__':
 def k8s_profiler_scheduler():
     """
         This loads the task graph and node list
@@ -145,7 +192,11 @@ def k8s_profiler_scheduler():
 
     # have to somehow make sure that the worker nodes are on and working by this time
-    time.sleep(30)
+    while True:
+        if check_status_profilers():
+            break
+        time.sleep(30)
+
 
     home_dep = write_profiler_specs(name = 'home', label = "homeprofiler",
                                     image = jupiter_config.PROFILER_HOME_IMAGE,
                                     host = jupiter_config.HOME_NODE,
@@ -158,3 +209,5 @@ def k8s_profiler_scheduler():
 
     pprint(service_ips)
     return(service_ips)
+if __name__ == '__main__':
+    k8s_profiler_scheduler()
diff --git a/scripts/k8s_wave_scheduler.py b/scripts/k8s_wave_scheduler.py
index 6ad437234..0e4286109 100644
--- a/scripts/k8s_wave_scheduler.py
+++ b/scripts/k8s_wave_scheduler.py
@@ -23,9 +23,61 @@
 from pprint import *
 import os
 import jupiter_config
+from k8s_get_service_ips import *
 
 
+"""
+    This function prints out all the waves that are not running.
+    If all the waves are running: return True; else return False.
+"""
+def check_status_waves():
+
+
+    path1 = jupiter_config.HERE + 'nodes.txt'
+    nodes = read_node_list(path1)
+
+    """
+        This loads the kubernetes instance configuration.
+        In our case this is stored in admin.conf.
+        You should set the config file path in the jupiter_config.py file.
+    """
+    config.load_kube_config(config_file = jupiter_config.KUBECONFIG_PATH)
+    namespace = jupiter_config.WAVE_NAMESPACE
+
+    # We have defined the namespace for deployments in jupiter_config
+
+    # Get proper handles or pointers to the Kubernetes Python client to call different functions.
+    extensions_v1_beta1_api = client.ExtensionsV1beta1Api()
+    v1_delete_options = client.V1DeleteOptions()
+    core_v1_api = client.CoreV1Api()
+
+    result = True
+    for key in nodes:
+
+        # First check whether a deployment named `key` exists in this namespace,
+        # then check whether its replicaset is running by using the label app=wave_{key};
+        # Kubernetes labels are used to identify the replicaset associated with each wave.
+        label = "app=wave_" + key
+
+        resp = None
+
+        resp = core_v1_api.list_namespaced_pod(namespace, label_selector = label)
+        # If a pod exists for this node, check whether it has reached the Running phase
+        if resp.items:
+            a = resp.items[0]
+            if a.status.phase != "Running":
+                print("Pod Not Running", key)
+                result = False
+
+        # print("Pod Deleted. status='%s'" % str(del_resp_2.status))
+
+    if result:
+        print("All systems GOOOOO!!")
+    else:
+        print("Wait before trying again!!!!")
+
+    return result
+
 # if __name__ == '__main__':
 def k8s_wave_scheduler(profiler_ips):
     """
@@ -135,7 +187,11 @@ def k8s_wave_scheduler(profiler_ips):
 
     # TODO: have to make sure that the worker nodes are on and working by this time
-    time.sleep(60)
+    while True:
+        if check_status_waves():
+            break
+        time.sleep(30)
+
 
     home_dep = write_wave_specs(name = 'home', label = "wave_home",
                                 image = jupiter_config.WAVE_HOME_IMAGE,
                                 host = jupiter_config.HOME_NODE, all_node = nexthost_names,
@@ -148,3 +204,8 @@ def k8s_wave_scheduler(profiler_ips):
     print("Home deployment created. status = '%s'" % str(resp.status))
 
     pprint(service_ips)
+
+
+if __name__ == '__main__':
+    profiler_ips = get_all_profilers()
+    k8s_wave_scheduler(profiler_ips)
diff --git a/scripts/static_assignment.py b/scripts/static_assignment.py
index 336b5d82e..1aeeb21d5 100644
--- a/scripts/static_assignment.py
+++ b/scripts/static_assignment.py
@@ -58,18 +58,18 @@
  'teradetector0': ['1', 'true', 'fusioncenter0', 'teramaster0'],
  'teradetector1': ['1', 'true', 'fusioncenter1', 'teramaster1'],
  'teradetector2': ['1', 'true', 'fusioncenter2', 'teramaster2'],
- 'teramaster0': ['1', 'false', 'teraslave00', 'teraslave01', 'teraslave02'],
- 'teramaster1': ['1', 'false', 'teraslave10', 'teraslave11', 'teraslave12'],
- 'teramaster2': ['1', 'false', 'teraslave20', 'teraslave21', 'teraslave22'],
- 'teraslave00': ['1', 'false', 'teraslave00'],
- 'teraslave01': ['1', 'false', 'teraslave01'],
- 'teraslave02': ['1', 'false', 'teraslave02'],
- 'teraslave10': ['1', 'false', 'teraslave10'],
- 'teraslave11': ['1', 'false', 'teraslave11'],
- 'teraslave12': ['1', 'false', 'teraslave12'],
- 'teraslave20': ['1', 'false', 'teraslave20'],
- 'teraslave21': ['1', 'false', 'teraslave21'],
- 'teraslave22': ['1', 'false', 'teraslave22']},
+ 'teramaster0': ['1', 'false', 'teraworker00', 'teraworker01', 'teraworker02'],
+ 'teramaster1': ['1', 'false', 'teraworker10', 'teraworker11', 'teraworker12'],
+ 'teramaster2': ['1', 'false', 'teraworker20', 'teraworker21', 'teraworker22'],
+ 'teraworker00': ['1', 'false', 'teraworker00'],
+ 'teraworker01': ['1', 'false', 'teraworker01'],
+ 'teraworker02': ['1', 'false', 'teraworker02'],
+ 'teraworker10': ['1', 'false', 'teraworker10'],
+ 'teraworker11': ['1', 'false', 'teraworker11'],
+ 'teraworker12': ['1', 'false', 'teraworker12'],
+ 'teraworker20': ['1', 'false', 'teraworker20'],
+ 'teraworker21': ['1', 'false', 'teraworker21'],
+ 'teraworker22': ['1', 'false', 'teraworker22']},
 {'aggregate0': 'node16',
  'aggregate1': 'node20',
  'aggregate2': 'node33',
@@ -102,15 +102,15 @@
  'teramaster0': 'node21',
  'teramaster1': 'node53',
  'teramaster2': 'node16',
- 'teraslave00': 'node29',
- 'teraslave01': 'node24',
- 'teraslave02': 'node12',
- 'teraslave10': 'node20',
- 'teraslave11': 'node52',
- 'teraslave12': 'node34',
- 'teraslave20': 'node63',
- 'teraslave21': 'node54',
- 'teraslave22': 'node6'}]
+ 'teraworker00': 'node29',
+ 'teraworker01': 'node24',
+ 'teraworker02': 'node12',
+ 'teraworker10': 'node20',
+ 'teraworker11': 'node52',
+ 'teraworker12': 'node34',
+ 'teraworker20': 'node63',
+ 'teraworker21': 'node54',
+ 'teraworker22': 'node6'}]
 
 schedule = ['localpro',
  {'aggregate0': ['1',
                  'true',
@@ -171,18 +171,18 @@
  'teradetector0': ['1', 'true', 'fusioncenter0', 'teramaster0'],
  'teradetector1': ['1', 'true', 'fusioncenter1', 'teramaster1'],
  'teradetector2': ['1', 'true', 'fusioncenter2', 'teramaster2'],
- 'teramaster0': ['1', 'false', 'teraslave00', 'teraslave01', 'teraslave02'],
- 'teramaster1': ['1', 'false', 'teraslave10', 'teraslave11', 'teraslave12'],
- 'teramaster2': ['1', 'false', 'teraslave20', 'teraslave21', 'teraslave22'],
- 'teraslave00': ['1', 'false', 'teraslave00'],
- 'teraslave01': ['1', 'false', 'teraslave01'],
- 'teraslave02': ['1', 'false', 'teraslave02'],
- 'teraslave10': ['1', 'false', 'teraslave10'],
- 'teraslave11': ['1', 'false', 'teraslave11'],
- 'teraslave12': ['1', 'false', 'teraslave12'],
- 'teraslave20': ['1', 'false', 'teraslave20'],
- 'teraslave21': ['1', 'false', 'teraslave21'],
- 'teraslave22': ['1', 'false', 'teraslave22']},
+ 'teramaster0': ['1', 'false', 'teraworker00', 'teraworker01', 'teraworker02'],
+ 'teramaster1': ['1', 'false', 'teraworker10', 'teraworker11', 'teraworker12'],
+ 'teramaster2': ['1', 'false', 'teraworker20', 'teraworker21', 'teraworker22'],
+ 'teraworker00': ['1', 'false', 'teraworker00'],
+ 'teraworker01': ['1', 'false', 'teraworker01'],
+ 'teraworker02': ['1', 'false', 'teraworker02'],
+ 'teraworker10': ['1', 'false', 'teraworker10'],
+ 'teraworker11': ['1', 'false', 'teraworker11'],
+ 'teraworker12': ['1', 'false', 'teraworker12'],
+ 'teraworker20': ['1', 'false', 'teraworker20'],
+ 'teraworker21': ['1', 'false', 'teraworker21'],
+ 'teraworker22': ['1', 'false', 'teraworker22']},
 {'aggregate0': ['aggregate0',
                 'ubuntu-s-1vcpu-3gb-blr1-08',
                 'root',
                 'PASSWORD'],
@@ -309,36 +309,36 @@
                 'ubuntu-s-1vcpu-3gb-blr1-08',
                 'root',
                 'PASSWORD'],
- 'teraslave00': ['teraslave00',
+ 'teraworker00': ['teraworker00',
                 'ubuntu-s-1vcpu-3gb-ams3-01',
                 'root',
                 'PASSWORD'],
- 'teraslave01': ['teraslave01',
+ 'teraworker01': ['teraworker01',
                 'ubuntu-s-1vcpu-3gb-nyc1-05',
                 'root',
                 'PASSWORD'],
- 'teraslave02': ['teraslave02',
+ 'teraworker02': ['teraworker02',
                 'ubuntu-s-1vcpu-3gb-sgp1-01',
                 'root',
                 'PASSWORD'],
- 'teraslave10': ['teraslave10',
+ 'teraworker10': ['teraworker10',
                 'ubuntu-s-1vcpu-3gb-nyc1-01',
                 'root',
                 'PASSWORD'],
- 'teraslave11': ['teraslave11',
+ 'teraworker11': ['teraworker11',
                 'ubuntu-s-1vcpu-3gb-nyc3-02',
                 'root',
                 'PASSWORD'],
- 'teraslave12': ['teraslave12',
+ 'teraworker12': ['teraworker12',
                 'ubuntu-s-1vcpu-3gb-ams3-06',
                 'root',
                 'PASSWORD'],
- 'teraslave20': ['teraslave20',
+ 'teraworker20': ['teraworker20',
                 'ubuntu-s-1vcpu-3gb-sfo2-02',
                 'root',
                 'PASSWORD'],
- 'teraslave21': ['teraslave21',
+ 'teraworker21': ['teraworker21',
                 'ubuntu-s-1vcpu-3gb-nyc3-04',
                 'root',
                 'PASSWORD'],
- 'teraslave22': ['teraslave22', 'ubuntu-2gb-fra1-02', 'root', 'PASSWORD']}]
+ 'teraworker22': ['teraworker22', 'ubuntu-2gb-fra1-02', 'root', 'PASSWORD']}]
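
For reference, the readiness check this patch adds to each scheduler reduces to the same polling pattern: list the pods for each expected label and keep sleeping until every one of them reports the Running phase. The sketch below is a minimal standalone illustration of that pattern, not code from the patch; the helper name, the "default" namespace, and the example labels are assumptions, while the kubernetes client calls (load_kube_config, CoreV1Api.list_namespaced_pod) are the same ones the patch uses.

import time
from kubernetes import client, config

def wait_until_pods_running(labels, namespace="default", poll_interval=30):
    # Illustrative helper: block until every pod selected by `labels` is Running.
    # `labels` and `namespace` are placeholders, not values from jupiter_config.
    config.load_kube_config()
    core_v1 = client.CoreV1Api()
    while True:
        pending = []
        for label in labels:
            resp = core_v1.list_namespaced_pod(namespace, label_selector=label)
            # A missing pod, or a pod not yet in the Running phase, counts as pending.
            if not resp.items or resp.items[0].status.phase != "Running":
                pending.append(label)
        if not pending:
            return
        print("Waiting for pods:", pending)
        time.sleep(poll_interval)

# Example (hypothetical labels): block until both task pods are up before deploying home.
# wait_until_pods_running(["app=task0", "app=task1"])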