From 509ffecbd540fc595bba8a7518b82f419f7b233c Mon Sep 17 00:00:00 2001
From: Bianco95
Date: Mon, 10 Feb 2025 15:27:40 +0100
Subject: [PATCH] updated to support fpga

---
 change-values.sh     |   9 +-
 jupyter_template.py  | 336 ++++++++++++++++++++++++++++++++++++-------
 values_template.yaml |  16 ++-
 3 files changed, 300 insertions(+), 61 deletions(-)

diff --git a/change-values.sh b/change-values.sh
index d205ed6..889f7a0 100755
--- a/change-values.sh
+++ b/change-values.sh
@@ -6,8 +6,7 @@ cp values_template.yaml values.yaml
 
 source .env
 
-# change those values with the ones you want
-HOSTNAME="jhub.131.154.98.62.myip.cloud.infn.it"
+HOSTNAME="jhub.131.154.98.240.myip.cloud.infn.it"
 
 # values inside jupyter_template.py
 IAM_SERVER="https://iam.cloud.infn.it"
@@ -15,6 +14,9 @@ CALLBACK_URL="https://$HOSTNAME:443/hub/oauth_callback"
 JHUB_HOST="$HOSTNAME"
 JHUB_PORT="443"
 JHUB_API_URL="https://$HOSTNAME/hub/api"
+VK_NODENAME='gpu-node'
+GPU_CAP="1"
+FPGA_CAP="1"
 
 sed -i "s|__CALLBACK_URL__|\"$CALLBACK_URL\"|g" jupyterhubcustomconfig.py
 sed -i "s|__IAM_SERVER__|\"$IAM_SERVER\"|g" jupyterhubcustomconfig.py
@@ -25,6 +27,9 @@ sed -i "s|__JHUB_HOST__|\"$JHUB_HOST\"|g" jupyterhubcustomconfig.py
 sed -i "s|__JHUB_IP__|\"$HOSTNAME\"|g" jupyterhubcustomconfig.py
 sed -i "s|__JHUB_PORT__|\"$JHUB_PORT\"|g" jupyterhubcustomconfig.py
 sed -i "s|__JHUB_API_URL__|\"$JHUB_API_URL\"|g" jupyterhubcustomconfig.py
+sed -i "s|__VK_NODENAME__|\"$VK_NODENAME\"|g" jupyterhubcustomconfig.py
+sed -i "s|__GPU_CAP__|\"$GPU_CAP\"|g" jupyterhubcustomconfig.py
+sed -i "s|__FPGA_CAP__|\"$FPGA_CAP\"|g" jupyterhubcustomconfig.py
 
 # values inside values.yaml
 JHUB_URL="https://$HOSTNAME:443"
diff --git a/jupyter_template.py b/jupyter_template.py
index b03e31b..109bdfa 100644
--- a/jupyter_template.py
+++ b/jupyter_template.py
@@ -49,7 +49,7 @@ client_secret = cache_results["client_secret"]
 
 
 class EnvAuthenticator(GenericOAuthenticator):
-    
+
     @gen.coroutine
     def pre_spawn_start(self, user, spawner):
 
@@ -157,14 +157,37 @@ async def authenticate(self, handler, data=None):
 c.GenericOAuthenticator.scope = ['openid', 'profile', 'email', 'address', 'offline_access', 'groups']
 c.GenericOAuthenticator.username_key = "preferred_username"
 c.GenericOAuthenticator.enable_auth_state = True
+c.Authenticator.allow_all = True
 
 class CustomSpawner(kubespawner.KubeSpawner):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
        self.map_node_gpu = {}
+        self.map_node_fpga = {}
         self.gpus_status = {}
+        self.fpga_status = {}
         self.notebook_dir = ""
+        self.gpus_model_known = [
+            {
+                "model": "T4",
+                "image_url": "https://s.alicdn.com/@sc04/kf/Hb227c34878ab414388e8206abd03ef3cd.jpg_720x720q50.jpg"
+            },
+            {
+                "model": "A100",
+                "image_url": "https://www.workstationshop.it/pimages/Nvidia-A100-40GB-HBM2-extra-big-16862.png"
+            },
+            {
+                "model": "A200",
+                "image_url": "https://www.workstationshop.it/pimages/Nvidia-A100-40GB-HBM2-extra-big-16862.png"
+            }
+        ]
+        self.fpgas_model_known = [
+            {
+                "model": "U55C",
+                "image_url": "https://www.colfaxdirect.com/store/pc/catalog/u55c.png"
+            }
+        ]
 
     def get_args(self):
         # Get the default arguments
@@ -219,6 +242,89 @@ def options_form(self):
 
     def generate_options_form(self):
         options_to_return = """
+
+
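The accelerator catalogues added to CustomSpawner.__init__ above (gpus_model_known and fpgas_model_known) are what the rest of the patch matches against each node's accelerator label. A minimal sketch of that lookup, with a hypothetical extra entry to show how another model could be registered; the H100 entry, the example URLs and the helper name are illustrative and are not part of the patch:

    # Hypothetical catalogue lookup; only T4/A100/A200 GPUs and the U55C FPGA
    # are actually defined by the patch. The H100 entry and URLs are examples.
    gpus_model_known = [
        {"model": "T4", "image_url": "https://example.org/t4.jpg"},
        {"model": "H100", "image_url": "https://example.org/h100.png"},
    ]

    def is_known_gpu(accelerator_label: str) -> bool:
        # True if a node's 'accelerator' label matches a catalogued GPU model
        return any(gpu["model"] == accelerator_label for gpu in gpus_model_known)

    print(is_known_gpu("H100"))  # True
    print(is_known_gpu("U55C"))  # False: FPGAs are tracked in fpgas_model_known instead
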
@@ -237,18 +343,44 @@ def generate_options_form(self):
                 Source docker image from DODAS
+
+
+                Custom FPGA image
-
-
-
-
-
@@ -260,31 +392,39 @@ def generate_options_form(self):
         """
-        options_to_return += 'GPU Offloading Options'
-        nest_asyncio.apply()
         nodes = asyncio.run(self._get_nodes())
-        vk_nodes = [node for node in nodes if node.metadata.labels.get('type') == 'virtual-kubelet']
+        vk_nodes = [node for node in nodes if node.metadata.labels.get('virtual-node.interlink/type') == 'virtual-kubelet']
 
         nodes_labels = []
         accelerator_labels = []
+        accelerator_labels.append("none")
+        available_gpus = 0
+        available_fpgas = 0
 
         for node in vk_nodes:
             # append the node label to the list nodes_labels
             nodes_labels.append({ "hostname": node.metadata.name, "label": node.metadata.labels.get('accelerator', '')})
+            available_gpus += int(node.status.capacity.get('nvidia.com/gpu', 0))
+            available_fpgas += int(node.status.capacity.get('xilinx.com/fpga', 0))
 
-            if node.metadata.labels.get('accelerator', '') == "T4" or "A200" in node.metadata.labels.get('accelerator', ''):
-                available_gpus += int(node.status.capacity.get('nvidia.com/gpu', 0))
+            if any(gpu['model'] == node.metadata.labels.get('accelerator', '') for gpu in self.gpus_model_known):
                 if node.metadata.labels.get('accelerator', '') not in accelerator_labels:
                     accelerator_labels.append(node.metadata.labels.get('accelerator', ''))
                 self.map_node_gpu[node.metadata.labels.get('accelerator', '')] = { "hostname": node.metadata.name, "gpus": int(node.status.capacity.get('nvidia.com/gpu', 0))}
                 self.gpus_status[node.metadata.labels.get('accelerator', '')] = { 'total': int(node.status.capacity.get('nvidia.com/gpu', 0)), 'used': 0, 'available': int(node.status.capacity.get('nvidia.com/gpu', 0)) }
+            elif node.metadata.labels.get('accelerator', '') == "U55C":
+                if node.metadata.labels.get('accelerator', '') not in accelerator_labels:
+                    accelerator_labels.append(node.metadata.labels.get('accelerator', ''))
+                self.map_node_fpga[node.metadata.labels.get('accelerator', '')] = { "hostname": node.metadata.name, "fpgas": int(node.status.capacity.get('xilinx.com/fpga', 0))}
+                self.fpga_status[node.metadata.labels.get('accelerator', '')] = { 'total': int(node.status.capacity.get('xilinx.com/fpga', 0)), 'used': 0, 'available': int(node.status.capacity.get('xilinx.com/fpga', 0)) }
             elif node.metadata.labels.get('accelerator', '') == "none":
                 self.map_node_gpu[node.metadata.labels.get('accelerator', '')] = { "hostname": node.metadata.name, "gpus": 0}
-                accelerator_labels.append("none")
 
+        already_allocated_gpus = 0
+        already_allocated_fpgas = 0
         pods = asyncio.run(self._get_pods())
         running_pods = [pod for pod in pods if pod.status.phase == "Running"]
@@ -299,6 +439,13 @@ def generate_options_form(self):
                         self.gpus_status[accelerator_of_node]["used"] += int(container.resources.limits['nvidia.com/gpu'])
                         self.gpus_status[accelerator_of_node]["available"] -= int(container.resources.limits['nvidia.com/gpu'])
                         already_allocated_gpus += int(container.resources.limits['nvidia.com/gpu'])
+                    elif container.resources and 'xilinx.com/fpga' in container.resources.limits:
+                        # get the label of the node where the pod is running
+                        node_label = pod.spec.node_name
+                        accelerator_of_node = [node["label"] for node in nodes_labels if node["hostname"] == node_label][0]
+                        self.fpga_status[accelerator_of_node]["used"] += int(container.resources.limits['xilinx.com/fpga'])
+                        self.fpga_status[accelerator_of_node]["available"] -= int(container.resources.limits['xilinx.com/fpga'])
+                        already_allocated_fpgas += int(container.resources.limits['xilinx.com/fpga'])
             except Exception as e:
                 pass
 
@@ -311,8 +458,18 @@ def generate_options_form(self):
         options_to_return += 'Available GPUs'
         options_to_return += ''
         for key, value in self.gpus_status.items():
+            # Find the image URL for the current GPU model
+            image_url = next((gpu['image_url'] for gpu in self.gpus_model_known if gpu['model'] == key), None)
+
             options_to_return += ''
-            options_to_return += f'{key}'
+            options_to_return += ''
+
+            # Add GPU image if URL is found
+            if image_url:
+                options_to_return += f'{key}'
+
+            # Add GPU model name
+            options_to_return += f'{key}'
             options_to_return += f'{value["total"]}'
             options_to_return += f'{value["used"]}'
             options_to_return += f'{value["available"]}'
             options_to_return += ''
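The hunks above derive GPU and FPGA availability by walking the virtual-kubelet nodes and the running pods. A standalone sketch of the capacity part using the official kubernetes Python client; the label and resource names mirror the diff, while the helper itself and the in-cluster configuration call are assumptions rather than code from this repository:

    from kubernetes import client, config

    def accelerator_capacity():
        # Sum nvidia.com/gpu and xilinx.com/fpga capacity over virtual-kubelet nodes
        config.load_incluster_config()  # assumes the hub runs inside the cluster
        v1 = client.CoreV1Api()
        gpus, fpgas = 0, 0
        for node in v1.list_node().items:
            labels = node.metadata.labels or {}
            if labels.get("virtual-node.interlink/type") != "virtual-kubelet":
                continue
            capacity = node.status.capacity or {}
            gpus += int(capacity.get("nvidia.com/gpu", 0))
            fpgas += int(capacity.get("xilinx.com/fpga", 0))
        return gpus, fpgas
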
@@ -321,37 +478,87 @@ def generate_options_form(self):
         options_to_return += ''
+        # if available_gpus > 0:
+        #     options_to_return += f"Total GPUs available: {available_gpus}"
+
+        # if already_allocated_gpus > 0:
+        #     options_to_return += f"Used GPUs: {already_allocated_gpus}"
+        # else:
+        #     options_to_return += f"Used GPUs: 0"
+
+        # unused_gpus = available_gpus - already_allocated_gpus
+        # if unused_gpus > 0:
+        #     options_to_return += f"Unused GPUs: {unused_gpus}"
+        # else:
+        #     options_to_return += f"Unused GPUs: 0"
+        if available_gpus > 0:
-            options_to_return += f"Total GPUs available: {available_gpus}"
-        else:
-            options_to_return += f"Total GPUs available: 0"
+            options_to_return += ''
+            options_to_return += ''
+            options_to_return += ""
+
+        if self.fpga_status:
+            options_to_return += ''
+            options_to_return += ''
+            options_to_return += ''
+            options_to_return += ''
+            options_to_return += ''
+            options_to_return += ''
+            options_to_return += ''
+            for key, value in self.fpga_status.items():
+                image_url = next((fpga['image_url'] for fpga in self.fpgas_model_known if fpga['model'] == key), None)
+
+                options_to_return += ''
+                options_to_return += ''
+                options_to_return += f''
+                options_to_return += f''
+                options_to_return += f''
+                options_to_return += ''
+                options_to_return += 'FPGA Model Total FPGAs Used FPGAs Available FPGAs'
+
+                if image_url:
+                    options_to_return += f'{key}'
+
+                options_to_return += f'{key}{value["total"]}{value["used"]}{value["available"]}'
+
+                options_to_return += ''
+
+        # if available_fpgas > 0:
+        #     options_to_return += f"Total FPGAs available: {available_fpgas}"
+
+        # if already_allocated_fpgas > 0:
+        #     options_to_return += f"Used FPGAs: {already_allocated_fpgas}"
+        # else:
+        #     options_to_return += f"Used FPGAs: 0"
+
+        # unused_fpgas = available_fpgas - already_allocated_fpgas
+        # if unused_fpgas > 0:
+        #     options_to_return += f"Unused FPGAs: {unused_fpgas}"
+        # else:
+        #     options_to_return += f"Unused FPGAs: 0"
+
+        options_to_return += ''
+        options_to_return += ''
         options_to_return += ''
         options_to_return += ''
-
-        options_to_return += ''
-        options_to_return += ''
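The GPU and FPGA status tables above are assembled with near-identical string concatenation, and the original markup is not preserved in this copy of the diff. A possible shared helper is sketched below with an assumed HTML structure; it is a suggestion only, not the patch's actual markup or code:

    def render_status_table(title, status):
        # status maps model name -> {'total': .., 'used': .., 'available': ..}
        rows = "".join(
            f"<tr><td>{model}</td><td>{s['total']}</td><td>{s['used']}</td><td>{s['available']}</td></tr>"
            for model, s in status.items()
        )
        header = "<tr><th>Model</th><th>Total</th><th>Used</th><th>Available</th></tr>"
        return f"<h4>{title}</h4><table>{header}{rows}</table>"

    # e.g. options_to_return += render_status_table("Available FPGAs", self.fpga_status)
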
" @@ -376,8 +583,13 @@ def options_from_form(self, formdata): options['offload'] = ''.join(formdata['offload']) - options['gpu'] = formdata['gpu'] - gpu = ''.join(formdata['gpu']) + if 'gpu' in formdata: + options['gpu'] = formdata['gpu'] + gpu = ''.join(formdata['gpu']) + + if 'fpga' in formdata: + options['fpga'] = formdata['fpga'] + fpga = ''.join(formdata['fpga']) sock = socket.socket() sock.bind(('', 0)) @@ -392,13 +604,13 @@ def options_from_form(self, formdata): "value": "none", "effect": "NoSchedule" }, - { - "key": "virtual-node.interlink/no-schedule", - "operator": "Exists", - "effect": "NoSchedule" - } + # { + # "key": "virtual-node.interlink/no-schedule=false", + # "value": "false", + # "effect": "NoSchedule" + # } ] - else: + elif options['offload'] in [gpu['model'] for gpu in self.gpus_model_known]: self.tolerations = [ { "key": "accelerator", @@ -406,14 +618,30 @@ def options_from_form(self, formdata): "value": options['offload'], "effect": "NoSchedule" }, - { - "key": "virtual-node.interlink/no-schedule", - "operator": "Exists", - "effect": "NoSchedule" - } + # { + # "key": "virtual-node.interlink/no-schedule=false", + # "value": "false", + # "effect": "NoSchedule" + # } ] self.extra_resource_guarantees = {"nvidia.com/gpu": gpu} self.extra_resource_limits = {"nvidia.com/gpu": gpu} + elif options['offload'] == 'U55C': + self.tolerations = [ + { + "key": "accelerator", + "operator": "Equal", + "value": options['offload'], + "effect": "NoSchedule" + }, + # { + # "key": "virtual-node.interlink/no-schedule", + # "value": "false", + # "effect": "NoSchedule" + # } + ] + self.extra_resource_guarantees = {"xilinx.com/fpga": fpga} + self.extra_resource_limits = {"xilinx.com/fpga": fpga} if 'poc' in options['offload']: @@ -484,6 +712,7 @@ def environment(self): # dciangot: create an ssh connection on a random port environment = { + "JUPYTERHUB_SINGLEUSER_EXTENSION": "0", "JHUB_HOST": jhub_host, "SSH_PORT": "31022", "FWD_PORT": f"{self.port}", @@ -506,9 +735,12 @@ def node_selector(self): node_selector = { "kubernetes.io/role": "agent", "beta.kubernetes.io/os": "linux", - "type" : "virtual-kubelet"} + "virtual-node.interlink/type" : "virtual-kubelet"} - node_selector.update({"kubernetes.io/hostname" : self.map_node_gpu[self.user_options.get('offload')]["hostname"]}) + if self.user_options.get('offload') in [gpu['model'] for gpu in self.gpus_model_known]: + node_selector.update({"kubernetes.io/hostname" : self.map_node_gpu[self.user_options.get('offload')]["hostname"]}) + elif self.user_options.get('offload')=="U55C": + node_selector.update({"kubernetes.io/hostname" : self.map_node_fpga[self.user_options.get('offload')]["hostname"]}) if self.user_options.get('offload')=="N": node_selector = {} @@ -563,6 +795,6 @@ def volumes(self): } } -c.KubeSpawner.http_timeout = 30 -c.KubeSpawner.start_timeout = 30 -#c.KubeSpawner.notebook_dir = "/home/jovyan" \ No newline at end of file +c.KubeSpawner.http_timeout = 60 +c.KubeSpawner.start_timeout = 60 +#c.KubeSpawner.notebook_dir = "/home/jovyan" diff --git a/values_template.yaml b/values_template.yaml index 37a2c31..6a1750a 100644 --- a/values_template.yaml +++ b/values_template.yaml @@ -39,8 +39,8 @@ jupyterhub: cpu: '1' memory: 1G requests: - cpu: 900m - memory: 900M + cpu: 250m + memory: 250M terminationMessagePath: "/dev/termination-log" terminationMessagePolicy: File - env: @@ -54,8 +54,8 @@ jupyterhub: cpu: '1' memory: 1G requests: - cpu: 900m - memory: 900M + cpu: 250m + memory: 250M terminationMessagePath: "/dev/termination-log" 
         terminationMessagePolicy: File
       - env:
@@ -69,13 +69,15 @@ jupyterhub:
             cpu: '1'
             memory: 1G
           requests:
-            cpu: 900m
-            memory: 900M
+            cpu: 250m
+            memory: 250M
         terminationMessagePath: "/dev/termination-log"
         terminationMessagePolicy: File
   image:
     name: biancoj/jhub-ai-infn
-    tag: latest
+    tag: 0.0.1
+    # name: biancoj/jhub-asyncio
+    # tag: 0.0.8
   networkPolicy:
     enabled: false
   cookieSecret: __COOKIE_SECRET__
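Taken together, options_from_form() and node_selector() above translate a U55C selection into an accelerator toleration, a host pin on the matching virtual-kubelet node, and xilinx.com/fpga resource guarantees and limits. A summary sketch of the resulting spawner settings; the values mirror the diff, but this dict is illustrative and is not produced verbatim by the code:

    # Effect of choosing the U55C option, summarised (illustrative only).
    spawner_settings = {
        "tolerations": [
            {"key": "accelerator", "operator": "Equal", "value": "U55C", "effect": "NoSchedule"},
        ],
        "node_selector": {
            "kubernetes.io/role": "agent",
            "beta.kubernetes.io/os": "linux",
            "virtual-node.interlink/type": "virtual-kubelet",
            # plus kubernetes.io/hostname taken from map_node_fpga["U55C"]["hostname"]
        },
        "extra_resource_guarantees": {"xilinx.com/fpga": "1"},
        "extra_resource_limits": {"xilinx.com/fpga": "1"},
    }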