-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement custom allocation for SWAN event participants with a GPU #216
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,22 @@ | ||
import subprocess | ||
|
||
from kubernetes_asyncio.client.models import ( | ||
V1Affinity, | ||
V1EnvVar, | ||
V1EnvVarSource, | ||
V1ContainerPort, | ||
V1NodeAffinity, | ||
V1NodeSelector, | ||
V1NodeSelectorRequirement, | ||
V1NodeSelectorTerm, | ||
V1ObjectMeta, | ||
V1Secret, | ||
V1SecretKeySelector, | ||
V1SecretVolumeSource, | ||
V1Service, | ||
V1ServicePort, | ||
V1ServiceSpec, | ||
V1Toleration, | ||
V1Volume, | ||
V1VolumeMount, | ||
) | ||
|
@@ -32,6 +38,10 @@ async def get_swan_user_pod(self): | |
|
||
required_ports = 0 | ||
|
||
if self._gpu_enabled(): | ||
# Configure GPU allocation | ||
self._modify_pod_for_gpu() | ||
|
||
if self._spark_enabled(): | ||
# Configure Spark clusters at CERN | ||
hadoop_secret_name = await self._init_hadoop_secret() | ||
|
@@ -52,6 +62,76 @@ async def get_swan_user_pod(self): | |
|
||
return self.pod | ||
|
||
def _modify_pod_for_gpu(self):
    """
    Configure a pod that requested a GPU.

    Two scenarios are possible:
    - Regular user: resource requests and limits for the generic GPU
      resource are added to the notebook container.
    - User who participates in a SWAN event: resource requests and limits
      are added for the GPU resource that has been configured for the
      event. In addition, a node affinity and a taint toleration are added
      to the pod to ensure that event pods (and only them) are scheduled
      on resources that have been allocated for the event (and therefore
      have been labeled and tainted to host only event pods).

    Tolerations and non-node affinities that are already set on the pod
    spec are preserved; only the node affinity is replaced.
    """
    if events_role in self.spawner.user_roles:
        # The user is a participant of an event hosted by SWAN.
        # Their pod must be allocated on a node that has been
        # provisioned exclusively for the event

        # GPU resource name in k8s that the user should be mapped to
        gpu_resource_name = events_gpu_name

        # Require nodes that have been provisioned for the event,
        # i.e. labeled with the events role name
        node_affinity = V1NodeAffinity(
            required_during_scheduling_ignored_during_execution=V1NodeSelector(
                node_selector_terms=[
                    V1NodeSelectorTerm(
                        match_expressions=[
                            V1NodeSelectorRequirement(
                                key=events_role,
                                operator='Exists',
                            ),
                        ],
                    ),
                ],
            ),
        )
        if self.pod.spec.affinity is None:
            self.pod.spec.affinity = V1Affinity(node_affinity=node_affinity)
        else:
            # Keep any pod (anti-)affinity that was configured elsewhere
            self.pod.spec.affinity.node_affinity = node_affinity

        # Tolerate the taint of nodes that have been provisioned for the
        # event, i.e. tainted with the events role name
        toleration = V1Toleration(
            key=events_role,
            operator='Exists',
            effect='NoSchedule',
        )
        # Extend (rather than overwrite) the tolerations already present on
        # the pod, and avoid duplicating the event toleration
        tolerations = list(self.pod.spec.tolerations or [])
        if toleration not in tolerations:
            tolerations.append(toleration)
        self.pod.spec.tolerations = tolerations

    else:
        # Regular user: request the generic GPU resource name
        gpu_resource_name = 'nvidia.com/gpu'

    # Add to notebook container the requests and limits for the GPU
    notebook_container = self._get_pod_container('notebook')
    resources = notebook_container.resources
    resources.requests[gpu_resource_name] = '1'
    resources.limits[gpu_resource_name] = '1'

    # Configure OpenCL to use NVIDIA backend
    notebook_container.env = self._add_or_replace_by_name(
        notebook_container.env,
        V1EnvVar(
            name='OCL_ICD_FILENAMES',
            value='libnvidia-opencl.so.1',
        ),
    )
|
||
async def _init_hadoop_secret(self): | ||
""" | ||
Create secret for Spark/Hadoop | ||
|
@@ -191,6 +271,12 @@ def _modify_containers_for_spark(self, hadoop_secret_name): | |
), | ||
) | ||
|
||
def _gpu_enabled(self): | ||
""" | ||
return True if the user has requested a GPU | ||
""" | ||
return "cu" in self.spawner.user_options[self.spawner.lcg_rel_field] | ||
|
||
def _spark_enabled(self): | ||
""" | ||
Helper function to determine if spark related configuration is necessary | ||
|
@@ -393,4 +479,11 @@ def computing_modify_pod_hook(spawner, pod): | |
return computing_pod_hook_handler.get_swan_user_pod() | ||
|
||
|
||
# Custom configuration options | ||
# Name of the role that is assigned to participants of events hosted by SWAN | ||
# (_modify_pod_for_gpu also uses this name as the node label key and as the
# taint key that identify the nodes provisioned for an event)
events_role = get_config('custom.events.role', 'swan-events') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why did you declare the vars here instead of inside the method that uses them? It would be cleaner if we remove that method in the future. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I followed the same structure as in the other hooks that have custom config. E.g. see https://github.com/swan-cern/swan-charts/blob/master/swan-cern/files/swan_config_cern.py#L232-L234 |
||
# Name in k8s of the GPU resource to be assigned to participants of an event in SWAN | ||
# (_modify_pod_for_gpu sets a request and a limit of '1' for this resource on
# the notebook container of event participants; presumably 'nvidia.com/gpu'
# is used when the option is not set -- confirm get_config's default handling)
events_gpu_name = get_config('custom.events.gpu_name', 'nvidia.com/gpu') | ||
|
||
|
||
c.SwanKubeSpawner.modify_pod_hook = computing_modify_pod_hook |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Depending on how Rodrigo implements the custom env in the spawner, this might crash ^
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If so, he'll fix it :) This is purely a code move.