From 98992ec8d53f77ad6eb437cdc9c0c0d1a8c09981 Mon Sep 17 00:00:00 2001 From: Mira Kuntz Date: Wed, 13 Mar 2024 16:05:54 +0100 Subject: [PATCH 1/5] remove the script that creates new resources yaml (not used anymore) --- htcondor_migration.py | 169 ------------------------------------------ 1 file changed, 169 deletions(-) delete mode 100755 htcondor_migration.py diff --git a/htcondor_migration.py b/htcondor_migration.py deleted file mode 100755 index a2bb146..0000000 --- a/htcondor_migration.py +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env python -"""Manage resources for the migration to HTCondor 23. - -This script reads a resource definition (e.g. from resources.yaml) and produces -a new resource definition in which a (configurable) fraction of the cluster -resources are allocated to the secondary HTCondor cluster. -""" -import argparse -import sys -from copy import deepcopy -from math import ceil -from pathlib import Path - -import yaml - -"""Map HTCondor 8 images to the corresponding HTCondor 23 images. - -The images "default", "gpu", "secure" and "alma" all have HTCondor 8 installed -and attach to the primary cluster after boot. The following mapping determines -what are the equivalent images running HTCondor 23 and that attach to the -secondary cluster after boot. -""" -IMAGE_MAPPING = { - "default": "htcondor-secondary", - "gpu": "htcondor-secondary-gpu", - "secure": "htcondor-secondary", - "alma": "htcondor-secondary", - "htcondor-secondary": "htcondor-secondary", - "htcondor-secondary-gpu": "htcondor-secondary-gpu", -} - - -def allocate_resources(resources: dict, fraction: float) -> dict: - """Allocate resources to the secondary HTCondor cluster. - - Args: - resources: Resource definition from `resources.yaml`. - fraction: Fraction of resources to allocate to the secondary cluster. - - Returns: - Modified resource definition with the corresponding fraction of - resources allocated to the secondary cluster. - - Raises: - ValueError: Invalid resource fraction provided. - """ - if not 0 <= fraction <= 1: - raise ValueError("'fraction' must be between 0 and 1") - - if fraction <= 0: - return resources - - original = deepcopy(resources) - modified = deepcopy(resources) - - primary_deployment = deepcopy(modified["deployment"]) - secondary_deployment = dict() - for group, config in resources["deployment"].items(): - count = config["count"] - - if group.startswith("training") or "training" in config.get( - "group", "" - ): - count_primary = ceil(config["count"] * (1 - fraction)) - count_secondary = count - count_primary - else: - count_primary = ceil(config["count"] * (1 - fraction)) - count_secondary = ceil(config["count"] * fraction) - - if count_primary > 0: - primary_deployment[group] = {**config, "count": count_primary} - else: - del primary_deployment[group] - if count_secondary > 0: - secondary_deployment[f"{group}-htcondor-secondary"] = { - **config, - "count": count_secondary, - "image": IMAGE_MAPPING[config.get("image", "default")], - "secondary_htcondor_cluster": True, - } - modified["deployment"] = secondary_deployment | primary_deployment - - # We want to make use of a strategy that skips the modification of the - # primary deployment, because it is preferred that VMs are not spawned due - # to resources being exhausted rather than having VMs shut down in an - # uncontrolled manner. - # - # Because VMs can get stuck, the number of available machines of each - # flavor that is uncertain, and there are HTCondor groups that share the - # same flavor, this is tricky. - modified["deployment"] = ( - { - group: config - for group, config in modified["deployment"].items() - if config["group"] == "upload" and config.get("count", 0) > 0 - } - | { - group: config - for group, config in modified["deployment"].items() - if config["group"] == "interactive" and config.get("count", 0) > 0 - } - | { - group: config - for group, config in modified["deployment"].items() - if "training" not in config["group"] and config.get("count", 0) > 0 - } - | modified["deployment"] - | original["deployment"] - ) - - return modified - - -def make_parser() -> argparse.ArgumentParser: - """Command line interface for this script.""" - parser = argparse.ArgumentParser( - prog="htcondor-migration", - description="Manage resources for the migration to HTCondor 23.", - ) - - parser.add_argument( - "-r", - "--resources-file", - dest="resources_file", - type=Path, - metavar="resources_file", - help="resource definition file", - default="resources.yaml", - ) - parser.add_argument( - "-f", - "--fraction", - dest="resource_fraction", - type=float, - metavar="resource_fraction", - help="fraction of resources to be allocated to the secondary cluster", - default=0.0, - ) - parser.add_argument( - "-o", - "--output-file", - dest="output_file", - type=Path, - metavar="output_file", - help="output file, defaults to stdout", - ) - - return parser - - -if __name__ == "__main__": - command_parser = make_parser() - command_args = command_parser.parse_args() - - resource_definition = allocate_resources( - resources=yaml.safe_load(open(command_args.resources_file)), - fraction=command_args.resource_fraction, - ) - resource_definition = yaml.dump( - resource_definition, - sort_keys=False, - ) - - print( - resource_definition, - file=open(output_file, "w") - if (output_file := command_args.output_file) - else sys.stdout, - ) From 1cfb0458a0d3ac2edac7fb19079e371af6fa2ccb Mon Sep 17 00:00:00 2001 From: Mira Kuntz Date: Wed, 13 Mar 2024 16:07:05 +0100 Subject: [PATCH 2/5] all current resources use the else block so there is no need for old conf --- userdata.yaml.j2 | 55 ------------------------------------------------ 1 file changed, 55 deletions(-) diff --git a/userdata.yaml.j2 b/userdata.yaml.j2 index 18e96ce..f1ee17a 100644 --- a/userdata.yaml.j2 +++ b/userdata.yaml.j2 @@ -1,59 +1,5 @@ #cloud-config write_files: - {% if not (secondary_htcondor_cluster | default(false)) -%} - - content: | - # BEGIN MANAGED BLOCK - ETC = /etc/condor - CONDOR_HOST = condor-cm.galaxyproject.eu - ALLOW_WRITE = 10.5.68.0/24, 132.230.223.0/24 - ALLOW_READ = $(ALLOW_WRITE) - ALLOW_ADMINISTRATOR = 10.5.68.0/24, 132.230.223.239 - ALLOW_NEGOTIATOR = $(ALLOW_ADMINISTRATOR) - ALLOW_CONFIG = $(ALLOW_ADMINISTRATOR) - ALLOW_DAEMON = $(ALLOW_ADMINISTRATOR) - ALLOW_OWNER = $(ALLOW_ADMINISTRATOR) - ALLOW_CLIENT = * - DAEMON_LIST = MASTER, STARTD - FILESYSTEM_DOMAIN = bi.uni-freiburg.de - UID_DOMAIN = bi.uni-freiburg.de - TRUST_UID_DOMAIN = True - SOFT_UID_DOMAIN = True - CLAIM_PARTITIONABLE_LEFTOVERS = True - NUM_SLOTS = 1 - NUM_SLOTS_TYPE_1 = 1 - SLOT_TYPE_1 = 100% - SLOT_TYPE_1_PARTITIONABLE = True - ALLOW_PSLOT_PREEMPTION = False - STARTD.PROPORTIONAL_SWAP_ASSIGNMENT = True - MASTER_UPDATE_INTERVAL = 150 - UPDATE_INTERVAL = 120 - # END MANAGED BLOCK - {% if image is defined and image == "gpu" %} - # Advertise the GPUs - use feature : GPUs - GPU_DISCOVERY_EXTRA = -extra - {% endif %} - GalaxyTraining = {{ "training" in name }} - GalaxyGroup = "{{ group }}" - GalaxyCluster = "denbi" - GalaxyDockerHack = {{ docker }} - STARTD_ATTRS = GalaxyTraining, GalaxyGroup, GalaxyCluster, GalaxyDockerHack - Rank = StringListMember(MY.GalaxyGroup, TARGET.Group) - {% if cgroups is defined %} - BASE_CGROUP = /system.slice/condor.service - {% if cgroups.mem_limit_policy is defined %} - CGROUP_MEMORY_LIMIT_POLICY = {{ cgroups.mem_limit_policy }} - {% endif %} - {% if cgroups.mem_reserved_size is defined %} - RESERVED_MEMORY = {{ cgroups.mem_reserved_size }} - {% else %} - RESERVED_MEMORY = 1024 - {% endif %} - {% endif %} - owner: root:root - path: /etc/condor/condor_config.local - permissions: "0644" - {% else -%} - content: | {% if image is defined and image.endswith("gpu") -%} # Advertise the GPUs @@ -80,7 +26,6 @@ write_files: owner: root:root path: /etc/condor/config.d/99-cloud-init.conf permissions: "0644" - {% endif -%} - content: | [[outputs.influxdb]] urls = ["https://influxdb.galaxyproject.eu:8086"] From bb28a02532322c392e3c7920ac179af0a550a544 Mon Sep 17 00:00:00 2001 From: Mira Kuntz Date: Wed, 13 Mar 2024 16:07:21 +0100 Subject: [PATCH 3/5] can be removed from schema --- schema.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/schema.yaml b/schema.yaml index bd51bb3..27901a9 100644 --- a/schema.yaml +++ b/schema.yaml @@ -120,6 +120,3 @@ mapping: "docker": type: bool required: false - "secondary_htcondor_cluster": - type: bool - required: false From ddced2edd130eaa548ffd21e9d7965506a5435a0 Mon Sep 17 00:00:00 2001 From: Mira Kuntz Date: Wed, 20 Mar 2024 16:12:26 +0100 Subject: [PATCH 4/5] remove condor secondary image and tag --- resources.yaml | 108 ++++++++++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 55 deletions(-) diff --git a/resources.yaml b/resources.yaml index 3951850..434c942 100644 --- a/resources.yaml +++ b/resources.yaml @@ -1,11 +1,9 @@ --- # Global configuration of computing nodes. images: - default: vggp-v60-j322-692e75a7c101-main - gpu: vggp-v60-gpu-j322-692e75a7c101-main-kernel-4.18.0-477.21.1.el8_8-nvidia secure: vggp-v60-secure-j322-692e75a7c101-main - htcondor-secondary: vgcn~workers+internal~rockylinux-8.6-x86_64~2023-10-26~43739~htcondor-secondary~ebb20b8~kysrpex_local_build - htcondor-secondary-gpu: vgcn~workers-gpu+internal~rockylinux-8.6-x86_64~2023-11-16~34096~htcondor-secondary~a23fbb0~kysrpex_local_build + default: vgcn~workers+internal~rockylinux-8.6-x86_64~2023-10-26~43739~htcondor-secondary~ebb20b8~kysrpex_local_build + gpu: vgcn~workers-gpu+internal~rockylinux-8.6-x86_64~2023-11-16~34096~htcondor-secondary~a23fbb0~kysrpex_local_build network: bioinf secgroups: - ufr-ingress @@ -41,7 +39,7 @@ deployment: # flavor: c1.c120m225d50 # group: compute # docker: true - # image: htcondor-secondary + # image: default # secondary_htcondor_cluster: true # volume: # size: 1024 @@ -54,15 +52,15 @@ deployment: count: 0 flavor: c1.c36m100d50 group: upload - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-interactive-htcondor-secondary: count: 4 #8 flavor: c1.c36m100d50 group: interactive docker: true - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + volume: size: 1024 type: default @@ -77,8 +75,8 @@ deployment: cgroups: mem_limit_policy: hard mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c28m225-htcondor-secondary: count: 0 flavor: c1.c28m225d50 @@ -90,8 +88,8 @@ deployment: cgroups: mem_limit_policy: hard mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c36m100-htcondor-secondary: count: 26 #32 flavor: c1.c36m100d50 @@ -103,8 +101,8 @@ deployment: cgroups: mem_limit_policy: hard mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c36m225-htcondor-secondary: count: 11 #11 flavor: c1.c36m225d50 @@ -116,8 +114,8 @@ deployment: cgroups: mem_limit_policy: hard mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c36m900-htcondor-secondary: count: 1 #1 it's a c1.c36m975d50 host with probably a faulty memory bank flavor: c1.c36m900d50 @@ -129,8 +127,8 @@ deployment: cgroups: mem_limit_policy: soft mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c36m975-htcondor-secondary: count: 8 #8 flavor: c1.c36m975d50 @@ -142,8 +140,8 @@ deployment: cgroups: mem_limit_policy: soft mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c28m935-htcondor-secondary: count: 4 #4 flavor: c1.c28m935d50 @@ -155,8 +153,8 @@ deployment: cgroups: mem_limit_policy: soft mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c28m875-htcondor-secondary: count: 2 #2 flavor: c1.c28m875d50 @@ -168,8 +166,8 @@ deployment: cgroups: mem_limit_policy: soft mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c64m2-htcondor-secondary: count: 1 #1 flavor: c1.c60m1975d50 @@ -178,8 +176,8 @@ deployment: volume: size: 1024 type: default - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c120m225-htcondor-secondary: count: 12 #12 flavor: c1.c120m225d50 @@ -191,8 +189,8 @@ deployment: cgroups: mem_limit_policy: hard mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c120m425-htcondor-secondary: count: 22 flavor: c1.c120m425d50 @@ -204,8 +202,8 @@ deployment: cgroups: mem_limit_policy: hard mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c125m425-htcondor-secondary: count: 16 #16 flavor: c1.c125m425d50 @@ -217,8 +215,8 @@ deployment: cgroups: mem_limit_policy: hard mem_reserved_size: 2048 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + worker-c14m40g1-htcondor-secondary: count: 4 #4 flavor: g1.c14m40g1d50 @@ -230,8 +228,8 @@ deployment: cgroups: mem_limit_policy: soft mem_reserved_size: 1024 - image: htcondor-secondary-gpu - secondary_htcondor_cluster: true + image: gpu + worker-c8m40g1-htcondor-secondary: count: 4 #4 flavor: g1.c8m40g1d50 @@ -243,8 +241,8 @@ deployment: cgroups: mem_limit_policy: soft mem_reserved_size: 1024 - image: htcondor-secondary-gpu - secondary_htcondor_cluster: true + image: gpu + # Trainings # These will overlap Mar 12-15 @@ -254,32 +252,32 @@ deployment: start: 2024-03-12 end: 2024-03-18 group: training-eml - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + training-kmb6: count: 2 flavor: c1.c28m225d50 start: 2024-03-01 end: 2024-05-17 group: training-kmb615 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + training-path-gen-march-24: count: 2 flavor: c1.c28m225d50 start: 2024-03-04 end: 2024-03-15 group: training-path-gen-march-24 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + training-msc-exeter: count: 2 flavor: c1.c28m225d50 start: 2024-03-06 end: 2024-03-20 group: training-msc-exeter - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + training-asse: count: 1 @@ -287,37 +285,37 @@ deployment: start: 2024-03-18 end: 2024-03-22 group: training-assemblyannotation - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + training-gie-: count: 1 flavor: c1.c28m225d50 start: 2024-03-18 end: 2024-03-19 group: training-gie-0324 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + training-geno: count: 2 flavor: c1.c28m225d50 start: 2024-05-07 end: 2024-05-10 group: training-genome-assembly-2024 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + training-nort: count: 2 flavor: c1.c28m225d50 start: 2024-06-07 end: 2024-06-07 group: training-northumbria-7jun24 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + training-joca: count: 2 flavor: c1.c28m225d50 start: 2024-04-12 end: 2024-04-12 group: training-joca-epigenomics-24 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default + From fc0ec4cd198c13bf803347d57a55f70c9e105b73 Mon Sep 17 00:00:00 2001 From: Mira Kuntz Date: Wed, 20 Mar 2024 16:28:51 +0100 Subject: [PATCH 5/5] change the workers' names --- resources.yaml | 53 ++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/resources.yaml b/resources.yaml index 9553aea..be63723 100644 --- a/resources.yaml +++ b/resources.yaml @@ -34,13 +34,12 @@ nodes_inventory: g1.c8m40g1d50: 4 deployment: - # worker-c120m225-htcondor-secondary: + # worker-c120m225: # count: 1 #12 # flavor: c1.c120m225d50 # group: compute # docker: true # image: default - # secondary_htcondor_cluster: true # volume: # size: 1024 # type: default @@ -48,13 +47,13 @@ deployment: # mem_limit_policy: hard # mem_reserved_size: 2048 - worker-fetch-htcondor-secondary: + worker-fetch: count: 0 flavor: c1.c36m100d50 group: upload image: default - worker-interactive-htcondor-secondary: + worker-interactive: count: 4 #8 flavor: c1.c36m100d50 group: interactive @@ -64,7 +63,7 @@ deployment: volume: size: 1024 type: default - worker-c28m475-htcondor-secondary: + worker-c28m475: count: 10 #19 flavor: c1.c28m475d50 group: compute @@ -77,7 +76,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c28m225-htcondor-secondary: + worker-c28m225: count: 0 flavor: c1.c28m225d50 group: compute_test @@ -90,7 +89,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c36m100-htcondor-secondary: + worker-c36m100: count: 26 #32 flavor: c1.c36m100d50 group: compute @@ -103,7 +102,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c36m225-htcondor-secondary: + worker-c36m225: count: 11 #11 flavor: c1.c36m225d50 group: compute @@ -116,7 +115,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c36m900-htcondor-secondary: + worker-c36m900: count: 1 #1 it's a c1.c36m975d50 host with probably a faulty memory bank flavor: c1.c36m900d50 group: compute @@ -129,7 +128,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c36m975-htcondor-secondary: + worker-c36m975: count: 8 #8 flavor: c1.c36m975d50 group: compute @@ -142,7 +141,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c28m935-htcondor-secondary: + worker-c28m935: count: 4 #4 flavor: c1.c28m935d50 group: compute @@ -155,7 +154,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c28m875-htcondor-secondary: + worker-c28m875: count: 2 #2 flavor: c1.c28m875d50 group: compute @@ -168,7 +167,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c64m2-htcondor-secondary: + worker-c64m2: count: 1 #1 flavor: c1.c60m1975d50 group: compute @@ -178,7 +177,7 @@ deployment: type: default image: default - worker-c120m225-htcondor-secondary: + worker-c120m225: count: 12 #12 flavor: c1.c120m225d50 group: compute @@ -191,7 +190,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c120m425-htcondor-secondary: + worker-c120m425: count: 22 flavor: c1.c120m425d50 group: compute @@ -204,7 +203,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c125m425-htcondor-secondary: + worker-c125m425: count: 16 #16 flavor: c1.c125m425d50 group: compute @@ -217,7 +216,7 @@ deployment: mem_reserved_size: 2048 image: default - worker-c14m40g1-htcondor-secondary: + worker-c14m40g1: count: 4 #4 flavor: g1.c14m40g1d50 group: compute_gpu @@ -230,7 +229,7 @@ deployment: mem_reserved_size: 1024 image: gpu - worker-c8m40g1-htcondor-secondary: + worker-c8m40g1: count: 4 #4 flavor: g1.c8m40g1d50 group: compute_gpu @@ -253,7 +252,6 @@ deployment: end: 2024-03-18 group: training-eml image: default - training-kmb6: count: 1 flavor: c1.c28m225d50 @@ -261,7 +259,6 @@ deployment: end: 2024-05-17 group: training-kmb615 image: default - training-path-gen-march-24: count: 2 flavor: c1.c28m225d50 @@ -269,7 +266,6 @@ deployment: end: 2024-03-15 group: training-path-gen-march-24 image: default - training-msc-exeter: count: 1 flavor: c1.c28m225d50 @@ -277,8 +273,6 @@ deployment: end: 2024-03-20 group: training-msc-exeter image: default - - training-asse: count: 2 flavor: c1.c28m225d50 @@ -286,7 +280,6 @@ deployment: end: 2024-03-22 group: training-assemblyannotation image: default - training-gie-: count: 1 flavor: c1.c28m225d50 @@ -294,26 +287,20 @@ deployment: end: 2024-03-19 group: training-gie-0324 image: default - - - training-e5020: count: 1 flavor: c1.c28m225d50 start: 2024-03-19 end: 2024-03-19 group: training-e5020-2024-03-19 - image: htcondor-secondary - secondary_htcondor_cluster: true + image: default training-gmc: count: 1 flavor: c1.c28m225d50 start: 2024-03-19 end: 2024-03-26 group: training-gmc - image: htcondor-secondary - secondary_htcondor_cluster: true - + image: default training-geno: count: 2 flavor: c1.c28m225d50 @@ -329,7 +316,6 @@ deployment: end: 2024-06-07 group: training-northumbria-7jun24 image: default - training-joca: count: 2 flavor: c1.c28m225d50 @@ -337,4 +323,3 @@ deployment: end: 2024-04-12 group: training-joca-epigenomics-24 image: default -