From 2a533f84e435361adaa304c2b8eaf22111bfc1ff Mon Sep 17 00:00:00 2001
From: Allan Carter
Date: Thu, 11 Jul 2024 21:07:37 -0500
Subject: [PATCH] Add ParallelCluster 3.10.0, 3.10.1 support (#244)

Add support for ParallelCluster 3.10.0.
Add alinux2023 support.
Add support for external slurmdbd instance.
Update documentation.

Change the UID of the slurm user to 401 to match what ParallelCluster uses.
Otherwise munge reports security errors because the UID of the submitter doesn't match the head node.

Change the UpdateHeadNode lambda to only do the update via SSM if the cluster isn't already being updated.
Resolves #242

Change the installer so that it checks that the cluster stack isn't already being changed or in a bad state.
Resolves #221

Add support for ParallelCluster 3.10.1.
Resolves #243
---
 docs/config.md                               |  73 +++++++-
 docs/deploy-parallel-cluster.md              |  18 --
 docs/deployment-prerequisites.md             | 111 ++++++++++--
 source/cdk/cdk_slurm_stack.py                | 170 +++++++++++++++---
 source/cdk/config_schema.py                  |  83 +++++++--
 .../CreateBuildFiles/CreateBuildFiles.py     |  15 +-
 .../lambdas/UpdateHeadNode/UpdateHeadNode.py |  19 ++
 .../tasks/main.yml                           |  35 +++-
 source/slurm_installer/installer.py          |  21 +++
 9 files changed, 459 insertions(+), 86 deletions(-)

diff --git a/docs/config.md b/docs/config.md
index 89ec2d53..8f667be1 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -27,11 +27,16 @@ This project creates a ParallelCluster configuration file that is documented in
   Database:
     DatabaseStackName: str
     FQDN: str
-    Port: str
+    Port: str
    AdminUserName: str
     AdminPasswordSecretArn: str
-    ClientSecurityGroup:
+    ClientSecurityGroup:
       SecurityGroupName: SecurityGroupId
+  Slurmdbd:
+    SlurmdbdStackName: str
+    Host: str
+    Port: str
+    ClientSecurityGroup: str
   Dcv:
     Enabled: bool
     Port: int
@@ -304,13 +309,18 @@ See [https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html#p
 
 Optional
 
+**Note**: Starting with ParallelCluster 3.10.0, you should use slurm/ParallelClusterConfig/[Slurmdbd](#slurmdbd) instead of slurm/ParallelClusterConfig/Database.
+You cannot specify both parameters.
+
 Configure the Slurm database to use with the cluster.
 This is created independently of the cluster so that the same database can be used with multiple clusters.
 
-The easiest way to do this is to use the [CloudFormation template provided by ParallelCluster](https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3) and then to just pass
-the name of the stack in [DatabaseStackName](#databasestackname).
-All of the other parameters will be pulled from the stack.
+See [Create ParallelCluster Slurm Database](../deployment-prerequisites#create-parallelcluster-slurm-database) on the deployment prerequisites page.
+
+If you used the [CloudFormation template provided by ParallelCluster](https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3), then the easiest way to configure it is to pass
+the name of the stack in slurm/ParallelClusterConfig/Database/[DatabaseStackName](#databasestackname).
+All of the other parameters will be pulled from the outputs of the stack.
 
 See the [ParallelCluster documentation](https://docs.aws.amazon.com/parallelcluster/latest/ug/Scheduling-v3.html#Scheduling-v3-SlurmSettings-Database).
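+
+For example, a minimal sketch of this section when using the database stack (the stack name below is a placeholder for your own stack; the remaining Database parameters are then pulled from its outputs):
+
+```
+slurm:
+  ParallelClusterConfig:
+    Database:
+      DatabaseStackName: pcluster-slurm-db
+```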
@@ -330,7 +340,7 @@ The following parameters will be set using the outputs of the stack:
 
 Used with the Port to set the [Uri](https://docs.aws.amazon.com/parallelcluster/latest/ug/Scheduling-v3.html#yaml-Scheduling-SlurmSettings-Database-Uri) of the database.
 
-##### Port
+##### Database: Port
 
 type: int
 
@@ -353,11 +363,56 @@ This password is used together with AdminUserName and Slurm accounting to authen
 
 Sets the [PasswordSecretArn](https://docs.aws.amazon.com/parallelcluster/latest/ug/Scheduling-v3.html#yaml-Scheduling-SlurmSettings-Database-PasswordSecretArn) parameter in ParallelCluster.
 
-##### ClientSecurityGroup
+##### Database: ClientSecurityGroup
 
 Security group that has permissions to connect to the database.
 
-Required to be attached to the head node that is running slurmdbd so that the port connection to the database is allows.
+Required to be attached to the head node that is running slurmdbd so that the port connection to the database is allowed.
+
+#### Slurmdbd
+
+**Note**: This is not supported before ParallelCluster 3.10.0. If you specify this parameter then you cannot specify slurm/ParallelClusterConfig/[Database](#database).
+
+Optional
+
+Configure an external Slurmdbd instance to use with the cluster.
+The Slurmdbd instance provides access to the shared Slurm database.
+It is created independently of the cluster so that the same slurmdbd instance and database can be used with multiple clusters.
+
+See [Create Slurmdbd instance](../deployment-prerequisites#create-slurmdbd-instance) on the deployment prerequisites page.
+
+If you used the [CloudFormation template provided by ParallelCluster](https://docs.aws.amazon.com/parallelcluster/latest/ug/external-slurmdb-accounting.html#external-slurmdb-accounting-step1), then the easiest way to configure it is to pass
+the name of the stack in slurm/ParallelClusterConfig/Slurmdbd/[SlurmdbdStackName](#slurmdbdstackname).
+All of the other parameters will be pulled from the parameters and outputs of the stack.
+
+See the [ParallelCluster documentation for ExternalSlurmdbd](https://docs.aws.amazon.com/parallelcluster/latest/ug/Scheduling-v3.html#Scheduling-v3-SlurmSettings-ExternalSlurmdbd).
+
+##### SlurmdbdStackName
+
+Name of the ParallelCluster CloudFormation stack that created the Slurmdbd instance.
+
+The following parameters will be set using the outputs of the stack:
+
+* Host
+* Port
+* ClientSecurityGroup
+
+##### Slurmdbd: Host
+
+IP address or DNS name of the Slurmdbd instance.
+
+##### Slurmdbd: Port
+
+Default: 6819
+
+Port used by the slurmdbd daemon on the Slurmdbd instance.
+
+##### Slurmdbd: ClientSecurityGroup
+
+Security group that has access to use the Slurmdbd instance.
+This will be added as an extra security group to the head node.
 
 ### ClusterName
 
@@ -373,6 +428,8 @@ For an existing secret can be the secret name or the ARN.
 If the secret doesn't exist one will be created, but won't be part of the cloudformation stack so that it won't be deleted when the stack is deleted.
 Required if your submitters need to use more than 1 cluster.
 
+See [Create Munge Key](../deployment-prerequisites#create-munge-key) for more details.
+
 ### SlurmCtl
 
 Configure the Slurm head node or controller.
diff --git a/docs/deploy-parallel-cluster.md b/docs/deploy-parallel-cluster.md
index d57d9e9e..ead1889f 100644
--- a/docs/deploy-parallel-cluster.md
+++ b/docs/deploy-parallel-cluster.md
@@ -10,24 +10,6 @@ The current latest version is 3.9.1.
 See [Deployment Prerequisites](deployment-prerequisites.md) page.
 
-### Create ParallelCluster UI (optional but recommended)
-
-It is highly recommended to create a ParallelCluster UI to manage your ParallelCluster clusters.
-A different UI is required for each version of ParallelCluster that you are using.
-The versions are list in the [ParallelCluster Release Notes](https://docs.aws.amazon.com/parallelcluster/latest/ug/document_history.html).
-The minimum required version is 3.6.0 which adds support for RHEL 8 and increases the number of allows queues and compute resources.
-The suggested version is at least 3.7.0 because it adds configurable compute node weights which we use to prioritize the selection of
-compute nodes by their cost.
-
-The instructions are in the [ParallelCluster User Guide](https://docs.aws.amazon.com/parallelcluster/latest/ug/install-pcui-v3.html).
-
-### Create ParallelCluster Slurm Database
-
-The Slurm Database is required for configuring Slurm accounts, users, groups, and fair share scheduling.
-It you need these and other features then you will need to create a ParallelCluster Slurm Database.
-You do not need to create a new database for each cluster; multiple clusters can share the same database.
-Follow the directions in this [ParallelCluster tutorial to configure slurm accounting](https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3).
-
 ## Create the Cluster
 
 To install the cluster run the install script. You can override some parameters in the config file
diff --git a/docs/deployment-prerequisites.md b/docs/deployment-prerequisites.md
index f2827cf4..46c86a49 100644
--- a/docs/deployment-prerequisites.md
+++ b/docs/deployment-prerequisites.md
@@ -99,6 +99,78 @@ The version that has been tested is in the CDK_VERSION variable in the install s
 
 The install script will try to install the prerequisites if they aren't already installed.
 
+## Create ParallelCluster UI (optional but recommended)
+
+It is highly recommended to create a ParallelCluster UI to manage your ParallelCluster clusters.
+A different UI is required for each version of ParallelCluster that you are using.
+The versions are listed in the [ParallelCluster Release Notes](https://docs.aws.amazon.com/parallelcluster/latest/ug/document_history.html).
+The minimum required version is 3.6.0, which adds support for RHEL 8 and increases the number of allowed queues and compute resources.
+The suggested version is at least 3.7.0 because it adds configurable compute node weights, which we use to prioritize the selection of
+compute nodes by their cost.
+
+The instructions are in the [ParallelCluster User Guide](https://docs.aws.amazon.com/parallelcluster/latest/ug/install-pcui-v3.html).
+
+## Create Munge Key
+
+Munge is a package that Slurm uses to secure communication between servers.
+The munge service uses a preshared key that must be the same on all of the servers in the Slurm cluster.
+If you want to be able to use multiple clusters from your submission hosts, such as virtual desktops, then all of the clusters must use the same munge key.
+This is done by creating a munge key and storing it in AWS Secrets Manager.
+The secret is then passed as a parameter to ParallelCluster so that it can use it when configuring munge on all of the cluster instances.
+
+To create the munge key and store it in AWS Secrets Manager, run the following command.
+
+```
+aws secretsmanager create-secret --name SlurmMungeKey --secret-string "$(dd if=/dev/random bs=1024 count=1 | base64 -w 0)"
+```
+
+Save the ARN of the secret for when you create the Slurmdbd instance and for when you create the configuration file.
+
+See the [Slurm documentation for authentication](https://slurm.schedmd.com/authentication.html) for more information.
+
+See the [ParallelCluster documentation for MungeKeySecretArn](https://docs.aws.amazon.com/parallelcluster/latest/ug/Scheduling-v3.html#yaml-Scheduling-SlurmSettings-MungeKeySecretArn).
+
+See the [MungeKeySecret configuration parameter](../config#mungekeysecret).
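+
+For example, a minimal sketch of how the secret could be referenced in the configuration file (this assumes the slurm/MungeKeySecret parameter named by the link above and uses the secret name from the command above; an ARN also works):
+
+```
+slurm:
+  MungeKeySecret: SlurmMungeKey
+```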
+
+## Create ParallelCluster Slurm Database
+
+The Slurm Database is required for configuring Slurm accounts, users, groups, and fair share scheduling.
+If you need these and other features then you will need to create a ParallelCluster Slurm Database.
+You do not need to create a new database for each cluster; multiple clusters can share the same database.
+Follow the directions in this [ParallelCluster tutorial to configure slurm accounting](https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3).
+
+## Create Slurmdbd Instance
+
+**Note**: Before ParallelCluster 3.10.0, the slurmdbd daemon that connects to the database was created on each cluster's head node.
+The recommended Slurm architecture is to have a shared slurmdbd daemon that is used by all of the clusters.
+Starting in version 3.10.0, ParallelCluster supports specifying an external slurmdbd instance when you create a cluster and provides a CloudFormation template to create it.
+
+Follow the directions in this [ParallelCluster tutorial to configure slurmdbd](https://docs.aws.amazon.com/parallelcluster/latest/ug/external-slurmdb-accounting.html#external-slurmdb-accounting-step1).
+This requires that you have already created the Slurm database.
+
+Here are some notes on the required parameters and how to fill them out.
+
+| Parameter | Description
+|--------------|------------
+| AmiId | You can get this using the ParallelCluster UI. Click on Images and sort on Operating system. Confirm that the version is at least 3.10.0. Select the AMI for alinux2023 and the arm64 architecture.
+| CustomCookbookUrl | Leave blank
+| DBMSClientSG | Get this from the DatabaseClientSecurityGroup output of the database stack.
+| DBMSDatabaseName | This is an arbitrary name. It must be alphanumeric. I use slurmaccounting
+| DBMSPasswordSecretArn | Get this from the DatabaseSecretArn output of the database stack.
+| DBMSUri | Get this from the DatabaseHost output of the database stack. Note that if you copy and paste the link you should delete the https:// prefix and the trailing '/'.
+| DBMSUsername | Get this from the DatabaseAdminUser output of the database stack.
+| EnableSlurmdbdSystemService | Set to true. Note the warning. If the database already exists and was created with an older version of Slurm, then the database will be upgraded. This may break clusters that are still using the database with an older Slurm version. Set to false if you don't want this to happen.
+| InstanceType | Choose an instance type that is compatible with the AMI. For example, m7g.large.
+| KeyName | Use an existing EC2 key pair.
+| MungeKeySecretArn | ARN of an existing munge key secret. See [Create Munge Key](#create-munge-key).
+| PrivateIp | Choose an available IP in the subnet.
+| PrivatePrefix | CIDR of the instance's subnet.
+| SlurmdbdPort | 6819
+| SubnetId | Preferably the same subnet where the clusters will be deployed.
+| VPCId | The VPC of the subnet.
+
+The stack name will be used in the slurm/ParallelClusterConfig/Slurmdbd/[SlurmdbdStackName](../config#slurmdbdstackname) configuration parameter.
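+
+For example, a minimal sketch of the corresponding configuration (the stack name below is a placeholder for the slurmdbd stack you created; Host, Port, and ClientSecurityGroup are then pulled from that stack):
+
+```
+slurm:
+  ParallelClusterConfig:
+    Slurmdbd:
+      SlurmdbdStackName: slurmdbd-instance
+```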
+
 ## Security Groups for Login Nodes
 
 If you want to allow instances like remote desktops to use the cluster directly, you must define
@@ -111,25 +183,30 @@ I'll call the three security groups the following names, but they can be whateve
 * SlurmHeadNodeSG
 * SlurmComputeNodeSG
 
+First create these security groups without any security group rules.
+The reason for this is that the rules reference the other security groups, so the groups must all exist before any of the rules can be created.
+After you have created the security groups, create the rules as described below.
+
 ### Slurm Submitter Security Group
 
 The SlurmSubmitterSG will be attached to your login nodes, such as your virtual desktops.
 
 It needs at least the following inbound rules:
 
-| Type | Port range | Source | Description
-|------|------------|--------|------------
-| TCP | 1024-65535 | SlurmHeadNodeSG | SlurmHeadNode ephemeral
-| TCP | 1024-65535 | SlurmComputeNodeSG | SlurmComputeNode ephemeral
-| TCP | 6000-7024 | SlurmComputeNodeSG | SlurmComputeNode X11
+| Type | Port range | Source | Description | Details
+|------|------------|--------------------|------------|--------
+| TCP | 1024-65535 | SlurmHeadNodeSG | SlurmHeadNode ephemeral | Head node can use ephemeral ports to connect to the submitter
+| TCP | 1024-65535 | SlurmComputeNodeSG | SlurmComputeNode ephemeral | Compute node will connect to the submitter using ephemeral ports to manage interactive shells
+| TCP | 6000-7024 | SlurmComputeNodeSG | SlurmComputeNode X11 | Compute node can send X11 traffic to the submitter for GUI applications
 
 It needs the following outbound rules.
 
-| Type | Port range | Destination | Description
-|------|------------|-------------|------------
-| TCP | 2049 | SlurmHeadNodeSG | SlurmHeadNode NFS
-| TCP | 6818 | SlurmComputeNodeSG | SlurmComputeNode slurmd
-| TCP | 6819 | SlurmHeadNodeSG | SlurmHeadNode slurmdbd
+| Type | Port range | Destination | Description | Details
+|------|------------|--------------------|-------------|--------
+| TCP | 2049 | SlurmHeadNodeSG | SlurmHeadNode NFS | Mount the slurm NFS file system with binaries and config
+| TCP | 6818 | SlurmComputeNodeSG | SlurmComputeNode slurmd | Connect to compute node for interactive jobs
+| TCP | 6819 | SlurmHeadNodeSG | SlurmHeadNode slurmdbd | Connect to slurmdbd (accounting database) daemon on the head node for versions before 3.10.0
+| TCP | 6819 | SlurmdbdSG | Slurmdbd | Connect to the external Slurmdbd instance for versions 3.10.0 and later
 | TCP | 6820-6829 | SlurmHeadNodeSG | SlurmHeadNode slurmctld
 | TCP | 6830 | SlurmHeadNodeSG | SlurmHeadNode slurmrestd
 
@@ -142,7 +219,7 @@ It needs at least the following inbound rules:
 | Type | Port range | Source | Description
 |------|------------|--------|------------
 | TCP | 2049 | SlurmSubmitterSG | SlurmSubmitter NFS
-| TCP | 6819 | SlurmSubmitterSG | SlurmSubmitter slurmdbd
+| TCP | 6819 | SlurmSubmitterSG | SlurmSubmitter slurmdbd. If not using an external Slurmdbd instance.
 | TCP | 6820-6829 | SlurmSubmitterSG | SlurmSubmitter slurmctld
 | TCP | 6830 | SlurmSubmitterSG | SlurmSubmitter slurmrestd
 
@@ -152,6 +229,18 @@ It needs the following outbound rules.
 |------|------------|-------------|------------
 | TCP | 1024-65535 | SlurmSubmitterSG | SlurmSubmitter ephemeral
 
+### External Slurmdbd Security Group
+
+**Note**: ParallelCluster 3.10.0 added support for an external Slurmdbd instance.
+
+The submitter must be able to directly access the Slurmdbd instance on port 6819 when running commands like `sacctmgr`.
+You must edit the inbound rules of the Slurmdbd instance's security group to allow this access.
+Add the following inbound rule.
+
+| Type | Port range | Source | Description
+|------|------------|--------|------------
+| TCP | 6819 | SlurmSubmitterSG | SlurmSubmitter slurmdbd
+
 ### Slurm Compute Node Security Group
 
 The SlurmComputeNodeSG will be specified in your configuration file for the slurm/InstanceConfig/AdditionalSecurityGroups parameter.
diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py
index 71f84b15..9d5143bf 100644
--- a/source/cdk/cdk_slurm_stack.py
+++ b/source/cdk/cdk_slurm_stack.py
@@ -279,22 +279,39 @@ def check_config(self):
             self.mount_home_src = mount_dict['src']
             logger.info(f"Mounting /home from {self.mount_home_src} on compute nodes")
 
-        if self.config['slurm']['ParallelClusterConfig']['Image']['Os'] == 'rocky8':
-            if not config_schema.PARALLEL_CLUSTER_SUPPORTS_CUSTOM_ROCKY_8(self.PARALLEL_CLUSTER_VERSION):
-                logger.error(f"rocky8 is not supported in ParallelCluster version {self.PARALLEL_CLUSTER_VERSION}. Support added in {PARALLEL_CLUSTER_SUPPORTS_CUSTOM_ROCKY_8_VERSION}.")
-                config_errors += 1
+        # Check OS
+        if self.config['slurm']['ParallelClusterConfig']['Image']['Os'] not in config_schema.get_PARALLEL_CLUSTER_ALLOWED_OSES(self.config):
+            logger.error(f"{self.config['slurm']['ParallelClusterConfig']['Image']['Os']} is not supported in ParallelCluster version {self.PARALLEL_CLUSTER_VERSION}.")
+            logger.info(f"alinux2: Supported in versions >= {config_schema.MIN_PARALLEL_CLUSTER_VERSION}")
+            logger.info(f"alinux2023: Supported in versions >= {config_schema.PARALLEL_CLUSTER_SUPPORTS_AMAZON_LINUX_2023_MIN_VERSION}")
+            logger.info(f"centos7: Supported in versions >= {config_schema.PARALLEL_CLUSTER_SUPPORTS_CENTOS_7_MIN_VERSION} and < {config_schema.PARALLEL_CLUSTER_SUPPORTS_CENTOS_7_DEPRECATED_VERSION}")
+            logger.info(f"rhel8: Supported in versions >= {config_schema.PARALLEL_CLUSTER_SUPPORTS_RHEL_8_MIN_VERSION}")
+            logger.info(f"rhel9: Supported in versions >= {config_schema.PARALLEL_CLUSTER_SUPPORTS_RHEL_9_MIN_VERSION}")
+            logger.info(f"rocky8: Supported in versions >= {config_schema.PARALLEL_CLUSTER_SUPPORTS_ROCKY_8_MIN_VERSION}")
+            logger.info(f"rocky9: Supported in versions >= {config_schema.PARALLEL_CLUSTER_SUPPORTS_ROCKY_9_MIN_VERSION}")
+            logger.info(f"ubuntu2004: Supported in versions >= {config_schema.MIN_PARALLEL_CLUSTER_VERSION}")
+            logger.info(f"ubuntu2204: Supported in versions >= {config_schema.MIN_PARALLEL_CLUSTER_VERSION}")
+            config_errors += 1
+
+        # Rocky 8 & 9 require a custom AMI because ParallelCluster doesn't provide one.
+ if self.config['slurm']['ParallelClusterConfig']['Image']['Os'] in ['rocky8', 'rocky9']: if 'CustomAmi' not in self.config['slurm']['ParallelClusterConfig']['Image']: - logger.error(f"Must specify config slurm/ParallelClusterConfig/Image/Os/CustomAmi with rocky8.") + logger.error(f"Must specify config slurm/ParallelClusterConfig/Image/Os/CustomAmi with rocky8 and rocky9.") config_errors += 1 + # Can't have both DatabaseStackName and SlurmdbdStackName + if 'Database' in self.config['slurm']['ParallelClusterConfig'] and 'Slurmdbd' in self.config['slurm']['ParallelClusterConfig']: + logger.error(f"Cannot specify both Database and Slurmdbd in config.") + exit(1) + if 'Database' in self.config['slurm']['ParallelClusterConfig']: - required_keys = ['ClientSecurityGroup', 'FQDN', 'Port', 'AdminUserName', 'AdminPasswordSecretArn'] + required_database_keys = ['ClientSecurityGroup', 'FQDN', 'Port', 'AdminUserName', 'AdminPasswordSecretArn'] if 'DatabaseStackName' in self.config['slurm']['ParallelClusterConfig']['Database']: invalid_keys = [] for database_key in self.config['slurm']['ParallelClusterConfig']['Database']: if database_key in ['DatabaseStackName']: continue - if database_key in required_keys: + if database_key in required_database_keys: logger.error(f"Cannot specify slurm/ParallelClusterConfig/Database/{database_key} and slurm/ParallelClusterConfig/Database/[Database,EdaSlurmCluster]StackName") invalid_keys.append(database_key) config_errors += 1 @@ -352,11 +369,62 @@ def check_config(self): logger.error(f"{output} output not found in self.config['slurm']['ParallelClusterConfig']['Database']['DatabaseStackName'] stack to set slurm/ParallelClusterConfig/Database/{database_key}") else: - for database_key in required_keys: + for database_key in required_database_keys: if database_key not in self.config['slurm']['ParallelClusterConfig']['Database']: logger.error(f"Must specify slurm/ParallelClusterConfig/Database/{database_key} when slurm/ParallelClusterConfig/Database/[Database,EdaSlurmCluster]StackName not set") config_errors += 1 + if 'Slurmdbd' in self.config['slurm']['ParallelClusterConfig']: + required_slurmdbd_keys = ['Host', 'Port', 'ClientSecurityGroup'] + if 'SlurmdbdStackName' in self.config['slurm']['ParallelClusterConfig']['Slurmdbd']: + cfn_client = boto3.client('cloudformation', region_name=self.config['Region']) + # Check that the stack exists + stacks_list = cfn_client.describe_stacks(StackName=self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['SlurmdbdStackName'])['Stacks'] + if not stacks_list: + logger.error(f"No stack named {self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['SlurmdbdStackName']} found.") + exit(1) + if len(stacks_list) > 1: + logger.error(f"More than 1 slurmdbd stack with name=={self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['SlurmdbdStackName']}. Please report a bug.") + for index, stack_dict in enumerate(stacks_list): + logger.error(f" stack[{index}]: StackName={stack_dict['StackName']} StackId={stack_dict['StackId']}") + exit(1) + + # Check to make sure that the slurmdbd instance is in the same VPC. 
+ parameter_dicts = cfn_client.describe_stacks(StackName=self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['SlurmdbdStackName'])['Stacks'][0]['Parameters'] + vpc_checked = False + for parameter_dict in parameter_dicts: + if parameter_dict['ParameterKey'] == 'VPCId': + slurmdbd_vpc_id = parameter_dict['ParameterValue'] + if slurmdbd_vpc_id != self.config['VpcId']: + logger.error(f"Config slurm/ParallelClusterConfig/Slurmdbd/SlurmdbdStackName({self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['SlurmdbdStackName']}) is deployed in {slurmdbd_vpc_id} but needs to be in {self.config['VpcId']}") + config_errors += 1 + vpc_checked = True + break + if not vpc_checked: + logger.error(f"Didn't find VPCId parameter for slurmdbd in\n{json.dumps(parameter_dicts, indent=4)}") + exit(1) + + if 'Outputs' not in stacks_list[0]: + logger.error(f"No outputs found in {self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['SlurmdbdStackName']}. StackStatus={stacks_list[0]['StackStatus']}") + exit(1) + stack_outputs = stacks_list[0]['Outputs'] + output_to_key_map = { + 'SlurmdbdPrivateIp': 'Host', + 'SlurmdbdPort': 'Port', + 'AccountingClientSecurityGroup': 'ClientSecurityGroup', + } + for output in stack_outputs: + if output['OutputKey'] in output_to_key_map: + slurmdbd_key = output_to_key_map[output['OutputKey']] + if slurmdbd_key == 'Port': + value = int(output['OutputValue']) + else: + value = output['OutputValue'] + self.config['slurm']['ParallelClusterConfig']['Slurmdbd'][slurmdbd_key] = value + for output, slurmdbd_key in output_to_key_map.items(): + if slurmdbd_key not in self.config['slurm']['ParallelClusterConfig']['Slurmdbd']: + logger.error(f"{output} output not found in self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['SlurmdbdStackName'] stack to set slurm/ParallelClusterConfig/Slurmdbd/{slurmdbd_key}") + if self.config['slurm']['ParallelClusterConfig']['Image']['Os'] == 'centos7' and self.config['slurm']['ParallelClusterConfig']['Architecture'] != 'x86_64': logger.error(f'centos7 only supports x86_64 architecture. 
Update slurm/ParallelClusterConfig/Architecture.') config_errors += 1 @@ -803,11 +871,6 @@ def create_parallel_cluster_assets(self): 'x86_64': {} } }, - 'centos': { - '7': { - 'x86_64': {} - } - }, 'rhel': { '8': { 'arm64': {}, @@ -815,13 +878,47 @@ def create_parallel_cluster_assets(self): }, } } - if config_schema.PARALLEL_CLUSTER_SUPPORTS_CUSTOM_ROCKY_8(self.PARALLEL_CLUSTER_VERSION): + if config_schema.PARALLEL_CLUSTER_SUPPORTS_AMAZON_LINUX_2023(self.PARALLEL_CLUSTER_VERSION): + self.ami_builds['amzn'] = { + '2023': { + 'arm64': {}, + 'x86_64': {} + } + } + if config_schema.PARALLEL_CLUSTER_SUPPORTS_CENTOS_7(self.PARALLEL_CLUSTER_VERSION): + self.ami_builds['centos'] = { + '7': { + 'x86_64': {} + } + } + if config_schema.PARALLEL_CLUSTER_SUPPORTS_RHEL_9(self.PARALLEL_CLUSTER_VERSION): + self.ami_builds['rhel'] = { + '9': { + 'arm64': {}, + 'x86_64': {} + } + } + if config_schema.PARALLEL_CLUSTER_SUPPORTS_ROCKY_8(self.PARALLEL_CLUSTER_VERSION): self.ami_builds['Rocky'] = { '8': { 'arm64': {}, 'x86_64': {} } } + if config_schema.PARALLEL_CLUSTER_SUPPORTS_ROCKY_9(self.PARALLEL_CLUSTER_VERSION): + self.ami_builds['Rocky'] = { + '9': { + 'arm64': {}, + 'x86_64': {} + } + } + if config_schema.PARALLEL_CLUSTER_SUPPORTS_ROCKY_9(self.PARALLEL_CLUSTER_VERSION): + self.ami_builds['Rocky'] = { + '9': { + 'arm64': {}, + 'x86_64': {} + } + } self.s3_client.put_object( Bucket = self.assets_bucket, Key = f"{self.assets_base_key}/config/build-files/build-file-amis.json", @@ -1459,6 +1556,7 @@ def create_parallel_cluster_lambdas(self): statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ + 'cloudformation:DescribeStacks', 'ec2:DescribeInstances', 'ssm:GetCommandInvocation', 'ssm:SendCommand', @@ -2013,7 +2111,9 @@ def get_instance_template_vars(self, instance_role): if instance_role == 'ParallelClusterHeadNode': instance_template_vars['pc_slurm_version'] = get_PC_SLURM_VERSION(self.config) if 'Database' in self.config['slurm']['ParallelClusterConfig']: - instance_template_vars['accounting_storage_host'] = 'pcvluster-head-node' + instance_template_vars['accounting_storage_host'] = 'pcluster-head-node' + elif 'Slurmdbd' in self.config['slurm']['ParallelClusterConfig']: + instance_template_vars['accounting_storage_host'] = self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['Host'] else: instance_template_vars['accounting_storage_host'] = '' instance_template_vars['licenses'] = self.config['Licenses'] @@ -2030,15 +2130,16 @@ def get_instance_template_vars(self, instance_role): instance_template_vars['slurmrestd_socket'] = f"{instance_template_vars['slurmrestd_socket_dir']}/slurmrestd.socket" instance_template_vars['slurmrestd_uid'] = self.config['slurm']['SlurmCtl']['SlurmrestdUid'] elif instance_role == 'ParallelClusterSubmitter': - instance_template_vars['slurm_version'] = get_SLURM_VERSION(self.config) + instance_template_vars['slurm_version'] = get_SLURM_VERSION(self.config) instance_template_vars['parallel_cluster_munge_version'] = get_PARALLEL_CLUSTER_MUNGE_VERSION(self.config) - instance_template_vars['slurmrestd_port'] = self.slurmrestd_port - instance_template_vars['file_system_mount_path'] = f'/opt/slurm/{cluster_name}' - instance_template_vars['slurm_base_dir'] = f'/opt/slurm/{cluster_name}' - instance_template_vars['submitter_slurm_base_dir'] = f'/opt/slurm/{cluster_name}' - instance_template_vars['slurm_config_dir'] = f'/opt/slurm/{cluster_name}/config' - instance_template_vars['slurm_etc_dir'] = f'/opt/slurm/{cluster_name}/etc' - 
instance_template_vars['modulefiles_base_dir'] = f'/opt/slurm/{cluster_name}/config/modules/modulefiles' + instance_template_vars['slurmrestd_port'] = self.slurmrestd_port + instance_template_vars['file_system_mount_path'] = f'/opt/slurm/{cluster_name}' + instance_template_vars['slurm_base_dir'] = f'/opt/slurm/{cluster_name}' + instance_template_vars['submitter_slurm_base_dir'] = f'/opt/slurm/{cluster_name}' + instance_template_vars['slurm_config_dir'] = f'/opt/slurm/{cluster_name}/config' + instance_template_vars['slurm_etc_dir'] = f'/opt/slurm/{cluster_name}/etc' + instance_template_vars['slurm_uid'] = self.config['slurm']['SlurmUid'] + instance_template_vars['modulefiles_base_dir'] = f'/opt/slurm/{cluster_name}/config/modules/modulefiles' elif instance_role == 'ParallelClusterComputeNode': pass @@ -2403,10 +2504,6 @@ def create_parallel_cluster_config(self): } ) - if 'Database' in self.config['slurm']['ParallelClusterConfig']: - for security_group_name, security_group_id in self.config['slurm']['ParallelClusterConfig']['Database']['ClientSecurityGroup'].items(): - self.parallel_cluster_config['HeadNode']['Networking']['AdditionalSecurityGroups'].append(security_group_id) - if 'LoginNodes' in self.config['slurm']['ParallelClusterConfig']: self.parallel_cluster_config['LoginNodes'] = self.config['slurm']['ParallelClusterConfig']['LoginNodes'] for login_node_pool in self.parallel_cluster_config['LoginNodes']['Pools']: @@ -2539,6 +2636,8 @@ def create_parallel_cluster_config(self): price = self.plugin.instance_type_and_family_info[self.cluster_region]['instance_types'][instance_type]['pricing']['spot']['max'] queue_name = f"{queue_name_prefix}-{instance_type}" queue_name = queue_name.replace('.', '-') + queue_name = queue_name.replace('large', 'l') + queue_name = queue_name.replace('medium', 'm') logger.info(f"Configuring {queue_name} queue:") if number_of_queues >= MAX_NUMBER_OF_QUEUES: logger.error(f"Can't create {queue_name} queue because MAX_NUMBER_OF_QUEUES=={MAX_NUMBER_OF_QUEUES} and have {number_of_queues} queues.") @@ -2549,10 +2648,12 @@ def create_parallel_cluster_config(self): number_of_queues += 1 compute_resource_name = f"{queue_name_prefix}-{instance_type}".replace('.', '-') + compute_resource_name = compute_resource_name.replace('large', 'l') + compute_resource_name = compute_resource_name.replace('medium', 'm') if number_of_compute_resources >= MAX_NUMBER_OF_COMPUTE_RESOURCES: logger.error(f"Can't create {compute_resource_name} compute resource because MAX_NUMBER_OF_COMPUTE_RESOURCES=={MAX_NUMBER_OF_COMPUTE_RESOURCES} and have {number_of_compute_resources} compute resources") exit(1) - logger.info(f" Adding {compute_resource_name:18} compute resource") + logger.info(f" Adding {compute_resource_name:25} compute resource") if compute_resource_name in self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts']: min_count = self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts'][compute_resource_name]['MinCount'] max_count = self.config['slurm']['InstanceConfig']['NodeCounts']['ComputeResourceCounts'][compute_resource_name]['MaxCount'] @@ -2654,6 +2755,17 @@ def create_parallel_cluster_config(self): 'UserName': self.config['slurm']['ParallelClusterConfig']['Database']['AdminUserName'], 'PasswordSecretArn': self.config['slurm']['ParallelClusterConfig']['Database']['AdminPasswordSecretArn'], } + for security_group_name, security_group_id in self.config['slurm']['ParallelClusterConfig']['Database']['ClientSecurityGroup'].items(): + 
self.parallel_cluster_config['HeadNode']['Networking']['AdditionalSecurityGroups'].append(security_group_id) + + if 'Slurmdbd' in self.config['slurm']['ParallelClusterConfig']: + self.parallel_cluster_config['Scheduling']['SlurmSettings']['ExternalSlurmdbd'] = { + 'Host': self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['Host'], + 'Port': self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['Port'] + } + self.parallel_cluster_config['HeadNode']['Networking']['AdditionalSecurityGroups'].append(self.config['slurm']['ParallelClusterConfig']['Slurmdbd']['ClientSecurityGroup']) + + if 'Database' in self.config['slurm']['ParallelClusterConfig'] or 'Slurmdbd' in self.config['slurm']['ParallelClusterConfig']: self.parallel_cluster_config['Scheduling']['SlurmSettings']['CustomSlurmSettings'].extend( [ {'AccountingStoreFlags': 'job_comment'}, @@ -2666,6 +2778,7 @@ def create_parallel_cluster_config(self): {'PriorityWeightJobSize': '0'}, ] ) + else: # Remote licenses configured using sacctmgr license_strings = [] @@ -2808,6 +2921,7 @@ def create_parallel_cluster_config(self): self.call_slurm_rest_api_lambda.node.add_dependency(self.parallel_cluster) # Custom resource to update the head node anytime the assets_hash changes + # This is to cover the case where the updates didn't affect the config and trigger a ParallelCluster update but a playbook or script changed and the update scripts should be run to ensure that the changes were applied. self.update_head_node = CustomResource( self, "UpdateHeadNode", service_token = self.update_head_node_lambda.function_arn, diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index 5b447f6f..c85734d8 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -80,6 +80,13 @@ # 3.9.3: # * Add support for FSx Lustre as a shared storage type in us-iso-east-1. # * Bug fixes +# 3.10.0: +# * Add new configuration section Scheduling/SlurmSettings/ExternalSlurmdbd to connect the cluster to an external Slurmdbd +# * CentOS 7 is no longer supported. +# * Upgrade munge to version 0.5.16 (from 0.5.15). +# * Upgrade Python to 3.9.19 (from 3.9.17). +# 3.10.1: +# * Build fix for China regions MIN_PARALLEL_CLUSTER_VERSION = parse_version('3.6.0') # Update source/resources/default_config.yml with latest version when this is updated. 
PARALLEL_CLUSTER_VERSIONS = [ @@ -93,6 +100,8 @@ '3.9.1', '3.9.2', '3.9.3', + '3.10.0', + '3.10.1', ] PARALLEL_CLUSTER_MUNGE_VERSIONS = { # This can be found on the head node at /opt/parallelcluster/sources @@ -107,6 +116,8 @@ '3.9.1': '0.5.15', # confirmed '3.9.2': '0.5.15', # confirmed '3.9.3': '0.5.15', # confirmed + '3.10.0': '0.5.16', # confirmed + '3.10.1': '0.5.16', # confirmed } PARALLEL_CLUSTER_PYTHON_VERSIONS = { # This can be found on the head node at /opt/parallelcluster/pyenv/versions @@ -120,6 +131,8 @@ '3.9.1': '3.9.17', # confirmed '3.9.2': '3.9.17', # confirmed '3.9.3': '3.9.17', # confirmed + '3.10.0': '3.9.19', # confirmed + '3.10.1': '3.9.19', # confirmed } PARALLEL_CLUSTER_SLURM_VERSIONS = { # This can be found on the head node at /etc/chef/local-mode-cache/cache/ @@ -133,6 +146,8 @@ '3.9.1': '23.11.4', # confirmed '3.9.2': '23.11.7', # confirmed '3.9.3': '23.11.7', # confirmed + '3.10.0': '23.11.7', # confirmed + '3.10.1': '23.11.7', # confirmed } PARALLEL_CLUSTER_PC_SLURM_VERSIONS = { # This can be found on the head node at /etc/chef/local-mode-cache/cache/ @@ -146,6 +161,8 @@ '3.9.1': '23-11-4-1', # confirmed '3.9.2': '23-11-7-1', # confirmed '3.9.3': '23-11-7-1', # confirmed + '3.10.0': '23-11-7-1', # confirmed + '3.10.1': '23-11-7-1', # confirmed } SLURM_REST_API_VERSIONS = { '23-02-2-1': '0.0.39', @@ -157,16 +174,6 @@ '23-11-4-1': '0.0.39', '23-11-7-1': '0.0.39', } -PARALLEL_CLUSTER_ALLOWED_OSES = [ - 'alinux2', - 'centos7', - 'rhel8', - 'rhel9', - 'rocky8', - 'rocky9', - 'ubuntu2004', - 'ubuntu2204' - ] def get_parallel_cluster_version(config): parallel_cluster_version = config['slurm']['ParallelClusterConfig']['Version'] @@ -175,6 +182,52 @@ def get_parallel_cluster_version(config): raise KeyError(parallel_cluster_version) return parallel_cluster_version +PARALLEL_CLUSTER_SUPPORTS_CENTOS_7_MIN_VERSION = MIN_PARALLEL_CLUSTER_VERSION +PARALLEL_CLUSTER_SUPPORTS_CENTOS_7_DEPRECATED_VERSION = parse_version('3.10.0') +def PARALLEL_CLUSTER_SUPPORTS_CENTOS_7(parallel_cluster_version): + return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_CENTOS_7_MIN_VERSION and parallel_cluster_version < PARALLEL_CLUSTER_SUPPORTS_CENTOS_7_DEPRECATED_VERSION + +PARALLEL_CLUSTER_SUPPORTS_RHEL_8_MIN_VERSION = parse_version('3.6.0') +def PARALLEL_CLUSTER_SUPPORTS_RHEL_8(parallel_cluster_version): + return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_RHEL_8_MIN_VERSION + +PARALLEL_CLUSTER_SUPPORTS_ROCKY_8_MIN_VERSION = parse_version('3.8.0') +def PARALLEL_CLUSTER_SUPPORTS_ROCKY_8(parallel_cluster_version): + return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_ROCKY_8_MIN_VERSION + +PARALLEL_CLUSTER_SUPPORTS_RHEL_9_MIN_VERSION = parse_version('3.9.0') +def PARALLEL_CLUSTER_SUPPORTS_RHEL_9(parallel_cluster_version): + return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_RHEL_9_MIN_VERSION + +PARALLEL_CLUSTER_SUPPORTS_ROCKY_9_MIN_VERSION = parse_version('3.9.0') +def PARALLEL_CLUSTER_SUPPORTS_ROCKY_9(parallel_cluster_version): + return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_ROCKY_9_MIN_VERSION + +PARALLEL_CLUSTER_SUPPORTS_AMAZON_LINUX_2023_MIN_VERSION = parse_version('3.10.0') +def PARALLEL_CLUSTER_SUPPORTS_AMAZON_LINUX_2023(parallel_cluster_version): + return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_AMAZON_LINUX_2023_MIN_VERSION + +def get_PARALLEL_CLUSTER_ALLOWED_OSES(config): + allowed_oses = [ + 'alinux2', + 'ubuntu2004', + 'ubuntu2204' + ] + parallel_cluster_version = parse_version(get_parallel_cluster_version(config)) + if 
PARALLEL_CLUSTER_SUPPORTS_AMAZON_LINUX_2023(parallel_cluster_version): + allowed_oses.append('alinux2023') + if PARALLEL_CLUSTER_SUPPORTS_CENTOS_7(parallel_cluster_version): + allowed_oses.append('centos7') + if PARALLEL_CLUSTER_SUPPORTS_RHEL_8(parallel_cluster_version): + allowed_oses.append('rhel8') + if PARALLEL_CLUSTER_SUPPORTS_RHEL_9(parallel_cluster_version): + allowed_oses.append('rhel9') + if PARALLEL_CLUSTER_SUPPORTS_ROCKY_8(parallel_cluster_version): + allowed_oses.append('rocky8') + if PARALLEL_CLUSTER_SUPPORTS_ROCKY_9(parallel_cluster_version): + allowed_oses.append('rocky9') + return sorted(allowed_oses) + def get_PARALLEL_CLUSTER_MUNGE_VERSION(config): parallel_cluster_version = get_parallel_cluster_version(config) return PARALLEL_CLUSTER_MUNGE_VERSIONS[parallel_cluster_version] @@ -521,7 +574,7 @@ def get_config_schema(config): 'Version': And(str, lambda version: version in PARALLEL_CLUSTER_VERSIONS, lambda version: parse_version(version) >= MIN_PARALLEL_CLUSTER_VERSION), Optional('ClusterConfig'): lambda s: True, Optional('Image', default={'Os': DEFAULT_OS(config)}): { - 'Os': And(str, lambda s: s in PARALLEL_CLUSTER_ALLOWED_OSES), + 'Os': And(str, lambda s: s in get_PARALLEL_CLUSTER_ALLOWED_OSES(config)), # CustomAmi: AMI to use for head and compute nodes instead of the pre-built AMIs. Optional('CustomAmi'): And(str, lambda s: s.startswith('ami-')), }, @@ -539,6 +592,12 @@ def get_config_schema(config): Optional('AdminPasswordSecretArn'): And(str, lambda s: s.startswith('arn:')), Optional('ClientSecurityGroup'): {str: And(str, lambda s: re.match('sg-', s))}, }, + Optional('Slurmdbd'): { + Optional('SlurmdbdStackName'): str, + Optional('Host'): str, + Optional('Port'): int, + Optional('ClientSecurityGroup'): And(str, lambda s: re.match('sg-', s)) + }, Optional('Dcv', default={}): { Optional('Enabled', default=False): bool, Optional('Port', default=8443): int, @@ -672,7 +731,7 @@ def get_config_schema(config): Optional('Partition', default='onprem'): str, } }, - Optional('SlurmUid', default=900): int, + Optional('SlurmUid', default=401): int, Optional('storage'): { # # ExtraMounts diff --git a/source/resources/lambdas/CreateBuildFiles/CreateBuildFiles.py b/source/resources/lambdas/CreateBuildFiles/CreateBuildFiles.py index 996c6810..1d79b34b 100644 --- a/source/resources/lambdas/CreateBuildFiles/CreateBuildFiles.py +++ b/source/resources/lambdas/CreateBuildFiles/CreateBuildFiles.py @@ -18,6 +18,8 @@ ''' Create/update/delete ParallelCluster AMI build configuration files and store them in S3. + +Don't fail if can't create build-files so that cluster can successfully deploy. 
 '''
 import boto3
 import cfnresponse
@@ -67,7 +69,7 @@ def get_image_builder_parent_image(distribution, version, architecture, parallel
     images = sorted(response['Images'], key=lambda image: image['CreationDate'], reverse=True)
     if not images:
         logger.error(f"No AMI found for {distribution} {version} {architecture}")
-        exit(1)
+        return None
     image_id = images[0]['ImageId']
     return image_id
 
@@ -175,6 +177,7 @@ def lambda_handler(event, context):
         build_file_template_content = response['Body'].read().decode('utf-8')
         build_file_template = Template(build_file_template_content)
 
+        error_count = 0
         for distribution in ami_builds:
             for version in ami_builds[distribution]:
                 for architecture in ami_builds[distribution][version]:
@@ -183,6 +186,9 @@ def lambda_handler(event, context):
                     else:
                         template_vars['InstanceType'] = 'c6i.2xlarge'
                     template_vars['ParentImage'] = get_image_builder_parent_image(distribution, version, architecture, parallelcluster_version)
+                    if not template_vars['ParentImage']:
+                        error_count += 1
+                        continue
                    template_vars['RootVolumeSize'] = int(get_ami_root_volume_size(template_vars['ParentImage'])) + 10
                     logger.info(f"{distribution}-{version}-{architecture} image id: {template_vars['ParentImage']} root volume size={template_vars['RootVolumeSize']}")
 
@@ -242,9 +248,12 @@ def lambda_handler(event, context):
                 Body = build_file_content
             )
 
+        if error_count:
+            raise RuntimeError(f"Errors occurred when creating build config files.")
+
     except Exception as e:
         logger.exception(str(e))
-        cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, physicalResourceId=cluster_name)
+        cfnresponse.send(event, context, cfnresponse.SUCCESS, {'error': str(e)}, physicalResourceId=cluster_name)
         sns_client = boto3.client('sns')
         sns_client.publish(
             TopicArn = environ['ErrorSnsTopicArn'],
@@ -252,6 +261,6 @@ def lambda_handler(event, context):
             Message = str(e)
         )
         logger.info(f"Published error to {environ['ErrorSnsTopicArn']}")
-        raise
+        return
 
     cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name)
diff --git a/source/resources/lambdas/UpdateHeadNode/UpdateHeadNode.py b/source/resources/lambdas/UpdateHeadNode/UpdateHeadNode.py
index 01ac2feb..8683131a 100644
--- a/source/resources/lambdas/UpdateHeadNode/UpdateHeadNode.py
+++ b/source/resources/lambdas/UpdateHeadNode/UpdateHeadNode.py
@@ -18,6 +18,10 @@
 '''
 Update the head node when the config assets hash changes.
+
+Some asset changes may cause a cluster update while others, like a playbook change, will not.
+First check to make sure that the cluster isn't already being updated.
+If it is already being updated then nothing needs to be done.
 '''
 import boto3
 import cfnresponse
@@ -71,6 +75,21 @@ def lambda_handler(event, context):
         cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name)
         return
 
+    # First check if the cluster is already being updated and return if it is.
+ cfn_client = boto3.client("cloudformation", region_name=cluster_region) + try: + stack_info = cfn_client.describe_stacks(StackName=cluster_name)['Stacks'][0] + except Exception as e: + logger.info(f"ParallelCluster stack {cluster_name} doesn't exist.\n{e}") + cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) + return + stack_status = stack_info['StackStatus'] + logger.info(f"ParallelCluster stack {cluster_name} in {stack_status} state.") + if stack_status in ['CREATE_IN_PROGRESS', 'UPDATE_IN_PROGRESS', 'UPDATE_COMPLETE_CLEANUP_IN_PROGRESS', 'UPDATE_ROLLBACK_IN_PROGRESS', 'UPDATE_ROLLBACK_COMPLETE_CLEANUP_IN_PROGRESS']: + logger.warning(f"ParallelCluster stack {cluster_name} update already initiated.") + cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) + return + head_node_ip_address = None head_node_instance_id = None ec2_client = boto3.client('ec2', region_name=cluster_region) diff --git a/source/resources/playbooks/roles/ParallelClusterSubmitterConfigure/tasks/main.yml b/source/resources/playbooks/roles/ParallelClusterSubmitterConfigure/tasks/main.yml index a33d2775..51963338 100644 --- a/source/resources/playbooks/roles/ParallelClusterSubmitterConfigure/tasks/main.yml +++ b/source/resources/playbooks/roles/ParallelClusterSubmitterConfigure/tasks/main.yml @@ -3,10 +3,13 @@ - name: Show vars used in this playbook debug: msg: | - cluster_name: {{ cluster_name }} - distribution: {{ distribution }} - region: {{ region }} - slurm_base_dir: {{ slurm_base_dir }} + cluster_name: {{ cluster_name }} + distribution: {{ distribution }} + parallel_cluster_munge_version: {{ parallel_cluster_munge_version }} + region: {{ region }} + slurm_base_dir: {{ slurm_base_dir }} + slurm_config_dir: {{ slurm_config_dir }} + slurm_uid: {{ slurm_uid }} - name: Add /opt/slurm/{{ cluster_name }} to /etc/fstab mount: @@ -53,9 +56,28 @@ state: present create_home: no + +- name: Get current munge version + register: munged_version_output + shell: | + if ! [ -e /usr/sbin/munged ]; then + echo "NONE" + exit 0 + fi + /usr/sbin/munged -V | awk '{print $1}' + +- set_fact: + act_munged_version: "{{ munged_version_output.stdout }}" + exp_munged_version: "munge-{{ parallel_cluster_munge_version }}" + +- name: Show munged_version + debug: + msg: | + act_munged_version: "{{ act_munged_version }}" + exp_munged_version: "{{ exp_munged_version }}" + - name: Build munge version used by ParallelCluster ({{ parallel_cluster_munge_version }}) - args: - creates: /usr/sbin/munged + when: act_munged_version != exp_munged_version shell: | set -ex @@ -116,6 +138,7 @@ system: yes state: present create_home: no + uid: "{{ slurm_uid }}" - name: Configure modules template: diff --git a/source/slurm_installer/installer.py b/source/slurm_installer/installer.py index 4316472a..cdf6d57f 100755 --- a/source/slurm_installer/installer.py +++ b/source/slurm_installer/installer.py @@ -352,6 +352,27 @@ def main(self): with open("installer_history.txt", "a+") as f: f.write(f"\n[{datetime.datetime.utcnow()}] {cmd}") + # Check that the ParallelCluster stack isn't being created or updated. 
+        if 'ClusterName' not in self.config['slurm']:
+            if self.config['StackName'].endswith('-config'):
+                self.config['slurm']['ClusterName'] = self.config['StackName'][0:-7]
+            else:
+                self.config['slurm']['ClusterName'] = f"{self.config['StackName']}-cl"
+            logger.info(f"slurm/ClusterName defaulted to {self.config['slurm']['ClusterName']}")
+        stack_name = self.config['slurm']['ClusterName']
+        cfn_client = session.client("cloudformation", region_name=region)
+        try:
+            stack_info = cfn_client.describe_stacks(StackName=stack_name)['Stacks'][0]
+        except Exception:
+            logger.info(f"ParallelCluster stack ({stack_name}) doesn't exist.")
+            stack_info = None
+        if stack_info:
+            stack_status = stack_info['StackStatus']
+            logger.info(f"ParallelCluster stack ({stack_name}) in {stack_status} state.")
+            if stack_status not in ['CREATE_COMPLETE', 'ROLLBACK_COMPLETE', 'UPDATE_COMPLETE', 'UPDATE_ROLLBACK_COMPLETE']:
+                logger.error(f"ParallelCluster stack ({stack_name}) is in an invalid state: {stack_status}")
+                exit(1)
+
         # Then launch the actual CDK installer
         logger.info("\n====== Deploying SLURM ======\n")
         launch_installer = os.system(cmd) # nosec