Skip to content

Commit

Permalink
added pcluster config
Browse files Browse the repository at this point in the history
  • Loading branch information
KeitaW committed Jul 17, 2023
1 parent 792e262 commit 129d98a
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 7 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# AWS ParallelCluster configuration template for a Slurm cluster with one GPU
# queue, an FSx for Lustre file system and a shared EBS volume.
# The ${VAR} placeholders are not valid values on their own: they are expected
# to be substituted (e.g. with envsubst) before the file is passed to pcluster.
Imds:
  ImdsSupport: v2.0
Image:
  Os: alinux2
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
    SubnetId: ${PUBLIC_SUBNET_ID}
  Ssh:
    KeyName: ${SSH_KEY}
  LocalStorage:
    RootVolume:
      Size: 100
      DeleteOnTermination: true # that's your root and /home volume for users
  Iam:
    AdditionalIamPolicies: # grant ECR, SSM and S3 read access
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
  CustomActions:
    OnNodeConfigured:
      Sequence:
        - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/docker/postinstall.sh'
        - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/pyxis/postinstall.sh'
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    ScaledownIdletime: 60
  SlurmQueues:
    - Name: compute-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - ${PRIVATE_SUBNET_ID}
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            MountDir: /local_scratch # each instance has a local scratch on NVMe
          RootVolume:
            Size: 200
      ComputeResources:
        - Name: distributed-ml
          InstanceType: ${COMPUTE_INSTANCE_TYPE}
          # If MinCount equals MaxCount, capacity is maintained and the
          # queue will not scale down.
          MinCount: ${COMPUTE_INSTANCE_MIN_COUNT}
          MaxCount: ${COMPUTE_INSTANCE_MAX_COUNT}
          Efa:
            Enabled: true
          # The capacity reservation section is recommended if you use
          # instances with targeted ODCRs. You can also use a capacity
          # resource group and CapacityReservationResourceGroupArn if you
          # want to regroup multiple reservations.
          CapacityReservationTarget:
            CapacityReservationId: ${CAPACITY_RESERVATION_ID}
          Networking:
            PlacementGroup:
              Enabled: true
              Id: ${PLACEMENT_GROUP_ID}
      # Compute nodes run the same docker and pyxis post-install scripts as
      # the head node.
      CustomActions:
        OnNodeConfigured:
          Sequence:
            - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/docker/postinstall.sh'
            - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/pyxis/postinstall.sh'
SharedStorage:
  - MountDir: /fsx
    Name: fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      StorageCapacity: 4800 # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250 # this can be increased to 500 or 1000
      DeploymentType: PERSISTENT_2
      ImportPath: s3://${S3_BUCKET_NAME}
  - Name: SharedEBS
    StorageType: Ebs
    MountDir: /apps # Store your shared apps & scripts here
    EbsSettings:
      VolumeType: gp3
      Size: 200
      Throughput: 300
      Iops: 6000
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true # good for debug
  Dashboards:
    CloudWatch:
      Enabled: false # provide basic dashboards
24 changes: 24 additions & 0 deletions 1.architectures/s3/s3-bucket.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# CloudFormation template that creates one S3 bucket with all public access
# blocked. The bucket name is supplied by the caller via the S3BucketName
# parameter and is echoed back as a stack output.
AWSTemplateFormatVersion: '2010-09-09'
Description: CloudFormation template that creates a private S3 bucket.

Parameters:
  S3BucketName:
    Description: Name of the S3 bucket to create.
    Type: String

Resources:
  S3Bucket:
    Type: AWS::S3::Bucket
    Properties:
      # !Ref on a String parameter returns its value; no !Sub needed.
      BucketName: !Ref S3BucketName
      AccessControl: Private
      PublicAccessBlockConfiguration:
        BlockPublicAcls: true
        BlockPublicPolicy: true
        IgnorePublicAcls: true
        RestrictPublicBuckets: true

Outputs:
  S3BucketName:
    Description: Name of the created S3 bucket.
    # !Ref on an AWS::S3::Bucket resource returns the bucket name.
    Value: !Ref S3Bucket

6 changes: 3 additions & 3 deletions 3.test_cases/3.MPT/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ deploy-s3: # Deploy S3 bucket using CloudFormation
bash scripts/deploy-s3.sh

# Stack creation is delegated to scripts/deploy-vpc.sh, which sources
# config.env for its settings. The recipe must not also call
# `aws cloudformation create-stack` inline: the stack would be created twice
# and the second call would fail because the stack already exists.
.PHONY: deploy-vpc
deploy-vpc: # Deploy VPC using CloudFormation
	bash scripts/deploy-vpc.sh

# scripts/deploy-pcluster.sh reads the subnet IDs from the outputs of the
# vpc-$(NAME) CloudFormation stack, so that stack must already exist.
.PHONY: deploy-pcluster
deploy-pcluster: # Deploy the cluster using AWS ParallelCluster
	bash scripts/deploy-pcluster.sh
19 changes: 15 additions & 4 deletions 3.test_cases/3.MPT/config.env
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,18 @@ function run(){
} >&2
fi
}
# Deployment settings. Every value is exported so that child processes
# started by the deploy scripts (aws, envsubst, pcluster) can read it.
export NAME=llm-foundry
# Assign first, export second: `export VAR=$(cmd)` always returns the exit
# status of `export`, which would hide a failing AWS CLI call.
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
export ACCOUNT_ID

# S3
export S3_BUCKET_NAME="${NAME}-${ACCOUNT_ID}-bucket"

# VPC
export VPC_NAME="${NAME}-${ACCOUNT_ID}-vpc"
export REGION=ap-northeast-1
export AZ=ap-northeast-1a

# PCLUSTER
# NOTE(review): the placement group, capacity reservation and SSH key below
# look specific to one AWS account — confirm or replace before deploying.
export COMPUTE_INSTANCE_TYPE=p4d.24xlarge
export PLACEMENT_GROUP_ID=p4d-placement-group
export CAPACITY_RESERVATION_ID=cr-075894134626f63cb
export COMPUTE_INSTANCE_MIN_COUNT=2
export COMPUTE_INSTANCE_MAX_COUNT=2
export SSH_KEY=mlkeita-user-admin-dev-machine
13 changes: 13 additions & 0 deletions 3.test_cases/3.MPT/scripts/deploy-pcluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Render the ParallelCluster configuration template with envsubst and create
# the cluster with `pcluster create-cluster`.
#
# config.env and the template are located through paths relative to the
# current directory, so run this from the directory that holds config.env
# (the Makefile invokes it as `bash scripts/deploy-pcluster.sh`).
# External tools used: aws, jq, envsubst, mktemp, pcluster.
. config.env

# NOTE(review): "clususter" looks like a typo — confirm it matches the file
# name on disk before changing it.
CLUSTER_TEMPLATE=../../1.architectures/2.aws-parallelcluster/distributed-training-clususter-with-container.yaml
VPC_STACK_NAME="vpc-${NAME}"

# Print the value of the VPC stack output whose OutputKey equals $1.
get_vpc_stack_output() {
    aws cloudformation describe-stacks --stack-name "${VPC_STACK_NAME}" --region "${REGION}" \
        | jq -r --arg key "$1" '.Stacks[] | .Outputs[] | select(.OutputKey == $key) | .OutputValue'
}

# Assign first, export second, so a failing lookup is not masked by the exit
# status of `export`. Both variables must be exported for envsubst to see them.
PRIVATE_SUBNET_ID=$(get_vpc_stack_output PrivateSubnet)
PUBLIC_SUBNET_ID=$(get_vpc_stack_output PublicSubnet)
export PRIVATE_SUBNET_ID PUBLIC_SUBNET_ID

# Stop before rendering a configuration with empty subnet IDs, which happens
# when the VPC stack is missing or has not finished creating its outputs.
if [[ -z "${PRIVATE_SUBNET_ID}" || -z "${PUBLIC_SUBNET_ID}" ]]; then
    echo "error: could not read the PrivateSubnet/PublicSubnet outputs of stack ${VPC_STACK_NAME}" >&2
    exit 1
fi

TMPFILE=$(mktemp)
# The path is printed so the rendered configuration can be inspected later.
echo "${TMPFILE}"
envsubst < "${CLUSTER_TEMPLATE}" > "${TMPFILE}"
# set_options and run are not defined in this script; they come from config.env.
set_options
run cat "${TMPFILE}"
run pcluster create-cluster --cluster-configuration "${TMPFILE}" --cluster-name "pcluster-${NAME}" --region "${REGION}"
8 changes: 8 additions & 0 deletions 3.test_cases/3.MPT/scripts/deploy-vpc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Create the vpc-${NAME} CloudFormation stack from the one-AZ VPC template.
#
# config.env and the template are located through paths relative to the
# current directory, so run this from the directory that holds config.env
# (the Makefile invokes it as `bash scripts/deploy-vpc.sh`).
. config.env

# set_options and run are not defined in this script; they come from config.env.
set_options
# Expansions are quoted so each value reaches the AWS CLI as a single argument
# even if it contains whitespace or glob characters.
run aws cloudformation create-stack --stack-name "vpc-${NAME}" \
    --template-body file://../../1.architectures/1.vpc_network/2.vpc-one-az.yaml \
    --parameters "ParameterKey=VPCName,ParameterValue=${VPC_NAME}" "ParameterKey=SubnetsAZ,ParameterValue=${AZ}" \
    --region "${REGION}" --capabilities=CAPABILITY_IAM

0 comments on commit 129d98a

Please sign in to comment.