diff --git a/1.architectures/2.aws-parallelcluster/distributed-training-clususter-with-container.yaml b/1.architectures/2.aws-parallelcluster/distributed-training-clususter-with-container.yaml
new file mode 100644
index 00000000..8c45ef18
--- /dev/null
+++ b/1.architectures/2.aws-parallelcluster/distributed-training-clususter-with-container.yaml
@@ -0,0 +1,87 @@
+Imds:
+  ImdsSupport: v2.0
+Image:
+  Os: alinux2
+HeadNode:
+  InstanceType: m5.8xlarge
+  Networking:
+    SubnetId: ${PUBLIC_SUBNET_ID}
+  Ssh:
+    KeyName: ${SSH_KEY}
+  LocalStorage:
+    RootVolume:
+      Size: 100
+      DeleteOnTermination: true # that's your root and /home volume for users
+  Iam:
+    AdditionalIamPolicies: # grant ECR, SSM and S3 read access
+      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
+      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
+      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
+  CustomActions:
+    OnNodeConfigured:
+      Sequence:
+        - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/docker/postinstall.sh'
+        - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/pyxis/postinstall.sh'
+Scheduling:
+  Scheduler: slurm
+  SlurmSettings:
+    ScaledownIdletime: 60
+  SlurmQueues:
+    - Name: compute-gpu
+      CapacityType: ONDEMAND
+      Networking:
+        SubnetIds:
+          - ${PRIVATE_SUBNET_ID}
+      ComputeSettings:
+        LocalStorage:
+          EphemeralVolume:
+            MountDir: /local_scratch # each instance has a local scratch on NVMe
+          RootVolume:
+            Size: 200
+      ComputeResources:
+        - Name: distributed-ml
+          InstanceType: ${COMPUTE_INSTANCE_TYPE}
+          MinCount: ${COMPUTE_INSTANCE_MIN_COUNT} # if min = max then capacity is maintained and will
+          MaxCount: ${COMPUTE_INSTANCE_MAX_COUNT} # not scale down
+          Efa:
+            Enabled: true
+          # The capacity reservation section is recommended if you use instances
+          # with targeted ODCRs. You can also use a capacity resource group and
+          # CapacityReservationResourceGroupArn if you want to regroup
+          # multiple reservations
+          CapacityReservationTarget:
+            CapacityReservationId: ${CAPACITY_RESERVATION_ID}
+          Networking:
+            PlacementGroup:
+              Enabled: true
+              Id: ${PLACEMENT_GROUP_ID}
+      CustomActions:
+        OnNodeConfigured:
+          Sequence:
+            - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/docker/postinstall.sh'
+            - Script: 'https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/main/pyxis/postinstall.sh'
+SharedStorage:
+  - MountDir: /fsx
+    Name: fsx
+    StorageType: FsxLustre
+    FsxLustreSettings:
+      StorageCapacity: 4800 # size it to your storage and throughput needs
+      PerUnitStorageThroughput: 250 # this can be increased to 500 or 1000
+      DeploymentType: PERSISTENT_2
+      ImportPath: s3://${S3_BUCKET_NAME}
+  - Name: SharedEBS
+    StorageType: Ebs
+    MountDir: /apps # Store your shared apps & scripts here
+    EbsSettings:
+      VolumeType: gp3
+      Size: 200
+      Throughput: 300
+      Iops: 6000
+Monitoring:
+  DetailedMonitoring: true
+  Logs:
+    CloudWatch:
+      Enabled: true # good for debug
+  Dashboards:
+    CloudWatch:
+      Enabled: false # provide basic dashboards
diff --git a/1.architectures/s3/s3-bucket.yaml b/1.architectures/s3/s3-bucket.yaml
new file mode 100644
index 00000000..a5ef757c
--- /dev/null
+++ b/1.architectures/s3/s3-bucket.yaml
@@ -0,0 +1,24 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Description: CloudFormation template to create an S3 bucket
+
+Parameters:
+  S3BucketName:
+    Description: Name of the S3 bucket to create.
+    Type: String
+
+Resources:
+  S3Bucket:
+    Type: AWS::S3::Bucket
+    Properties:
+      BucketName: !Sub ${S3BucketName}
+      AccessControl: Private
+      PublicAccessBlockConfiguration:
+        BlockPublicAcls: True
+        BlockPublicPolicy: True
+        IgnorePublicAcls: True
+        RestrictPublicBuckets: True
+
+Outputs:
+  S3BucketName:
+    Value: !Ref S3Bucket
+
diff --git a/3.test_cases/3.MPT/Makefile b/3.test_cases/3.MPT/Makefile
index 2d30f488..52ea57cc 100644
--- a/3.test_cases/3.MPT/Makefile
+++ b/3.test_cases/3.MPT/Makefile
@@ -41,7 +41,7 @@
 deploy-s3: # Deploy S3 bucket using CloudFormation
 	bash scripts/deploy-s3.sh
 
 deploy-vpc: # Deploy VPC using CloudFormation
-	aws cloudformation create-stack --stack-name vpc-$(NAME) --template-body file://cloudformation/VPC-Large-Scale-singleAZ.yaml \
-		--parameters ParameterKey=VPCName,ParameterValue=$(VPC_NAME) ParameterKey=SubnetsAZ,ParameterValue=$(AZ) \
-		--region $(REGION) --capabilities=CAPABILITY_IAM
+	bash scripts/deploy-vpc.sh
+deploy-pcluster: # Deploy AWS ParallelCluster
+	bash scripts/deploy-pcluster.sh
\ No newline at end of file
diff --git a/3.test_cases/3.MPT/config.env b/3.test_cases/3.MPT/config.env
index 5e74fc80..918d98d1 100644
--- a/3.test_cases/3.MPT/config.env
+++ b/3.test_cases/3.MPT/config.env
@@ -16,7 +16,18 @@ function run(){
     } >&2
   fi
 }
-NAME=llm-foundry
-ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
-S3_BUCKET_NAME=${NAME}-${ACCOUNT_ID}-bucket
-REGION=ap-northeast-1
\ No newline at end of file
+export NAME=llm-foundry
+export ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
+# S3
+export S3_BUCKET_NAME=${NAME}-${ACCOUNT_ID}-bucket
+# VPC
+export VPC_NAME=${NAME}-${ACCOUNT_ID}-vpc
+export REGION=ap-northeast-1
+export AZ=ap-northeast-1a
+# PCLUSTER
+export COMPUTE_INSTANCE_TYPE=p4d.24xlarge
+export PLACEMENT_GROUP_ID=p4d-placement-group
+export CAPACITY_RESERVATION_ID=cr-075894134626f63cb
+export COMPUTE_INSTANCE_MIN_COUNT=2
+export COMPUTE_INSTANCE_MAX_COUNT=2
+export SSH_KEY=mlkeita-user-admin-dev-machine
\ No newline at end of file
diff --git a/3.test_cases/3.MPT/scripts/deploy-pcluster.sh b/3.test_cases/3.MPT/scripts/deploy-pcluster.sh
new file mode 100644
index 00000000..979fb1c8
--- /dev/null
+++ b/3.test_cases/3.MPT/scripts/deploy-pcluster.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+. config.env
+
+export PRIVATE_SUBNET_ID=$(aws cloudformation describe-stacks --stack-name vpc-${NAME} --region ${REGION} \
+  | jq -r '.Stacks[] | .Outputs[] | select(.OutputKey == "PrivateSubnet") | .OutputValue')
+export PUBLIC_SUBNET_ID=$(aws cloudformation describe-stacks --stack-name vpc-${NAME} --region ${REGION} \
+  | jq -r '.Stacks[] | .Outputs[] | select(.OutputKey == "PublicSubnet") | .OutputValue')
+TMPFILE=$(mktemp)
+echo ${TMPFILE}
+cat ../../1.architectures/2.aws-parallelcluster/distributed-training-clususter-with-container.yaml | envsubst > ${TMPFILE}
+set_options
+run cat ${TMPFILE}
+run pcluster create-cluster --cluster-configuration ${TMPFILE} --cluster-name pcluster-${NAME} --region ${REGION}
\ No newline at end of file
diff --git a/3.test_cases/3.MPT/scripts/deploy-vpc.sh b/3.test_cases/3.MPT/scripts/deploy-vpc.sh
new file mode 100644
index 00000000..6b7b212b
--- /dev/null
+++ b/3.test_cases/3.MPT/scripts/deploy-vpc.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+. config.env
+
+set_options
+run aws cloudformation create-stack --stack-name vpc-${NAME} \
+  --template-body file://../../1.architectures/1.vpc_network/2.vpc-one-az.yaml \
+  --parameters ParameterKey=VPCName,ParameterValue=${VPC_NAME} ParameterKey=SubnetsAZ,ParameterValue=${AZ} \
+  --region ${REGION} --capabilities=CAPABILITY_IAM
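
Taken together, these changes let the MPT test case provision its infrastructure from the Makefile. A minimal usage sketch, assuming config.env has been edited with your own region, capacity reservation, placement group, and SSH key (the values in this diff are the author's placeholders), run from 3.test_cases/3.MPT:

make deploy-s3        # create the S3 bucket (scripts/deploy-s3.sh) used as the FSx for Lustre ImportPath
make deploy-vpc       # create the single-AZ VPC stack vpc-${NAME}
make deploy-pcluster  # render the cluster template with envsubst and run pcluster create-cluster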