-
Notifications
You must be signed in to change notification settings - Fork 92
/
distributed-training-p4de_batch-inference-g5_custom_ami.yaml
98 lines (94 loc) · 3.26 KB
/
distributed-training-p4de_batch-inference-g5_custom_ami.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
# Instance Metadata Service settings for the cluster.
Imds:
  ImdsSupport: v2.0
# Base operating system used for the cluster nodes.
Image:
  Os: ubuntu2004
# Head node: instance type, subnet, root volume and extra IAM policies.
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
    SubnetId: ${PUBLIC_SUBNET_ID}
  LocalStorage:
    RootVolume:
      Size: 500
      DeleteOnTermination: true  # that's your root and /home volume for users
  Iam:
    AdditionalIamPolicies:  # grant ECR, SSM and S3 read access
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
# Slurm scheduler settings plus two queues: a fixed-size p4de training queue
# and an elastic g5 inference queue.
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    ScaledownIdletime: 60
    CustomSlurmSettings:
      # Simple accounting to text file /home/slurm/slurm-job-completions.txt.
      #
      # Must be disabled should you prefer to set up Slurm accounting to a
      # database
      # (https://docs.aws.amazon.com/parallelcluster/latest/ug/slurm-accounting-v3.html).
      #
      # NOTE: JobCompType entry will be duplicated, hence will cause a harmless
      # warning message in `systemctl status --no-pager slurmctld`.
      - JobCompType: jobcomp/filetxt
      - JobCompLoc: /home/slurm/slurm-job-completions.txt
      - JobAcctGatherType: jobacct_gather/linux
  SlurmQueues:
    - Name: compute-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - ${PRIVATE_SUBNET_ID}
        PlacementGroup:
          Enabled: true
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            MountDir: /scratch  # each instance has a local scratch on NVMe
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
      # with targeted ODCRs. You can also use a capacity reservation resource
      # group and CapacityReservationResourceGroupArn if you want to group
      # multiple reservations.
      CapacityReservationTarget:
        CapacityReservationId: ${CAPACITY_RESERVATION_ID}
      JobExclusiveAllocation: true  # GenAI training likes to gobble all GPUs in an instance
      ComputeResources:
        - Name: distributed-ml
          InstanceType: p4de.24xlarge
          MinCount: 4  # if min = max then capacity is maintained and will
          MaxCount: 4  # not scale down
          Efa:
            Enabled: true
    - Name: inference-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - ${PRIVATE_SUBNET_ID}
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            MountDir: /scratch  # each instance has a local scratch on NVMe
          RootVolume:
            Size: 100
      ComputeResources:
        - Name: inference-ml
          InstanceType: g5.12xlarge
          MinCount: 0   # min < max: no capacity is kept while idle; nodes are
          MaxCount: 10  # launched on demand, up to MaxCount
# Shared FSx for Lustre filesystem mounted at /fsx.
SharedStorage:
  - MountDir: /fsx
    Name: fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      StorageCapacity: 4800  # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250  # this can be increased to 500 or 1000
      DeploymentType: PERSISTENT_2
# Monitoring: detailed instance metrics, CloudWatch logs and dashboards.
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true  # good for debug
  Dashboards:
    CloudWatch:
      Enabled: true  # provide basic dashboards