Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core][TPU] Support TPU V5 #3814

Merged
merged 7 commits into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 138 additions & 9 deletions sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,111 @@
,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d
,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
""")))

# TPU V5 is not visible in specific zones. We hardcode the missing zones here.
# NOTE(dev): Keep the zones and the df in sync.
# TODO(tian): Double check if the price is correct.
TPU_V5_MISSING_ZONES_DF = {
'europe-west4-b': pd.read_csv(
io.StringIO(
textwrap.dedent("""\
AcceleratorName,AcceleratorCount,Region,AvailabilityZone
tpu-v5p-8,1,europe-west4,europe-west4-b
tpu-v5p-16,1,europe-west4,europe-west4-b
tpu-v5p-32,1,europe-west4,europe-west4-b
tpu-v5p-64,1,europe-west4,europe-west4-b
tpu-v5p-128,1,europe-west4,europe-west4-b
tpu-v5p-256,1,europe-west4,europe-west4-b
tpu-v5p-384,1,europe-west4,europe-west4-b
tpu-v5p-512,1,europe-west4,europe-west4-b
tpu-v5p-640,1,europe-west4,europe-west4-b
tpu-v5p-768,1,europe-west4,europe-west4-b
tpu-v5p-896,1,europe-west4,europe-west4-b
tpu-v5p-1024,1,europe-west4,europe-west4-b
tpu-v5p-1152,1,europe-west4,europe-west4-b
tpu-v5p-1280,1,europe-west4,europe-west4-b
tpu-v5p-1408,1,europe-west4,europe-west4-b
tpu-v5p-1536,1,europe-west4,europe-west4-b
tpu-v5p-1664,1,europe-west4,europe-west4-b
tpu-v5p-1792,1,europe-west4,europe-west4-b
tpu-v5p-1920,1,europe-west4,europe-west4-b
tpu-v5p-2048,1,europe-west4,europe-west4-b
tpu-v5p-2176,1,europe-west4,europe-west4-b
tpu-v5p-2304,1,europe-west4,europe-west4-b
tpu-v5p-2432,1,europe-west4,europe-west4-b
tpu-v5p-2560,1,europe-west4,europe-west4-b
tpu-v5p-2688,1,europe-west4,europe-west4-b
tpu-v5p-2816,1,europe-west4,europe-west4-b
tpu-v5p-2944,1,europe-west4,europe-west4-b
tpu-v5p-3072,1,europe-west4,europe-west4-b
tpu-v5p-3200,1,europe-west4,europe-west4-b
tpu-v5p-3328,1,europe-west4,europe-west4-b
tpu-v5p-3456,1,europe-west4,europe-west4-b
tpu-v5p-3584,1,europe-west4,europe-west4-b
tpu-v5p-3712,1,europe-west4,europe-west4-b
tpu-v5p-3840,1,europe-west4,europe-west4-b
tpu-v5p-3968,1,europe-west4,europe-west4-b
tpu-v5p-4096,1,europe-west4,europe-west4-b
tpu-v5p-4224,1,europe-west4,europe-west4-b
tpu-v5p-4352,1,europe-west4,europe-west4-b
tpu-v5p-4480,1,europe-west4,europe-west4-b
tpu-v5p-4608,1,europe-west4,europe-west4-b
tpu-v5p-4736,1,europe-west4,europe-west4-b
tpu-v5p-4864,1,europe-west4,europe-west4-b
tpu-v5p-4992,1,europe-west4,europe-west4-b
tpu-v5p-5120,1,europe-west4,europe-west4-b
tpu-v5p-5248,1,europe-west4,europe-west4-b
tpu-v5p-5376,1,europe-west4,europe-west4-b
tpu-v5p-5504,1,europe-west4,europe-west4-b
tpu-v5p-5632,1,europe-west4,europe-west4-b
tpu-v5p-5760,1,europe-west4,europe-west4-b
tpu-v5p-5888,1,europe-west4,europe-west4-b
tpu-v5p-6016,1,europe-west4,europe-west4-b
tpu-v5p-6144,1,europe-west4,europe-west4-b
tpu-v5p-6272,1,europe-west4,europe-west4-b
tpu-v5p-6400,1,europe-west4,europe-west4-b
tpu-v5p-6528,1,europe-west4,europe-west4-b
tpu-v5p-6656,1,europe-west4,europe-west4-b
tpu-v5p-6784,1,europe-west4,europe-west4-b
tpu-v5p-6912,1,europe-west4,europe-west4-b
tpu-v5p-7040,1,europe-west4,europe-west4-b
tpu-v5p-7168,1,europe-west4,europe-west4-b
tpu-v5p-7296,1,europe-west4,europe-west4-b
tpu-v5p-7424,1,europe-west4,europe-west4-b
tpu-v5p-7552,1,europe-west4,europe-west4-b
tpu-v5p-7680,1,europe-west4,europe-west4-b
tpu-v5p-7808,1,europe-west4,europe-west4-b
tpu-v5p-7936,1,europe-west4,europe-west4-b
tpu-v5p-8064,1,europe-west4,europe-west4-b
tpu-v5p-8192,1,europe-west4,europe-west4-b
tpu-v5p-8320,1,europe-west4,europe-west4-b
tpu-v5p-8448,1,europe-west4,europe-west4-b
tpu-v5p-8704,1,europe-west4,europe-west4-b
tpu-v5p-8832,1,europe-west4,europe-west4-b
tpu-v5p-8960,1,europe-west4,europe-west4-b
tpu-v5p-9216,1,europe-west4,europe-west4-b
tpu-v5p-9472,1,europe-west4,europe-west4-b
tpu-v5p-9600,1,europe-west4,europe-west4-b
tpu-v5p-9728,1,europe-west4,europe-west4-b
tpu-v5p-9856,1,europe-west4,europe-west4-b
tpu-v5p-9984,1,europe-west4,europe-west4-b
tpu-v5p-10240,1,europe-west4,europe-west4-b
tpu-v5p-10368,1,europe-west4,europe-west4-b
tpu-v5p-10496,1,europe-west4,europe-west4-b
tpu-v5p-10752,1,europe-west4,europe-west4-b
tpu-v5p-10880,1,europe-west4,europe-west4-b
tpu-v5p-11008,1,europe-west4,europe-west4-b
tpu-v5p-11136,1,europe-west4,europe-west4-b
tpu-v5p-11264,1,europe-west4,europe-west4-b
tpu-v5p-11520,1,europe-west4,europe-west4-b
tpu-v5p-11648,1,europe-west4,europe-west4-b
tpu-v5p-11776,1,europe-west4,europe-west4-b
tpu-v5p-11904,1,europe-west4,europe-west4-b
tpu-v5p-12032,1,europe-west4,europe-west4-b
tpu-v5p-12160,1,europe-west4,europe-west4-b
tpu-v5p-12288,1,europe-west4,europe-west4-b
""")))
}
# FIXME(woosuk): Remove this once the bug is fixed.
# See https://github.com/skypilot-org/skypilot/issues/1759#issue-1619614345
TPU_V4_HOST_DF = pd.read_csv(
Expand Down Expand Up @@ -415,6 +520,9 @@ def get_gpu_price(row: pd.Series, spot: bool) -> Optional[float]:


def _get_tpu_for_zone(zone: str) -> 'pd.DataFrame':
# Use hardcoded TPU V5 data as it is invisible in some zones.
if zone in TPU_V5_MISSING_ZONES_DF:
return TPU_V5_MISSING_ZONES_DF[zone]
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
tpus = []
parent = f'projects/{project_id}/locations/{zone}'
tpus_request = tpu_client.projects().locations().acceleratorTypes().list(
Expand All @@ -432,9 +540,6 @@ def _get_tpu_for_zone(zone: str) -> 'pd.DataFrame':
new_tpus = []
for tpu in tpus:
tpu_name = tpu['type']
# skip tpu v5 as we currently don't support it
if 'v5' in tpu_name:
continue
new_tpus.append({
'AcceleratorName': f'tpu-{tpu_name}',
'AcceleratorCount': 1,
Expand All @@ -458,11 +563,22 @@ def _get_tpus() -> 'pd.DataFrame':


# TODO: the fetched TPUs fail to contain us-east1
def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
def get_tpu_df(gce_skus: List[Dict[str, Any]],
tpu_skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
df = _get_tpus()
if df.empty:
return df

def _get_tpu_str(tpu_version: str) -> str:
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
# TPU V5 has a different naming convention since it is contained in
# the GCE SKUs. v5p -> TpuV5p, v5litepod -> TpuV5e.
if tpu_version.startswith('v5'):
if tpu_version == 'v5p':
return 'TpuV5p'
assert tpu_version == 'v5litepod', tpu_version
return 'TpuV5e'
return f'Tpu-{tpu_version}'

def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]:
assert row['AcceleratorCount'] == 1, row
tpu_price = None
Expand All @@ -475,9 +591,13 @@ def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]:
# whether the TPU is a single device or a pod.
# For TPU-v4, the pricing is uniform, and thus the pricing API
# only provides the price of TPU-v4 pods.
is_pod = num_cores > 8 or tpu_version == 'v4'
# TODO(tian): Seems like there is no 'Pod' kw in the v5 description.
# Does that mean v5 only has TPU Node (instead of VM)? Another
# possibility is that the price shown for v5 TPU is for one core.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we confirm this TODO before merge?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah this is due to its pricing unit - per chip hour instead of the whole pod. TPU Node is deprecated by GCP now. Reference: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_architectures

Just updated the comment. Thanks for catching this!

is_pod = ((num_cores > 8 or tpu_version == 'v4') and
not tpu_version.startswith('v5'))

for sku in skus:
for sku in gce_skus + tpu_skus:
if tpu_region not in sku['serviceRegions']:
continue
description = sku['description']
Expand All @@ -489,7 +609,7 @@ def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]:
if 'Preemptible' in description:
continue

if f'Tpu-{tpu_version}' not in description:
if _get_tpu_str(tpu_version) not in description:
continue
if is_pod:
if 'Pod' not in description:
Expand All @@ -500,7 +620,15 @@ def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]:

unit_price = _get_unit_price(sku)
tpu_device_price = unit_price
tpu_core_price = tpu_device_price / 8
# v5p naming convention is v$VERSION_NUMBERp-$CORES_COUNT, while
# v5e is v$VERSION_NUMBER-$CHIP_COUNT. At the same time, the V5 price
# is shown as a per-chip price, and a chip is 2 cores for v5p and 1 core
# for v5e. Reference here:
# https://cloud.google.com/tpu/docs/v5p#using-accelerator-type
# https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config
core_per_sku = (1 if tpu_version == 'v5litepod' else
2 if tpu_version == 'v5p' else 8)
tpu_core_price = tpu_device_price / core_per_sku
tpu_price = num_cores * tpu_core_price
break

Expand Down Expand Up @@ -546,7 +674,8 @@ def get_catalog_df(region_prefix: str) -> 'pd.DataFrame':
region_prefix)] if not gpu_df.empty else gpu_df

gcp_tpu_skus = get_skus(TPU_SERVICE_ID)
tpu_df = get_tpu_df(gcp_tpu_skus)
# TPU V5 SKU is not included in the TPU SKUs but in the GCE SKUs.
tpu_df = get_tpu_df(gcp_skus, gcp_tpu_skus)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need a gcp_skus here? Can we add a comment for that?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TPU V5 is not in the tpu skus but in the gcp skus. Added a comment here. Thanks!


# Merge the dataframes.
df = pd.concat([vm_df, gpu_df, tpu_df, TPU_V4_HOST_DF])
Expand Down
15 changes: 11 additions & 4 deletions sky/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,10 +578,17 @@ def _set_accelerators(
'Cannot specify instance type'
f' (got "{self.instance_type}") for TPU VM.')
if 'runtime_version' not in accelerator_args:
if use_tpu_vm:
accelerator_args['runtime_version'] = 'tpu-vm-base'
else:
accelerator_args['runtime_version'] = '2.12.0'

def _get_default_runtime_version() -> str:
# Choose the runtime_version to fall back on when accelerator_args does
# not provide one. Reads `use_tpu_vm` and `acc` from the enclosing scope.
if not use_tpu_vm:
# `use_tpu_vm` is falsy: fall back to '2.12.0'.
return '2.12.0'
# TPU V5 requires a newer runtime version.
if acc.startswith('tpu-v5'):
return 'v2-alpha-tpuv5'
# TPU VM with any accelerator name not starting with 'tpu-v5'.
return 'tpu-vm-base'

accelerator_args['runtime_version'] = (
_get_default_runtime_version())
logger.info(
'Missing runtime_version in accelerator_args, using'
f' default ({accelerator_args["runtime_version"]})')
Expand Down
Loading