From 4f01a75d6e51be7dbecc5befc174d44d14e9c6bf Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sat, 26 Oct 2024 13:34:01 -0700 Subject: [PATCH] [Azure] Support fractional A10 instance types (#3877) * fix * change catalog to float gpu num * support print float point gpu in sky launch. TODO: test if the ray deployment group works for fractional one * fix unittest * format * patch ray resources to ceil value * support launch from --gpus A10 * only allow strictly match fractional gpu counts * address comment * change back condition * fix * apply suggestions from code review * fix * Update sky/backends/cloud_vm_ray_backend.py Co-authored-by: Zhanghao Wu * format * fix display of fuzzy candidates * fix precision issue * fix num gpu required * refactor in check_resources_fit_cluster * change type annotation of acc_count * enable fuzzy fp acc count * fix k8s * Update sky/clouds/service_catalog/common.py Co-authored-by: Zhanghao Wu * fix integer gpus * format --------- Co-authored-by: Zhanghao Wu --- sky/backends/cloud_vm_ray_backend.py | 15 +++++++++ sky/clouds/aws.py | 11 +++---- sky/clouds/azure.py | 10 +++--- sky/clouds/cloud.py | 18 +++++++---- sky/clouds/cudo.py | 11 +++---- sky/clouds/fluidstack.py | 11 +++---- sky/clouds/gcp.py | 4 +-- sky/clouds/ibm.py | 11 +++---- sky/clouds/kubernetes.py | 11 +++---- sky/clouds/lambda_cloud.py | 11 +++---- sky/clouds/oci.py | 11 +++---- sky/clouds/paperspace.py | 11 +++---- sky/clouds/runpod.py | 11 +++---- sky/clouds/scp.py | 11 +++---- sky/clouds/service_catalog/__init__.py | 2 +- sky/clouds/service_catalog/aws_catalog.py | 4 +-- sky/clouds/service_catalog/azure_catalog.py | 5 +-- sky/clouds/service_catalog/common.py | 21 ++++++++---- sky/clouds/service_catalog/cudo_catalog.py | 4 +-- .../data_fetchers/fetch_azure.py | 32 ++++++++++++------- .../service_catalog/fluidstack_catalog.py | 4 +-- sky/clouds/service_catalog/ibm_catalog.py | 4 +-- sky/clouds/service_catalog/lambda_catalog.py | 4 +-- 
sky/clouds/service_catalog/oci_catalog.py | 4 +-- .../service_catalog/paperspace_catalog.py | 4 +-- sky/clouds/service_catalog/runpod_catalog.py | 4 +-- sky/clouds/service_catalog/scp_catalog.py | 4 +-- sky/clouds/service_catalog/vsphere_catalog.py | 4 +-- sky/clouds/vsphere.py | 11 +++---- sky/resources.py | 2 +- sky/utils/resources_utils.py | 14 +++++++- 31 files changed, 150 insertions(+), 134 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index f0fb4d97ba1..918848b045b 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2713,6 +2713,21 @@ def check_resources_fit_cluster( f' Existing:\t{handle.launched_nodes}x ' f'{handle.launched_resources}\n' f'{mismatch_str}') + else: + # For fractional acc count clusters, we round up the number of accs + # to 1 (sky/utils/resources_utils.py::make_ray_custom_resources_str) + # Here we scale the required acc count to (required / launched) * 1 + # so the total number of accs is the same as the requested number. 
+ launched_accs = launched_resources.accelerators + if (launched_accs is not None and + valid_resource.accelerators is not None): + for _, count in launched_accs.items(): + if isinstance(count, float) and not count.is_integer(): + valid_resource = valid_resource.copy( + accelerators={ + k: v / count + for k, v in valid_resource.accelerators.items() + }) return valid_resource def _provision( diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index a0962b17cac..43062ebf393 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -2,13 +2,12 @@ import enum import fnmatch import functools -import json import os import re import subprocess import time import typing -from typing import Any, Dict, Iterator, List, Optional, Set, Tuple +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union from sky import clouds from sky import exceptions @@ -383,7 +382,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='aws') @@ -411,10 +410,8 @@ def make_deploy_resources_variables( r = resources # r.accelerators is cleared but .instance_type encodes the info. 
acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) if r.extract_docker_image() is not None: image_id_to_use = None diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 0852c993ed3..fc9579d17c0 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -1,12 +1,11 @@ """Azure.""" import functools -import json import os import re import subprocess import textwrap import typing -from typing import Any, Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union import colorama @@ -272,7 +271,7 @@ def zones_provision_loop( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='azure') @@ -304,10 +303,9 @@ def make_deploy_resources_variables( acc_dict = self.get_accelerators_from_instance_type(r.instance_type) acc_count = None if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) acc_count = str(sum(acc_dict.values())) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) if (resources.image_id is None or resources.extract_docker_image() is not None): diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 3e21204f0a3..4028c1fef59 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -9,8 +9,9 @@ """ import collections import enum +import math import typing -from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple +from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union from sky import exceptions from sky import skypilot_config @@ -306,7 +307,7 @@ def 
get_vcpus_mem_from_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: """Returns {acc: acc_count} held by 'instance_type', if any.""" raise NotImplementedError @@ -673,8 +674,9 @@ def _check_instance_type_accelerators_combination( assert resources.is_launchable(), resources def _equal_accelerators( - acc_requested: Optional[Dict[str, int]], - acc_from_instance_type: Optional[Dict[str, int]]) -> bool: + acc_requested: Optional[Dict[str, Union[int, float]]], + acc_from_instance_type: Optional[Dict[str, Union[int, + float]]]) -> bool: """Check the requested accelerators equals to the instance type Check the requested accelerators equals to the accelerators @@ -689,12 +691,14 @@ def _equal_accelerators( for acc in acc_requested: if acc not in acc_from_instance_type: return False - if acc_requested[acc] != acc_from_instance_type[acc]: + # Avoid float point precision issue. + if not math.isclose(acc_requested[acc], + acc_from_instance_type[acc]): return False return True - acc_from_instance_type = (cls.get_accelerators_from_instance_type( - resources.instance_type)) + acc_from_instance_type = cls.get_accelerators_from_instance_type( + resources.instance_type) if not _equal_accelerators(resources.accelerators, acc_from_instance_type): with ux_utils.print_exception_no_traceback(): diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index 4dca442fa01..6f02e007049 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -1,8 +1,7 @@ """Cudo Compute""" -import json import subprocess import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union from sky import clouds from sky.clouds import service_catalog @@ -183,7 +182,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, 
float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='cudo') @@ -202,10 +201,8 @@ def make_deploy_resources_variables( del zones, cluster_name # unused r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) return { 'instance_type': resources.instance_type, diff --git a/sky/clouds/fluidstack.py b/sky/clouds/fluidstack.py index 473fceabbe3..31e2112f8f7 100644 --- a/sky/clouds/fluidstack.py +++ b/sky/clouds/fluidstack.py @@ -1,8 +1,7 @@ """Fluidstack Cloud.""" -import json import os import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union import requests @@ -155,7 +154,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='fluidstack') @@ -184,10 +183,8 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) return { 'instance_type': resources.instance_type, diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 1b70abf914d..0e20fdc9789 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -7,7 +7,7 @@ import subprocess import time import typing -from typing import Any, Dict, Iterator, List, Optional, Set, Tuple +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union import colorama @@ -669,7 +669,7 @@ def 
_get_feasible_launchable_resources( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: # GCP handles accelerators separately from regular instance types, # hence return none here. return None diff --git a/sky/clouds/ibm.py b/sky/clouds/ibm.py index b78cc4287c0..0ac3c36cc48 100644 --- a/sky/clouds/ibm.py +++ b/sky/clouds/ibm.py @@ -1,8 +1,7 @@ """IBM Web Services.""" -import json import os import typing -from typing import Any, Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union import colorama @@ -206,10 +205,8 @@ def _get_profile_resources(instance_profile): 'IBM does not currently support spot instances in this framework' acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) instance_resources = _get_profile_resources(r.instance_type) @@ -247,7 +244,7 @@ def get_vcpus_mem_from_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: """Returns {acc: acc_count} held by 'instance_type', if any.""" return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='ibm') diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 8ff4172a5b1..39ddbe30577 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -1,10 +1,9 @@ """Kubernetes.""" import functools -import json import os import re import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union from sky import clouds from sky import sky_logging @@ -271,7 +270,7 @@ def get_default_instance_type( def 
get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: inst = kubernetes_utils.KubernetesInstanceType.from_instance_type( instance_type) return { @@ -328,10 +327,8 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) # resources.memory and cpus are None if they are not explicitly set. # We fetch the default values for the instance type in that case. diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py index 0201f4f76ad..055a5338750 100644 --- a/sky/clouds/lambda_cloud.py +++ b/sky/clouds/lambda_cloud.py @@ -1,7 +1,6 @@ """Lambda Cloud.""" -import json import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union import requests @@ -136,7 +135,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='lambda') @@ -164,10 +163,8 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) resources_vars = { 'instance_type': resources.instance_type, diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index 0feda467bbf..93a70c5ac37 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -20,11 +20,10 @@ - Hysun He (hysun.he@oracle.com) @ Oct 13, 2024: 
Support more OS types additional to ubuntu for OCI resources. """ -import json import logging import os import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union from sky import clouds from sky import exceptions @@ -193,7 +192,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='oci') @@ -213,10 +212,8 @@ def make_deploy_resources_variables( acc_dict = self.get_accelerators_from_instance_type( resources.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) image_str = self._get_image_id(resources.image_id, region.name, resources.instance_type) diff --git a/sky/clouds/paperspace.py b/sky/clouds/paperspace.py index 4c4fa1d695a..4047a2f5926 100644 --- a/sky/clouds/paperspace.py +++ b/sky/clouds/paperspace.py @@ -1,8 +1,7 @@ """ Paperspace Cloud. 
""" -import json import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union import requests @@ -162,7 +161,7 @@ def get_default_instance_type( @classmethod def get_accelerators_from_instance_type( - cls, instance_type: str) -> Optional[Dict[str, int]]: + cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='paperspace') @@ -181,10 +180,8 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) return { 'instance_type': resources.instance_type, diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index 6cfdf11c6b4..0d693fd9f60 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -1,8 +1,7 @@ """ RunPod Cloud. 
""" -import json import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union from sky import clouds from sky.clouds import service_catalog @@ -147,7 +146,7 @@ def get_default_instance_type( @classmethod def get_accelerators_from_instance_type( - cls, instance_type: str) -> Optional[Dict[str, int]]: + cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='runpod') @@ -166,10 +165,8 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) if r.image_id is None: image_id = 'runpod/base:0.0.2' diff --git a/sky/clouds/scp.py b/sky/clouds/scp.py index 17a54ce1607..d0ad611bf0c 100644 --- a/sky/clouds/scp.py +++ b/sky/clouds/scp.py @@ -4,9 +4,8 @@ to access the SCP catalog and check credentials for the SCP access. 
""" -import json import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union from sky import clouds from sky import exceptions @@ -160,7 +159,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='scp') @@ -188,11 +187,9 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None image_id = self._get_image_id(r.image_id, region.name, r.instance_type) return { 'instance_type': resources.instance_type, diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index f2301bac466..4deab8ac204 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -238,7 +238,7 @@ def get_default_instance_type(cpus: Optional[str] = None, def get_accelerators_from_instance_type( instance_type: str, - clouds: CloudFilter = None) -> Optional[Dict[str, int]]: + clouds: CloudFilter = None) -> Optional[Dict[str, Union[int, float]]]: """Returns the accelerators from a instance type.""" return _map_clouds_catalog(clouds, 'get_accelerators_from_instance_type', instance_type) diff --git a/sky/clouds/service_catalog/aws_catalog.py b/sky/clouds/service_catalog/aws_catalog.py index d156135047b..918a4070414 100644 --- a/sky/clouds/service_catalog/aws_catalog.py +++ b/sky/clouds/service_catalog/aws_catalog.py @@ -8,7 +8,7 @@ import os import threading import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky 
import exceptions from sky import sky_logging @@ -243,7 +243,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl( _get_df(), instance_type) diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index 867141f7899..62cb422bf83 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -4,7 +4,7 @@ instance types and pricing information for Azure. """ import re -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky import clouds as cloud_lib from sky import sky_logging @@ -137,7 +137,7 @@ def _filter_disk_type(instance_type: str) -> bool: def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) @@ -157,6 +157,7 @@ def get_instance_type_for_accelerator( if zone is not None: with ux_utils.print_exception_no_traceback(): raise ValueError('Azure does not support zones.') + return common.get_instance_type_for_accelerator_impl(df=_df, acc_name=acc_name, acc_count=acc_count, diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index 4df72824027..1082b4e9efd 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -5,7 +5,7 @@ import os import time import typing -from typing import Callable, Dict, List, NamedTuple, Optional, Tuple +from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union import filelock import requests @@ -481,7 +481,7 @@ def get_instance_type_for_cpus_mem_impl( def get_accelerators_from_instance_type_impl( df: 'pd.DataFrame', instance_type: str, -) 
-> Optional[Dict[str, int]]: +) -> Optional[Dict[str, Union[int, float]]]: df = _get_instance_type(df, instance_type, None) if len(df) == 0: with ux_utils.print_exception_no_traceback(): @@ -490,13 +490,19 @@ def get_accelerators_from_instance_type_impl( acc_name, acc_count = row['AcceleratorName'], row['AcceleratorCount'] if pd.isnull(acc_name): return None - return {acc_name: int(acc_count)} + + def _convert(value): + if int(value) == value: + return int(value) + return float(value) + + return {acc_name: _convert(acc_count)} def get_instance_type_for_accelerator_impl( df: 'pd.DataFrame', acc_name: str, - acc_count: int, + acc_count: Union[int, float], cpus: Optional[str] = None, memory: Optional[str] = None, use_spot: bool = False, @@ -509,7 +515,7 @@ def get_instance_type_for_accelerator_impl( accelerators with sorted prices and a list of candidates with fuzzy search. """ result = df[(df['AcceleratorName'].str.fullmatch(acc_name, case=False)) & - (df['AcceleratorCount'] == acc_count)] + (abs(df['AcceleratorCount'] - acc_count) <= 0.01)] result = _filter_region_zone(result, region, zone) if len(result) == 0: fuzzy_result = df[ @@ -522,8 +528,11 @@ def get_instance_type_for_accelerator_impl( fuzzy_candidate_list = [] if len(fuzzy_result) > 0: for _, row in fuzzy_result.iterrows(): + acc_cnt = float(row['AcceleratorCount']) + acc_count_display = (int(acc_cnt) if acc_cnt.is_integer() else + f'{acc_cnt:.2f}') fuzzy_candidate_list.append(f'{row["AcceleratorName"]}:' - f'{int(row["AcceleratorCount"])}') + f'{acc_count_display}') return (None, fuzzy_candidate_list) result = _filter_with_cpus(result, cpus) diff --git a/sky/clouds/service_catalog/cudo_catalog.py b/sky/clouds/service_catalog/cudo_catalog.py index 62832cba5bf..d4adc5baea5 100644 --- a/sky/clouds/service_catalog/cudo_catalog.py +++ b/sky/clouds/service_catalog/cudo_catalog.py @@ -1,7 +1,7 @@ """Cudo Compute Offerings Catalog.""" import typing -from typing import Dict, List, Optional, Tuple +from typing 
import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common import sky.provision.cudo.cudo_machine_type as cudo_mt @@ -66,7 +66,7 @@ def get_default_instance_type(cpus: Optional[str] = None, def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index bbd337e23aa..f646cac339a 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -93,14 +93,15 @@ def get_regions() -> List[str]: # We have to manually remove it. DEPRECATED_FAMILIES = ['standardNVSv2Family'] -# Some A10 instance types only contains a fractional of GPU. We temporarily -# filter them out here to avoid using it as a whole A10 GPU. -# TODO(zhwu,tian): support fractional GPUs, which can be done on -# kubernetes as well. +# Azure has fractional A10 instance types, which the API response still reports +# as having 1 A10 GPU. We manually change the number of GPUs to a float here. # Ref: https://learn.microsoft.com/en-us/azure/virtual-machines/nva10v5-series -FILTERED_A10_INSTANCE_TYPES = [ - f'Standard_NV{vcpu}ads_A10_v5' for vcpu in [6, 12, 18] -] +# TODO(zhwu,tian): Support fractional GPUs on k8s as well. +# TODO(tian): Maybe we should support a literal fractional count, e.g. A10:1/6, +# instead of a floating-point count (A10:0.167). 
+AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS = { + f'Standard_NV{vcpu}ads_A10_v5': round(vcpu / 36, 3) for vcpu in [6, 12, 18] +} USEFUL_COLUMNS = [ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', @@ -274,6 +275,19 @@ def get_additional_columns(row): axis='columns', ) + def _upd_a10_gpu_count(row): + new_gpu_cnt = AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS.get( + row['InstanceType']) + if new_gpu_cnt is not None: + return new_gpu_cnt + return row['AcceleratorCount'] + + # Manually update the GPU count for fractional A10 instance types. + # Those instance types have fractional GPU count, but Azure API returns + # 1 GPU count for them. We manually update the GPU count here. + df_ret['AcceleratorCount'] = df_ret.apply(_upd_a10_gpu_count, + axis='columns') + # As of Dec 2023, a few H100 instance types fetched from Azure APIs do not # have pricing: # @@ -299,10 +313,6 @@ def get_additional_columns(row): after_drop_len = len(df_ret) print(f'Dropped {before_drop_len - after_drop_len} duplicated rows') - # Filter out instance types that only contain a fractional of GPU. - df_ret = df_ret.loc[~df_ret['InstanceType'].isin(FILTERED_A10_INSTANCE_TYPES - )] - # Filter out deprecated families df_ret = df_ret.loc[~df_ret['family'].isin(DEPRECATED_FAMILIES)] df_ret = df_ret[USEFUL_COLUMNS] diff --git a/sky/clouds/service_catalog/fluidstack_catalog.py b/sky/clouds/service_catalog/fluidstack_catalog.py index 2f47a38df43..7a28ac8174a 100644 --- a/sky/clouds/service_catalog/fluidstack_catalog.py +++ b/sky/clouds/service_catalog/fluidstack_catalog.py @@ -4,7 +4,7 @@ instance types and pricing information for FluidStack. 
""" import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common from sky.utils import ux_utils @@ -65,7 +65,7 @@ def get_default_instance_type(cpus: Optional[str] = None, def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/ibm_catalog.py b/sky/clouds/service_catalog/ibm_catalog.py index 51b4e14f569..5cec86fbb65 100644 --- a/sky/clouds/service_catalog/ibm_catalog.py +++ b/sky/clouds/service_catalog/ibm_catalog.py @@ -4,7 +4,7 @@ instance types and pricing information for IBM. """ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky import sky_logging from sky.adaptors import ibm @@ -43,7 +43,7 @@ def get_vcpus_mem_from_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/lambda_catalog.py b/sky/clouds/service_catalog/lambda_catalog.py index e843ab72cc0..24cb4064d54 100644 --- a/sky/clouds/service_catalog/lambda_catalog.py +++ b/sky/clouds/service_catalog/lambda_catalog.py @@ -4,7 +4,7 @@ instance types and pricing information for Lambda. 
""" import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common from sky.utils import resources_utils @@ -72,7 +72,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/oci_catalog.py b/sky/clouds/service_catalog/oci_catalog.py index 47d0489f6ab..c8e475df871 100644 --- a/sky/clouds/service_catalog/oci_catalog.py +++ b/sky/clouds/service_catalog/oci_catalog.py @@ -14,7 +14,7 @@ import logging import threading import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.adaptors import oci as oci_adaptor from sky.clouds import OCI @@ -131,7 +131,7 @@ def _filter_disk_type(instance_type: str) -> bool: def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl( _get_df(), instance_type) diff --git a/sky/clouds/service_catalog/paperspace_catalog.py b/sky/clouds/service_catalog/paperspace_catalog.py index 1eb635c93e5..49948b219a1 100644 --- a/sky/clouds/service_catalog/paperspace_catalog.py +++ b/sky/clouds/service_catalog/paperspace_catalog.py @@ -5,7 +5,7 @@ """ import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common from sky.utils import ux_utils @@ -60,7 +60,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return 
common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/runpod_catalog.py b/sky/clouds/service_catalog/runpod_catalog.py index 2d3ed44307b..7fbc46206ed 100644 --- a/sky/clouds/service_catalog/runpod_catalog.py +++ b/sky/clouds/service_catalog/runpod_catalog.py @@ -5,7 +5,7 @@ """ import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common from sky.utils import ux_utils @@ -56,7 +56,7 @@ def get_default_instance_type(cpus: Optional[str] = None, def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/scp_catalog.py b/sky/clouds/service_catalog/scp_catalog.py index 209bb4cf631..e4773ab3250 100644 --- a/sky/clouds/service_catalog/scp_catalog.py +++ b/sky/clouds/service_catalog/scp_catalog.py @@ -5,7 +5,7 @@ """ import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common from sky.utils import resources_utils @@ -67,7 +67,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/vsphere_catalog.py b/sky/clouds/service_catalog/vsphere_catalog.py index e1199d3d266..74fb2fbe60d 100644 --- a/sky/clouds/service_catalog/vsphere_catalog.py +++ b/sky/clouds/service_catalog/vsphere_catalog.py @@ -2,7 +2,7 @@ import io import os import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.adaptors import 
common as adaptors_common from sky.clouds.service_catalog import common @@ -85,7 +85,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl( _get_df(), instance_type) diff --git a/sky/clouds/vsphere.py b/sky/clouds/vsphere.py index 7cf56b46a8d..88d5df3232a 100644 --- a/sky/clouds/vsphere.py +++ b/sky/clouds/vsphere.py @@ -1,8 +1,7 @@ """Vsphere cloud implementation.""" -import json import subprocess import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union import requests @@ -152,7 +151,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds=_CLOUD_VSPHERE) @@ -182,10 +181,8 @@ def make_deploy_resources_variables( zone_names = [zone.name for zone in zones] r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) return { 'instance_type': resources.instance_type, diff --git a/sky/resources.py b/sky/resources.py index 540cbfb703c..164ef312ba1 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -392,7 +392,7 @@ def memory(self) -> Optional[str]: @property @functools.lru_cache(maxsize=1) - def accelerators(self) -> Optional[Dict[str, int]]: + def accelerators(self) -> Optional[Dict[str, Union[int, float]]]: """Returns the accelerators field directly or by inferring. 
For example, Resources(AWS, 'p3.2xlarge') has its accelerators field diff --git a/sky/utils/resources_utils.py b/sky/utils/resources_utils.py index 72aa5ac05d3..653bb109ac0 100644 --- a/sky/utils/resources_utils.py +++ b/sky/utils/resources_utils.py @@ -2,9 +2,11 @@ import dataclasses import enum import itertools +import json +import math import re import typing -from typing import List, Optional, Set +from typing import Dict, List, Optional, Set, Union from sky import skypilot_config from sky.clouds import cloud_registry @@ -163,6 +165,16 @@ def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle', return _DEFAULT_MESSAGE_HANDLE_INITIALIZING +def make_ray_custom_resources_str( + resource_dict: Optional[Dict[str, Union[int, float]]]) -> Optional[str]: + """Convert resources to Ray custom resources format.""" + if resource_dict is None: + return None + # Ray does not allow fractional resources, so we need to ceil the values. + ceiled_dict = {k: math.ceil(v) for k, v in resource_dict.items()} + return json.dumps(ceiled_dict, separators=(',', ':')) + + @dataclasses.dataclass class FeasibleResources: """Feasible resources returned by cloud.