Skip to content

Commit

Permalink
[Azure] Support fractional A10 instance types (#3877)
Browse files Browse the repository at this point in the history
* fix

* change catalog to float gpu num

* support printing floating-point gpu counts in sky launch. TODO: test if the ray deployment group works for fractional ones

* fix unittest

* format

* patch ray resources to ceil value

* support launch from --gpus A10

* only allow strictly matching fractional gpu counts

* address comment

* change back condition

* fix

* apply suggestions from code review

* fix

* Update sky/backends/cloud_vm_ray_backend.py

Co-authored-by: Zhanghao Wu <[email protected]>

* format

* fix display of fuzzy candidates

* fix precision issue

* fix num gpu required

* refactor in check_resources_fit_cluster

* change type annotation of acc_count

* enable fuzzy fp acc count

* fix k8s

* Update sky/clouds/service_catalog/common.py

Co-authored-by: Zhanghao Wu <[email protected]>

* fix integer gpus

* format

---------

Co-authored-by: Zhanghao Wu <[email protected]>
  • Loading branch information
cblmemo and Michaelvll authored Oct 26, 2024
1 parent 0e915d3 commit 647fcea
Show file tree
Hide file tree
Showing 31 changed files with 150 additions and 134 deletions.
15 changes: 15 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2713,6 +2713,21 @@ def check_resources_fit_cluster(
f' Existing:\t{handle.launched_nodes}x '
f'{handle.launched_resources}\n'
f'{mismatch_str}')
else:
# For fractional acc count clusters, we round up the number of accs
# to 1 (sky/utils/resources_utils.py::make_ray_custom_resources_str).
# Here we scale the required acc count to (required / launched) * 1
# so the total number of accs is the same as the requested number.
launched_accs = launched_resources.accelerators
if (launched_accs is not None and
valid_resource.accelerators is not None):
for _, count in launched_accs.items():
if isinstance(count, float) and not count.is_integer():
valid_resource = valid_resource.copy(
accelerators={
k: v / count
for k, v in valid_resource.accelerators.items()
})
return valid_resource

def _provision(
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
import enum
import fnmatch
import functools
import json
import os
import re
import subprocess
import time
import typing
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union

from sky import clouds
from sky import exceptions
Expand Down Expand Up @@ -383,7 +382,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='aws')

Expand Down Expand Up @@ -411,10 +410,8 @@ def make_deploy_resources_variables(
r = resources
# r.accelerators is cleared but .instance_type encodes the info.
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

if r.extract_docker_image() is not None:
image_id_to_use = None
Expand Down
10 changes: 4 additions & 6 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
"""Azure."""
import functools
import json
import os
import re
import subprocess
import textwrap
import typing
from typing import Any, Dict, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import colorama

Expand Down Expand Up @@ -272,7 +271,7 @@ def zones_provision_loop(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='azure')

Expand Down Expand Up @@ -304,10 +303,9 @@ def make_deploy_resources_variables(
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
acc_count = None
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
acc_count = str(sum(acc_dict.values()))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

if (resources.image_id is None or
resources.extract_docker_image() is not None):
Expand Down
18 changes: 11 additions & 7 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
"""
import collections
import enum
import math
import typing
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

from sky import exceptions
from sky import skypilot_config
Expand Down Expand Up @@ -306,7 +307,7 @@ def get_vcpus_mem_from_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
"""Returns {acc: acc_count} held by 'instance_type', if any."""
raise NotImplementedError

Expand Down Expand Up @@ -673,8 +674,9 @@ def _check_instance_type_accelerators_combination(
assert resources.is_launchable(), resources

def _equal_accelerators(
acc_requested: Optional[Dict[str, int]],
acc_from_instance_type: Optional[Dict[str, int]]) -> bool:
acc_requested: Optional[Dict[str, Union[int, float]]],
acc_from_instance_type: Optional[Dict[str, Union[int,
float]]]) -> bool:
"""Check the requested accelerators equals to the instance type
Check the requested accelerators equals to the accelerators
Expand All @@ -689,12 +691,14 @@ def _equal_accelerators(
for acc in acc_requested:
if acc not in acc_from_instance_type:
return False
if acc_requested[acc] != acc_from_instance_type[acc]:
# Avoid floating-point precision issues.
if not math.isclose(acc_requested[acc],
acc_from_instance_type[acc]):
return False
return True

acc_from_instance_type = (cls.get_accelerators_from_instance_type(
resources.instance_type))
acc_from_instance_type = cls.get_accelerators_from_instance_type(
resources.instance_type)
if not _equal_accelerators(resources.accelerators,
acc_from_instance_type):
with ux_utils.print_exception_no_traceback():
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/cudo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""Cudo Compute"""
import json
import subprocess
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky.clouds import service_catalog
Expand Down Expand Up @@ -183,7 +182,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='cudo')

Expand All @@ -202,10 +201,8 @@ def make_deploy_resources_variables(
del zones, cluster_name # unused
r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/fluidstack.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""Fluidstack Cloud."""
import json
import os
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -155,7 +154,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='fluidstack')

Expand Down Expand Up @@ -184,10 +183,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
4 changes: 2 additions & 2 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import subprocess
import time
import typing
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union

import colorama

Expand Down Expand Up @@ -669,7 +669,7 @@ def _get_feasible_launchable_resources(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
# GCP handles accelerators separately from regular instance types,
# hence return none here.
return None
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/ibm.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""IBM Web Services."""
import json
import os
import typing
from typing import Any, Dict, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import colorama

Expand Down Expand Up @@ -206,10 +205,8 @@ def _get_profile_resources(instance_profile):
'IBM does not currently support spot instances in this framework'

acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

instance_resources = _get_profile_resources(r.instance_type)

Expand Down Expand Up @@ -247,7 +244,7 @@ def get_vcpus_mem_from_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
"""Returns {acc: acc_count} held by 'instance_type', if any."""
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='ibm')
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
"""Kubernetes."""
import functools
import json
import os
import re
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky import sky_logging
Expand Down Expand Up @@ -271,7 +270,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
inst = kubernetes_utils.KubernetesInstanceType.from_instance_type(
instance_type)
return {
Expand Down Expand Up @@ -328,10 +327,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

# resources.memory and cpus are None if they are not explicitly set.
# We fetch the default values for the instance type in that case.
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/lambda_cloud.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Lambda Cloud."""
import json
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -136,7 +135,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='lambda')

Expand Down Expand Up @@ -164,10 +163,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

resources_vars = {
'instance_type': resources.instance_type,
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/oci.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,10 @@
- Hysun He ([email protected]) @ Oct 13, 2024:
Support more OS types additional to ubuntu for OCI resources.
"""
import json
import logging
import os
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky import exceptions
Expand Down Expand Up @@ -193,7 +192,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='oci')

Expand All @@ -213,10 +212,8 @@ def make_deploy_resources_variables(

acc_dict = self.get_accelerators_from_instance_type(
resources.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

image_str = self._get_image_id(resources.image_id, region.name,
resources.instance_type)
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/paperspace.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
""" Paperspace Cloud. """

import json
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -162,7 +161,7 @@ def get_default_instance_type(

@classmethod
def get_accelerators_from_instance_type(
cls, instance_type: str) -> Optional[Dict[str, int]]:
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='paperspace')

Expand All @@ -181,10 +180,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
Loading

0 comments on commit 647fcea

Please sign in to comment.