List hosts #332

Merged
merged 22 commits on May 31, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
# Unreleased

### Fixed
- List hosts (Issue [#331](https://github.com/fabric-testbed/fabrictestbed-extensions/issues/331))
- AL2S Support (Issue [#325](https://github.com/fabric-testbed/fabrictestbed-extensions/issues/325))
- Deny infeasible slices (Issue [#326](https://github.com/fabric-testbed/fabrictestbed-extensions/issues/326))
- Add display of switch port name to network service table listing (Issue [#152](https://github.com/fabric-testbed/fabrictestbed-extensions/issues/152))
24 changes: 24 additions & 0 deletions fabrictestbed_extensions/fablib/constants.py
@@ -159,3 +159,27 @@ class Constants:
SSH_KEYS = "sshkeys"
EXPIRES_ON = "expires_on"
LEASE_TIME_FORMAT = "%Y-%m-%d %H:%M:%S %z"

NON_PRETTY_NAME = "non_pretty_name"
PRETTY_NAME = "pretty_name"
HEADER_NAME = "header_name"
AVAILABLE = "Available"
CAPACITY = "Capacity"
ALLOCATED = "Allocated"
VALUE = "value"

NIC_SHARED_CONNECTX_6 = "SharedNIC-ConnectX-6"
SMART_NIC_CONNECTX_6 = "SmartNIC-ConnectX-6"
SMART_NIC_CONNECTX_5 = "SmartNIC-ConnectX-5"
NVME_P4510 = "NVME-P4510"
GPU_TESLA_T4 = "GPU-Tesla T4"
GPU_RTX6000 = "GPU-RTX6000"
GPU_A30 = "GPU-A30"
GPU_A40 = "GPU-A40"
FPGA_XILINX_U280 = "FPGA-Xilinx-U280"
CORES = "Cores"
RAM = "Ram"
DISK = "Disk"
CPUS = "CPUs"
HOSTS = "Hosts"
P4_SWITCH = "P4-Switch"
176 changes: 118 additions & 58 deletions fabrictestbed_extensions/fablib/fablib.py
@@ -72,6 +72,8 @@
import traceback
import warnings

from fabrictestbed_extensions.fablib.site import Host, Site

warnings.filterwarnings("always", category=DeprecationWarning)

from concurrent.futures import ThreadPoolExecutor
@@ -143,6 +145,16 @@ def list_sites(latlon: bool = True) -> object:
"""
return fablib.get_default_fablib_manager().list_sites(latlon=latlon)

@staticmethod
def list_hosts() -> object:
"""
Get a string used to print a tabular list of hosts with state

:return: tabulated string of host state
:rtype: str
"""
return fablib.get_default_fablib_manager().list_hosts()

@staticmethod
def list_links() -> object:
"""
@@ -1137,6 +1149,78 @@ def list_sites(
latlon=latlon,
)

def list_hosts(
self,
output: str = None,
fields: str = None,
quiet: bool = False,
filter_function=None,
update: bool = True,
pretty_names: bool = True,
force_refresh: bool = False,
start: datetime = None,
end: datetime = None,
avoid: List[str] = None,
includes: List[str] = None,
) -> object:
"""
Lists all the hosts and their attributes.

There are several output options: "text", "pandas", and "json" that determine the format of the
output that is returned and (optionally) displayed/printed.

output: 'text': string formatted with tabular
'pandas': pandas dataframe
'json': string in json format

fields: list of fields (table columns) to show; json output will include all available fields/columns.

Example: fields=['Name','ConnectX-5 Available', 'NVMe Total']

filter_function: A lambda function to filter data by field values.

Example: filter_function=lambda s: s['ConnectX-5 Available'] > 3 and s['NVMe Available'] <= 10

:param output: output format
:type output: str
:param fields: list of fields (table columns) to show
:type fields: List[str]
:param quiet: True to suppress printing/display of the output
:type quiet: bool
:param filter_function: lambda function used to filter hosts by column values
:type filter_function: lambda
:param update: True to update the cached resource information before listing
:type update: bool
:param pretty_names: True to use pretty names for table column headers
:type pretty_names: bool
:param force_refresh: True to force a refresh of the resource information
:type force_refresh: bool
:param start: start time in UTC format: %Y-%m-%d %H:%M:%S %z
:type start: datetime
:param end: end time in UTC format: %Y-%m-%d %H:%M:%S %z
:type end: datetime
:param avoid: list of sites to avoid
:type avoid: List[str]
:param includes: list of sites to include
:type includes: List[str]
:return: table in format specified by output parameter
:rtype: object

"""
return self.get_resources(
update=update,
force_refresh=force_refresh,
start=start,
end=end,
avoid=avoid,
includes=includes,
).list_hosts(
output=output,
fields=fields,
quiet=quiet,
filter_function=filter_function,
pretty_names=pretty_names,
)

def list_links(
self,
output: str = None,
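
A minimal usage sketch for the new listing call, assuming a default FablibManager configuration; the column names passed to fields and used in the filter are illustrative guesses at the host-table headers, not confirmed names:

from fabrictestbed_extensions.fablib.fablib import FablibManager

fablib = FablibManager()

# Print the full host table (printing is suppressed when quiet=True).
fablib.list_hosts()

# Return a pandas dataframe restricted to a few columns and filtered by
# availability; the column names here are assumptions for illustration.
hosts_df = fablib.list_hosts(
    output="pandas",
    fields=["Name", "Cores Available", "RAM Available", "Disk Available"],
    filter_function=lambda h: h["Cores Available"] > 8,
    quiet=True,
)

The module-level fablib class exposes the same operation as a static shortcut, fablib.list_hosts(), which simply delegates to the default FablibManager instance.
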
@@ -2284,86 +2368,62 @@ def create_show_table(data, fields=None, pretty_names_dict={}):
return table

@staticmethod
def __can_allocate_node_in_worker(
worker: FimNode, node: Node, allocated: dict, site: FimNode
def __can_allocate_node_in_host(
host: Host, node: Node, allocated: dict, site: Site
) -> Tuple[bool, str]:
"""
Check if a node can be provisioned on a worker node on a site w.r.t available resources on that site
Check if a node can be provisioned on a host node on a site w.r.t available resources on that site

:return: Tuple indicating status for validation and error message in case of failure
:rtype: Tuple[bool, str]
"""
if worker is None or site is None:
if host is None or site is None:
return (
True,
f"Ignoring validation: Worker: {worker}, Site: {site} not available.",
f"Ignoring validation: Host: {host}, Site: {site} not available.",
)

msg = f"Node can be allocated on the host: {worker.name}."
msg = f"Node can be allocated on the host: {host.get_name()}."

worker_maint_info = site.maintenance_info.get(worker.name)
if worker_maint_info and str(worker_maint_info.state) != "Active":
msg = f"Node cannot be allocated on {worker.name}, {worker.name} is in {worker_maint_info.state}!"
if host.get_state() != "Active":
msg = f"Node cannot be allocated on {host.get_name()}, {host.get_name()} is in {host.get_state()}!"
return False, msg

allocated_core = allocated.setdefault("core", 0)
allocated_ram = allocated.setdefault("ram", 0)
allocated_disk = allocated.setdefault("disk", 0)
available_cores = (
worker.capacities.core
- (
worker.capacity_allocations.core
if worker.capacity_allocations is not None
else 0
)
- allocated_core
)
available_ram = (
worker.capacities.ram
- (
worker.capacity_allocations.ram
if worker.capacity_allocations is not None
else 0
)
- allocated_ram
)
available_disk = (
worker.capacities.disk
- (
worker.capacity_allocations.disk
if worker.capacity_allocations is not None
else 0
)
- allocated_disk
)
available_cores = host.get_core_available()
available_ram = host.get_ram_available()
available_disk = host.get_disk_available()

if (
node.get_requested_cores() > available_cores
or node.get_requested_disk() > available_disk
or node.get_requested_ram() > available_ram
):
msg = f"Insufficient Resources: Host: {worker.name} does not meet core/ram/disk requirements."
msg = f"Insufficient Resources: Host: {host.get_name()} does not meet core/ram/disk requirements."
return False, msg

# Check if there are enough components available
for c in node.get_components():
comp_model_type = f"{c.get_type()}-{c.get_fim_model()}"
if comp_model_type not in worker.components:
msg = f"Invalid Request: Host: {worker.name} does not have the requested component: {comp_model_type}."
substrate_component = host.get_component(comp_model_type=comp_model_type)
if not substrate_component:
msg = f"Invalid Request: Host: {host.get_name()} does not have the requested component: {comp_model_type}."
return False, msg

allocated_comp_count = allocated.setdefault(comp_model_type, 0)
available_comps = (
worker.components[comp_model_type].capacities.unit
substrate_component.capacities.unit
- (
worker.components[comp_model_type].capacity_allocations.unit
if worker.components[comp_model_type].capacity_allocations
substrate_component.capacity_allocations.unit
if substrate_component.capacity_allocations
else 0
)
- allocated_comp_count
)
if available_comps <= 0:
msg = f"Insufficient Resources: Host: {worker.name} has reached the limit for component: {comp_model_type}."
msg = f"Insufficient Resources: Host: {host.get_name()} has reached the limit for component: {comp_model_type}."
return False, msg

allocated[comp_model_type] += 1
@@ -2385,7 +2445,7 @@ def validate_node(self, node: Node, allocated: dict = None) -> Tuple[bool, str]:
error = None
if allocated is None:
allocated = {}
site = self.get_resources().get_topology_site(site_name=node.get_site())
site = self.get_resources().get_site(site_name=node.get_site())

if not site:
logging.warning(
@@ -2396,43 +2456,43 @@ def validate_node(self, node: Node, allocated: dict = None) -> Tuple[bool, str]:
f"Ignoring validation: Site: {node.get_site()} not available in resources.",
)

site_maint_info = site.maintenance_info.get(site.name)
if site_maint_info and str(site_maint_info.state) != "Active":
msg = f"Node cannot be allocated on {node.get_site()}, {node.get_site()} is in {site_maint_info.state}."
site_state = site.get_state()
if site_state != "Active":
msg = f"Node cannot be allocated on {node.get_site()}, {node.get_site()} is in {site_state}."
logging.error(msg)
return False, msg
workers = self.get_resources().get_nodes(site=site)
if not workers:
hosts = site.get_hosts()
if not hosts:
msg = f"Node cannot be validated, host information not available for {site}."
logging.error(msg)
return False, msg

if node.get_host():
if node.get_host() not in workers:
if node.get_host() not in hosts:
msg = f"Invalid Request: Requested Host {node.get_host()} does not exist on site: {node.get_site()}."
logging.error(msg)
return False, msg

worker = workers.get(node.get_host())
host = hosts.get(node.get_host())

allocated_comps = allocated.setdefault(node.get_host(), {})
status, error = self.__can_allocate_node_in_worker(
worker=worker, node=node, allocated=allocated_comps, site=site
status, error = self.__can_allocate_node_in_host(
host=host, node=node, allocated=allocated_comps, site=site
)

if not status:
logging.error(error)
return status, error

for worker in workers.values():
allocated_comps = allocated.setdefault(worker.name, {})
status, error = self.__can_allocate_node_in_worker(
worker=worker, node=node, allocated=allocated_comps, site=site
for host in hosts.values():
allocated_comps = allocated.setdefault(host.get_name(), {})
status, error = self.__can_allocate_node_in_host(
host=host, node=node, allocated=allocated_comps, site=site
)
if status:
return status, error

msg = f"Invalid Request: Requested Node cannot be accommodated by any of the hosts on site: {site.name}."
msg = f"Invalid Request: Requested Node cannot be accommodated by any of the hosts on site: {site.get_name()}."
if error:
msg += f" Details: {error}"
logging.error(msg)
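
To round out the change, a hedged sketch of exercising the reworked validation path directly; the slice name, node sizing, site, and component below are placeholders chosen for illustration:

from fabrictestbed_extensions.fablib.fablib import FablibManager

fablib = FablibManager()

# Placeholder slice, node, and site. With this PR, validate_node() checks
# per-host availability via Host.get_core_available(), get_ram_available(),
# and get_disk_available() instead of reading FIM worker capacities directly.
slice = fablib.new_slice(name="validate-demo")
node = slice.add_node(name="node1", site="MAX", cores=4, ram=16, disk=100)
node.add_component(model="NVME_P4510", name="nvme1")

ok, error = fablib.validate_node(node=node)
if not ok:
    print(f"Node cannot be accommodated on {node.get_site()}: {error}")

As the diff shows, a node pinned to a specific host is checked only against that host, while an unpinned node is accepted as soon as any host on the site can accommodate it.
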