Skip to content

Commit

Permalink
add rack-plane builders (fabric and host)
Browse files Browse the repository at this point in the history
TODO: Scale out network modelling
  • Loading branch information
thomas-am committed Nov 4, 2024
1 parent 5045f37 commit 1ff0b3d
Show file tree
Hide file tree
Showing 4 changed files with 230 additions and 0 deletions.
Binary file added src/notes/rack-plane.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
73 changes: 73 additions & 0 deletions src/rack_plane_fabric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""RackPlaneFabric package
Uses the infra_pb2 protobuf generated code
to capture components, links, and connections of the
RackPlane block diagram.
"""

from typing import Tuple

from src.closfabric import ClosFabricSwitch

if __package__ is None or __package__ == "":
import generated.infra_pb2 as infra
import builders as bld
else:
from .generated import infra_pb2 as infra
from . import builders as bld


class RackPlaneFabricBuilder(bld.FabricBuilder):
    """
    generates infrastructure of a fabric that
    supports connecting to switching via multiple planes

    A single "rack switch" device entry is registered whose count equals the
    host builder's ``port_up_component.count``, together with one Ethernet
    link definition named "eth".
    """

    name: str = "rack plane fabric"
    description: str = "fabric that users multiple planes inside a rack"
    # Set by __init__ to the rack switch builder created for this fabric.
    lowest_device: bld.DeviceBuilder = None

    def __init__(self, host_builder: bld.DeviceBuilder, host_count: int = 1):
        """Create the rack switches and the link type of the fabric.

        Args:
            host_builder: builder of the host attached to the fabric; its
                ``port_up_component.count`` determines the switch count.
            host_count: multiplier applied to the host port count when
                sizing each switch.
        """
        super().__init__(self.name)
        assert isinstance(host_builder, bld.DeviceBuilder)

        switch_builder, _ = self._add_fabric_devices(
            host_builder, host_count, "rack switch"
        )
        self.lowest_device = switch_builder

        # Register the only link type this fabric declares.
        eth_link = infra.Link(name="eth", type=infra.LinkType.LINK_ETHERNET)
        self.fabric.links[eth_link.name].CopyFrom(eth_link)

    def _add_fabric_devices(
        self,
        host_builder: bld.DeviceBuilder,
        host_count: int,
        device_name: str,
    ) -> Tuple[bld.DeviceBuilder, int]:
        """Adds fabric switches to the infrastructure
        Returns: Tuple of the device and the number of devices
        """
        host_port_count = host_builder.port_up_component.count
        # Second and third arguments are presumably the down-link and up-link
        # port counts of the switch -- confirm against ClosFabricSwitch.
        switch = ClosFabricSwitch(
            device_name,
            int(host_port_count * host_count),
            0,
        )
        # create one rack switch per host scale up nic
        self._add_device(switch, host_port_count)
        return (switch, host_port_count)

    def _add_device(
        self, package_builder: bld.DeviceBuilder, device_count: int
    ) -> None:
        """Store ``package_builder``'s device and its count in the fabric.

        Does nothing when ``package_builder`` is None.
        """
        if package_builder is None:
            return
        entry = infra.DeviceCount(
            count=device_count,
            device=package_builder.device,
        )
        self.fabric.devices[package_builder.device.name].CopyFrom(entry)
76 changes: 76 additions & 0 deletions src/rack_plane_host.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""RackPlaneHost package
Uses the infra_pb2 protobuf generated code
to capture components, links, and connections of the
RackPlane block diagram.
"""

if __package__ is None or __package__ == "":
import generated.infra_pb2 as infra
import builders as bld
else:
from .generated import infra_pb2 as infra
from . import builders as bld


class RackPlaneHostBuilder(bld.HostBuilder):
    """
    generates infrastructure of a host that
    supports connecting to switching via multiple planes

    The device contains an "npu" component and a "scale-up-nic" component,
    and every NPU is linked to every scale-up NIC. The scale-up NIC component
    is exposed through ``port_up_component``.
    """

    name = "rack plane host"
    description = "a host with dedicated scale up and scale out NICs"

    def __init__(
        self, npu_count: int, scale_up_nic_count: int, scale_out_nic_count: int
    ):
        """Build the host device, its components and its component links.

        Args:
            npu_count: number of NPU components in the host.
            scale_up_nic_count: number of scale-up NIC components in the host.
            scale_out_nic_count: currently unused; scale-out NICs are not
                modelled yet (see the TODO below). Kept so callers do not
                need to change once they are.
        """
        # The one-argument form ``super(RackPlaneHostBuilder)`` creates an
        # unbound super object, so the base initialiser was never invoked.
        # NOTE(review): assumes the base initialiser takes no arguments --
        # confirm against builders.py.
        super().__init__()

        # 1. Add components
        npu = infra.Component(name="npu", count=npu_count, npu=infra.Npu())
        scale_up_nic = infra.Component(
            name="scale-up-nic", count=scale_up_nic_count, nic=infra.Nic()
        )
        self._port_component = scale_up_nic

        # TODO: Scale OUT NICs
        # Add a "scale-out-nic" component sized by scale_out_nic_count, put
        # it in the device components and link it to the NPUs. Until then no
        # scale-out links are created; the previous loop labelled "scale OUT"
        # only duplicated the scale-up links under malformed names.

        # 2. Add device
        self._device = infra.Device(
            name=self.name,
            components={
                npu.name: npu,
                scale_up_nic.name: scale_up_nic,
            },
        )

        # 3. Add component links
        # scale UP NICs to NPU connections (full mesh inside the host)
        for npu_index in range(npu.count):
            for nic_index in range(scale_up_nic.count):
                self._add_component_link(
                    npu.name,
                    npu_index,
                    f"{npu.name}.{npu_index}.to.{scale_up_nic.name}.{nic_index}",
                    scale_up_nic.name,
                    nic_index,
                )

    @property
    def port_up_component(self) -> infra.Component:
        """Return the scale-up NIC component of this host."""
        return self._port_component
81 changes: 81 additions & 0 deletions src/tests/test_rack_plane.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""rack plane related unit tests"""

import pytest

if __package__ is None or __package__ == "":
from src.generated import infra_pb2
from src.rack_plane_host import RackPlaneHostBuilder
from src.rack_plane_fabric import RackPlaneFabricBuilder
from src.infrastructure import Infrastructure
else:
from .generated import infra_pb2
from keysight_chakra.rack_plane_host import RackPlaneHostBuilder
from keysight_chakra.rack_plane_fabric import RackPlaneFabricBuilder
from keysight_chakra.infrastructure import Infrastructure


@pytest.mark.parametrize("host_count", [2, 3, 4, 8])
@pytest.mark.parametrize("sup_nic_count", [2, 3, 4])
def test_rack_plane_fabric_and_host(host_count: int, sup_nic_count: int):
    """Check the connection count for several host and scale-up NIC counts."""
    host = RackPlaneHostBuilder(
        npu_count=1,
        scale_up_nic_count=sup_nic_count,
        scale_out_nic_count=1,
    )
    fabric = RackPlaneFabricBuilder(host_builder=host)
    built = Infrastructure(
        host_device=host,
        host_devices=host_count,
        fabric=fabric,
        assignment_scheme="ROUND_ROBIN",
    ).infrastructure

    assert built is not None
    # Loose check: one host-to-rack-switch connection is expected per
    # scale-up NIC of every host.
    expected_connection_count = host_count * sup_nic_count
    assert len(built.connections) == expected_connection_count


def test_rack_plane_fabric_and_host_detailed():
    """Check every field of each connection for 2 hosts with 2 scale-up NICs."""
    sup_nic_count = 2
    host_count = 2
    host = RackPlaneHostBuilder(
        npu_count=1,
        scale_up_nic_count=sup_nic_count,
        scale_out_nic_count=1,
    )
    fabric = RackPlaneFabricBuilder(host_builder=host)
    built = Infrastructure(
        host_device=host,
        host_devices=host_count,
        fabric=fabric,
        assignment_scheme="ROUND_ROBIN",
    ).infrastructure

    assert built is not None
    assert len(built.connections) == host_count * sup_nic_count

    def check_connection(
        dev_conn: infra_pb2.DeviceConnection, expected_indices: tuple
    ):
        """Assert the fixed endpoint names and the four indices of a link."""
        d1_index, c1_index, d2_index, c2_index = expected_indices
        link = dev_conn.link
        assert link.d1 == "rack plane host"
        assert link.c1 == "scale-up-nic"
        assert link.d2 == "rack switch"
        assert link.c2 == "port-down"
        assert link.link == "eth"
        assert link.d1_index == d1_index
        assert link.c1_index == c1_index
        assert link.d2_index == d2_index
        assert link.c2_index == c2_index

    # Expected (d1_index, c1_index, d2_index, c2_index) per connection, in
    # order: host index, scale-up NIC index, rack switch index, switch port.
    expected = [
        (0, 0, 0, 0),  # host 0, NIC 0 -> rack switch 0, port 0
        (0, 1, 1, 0),  # host 0, NIC 1 -> rack switch 1, port 0
        (1, 0, 0, 1),  # host 1, NIC 0 -> rack switch 0, port 1
        (1, 1, 1, 1),  # host 1, NIC 1 -> rack switch 1, port 1
    ]
    for position, indices in enumerate(expected):
        check_connection(built.connections[position], indices)

0 comments on commit 1ff0b3d

Please sign in to comment.