Skip to content

Commit

Permalink
Merge branch 'main' into issue-rigid-refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
ajbalogh committed Nov 7, 2024
2 parents f628e4a + 823fdb9 commit c888d84
Show file tree
Hide file tree
Showing 11 changed files with 272 additions and 30 deletions.
1 change: 1 addition & 0 deletions .env
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PYTHONTEST=.
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,10 @@
},
"clang-format.language.proto.enable": true,
"clang-format.language.proto.style": "{ IndentWidth: 2, BasedOnStyle: google, ReflowComments: false, ColumnLimit: 0, AlignTrailingComments: true }",
"python.testing.pytestArgs": [
"src/tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.envFile": "${workspaceFolder}/.env",
}
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.20
0.0.22
4 changes: 4 additions & 0 deletions protos/infra.proto
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,14 @@ message Pcie {
// Selects NVLink as the technology of a Switch (see the `type` oneof in
// Switch). The message declares no fields of its own.
message NvLink {
}

// Selects a custom technology for a Switch (see the `type` oneof in Switch);
// presumably meant for switches that are neither Pcie nor NvLink.
// The message declares no fields of its own.
message Custom {
}

// Describes a switch; the technology is chosen through the `type` oneof.
message Switch {
// At most one of the following fields is set at any time (oneof semantics).
oneof type {
Pcie pcie = 1;
NvLink nvlink = 2;
Custom custom = 3;
}
}

Expand Down
12 changes: 4 additions & 8 deletions src/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
- extended functionality for finding nodes, building inter-package adjacencies,
calculating paths etc
"""

from typing import List, Literal, Union, Type, Tuple
from abc import ABC, abstractmethod
import google.protobuf.json_format
Expand Down Expand Up @@ -52,10 +53,6 @@ def description(self) -> str:
def port_up_component(self) -> infra.Component:
pass

@property
def port_up_component(self) -> infra.Component:
pass

def _add_component_link(self, c1, c1_index, link_name, c2, c2_index):
connection = infra.ComponentConnection(
link=infra.ComponentLink(
Expand Down Expand Up @@ -168,6 +165,7 @@ def message_to_yaml(self, message) -> str:
)
)


class HostBuilder(DeviceBuilder):
def __init__(self):
super(DeviceBuilder).__init__()
Expand Down Expand Up @@ -382,6 +380,7 @@ def get_link(self, link_type: int) -> infra.Link:
f"Inter package link of type {link_type} does not exist in system {self._fabric.name}"
)


class InfraBuilder(ABC):
def __init__(
self,
Expand All @@ -391,10 +390,7 @@ def __init__(
links: List[Type[infra.Link]] = [],
):
self._infra = infra.Infrastructure(
custom_fabric=fabric,
hosts=hosts,
connections=connections,
links={}
custom_fabric=fabric, hosts=hosts, connections=connections, links={}
)

for link in links:
Expand Down
32 changes: 16 additions & 16 deletions src/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@ class GenericHost(bld.HostBuilder):
def __init__(
self,
npu_count=1,
nvlink_bandwidth_gbps: int=0
npu_interconnect_bandwidth_gbps: int=0
):
"""Creates a generic device with only npu and nic components that are
connected by a pcie link.
Optionally, npu components can be connected via nvlink using a single nvswitch.
Optionally, npu components within a device can be interconnected via generic links attached to a single generic switch.
name: The name of the generic device
npu_count: The number of npu/nic components in the device.
nvlink_bandwidth_gbps: nvlink bandwidth in gigabits per second. If 0, no nvlink connections will be added to the device.
npu_interconnect_bandwidth_gbps: npu-to-npu interconnect bandwidth in gigabits per second. If 0, no internal npu-to-npu connectivity will be added to the device.
"""
super(GenericHost).__init__()
npu = infra.Component(
Expand All @@ -46,15 +46,15 @@ def __init__(
name="pcie",
type=infra.LinkType.LINK_PCIE,
)
nvlink = infra.Link(
name="nvlink",
type=infra.LinkType.LINK_NVLINK,
bandwidth=infra.Bandwidth(gbps=nvlink_bandwidth_gbps),
npu_interconnect = infra.Link(
name="npu_interconnect",
type=infra.LinkType.LINK_CUSTOM,
bandwidth=infra.Bandwidth(gbps=npu_interconnect_bandwidth_gbps),
)
nvswitch = infra.Component(
name="nvswitch",
npu_interconnect_switch = infra.Component(
name="npu_interconnect_switch",
count=1,
switch=infra.Switch(nvlink=infra.NvLink()),
switch=infra.Switch(custom=infra.Custom()),
)

links = { pcie.name: pcie }
Expand All @@ -77,18 +77,18 @@ def __init__(
)
)

# Add nvlink connections if bandwidth was provided
if nvlink_bandwidth_gbps > 0:
components[nvswitch.name] = nvswitch
links[nvlink.name] = nvlink
# Add npu_interconnect connections if bandwidth was provided
if npu_interconnect_bandwidth_gbps > 0:
components[npu_interconnect_switch.name] = npu_interconnect_switch
links[npu_interconnect.name] = npu_interconnect
for npu_idx_a in range(npu_count):
connections.append(
infra.ComponentConnection(
link=infra.ComponentLink(
c1=npu.name,
c1_index=npu_idx_a,
link=nvlink.name,
c2=nvswitch.name,
link=npu_interconnect.name,
c2=npu_interconnect_switch.name,
c2_index=0,
)
)
Expand Down
Binary file added src/notes/rack-plane.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
73 changes: 73 additions & 0 deletions src/rack_plane_fabric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""RackPlaneFabric module
Uses the infra_pb2 protobuf-generated code
to capture the components, links, and connections of the
RackPlane block diagram.
"""

from typing import Tuple

if __package__ is None or __package__ == "":
import generated.infra_pb2 as infra
import builders as bld
from keysight_chakra.closfabric import ClosFabricSwitch
else:
from .generated import infra_pb2 as infra
from . import builders as bld
from .closfabric import ClosFabricSwitch


class RackPlaneFabricBuilder(bld.FabricBuilder):
    """Build a rack fabric that hosts reach through multiple switching planes.

    Construction registers ``RackSwitch`` devices sized from the host's
    up-facing port component and then declares the ``eth`` link on the fabric.
    """

    name: str = "RackPlaneFabric"
    # Typo fixed: the text previously read "users" instead of "uses".
    description: str = "fabric that uses multiple planes inside a rack"
    # Builder of the rack switch created by __init__; None until then.
    lowest_device: bld.DeviceBuilder = None

    def __init__(self, host_builder: bld.DeviceBuilder, host_count: int = 1):
        """Create the rack switches and the ethernet link of the fabric.

        Args:
            host_builder: Builder of the host attached to this fabric. Its
                ``port_up_component.count`` drives the switch sizing.
            host_count: Number of hosts attached to the fabric.

        Raises:
            AssertionError: If ``host_builder`` is not a ``bld.DeviceBuilder``.
        """
        super().__init__(self.name)
        assert isinstance(host_builder, bld.DeviceBuilder)

        rack_switch, _ = self._add_fabric_devices(
            host_builder,
            host_count,
            "RackSwitch",
        )
        self.lowest_device = rack_switch

        eth_link = infra.Link(
            name="eth",
            type=infra.LinkType.LINK_ETHERNET,
        )
        self.fabric.links[eth_link.name].CopyFrom(eth_link)

    def _add_fabric_devices(
        self,
        host_builder: bld.DeviceBuilder,
        host_count: int,
        device_name: str,
    ) -> Tuple[bld.DeviceBuilder, int]:
        """Add the fabric switches to the infrastructure.

        Args:
            host_builder: Builder whose ``port_up_component.count`` is used.
            host_count: Number of hosts attached to the fabric.
            device_name: Name given to the created switch device.

        Returns:
            Tuple of the switch builder and the number of switches registered.
        """
        port_count = host_builder.port_up_component.count
        # NOTE(review): every switch is sized for port_count * host_count down
        # links even though one switch is created per port -- confirm that
        # this is the intended sizing.
        down_link_count = int(port_count * host_count)
        up_link_count = 0
        device = ClosFabricSwitch(device_name, down_link_count, up_link_count)
        # Create one rack switch per host scale-up NIC.
        sw_count = port_count
        self._add_device(device, sw_count)
        return (device, sw_count)

    def _add_device(
        self, package_builder: bld.DeviceBuilder, device_count: int
    ) -> None:
        """Register ``device_count`` copies of the builder's device.

        Does nothing when ``package_builder`` is None.
        """
        if package_builder is None:
            return
        self.fabric.devices[package_builder.device.name].CopyFrom(
            infra.DeviceCount(
                count=device_count,
                device=package_builder.device,
            )
        )
81 changes: 81 additions & 0 deletions src/rack_plane_host.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""RackPlaneHost module
Uses the infra_pb2 protobuf-generated code
to capture the components, links, and connections of the
RackPlane block diagram.
"""

if __package__ is None or __package__ == "":
import generated.infra_pb2 as infra
import builders as bld
else:
from .generated import infra_pb2 as infra
from . import builders as bld


class RackPlaneHostBuilder(bld.HostBuilder):
    """Build a host that connects to the switching layer over multiple planes.

    The host holds ``npu`` components and ``scale-up-nic`` components, with
    every NPU linked to every scale-up NIC. Scale-out NICs are not modelled
    yet.
    """

    name = "RackPlaneHost"
    description = "a host with dedicated scale up and scale out NICs"

    def __init__(
        self, npu_count: int, scale_up_nic_count: int, scale_out_nic_count: int
    ):
        """Create the host device, its components and their connections.

        Args:
            npu_count: Number of NPU components in the host.
            scale_up_nic_count: Number of scale-up NIC components in the host.
            scale_out_nic_count: Currently unused; reserved for the scale-out
                NICs that are still to be added.
        """
        # The one-argument form ``super(RackPlaneHostBuilder)`` creates an
        # unbound super object, so calling ``__init__`` on it never reached
        # the base class. The zero-argument form runs the base initialiser.
        super().__init__()

        # 1. Add components
        npu = infra.Component(name="npu", count=npu_count, npu=infra.Npu())
        scale_up_nic = infra.Component(
            name="scale-up-nic", count=scale_up_nic_count, nic=infra.Nic()
        )
        self._port_component = scale_up_nic

        # 2. Add link & device
        # The link type is not decided yet; "mii" is a placeholder declared
        # with a name only.
        mii_link = infra.Link(name="mii")

        self._device = infra.Device(
            name=self.name,
            components={
                npu.name: npu,
                scale_up_nic.name: scale_up_nic,
            },
            links={mii_link.name: mii_link},
        )

        # 3. Add component links: connect every NPU to every scale-up NIC.
        for npu_index in range(npu.count):
            for nic_index in range(scale_up_nic.count):
                self._add_component_link(
                    npu.name,
                    npu_index,
                    mii_link.name,
                    scale_up_nic.name,
                    nic_index,
                )

        # TODO: Scale OUT NICs -- add a "scale-out-nic" component sized by
        # scale_out_nic_count, register it on the device, and connect it to
        # the NPUs.

    @property
    def port_up_component(self) -> infra.Component:
        """Return the scale-up NIC component."""
        return self._port_component
10 changes: 5 additions & 5 deletions src/tests/test_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,20 @@ def test_generic_host_no_params():
host = GenericHost()
assert host.get_component("npu") is not None
assert host.get_component("nic") is not None
assert "nvlink" not in host._device.links
assert "npu_interconnect" not in host._device.links

def test_generic_host_with_params():
npu_count = 4
host = GenericHost(npu_count=npu_count, nvlink_bandwidth_gbps=600)
assert "nvlink" in host._device.links
assert host._device.links["nvlink"].type == infra.LINK_NVLINK
host = GenericHost(npu_count=npu_count, npu_interconnect_bandwidth_gbps=600)
assert "npu_interconnect" in host._device.links
assert host._device.links["npu_interconnect"].type == infra.LINK_CUSTOM

seen_map = {}
for npu_index in range(npu_count):
seen_map[npu_index] = False

for connection in host._device.connections:
if connection.link.c1 == "npu" and connection.link.c2 == "nvswitch":
if connection.link.c1 == "npu" and connection.link.c2 == "npu_interconnect_switch":
npu_index = connection.link.c1_index
assert npu_index in seen_map
assert not seen_map[npu_index]
Expand Down
Loading

0 comments on commit c888d84

Please sign in to comment.