From c7ba9b05dec3db81608c5385314bbfbc1dcd1a23 Mon Sep 17 00:00:00 2001
From: Binyang Li
Date: Mon, 25 Nov 2024 19:43:14 -0800
Subject: [PATCH] Update for NVLS OP (#24)

Support NVLS operations (group_load_reduce, group_store, and the fused
group_load_reduce_store) in msccl-tools
---
 examples/mscclang/mscclpp/allreduce_nvls.py   |  49 ++++++++
 msccl/language/collectives.py                 |  26 ++--
 msccl/language/mscclpp/__init__.py            |  61 +++++++++-
 msccl/language/mscclpp/instruction_dag.py     | 113 +++++++++++++++---
 .../language/mscclpp/instruction_optimizer.py |  29 +++++
 msccl/language/mscclpp/ir.py                  |  57 +++++++--
 msccl/language/types.py                       |  13 +-
 7 files changed, 315 insertions(+), 33 deletions(-)
 create mode 100644 examples/mscclang/mscclpp/allreduce_nvls.py

diff --git a/examples/mscclang/mscclpp/allreduce_nvls.py b/examples/mscclang/mscclpp/allreduce_nvls.py
new file mode 100644
index 0000000..35d5ea2
--- /dev/null
+++ b/examples/mscclang/mscclpp/allreduce_nvls.py
@@ -0,0 +1,49 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from msccl.language import *
+from msccl.topologies import *
+from msccl.language.collectives import AllReduce
+
+
+def allreduce_allpairs(gpus, instances):
+    size = gpus
+    chunksperloop = gpus
+    topology = fully_connected(size)
+    collective = AllReduce(size, chunksperloop, True)
+    with MSCCLPPProgram(
+        "allreduce_nvls",
+        topology,
+        collective,
+        instances,
+    ):
+        # Each rank reduces the chunk at its own index across all ranks, then stores the result back to its peers
+        for rank in range(size):
+            index = rank
+            c = chunk(rank, Buffer.input, index)
+            reduce_chunks = []
+            # Synchronize so every peer's chunk is ready before the group load reduce
+            for nghr in range(size):
+                if rank != nghr:
+                    c_peer = chunk(nghr, Buffer.input, index)
+                    reduce_chunks.append(c_peer)
+                    c.signal(nghr, Buffer.input, index, sendtb=0)
+            for nghr in range(size):
+                if rank != nghr:
+                    c.wait(nghr, Buffer.input, index, recvtb=0)
+            c = c.group_load_reduce(reduce_chunks, recvtb=0)
+            ngbrs = [nghr for nghr in range(size) if nghr != rank]
+            c.group_store(ngbrs, sendtb=0)
+
+        Json()
+        Check()
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("num_gpus", type=int, help="number of gpus")
+parser.add_argument("instances", type=int, help="number of instances")
+
+args = parser.parse_args()
+
+allreduce_allpairs(args.num_gpus, args.instances)
diff --git a/msccl/language/collectives.py b/msccl/language/collectives.py
index 6dc20cc..c8434da 100755
--- a/msccl/language/collectives.py
+++ b/msccl/language/collectives.py
@@ -4,13 +4,24 @@
 
 
 class Collective:
-    def __init__(self, num_ranks, chunk_factor, inplace, num_chunk_groups=1):
+
+    def __init__(self, num_ranks, chunk_factor, inplace, num_ranks_per_node=-1, **kwargs):
         self.num_ranks = num_ranks
         self.chunk_factor = chunk_factor
         self.inplace = inplace
         self.name = "custom"
-        # Devide the buffer into num_chunk_groups groups
-        self.num_chunk_groups = num_chunk_groups
+        # Divide the buffer into num_chunk_groups groups
+        if num_ranks_per_node == -1:
+            self.num_ranks_per_node = num_ranks
+        else:
+            self.num_ranks_per_node = num_ranks_per_node
+
+        # kwargs
+        # Number of chunk groups: the n chunks are grouped into m groups.
+        # We guarantee that every group has the same size,
+        # but chunks within the same group may have different sizes when the group size
+        # cannot be divided evenly by the number of chunks.
+        self.num_chunk_groups = kwargs.get("num_chunk_groups", 1)
 
     def init_buffers(self):
         pass
@@ -120,10 +131,11 @@ def get_buffer_index(self, rank, buffer, index):
 
 
 class AllReduce(Collective):
-    def __init__(self, num_ranks, chunk_factor, inplace, num_chunk_groups=None):
-        if num_chunk_groups == None:
-            num_chunk_groups = num_ranks
-        Collective.__init__(self, num_ranks, chunk_factor, inplace, num_chunk_groups)
+    def __init__(self, num_ranks, chunk_factor, inplace, num_ranks_per_node=-1, **kwargs):
+        num_chunk_groups = kwargs.get("num_chunk_groups", num_ranks)
+        Collective.__init__(
+            self, num_ranks, chunk_factor, inplace, num_ranks_per_node, num_chunk_groups=num_chunk_groups
+        )
         self.name = "allreduce"
 
     def init_buffers(self):
diff --git a/msccl/language/mscclpp/__init__.py b/msccl/language/mscclpp/__init__.py
index 9f00ad4..13ea870 100644
--- a/msccl/language/mscclpp/__init__.py
+++ b/msccl/language/mscclpp/__init__.py
@@ -132,7 +132,7 @@ def lower(self):
         self.instr_dag.optimize()
         self.instr_dag.lower_pt1(self.instances)
         gpu_prgms = self.instr_dag.lower_pt2(self.instances, self.replication_policy)
-        return Program(
+        program = Program(
             self.name,
             self.collective.name,
             self.collective.inplace,
@@ -142,6 +142,11 @@ def lower(self):
             self.num_threads_per_block,
             self.use_double_scratch_buffer,
         )
+        for gpu in program.gpus:
+            gpu.input_chunks = len(self.buffers[gpu.rank][Buffer.input]) * self.instances
+            if not self.collective.inplace:
+                gpu.output_chunks = len(self.buffers[gpu.rank][Buffer.output]) * self.instances
+        return program
 
     def generate_json(self):
         return ir_to_json(self.lower())
@@ -327,6 +332,60 @@ def reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.sm):
     def reduce_packet(self, other_chunkref, recvtb=-1):
         return self._reduce(other_chunkref, recvtb, use_packet=True)
 
+    # """
+    # Group operations. These operations perform collective operations across multiple chunks.
+    # For now, all chunks must have the same buffer type and offset. 
+ # """ + # Reads the chunk(s) referenced by other_chunkref and reduce into the chunk referenced by this chunkref + def group_load_reduce(self, other_chunkrefs: list, recvtb=-1, chan_type=ChannelType.nvls): + assert ( + len(other_chunkrefs) > 0 and chan_type == ChannelType.nvls + ), "Group load reduce only supports nvls channel" + nranks_per_node = self.prog.collective.num_ranks_per_node + for other_chunkref in other_chunkrefs: + assert ( + self.rank // nranks_per_node == other_chunkref.rank // nranks_per_node + ), "Group load reduce only supports chunks on the same node" + assert self.buffer == other_chunkref.buffer, "Group load reduce only supports chunks with the same buffer" + assert self.index == other_chunkref.index, "Group load reduce only supports chunks with the same index" + + src_chunkref = other_chunkref + self.prog.apply_reduce( + src_chunkref.rank, + src_chunkref.buffer, + src_chunkref.index, + self.rank, + self.buffer, + self.index, + self.size, + ) + self.prog.instr_dag.add_group_load_reduce(self.rank, other_chunkrefs, self, recvtb, chan_type) + return self + + # Copies the chunk(s) referenced by this chunkref onto other_chunkrefs + def group_store(self, dsts: list, index=-1, buffer=None, sendtb=-1, chan_type=ChannelType.nvls): + for dst in dsts: + self.prog.check_buffer_exists(dst, buffer) + assert index == -1 or self.index == index, "Group store only supports chunks with the same index" + assert chan_type == ChannelType.nvls, "Group store only supports nvls channel" + + other_chunkrefs = [] + nrank_per_node = self.prog.collective.num_ranks_per_node + for dst in dsts: + # Direct linked + buffer, index = self._get_buffer_index(dst, buffer, index) + assert self.prog.topo.link(self.rank, dst) or dst == self.rank, f"No link from {self.rank} to {dst}" + assert self.buffer == buffer, "Group store only supports chunks with the same buffer" + assert ( + self.rank // nrank_per_node == dst // nrank_per_node + ), "Group store only supports chunks on the same node" + + dst_chunkref = self.prog.get_ref(dst, buffer, index, self.size) + self.prog.apply_send(self.rank, self.buffer, self.index, dst, buffer, index, self.size) + other_chunkrefs.append(dst_chunkref) + # add new op here + self.prog.instr_dag.add_group_store(self.rank, self, other_chunkrefs, sendtb, chan_type) + def get_origin_index(self, index=0): return self._get_chunk(index + self.index).origin_index diff --git a/msccl/language/mscclpp/instruction_dag.py b/msccl/language/mscclpp/instruction_dag.py index 6f20abf..0f692bf 100644 --- a/msccl/language/mscclpp/instruction_dag.py +++ b/msccl/language/mscclpp/instruction_dag.py @@ -191,29 +191,88 @@ def add_read_reduce(self, rank, send_ref, recv_ref, tb, ch_type): self._write(rank, buffer, index, size, op, read=True) return op + def add_group_load_reduce(self, rank, send_refs, recv_ref, tb, ch_type): + tb_step = self._get_tb_step(rank, tb) + op = Op( + Instruction.group_load_reduce, + rank, + recv_ref, + recv_ref, + next=set(), + prev=set(), + tb=tb, + channel_type=ch_type, + step=tb_step, + ) + # treat recv_ref as src for group_load_reduce + op.srcs.append((ChunkRef(recv_ref.rank, recv_ref.buffer, recv_ref.index, recv_ref.size), tb_step)) + for send_ref in send_refs: + op.srcs.append((ChunkRef(send_ref.rank, send_ref.buffer, send_ref.index, send_ref.size), tb_step)) + buffer = recv_ref.buffer + index = recv_ref.index + size = recv_ref.size + self._write(rank, buffer, index, size, op, read=True) + + def add_group_store(self, rank, send_ref, recv_refs, tb, ch_type): + tb_step = 
self._get_tb_step(rank, tb) + op = Op( + Instruction.group_store, + rank, + send_ref, + send_ref, + next=set(), + prev=set(), + tb=tb, + channel_type=ch_type, + step=tb_step, + ) + # treat send_ref as dst for group_store + op.dsts.append((ChunkRef(send_ref.rank, send_ref.buffer, send_ref.index, send_ref.size), tb_step)) + for recv_ref in recv_refs: + op.dsts.append((ChunkRef(recv_ref.rank, recv_ref.buffer, recv_ref.index, recv_ref.size), tb_step)) + buffer = send_ref.buffer + index = send_ref.index + size = send_ref.size + self._read(rank, buffer, index, size, op) + return op + def complete_channels(self): send_op = [Instruction.put, Instruction.signal, Instruction.put_packet] recv_op = [Instruction.wait, Instruction.get, Instruction.read_reduce_copy] + group_send_op = [Instruction.group_store] + group_recv_op = [Instruction.group_load_reduce] for rank, rank_tbs in enumerate(self.tbs): for tbid, tb in rank_tbs.items(): chans = set() for op in tb.ops: - src_buffer = ( - Buffer.scratch - if op.src.buffer is not Buffer.input and op.src.buffer is not Buffer.output - else op.src.buffer - ) - dst_buffer = ( - Buffer.scratch - if op.dst.buffer is not Buffer.input and op.dst.buffer is not Buffer.output - else op.dst.buffer - ) - if op.inst in send_op: - chan = Channel(src_buffer, dst_buffer, op.channel_type, op.dst.rank) - chans.add(chan) - elif op.inst in recv_op: - chan = Channel(src_buffer, dst_buffer, op.channel_type, op.src.rank) - chans.add(chan) + if op.src != None: + src_buffer = ( + Buffer.scratch + if op.src.buffer is not Buffer.input and op.src.buffer is not Buffer.output + else op.src.buffer + ) + if op.dst != None: + dst_buffer = ( + Buffer.scratch + if op.dst.buffer is not Buffer.input and op.dst.buffer is not Buffer.output + else op.dst.buffer + ) + if op.channel_type == ChannelType.nvls: + if op.inst in group_send_op: + ranks = [dst[0].rank for dst in op.dsts] + chan = Channel(src_buffer, dst_buffer, op.channel_type, ranks) + chans.add(chan) + elif op.inst in group_recv_op: + ranks = [src[0].rank for src in op.srcs] + chan = Channel(src_buffer, dst_buffer, op.channel_type, ranks) + chans.add(chan) + else: + if op.inst in send_op: + chan = Channel(src_buffer, dst_buffer, op.channel_type, op.dst.rank) + chans.add(chan) + elif op.inst in recv_op: + chan = Channel(src_buffer, dst_buffer, op.channel_type, op.src.rank) + chans.add(chan) tb.channels = list(chans) def remove_redundant_signal_wait(self): @@ -326,6 +385,27 @@ def _optimize_rrcs_rs(self): continue queue = queue[1:] + # glre(srcs, sbuf, si, _, _, _), gstore (_, _, _, dsts, dbuf, di) -> glres(srcs, sbuf, si, dsts, dbuf, di) + def _optimize_group_ops(self): + optimizer = InstructionOptimizer() + inst_types = [ + Instruction.group_load_reduce, + ] + for _, rank_tbs in enumerate(self.tbs): + for _, tb in rank_tbs.items(): + queue = list(tb.ops) + while len(queue) > 0: + op = queue[0] + fused = False + if op.inst in inst_types: + for next_op in op.next: + fused = optimizer.try_fuse_with_group_store(op, next_op, tb, queue) + if fused: + break + if fused: + continue + queue = queue[1:] + # merge ops which are independent of other operations and no other operations in between # get(src, sbuf. 
si, dst, dbuf, di) get(src, sbuf, si, dst, dbuf, di) -> get(list[src,sbuf,si], list[dst,dbuf,di])
     # put(src, sbuf, si, dst, dbuf, di) put(src, sbuf, si, dst, dbuf, di) -> put(list[src,sbuf,si], list[dst,dbuf,di])
@@ -372,6 +452,7 @@ def optimize(self):
         self._fuse_instructions_using_proxy_channel()
         self._fuse_same_instructions()
         self._optimize_rrcs_rs()
+        self._optimize_group_ops()
         self._compact_instructions()
 
     def replicate(self, instances: int, replication_policy: ReplicationPolicy):
diff --git a/msccl/language/mscclpp/instruction_optimizer.py b/msccl/language/mscclpp/instruction_optimizer.py
index f36303b..14ddf1a 100644
--- a/msccl/language/mscclpp/instruction_optimizer.py
+++ b/msccl/language/mscclpp/instruction_optimizer.py
@@ -187,6 +187,35 @@ def try_fuse_instructions_using_proxy_channel(
             return True
         return False
 
+    def try_fuse_with_group_store(self, op: Op, next_op: Op, tb: Threadblock, queue: list) -> bool:
+        """
+        Attempts to fuse 'group_load_reduce' operations with 'group_store' operations.
+        :param op: The current operation.
+        :param next_op: The next operation to potentially merge with.
+        :param tb: The thread block containing the operations.
+        :param queue: The queue of operations.
+        :return: True if operations are merged, False otherwise.
+        """
+        if (
+            next_op.inst == Instruction.group_store
+            and same_count(op, next_op)
+            and buf_dst_src_match(op, next_op)
+            and same_chan_type(op, next_op)
+            and not circular_dep_after_merge(op, next_op)
+            and all_prevs_visited_after_merge(op, next_op)
+        ):
+            # Fuse into a group_load_reduce_store and append the destination chunks from next_op
+            op.inst = Instruction.group_load_reduce_store
+            op.src = next_op.src
+            for dst in next_op.dsts:
+                op.dsts.append(dst)
+            # Merge operations
+            merge_op(op, next_op)
+            tb.ops.remove(next_op)
+            queue.remove(next_op)
+            return True
+        return False
+
     def try_remove_op(self, pending_remove_op: Op, condition: bool) -> bool:
         if condition:
             remove_op(pending_remove_op)
diff --git a/msccl/language/mscclpp/ir.py b/msccl/language/mscclpp/ir.py
index f5ba0fd..3eaa651 100644
--- a/msccl/language/mscclpp/ir.py
+++ b/msccl/language/mscclpp/ir.py
@@ -20,6 +20,8 @@
     Instruction.reduce_packet,
     Instruction.reduce_send,
     Instruction.reduce_send_packet,
+    Instruction.group_load_reduce_store,
+    Instruction.group_store,
 }
 _local_dst_insts_mscclpp: set = {
     Instruction.get,
@@ -33,6 +35,8 @@
     Instruction.reduce_send,
     Instruction.reduce_packet,
     Instruction.reduce_send_packet,
+    Instruction.group_load_reduce_store,
+    Instruction.group_load_reduce,
 }
 
 _insts_no_need_sync_barrier: set = {
@@ -148,20 +152,32 @@ def dump_to_json(program: Program):
 
     def get_channel_ids(chunk_list, tb_channel_dict, src_buffer, dst_buffer, chan_type):
         channel_ids = []
-        for c in chunk_list:
-            key = (src_buffer, dst_buffer, chan_type)
+        key = (src_buffer, dst_buffer, chan_type)
+        if chan_type == ChannelType.nvls:
+            ranks = []
+            for c in chunk_list:
+                ranks.append(c.rank)
             channel_ids.extend(
-                [
-                    {"id": id, "off": c.index}
-                    for id, ele in enumerate(tb_channel_dict[key]["connectedTo"])
-                    if ele == c.rank
-                ]
+                [{"id": id} for id, ele in enumerate(tb_channel_dict[key]["connectedTo"]) if set(ele) == set(ranks)]
             )
+        else:
+            for c in chunk_list:
+                channel_ids.extend(
+                    [
+                        {"id": id, "off": c.index}
+                        for id, ele in enumerate(tb_channel_dict[key]["connectedTo"])
+                        if ele == c.rank
+                    ]
+                )
         return channel_ids
 
     def remove_empty_fields(d):
        return {k: v for k, v in d.items() if v not in [None, "", [], {}]}
 
+    max_scratch = max(gpu.scratch_chunks for gpu in program.gpus)
+    max_input = max(gpu.input_chunks for gpu 
in program.gpus) + max_output = max(gpu.output_chunks for gpu in program.gpus) + for id, gpu in enumerate(program.gpus): gpu_instance = { "id": id, @@ -179,9 +195,27 @@ def remove_empty_fields(d): "type": type.value, "connectedTo": [eles[1] for eles in channels], } + if type == ChannelType.nvls: + obj["connectedTo"] = [sorted(list(eles)) for eles in obj["connectedTo"]] gpu_instance["channels"].append(obj) gpu_instance["channels"] = list(filter(lambda x: x["type"] != "none", gpu_instance["channels"])) gpu_instance["channels"] = sorted(gpu_instance["channels"], key=lambda x: (x["srcbuff"], x["dstbuff"])) + + # render for GPU NVLS channels + for i, chan in enumerate(gpu_instance["channels"]): + if chan["type"] == "nvls": + buff = chan["srcbuff"] + buffer_size = ( + max_input + if buff == Buffer.input.value + else max_output if buff == Buffer.output.value else max_scratch + ) + gpu_instance["channels"][i] = { + "buff": chan["srcbuff"], + "type": chan["type"], + "rankGroups": [{"size": buffer_size, "ranks": ranks} for ranks in chan["connectedTo"]], + } + for tb in gpu.threadblocks: if tb.id < 0: continue @@ -282,6 +316,15 @@ def remove_empty_fields(d): ): src = op.src dst = op.dst + elif op.inst == Instruction.group_load_reduce_store: + src = op.src + dst = op.dst + src_channel_ids = get_channel_ids( + op.srcs, tb_channel_dict, op.src.buffer, op.dst.buffer, op.channel_type + ) + dst_channel_ids = get_channel_ids( + op.dsts, tb_channel_dict, op.src.buffer, op.dst.buffer, op.channel_type + ) if op.inst != Instruction.nop: instr = { "name": op.inst.value, diff --git a/msccl/language/types.py b/msccl/language/types.py index 9a8e676..ef2addd 100644 --- a/msccl/language/types.py +++ b/msccl/language/types.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from enum import Enum -from typing import Union +from typing import Union, List from msccl.language.buffer import Buffer @@ -124,6 +124,9 @@ class MscclppInstruction(Enum): wait = "wait" signal = "signal" flush = "flush" + group_store = "gstore" + group_load_reduce = "glre" + group_load_reduce_store = "glres" def __str__(self): return self.value @@ -144,6 +147,7 @@ class ChannelType(Enum): proxy = "proxy" sm = "sm" none = "none" + nvls = "nvls" def __str__(self): return self.value @@ -154,7 +158,12 @@ class Channel: srcBuffer: Buffer dstBuffer: Buffer type: ChannelType - connected_to: int + connected_to: Union[int, List[int]] + + def __hash__(self): + # Ensure connected_to is converted to a tuple if it's a list + connected_to_hashable = tuple(self.connected_to) if isinstance(self.connected_to, list) else self.connected_to + return hash((self.srcBuffer, self.dstBuffer, self.type, connected_to_hashable)) @dataclass
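
Usage note (editorial, not part of the patch): the sketch below shows one way the new keyword-style constructor arguments added in collectives.py could be used when building a collective for an NVLS program. The values are illustrative assumptions (8 GPUs treated as two 4-GPU nodes); num_ranks_per_node defaults to the total rank count, and AllReduce already defaults num_chunk_groups to num_ranks, so both keywords are optional.

# Minimal sketch, assuming an 8-GPU, two-node layout (values are illustrative only).
from msccl.language.collectives import AllReduce

num_gpus = 8
collective = AllReduce(
    num_gpus,              # num_ranks
    num_gpus,              # chunk_factor, as in the allreduce_nvls example above
    True,                  # inplace
    num_ranks_per_node=4,  # group_load_reduce/group_store require chunks on the same node
    num_chunk_groups=num_gpus,
)

The bundled example can be driven like the other mscclpp examples by passing the GPU count and instance count as positional arguments (for example, python examples/mscclang/mscclpp/allreduce_nvls.py 8 1), after which Json() emits the execution plan and Check() verifies the collective.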