From 215ec98a51d3efb8c65b9ad5230bf4c9ee1c5012 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 18 May 2021 11:47:04 -0700 Subject: [PATCH 001/135] Initial commit --- .github/workflows/codeql.yml | 30 + .github/workflows/tests.yaml | 27 + .gitignore | 133 ++++ CODE_OF_CONDUCT.md | 9 + CONTRIBUTING.md | 14 + LICENSE | 21 + README.md | 78 +++ SECURITY.md | 41 ++ SUPPORT.md | 25 + examples/unpermute_dgx1.py | 14 + pytest.ini | 2 + requirements.txt | 8 + sccl/__init__.py | 2 + sccl/__main__.py | 36 ++ sccl/algorithm.py | 137 ++++ sccl/cli/__init__.py | 7 + sccl/cli/analyze.py | 47 ++ sccl/cli/common.py | 188 ++++++ sccl/cli/distribute.py | 101 +++ sccl/cli/known_collectives.py | 64 ++ sccl/cli/known_distributed_topologies.py | 30 + sccl/cli/known_topologies.py | 70 ++ sccl/cli/known_transformers.py | 19 + sccl/cli/ncclize.py | 43 ++ sccl/cli/solve.py | 92 +++ sccl/collectives.py | 154 +++++ sccl/distributors/__init__.py | 6 + sccl/distributors/alltoall_subproblem.py | 223 +++++++ sccl/distributors/gather_scatter_alltoall.py | 181 ++++++ sccl/distributors/greedy_alltoall.py | 177 ++++++ sccl/instance.py | 39 ++ sccl/isomorphisms.py | 87 +++ sccl/ncclize.py | 633 +++++++++++++++++++ sccl/ncd_reduction.py | 69 ++ sccl/path_encoding.py | 222 +++++++ sccl/rounds_bound.py | 76 +++ sccl/serialization.py | 107 ++++ sccl/steps_bound.py | 44 ++ sccl/strategies.py | 159 +++++ sccl/topologies/__init__.py | 8 + sccl/topologies/amd.py | 26 + sccl/topologies/distributed.py | 41 ++ sccl/topologies/generic.py | 52 ++ sccl/topologies/nvidia.py | 74 +++ sccl/topologies/topology.py | 41 ++ sccl/topologies/transformers.py | 24 + setup.py | 22 + tests/__init__.py | 2 + tests/common.py | 12 + tests/test_algorithm.py | 20 + tests/test_analyses.py | 12 + tests/test_cli.py | 128 ++++ tests/test_distributors.py | 30 + tests/test_path_encoding.py | 49 ++ tests/test_serialization.py | 24 + tests/test_topologies.py | 50 ++ 56 files changed, 4030 insertions(+) create mode 100644 .github/workflows/codeql.yml create mode 100644 .github/workflows/tests.yaml create mode 100644 .gitignore create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 SECURITY.md create mode 100644 SUPPORT.md create mode 100644 examples/unpermute_dgx1.py create mode 100644 pytest.ini create mode 100644 requirements.txt create mode 100644 sccl/__init__.py create mode 100644 sccl/__main__.py create mode 100644 sccl/algorithm.py create mode 100644 sccl/cli/__init__.py create mode 100644 sccl/cli/analyze.py create mode 100644 sccl/cli/common.py create mode 100644 sccl/cli/distribute.py create mode 100644 sccl/cli/known_collectives.py create mode 100644 sccl/cli/known_distributed_topologies.py create mode 100644 sccl/cli/known_topologies.py create mode 100644 sccl/cli/known_transformers.py create mode 100644 sccl/cli/ncclize.py create mode 100644 sccl/cli/solve.py create mode 100644 sccl/collectives.py create mode 100644 sccl/distributors/__init__.py create mode 100644 sccl/distributors/alltoall_subproblem.py create mode 100644 sccl/distributors/gather_scatter_alltoall.py create mode 100644 sccl/distributors/greedy_alltoall.py create mode 100644 sccl/instance.py create mode 100644 sccl/isomorphisms.py create mode 100644 sccl/ncclize.py create mode 100644 sccl/ncd_reduction.py create mode 100644 sccl/path_encoding.py create mode 100644 sccl/rounds_bound.py create mode 100644 sccl/serialization.py create mode 100644 sccl/steps_bound.py create mode 100644 
sccl/strategies.py create mode 100644 sccl/topologies/__init__.py create mode 100644 sccl/topologies/amd.py create mode 100644 sccl/topologies/distributed.py create mode 100644 sccl/topologies/generic.py create mode 100644 sccl/topologies/nvidia.py create mode 100644 sccl/topologies/topology.py create mode 100644 sccl/topologies/transformers.py create mode 100644 setup.py create mode 100644 tests/__init__.py create mode 100644 tests/common.py create mode 100644 tests/test_algorithm.py create mode 100644 tests/test_analyses.py create mode 100644 tests/test_cli.py create mode 100644 tests/test_distributors.py create mode 100644 tests/test_path_encoding.py create mode 100644 tests/test_serialization.py create mode 100644 tests/test_topologies.py diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..12a15f0 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,30 @@ +name: "CodeQL" + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + schedule: + - cron: '16 14 * * 2' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v1 + with: + languages: python + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v1 diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml new file mode 100644 index 0000000..971e0d0 --- /dev/null +++ b/.github/workflows/tests.yaml @@ -0,0 +1,27 @@ +name: Tests + +on: [push] + +jobs: + test: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + + name: Test with Python ${{ matrix.python-version }} + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install sccl and dependencies + run: | + pip install --upgrade pip + pip install -r requirements.txt + - name: Run tests and check at least 90% coverage + run: | + pytest diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7bc2779 --- /dev/null +++ b/.gitignore @@ -0,0 +1,133 @@ +# SCCL specific +*.sccl.json +*.sccl.xml + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..f9ba8cf --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,9 @@ +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..c282e9a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,14 @@ +# Contributing + +This project welcomes contributions and suggestions. Most contributions require you to +agree to a Contributor License Agreement (CLA) declaring that you have the right to, +and actually do, grant us the rights to use your contribution. For details, visit +https://cla.microsoft.com. + +When you submit a pull request, a CLA-bot will automatically determine whether you need +to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the +instructions provided by the bot. You will only need to do this once across all repositories using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9e841e7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. 
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c17eee4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,78 @@
+# SCCL
+
+The Synthesized Collective Communication Library is a tool for synthesizing collective algorithms tailored to a particular hardware topology.
+
+## Installation
+
+To install:
+```
+pip install .
+```
+This installs the Python package and the `sccl` command line tool.
+
+To enable Bash completion for `sccl`:
+```
+echo 'eval "$(register-python-argcomplete sccl)"' >> ~/.bashrc
+```
+
+## Usage
+
+At its core SCCL answers synthesis queries: is there an algorithm for a given *topology* that implements a given *collective* in a given number of steps, bandwidth usage, memory limits, etc.? These additional parameters are called the *instance*.
+
+For example, to synthesize an Allgather algorithm for an [NVIDIA DGX-1](https://www.nvidia.com/en-us/data-center/dgx-1/) that completes in 4 steps:
+```
+$ sccl solve instance DGX1 Allgather --steps 4
+Solving instance steps=4... synthesized! (0.7s)
+Wrote to Allgather.n8-DGX1-steps4.sccl.json
+```
+The instance is satisfiable and `sccl` saves it to a file.
+
+Four steps is not necessarily the least number of steps required. To find the least steps:
+```
+$ sccl solve least-steps DGX1 Allgather
+Algorithms need at least 2 steps.
+Solving instance steps=2... synthesized! (0.2s)
+Wrote to Allgather.n8-DGX1-steps2.sccl.json
+```
+The `least-steps` strategy statically determines that any Allgather in a DGX-1 requires at least 2 steps and, starting from that, finds the smallest satisfiable number of steps.
+
+While this two-step algorithm is latency-optimal, there may be other algorithms that achieve higher bandwidth. The `pareto-optimal` strategy searches through different latency-bandwidth tradeoffs:
+```
+$ sccl solve pareto-optimal DGX1 Allgather
+Algorithms need at least 2 steps.
+Algorithms need at least 7/6 rounds per chunk.
+Solving instance steps=2... synthesized! (0.5s)
+Solving instance steps=2,rounds=3,chunks=2... synthesized! (0.9s)
+Solving instance steps=2,rounds=4,chunks=3... unsatisfiable. (1.1s)
+Assuming 2 step algorithms need at least 4/3 rounds per chunk.
+Solving instance steps=3,rounds=4,chunks=3... synthesized! (2.9s)
+Solving instance steps=3,rounds=5,chunks=4... synthesized! (6.5s)
+Solving instance steps=3,rounds=6,chunks=5... synthesized! (44.0s)
+Solving instance steps=3,rounds=7,chunks=6... synthesized! (56.1s)
+Bandwidth optimal algorithm found!
+Found 2 Pareto optimal algorithms. Pruned 4 non-optimal algorithms.
+Wrote to Allgather.n8-DGX1-steps2.rounds3.chunks2.sccl.json
+Wrote to Allgather.n8-DGX1-steps3.rounds7.chunks6.sccl.json
+```
+
+## Contributing
+
+This project welcomes contributions and suggestions. Most contributions require you to agree to a
+Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
+the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
+
+When you submit a pull request, a CLA bot will automatically determine whether you need to provide
+a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
+provided by the bot. You will only need to do this once across all repos using our CLA.
+
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
+contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
+
+## Trademarks
+
+This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
+trademarks or logos is subject to and must follow
+[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
+Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
+Any use of third-party trademarks or logos is subject to those third parties' policies.
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000..f7b8998
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,41 @@
+
+
+## Security
+
+Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
+
+If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
+
+## Reporting Security Issues
+
+**Please do not report security vulnerabilities through public GitHub issues.**
+
+Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
+
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
+
+You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
+ +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). + + \ No newline at end of file diff --git a/SUPPORT.md b/SUPPORT.md new file mode 100644 index 0000000..dc72f0e --- /dev/null +++ b/SUPPORT.md @@ -0,0 +1,25 @@ +# TODO: The maintainer of this repo has not yet edited this file + +**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? + +- **No CSS support:** Fill out this template with information about how to file issues and get help. +- **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). +- **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. + +*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* + +# Support + +## How to file issues and get help + +This project uses GitHub Issues to track bugs and feature requests. Please search the existing +issues before filing new issues to avoid duplicates. For new issues, file your bug or +feature request as a new Issue. + +For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE +FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER +CHANNEL. WHERE WILL YOU HELP PEOPLE?**. + +## Microsoft Support Policy + +Support for this **PROJECT or PRODUCT** is limited to the resources listed above. diff --git a/examples/unpermute_dgx1.py b/examples/unpermute_dgx1.py new file mode 100644 index 0000000..2d87a11 --- /dev/null +++ b/examples/unpermute_dgx1.py @@ -0,0 +1,14 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# This script shows how to use SCCL to find a way to permute the nodes of a DGX1 to match the default order. 
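+#
+# A sketch of intended use (hypothetical; assumes the repository root is on
+# sys.path so this file is importable):
+#
+#   from examples.unpermute_dgx1 import solve_dgx1_permutation
+#   print('Permutation matching the reference DGX-1:', solve_dgx1_permutation())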
+ +from sccl.topologies import * +from sccl.isomorphisms import find_isomorphisms + +def solve_dgx1_permutation(): + local = nvlink_only() + isomorphisms = find_isomorphisms(dgx1(), local, limit=1) + if len(isomorphisms) == 0: + raise RuntimeError('No isomorphism to DGX1 found') + return isomorphisms[0].nodes diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..cb4173f --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --cov=sccl --cov-report term-missing:skip-covered --cov-fail-under 90 -n auto diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..aa95aa6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +dataclasses; python_version < "3.7" +z3-solver +argcomplete +lxml +pytest +pytest-cov +pytest-xdist +-e . diff --git a/sccl/__init__.py b/sccl/__init__.py new file mode 100644 index 0000000..7f3fd83 --- /dev/null +++ b/sccl/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. \ No newline at end of file diff --git a/sccl/__main__.py b/sccl/__main__.py new file mode 100644 index 0000000..f97f616 --- /dev/null +++ b/sccl/__main__.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +# PYTHON_ARGCOMPLETE_OK + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sccl.collectives as collectives +import sccl.topologies as topologies +import sccl.strategies as strategies +from sccl.cli import * + +import argparse +import argcomplete +import sys + +def main(): + parser = argparse.ArgumentParser('sccl') + + cmd_parsers = parser.add_subparsers(title='command', dest='command') + cmd_parsers.required = True + + handlers = [] + handlers.append(make_solvers(cmd_parsers)) + handlers.append(make_distributors(cmd_parsers)) + handlers.append(make_analyses(cmd_parsers)) + handlers.append(make_handle_ncclize(cmd_parsers)) + + argcomplete.autocomplete(parser) + args = parser.parse_args() + + for handler in handlers: + if handler(args, args.command): + break + +if __name__ == '__main__': + main() diff --git a/sccl/algorithm.py b/sccl/algorithm.py new file mode 100644 index 0000000..69c4191 --- /dev/null +++ b/sccl/algorithm.py @@ -0,0 +1,137 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
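+
+# An Algorithm is a sequence of Steps. Each Step is a set of concurrent sends,
+# given as (address, source, destination) triples, that together may use up to
+# `rounds` rounds of bandwidth on each link. As a minimal sketch (assuming a
+# 2-node Allgather where address 0 starts on rank 0 and address 1 on rank 1),
+# a single exchange step would be:
+#
+#   steps = [Step(rounds=1, sends=[(0, 0, 1), (1, 1, 0)])]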
+ +from dataclasses import dataclass +from collections import defaultdict + +@dataclass +class Step(object): + rounds: int + sends: list + +class Algorithm(object): + def __init__(self, name, collective, topology, instance, steps, input_map = {}, output_map = {}): + self.name = name + self.topology = topology + self.collective = collective + self.instance = instance + self.steps = steps + self.input_map = input_map + self.output_map = output_map + + self._update_link_utilizations() + self._check_bandwidth_constraints() + + for step in self.steps: + step.sends.sort() + + @classmethod + def make_implementation(cls, collective, topology, instance, steps): + chunked = collective.chunk_up(instance.chunks) + + # Figure out input and output addresses + input_map = {} + output_map = {} + for rank in chunked.ranks(): + input_addrs = set() + output_addrs = set() + for chunk in chunked.chunks(): + # An address is an input address if any of its chunks is in the precondition + if chunked.precondition(rank, chunk): + input_addrs.add(chunked.address(chunk)) + # An address is an output address if any of its chunks is in the postcondition + if chunked.postcondition(rank, chunk): + output_addrs.add(chunked.address(chunk)) + if len(input_addrs) > 0: + input_map[rank] = input_addrs + if len(output_addrs) > 0: + output_map[rank] = output_addrs + + # Concatenate collective and topology names plus instance arguments to create a name + name = f'{collective.name}-{topology.name}-{instance}' + + algo = cls(name, collective, topology, instance, steps, input_map, output_map) + algo.check_implements(chunked) + if instance.extra_rounds > 0: + used_extra_rounds = algo.extra_rounds() + if used_extra_rounds > instance.extra_rounds: + raise ValueError(f'steps use {used_extra_rounds} extra rounds but only {instance.extra_rounds} were allowed') + return algo + + def ranks(self): + return range(self.topology.num_nodes()) + + def num_steps(self): + return len(self.steps) + + def extra_rounds(self): + rounds = 0 + for step in self.steps: + rounds += step.rounds + return rounds - self.num_steps() + + def is_pipelined(self): + return self.instance.pipeline != None + + def check_implements(self, collective): + if self.topology.num_nodes() != collective.num_nodes: + raise RuntimeError('topology and collective have different number of nodes') + # Find which chunks will be sent from an address + chunks_at_address = defaultdict(list) + for chunk in collective.chunks(): + chunks_at_address[collective.address(chunk)].append(chunk) + # State records if a rank holds a chunk + def idx(rank, chunk): + return rank * collective.num_chunks + chunk + state = [False] * (collective.num_nodes * collective.num_chunks) + # Initialize state from precondition + for rank in collective.ranks(): + for chunk in collective.chunks(): + state[idx(rank, chunk)] = collective.precondition(rank, chunk) + # Propagate state through sends of every step + for step in self.steps: + next_state = state.copy() + for addr, src, dst in step.sends: + for chunk in chunks_at_address[addr]: + next_state[idx(dst, chunk)] |= state[idx(src, chunk)] + state = next_state + # Check that the postcondition holds + for rank in collective.ranks(): + for chunk in collective.chunks(): + if collective.postcondition(rank, chunk) and not state[idx(rank, chunk)]: + raise RuntimeError(f'rank {rank} does not get chunk {chunk} as required by the postcondition') + + def _update_link_utilizations(self): + self._link_utilizations = [] + ranks = range(self.topology.num_nodes()) + for step in 
self.steps: + step_utilizations = [[0 for _ in ranks] for _ in ranks] + for addr, src, dst in step.sends: + step_utilizations[dst][src] += 1 # Same order as topology + self._link_utilizations.append(step_utilizations) + + def _check_bandwidth_constraints(self): + for srcs, dsts, bw, name in self.topology.bandwidth_constraints(): + for step_num, step in enumerate(self.steps): + util = 0 + for dst in dsts: + for src in srcs: + if self.is_pipelined(): + for overlapping_step in range(step_num, len(self.steps), self.instance.pipeline): + util += self._link_utilizations[overlapping_step][dst][src] + else: + util += self._link_utilizations[step_num][dst][src] + assert util <= bw * step.rounds, \ + f'Step {step_num} uses {util} bandwidth but constraint {name} only allows for {bw * step.rounds} bandwidth (when rounds={step.rounds}).' + + def __str__(self): + s = '' + for i, step in enumerate(self.steps): + if i != 0: + s += '\n' + if step.rounds > 1: + s += f'(step {i+1}, rounds={step.rounds}) ' + else: + s += f'(step {i+1}) ' + s += ', '.join([f'{chunk}:{src}→{dst}' for chunk, src, dst in step.sends]) + return s diff --git a/sccl/cli/__init__.py b/sccl/cli/__init__.py new file mode 100644 index 0000000..db5cbfc --- /dev/null +++ b/sccl/cli/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from .solve import * +from .distribute import * +from .analyze import * +from .ncclize import * diff --git a/sccl/cli/analyze.py b/sccl/cli/analyze.py new file mode 100644 index 0000000..6c1e3cc --- /dev/null +++ b/sccl/cli/analyze.py @@ -0,0 +1,47 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from .known_topologies import KnownTopologies +from .known_collectives import KnownCollectives +from .common import * +from sccl.rounds_bound import lower_bound_rounds +from sccl.isomorphisms import find_isomorphisms + +def make_analyses(cmd_parsers): + handler_funcs = [] + handler_funcs.append(make_handle_bound_rounds) + handler_funcs.append(make_handle_find_isomorphisms) + + return make_cmd_category(cmd_parsers, 'analyze', 'analysis', handler_funcs) + +def make_handle_bound_rounds(cmd_parsers): + cmd = cmd_parsers.add_parser('rounds') + topologies = KnownTopologies(cmd) + collectives = KnownCollectives(cmd) + + def handle(args, command): + if command != 'rounds': + return False + + topology = topologies.create(args) + collective = collectives.create(args, topology.num_nodes()) + lower_bound_rounds(topology, collective, logging=True) + return True + + return handle + +def make_handle_find_isomorphisms(cmd_parsers): + cmd = cmd_parsers.add_parser('isomorphisms') + topologies1 = KnownTopologies(cmd, tag='1') + topologies2 = KnownTopologies(cmd, tag='2') + + def handle(args, command): + if command != 'isomorphisms': + return False + + topology1 = topologies1.create(args) + topology2 = topologies2.create(args) + isomorphisms = find_isomorphisms(topology1, topology2, logging=True) + return True + + return handle diff --git a/sccl/cli/common.py b/sccl/cli/common.py new file mode 100644 index 0000000..12110e3 --- /dev/null +++ b/sccl/cli/common.py @@ -0,0 +1,188 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
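+
+# Shared argparse plumbing for the CLI. Each add_* helper registers a group of
+# arguments on a (sub)parser and returns callbacks to run after parsing. A
+# typical pattern (a sketch; 'example' is a hypothetical subcommand) is:
+#
+#   cmd = cmd_parsers.add_parser('example')
+#   validate_output_args, output_handler = add_output_algorithm(cmd)
+#   ...
+#   validate_output_args(args)       # after parse_args()
+#   output_handler(args, algorithm)  # writes the result or prints it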
+
+from sccl.serialization import *
+from sccl.instance import *
+from pathlib import Path
+import sys
+import re
+from fractions import Fraction
+
+def _legalize_sccl_name(name):
+    name = name.replace('(', '.')
+    name = name.replace('=', '')
+    name = name.replace(',', '.')
+    name = name.replace(')', '')
+    return name
+
+def name_sccl_object(name, ending='sccl.json'):
+    return f'{_legalize_sccl_name(name)}.{ending}'
+
+def _validate_output_directory(directory):
+    if not directory.exists():
+        print('error: output directory does not exist', file=sys.stderr)
+        exit(1)
+    if not directory.is_dir():
+        print('error: output path is not a directory', file=sys.stderr)
+        exit(1)
+
+def _handle_write_to_directory(directory, force, get_contents, preferred_file_name):
+    output_file = directory / preferred_file_name
+    if output_file.exists():
+        if output_file.is_dir():
+            print(f'error: output path is a directory', file=sys.stderr)
+            exit(1)
+        if force:
+            print(f'Overwriting {output_file}')
+        else:
+            print(f'file already exists, use -f/--force to overwrite {output_file}', file=sys.stderr)
+            return False
+    with output_file.open('w') as f:
+        f.write(get_contents())
+    print(f'Wrote to {output_file}')
+    return True
+
+def add_output_file(parser):
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('-o', '--output', type=Path, help='file to write synthesized algorithm to', metavar='FILE')
+    group.add_argument('-d', '--directory', type=Path, default=Path(), help='directory to write the synthesized algorithm to', metavar='DIR')
+    parser.add_argument('-f', '--force', action='store_true', help='overwrite existing files')
+    parser.add_argument('--no-save', action='store_true', help='do not save to file')
+
+    def validate_args(args):
+        if args.output != None:
+            if args.output.is_dir():
+                print(f'error: output path is a directory, did you mean to use -d?', file=sys.stderr)
+                exit(1)
+        if args.directory != None:
+            _validate_output_directory(args.directory)
+
+    def handle(args, get_contents, preferred_file_name):
+        if args.no_save:
+            return False
+        if args.output != None:
+            if args.output.exists() and not args.force:
+                print(f'file already exists, use -f/--force to overwrite {args.output}', file=sys.stderr)
+                return False
+            with args.output.open('w') as f:
+                f.write(get_contents())
+            print(f'Wrote to {args.output}')
+        else:
+            return _handle_write_to_directory(args.directory, args.force, get_contents, preferred_file_name)
+        return True
+
+    return validate_args, handle
+
+def add_output_algorithm(parser):
+    validate_args, handle_file = add_output_file(parser)
+
+    def handle(args, algorithm):
+        if algorithm == None:
+            return # Strategies/distributors have their specific failure prints
+
+        handled = handle_file(args, lambda: SCCLEncoder().encode(algorithm), name_sccl_object(algorithm.name))
+        if not handled:
+            print(f'\n{algorithm.name} algorithm:')
+            print(algorithm)
+
+    return validate_args, handle
+
+def add_output_topology(parser):
+    validate_args, handle_file = add_output_file(parser)
+
+    def handle(args, topology):
+        handled = handle_file(args, lambda: SCCLEncoder().encode(topology), name_sccl_object(topology.name))
+
+    return validate_args, handle
+
+def add_output_sccl_objects(parser):
+    parser.add_argument('-d', '--directory', type=Path, default=Path(), help='directory to write outputs to', metavar='DIR')
+    parser.add_argument('-f', '--force', action='store_true', help='overwrite existing files')
+    parser.add_argument('--no-save', action='store_true', help='do not save to file')
+
+    def validate_args(args):
+        _validate_output_directory(args.directory)
+
+    def handle(args, sccl_object, name):
+        if not args.no_save:
+            _handle_write_to_directory(args.directory, args.force, lambda: SCCLEncoder().encode(sccl_object), name_sccl_object(name))
+
+    return validate_args, handle
+
+def add_input_algorithm(parser, multiple=False, name='algorithm'):
+    parser.add_argument(name, type=Path, nargs='+' if multiple else 1, help=f'algorithm to operate on')
+
+    def read_algorithm(args):
+        algos = []
+        for input_file in vars(args)[name]:
+            if not input_file.exists():
+                print(f'error: input file not found: {input_file}', file=sys.stderr)
+                exit(1)
+
+            algo = load_sccl_object(input_file)
+            algos.append(algo)
+        if multiple:
+            return algos
+        else:
+            return algos[0]
+
+    return read_algorithm
+
+def add_instance(parser, take_steps=True, take_rounds=True, take_chunks=True):
+    if take_steps:
+        parser.add_argument('-s', '--steps', type=int, required=True)
+    if take_rounds:
+        parser.add_argument('-r', '--rounds', type=int, default=None, metavar='N')
+    if take_chunks:
+        parser.add_argument('-c', '--chunks', type=int, default=1, metavar='N')
+    parser.add_argument('--pipeline', type=int, default=None, metavar='N')
+    parser.add_argument('--extra-memory', type=int, default=None, metavar='N')
+    parser.add_argument('--allow-exchange', action='store_true')
+
+    def handle(args):
+        if take_rounds:
+            if args.rounds != None:
+                if args.rounds < args.steps:
+                    parser.error(f'error: rounds cannot be less than steps ({args.rounds} < {args.steps})')
+                extra_rounds = args.rounds - args.steps
+            else:
+                extra_rounds = 0
+        return Instance(
+            steps=args.steps if take_steps else None,
+            extra_rounds=extra_rounds if take_rounds else 0,
+            chunks=args.chunks if take_chunks else 1,
+            pipeline=args.pipeline,
+            extra_memory=args.extra_memory,
+            allow_exchange=args.allow_exchange)
+
+    return handle
+
+def parse_fraction(value):
+    try:
+        return int(value)
+    except ValueError:
+        m = re.fullmatch('(.+)/(.+)', value)
+        if m == None:
+            raise ValueError('value must be in format "<numerator>/<denominator>"')
+        numerator = int(m.group(1))
+        denominator = int(m.group(2))
+        return Fraction(numerator, denominator)
+
+def make_cmd_category(cmd_parsers, name, title, handler_funcs):
+    cmd = cmd_parsers.add_parser(name)
+    category_parsers = cmd.add_subparsers(title=title, dest=title)
+    category_parsers.required = True
+
+    handlers = []
+    for func in handler_funcs:
+        handlers.append(func(category_parsers))
+
+    def handle(args, command):
+        if command != name:
+            return False
+
+        for handler in handlers:
+            if handler(args, vars(args)[title]):
+                return True
+
+    return handle
diff --git a/sccl/cli/distribute.py b/sccl/cli/distribute.py
new file mode 100644
index 0000000..aa24179
--- /dev/null
+++ b/sccl/cli/distribute.py
@@ -0,0 +1,101 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
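+
+# CLI front-ends for the distributors in sccl.distributors. The two subproblem
+# commands are meant to be used together; a sketch of the workflow (file names
+# depend on the topology and instance chosen) is:
+#
+#   sccl distribute alltoall-create-subproblem DGX1 --copies 2
+#   sccl solve instance custom custom --topology-file <topology>.sccl.json \
+#       --collective-file <collective>.sccl.json --steps <N>
+#   sccl distribute alltoall-stitch-subproblem <algorithm>.sccl.json --copies 2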
+ +from sccl.distributors import * +from .known_distributed_topologies import KnownDistributedTopologies +from .known_topologies import KnownTopologies +from .common import * + +def make_distributors(cmd_parsers): + handler_funcs = [] + handler_funcs.append(make_handle_greedy_alltoall) + handler_funcs.append(make_handle_gather_scatter_alltoall) + handler_funcs.append(make_handle_create_subproblem_distributed_alltoall) + handler_funcs.append(make_handle_distribute_alltoall_stitch_subproblem) + + return make_cmd_category(cmd_parsers, 'distribute', 'distributor', handler_funcs) + +def make_handle_greedy_alltoall(cmd_parsers): + name = 'alltoall-greedy' + cmd = cmd_parsers.add_parser(name) + read_algorithm = add_input_algorithm(cmd) + distributed_topologies = KnownDistributedTopologies(cmd) + validate_output_args, output_handler = add_output_algorithm(cmd) + + def handle(args, command): + if command != name: + return False + + input_algorithm = read_algorithm(args) + validate_output_args(args) + topology = distributed_topologies.create(args, input_algorithm.topology) + algo = synthesize_greedy_distributed_alltoall(topology, input_algorithm, logging=True) + output_handler(args, algo) + return True + + return handle + +def make_handle_gather_scatter_alltoall(cmd_parsers): + name = 'alltoall-gather-scatter' + cmd = cmd_parsers.add_parser(name) + read_gather_algorithm = add_input_algorithm(cmd, name='gather') + read_scatter_algorithm = add_input_algorithm(cmd, name='scatter') + cmd.add_argument('--copies', type=int, metavar='N', required=True, help='copies of the local topology to be made') + validate_output_args, output_handler = add_output_algorithm(cmd) + + def handle(args, command): + if command != name: + return False + + gather_algorithm = read_gather_algorithm(args) + scatter_algorithm = read_scatter_algorithm(args) + validate_output_args(args) + algo = synthesize_gather_scatter_distributed_alltoall(args.copies, gather_algorithm, scatter_algorithm, logging=True) + output_handler(args, algo) + return True + + return handle + +def make_handle_create_subproblem_distributed_alltoall(cmd_parsers): + name = 'alltoall-create-subproblem' + cmd = cmd_parsers.add_parser(name) + topologies = KnownTopologies(cmd) + cmd.add_argument('--copies', type=int, metavar='N', required=True, help='copies of the local topology to be made') + cmd.add_argument('--relay-nodes', type=int, nargs='+', default=[0], help='relay nodes') + cmd.add_argument('-bw', '--remote-bandwidth', type=int, default=1, help='remote bandwidth', metavar='N') + cmd.add_argument('--share-bandwidth', action='store_true', help='share local bandwidth between relay nodes') + validate_output_args, output_handler = add_output_sccl_objects(cmd) + + def handle(args, command): + if command != name: + return False + + local_topology = topologies.create(args) + validate_output_args(args) + + collective, topology = make_alltoall_subproblem_collective_and_topology(local_topology, args.copies, args.relay_nodes, args.remote_bandwidth, args.share_bandwidth) + + output_handler(args, collective, collective.name) + output_handler(args, topology, topology.name) + return True + + return handle + +def make_handle_distribute_alltoall_stitch_subproblem(cmd_parsers): + name = 'alltoall-stitch-subproblem' + cmd = cmd_parsers.add_parser(name) + read_subproblem_algorithm = add_input_algorithm(cmd) + cmd.add_argument('--copies', type=int, metavar='N', required=True, help='copies of the local topology made for the subproblem') + validate_output_args, output_handler = 
add_output_algorithm(cmd) + + def handle(args, command): + if command != name: + return False + + subproblem_algorithm = read_subproblem_algorithm(args) + validate_output_args(args) + algo = synthesize_alltoall_subproblem(subproblem_algorithm, args.copies, logging=True) + output_handler(args, algo) + return True + + return handle \ No newline at end of file diff --git a/sccl/cli/known_collectives.py b/sccl/cli/known_collectives.py new file mode 100644 index 0000000..0ace187 --- /dev/null +++ b/sccl/cli/known_collectives.py @@ -0,0 +1,64 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sccl.collectives as collectives +from sccl.serialization import * +from pathlib import Path +import sys + +class KnownCollectives: + def __init__(self, parser): + self.parser = parser + self.constructors = { + 'Broadcast': self._rooted_coll(collectives.broadcast), + 'Reduce': self._rooted_coll(collectives.reduce), + 'Scatter': self._rooted_coll(collectives.scatter), + 'Gather': self._rooted_coll(collectives.gather), + 'Allgather': self._coll(collectives.allgather), + 'Allreduce': self._coll(collectives.allreduce), + 'Alltoall': self._coll(collectives.alltoall), + 'ReduceScatter': self._coll(collectives.reduce_scatter), + 'Scan': self._coll(collectives.scan), + 'MultirootBroadcast': self._multiroot_coll(collectives.multiroot_broadcast), + 'MultirootScatter': self._multiroot_coll(collectives.multiroot_scatter), + 'MultirootGather': self._multiroot_coll(collectives.multiroot_gather), + 'custom': self._custom_coll(), + } + self.parser.add_argument('collective', type=str, choices=self.constructors.keys(), help='collective') + self.parser.add_argument('--collective-file', type=Path, default=None, help='a serialized collective', metavar='FILE') + self.parser.add_argument('--root', type=int, default=0, help='used by rooted collectives', metavar='N') + self.parser.add_argument('--roots', type=int, nargs='+', default=[0], help='used by multi-rooted collectives', metavar='N') + + def create(self, args, num_nodes): + return self.constructors[args.collective](num_nodes, args) + + def _custom_coll(self): + def make(size, args): + input_file = args.collective_file + if input_file is None: + self.parser.error('--collective-file is required for custom collectives') + exit(1) + + if not input_file.exists(): + print(f'error: input file not found: {input_file}', file=sys.stderr) + exit(1) + + return load_sccl_object(input_file) + return make + + def _rooted_coll(self, fun): + def make(size, args): + root = args.root + return fun(size, root) + return make + + def _coll(self, fun): + def make(size, args): + return fun(size) + return make + + def _multiroot_coll(self, fun): + def make(size, args): + roots = args.roots + return fun(size, roots) + return make diff --git a/sccl/cli/known_distributed_topologies.py b/sccl/cli/known_distributed_topologies.py new file mode 100644 index 0000000..b4db6a3 --- /dev/null +++ b/sccl/cli/known_distributed_topologies.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
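+
+# Registers the distributed topology arguments on a CLI parser. Exactly one of
+# -n/--nodes or --copies must be given, and --nodes must be divisible by the
+# size of the local topology that the distributed topology replicates.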
+ +import sccl.topologies as topologies +import pathlib + +class KnownDistributedTopologies: + def __init__(self, parser): + self.parser = parser + self.constructors = { + 'DistributedFullyConnected': topologies.distributed_fully_connected, + 'DistributedHubAndSpoke': topologies.distributed_hub_and_spoke, + } + self.parser.add_argument('topology', type=str, choices=self.constructors.keys(), help='the distributed topology') + self.parser.add_argument('-n', '--nodes', type=int, help='total nodes in the distributed topology, must be divisible by local topology') + self.parser.add_argument('--copies', type=int, help='copies of the local topology to be made') + self.parser.add_argument('-bw', '--remote-bandwidth', type=int, default=1, help='bandwidth of links in the distributed topology', metavar='N') + + def create(self, args, local_topology): + if args.nodes != None and args.copies != None: + self.parser.error('please use only one of -n/--nodes, --copies') + if args.copies != None: + copies = args.copies + elif args.nodes != None: + if args.nodes % local_topology.num_nodes() != 0: + self.parser.error(f'total number of nodes must be divisible by the local number of nodes {local_topology.num_nodes()}, but {args.nodes} was given') + copies = args.nodes // local_topology.num_nodes() + else: + self.parser.error('one of the following arguments is required: --nodes, --copies') + return self.constructors[args.topology](local_topology, copies, args.remote_bandwidth) diff --git a/sccl/cli/known_topologies.py b/sccl/cli/known_topologies.py new file mode 100644 index 0000000..70ec79d --- /dev/null +++ b/sccl/cli/known_topologies.py @@ -0,0 +1,70 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sccl.topologies as topologies +from sccl.serialization import * +from .known_transformers import KnownTransformers +from pathlib import Path +import sys + +class KnownTopologies: + def __init__(self, parser, tag=''): + self.parser = parser + self.tag = tag + self.constructors = { + 'FullyConnected': self._sized_topo(topologies.fully_connected), + 'HubAndSpoke': self._sized_topo(topologies.hub_and_spoke), + 'Ring': self._sized_topo(topologies.ring), + 'Line': self._sized_topo(topologies.line), + 'Star': self._sized_topo(topologies.star), + 'AMD4': self._fixed_topo(topologies.amd4), + 'AMD8': self._fixed_topo(topologies.amd8), + 'DGX1': self._fixed_topo(topologies.dgx1), + 'DGX2': self._fixed_topo(lambda: topologies.hub_and_spoke(16)), + 'NVLinkOnly': self._fixed_topo(topologies.nvlink_only), + 'custom': self._custom_topo(), + } + self.parser.add_argument(f'topology{tag}', type=str, choices=self.constructors.keys(), help=f'topology {tag}') + self.parser.add_argument(f'--topology-file{tag}', type=Path, default=None, help=f'a serialized topology', metavar=f'FILE') + self.parser.add_argument(f'-n{tag}', f'--nodes{tag}', type=int, help='required for non-fixed topologies', metavar='N') + self.known_transformers = KnownTransformers(parser, tag=tag) + + def _topology(self, args): + return vars(args)[f'topology{self.tag}'] + + def _nodes(self, args): + return vars(args)[f'nodes{self.tag}'] + + def create(self, args): + topology = self.constructors[self._topology(args)](args) + topology = self.known_transformers.transform(args, topology) + return topology + + def _custom_topo(self): + def make(args): + input_file = vars(args)[f'topology_file{self.tag}'] + if input_file is None: + self.parser.error(f'--topology-file{self.tag} is required for custom topologies') + exit(1) + + if not 
input_file.exists(): + print(f'error: input file not found: {input_file}', file=sys.stderr) + exit(1) + + return load_sccl_object(input_file) + return make + + def _fixed_topo(self, Cls): + def make(args): + topo = Cls() + if self._nodes(args) != None and self._nodes(args) != topo.num_nodes(): + self.parser.error(f'fixed-size topology {self._topology(args)} has {topo.num_nodes()} nodes, but command line specified {self._nodes(args)} nodes') + return topo + return make + + def _sized_topo(self, Cls): + def make(args): + if self._nodes(args) == None: + self.parser.error(f'topology {self._topology(args)} requires -n/--nodes') + return Cls(self._nodes(args)) + return make diff --git a/sccl/cli/known_transformers.py b/sccl/cli/known_transformers.py new file mode 100644 index 0000000..48268d5 --- /dev/null +++ b/sccl/cli/known_transformers.py @@ -0,0 +1,19 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sccl.topologies as topologies + +class KnownTransformers: + def __init__(self, parser, tag=''): + self.parser = parser + self.tag = tag + self.transformers = { + 'reverse': topologies.reverse_topology, + 'binarize': topologies.binarize_topology, + } + self.parser.add_argument(f'-t{tag}', f'--transform{tag}', action='append', default=[], choices=self.transformers.keys(), help='apply a topology transformer. may be used multiple times') + + def transform(self, args, topology): + for key in vars(args)[f'transform{self.tag}']: + topology = self.transformers[key](topology) + return topology diff --git a/sccl/cli/ncclize.py b/sccl/cli/ncclize.py new file mode 100644 index 0000000..733a4d6 --- /dev/null +++ b/sccl/cli/ncclize.py @@ -0,0 +1,43 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from sccl.ncclize import * +from .common import * + +def make_handle_ncclize(cmd_parsers): + cmd = cmd_parsers.add_parser('ncclize') + read_algorithm = add_input_algorithm(cmd, multiple=True) + validate_output_args, output_handler = add_output_file(cmd) + remap_scratch_grp = cmd.add_mutually_exclusive_group() + remap_scratch_grp.add_argument('--remap-scratch', action='store_true', default=None, help='remap scratch buffer indices into free input/output indices') + remap_scratch_grp.add_argument('--no-remap-scratch', action='store_false', dest='remap_scratch', help='don\'t remap scratch buffer indices into free input/output indices') + cmd.add_argument('--no-merge-contiguous', action='store_true', help='don\'t merge sends/receives from/to contiguous memory') + cmd.add_argument('--no-pretty-print', action='store_true', help='don\'t pretty print the generated XML') + cmd.add_argument('--old-format', action='store_true', help='use the old format') + cmd.add_argument('--use-scratch', action='store_true', help='use the scratch buffer instead of extra space at the end of output buffer') + cmd.add_argument('--channel-policy', type=ChannelPolicy, choices=list(ChannelPolicy), default=ChannelPolicy.MatchTopology, help='channel allocation policy') + cmd.add_argument('--instances', type=int, default=1, help='number of interleaved instances of the algorithm to make') + + def handle(args, command): + if command != 'ncclize': + return False + + input_algorithms = read_algorithm(args) + validate_output_args(args) + + for algo in input_algorithms: + ncclized = ncclize(algo, + remap_scratch=args.remap_scratch, + channel_policy=args.channel_policy, + pretty_print=not args.no_pretty_print, + old_format=args.old_format, + use_scratch=args.use_scratch, + 
merge_contiguous=not args.no_merge_contiguous, + instances=args.instances, + logging=True) + + handled = output_handler(args, lambda: ncclized, name_sccl_object(algo.name, ending='sccl.xml')) + + return True + + return handle diff --git a/sccl/cli/solve.py b/sccl/cli/solve.py new file mode 100644 index 0000000..4f1c094 --- /dev/null +++ b/sccl/cli/solve.py @@ -0,0 +1,92 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sccl.strategies as strategies +from .known_topologies import KnownTopologies +from .known_collectives import KnownCollectives +from .common import * + +def make_solvers(cmd_parsers): + handler_funcs = [] + handler_funcs.append(make_handle_solve_instance) + handler_funcs.append(make_handle_solve_least_steps) + handler_funcs.append(make_handle_solve_pareto_optimal) + + return make_cmd_category(cmd_parsers, 'solve', 'solver', handler_funcs) + +def _make_handle_strategy(cmd_parsers, name, invoke, take_steps = True): + cmd = cmd_parsers.add_parser(name) + instance_handler = add_instance(cmd, take_steps=take_steps) + topologies = KnownTopologies(cmd) + collectives = KnownCollectives(cmd) + validate_output_args, output_handler = add_output_algorithm(cmd) + + def handle(args, command): + if command != name: + return False + + validate_output_args(args) + topology = topologies.create(args) + collective = collectives.create(args, topology.num_nodes()) + instance = instance_handler(args) + algo = invoke(args, topology, collective, instance) + output_handler(args, algo) + return True + + return cmd, handle + +def make_handle_solve_instance(cmd_parsers): + def invoke(args, topology, collective, instance): + return strategies.solve_instance(topology, collective, instance, logging=True) + + cmd, handle = _make_handle_strategy(cmd_parsers, 'instance', invoke) + return handle + +def make_handle_solve_least_steps(cmd_parsers): + def invoke(args, topology, collective, instance): + return strategies.solve_least_steps(topology, collective, args.initial_steps, instance, logging=True) + + cmd, handle = _make_handle_strategy(cmd_parsers, 'least-steps', invoke, take_steps=False) + cmd.add_argument('--initial-steps', type=int, default=1, metavar='N') + return handle + +def make_handle_solve_pareto_optimal(cmd_parsers): + name = 'pareto-optimal' + cmd = cmd_parsers.add_parser(name) + topologies = KnownTopologies(cmd) + collectives = KnownCollectives(cmd) + validate_output_args, output_handler = add_output_sccl_objects(cmd) + cmd.add_argument('--min-chunks', type=int, default=1, metavar='N') + cmd.add_argument('--max-chunks', type=int, default=None, metavar='N') + cmd.add_argument('--assume-rpc-bound', default=None, help='assume bandwidth optimality requires at least this many rounds per chunk', metavar='N/N') + cmd.add_argument('--no-monotonic-feasibility', action='store_true', help='turn off an unproven assumption about monotonic feasibility of instances') + cmd.add_argument('--save-eagerly', action='store_true', help='save algorithms as soon as they are found, without pruning non-Pareto optimal algorithms at the end') + instance_handler = add_instance(cmd, take_steps=False, take_rounds=False) + + def handle(args, command): + if command != name: + return False + + validate_output_args(args) + topology = topologies.create(args) + instance = instance_handler(args) + collective = collectives.create(args, topology.num_nodes()) + assume_rpc_bound = None + if args.assume_rpc_bound: + try: + assume_rpc_bound = parse_fraction(args.assume_rpc_bound) + except ValueError: + 
cmd.error('could not parse --assume-rpc-bound as a fraction') + algorithms = [] + for algorithm in strategies.solve_all_latency_bandwidth_tradeoffs(topology, collective, args.min_chunks, args.max_chunks, assume_rpc_bound, not args.no_monotonic_feasibility, base_instance=instance, logging=True): + algorithms.append(algorithm) + if args.save_eagerly: + output_handler(args, algorithm, algorithm.name) + if not args.save_eagerly: + efficient_algorithms = strategies.prune_pareto_optimal(algorithms) + print(f'Found {len(efficient_algorithms)} Pareto optimal algorithms. Pruned {len(algorithms) - len(efficient_algorithms)} non-optimal algorithms.') + for algorithm in efficient_algorithms: + output_handler(args, algorithm, algorithm.name) + return True + + return handle diff --git a/sccl/collectives.py b/sccl/collectives.py new file mode 100644 index 0000000..347ec23 --- /dev/null +++ b/sccl/collectives.py @@ -0,0 +1,154 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from abc import ABC, abstractmethod +from dataclasses import dataclass + +@dataclass +class Chunk: + precondition: set + postcondition: set + address: int + +class Collective: + def __init__(self, name, num_nodes, chunks, triggers = {}): + self.name = name + self.num_nodes = num_nodes + self.num_chunks = len(chunks) + self._chunks = chunks + self._triggers = triggers + + self.is_combining = False + addresses_seen = set() + for chunk in self._chunks: + if chunk.address in addresses_seen: + self.is_combining = True + addresses_seen.add(chunk.address) + self.num_addresses = len(addresses_seen) + + def ranks(self): + return range(self.num_nodes) + + def chunks(self): + return range(len(self._chunks)) + + def precondition(self, rank, chunk): + return rank in self._chunks[chunk].precondition + + def postcondition(self, rank, chunk): + return rank in self._chunks[chunk].postcondition + + def address(self, chunk): + return self._chunks[chunk].address + + def trigger(self, rank, chunk): + if (rank, chunk) in self._triggers: + return self._triggers[(rank, chunk)] + else: + return None + + def has_triggers(self): + return len(self._triggers) > 0 + + def chunk_up(self, div): + if div < 1: + raise ValueError('Divisor must be greater or equal to one (and one is a no-op).') + if div == 1: + return self + + def remap(addr, i): + return addr * div + i + + new_chunks = [] + for chunk in self._chunks: + for i in range(div): + new_chunks.append(Chunk(chunk.precondition, chunk.postcondition, remap(chunk.address, i))) + + name = f'{self.name},chunks={div}' + return Collective(name, self.num_nodes, new_chunks) + +def build_collective(name, num_nodes, num_chunks, precondition, postcondition, address = lambda c: c, trigger = lambda r, c: None): + chunks = [] + for chunk in range(num_chunks): + chunk_precondition = set(rank for rank in range(num_nodes) if precondition(rank, chunk)) + chunk_postcondition = set(rank for rank in range(num_nodes) if postcondition(rank, chunk)) + chunk_address = address(chunk) + chunks.append(Chunk(chunk_precondition, chunk_postcondition, chunk_address)) + triggers = {(rank, chunk): trigger(rank, chunk) for rank in range(num_nodes) for chunk in range(num_chunks) if trigger(rank, chunk) != None} + return Collective(name, num_nodes, chunks, triggers) + +# Common pre- and postconditions +def _scattered(num_nodes, chunks = 1): + def cond(rank, chunk): + return rank == (chunk // chunks) % num_nodes + return cond + +def _transpose(num_nodes): + def cond(rank, chunk): + return rank == chunk // num_nodes + 
return cond + +def _all(rank, chunk): + return True + +def _root(root): + def cond(rank, chunk): + return rank == root + return cond + +# Non-combining collectives + +def broadcast(num_nodes, root): + return build_collective(f'Broadcast(n={num_nodes},root={root})', num_nodes, 1, _root(root), _all) + +def scatter(num_nodes, root): + return build_collective(f'Scatter(n={num_nodes},root={root})', num_nodes, num_nodes, _root(root), _scattered(num_nodes)) + +def gather(num_nodes, root): + return build_collective(f'Gather(n={num_nodes},root={root})', num_nodes, num_nodes, _scattered(num_nodes), _root(root)) + +def allgather(num_nodes): + return build_collective(f'Allgather(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), _all) + +def alltoall(num_nodes): + return build_collective(f'Alltoall(n={num_nodes})', num_nodes, num_nodes * num_nodes, _scattered(num_nodes), _transpose(num_nodes)) + +# Combining collectives + +# Represents a single buffer to reduce +def _single_scattered(num_nodes): + def address(chunk): + return chunk // num_nodes + return address + +def reduce(num_nodes, root): + return build_collective(f'Reduce(n={num_nodes},root={root})', num_nodes, num_nodes, _scattered(num_nodes), _root(root), _single_scattered(num_nodes)) + +def allreduce(num_nodes): + return build_collective(f'Allreduce(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), _all, _single_scattered(num_nodes)) + +def reduce_scatter(num_nodes): + return build_collective(f'ReduceScatter(n={num_nodes})', num_nodes, num_nodes * num_nodes, _scattered(num_nodes), _transpose(num_nodes), _single_scattered(num_nodes)) + +def scan(num_nodes): + def postcondition(rank, chunk): + origin = chunk % num_nodes + return rank >= origin + return build_collective(f'Scan(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), postcondition, _single_scattered(num_nodes)) + +# Multi-root generalizations of MPI rooted collectives +# TODO: Add one for reduce. That needs a new addressing function. + +def _roots(roots): + def cond(rank, chunk): + return rank == roots[chunk % len(roots)] + return cond + +def multiroot_broadcast(num_nodes, roots): + return build_collective(f'MultirootBroadcast(n={num_nodes},roots=({",".join(str(i) for i in roots)}))', num_nodes, len(roots), _roots(roots), _all) + +def multiroot_scatter(num_nodes, roots): + return build_collective(f'MultirootScatter(n={num_nodes},roots=({",".join(str(i) for i in roots)}))', num_nodes, num_nodes * len(roots), _roots(roots), _scattered(num_nodes, len(roots))) + +def multiroot_gather(num_nodes, roots): + return build_collective(f'MultirootGather(n={num_nodes},roots=({",".join(str(i) for i in roots)}))', num_nodes, num_nodes * len(roots), _scattered(num_nodes, len(roots)), _roots(roots)) diff --git a/sccl/distributors/__init__.py b/sccl/distributors/__init__.py new file mode 100644 index 0000000..3b26f08 --- /dev/null +++ b/sccl/distributors/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from .greedy_alltoall import * +from .gather_scatter_alltoall import * +from .alltoall_subproblem import * diff --git a/sccl/distributors/alltoall_subproblem.py b/sccl/distributors/alltoall_subproblem.py new file mode 100644 index 0000000..f2a6400 --- /dev/null +++ b/sccl/distributors/alltoall_subproblem.py @@ -0,0 +1,223 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
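+
+# Encodes a distributed Alltoall as a subproblem over one copy of the local
+# topology plus a single virtual remote node. Chunk indices are laid out as
+# three consecutive ranges, [local pairs | remote-outgoing | remote-incoming],
+# and at the remote node each remote-outgoing chunk triggers the matching
+# remote-incoming chunk. A sketch of creating a subproblem for two DGX-1 boxes
+# with ranks 0 and 1 as relays:
+#
+#   collective, topology = make_alltoall_subproblem_collective_and_topology(
+#       dgx1(), 2, [0, 1])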
+
+from sccl.collectives import *
+from sccl.algorithm import *
+from sccl.instance import *
+from sccl.topologies import *
+
+def _alltoall_subproblem(local_nodes, num_copies):
+    remote_node = local_nodes
+
+    local_end = local_nodes * local_nodes
+    num_remote_pairs = (num_copies - 1) * local_nodes * local_nodes
+    remote_out_end = local_end + num_remote_pairs
+    num_chunks = remote_out_end + num_remote_pairs
+
+    def cases(chunk, local, remote_out, remote_in):
+        if chunk < local_end:
+            return local(chunk)
+        elif chunk < remote_out_end:
+            return remote_out(chunk - local_end)
+        else:
+            return remote_in(chunk - remote_out_end)
+
+    def pre(rank, chunk):
+        return cases(chunk,
+            lambda c: rank == c % local_nodes,
+            lambda c: rank == (c // (num_copies - 1)) % local_nodes,
+            lambda c: rank == remote_node)
+
+    def post(rank, chunk):
+        return cases(chunk,
+            lambda c: rank == c // local_nodes,
+            lambda c: rank == remote_node,
+            lambda c: rank == (c // (num_copies - 1)) // local_nodes)
+
+    def trigger(rank, chunk):
+        if rank == remote_node:
+            return cases(chunk,
+                lambda c: None,
+                lambda c: chunk + num_remote_pairs,
+                lambda c: chunk - num_remote_pairs)
+        else:
+            return None
+
+    return build_collective(f'AlltoallSubproblem(n={local_nodes},copies={num_copies})',
+        local_nodes + 1, num_chunks,
+        pre, post, trigger=trigger)
+
+def make_alltoall_subproblem_collective_and_topology(topology, num_copies, relay_nodes, bw = 1, share_bw = False):
+    local_nodes = topology.num_nodes()
+    remote_node = local_nodes
+
+    links = [[0 for _ in range(local_nodes + 1)] for _ in range(local_nodes + 1)]
+    for src in range(local_nodes):
+        for dst in range(local_nodes):
+            links[dst][src] = topology.link(src, dst)
+    for relay in relay_nodes:
+        links[remote_node][relay] = bw
+        links[relay][remote_node] = bw
+
+    switches = topology.switches.copy()
+    if share_bw:
+        switches.append((relay_nodes, [remote_node], bw, 'remote_out'))
+        switches.append(([remote_node], relay_nodes, bw, 'remote_in'))
+
+    collective = _alltoall_subproblem(local_nodes, num_copies)
+    topology = Topology(f'Subtopo(local={topology.name},relays=({",".join(str(i) for i in relay_nodes)}))', links, switches)
+    return collective, topology
+
+def synthesize_alltoall_subproblem(subproblem_algo, num_copies, logging=False):
+    if subproblem_algo.is_pipelined():
+        raise ValueError('Pipelining is not supported.')
+
+    local_topology = subproblem_algo.topology
+
+    chunks = subproblem_algo.instance.chunks
+    local_nodes = local_topology.num_nodes() - 1
+    remote_node = local_nodes
+    nodes = local_nodes * num_copies
+
+    collective = alltoall(nodes).chunk_up(chunks)
+
+    # Create a distributed topology where copies of relay nodes that connect to the remote node in the subproblem
+    # topology are connected to all the relay nodes in the other copies.
+    links = [[0 for _ in range(nodes)] for _ in range(nodes)]
+    for dst in range(nodes):
+        for src in range(nodes):
+            local_src = src % local_nodes
+            local_dst = dst % local_nodes
+            if src // local_nodes != dst // local_nodes:
+                bw = min(local_topology.link(local_src, remote_node), local_topology.link(remote_node, local_dst))
+                links[dst][src] = bw
+            else:
+                links[dst][src] = local_topology.link(local_src, local_dst)
+
+    # Also make copies of switches with a similar expansion of the remote node into the nodes of other copies.
+    switches = []
+    for srcs, dsts, bw, name in local_topology.switches:
+        for i in range(num_copies):
+            def to_dist(ranks):
+                for rank in ranks:
+                    if rank < remote_node:
+                        # Non-remote nodes are just translated to the distributed numbering of ranks.
+                        yield rank + i * local_nodes
+                    else:
+                        # Include all remote nodes in the switch. This is fine because the links already limit
+                        # connectivity to just the relay nodes.
+                        for r in range(nodes):
+                            if r // local_nodes != i:
+                                yield r
+
+            dist_srcs = list(to_dist(srcs))
+            dist_dsts = list(to_dist(dsts))
+            switches.append((dist_srcs, dist_dsts, bw, f'copy_{i}_{name}_local'))
+
+    topology = Topology(f'Stitched(sub={local_topology.name},copies={num_copies})', links, switches)
+
+    def nth_chunk_for_pair(src, dst, idx):
+        # The following chunk calculation respects both the _scattered and _transpose
+        # pre/postconditions in Alltoall. When substituting it in:
+        # -the precondition (chunk % self.num_nodes) simplifies to src
+        # -the postcondition ((chunk // self.num_nodes) % self.num_nodes) simplifies to dst
+        return (src + dst * collective.num_nodes) * chunks + idx
+
+    steps = []
+
+    # Calculate the ranges of the differently handled chunks
+    local_end = local_nodes * local_nodes
+    num_remote_pairs = (num_copies - 1) * local_nodes * local_nodes
+    remote_out_end = local_end + num_remote_pairs
+    num_chunks = remote_out_end + num_remote_pairs
+
+    for local_step in subproblem_algo.steps:
+        sends = []
+
+        # These are used to track operations involving remote nodes that get matched with another operation in the same
+        # step.
+        unmatched_sends = {}
+        unmatched_recvs = {}
+
+        # Stitch together copies of the subproblem algorithm
+        for chunk, src, dst in local_step.sends:
+            for i in range(num_copies):
+                def to_dist(rank):
+                    # Translates ranks from the local to the distributed topology
+                    return rank + i * local_nodes
+
+                def other_start(c):
+                    # Given a relative remote chunk return local rank 0 in the copy it corresponds to
+                    other_i = c % (num_copies - 1)
+                    if other_i >= i:
+                        other_i += 1
+                    return other_i * local_nodes
+
+                # Calculate origin and target ranks that match the Alltoall pre/postconditions
+                if chunk < local_end:
+                    assert src != remote_node and dst != remote_node
+
+                    origin = to_dist((chunk // chunks) % local_nodes)
+                    target = to_dist((chunk // chunks) // local_nodes)
+
+                    # Check that the origin and target calculation match the local collective
+                    assert subproblem_algo.collective.precondition(origin % local_nodes, chunk)
+                    assert subproblem_algo.collective.postcondition(target % local_nodes, chunk)
+                elif chunk < remote_out_end:
+                    c = chunk - local_end
+                    local_origin = ((c // chunks) // (num_copies - 1)) % local_nodes
+
+                    origin = to_dist(local_origin)
+                    target = other_start(c) + ((c // (num_copies - 1))) // local_nodes
+
+                    # Check that the origin and target calculation match the local collective
+                    assert subproblem_algo.collective.precondition(local_origin, chunk)
+                    assert subproblem_algo.collective.postcondition(target % local_nodes, chunk + num_remote_pairs)
+                else:
+                    assert chunk < num_chunks
+                    c = chunk - remote_out_end
+                    local_target = ((c // chunks) // (num_copies - 1)) // local_nodes
+
+                    target = to_dist(local_target)
+                    origin = other_start(c) + ((c // (num_copies - 1))) % local_nodes
+
+                    # Check that the origin and target calculation match the local collective
+                    assert subproblem_algo.collective.precondition(origin % local_nodes, chunk - num_remote_pairs)
+                    assert subproblem_algo.collective.postcondition(local_target, chunk)
+
+                # Get the chunk number in the distributed algorithm
+                chunk_idx = chunk % chunks
+                # Translate send src and dst to distributed space and add the send to the distributed algorithm
+                dist_chunk = nth_chunk_for_pair(origin, target, chunk_idx)
+
+                if dst == remote_node:
+                    assert chunk < remote_out_end
+                    # Sends to remote nodes have to find a matched receive
+                    if dist_chunk in unmatched_recvs:
+                        dist_dst = unmatched_recvs.pop(dist_chunk)
+                        sends.append((dist_chunk, to_dist(src), dist_dst))
+                    else:
+                        unmatched_sends[dist_chunk] = to_dist(src)
+                elif src == remote_node:
+                    assert chunk < num_chunks
+                    # Receives from remote nodes have to find a matched send
+                    if dist_chunk in unmatched_sends:
+                        dist_src = unmatched_sends.pop(dist_chunk)
+                        sends.append((dist_chunk, dist_src, to_dist(dst)))
+                    else:
+                        unmatched_recvs[dist_chunk] = to_dist(dst)
+                else:
+                    # Sends locally are just translated to the new distributed space of ranks
+                    sends.append((dist_chunk, to_dist(src), to_dist(dst)))
+
+        if len(unmatched_sends) > 0 or len(unmatched_recvs) > 0:
+            raise ValueError('Subproblem algorithm has unpaired sends/recvs.')
+
+        steps.append(Step(local_step.rounds, sends))
+
+    instance = Instance(
+        steps=len(steps),
+        extra_rounds=sum(step.rounds - 1 for step in steps),
+        chunks=chunks,
+    )
+    return Algorithm.make_implementation(collective, topology, instance, steps)
diff --git a/sccl/distributors/gather_scatter_alltoall.py b/sccl/distributors/gather_scatter_alltoall.py
new file mode 100644
index 0000000..6da6ec8
--- /dev/null
+++ b/sccl/distributors/gather_scatter_alltoall.py
@@ -0,0 +1,181 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
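+
+# Illustrative usage sketch (the topology constructor, solver and step counts
+# are assumptions): synthesize a local Gather and a local Scatter, then combine
+# them into a distributed Alltoall with the function below.
+#
+#   from sccl.topologies import dgx1
+#   from sccl.path_encoding import PathEncoding
+#   from sccl.collectives import gather, scatter
+#   from sccl.instance import Instance
+#
+#   topo = dgx1()
+#   gather_algo = PathEncoding(topo, gather(topo.num_nodes(), 0)).solve(Instance(steps=2))
+#   scatter_algo = PathEncoding(topo, scatter(topo.num_nodes(), 0)).solve(Instance(steps=2))
+#   algo = synthesize_gather_scatter_distributed_alltoall(2, gather_algo, scatter_algo)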
+
+from sccl.collectives import *
+from sccl.algorithm import *
+from sccl.instance import *
+from sccl.topologies import distributed_fully_connected
+
+def synthesize_gather_scatter_distributed_alltoall(num_copies, gather_algo, scatter_algo, logging=False):
+    if gather_algo.is_pipelined() or scatter_algo.is_pipelined():
+        raise ValueError('Pipelining is not supported.')
+
+    if gather_algo.instance.chunks != scatter_algo.instance.chunks:
+        raise ValueError(f'Local gather and local scatter must have the same chunks (got {gather_algo.instance.chunks} and {scatter_algo.instance.chunks})')
+
+    if gather_algo.topology.name != scatter_algo.topology.name:
+        # TODO: improve this check to check actual structure, not just name
+        raise ValueError(f'Local gather and local scatter must have the same topology (got {gather_algo.topology.name} and {scatter_algo.topology.name})')
+    local_topology = gather_algo.topology
+
+    chunks = gather_algo.instance.chunks
+    local_nodes = gather_algo.topology.num_nodes()
+    nodes = local_nodes * num_copies
+
+    # Figure out the roots of the (possibly multi-root) gather
+    gather_roots = []
+    for chunk in range(gather_algo.collective.num_chunks // local_nodes):
+        for rank in range(local_nodes):
+            if gather_algo.collective.postcondition(rank, chunk):
+                gather_roots.append(rank)
+                break
+        else:
+            raise ValueError(f'Root number {chunk} not found for the Gather algorithm.')
+
+    # Check that we got the roots right
+    if len(gather_roots) > 1:
+        local_gather = multiroot_gather(local_nodes, roots=gather_roots).chunk_up(chunks)
+        try:
+            gather_algo.check_implements(local_gather)
+        except:
+            raise ValueError(f'Given local Gather algorithm "{gather_algo.name}" does not implement MultirootGather for the roots {gather_roots}.')
+    elif len(gather_roots) == 1:
+        local_gather = gather(local_nodes, root=gather_roots[0]).chunk_up(chunks)
+        try:
+            gather_algo.check_implements(local_gather)
+        except:
+            raise ValueError(f'Given local Gather algorithm "{gather_algo.name}" does not implement Gather for the root {gather_roots[0]}.')
+    else:
+        raise ValueError(f'No roots found for the Gather algorithm.')
+
+    # Figure out the roots of the (possibly multi-root) scatter
+    scatter_roots = []
+    for chunk in range(scatter_algo.collective.num_chunks // local_nodes):
+        for rank in range(local_nodes):
+            if scatter_algo.collective.precondition(rank, chunk):
+                scatter_roots.append(rank)
+                break
+        else:
+            raise ValueError(f'Root number {chunk} not found for the Scatter algorithm.')
+
+    # Check that we got the roots right
+    if len(scatter_roots) > 1:
+        local_scatter = multiroot_scatter(local_nodes, roots=scatter_roots).chunk_up(chunks)
+        try:
+            scatter_algo.check_implements(local_scatter)
+        except:
+            raise ValueError(f'Given local Scatter algorithm "{scatter_algo.name}" does not implement MultirootScatter for the roots {scatter_roots}.')
+    elif len(scatter_roots) == 1:
+        local_scatter = scatter(local_nodes, root=scatter_roots[0]).chunk_up(chunks)
+        try:
+            scatter_algo.check_implements(local_scatter)
+        except:
+            raise ValueError(f'Given local Scatter algorithm "{scatter_algo.name}" does not implement Scatter for the root {scatter_roots[0]}.')
+    else:
+        raise ValueError(f'No roots found for the Scatter algorithm.')
+
+    if len(gather_roots) != len(scatter_roots):
+        raise ValueError(f'The number of roots for the Gather algorithm ({len(gather_roots)}) does not match the number of roots for the Scatter algorithm ({len(scatter_roots)}).')
+
+    # Multiply chunks to match the number of roots
+    if len(gather_roots) > 1:
+        chunks *= len(gather_roots)
+        print(f'Multiplying chunks by {len(gather_roots)} to match the number of roots.')
+
+    collective = alltoall(nodes)
+    topology = distributed_fully_connected(gather_algo.topology, num_copies, 1)
+
+    def nth_chunk_for_pair(src, dst, idx):
+        # The following chunk calculation respects both the _scattered and _transpose
+        # pre/postconditions in Alltoall. When substituting it in:
+        # -the precondition (chunk % self.num_nodes) simplifies to src
+        # -the postcondition ((chunk // self.num_nodes) % self.num_nodes) simplifies to dst
+        return (src + dst * nodes) * chunks + idx
+
+    steps = []
+
+    for local_step in gather_algo.steps:
+        sends = []
+
+        # Translate copies of the local Gather to the new space of ranks
+        for chunk, src, dst in local_step.sends:
+            for target_rank in range(nodes):
+                for i in range(num_copies):
+                    # Translates ranks from the local to the distributed topology
+                    def to_dist(rank):
+                        return rank + i * local_nodes
+
+                    # Calculate origin rank that matches the Gather precondition
+                    origin = (chunk // chunks) % local_nodes
+
+                    # Check that we got that calculation right
+                    assert local_gather.precondition(origin, chunk)
+
+                    # Get the chunk number in the distributed algorithm
+                    chunk_idx = chunk % chunks
+                    dist_chunk = nth_chunk_for_pair(to_dist(origin), target_rank, chunk_idx)
+
+                    # Translate send src and dst to distributed space and add the send to the distributed algorithm
+                    sends.append((dist_chunk, to_dist(src), to_dist(dst)))
+                    assert to_dist(src) != to_dist(dst)
+
+        steps.append(Step(local_step.rounds * nodes, sends))
+
+    # Perform transpose between local root nodes
+    transpose_sends = []
+    for src in range(nodes):
+        for dst in range(nodes):
+            # Sends are needed for the chunks going from src to dst if they are in different copies or if the
+            # gather and scatter roots are different.
+            for chunk_idx in range(chunks):
+                gather_root = gather_roots[chunk_idx % len(gather_roots)]
+                scatter_root = scatter_roots[chunk_idx % len(scatter_roots)]
+                if (src // local_nodes == dst // local_nodes and
+                        gather_root != scatter_root and
+                        local_topology.link(gather_root, scatter_root) == 0):
+                    raise ValueError(f'The local topology does not have a link from root {gather_root} of the Gather to root {scatter_root} of the Scatter.')
+                if gather_root != scatter_root or src // local_nodes != dst // local_nodes:
+                    chunk = nth_chunk_for_pair(src, dst, chunk_idx)
+                    # Calculate the local root ranks' global indices
+                    root_src = (src // local_nodes) * local_nodes + gather_root
+                    root_dst = (dst // local_nodes) * local_nodes + scatter_root
+                    transpose_sends.append((chunk, root_src, root_dst))
+                    assert root_src != root_dst
+    steps.append(Step(chunks * local_nodes * local_nodes, transpose_sends))
+
+    #TODO: integrate into above
+    if gather_root != scatter_root and local_topology.link(gather_root, scatter_root) == 0:
+        raise ValueError(f'Local topology does not have a link from the root of the Gather ({gather_root}) to that of the Scatter ({scatter_root}).')
+
+    for local_step in scatter_algo.steps:
+        sends = []
+
+        # Translate copies of the local Scatter to the new space of ranks
+        for chunk, src, dst in local_step.sends:
+            for source_rank in range(nodes):
+                for i in range(num_copies):
+                    # Translates ranks from the local to the distributed topology
+                    def to_dist(rank):
+                        return rank + i * local_nodes
+
+                    # Calculate target rank that matches the Scatter postcondition
+                    target = (chunk // chunks) % local_nodes
+
+                    # Check that we got that calculation right
+                    assert local_scatter.postcondition(target, chunk)
+
+                    # Get the chunk number in the distributed algorithm
+                    chunk_idx = chunk % chunks
+                    dist_chunk = nth_chunk_for_pair(source_rank, to_dist(target), chunk_idx)
+
+                    # Translate send src and dst to distributed space and add the send to the distributed algorithm
+                    sends.append((dist_chunk, to_dist(src), to_dist(dst)))
+
+        steps.append(Step(local_step.rounds * nodes, sends))
+
+    instance = Instance(
+        steps=len(steps),
+        extra_rounds=sum(step.rounds - 1 for step in steps),
+        chunks=chunks,
+    )
+    return Algorithm.make_implementation(collective, topology, instance, steps)
diff --git a/sccl/distributors/greedy_alltoall.py b/sccl/distributors/greedy_alltoall.py
new file mode 100644
index 0000000..50a9b47
--- /dev/null
+++ b/sccl/distributors/greedy_alltoall.py
@@ -0,0 +1,177 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
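+
+# Illustrative usage sketch (the topology constructors and parameters are
+# assumptions): given a local Alltoall algorithm, overlay the remote transfers
+# greedily on top of it.
+#
+#   from sccl.topologies import dgx1, distributed_fully_connected
+#   from sccl.path_encoding import PathEncoding
+#   from sccl.collectives import alltoall
+#   from sccl.instance import Instance
+#
+#   local_topo = dgx1()
+#   local_algo = PathEncoding(local_topo, alltoall(local_topo.num_nodes())).solve(Instance(steps=3))
+#   dist_topo = distributed_fully_connected(local_topo, 2, 1)
+#   algo = synthesize_greedy_distributed_alltoall(dist_topo, local_algo)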
+
+from sccl.collectives import alltoall
+from sccl.algorithm import *
+from sccl.instance import *
+
+from collections import defaultdict
+from dataclasses import dataclass
+from enum import Enum
+import math
+
+@dataclass
+class _BwLimit:
+    srcs: set
+    dsts: set
+    bw: int
+    util: int = 0
+
+def synthesize_greedy_distributed_alltoall(topology, local_algorithm, logging=False):
+    if local_algorithm.is_pipelined():
+        raise ValueError('Pipelining is not supported.')
+
+    chunks = local_algorithm.instance.chunks
+    local_nodes = local_algorithm.topology.num_nodes()
+    local_alltoall = alltoall(local_nodes).chunk_up(chunks)
+    try:
+        local_algorithm.check_implements(local_alltoall)
+    except:
+        raise ValueError(f'Given local Alltoall algorithm "{local_algorithm.name}" does not implement Alltoall in the local topology.')
+
+    if topology.num_nodes() % local_nodes != 0:
+        raise ValueError(f'Number of nodes in topology is not a multiple of ranks in local_algorithm.')
+    num_copies = topology.num_nodes() // local_nodes
+
+    def is_pair_remote(rank1, rank2):
+        # A pair of nodes is remote if they are in different copies of the local topology
+        return rank1 // local_nodes != rank2 // local_nodes
+
+    # Check that all switches are either purely local or remote.
+    # Also check that the remote part is fully connected.
+    # Also remember the remote constraints.
+    remote_constraints = []
+    for srcs, dsts, bw, _ in topology.bandwidth_constraints():
+        has_local_pairs = False
+        has_remote_pairs = False
+        for src in srcs:
+            for dst in dsts:
+                if is_pair_remote(src, dst):
+                    has_remote_pairs = True
+                else:
+                    has_local_pairs = True
+        if has_local_pairs and has_remote_pairs:
+            raise NotImplementedError('Support for switches with mixed local and remote connections is not implemented.')
+        if has_remote_pairs and bw == 0:
+            # This is required because it's what makes Alltoall routing trivial
+            raise ValueError('All remote pairs must have direct connectivity.')
+        if has_remote_pairs:
+            remote_constraints.append((srcs, dsts, bw))
+
+    collective = alltoall(topology.num_nodes())
+
+    def nth_chunk_for_pair(src, dst, idx):
+        # The following chunk calculation respects both the _scattered and _transpose
+        # pre/postconditions in Alltoall. When substituting it in:
+        # -the precondition (chunk % self.num_nodes) simplifies to src
+        # -the postcondition ((chunk // self.num_nodes) % self.num_nodes) simplifies to dst
+        return (src + dst * collective.num_nodes) * chunks + idx
+
+    if logging:
+        print('Generating sends for remote pairs')
+
+    # Generate all of the sends that need to happen for the remote part, grouped by pairs of src and dst
+    remote_sends = {}
+    for src in collective.ranks():
+        for dst in collective.ranks():
+            if is_pair_remote(src, dst):
+                sends = [(nth_chunk_for_pair(src, dst, i), src, dst)
+                         for i in reversed(range(chunks))]
+                remote_sends[(src,dst)] = sends
+
+    # This function pulls as many sends out of remote_sends as the topology's bw constraints allow
+    def pack_sends(rounds):
+        packed_sends = []
+        # Make a mutable copy of the bandwidth constraints
+        bw_limits = [_BwLimit(srcs, dsts, bw * rounds) for srcs, dsts, bw in remote_constraints]
+        empty_pairs = []
+        for pair in remote_sends:
+            src, dst = pair
+            sends = remote_sends[pair]
+            # Yield as many sends as allowed by the bw limits
+            max_sends = len(sends)
+            relevant_limits = []
+            for limit in bw_limits:
+                if src in limit.srcs and dst in limit.dsts:
+                    max_sends = min(max_sends, limit.bw - limit.util)
+                    relevant_limits.append(limit)
+            for i in range(max_sends):
+                packed_sends.append(sends.pop())
+            # Remove used bandwidth from limits
+            for limit in relevant_limits:
+                limit.util += max_sends
+            if len(sends) == 0:
+                empty_pairs.append(pair)
+        # Remove pairs that don't have sends remaining
+        for pair in empty_pairs:
+            del remote_sends[pair]
+        return packed_sends
+
+    steps = []
+
+    if logging:
+        print('Overlapping remote sends with local algorithm')
+
+    for step_idx, local_step in enumerate(local_algorithm.steps):
+        sends = []
+
+        # Translate copies of the local algorithm to the new space of ranks
+        for chunk, src, dst in local_step.sends:
+            for i in range(num_copies):
+                # Translates ranks from the local to the distributed topology
+                def to_dist(rank):
+                    return rank + i * local_nodes
+
+                # Calculate origin and target ranks that match the Alltoall pre/postconditions
+                origin = (chunk // chunks) % local_nodes
+                target = (chunk // chunks) // local_nodes
+
+                # Check that we got that calculation right
+                assert local_alltoall.precondition(origin, chunk)
+                assert local_alltoall.postcondition(target, chunk)
+
+                # Get the chunk number in the distributed algorithm
+                chunk_idx = chunk % chunks
+                dist_chunk = nth_chunk_for_pair(to_dist(origin), to_dist(target), chunk_idx)
+
+                # Translate send src and dst to distributed space and add the send to the distributed algorithm
+                sends.append((dist_chunk, to_dist(src), to_dist(dst)))
+
+        # Pack sends respecting the local step's rounds
+        packed_sends = pack_sends(local_step.rounds)
+        sends.extend(packed_sends)
+        if logging:
+            print(f'Packed {len(packed_sends)} remote sends into step {step_idx+1}')
+
+        steps.append(Step(local_step.rounds, sends))
+
+    # If any remote sends are left over once the local algorithm is done, put them all into the last step
+    remaining_sends = sum(len(sends) for sends in remote_sends.values())
+    if remaining_sends > 0:
+        last_step = steps[-1]
+
+        # Add sends and count their utilization against all constraints
+        bw_limits = [_BwLimit(srcs, dsts, bw) for srcs, dsts, bw in remote_constraints]
+        empty_pairs = []
+        for pair in remote_sends:
+            src, dst = pair
+            sends = remote_sends[pair]
+            # Add utilization against all relevant limits
+            for limit in bw_limits:
+                if src in limit.srcs and dst in limit.dsts:
+                    limit.util += len(sends)
+            # Add the sends to the last step
+            last_step.sends.extend(sends)
+
+        # Find the least rounds required and add additional rounds to the last step
+        additional_rounds = max(math.ceil(limit.util / limit.bw) for limit in bw_limits)
+        last_step.rounds += additional_rounds
+        if logging:
+            print(f'Packed remaining {remaining_sends} remote sends into step {len(steps)} by adding {additional_rounds} additional rounds')
+    else:
+        if logging:
+            print('All remote sends fit into the rounds of the local algorithm')
+        additional_rounds = 0
+
+    instance = local_algorithm.instance.set(extra_rounds=local_algorithm.instance.extra_rounds + additional_rounds)
+    return Algorithm.make_implementation(collective, topology, instance, steps)
diff --git a/sccl/instance.py b/sccl/instance.py
new file mode 100644
index 0000000..750d80e
--- /dev/null
+++ b/sccl/instance.py
@@ -0,0 +1,39 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
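+
+# An Instance captures the solver parameters for one synthesis attempt. For
+# example (illustrative), a 3-step algorithm with data split into 2 chunks and
+# one extra bandwidth round would be requested as:
+#
+#   Instance(steps=3, extra_rounds=1, chunks=2)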
+
+from dataclasses import dataclass
+
+@dataclass(frozen=True)
+class Instance:
+    steps: int
+    extra_rounds: int = 0
+    chunks: int = 1
+    pipeline: int = None
+    extra_memory: int = None
+    allow_exchange: bool = False
+
+    def rounds(self):
+        return self.steps + self.extra_rounds
+
+    def set(self, steps = None, extra_rounds = None, chunks = None, pipeline = None, extra_memory = None, allow_exchange = None):
+        return Instance(
+            steps if steps != None else self.steps,
+            extra_rounds if extra_rounds != None else self.extra_rounds,
+            chunks if chunks != None else self.chunks,
+            pipeline if pipeline != None else self.pipeline,
+            extra_memory if extra_memory != None else self.extra_memory,
+            allow_exchange if allow_exchange != None else self.allow_exchange)
+
+    def __str__(self):
+        s = f'steps={self.steps}'
+        if self.extra_rounds > 0:
+            s += f',rounds={self.steps + self.extra_rounds}'
+        if self.chunks > 1:
+            s += f',chunks={self.chunks}'
+        if self.pipeline != None:
+            s += f',pipeline={self.pipeline}'
+        if self.extra_memory != None:
+            s += f',extra_memory={self.extra_memory}'
+        if self.allow_exchange:
+            s += f',allow_exchange'
+        return s
diff --git a/sccl/isomorphisms.py b/sccl/isomorphisms.py
new file mode 100644
index 0000000..45c2792
--- /dev/null
+++ b/sccl/isomorphisms.py
@@ -0,0 +1,87 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
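+
+# Illustrative usage sketch (the topology constructor is an assumption): list
+# permutations of one topology onto another.
+#
+#   from sccl.topologies import hub_and_spoke
+#   perms = find_isomorphisms(hub_and_spoke(4), hub_and_spoke(4), limit=4)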
+
+from z3 import *
+from dataclasses import dataclass
+
+@dataclass
+class Permutation:
+    nodes: list
+
+    def __str__(self):
+        return f'Permutation(nodes={self.nodes})'
+
+def _pn(node):
+    return Int(f'perm_node_{node}')
+
+def _select_node_permutation(s, topology):
+    # Select a permutation of nodes
+    for node in topology.nodes():
+        s.add(_pn(node) >= 0)
+        s.add(_pn(node) < topology.num_nodes())
+        for prev in range(node):
+            s.add(_pn(node) != _pn(prev))
+
+def _links_constraint(topology, target_topology):
+    nodes = range(topology.num_nodes())
+
+    def links_isomorphic(perm_src, perm_dst, link):
+        # Return a condition on whether the permuted ranks are isomorphic from src to dst wrt. the given link
+        for src in nodes:
+            for dst in nodes:
+                if target_topology.link(src, dst) != link:
+                    yield Not(And(perm_src == src, perm_dst == dst))
+    # Require all pairs of nodes to be isomorphic to their permuted counterparts
+    conditions = []
+    for src in nodes:
+        for dst in nodes:
+            link = topology.link(src, dst)
+            conditions.extend(links_isomorphic(_pn(src), _pn(dst), link))
+    return And(conditions)
+
+def _decode_permutation(model, topology):
+    node_permutation = [model.eval(_pn(node)).as_long() for node in topology.nodes()]
+    return Permutation(node_permutation)
+
+def find_isomorphisms(topology, target_topology, limit=None, logging=False):
+    '''
+    Finds all isomorphisms from one topology to a target topology. Returns a list of permutations.
+    '''
+    if len(topology.switches) > 0:
+        raise ValueError('Topologies with switches are not supported.')
+
+    if limit != None and limit <= 0:
+        return []
+
+    if topology.num_nodes() != target_topology.num_nodes():
+        return []
+
+    if logging:
+        print(f'Encoding {topology.name} - {target_topology.name} isomorphisms to Z3')
+
+    s = Solver()
+
+    _select_node_permutation(s, topology)
+    s.add(_links_constraint(topology, target_topology))
+
+    if logging:
+        print(f'Solving isomorphisms incrementally...')
+
+    isomorphisms = []
+    while s.check() == sat:
+        isomorphism = _decode_permutation(s.model(), topology)
+        isomorphisms.append(isomorphism)
+
+        if logging:
+            print(isomorphism)
+
+        if limit != None and len(isomorphisms) >= limit:
+            break
+
+        # Block this permutation
+        assignment = [_pn(node) == perm for node, perm in enumerate(isomorphism.nodes)]
+        s.add(Not(And(assignment)))
+
+    if logging:
+        print(f'{len(isomorphisms)} isomorphisms found.')
+    return isomorphisms
diff --git a/sccl/ncclize.py b/sccl/ncclize.py
new file mode 100644
index 0000000..26896e2
--- /dev/null
+++ b/sccl/ncclize.py
@@ -0,0 +1,633 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
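+
+# Illustrative usage sketch: turn a synthesized algorithm into the XML accepted
+# by the NCCL SCCL backend (the algorithm `algo` is assumed to come from one of
+# the solvers in this package).
+#
+#   xml = ncclize(algo, instances=2)
+#   with open('algo.sccl.xml', 'w') as f:
+#       f.write(xml)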
+
+from lxml import etree as ET
+from collections import defaultdict
+from dataclasses import dataclass, field, replace
+import math
+import threading, queue
+from enum import Enum
+from z3 import *
+
+@dataclass
+class _Gpu:
+    copies: list
+    inputs: dict
+    outputs: dict
+    input_chunks: int
+    output_chunks: int
+    scratch: dict = field(default_factory=dict)
+    threadblocks: list = field(default_factory=list)
+
+    def scratch_size(self):
+        return max((idx for addr, idx in self.scratch.items()), default=-1) + 1
+
+@dataclass
+class _Threadblock:
+    channel: int
+    rbid: int = None
+    send: int = -1
+    recv: int = -1
+    steps: list = field(default_factory=list)
+    # The steps may expand into multiple operations here
+    ops: list = field(default_factory=list)
+
+@dataclass
+class _Copy:
+    input_offset: int
+    output_offset: int
+
+@dataclass
+class _Op:
+    gpu: int
+    peer: int
+    step: int
+    is_send: bool
+    op_type: str
+    src_buffer: str
+    src_offset: int
+    dst_buffer: str
+    dst_offset: int
+    cnt: int
+    depends: list
+    block_rbid: int = None
+    # idx is the NCCL XML step index, which may not be the same as the algorithm step index
+    idx: int = None
+    has_dependence: bool = False
+
+    def __eq__(self, other):
+        return self is other
+
+    def __hash__(self):
+        return id(self)
+
+def _analyze_liveness(gpus, algorithm):
+    # Initialize liveness intervals for buffers on each GPU
+    input_livenesses = {rank: [[(-1,-1)] for _ in range(gpu.input_chunks)] for rank, gpu in gpus.items()}
+    output_livenesses = {rank: [[(math.inf,math.inf)] for _ in range(gpu.output_chunks)] for rank, gpu in gpus.items()}
+    scratch_livenesses = {rank: [[(math.inf,-1)] for addr, idx in gpu.scratch.items()] for rank, gpu in gpus.items()}
+
+    # For copies reserve the index in the output buffer from the very beginning
+    for rank, gpu in gpus.items():
+        for copy in gpu.copies:
+            output_livenesses[rank][copy.output_offset] = [(-1,math.inf)]
+
+    def update_liveness(rank, addr, step_idx):
+        gpu = gpus[rank]
+        # Find the relevant buffer and livenesses for the address
+        if addr in gpu.inputs:
+            buffer = gpu.inputs
+            liveness = input_livenesses[rank]
+        elif addr in gpu.outputs:
+            buffer = gpu.outputs
+            liveness = output_livenesses[rank]
+        elif addr in gpu.scratch:
+            buffer = gpu.scratch
+            liveness = scratch_livenesses[rank]
+        else:
+            raise RuntimeError(f'Address {addr} not found in any buffer of rank {rank}.')
+
+        # Expand the interval to include the step
+        idx = buffer[addr]
+        start, end = liveness[idx][0]
+        liveness[idx][0] = (min(start, step_idx), max(end, step_idx))
+
+    # For each step of the algorithm, update liveness intervals for all buffers
+    for step_idx, step in enumerate(algorithm.steps):
+        for addr, src, dst in step.sends:
+            update_liveness(src, addr, step_idx)
+            update_liveness(dst, addr, step_idx)
+
+    return (input_livenesses, output_livenesses, scratch_livenesses)
+
+def _remap_scratch_into_input_output(liveness, gpus, logging):
+    '''
+    This function solves and applies a static mapping for scratch buffer indices to input/output buffers that minimizes
+    scratch buffer usage for each GPU. The solving is done per GPU using the Z3 SMT solver.
+    '''
+    input_livenesses, output_livenesses, scratch_livenesses = liveness
+
+    if logging:
+        print('Remapping scratch into input/output...')
+
+    def conflict(b1, b2):
+        # Check if any of the intervals in lists b1 and b2 overlap
+        return any(s1 <= e2 and s2 <= e1 for s1, e1 in b1 for s2, e2 in b2)
+
+    print('Optimizing scratch mapping on all GPUs: ', end='', flush=True)
+    # Handle each GPU separately
+    for rank, gpu in gpus.items():
+        ctx = Context()
+        s = Solver(ctx=ctx)
+
+        def remap(idx):
+            # Choose for each scratch index a new index in one of the buffers.
+            # The index space has the input buffer from 0 to input_chunks-1,
+            # the output buffer from input_chunks to input_chunks+output_chunks-1,
+            # and the scratch buffer for any indices past that.
+            return Int(f'{idx}_remap', ctx=ctx)
+
+        # This variable limits the maximum index, in effect the size of the scratch buffer
+        idx_end = Int('idx_end', ctx=ctx)
+
+        for scratch_idx, scratch_liveness in enumerate(scratch_livenesses[rank]):
+            # Block any input indices that conflict with the scratch index
+            for input_idx, liveness in enumerate(input_livenesses[rank]):
+                if conflict(scratch_liveness, liveness):
+                    s.add(remap(scratch_idx) != input_idx)
+            # Block any output indices that conflict with the scratch index
+            for output_idx, liveness in enumerate(output_livenesses[rank]):
+                if conflict(scratch_liveness, liveness):
+                    s.add(remap(scratch_idx) != output_idx + gpu.input_chunks)
+            # Block remapping conflicting scratch indices to the same input/output indices
+            for other_idx, liveness in enumerate(scratch_livenesses[rank]):
+                if other_idx != scratch_idx and conflict(liveness, scratch_liveness):
+                    s.add(remap(scratch_idx) != remap(other_idx))
+            # Require all indices to fit in the allowed buffer space
+            s.add(remap(scratch_idx) >= 0)
+            s.add(remap(scratch_idx) < idx_end)
+
+        no_memory = gpu.input_chunks + gpu.output_chunks
+
+        q = queue.Queue()
+        def optimize(q):
+            # Iterate the memory limit down to find a mapping that minimizes scratch usage
+            for memory in range(no_memory + gpu.scratch_size(), no_memory - 1, -1):
+                if s.check(idx_end == memory) == sat:
+                    # Remember the model for the best solution
+                    try:
+                        m = s.model()
+                        new_idxs = {addr: m[remap(old_idx)].as_long() for addr, old_idx in gpu.scratch.items()}
+                        q.put(new_idxs)
+                    except Z3Exception:
+                        # This can happen when the solver is interrupted
+                        return
+                else:
+                    return
+        t = threading.Thread(target=optimize, args=(q,))
+        t.start()
+        t.join(1)
+        ctx.interrupt()
+
+        new_idxs = None
+        while not q.empty():
+            new_idxs = q.get()
+
+        if new_idxs != None:
+            print('.', end='', flush=True)
+            # Apply the model to remap the scratch indices
+            new_scratch = {}
+            new_scratch_livenesses = [[] for addr, idx in gpu.scratch.items()]
+            for addr, old_idx in gpu.scratch.items():
+                new_idx = new_idxs[addr]
+                # Figure out which buffer the index is in
+                if new_idx < gpu.input_chunks:
+                    tgt_buffer = gpu.inputs
+                    tgt_idx = new_idx
+                    tgt_liveness = input_livenesses[rank][tgt_idx]
+                elif new_idx < gpu.input_chunks + gpu.output_chunks:
+                    tgt_buffer = gpu.outputs
+                    tgt_idx = new_idx - gpu.input_chunks
+                    tgt_liveness = output_livenesses[rank][tgt_idx]
+                else:
+                    tgt_buffer = new_scratch
+                    tgt_idx = new_idx - gpu.input_chunks - gpu.output_chunks
+                    tgt_liveness = new_scratch_livenesses[tgt_idx]
+
+                # Check that the remapping doesn't conflict with any existing mappings
+                liveness = scratch_livenesses[rank][old_idx]
+                assert not conflict(tgt_liveness, liveness)
+                tgt_liveness.extend(liveness)
+
+                # Remap the scratch index to the new index in the target buffer
+                tgt_buffer[addr] = tgt_idx
+            gpu.scratch = new_scratch
+        else:
+            print('x', end='', flush=True)
+    else:
+        print()
+
+    if logging:
+        max_scratch_overhead = max(gpu.scratch_size() / (gpu.input_chunks + gpu.output_chunks) for gpu in gpus.values())
+        print(f'Maximum scratch overhead is {max_scratch_overhead * 100:.0f}%')
+
+def _allocate_channels_max_concurrency(op_sets, logging):
+    # This function solves a coloring problem to assign ops to a minimal set of channels
+    ctx = Context()
+
+    def chan(idx):
+        return Int(f'chan_{idx}', ctx=ctx)
+    max_channels = Int('max_channels', ctx=ctx)
+
+    constraints = []
+
+    # Add basic constraints and find conflicting sets of operations
+    conflict_groups = defaultdict(set)
+    for idx, op_set in enumerate(op_sets):
+        for op in op_set:
+            # Two operations conflict if they use the same src-dst edge on the same step
+            conflict_groups[(op.gpu, op.is_send, op.peer, op.step)].add(idx)
+        constraints.append(chan(idx) >= 0)
+        constraints.append(chan(idx) < max_channels)
+
+    # Require channels within the conflict groups to be disjoint
+    for grp in conflict_groups.values():
+        constraints.append(Distinct([chan(idx) for idx in grp]))
+
+    opt = Optimize(ctx=ctx)
+    opt.add(constraints)
+    opt.minimize(max_channels)
+
+    t = threading.Thread(target=opt.check)
+    t.start()
+    t.join(1)
+    ctx.interrupt()
+    t.join()
+
+    try:
+        model = opt.model()
+    except Z3Exception:
+        # TODO: This alternate process does not guarantee that channels are contiguous
+        s = Solver(ctx=ctx)
+        s.add(constraints)
+        s.check()
+        model = s.model()
+
+    if logging:
+        print(f'Using up to {model[max_channels].as_long()} channels')
+
+    # Group the operations by which channels they use
+    ops_by_channel = defaultdict(list)
+    for idx, op_set in enumerate(op_sets):
+        ops = ops_by_channel[model[chan(idx)].as_long()]
+        ops.extend(op_set)
+
+    return ops_by_channel
+
+def _allocate_channels_match_topology(op_sets, topology, instances, logging):
+    if len(topology.switches) > 0 and logging:
+        print('Warning: Switches in the topology are ignored for the channel policy MatchTopology.')
+
+    ops_by_channel = defaultdict(list)
+    next_channel = defaultdict(lambda: 0)
+    for op_set in op_sets:
+        send = op_set[0]
+        assert send.op_type == 's'
+        src = send.gpu
+        dst = send.peer
+        ops_by_channel[next_channel[(src,dst)]].extend(op_set)
+        link = topology.link(src,dst) * instances
+        assert link > 0, 'Encountered send on non-existent link'
+        next_channel[(src,dst)] = (next_channel[(src,dst)] + 1) % link
+
+    return ops_by_channel
+
+class ChannelPolicy(Enum):
+    One = 'One'
+    MaxConcurrency = 'MaxConcurrency'
+    MatchTopology = 'MatchTopology'
+
+    def __str__(self):
+        return self.value
+
+def ncclize(algorithm, remap_scratch = None, channel_policy=ChannelPolicy.MatchTopology, pretty_print = True, old_format=False, use_scratch=False, merge_contiguous=True, instances=1, logging=False):
+    '''
+    Generate the XML format used by the NCCL SCCL backend.
+
+    Sends are split into send/recv operations and grouped by the rank executing them. Within each rank operations are
+    grouped under <tb> tags, which handle 1) a single peer, 2) a single type of operation, and 3) at most one
+    operation per step of the algorithm. Additional threadblocks are created as necessary to meet these
+    constraints.
+
+    Each send operation is mapped from the abstract addresses used by the synthesized algorithm to offsets into three
+    named buffers, "input", "output" and "scratch", based on whether the address appears in a particular rank's
+    precondition, postcondition or neither. For addresses that would be in both the input and output buffers
+    <copy> tags are created to mark an initial transfer to the output buffer and only the output buffer mapping is kept.
+    '''
+
+    if algorithm.is_pipelined():
+        raise ValueError('Pipelining is not supported.')
+
+    if remap_scratch is None:
+        if algorithm.instance.extra_memory != None:
+            remap_scratch = True
+            if logging:
+                print('Turning scratch remapping on to honor the memory limit set in the instance.')
+        else:
+            remap_scratch = False
+
+    # Create GPUs, their address to buffer mappings and possible copies
+    gpus = {}
+    for rank in algorithm.ranks():
+        outputs = {}
+        if rank in algorithm.output_map:
+            outputs.update({ addr: idx for idx, addr in enumerate(sorted(algorithm.output_map[rank])) })
+        inputs = {}
+        copies = []
+        if rank in algorithm.input_map:
+            for idx, addr in enumerate(sorted(algorithm.input_map[rank])):
+                if addr in outputs:
+                    copies.append(_Copy(idx, outputs[addr]))
+                else:
+                    inputs[addr] = idx
+        gpus[rank] = _Gpu(copies, inputs, outputs, len(inputs) + len(copies), len(outputs))
+
+    # Create scratch buffer mappings if necessary
+    def allocate_scratch(gpu, addr):
+        if not (addr in gpu.inputs or addr in gpu.outputs or addr in gpu.scratch):
+            offset = len(gpu.scratch)
+            gpu.scratch[addr] = offset
+    for step in algorithm.steps:
+        for addr, src, dst in step.sends:
+            allocate_scratch(gpus[src], addr)
+            allocate_scratch(gpus[dst], addr)
+
+    # Analyze liveness of indices in buffers and remap scratch into input/output as possible
+    if remap_scratch:
+        liveness = _analyze_liveness(gpus, algorithm)
+        _remap_scratch_into_input_output(liveness, gpus, logging)
+
+    # Sort scratch mappings in an attempt to make more of them contiguous (this is of course a heuristic).
+    for gpu in gpus.values():
+        gpu.scratch = { addr: idx for idx, addr in enumerate(sorted(gpu.scratch)) }
+
+    def get_buffer_and_offset(gpu, addr):
+        # Map an address to one of the named buffers
+        if addr in gpu.inputs:
+            return 'i', gpu.inputs[addr]
+        elif addr in gpu.outputs:
+            return 'o', gpu.outputs[addr]
+        elif addr in gpu.scratch:
+            return 's', gpu.scratch[addr]
+        else:
+            raise RuntimeError('Address is not mapped to a buffer')
+
+    def make_intervals(src, dst, addrs_set):
+        if len(addrs_set) == 0:
+            return
+
+        buffs_and_offs = []
+        for addr in addrs_set:
+            srcbuff, srcoff = get_buffer_and_offset(gpus[src], addr)
+            dstbuff, dstoff = get_buffer_and_offset(gpus[dst], addr)
+            buffs_and_offs.append((srcbuff, srcoff, dstbuff, dstoff))
+
+        if merge_contiguous:
+            # Sort sends by both buffers and offsets and merge sends into larger intervals when both the source and
+            # destination are contiguous.
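+            # For example, ('i', 3, 'o', 7) followed by ('i', 4, 'o', 8) merges
+            # into the single interval ('i', 3, 'o', 7, 2).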
+            buffs_and_offs.sort()
+            start = prev = buffs_and_offs[0]
+
+            def make_interval(a,b):
+                cnt = b[1] - a[1] + 1
+                assert cnt == b[3] - a[3] + 1, 'Source and destination count mismatch'
+                return (a[0], a[1], a[2], a[3], cnt)
+
+            for x in buffs_and_offs[1:]:
+                if x[0] == prev[0] and x[1] == prev[1] + 1 and x[2] == prev[2] and x[3] == prev[3] + 1:
+                    # Merge into previous interval if buffers match and the new offsets are at the end of the interval
+                    prev = x
+                else:
+                    # Yield the previous interval and start a new one
+                    yield make_interval(start, prev)
+                    start = prev = x
+            # Yield the last interval
+            yield make_interval(start, prev)
+        else:
+            # Just yield size 1 intervals if merging is disabled
+            for srcbuff, srcoff, dstbuff, dstoff in buffs_and_offs:
+                yield (srcbuff, srcoff, dstbuff, dstoff, 1)
+
+    # Turn all steps of the algorithm into operations
+    op_sets = []
+    # Track the latest op that wrote to each buffer index
+    writers = defaultdict(list)
+    # Track all the reads since the last write to each buffer index
+    readers = defaultdict(list)
+    for step_idx, step in enumerate(algorithm.steps):
+        new_writers = defaultdict(list)
+        new_readers = defaultdict(list)
+
+        # Group sent addresses by edge
+        grouped_sends = defaultdict(set)
+        for addr, src, dst in step.sends:
+            grouped_sends[(src,dst)].add(addr)
+
+        # Combine sends into intervals and create multiple instances if necessary
+        sends = []
+        for (src, dst), addrs in grouped_sends.items():
+            for src_buf, src_off, dst_buf, dst_off, cnt in make_intervals(src, dst, addrs):
+                for i in range(instances):
+                    new_src_off = src_off * instances + i
+                    new_dst_off = dst_off * instances + i
+                    send = (src, dst, src_buf, new_src_off, dst_buf, new_dst_off, cnt)
+                    sends.append(send)
+
+        # Perform dependency tracking and create _Op instances
+        for src, dst, src_buf, src_off, dst_buf, dst_off, cnt in sends:
+            read_keys = [(src,src_buf,src_off+i) for i in range(cnt)]
+            # A send must wait for the previous recv (if any) to finish
+            send_depends = list(set(d for k in read_keys for d in writers[k]))
+
+            write_keys = [(dst,dst_buf,dst_off+i) for i in range(cnt)]
+            # A receive must wait for both the previous recv and any previous sends to finish
+            recv_depends = list(set(d for deps in (readers, writers) for k in write_keys for d in deps[k]))
+
+            send_op = _Op(src, dst, step_idx, True, 's', src_buf, src_off, dst_buf, dst_off, cnt, send_depends)
+            recv_op = _Op(dst, src, step_idx, False, 'r', src_buf, src_off, dst_buf, dst_off, cnt, recv_depends)
+            # Record the send and receive as a set of operations that must happen on the same channel
+            op_sets.append([send_op, recv_op])
+
+            # Mark writers and readers to be added for the next step
+            for k in write_keys:
+                new_writers[k].append(recv_op)
+            for k in read_keys:
+                new_readers[k].append(send_op)
+        # Writes cut the dependency to both previous writes and reads
+        for key, deps in new_writers.items():
+            if key in new_readers:
+                gpu, buf, off = key
+                raise RuntimeError(f'Encountered receive and send on the same buffer index on step {step_idx + 1} (gpu={gpu}, buf={buf}, off={off})')
+            writers[key] = deps
+            readers[key] = []
+        # Reads get added to any previous reads
+        for key, deps in new_readers.items():
+            readers[key].extend(deps)
+
+    # Fixup everything to match the instanced sends when multiple instances are generated
+    if instances > 1:
+        for gpu in gpus.values():
+            # Create `instances` copies of the copies.
+            new_copies = []
+            for copy in gpu.copies:
+                for i in range(instances):
+                    new_copy = _Copy(copy.input_offset * instances + i, copy.output_offset * instances + i)
+                    new_copies.append(new_copy)
+            gpu.copies = new_copies
+
+            # Multiply the other metadata with instances
+            def expand_mappings(mappings):
+                return { addr * instances + i: idx * instances + i for addr, idx in mappings.items() for i in range(instances) }
+            gpu.inputs = expand_mappings(gpu.inputs)
+            gpu.outputs = expand_mappings(gpu.outputs)
+            gpu.input_chunks *= instances
+            gpu.output_chunks *= instances
+            gpu.scratch = expand_mappings(gpu.scratch)
+
+    # Allocate channels and group operations by channel
+    if channel_policy == ChannelPolicy.One:
+        ops_by_channel = {0: [op for op_set in op_sets for op in op_set]}
+    elif channel_policy == ChannelPolicy.MaxConcurrency:
+        ops_by_channel = _allocate_channels_max_concurrency(op_sets, logging)
+    elif channel_policy == ChannelPolicy.MatchTopology:
+        ops_by_channel = _allocate_channels_match_topology(op_sets, algorithm.topology, instances, logging)
+    else:
+        assert False, 'Unhandled channel policy'
+
+    # Group by which operations need to be in the same threadblock
+    tb_groups = defaultdict(list)
+    for chan, chan_ops in ops_by_channel.items():
+        for op in chan_ops:
+            tb_groups[(op.gpu, op.is_send, op.peer, chan)].append(op)
+
+    tbs_by_gpu_chan = defaultdict(lambda: defaultdict(list))
+    # For each group find or create a threadblock to add them to
+    for key, grp in tb_groups.items():
+        rank, is_send, peer, chan = key
+        tbs = tbs_by_gpu_chan[rank][chan]
+        for tb in tbs:
+            tb_peer = tb.send if is_send else tb.recv
+            # An existing threadblock can be reused if:
+            # - Either the relevant peer is not set yet or the peer is the same
+            # - No operations already in the threadblock execute in the same step
+            if tb_peer == -1 or tb_peer == peer:
+                if all(not any(op1.step == op2.step for op2 in grp) for op1 in tb.steps):
+                    break
+        else:
+            # No existing threadblock was suitable, so create a new one
+            tb = _Threadblock(chan)
+            tbs.append(tb)
+        # Ensure the peer is set correctly
+        if is_send:
+            assert tb.send == -1 or tb.send == peer
+            tb.send = peer
+        else:
+            assert tb.recv == -1 or tb.recv == peer
+            tb.recv = peer
+        tb.steps.extend(grp)
+
+    # Sort threadblocks in each GPU by peers and then the channel.
+    # This is important as in NCCL threadblocks using the same NVLink concurrently should be close together.
+    for rank, gpu in gpus.items():
+        gpu.threadblocks = sorted([tb for tbs in tbs_by_gpu_chan[rank].values() for tb in tbs],
+                                  key=lambda tb: (tb.send, tb.recv, tb.channel))
+        for i, tb in enumerate(gpu.threadblocks):
+            tb.rbid = i
+
+    # Do some additional postprocessing of operations:
+    # - Expand operations with extra dependencies with no-ops
+    # - Mark the index of each operation taking any extra no-ops into account
+    # - Record the threadblock rbids for each operation
+    all_ops = []
+    for rank, gpu in gpus.items():
+        for tb in gpu.threadblocks:
+            tb.steps.sort(key=lambda op: op.step)
+            for op in tb.steps:
+                # Expand extra dependencies into nop operations
+                if len(op.depends) > 1:
+                    extra_deps = op.depends[1:]
+                    op.depends = op.depends[:1]
+                    first_step = op.step
+                    for i, dep in enumerate(extra_deps):
+                        tb.ops.append(_Op(op.gpu, None, op.step, False, 'nop', None, None, None, None, 0, [dep]))
+                        tb.ops[-1].idx = len(tb.ops) - 1
+                tb.ops.append(op)
+                tb.ops[-1].idx = len(tb.ops) - 1
+            for op in tb.ops:
+                op.block_rbid = tb.rbid
+            all_ops.extend(tb.ops)
+
+    # Filter out dependencies within the same threadblock
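+    # (presumably safe because ops within a threadblock already execute in idx
+    # order, so an intra-block dependency is implicit)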
+    for op in all_ops:
+        op.depends = list(filter(lambda d: d.block_rbid != op.block_rbid, op.depends))
+
+    # Mark all ops that have a dependence on them
+    for op in all_ops:
+        for dep in op.depends:
+            dep.has_dependence = True
+
+    # Generate the XML structure
+    algo_elem = ET.Element('algo')
+    algo_elem.set('name', algorithm.name)
+    algo_elem.set('nchannels', str(1 + max(max(tb.channel for tb in gpu.threadblocks) for gpu in gpus.values())))
+    if old_format:
+        algo_elem.set('nchunksperloop', str(max(max(gpu.input_chunks, gpu.output_chunks) for gpu in gpus.values())))
+    for rank, gpu in gpus.items():
+        gpu_elem = ET.SubElement(algo_elem, 'gpu')
+        gpu_elem.set('id', str(rank))
+        gpu_elem.set('i_chunks', str(gpu.input_chunks))
+        gpu_elem.set('o_chunks', str(gpu.output_chunks))
+        gpu_elem.set('s_chunks', str(gpu.scratch_size()))
+        for copy in gpu.copies:
+            copy_elem = ET.SubElement(gpu_elem, 'copy')
+            copy_elem.set('i_off', str(copy.input_offset))
+            copy_elem.set('o_off', str(copy.output_offset))
+        for tb in gpu.threadblocks:
+            tb_elem = ET.SubElement(gpu_elem, 'tb')
+            tb_elem.set('id', str(tb.rbid))
+            tb_elem.set('send', str(tb.send))
+            tb_elem.set('recv', str(tb.recv))
+            tb_elem.set('chan', str(tb.channel))
+            for op in tb.ops:
+                op_elem = ET.SubElement(tb_elem, 'op' if not old_format else 'step')
+                op_elem.set('step' if not old_format else 's', str(op.idx))
+                op_elem.set('type', op.op_type)
+
+                # The NCCL backend currently wants scratch at the end of output
+                if not use_scratch:
+                    if op.src_buffer == 's':
+                        op.src_buffer = 'o'
+                        op.src_offset += gpu.output_chunks
+                    if op.dst_buffer == 's':
+                        op.dst_buffer = 'o'
+                        op.dst_offset += gpu.output_chunks
+
+                if old_format:
+                    if op.src_buffer is not None:
+                        op_elem.set('srcbuf', op.src_buffer)
+                        op_elem.set('srcoff', str(op.src_offset))
+                    else:
+                        op_elem.set('srcbuf', 'i')
+                        op_elem.set('srcoff', '-1')
+                    if op.dst_buffer is not None:
+                        op_elem.set('dstbuf', op.dst_buffer)
+                        op_elem.set('dstoff', str(op.dst_offset))
+                    else:
+                        op_elem.set('dstbuf', 'o')
+                        op_elem.set('dstoff', '-1')
+                else:
+                    if op.is_send:
+                        if op.src_buffer is not None:
+                            op_elem.set('buf', op.src_buffer)
+                            op_elem.set('off', str(op.src_offset))
+                    else:
+                        if op.dst_buffer is not None:
+                            op_elem.set('buf', op.dst_buffer)
+                            op_elem.set('off', str(op.dst_offset))
+                if op.cnt > 1 or old_format:
+                    op_elem.set('cnt', str(op.cnt))
+                assert len(op.depends) <= 1
+                if len(op.depends) == 1:
+                    op_elem.set('depid', str(op.depends[0].block_rbid))
+                    op_elem.set('deps', str(op.depends[0].idx))
+                elif old_format:
+                    op_elem.set('depid', '-1')
+                    op_elem.set('deps', '-1')
+                if op.has_dependence:
+                    op_elem.set('hasdep', '1')
+                elif old_format:
+                    op_elem.set('hasdep', '0')
+
+    if pretty_print:
+        ET.indent(algo_elem, space=' ')
+    return ET.tostring(algo_elem, encoding='unicode')
diff --git a/sccl/ncd_reduction.py b/sccl/ncd_reduction.py
new file mode 100644
index 0000000..e079761
--- /dev/null
+++ b/sccl/ncd_reduction.py
@@ -0,0 +1,69 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
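+
+# Illustrative example: the non-combining dual of a rooted Reduce is
+# Broadcast-shaped, with the pre- and postconditions swapped, and it is solved
+# on the reversed topology.
+#
+#   from sccl.collectives import reduce
+#   dual = non_combining_dual(reduce(4, root=0))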
+
+from sccl.collectives import *
+from sccl.topologies import reverse_topology
+from sccl.algorithm import Algorithm, Step
+from collections import defaultdict
+
+class ReductionNotApplicableError(ValueError):
+    pass
+
+def non_combining_dual(primal):
+    if not primal.is_combining:
+        raise ReductionNotApplicableError('The collective is already non-combining.')
+
+    if primal.has_triggers():
+        raise ReductionNotApplicableError('The collective has triggers.')
+
+    dual_precondition = defaultdict(set)
+    dual_postcondition = defaultdict(set)
+
+    addresses = set()
+    for chunk in primal.chunks():
+        addr = primal.address(chunk)
+        addresses.add(addr)
+        for rank in primal.ranks():
+            if primal.postcondition(rank, chunk):
+                dual_precondition[addr].add(rank)
+            if primal.precondition(rank, chunk):
+                dual_postcondition[addr].add(rank)
+    for addr in dual_precondition:
+        if len(dual_precondition[addr]) > 1:
+            raise ReductionNotApplicableError('The non-combining reduction is only applicable to collectives with a unique root per address.')
+
+    return build_collective(f'Dual{primal.name}', primal.num_nodes, len(addresses),
+        lambda r, c: r in dual_precondition[c],
+        lambda r, c: r in dual_postcondition[c])
+
+def recover_primal_algorithm(dual_algorithm, primal, original_topology, instance):
+    primal_steps = []
+    for step in reversed(dual_algorithm.steps):
+        primal_sends = [(chunk, dst, src) for chunk, src, dst in step.sends]
+        primal_steps.append(Step(step.rounds, primal_sends))
+    return Algorithm.make_implementation(primal, original_topology, instance, primal_steps)
+
+def wrap_try_ncd_reduction(solver_cls):
+    class NonCombiningReductionWrapper(solver_cls):
+        def __init__(self, topology, collective):
+            self.primal = collective
+            try:
+                # Create the dual collective
+                self.dual = non_combining_dual(collective)
+                collective = self.dual
+
+                # Solve the dual in the reverse topology
+                self.original_topology = topology
+                topology = reverse_topology(topology)
+            except ReductionNotApplicableError:
+                self.dual = None
+            super().__init__(topology, collective)
+
+        def solve(self, instance):
+            algo = super().solve(instance)
+            if self.dual != None and algo != None:
+                return recover_primal_algorithm(algo, self.primal, self.original_topology, instance)
+            else:
+                return algo
+
+    return NonCombiningReductionWrapper
diff --git a/sccl/path_encoding.py b/sccl/path_encoding.py
new file mode 100644
index 0000000..44838b3
--- /dev/null
+++ b/sccl/path_encoding.py
@@ -0,0 +1,222 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
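+
+# Illustrative usage sketch (the topology constructor is an assumption): solve
+# a collective on a topology for a given instance.
+#
+#   from sccl.topologies import line
+#   from sccl.collectives import allgather
+#   from sccl.instance import Instance
+#
+#   algo = PathEncoding(line(3), allgather(3)).solve(Instance(steps=2))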
+
+from sccl.algorithm import *
+from sccl.ncd_reduction import wrap_try_ncd_reduction
+from z3 import *
+
+from collections import defaultdict
+
+def _start(chunk, rank):
+    return Int(f'start_{chunk}_at_{rank}')
+
+def _end(chunk, rank):
+    return Int(f'end_{chunk}_at_{rank}')
+
+def _rounds(step):
+    return Int(f'rounds_{step}')
+
+def _send(chunk, src, dst):
+    return Bool(f'send_{chunk}_from_{src}_to_{dst}')
+
+def _sent_in(chunk, src, dst, step):
+    # Constructs a Z3 term that is true iff a chunk is sent from src to dst in step
+    return And(_send(chunk, src, dst), _start(chunk, dst) == step + 1)
+
+def _idx(addr, rank):
+    return Int(f'idx_{addr}_at_{rank}')
+
+def _addr_start(addr, rank):
+    return Int(f'addr_start_{addr}_at_{rank}')
+
+def _addr_end(addr, rank):
+    return Int(f'addr_end_{addr}_at_{rank}')
+
+class PathEncodingBase(object):
+    def __init__(self, topology, collective):
+        self.topology = topology
+        self.collective = collective
+
+    def _encode(self, s, instance, collective):
+        # Calculate how many iterations of the algorithm overlap if pipelining is specified
+        if instance.pipeline != None:
+            # TODO: move this check into Instance
+            if instance.pipeline <= 0:
+                raise ValueError('instance.pipeline must be strictly positive.')
+            overlap = max(instance.steps - instance.pipeline, 0)
+        else:
+            overlap = 0
+
+        # Correctness
+        for chunk in collective.chunks():
+            for rank in collective.ranks():
+                if collective.precondition(rank, chunk):
+                    # Have chunks start on their starting ranks before the first step.
+                    # This is not required for the encoding, but makes debugging the models produced more intuitive.
+                    s.add(_start(chunk, rank) == 0)
+                else:
+                    # Any rank that gets a chunk (and doesn't start with it) must have a unique source for it
+                    sent_once = PbEq([(_send(chunk, src, rank), 1) for src in self.topology.sources(rank)], 1)
+                    s.add(Implies(_start(chunk, rank) <= instance.steps, sent_once))
+                    # If the postcondition requires the chunk on the rank then it must start being there before the end
+                    if collective.postcondition(rank, chunk):
+                        s.add(_start(chunk, rank) <= instance.steps)
+                for src in self.topology.sources(rank):
+                    # If a rank sends a chunk then it needs to have it before sending it
+                    s.add(Implies(_send(chunk, src, rank), _start(chunk, src) < _start(chunk, rank)))
+                    if instance.extra_memory != None:
+                        # Also, to send a chunk it must not have been deleted before sending it
+                        s.add(Implies(_send(chunk, src, rank), _end(chunk, src) >= _start(chunk, rank) - 1))
+                    # Handle chunks at the same address getting reduced in combining collectives
+                    if collective.is_combining:
+                        for other in collective.chunks():
+                            if other != chunk and collective.address(other) == collective.address(chunk):
+                                # If you send and another chunk at the same address is available (i.e. reduced) then you have to send that too at the same time
+                                s.add(Implies(And(_send(chunk, src, rank), _start(other, src) < _start(chunk, rank)),
+                                    And(_send(other, src, rank), (_start(other, rank) == _start(chunk, rank)))))
+
+                    # Handle the triggers used in subproblem based synthesizers
+                    if collective.trigger(rank, chunk) != None:
+                        # When receiving a chunk with a trigger, the triggering chunk must be sent at the same time
+                        trigger = collective.trigger(rank, chunk)
+                        s.add(Implies(_send(chunk, src, rank),
+                            And(_send(trigger, rank, src), _start(trigger, src) == _start(chunk, rank))))
+                    if collective.trigger(src, chunk) != None:
+                        # When sending a chunk with a trigger, the triggering chunk must be received at the same time
+                        trigger = collective.trigger(src, chunk)
+                        s.add(Implies(_send(chunk, src, rank),
+                            And(_send(trigger, rank, src), _start(trigger, src) == _start(chunk, rank))))
+
+        # Rounds
+        # Each step must use at least one round of bandwidth
+        s.add(*[_rounds(step) >= 1 for step in range(instance.steps)])
+        # Total number of rounds used by all steps must not exceed the limits given
+        s.add(sum([_rounds(step) for step in range(instance.steps)]) <= instance.rounds())
+        # Overlapping steps in pipelined algorithms must use the same number of rounds
+        for step in range(instance.steps - overlap):
+            for overlapping_step in range(step, instance.steps, instance.steps - overlap):
+                if overlapping_step != step:
+                    s.add(_rounds(step) == _rounds(overlapping_step))
+
+        # Bandwidth
+        # Each bandwidth group (e.g. a link or a switch) generates a separate set of constraints
+        for srcs, dsts, bw, _ in self.topology.bandwidth_constraints():
+            # overlap is subtracted here because overlapping steps are considered together
+            for step in range(instance.steps - overlap):
+                pb_sends = []
+                for src in srcs:
+                    for dst in dsts:
+                        # Generate terms for all sends on this step and group them by address and edge
+                        sends_by_addr = defaultdict(list)
+                        for chunk in collective.chunks():
+                            # Consider all pipelined steps that overlap with this step
+                            for overlapping_step in range(step, instance.steps, instance.steps - overlap):
+                                sends_by_addr[(collective.address(chunk))].append(_sent_in(chunk, src, dst, overlapping_step))
+                        # Count sends happening on an address only once and give each of these weight 1
+                        pb_sends.extend([(Or(sends),1) for sends in sends_by_addr.values()])
+                # For each number of rounds this step could have impose a pseudo-boolean
+                # constraint limiting sends on this step to the available bandwidth
+                for i in range(1, instance.extra_rounds + 2):
+                    s.add(Implies(_rounds(step) == i, PbLe(pb_sends, bw * i)))
+
+        # Memory
+        if instance.extra_memory != None:
+            # Choose the last step a chunk is present on a rank
+            for chunk in collective.chunks():
+                for rank in collective.ranks():
+                    if collective.postcondition(rank, chunk):
+                        # In the postcondition the chunk can not stop being on the rank before the end of the algorithm
+                        s.add(_end(chunk, rank) > instance.steps)
+                    else:
+                        # On other ranks the chunk can stop being on the rank any time after its start
+                        s.add(_end(chunk, rank) >= _start(chunk, rank))
+
+            for rank in collective.ranks():
+                # Figure out all addresses plus which ones will be in the input and output buffers
+                addresses = set()
+                input_addresses = set()
+                output_addresses = set()
+                for chunk in collective.chunks():
+                    addr = collective.address(chunk)
+                    addresses.add(addr)
+                    if collective.precondition(rank, chunk):
+                        input_addresses.add(addr)
+                    if collective.postcondition(rank, chunk):
diff --git a/sccl/serialization.py b/sccl/serialization.py
new file mode 100644
index 0000000..b44d137
--- /dev/null
+++ b/sccl/serialization.py
@@ -0,0 +1,107 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from sccl.algorithm import Algorithm, Step
+from sccl.topologies import Topology
+from sccl.instance import Instance
+from sccl.collectives import Collective, Chunk
+
+import json
+import warnings
+
+def _sccl_object_hook(o):
+    if 'sccl_type' not in o:
+        return o
+    if o['sccl_type'] == 'algorithm':
+        input_map = { int(k): set(v) for k, v in o['input_map'].items() }
+        output_map = { int(k): set(v) for k, v in o['output_map'].items() }
+        return Algorithm(o['name'], o['collective'], o['topology'], o['instance'], o['steps'], input_map, output_map)
+    if o['sccl_type'] == 'step':
+        sends = [(addr, src, dst) for addr, src, dst in o['sends']]
+        return Step(o['rounds'], sends)
+    if o['sccl_type'] == 'collective':
+        triggers = { (int(r), int(c)): v for r, rmap in o['triggers'].items() for c, v in rmap.items() }
+        return Collective(o['name'], o['nodes'], o['chunks'], triggers)
+    if o['sccl_type'] == 'chunk':
+        pre = set(o['pre'])
+        post = set(o['post'])
+        return Chunk(pre, post, o['addr'])
+    if o['sccl_type'] == 'topology':
+        return Topology(o['name'], o['links'], o['switches'])
+    if o['sccl_type'] == 'instance':
+        return Instance(o['steps'], o['extra_rounds'], o['chunks'], o['pipeline'], o['extra_memory'], o['allow_exchange'])
+    warnings.warn('Unhandled sccl_type in JSON')
+
+def SCCLDecoder():
+    return json.JSONDecoder(object_hook=_sccl_object_hook)
+
+class SCCLEncoder(json.JSONEncoder):
+    def __init__(self):
+        super().__init__()
+
+    def default(self, o):
+        if isinstance(o, Algorithm):
+            input_map = { k: list(v) for k, v in o.input_map.items() }
+            output_map = { k: list(v) for k, v in o.output_map.items() }
+            return {
+                'sccl_type': 'algorithm',
+                'name': o.name,
+                'instance': o.instance,
+                'input_map': input_map,
+                'output_map': output_map,
+                'steps': o.steps,
+                'collective': o.collective,
+                'topology': o.topology,
+            }
+        if isinstance(o, Step):
+            return {
+                'sccl_type': 'step',
+                'rounds': o.rounds,
+                'sends': o.sends,
+            }
+        if isinstance(o, Collective):
+            triggers = {}
+            for (r, c), v in o._triggers.items():
+                if r not in triggers:
+                    triggers[r] = {}
+                triggers[r][c] = v
+            return {
+                'sccl_type': 'collective',
+                'name': o.name,
+                'nodes': o.num_nodes,
+                'chunks': o._chunks,
+                'triggers': triggers,
+            }
+        if isinstance(o, Chunk):
+            return {
+                'sccl_type': 'chunk',
+                'pre': list(o.precondition),
+                'post': list(o.postcondition),
+                'addr': o.address,
+            }
+        if isinstance(o, Topology):
+            return {
+                'sccl_type': 'topology',
+                'name': o.name,
+                'switches': o.switches,
+                'links': o.links,
+            }
+        if isinstance(o, Instance):
+            return {
+                'sccl_type': 'instance',
+                'steps': o.steps,
+                'extra_rounds': o.extra_rounds,
+                'chunks': o.chunks,
+                'pipeline': o.pipeline,
+                'extra_memory': o.extra_memory,
+                'allow_exchange': o.allow_exchange,
+            }
+        return json.JSONEncoder.default(self, o)
+
+def save_sccl_object(obj, filename):
+    with open(filename, 'w') as f:
+        f.write(SCCLEncoder().encode(obj))
+
+def load_sccl_object(filename):
+    with open(filename) as f:
+        return SCCLDecoder().decode(f.read())
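
A minimal round-trip sketch of this serialization, using the `build_collective` helper shown in the README later in this series:
```
from sccl.collectives import build_collective
from sccl.serialization import save_sccl_object, load_sccl_object

coll = build_collective('Send', 8, 1, lambda r, c: r == 2, lambda r, c: r == 7)
save_sccl_object(coll, 'send.sccl.json')
assert load_sccl_object('send.sccl.json').name == 'Send'
```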
diff --git a/sccl/steps_bound.py b/sccl/steps_bound.py
new file mode 100644
index 0000000..775c3e4
--- /dev/null
+++ b/sccl/steps_bound.py
@@ -0,0 +1,44 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import math
+
+def _distances(topology):
+    # Floyd–Warshall algorithm for all-pairs shortest paths
+    nodes = range(topology.num_nodes())
+    dist = [[math.inf for _ in nodes] for _ in nodes]
+    for dst in nodes:
+        for src in topology.sources(dst):
+            dist[src][dst] = 1
+    for node in nodes:
+        dist[node][node] = 0
+    for k in nodes:
+        for i in nodes:
+            for j in nodes:
+                if dist[i][j] > dist[i][k] + dist[k][j]:
+                    dist[i][j] = dist[i][k] + dist[k][j]
+    return dist
+
+def lower_bound_steps(topology, collective):
+    ''' Finds a lower bound for the steps required as the maximum over all chunks and destinations of the least distance from any rank in the chunk's precondition. '''
+
+    dist = _distances(topology)
+
+    # Find the maximum of the least steps required for each chunk
+    least_steps = 0
+    for chunk in collective.chunks():
+        for dst in collective.ranks():
+            if collective.postcondition(dst, chunk):
+                # Find the shortest distance from some rank in the precondition
+                least_distance = math.inf
+                for src in collective.ranks():
+                    if collective.precondition(src, chunk):
+                        least_distance = min(least_distance, dist[src][dst])
+                # Update the least steps required if the distance from any rank in the precondition is larger
+                least_steps = max(least_steps, least_distance)
+
+    if least_steps == math.inf:
+        # Return None if the collective is unimplementable with any number of steps
+        return None
+    else:
+        return least_steps
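
A minimal sketch of the bound on a 3-node line topology, where Alltoall chunks have to travel up to two hops:
```
from sccl.steps_bound import lower_bound_steps
from sccl.topologies import line
from sccl.collectives import alltoall

assert lower_bound_steps(line(3), alltoall(3)) == 2
```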
diff --git a/sccl/strategies.py b/sccl/strategies.py
new file mode 100644
index 0000000..aa9dbc8
--- /dev/null
+++ b/sccl/strategies.py
@@ -0,0 +1,159 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from sccl.instance import Instance
+from sccl.path_encoding import PathEncoding
+from sccl.rounds_bound import lower_bound_rounds
+from sccl.steps_bound import lower_bound_steps
+
+import time
+import math
+from fractions import Fraction
+import itertools
+from collections import defaultdict
+
+def _solve_and_log(encoding, instance, logging):
+    if logging:
+        print(f'Solving instance {instance}... ', end='', flush=True)
+
+    start_time = time.time()
+    result = encoding.solve(instance)
+    duration = time.time() - start_time
+
+    if logging:
+        if result != None:
+            print(f'synthesized! ({duration:.1f}s)')
+        else:
+            print(f'unsatisfiable. ({duration:.1f}s)')
+
+    return result
+
+def solve_instance(topology, collective, instance, logging = False):
+    encoding = PathEncoding(topology, collective)
+    return _solve_and_log(encoding, instance, logging)
+
+def solve_least_steps(topology, collective, initial_steps = 1, base_instance = Instance(None), logging = False):
+    if initial_steps < 1:
+        raise ValueError('initial_steps must be strictly positive')
+
+    encoding = PathEncoding(topology, collective)
+
+    # Lower bound the number of steps required
+    steps_lb = lower_bound_steps(topology, collective)
+    if steps_lb == None:
+        # No number of steps can help, so fail regardless of the logging setting
+        raise ValueError('The collective is unimplementable in this topology.')
+    if logging:
+        print(f'Algorithms need at least {steps_lb} steps.')
+
+    num_steps = max(initial_steps, steps_lb)
+    if num_steps > steps_lb:
+        result = _solve_and_log(encoding, base_instance.set(steps=num_steps), logging)
+        if result != None:
+            if logging:
+                print('Synthesized on initial guess.
Checking for fewer steps.') + while num_steps > steps_lb: + num_steps -= 1 + maybe_better = _solve_and_log(encoding, base_instance.set(steps=num_steps), logging) + if maybe_better != None: + result = maybe_better + else: + break + return result + else: + num_steps += 1 + + while True: + result = _solve_and_log(encoding, base_instance.set(steps=num_steps), logging) + if result != None: + return result + else: + num_steps += 1 + +def solve_all_latency_bandwidth_tradeoffs(topology, collective, min_chunks = 1, max_chunks = None, assume_rounds_per_chunk_lb = None, assume_monotonic_feasibility = False, base_instance = Instance(None), logging = False): + if min_chunks < 1: + raise ValueError('min_chunks must be strictly positive.') + if max_chunks != None and max_chunks < min_chunks: + raise ValueError('max_chunks must be greater or equal to min_chunks.') + if assume_rounds_per_chunk_lb != None and assume_rounds_per_chunk_lb < 0: + raise ValueError('assume_rounds_per_chunk_lb must be positive.') + + # Lower bound the number of steps required + steps_lb = lower_bound_steps(topology, collective) + if logging: + print(f'Algorithms need at least {steps_lb} steps.') + + # Lower bound the number of rounds per unit of chunkiness required + if assume_rounds_per_chunk_lb != None: + rounds_per_chunk_lb = assume_rounds_per_chunk_lb + if logging: + print(f'Assuming algorithms need at least {rounds_per_chunk_lb} rounds per chunk.') + else: + rounds_per_chunk_lb = lower_bound_rounds(topology, collective) + if logging: + print(f'Algorithms need at least {rounds_per_chunk_lb} rounds per chunk.') + + # Remember for which rounds per chunk fraction a given number of steps will be unsat + step_rpc_lb = defaultdict(lambda: Fraction(0)) + + chunks_iter = range(min_chunks, max_chunks+1) if max_chunks != None else itertools.count(min_chunks) + + algorithms = [] + for chunks in chunks_iter: + encoding = PathEncoding(topology, collective) + rounds_lb = math.ceil(rounds_per_chunk_lb * chunks) + + rounds = rounds_lb - 1 + found = False + while not found: + rounds += 1 + rpc = Fraction(rounds, chunks) + # Skip this fraction if a lower number of chunks will have already considered it + if math.gcd(chunks, rounds) != 1: + continue + for steps in range(steps_lb, rounds+1): + # Skip this number of steps if a previous instance with stricter rounds per chunk already failed + if assume_monotonic_feasibility and rpc < step_rpc_lb[steps]: + continue + instance = base_instance.set(steps=steps, extra_rounds=rounds - steps, chunks=chunks) + result = _solve_and_log(encoding, instance, logging=logging) + if result != None: + assert rpc >= step_rpc_lb[steps], 'Monotonic feasibility assumption would have been violated.' + found = True + yield result + break + else: + # Update the rounds per chunk for which this number of steps is not sufficient + step_rpc_lb[steps] = max(step_rpc_lb[steps], rpc) + if logging and assume_monotonic_feasibility: + print(f'Assuming {steps} step algorithms need at least {rpc} rounds per chunk.') + # Check if a bandwidth optimal algorithm has been found + if found and rpc <= rounds_per_chunk_lb: + assert rpc == rounds_per_chunk_lb, 'Rounds per chunk lower bound did not hold.' 
+ if logging: + print(f'Bandwidth optimal algorithm found!') + break + else: + if logging: + print(f'Reached the limit for chunks.') + +def _steps(algo): + return len(algo.steps) + +def _rpc(algo): + return Fraction(_steps(algo) + algo.extra_rounds(), algo.instance.chunks) + +def prune_pareto_optimal(algorithms): + efficient_algorithms = [] + for i, algo in enumerate(algorithms): + is_efficient = True + for j, other in enumerate(algorithms): + either_worse = _steps(algo) > _steps(other) or _rpc(algo) > _rpc(other) + neither_better = _steps(algo) >= _steps(other) and _rpc(algo) >= _rpc(other) + if either_worse and neither_better: + is_efficient = False + break + if is_efficient: + efficient_algorithms.append(algo) + + return efficient_algorithms diff --git a/sccl/topologies/__init__.py b/sccl/topologies/__init__.py new file mode 100644 index 0000000..63aaee0 --- /dev/null +++ b/sccl/topologies/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from .generic import * +from .transformers import * +from .amd import * +from .nvidia import * +from .distributed import * diff --git a/sccl/topologies/amd.py b/sccl/topologies/amd.py new file mode 100644 index 0000000..6161cfd --- /dev/null +++ b/sccl/topologies/amd.py @@ -0,0 +1,26 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from .topology import Topology + +def amd4(): + links = [ + [0, 1, 1, 0], + [1, 0, 0, 1], + [1, 0, 0, 1], + [0, 1, 1, 0] + ] + return Topology('AMD4', links) + +def amd8(): + links = [ + [0, 5, 6, 6, 5, 6, 5, 5], + [5, 0, 5, 5, 6, 5, 6, 6], + [6, 5, 0, 6, 5, 6, 5, 5], + [6, 5, 6, 0, 5, 6, 5, 5], + [5, 6, 5, 5, 0, 5, 6, 6], + [6, 5, 6, 6, 5, 0, 5, 5], + [5, 6, 5, 5, 6, 5, 0, 6], + [5, 6, 5, 5, 6, 5, 6, 0] + ] + return Topology('AMD8', links) diff --git a/sccl/topologies/distributed.py b/sccl/topologies/distributed.py new file mode 100644 index 0000000..70f5330 --- /dev/null +++ b/sccl/topologies/distributed.py @@ -0,0 +1,41 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+
+from .topology import Topology
+
+def _copy_links(remote_bw, num_local, num_dist, local_links):  # replicate local links per copy; pairs in different copies get remote_bw
+    return [[remote_bw if src // num_local != dst // num_local else local_links[dst % num_local][src % num_local]
+        for src in range(num_dist)] for dst in range(num_dist)]
+
+def _copy_switches(num_local, num_copies, local_switches):  # replicate local switch constraints into each copy, offsetting the ranks
+    switches = []
+    for srcs, dsts, bw, name in local_switches:
+        for i in range(num_copies):
+            dist_srcs = [src + i * num_local for src in srcs]
+            dist_dsts = [dst + i * num_local for dst in dsts]
+            switches.append((dist_srcs, dist_dsts, bw, f'copy_{i}_{name}_local'))
+    return switches
+
+def distributed_fully_connected(local_topology, num_copies, remote_bw):
+    num_local = local_topology.num_nodes()
+    num_dist = num_local * num_copies
+
+    links = _copy_links(remote_bw, num_local, num_dist, local_topology.links)
+    switches = _copy_switches(num_local, num_copies, local_topology.switches)
+
+    return Topology(f'DistributedFullyConnected(local={local_topology.name},copies={num_copies},bw={remote_bw})', links, switches)
+
+def distributed_hub_and_spoke(local_topology, num_copies, remote_bw):
+    num_local = local_topology.num_nodes()
+    num_dist = num_local * num_copies
+
+    links = _copy_links(remote_bw, num_local, num_dist, local_topology.links)
+    switches = _copy_switches(num_local, num_copies, local_topology.switches)
+
+    for i in range(num_copies):  # limit each copy's aggregate remote bandwidth with a pair of switch constraints
+        local_ranks = [j + i * num_local for j in range(num_local)]
+        remote_ranks = [k for k in range(num_dist) if k // num_local != i]
+        switches.append((local_ranks, remote_ranks, remote_bw, f'copy_{i}_out_remote'))
+        switches.append((remote_ranks, local_ranks, remote_bw, f'copy_{i}_in_remote'))
+
+    return Topology(f'DistributedHubAndSpoke(local={local_topology.name},copies={num_copies},bw={remote_bw})', links, switches)
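
A minimal sketch of constructing these distributed topologies, mirroring tests/test_topologies.py later in this series:
```
from sccl.topologies import ring, star, distributed_fully_connected, distributed_hub_and_spoke

dist1 = distributed_fully_connected(ring(4), num_copies=2, remote_bw=1)
assert dist1.num_nodes() == 8
dist2 = distributed_hub_and_spoke(star(6), num_copies=4, remote_bw=2)
assert dist2.num_nodes() == 24
```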
diff --git a/sccl/topologies/generic.py b/sccl/topologies/generic.py
new file mode 100644
index 0000000..8f2e50d
--- /dev/null
+++ b/sccl/topologies/generic.py
@@ -0,0 +1,52 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from .topology import Topology
+
+def hub_and_spoke(num_nodes):
+    links = [[0 if x==y else 1 for y in range(num_nodes)] for x in range(num_nodes)]
+    switches = []
+    for node in range(num_nodes):
+        others = [other for other in range(num_nodes) if other != node]
+        switches.append(([node],others,1,f'node_{node}_out'))
+        switches.append((others,[node],1,f'node_{node}_in'))
+    return Topology(f'HubAndSpoke(n={num_nodes})', links, switches)
+
+def fully_connected(num_nodes):
+    links = []
+    for i in range(num_nodes):
+        row = [1] * num_nodes
+        row[i] = 0
+        links.append(row)
+    return Topology(f'FullyConnected(n={num_nodes})', links)
+
+def ring(num_nodes):
+    links = []
+    for i in range(num_nodes):
+        row = [0] * num_nodes
+        row[(i+1) % num_nodes] = 1
+        row[(i-1) % num_nodes] = 1
+        links.append(row)
+    return Topology(f'Ring(n={num_nodes})', links)
+
+def line(num_nodes):
+    links = []
+    for i in range(num_nodes):
+        row = [0] * num_nodes
+        if i - 1 >= 0:
+            row[i-1] = 1
+        if i + 1 < num_nodes:
+            row[i+1] = 1
+        links.append(row)
+    return Topology(f'Line(n={num_nodes})', links)
+
+def star(num_nodes, non_blocking=True):
+    links = [[0 if i == 0 else 1 for i in range(num_nodes)]]
+    for i in range(1, num_nodes):
+        links.append([1 if j == 0 else 0 for j in range(num_nodes)])
+    switches = []
+    if not non_blocking:
+        points = [i for i in range(num_nodes) if i != 0]
+        switches.append(([0],points,1,'to_points'))
+        switches.append((points,[0],1,'from_points'))
+    return Topology(f'Star(n={num_nodes})', links, switches)
diff --git a/sccl/topologies/nvidia.py b/sccl/topologies/nvidia.py
new file mode 100644
index 0000000..57f2d46
--- /dev/null
+++ b/sccl/topologies/nvidia.py
@@ -0,0 +1,74 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from .topology import Topology
+
+from fractions import Fraction
+import subprocess
+
+def dgx1():
+    # (0 1 2 3) (4 5 6 7) are two sockets
+    # 0 1 3 2 is the high bandwidth chain in socket 1
+    # 4 5 7 6 is the high bandwidth chain in socket 2
+    # 0 4 and 2 6 are high bandwidth intersocket links
+
+    links = [
+        #0 1 2 3 4 5 6 7
+        [0, 2, 1, 1, 2, 0, 0, 0],
+        [2, 0, 1, 2, 0, 1, 0, 0],
+        [1, 1, 0, 2, 0, 0, 2, 0],
+        [1, 2, 2, 0, 0, 0, 0, 1],
+        [2, 0, 0, 0, 0, 2, 1, 1],
+        [0, 1, 0, 0, 2, 0, 1, 2],
+        [0, 0, 2, 0, 1, 1, 0, 2],
+        [0, 0, 0, 1, 1, 2, 2, 0]
+    ]
+
+    # self.symmetries = [
+    #     [0, 1, 2, 3, 4, 5, 6, 7], #0 goes to itself
+    #     [0, 1, 2, 3, 4, 5, 6, 7], #1 goes to itself
+    #     [2, 3, 0, 1, 6, 7, 4, 5], #2 goes to 0, 3 goes to 1, ... top - bottom symmetry
+    #     [2, 3, 0, 1, 6, 7, 4, 5], #3 goes to 1, 2 goes to 0, ... top - bottom symmetry
+    #     [4, 5, 6, 7, 0, 1, 2, 3], #4 goes to 0, 5 goes to 1, ... left - right symmetry
+    #     [4, 5, 6, 7, 0, 1, 2, 3], #5 goes to 1, 4 goes to 0, ... left - right symmetry
+    #     [6, 7, 4, 5, 2, 3, 0, 1], #6 goes to 0, 7 goes to 1, ... top-bottom + left-right
+    #     [6, 7, 4, 5, 2, 3, 0, 1]  #7 goes to 1, 6 goes to 0, ...
top-bottom + left-right + # ] + + # self.beta_bound = Fraction(7,6) + # self.diameter = 2 + + return Topology('DGX1', links) + +def nvlink_only(nvidia_smi_topo=None): + if nvidia_smi_topo == None: + nvidia_smi_topo = _get_nvidia_smi_topo() + links = _parse_nvidia_smi_topo(nvidia_smi_topo) + return Topology('NVLinkOnly', links) + +def _get_nvidia_smi_topo(): + output = subprocess.check_output("nvidia-smi topo -m".split()) + return output.decode("utf-8") + +def _parse_nvidia_smi_topo(output): + lines = output.splitlines() + before_legend = [] + for l in lines[1:]: + if l and l.startswith("GPU"): + # Only look at the rows for GPU + before_legend.append(l) + else: + break + devices = [x.split("\t")[0] for x in before_legend] + gpus = [i for i in range(len(before_legend)) + if before_legend[i].startswith("GPU")] + matrix = [x.split("\t")[1:] for x in before_legend] + nvlink_matrix = [[_nvlink_num(x[g]) for g in gpus] for x in matrix] + return nvlink_matrix + +def _nvlink_num(x): + x = x.strip() + if x.startswith("NV"): + return int(x[2:]) + else: + return 0 diff --git a/sccl/topologies/topology.py b/sccl/topologies/topology.py new file mode 100644 index 0000000..a6eb3ab --- /dev/null +++ b/sccl/topologies/topology.py @@ -0,0 +1,41 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +class Topology(object): + def __init__(self, name, links, switches=[]): + self.name = name + self.links = links + self.switches = switches + for srcs, dsts, bw, switch_name in switches: + if bw == 0: + raise ValueError(f'Switch {switch_name} has zero bandwidth, but switch bandwidths must be strictly positive. Please encode connectedness in links.') + if bw < 0: + raise ValueError(f'Switch {switch_name} has a negative bandwidth of {bw}. Bandwidth must be strictly positive.') + + def sources(self, dst): + for src, bw in enumerate(self.links[dst]): + if bw > 0: + yield src + + def destinations(self, src): + for dst, links in enumerate(self.links): + bw = links[src] + if bw > 0: + yield dst + + def link(self, src, dst): + return self.links[dst][src] + + def num_nodes(self): + return len(self.links) + + def nodes(self): + return range(self.num_nodes()) + + def bandwidth_constraints(self): + for dst, dst_links in enumerate(self.links): + for src, bw in enumerate(dst_links): + if bw > 0: + yield ([src], [dst], bw, f'{src}→{dst}') + for srcs, dsts, bw, switch_name in self.switches: + yield (srcs, dsts, bw, switch_name) diff --git a/sccl/topologies/transformers.py b/sccl/topologies/transformers.py new file mode 100644 index 0000000..5d1ca5e --- /dev/null +++ b/sccl/topologies/transformers.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from .topology import Topology + +def reverse_topology(topology): + ''' + Reverses the direction of all links and switches in the topology. + ''' + num_nodes = topology.num_nodes() + # Transpose the links + links = [[topology.links[src][dst] for src in range(num_nodes)] for dst in range(num_nodes)] + # Reverse the switches + switches = [(dsts, srcs, bw, f'{name}_reversed') for srcs, dsts, bw, name in topology.switches] + return Topology(f'Reverse{topology.name}', links, switches) + +def binarize_topology(topology): + ''' + Makes all link bandwidths 1 and removes all switches. Essentially, the bandwidth modeling part of the topology + is stripped out and only connectivity information is kept. 
+ ''' + num_nodes = topology.num_nodes() + links = [[1 if topology.links[src][dst] > 0 else 0 for src in range(num_nodes)] for dst in range(num_nodes)] + return Topology(f'Binarized{topology.name}', links, []) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..43ecfaf --- /dev/null +++ b/setup.py @@ -0,0 +1,22 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from setuptools import setup, find_packages + +setup( + name='sccl', + version='2.0.0', + packages=find_packages(), + entry_points={ + 'console_scripts': [ + 'sccl = sccl.__main__:main', + ], + }, + install_requires=[ + 'dataclasses; python_version < "3.7"', + 'z3-solver', + 'argcomplete', + 'lxml', + ], + python_requires='>=3.6', +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..7f3fd83 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. \ No newline at end of file diff --git a/tests/common.py b/tests/common.py new file mode 100644 index 0000000..1e9a70a --- /dev/null +++ b/tests/common.py @@ -0,0 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from sccl.collectives import * + +def null_collective(num_nodes): + return build_collective(f'Null(n={num_nodes})', num_nodes, 1, + lambda r, c: True, lambda r, c: False) + +def impossible_collective(num_nodes): + return build_collective(f'Impossible(n={num_nodes})', num_nodes, 1, + lambda r, c: False, lambda r, c: True) diff --git a/tests/test_algorithm.py b/tests/test_algorithm.py new file mode 100644 index 0000000..2063b26 --- /dev/null +++ b/tests/test_algorithm.py @@ -0,0 +1,20 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import pytest +from .common import * +from sccl.algorithm import Algorithm, Step +from sccl.topologies import fully_connected +from sccl.instance import Instance + +def test_invalid_empty(): + with pytest.raises(RuntimeError): + num_nodes = 2 + topo = fully_connected(num_nodes) + algo = Algorithm.make_implementation(impossible_collective(num_nodes), topo, Instance(1), [Step(1,[])]) + +def test_valid_empty(): + num_nodes = 2 + topo = fully_connected(num_nodes) + algo = Algorithm.make_implementation(null_collective(num_nodes), topo, Instance(1), [Step(1,[])]) + assert algo != None diff --git a/tests/test_analyses.py b/tests/test_analyses.py new file mode 100644 index 0000000..cfcc9a6 --- /dev/null +++ b/tests/test_analyses.py @@ -0,0 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import pytest +from sccl.topologies import Topology +from sccl.collectives import build_collective +from sccl.rounds_bound import * + +def test_rounds_bound_unimplementable(): + topo = Topology('Unconnected', [[0,0],[0,0]]) + coll = build_collective('Send', 2, 1, lambda r, c: r == 0, lambda r, c: r == 1) + assert lower_bound_rounds(topo, coll) == None diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..6520733 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,128 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +from sccl.topologies import * +from sccl.collectives import * +from sccl.serialization import * + +import os +import sys +import tempfile +import shutil + +class in_tempdir: + '''Context manager for changing to a temporary directory.''' + def __init__(self): + self.tempdir = tempfile.mkdtemp() + + def __enter__(self): + self.cwd = os.getcwd() + os.chdir(self.tempdir) + + def __exit__(self, etype, value, traceback): + os.chdir(self.cwd) + shutil.rmtree(self.tempdir) + +def _check_ncclizes(path): + assert 0 == os.system(f'sccl ncclize {path} -o ncclized.sccl.xml') + assert os.path.exists('ncclized.sccl.xml') + +def test_run_as_module(): + assert 0 == os.system(f'{sys.executable} -m sccl --help') + +def test_entrypoint(): + assert 0 == os.system('sccl --help') + +def test_solve_instance(): + with in_tempdir(): + assert 0 == os.system('sccl solve instance Ring Allgather --nodes 4 --steps 1 -o algo.json') + assert not os.path.exists('algo.json') + assert 0 == os.system('sccl solve instance Ring Allgather --nodes 2 --steps 1 -o algo.json') + assert os.path.exists('algo.json') + assert 0 == os.system('sccl solve instance Ring Allgather --nodes 2 --steps 1 -o algo.json --force') + _check_ncclizes('algo.json') + +def test_extra_memory(): + with in_tempdir(): + assert 0 == os.system('sccl solve instance Ring -n 4 Alltoall -s 2 --extra-memory 0 -o algo.json') + _check_ncclizes('algo.json') + +def test_solve_least_steps(): + assert 0 == os.system('sccl solve least-steps Ring Allgather --nodes 2') + assert 0 == os.system('sccl solve least-steps Ring Allgather --nodes 2 --initial-steps 2') + +def test_solve_pareto_optimal(): + with in_tempdir(): + assert 0 == os.system('sccl solve pareto-optimal Ring Allgather --nodes 4 -d .') + assert len(os.listdir('.')) == 1 + with in_tempdir(): + assert 0 == os.system('sccl solve pareto-optimal Ring Allgather --nodes 4 -d . 
--save-eagerly')
+        assert len(os.listdir('.')) == 2
+    assert 0 == os.system('sccl solve pareto-optimal Ring Alltoall --nodes 2 --assume-rpc-bound 1/1')
+    assert 0 == os.system('sccl solve pareto-optimal Ring Alltoall --nodes 2 --no-monotonic-feasibility')
+
+def test_ncclize():
+    with in_tempdir():
+        assert 0 == os.system('sccl solve instance Ring Allgather --nodes 2 --steps 1 -o algo.json')
+        assert os.path.exists('algo.json')
+        assert 0 == os.system('sccl ncclize algo.json -o ncclized1.sccl.xml')
+        assert os.path.exists('ncclized1.sccl.xml')
+        assert 0 == os.system('sccl ncclize algo.json -f --channel-policy One')
+        assert 0 == os.system('sccl ncclize algo.json -f --channel-policy MaxConcurrency')
+        assert 0 == os.system('sccl ncclize algo.json -f --channel-policy MatchTopology')
+        assert 0 == os.system('sccl ncclize algo.json -f --no-merge-contiguous')
+        assert 0 == os.system('sccl solve instance Star Alltoall --nodes 4 --steps 2 --rounds 4 -o algo_scratch.json')
+        assert 0 == os.system('sccl ncclize algo_scratch.json -f --remap-scratch')
+
+def test_custom_topology_and_collective():
+    with in_tempdir():
+        topo = Topology('CT', [[0, 1], [1, 0]])
+        coll = build_collective('CC', 2, 1, lambda r, c: r == 0, lambda r, c: r == 1)
+        save_sccl_object(topo, 'topo.json')
+        save_sccl_object(coll, 'coll.json')
+        assert 0 == os.system('sccl solve instance custom custom --topology-file topo.json --collective-file coll.json -s 1')
+
+def test_solve_bound_rounds():
+    assert '7/6' in os.popen('sccl analyze rounds DGX1 Allgather').read()
+
+def test_find_isomorphisms():
+    assert 0 == os.system('sccl analyze isomorphisms DGX1 DGX1')
+
+def test_distribute_alltoall_greedy():
+    with in_tempdir():
+        assert 0 == os.system('sccl solve instance Ring Alltoall --nodes 4 --steps 2 -o local.json')
+        assert 0 == os.system('sccl distribute alltoall-greedy local.json DistributedFullyConnected --copies 3 -o dist.json')
+        assert os.path.exists('dist.json')
+        _check_ncclizes('dist.json')
+        assert 0 == os.system('sccl distribute alltoall-greedy local.json DistributedHubAndSpoke --nodes 8')
+        assert 0 != os.system('sccl distribute alltoall-greedy local.json DistributedHubAndSpoke --nodes 5')
+
+def test_distribute_alltoall_scatter_gather():
+    with in_tempdir():
+        assert 0 == os.system('sccl solve instance DGX1 Gather --root 5 --steps 2 -o gather.json')
+        assert 0 == os.system('sccl solve instance DGX1 Scatter --root 5 --steps 2 -o scatter.json')
+        assert 0 == os.system('sccl distribute alltoall-gather-scatter gather.json scatter.json --copies 2 -o alltoall.json')
+        assert os.path.exists('alltoall.json')
+        _check_ncclizes('alltoall.json')
+
+def test_distribute_alltoall_scatter_gather_multiroot():
+    with in_tempdir():
+        assert 0 == os.system('sccl solve instance Ring -n 3 MultirootGather --roots 0 1 --steps 1 -o gather.json')
+        assert 0 == os.system('sccl solve instance Ring -n 3 MultirootScatter --roots 1 2 --steps 1 -o scatter.json')
+        assert 0 == os.system('sccl distribute alltoall-gather-scatter gather.json scatter.json --copies 2 -o alltoall.json')
+        assert os.path.exists('alltoall.json')
+        _check_ncclizes('alltoall.json')
+
+def test_distribute_alltoall_subproblem():
+    # TODO: make this test less brittle. Currently it will break when algorithm naming is changed, but we don't actually
+ with in_tempdir(): + assert 0 == os.system('sccl distribute alltoall-create-subproblem Line -n 2 --copies 2') + coll_name = 'AlltoallSubproblem.n2.copies2.sccl.json' + topo_name = 'Subtopo.localLine.n2.relays.0.sccl.json' + assert os.path.exists(coll_name) + assert os.path.exists(topo_name) + assert 0 == os.system('sccl solve instance custom custom --topology-file Subtopo.localLine.n2.relays.0.sccl.json --collective-file AlltoallSubproblem.n2.copies2.sccl.json -s 3 -r 4 -o subalgo.json') + assert 0 == os.system('sccl distribute alltoall-stitch-subproblem subalgo.json --copies 2 -o stitched.json') + assert os.path.exists('stitched.json') + _check_ncclizes('stitched.json') diff --git a/tests/test_distributors.py b/tests/test_distributors.py new file mode 100644 index 0000000..178a170 --- /dev/null +++ b/tests/test_distributors.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from .common import * +from sccl.topologies import fully_connected, ring, distributed_fully_connected +from sccl.collectives import alltoall +from sccl.instance import Instance +from sccl.path_encoding import PathEncoding +from sccl.distributors import * + + +def test_greedy_alltoall(): + num_nodes = 2 + num_copies = 2 + local_topo = fully_connected(num_nodes) + encoding = PathEncoding(local_topo, alltoall(num_nodes)) + local_algo = encoding.solve(Instance(1)) + dist_topo = distributed_fully_connected(local_topo, num_copies, remote_bw=1) + dist_algo = synthesize_greedy_distributed_alltoall(dist_topo, local_algo) + dist_algo.check_implements(alltoall(num_nodes * num_copies)) + +def test_alltoall_subproblem(): + num_nodes = 2 + num_copies = 2 + local_topo = ring(num_nodes) + sub_coll, sub_topo = make_alltoall_subproblem_collective_and_topology(local_topo, num_copies, [0]) + encoding = PathEncoding(sub_topo, sub_coll) + sub_algo = encoding.solve(Instance(3, extra_rounds=1)) + dist_algo = synthesize_alltoall_subproblem(sub_algo, num_copies) + dist_algo.check_implements(alltoall(num_nodes * num_copies)) diff --git a/tests/test_path_encoding.py b/tests/test_path_encoding.py new file mode 100644 index 0000000..19205ac --- /dev/null +++ b/tests/test_path_encoding.py @@ -0,0 +1,49 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +from sccl.path_encoding import PathEncoding +from sccl.topologies import fully_connected, line, dgx1 +from sccl.collectives import * +from sccl.instance import Instance + +def test_fc_noncombining(): + num_nodes = 2 + enc = PathEncoding(fully_connected(num_nodes), allgather(num_nodes)) + assert enc.solve(Instance(1, chunks=2)) == None + assert enc.solve(Instance(2, chunks=2)) != None + +def test_fc_combining_reducible(): + num_nodes = 2 + enc = PathEncoding(fully_connected(num_nodes), reduce_scatter(num_nodes)) + assert enc.solve(Instance(1, chunks=2)) == None + assert enc.solve(Instance(2, chunks=2)) != None + +def test_fc_combining_nonreducible(): + num_nodes = 2 + enc = PathEncoding(fully_connected(num_nodes), allreduce(num_nodes)) + assert enc.solve(Instance(1, chunks=2)) == None + assert enc.solve(Instance(2, chunks=2)) != None + +def test_dgx1_noncombining(): + topo = dgx1() + enc = PathEncoding(topo, allgather(topo.num_nodes())) + assert enc.solve(Instance(1)) == None + assert enc.solve(Instance(2)) != None + +def test_dgx1_combining_reducible(): + topo = dgx1() + enc = PathEncoding(topo, reduce_scatter(topo.num_nodes())) + assert enc.solve(Instance(1)) == None + assert enc.solve(Instance(2)) != None + +def test_dgx1_combining_nonreducible(): + topo = dgx1() + enc = PathEncoding(topo, allreduce(topo.num_nodes())) + assert enc.solve(Instance(1)) == None + assert enc.solve(Instance(2)) != None + +def test_memory_constraint(): + topo = line(3) + enc = PathEncoding(topo, alltoall(topo.num_nodes())) + assert enc.solve(Instance(2, extra_memory=0)) == None + assert enc.solve(Instance(2, extra_memory=1)) != None diff --git a/tests/test_serialization.py b/tests/test_serialization.py new file mode 100644 index 0000000..a875233 --- /dev/null +++ b/tests/test_serialization.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from .common import * +from sccl.serialization import SCCLEncoder, SCCLDecoder +from sccl.algorithm import Algorithm, Step +from sccl.topologies import fully_connected +from sccl.instance import Instance + +def test_algorithm_roundtrip(): + name = 'test_algorithm' + num_nodes = 2 + collective = null_collective(num_nodes) + topo = fully_connected(num_nodes) + steps = [Step(1,[(0,0,1)]),Step(2,[(1,1,0),(1,0,1)]),Step(1,[(0,1,0)])] + instance = Instance(3, pipeline=2) + algo1 = Algorithm(name, collective, topo, instance, steps) + json = SCCLEncoder().encode(algo1) + assert json != None + + algo2 = SCCLDecoder().decode(json) + assert algo2.name == name + assert algo2.instance == instance + assert algo2.steps == steps diff --git a/tests/test_topologies.py b/tests/test_topologies.py new file mode 100644 index 0000000..9c7b1ee --- /dev/null +++ b/tests/test_topologies.py @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +from sccl.topologies import * + +def test_local_topologies(): + assert hub_and_spoke(4) != None + assert fully_connected(6) != None + assert ring(3) != None + assert line(5) != None + assert star(6) != None + assert dgx1() != None + assert amd4() != None + assert amd8() != None + +def test_distributed_topologies(): + assert distributed_fully_connected(ring(4), 2, 1) != None + assert distributed_hub_and_spoke(star(6), 4, 2) != None + +def test_transformers(): + assert binarize_topology(dgx1()) != None + assert reverse_topology(dgx1()) != None + +def test_nvlink_only(): + dgx1_topo = ''' GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_2 mlx5_1 mlx5_3 CPU Affinity +GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS PIX SYS PHB SYS 0-19,40-59 +GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS PIX SYS PHB SYS 0-19,40-59 +GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PHB SYS PIX SYS 0-19,40-59 +GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PHB SYS PIX SYS 0-19,40-59 +GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS PIX SYS PHB 20-39,60-79 +GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS PIX SYS PHB 20-39,60-79 +GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS PHB SYS PIX 20-39,60-79 +GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS PHB SYS PIX 20-39,60-79 +mlx5_0 PIX PIX PHB PHB SYS SYS SYS SYS X SYS PHB SYS +mlx5_2 SYS SYS SYS SYS PIX PIX PHB PHB SYS X SYS PHB +mlx5_1 PHB PHB PIX PIX SYS SYS SYS SYS PHB SYS X SYS +mlx5_3 SYS SYS SYS SYS PHB PHB PIX PIX SYS PHB SYS X + +Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) + PIX = Connection traversing a single PCIe switch + NV# = Connection traversing a bonded set of # NVLinks''' + topo = nvlink_only(dgx1_topo) + assert topo != None + assert topo.num_nodes() == 8 From c072809fa5d502c57f73189bec9d5880cf10ebbe Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 18 May 2021 15:37:31 -0700 Subject: [PATCH 002/135] Improve readme and add custom collective example --- README.md | 67 ++++++++++++++++++++++++++++++++++++++++++++++-- examples/send.py | 12 +++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 examples/send.py diff --git a/README.md b/README.md index c17eee4..212a8c2 100644 --- a/README.md +++ b/README.md @@ -15,11 +15,11 @@ To enable Bash completion for `sccl`: echo 'eval "$(register-python-argcomplete sccl)"' >> ~/.bashrc ``` -## Usage +## Synthesizing Algorithms At its core SCCL answers synthesis queries is there an algorithm for a given *topology* that implements a given *collective* in a given number of steps, bandwidth usage, memory limits, etc. These additional parameters are called the *instance*. -For example, to synthesize an Allgather algorithm for an [NVIDIA DGX-1](https://www.nvidia.com/en-us/data-center/dgx-1/) that completes in 4 steps: +SCCL groups its solver strategies under the `sccl solve` subcommand. For example, to synthesize a specific `instance` of an Allgather algorithm for the [NVIDIA DGX-1](https://www.nvidia.com/en-us/data-center/dgx-1/) that completes in 4 steps: ``` $ sccl solve instance DGX1 Allgather --steps 4 Solving instance steps=4... synthesized! 
(0.7s)
Wrote to Allgather.n8-DGX1-steps4.sccl.json
```
@@ -55,6 +55,69 @@
Wrote to Allgather.n8-DGX1-steps2.rounds3.chunks2.sccl.json
Wrote to Allgather.n8-DGX1-steps3.rounds7.chunks6.sccl.json
```
+## Collectives
+
+SCCL includes a number of built-in common collectives.
+
+| Collective | Arguments | Description | Kind |
+| - | - | - | - |
+| Broadcast | `--root N` | Send data from root to all nodes. | NC |
+| Reduce | `--root N` | Combine data from all nodes to root. | CR |
+| Scatter | `--root N` | Send slices of data from root to all nodes. | NC |
+| Gather | `--root N` | Send slices of data from all nodes to root. | NC |
+| Allgather | | Send slices of data from all nodes to all nodes. | NC |
+| Allreduce | | Combine data from all nodes to all nodes. | CNR |
+| Alltoall | | Transpose data between all nodes. | NC |
+| ReduceScatter | | Combine slices of data to all nodes. | CR |
+| Scan | | Combine partial prefixes of data to all nodes in sequence. | CNR |
+| MultirootBroadcast | `--roots N [N ...]` | Like Broadcast, but a set of nodes has slices of the input. | NC |
+| MultirootScatter | `--roots N [N ...]` | Like Scatter, but a set of nodes has slices of the input. | NC |
+| MultirootGather | `--roots N [N ...]` | Like Gather, but the output is sent in slices to a set of nodes. | NC |
+| custom | `--collective-file` | Arbitrary collective serialized by the user. | ? |
+
+Custom collectives may be defined by instantiating the `Collective` class, which is easiest through the `build_collective` function. For example, a send from rank 2 to rank 7 in an 8-node topology can be defined and saved with:
+```
+from sccl.collectives import build_collective
+from sccl.serialization import save_sccl_object
+
+precondition = lambda r, c: r == 2
+postcondition = lambda r, c: r == 7
+coll = build_collective('Send', 8, 1, precondition, postcondition)
+save_sccl_object(coll, 'send.json')
+```
+
+The *kind* of the collective determines support for some features of SCCL:
+- **NC** are non-combining collectives, and are always supported.
+- **CR** are combining collectives that have a non-combining dual collective, and are supported through a reduction.
+- **CNR** are combining collectives with no dual, which may not always be supported.
+
+Currently the rounds per chunk analysis described below cannot support CNR collectives.
+
+## Steps and Rounds
+
+SCCL uses two related concepts, *steps and rounds*, to talk about the running time of algorithms. *Steps* is the number of sequential sets of sends the algorithm consists of, where all sends inside a step execute in parallel. The number of sends between two nodes in a single step is limited by the bandwidth available in the topology. However, a step may consist of multiple *rounds*, which act as a multiplier for all links in the topology during that step.
+
+How much data a single round corresponds to depends on the actual size of a chunk at runtime, and the number of chunks a collective uses can change (e.g. this can be controlled directly in the `instance` strategy by setting `--chunks N`). Thus, for each collective, the total data usage of different algorithms implementing it can be measured with their *rounds per chunk*. For example, an algorithm with 3 steps and 1 extra round operating on 2 chunks uses (3 + 1) / 2 = 2 rounds per chunk.
+
+SCCL provides a standalone analysis to find a lower bound for the *rounds per chunk* required by any instance. For example, to find the least rounds per chunk for a Gather in a DGX-1:
+```
+$ sccl analyze rounds DGX1 Gather
+Gather(n=8,root=0) algorithms need at least 7/6 rounds in DGX1 topology.
+```
+In this case the bound happens to be tight and the `pareto-optimal` strategy would use it to detect that it has found a bandwidth-optimal algorithm.
+
+## Distributed Algorithms
+
+SCCL provides routines to synthesize algorithms for distributed topologies under the `sccl distribute` subcommand. These work by taking algorithms for a local collective and stitching instances of them together to create a distributed one.
+
+**Alltoall from Gather and Scatter:** `alltoall-gather-scatter` combines a Gather and a Scatter algorithm with a transpose step in the middle to form a distributed Alltoall algorithm. For example, an Alltoall algorithm for a cluster of 4 DGX-1 machines can be created with:
+```
+sccl solve least-steps DGX1 Gather -o gather.json
+sccl solve least-steps DGX1 Scatter -o scatter.json --root 1
+sccl distribute alltoall-gather-scatter gather.json scatter.json --copies 4 -o alltoall.json
+```
+This distributor works with any Gather and Scatter algorithm, as long as their roots have a direct connection in the topology. SCCL also provides multi-root versions of Gather and Scatter that can be substituted here.
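
The same distribution is also available from Python; a minimal sketch mirroring the `sccl.autosynth` code added later in this series, with roots 0 and 1 (which are directly connected in the DGX-1 topology):
```
from sccl.topologies import dgx1
from sccl.collectives import gather, scatter
from sccl.strategies import solve_least_steps
from sccl.distributors.gather_scatter_alltoall import synthesize_gather_scatter_distributed_alltoall

gather_algo = solve_least_steps(dgx1(), gather(8, 0))
scatter_algo = solve_least_steps(dgx1(), scatter(8, 1))
alltoall_algo = synthesize_gather_scatter_distributed_alltoall(4, gather_algo, scatter_algo)
```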
+
## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
diff --git a/examples/send.py b/examples/send.py
new file mode 100644
index 0000000..c4c1b46
--- /dev/null
+++ b/examples/send.py
@@ -0,0 +1,12 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+# This script defines and saves a custom collective to send from rank 2 to rank 7
+
+from sccl.collectives import build_collective
+from sccl.serialization import save_sccl_object
+
+precondition = lambda r, c: r == 2
+postcondition = lambda r, c: r == 7
+coll = build_collective('Send', 8, 1, precondition, postcondition)
+save_sccl_object(coll, 'send.json')

From 25fd22aca4554d130a0768650b28760b1852623c Mon Sep 17 00:00:00 2001
From: Olli Saarikivi
Date: Fri, 21 May 2021 15:37:08 -0700
Subject: [PATCH 003/135] Fix offset with counts and instances

---
 sccl/ncclize.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sccl/ncclize.py b/sccl/ncclize.py
index 26896e2..5d5c3b7 100644
--- a/sccl/ncclize.py
+++ b/sccl/ncclize.py
@@ -418,8 +418,8 @@ def make_interval(a,b):
    for (src, dst), addrs in grouped_sends.items():
        for src_buf, src_off, dst_buf, dst_off, cnt in make_intervals(src, dst, addrs):
            for i in range(instances):
-                new_src_off = src_off * instances + i
-                new_dst_off = dst_off * instances + i
+                new_src_off = src_off * instances + i * cnt
+                new_dst_off = dst_off * instances + i * cnt
                send = (src, dst, src_buf, new_src_off, dst_buf, new_dst_off, cnt)
                sends.append(send)

From a745898d8f767afcb2966c2fbedbc9bef1fe97c0 Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Thu, 27 May 2021 17:25:06 +0000
Subject: [PATCH 004/135] added a print for the unpermute function

---
 examples/unpermute_dgx1.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/unpermute_dgx1.py b/examples/unpermute_dgx1.py
index 2d87a11..74f2735 100644
--- a/examples/unpermute_dgx1.py
+++ b/examples/unpermute_dgx1.py
@@ -8,7 +8,8 @@
 def solve_dgx1_permutation():
     local = nvlink_only()
-    isomorphisms = find_isomorphisms(dgx1(), local, limit=1)
+    isomorphisms = find_isomorphisms(dgx1(), local, limit=4)
     if len(isomorphisms) == 0:
         raise RuntimeError('No isomorphism to DGX1 found')
-    return isomorphisms[0].nodes
+    return isomorphisms
+print(solve_dgx1_permutation())

From e186f455cf5ec623563eebeb602440f587577151 Mon Sep 17 00:00:00 2001
From: Olli Saarikivi Date: Fri, 2 Jul 2021 14:41:32 -0700 Subject: [PATCH 005/135] Add proto attribute to ncclize XML --- sccl/ncclize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sccl/ncclize.py b/sccl/ncclize.py index 5d5c3b7..c0c7514 100644 --- a/sccl/ncclize.py +++ b/sccl/ncclize.py @@ -559,6 +559,7 @@ def expand_mappings(mappings): # Generate the XML structure algo_elem = ET.Element('algo') algo_elem.set('name', algorithm.name) + algo_elem.set('proto', 'Simple') algo_elem.set('nchannels', str(1 + max(max(tb.channel for tb in gpu.threadblocks) for gpu in gpus.values()))) if old_format: algo_elem.set('nchunksperloop', str(max(max(gpu.input_chunks, gpu.output_chunks) for gpu in gpus.values()))) From 392057e9480cb24112dbac3d50d885c2d1735589 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 17 Jun 2021 06:12:04 -0700 Subject: [PATCH 006/135] Add remote bw option to alltoall-gather-scatter Useful for controlling ncclize concurrency. Exposed as -bw/--remote-bandwidth. --- sccl/cli/distribute.py | 3 ++- sccl/distributors/gather_scatter_alltoall.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sccl/cli/distribute.py b/sccl/cli/distribute.py index aa24179..09461a9 100644 --- a/sccl/cli/distribute.py +++ b/sccl/cli/distribute.py @@ -41,6 +41,7 @@ def make_handle_gather_scatter_alltoall(cmd_parsers): read_gather_algorithm = add_input_algorithm(cmd, name='gather') read_scatter_algorithm = add_input_algorithm(cmd, name='scatter') cmd.add_argument('--copies', type=int, metavar='N', required=True, help='copies of the local topology to be made') + cmd.add_argument('-bw', '--remote-bandwidth', type=int, default=1, help='remote bandwidth', metavar='N') validate_output_args, output_handler = add_output_algorithm(cmd) def handle(args, command): @@ -50,7 +51,7 @@ def handle(args, command): gather_algorithm = read_gather_algorithm(args) scatter_algorithm = read_scatter_algorithm(args) validate_output_args(args) - algo = synthesize_gather_scatter_distributed_alltoall(args.copies, gather_algorithm, scatter_algorithm, logging=True) + algo = synthesize_gather_scatter_distributed_alltoall(args.copies, gather_algorithm, scatter_algorithm, args.remote_bandwidth, logging=True) output_handler(args, algo) return True diff --git a/sccl/distributors/gather_scatter_alltoall.py b/sccl/distributors/gather_scatter_alltoall.py index 6da6ec8..4ba688c 100644 --- a/sccl/distributors/gather_scatter_alltoall.py +++ b/sccl/distributors/gather_scatter_alltoall.py @@ -6,7 +6,7 @@ from sccl.instance import * from sccl.topologies import distributed_fully_connected -def synthesize_gather_scatter_distributed_alltoall(num_copies, gather_algo, scatter_algo, logging=False): +def synthesize_gather_scatter_distributed_alltoall(num_copies, gather_algo, scatter_algo, remote_bw=1, logging=False): if gather_algo.is_pipelined() or scatter_algo.is_pipelined(): raise ValueError('Pipelining is not supported.') @@ -83,7 +83,7 @@ def synthesize_gather_scatter_distributed_alltoall(num_copies, gather_algo, scat print(f'Multiplying chunks by {len(gather_roots)} to match the number of roots.') collective = alltoall(nodes) - topology = distributed_fully_connected(gather_algo.topology, num_copies, 1) + topology = distributed_fully_connected(gather_algo.topology, num_copies, remote_bw) def nth_chunk_for_pair(src, dst, idx): # The following chunk calculation respects both the _scattered and _transpose From da6cb1ac56b618fc4afc0c109f3b55fb822d6886 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Mon, 12 Jul 2021 
15:05:47 -0700
Subject: [PATCH 007/135] Start sccl.autosynth package

Detection logic mostly not implemented yet.
---
 sccl/__init__.py           |  7 ++++++-
 sccl/autosynth/__init__.py | 30 ++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 sccl/autosynth/__init__.py

diff --git a/sccl/__init__.py b/sccl/__init__.py
index 7f3fd83..e123819 100644
--- a/sccl/__init__.py
+++ b/sccl/__init__.py
@@ -1,2 +1,7 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
\ No newline at end of file
+# Licensed under the MIT License.
+
+from sccl.autosynth import detect_node_type
+
+def autosynth():
+    node_type = detect_node_type()
diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py
new file mode 100644
index 0000000..8ad6a4b
--- /dev/null
+++ b/sccl/autosynth/__init__.py
@@ -0,0 +1,30 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from sccl.topologies.nvidia import nvlink_only
+import subprocess
+import re
+
+def detect_node_type():
+    node_type = _detect_nvidia_node_type()
+    if node_type != None:
+        return node_type
+
+def _detect_nvidia_node_type():
+    try:
+        smi_topo = subprocess.check_output(['nvidia-smi', 'topo', '-m']).decode("utf-8")
+    except FileNotFoundError:
+        return None
+    except subprocess.CalledProcessError:
+        return 'unknown'
+
+    nvlink_topo = nvlink_only(smi_topo)
+
+    if nvlink_topo.num_nodes() == 8: # DGX-1 and DGX A100 like nodes
+        if _is_one_host_ib_dgx1(smi_topo):
+            return 'one_host_ib_dgx1'
+
+def _is_one_host_ib_dgx1(smi_topo):
+    ib_host = re.findall('^mlx\\d_\\d(\s+NODE)*\s+X(\s+NODE)*$', smi_topo, re.MULTILINE)
+    ib_any = re.findall('^mlx\\d_\\d.*$', smi_topo, re.MULTILINE)
+    return len(ib_host) == 1 and len(ib_any) == 1

From 8af61a57b4d40652b06a71f5664f70b79a73e3d1 Mon Sep 17 00:00:00 2001
From: Olli Saarikivi
Date: Tue, 13 Jul 2021 11:44:25 -0700
Subject: [PATCH 008/135] Work in progress fleshed out autosynth

---
 sccl/__init__.py                       |  4 +--
 sccl/autosynth/__init__.py             | 41 ++++++++++----
 sccl/autosynth/dgx1_relay_node_plan.py | 42 ++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 9 deletions(-)
 create mode 100644 sccl/autosynth/dgx1_relay_node_plan.py

diff --git a/sccl/__init__.py b/sccl/__init__.py
index e123819..5d84f6b 100644
--- a/sccl/__init__.py
+++ b/sccl/__init__.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
+# Licensed under the MIT License.

-from sccl.autosynth import detect_node_type
+import sccl.autosynth

 def autosynth():
-    node_type = detect_node_type()
+    sccl.autosynth.init()
diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py
index 8ad6a4b..f82d1d2 100644
--- a/sccl/autosynth/__init__.py
+++ b/sccl/autosynth/__init__.py
@@ -2,29 +2,56 @@ # Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from sccl.topologies.nvidia import nvlink_only
+from sccl.autosynth.dgx1_relay_node_plan import DGX1RelayNodePlan
import subprocess
import re

-def detect_node_type():
-    node_type = _detect_nvidia_node_type()
-    if node_type != None:
-        return node_type
+def init(logging=False):
+    try:
+        from mpi4py import MPI
+    except ImportError as e:
+        print('Please install the mpi4py package to use SCCL autosynth.')
+        raise e
+
+    comm = MPI.COMM_WORLD
+    size = comm.Get_size()
+    rank = comm.Get_rank()
+
+    machine = detect_machine()
+    names = comm.gather(machine[0], root=0)
+    if rank == 0:
+        for i in range(len(names) - 1):
+            if names[i] != names[i+1]:
+                raise RuntimeError(f'Rank {i} detected machine as {names[i]} but rank {i+1} detected machine as {names[i+1]}.')
+        plan = select_synthesis_plan(machine)
+        for algo in plan.synthesize(size, ['Alltoall'], logging):
+            pass

-def _detect_nvidia_node_type():
+def detect_machine():
+    machine = _detect_nvidia_machine()
+    if machine != None:
+        return machine
+    return ('unknown', None)
+
+def _detect_nvidia_machine():
    try:
        smi_topo = subprocess.check_output(['nvidia-smi', 'topo', '-m']).decode("utf-8")
    except FileNotFoundError:
        return None
    except subprocess.CalledProcessError:
-        return 'unknown'
+        return ('unknown', None)

    nvlink_topo = nvlink_only(smi_topo)

    if nvlink_topo.num_nodes() == 8: # DGX-1 and DGX A100 like nodes
        if _is_one_host_ib_dgx1(smi_topo):
-            return 'one_host_ib_dgx1'
+            return ('one_host_ib_dgx1', nvlink_topo)

def _is_one_host_ib_dgx1(smi_topo):
    ib_host = re.findall('^mlx\\d_\\d(\s+NODE)*\s+X(\s+NODE)*$', smi_topo, re.MULTILINE)
    ib_any = re.findall('^mlx\\d_\\d.*$', smi_topo, re.MULTILINE)
    return len(ib_host) == 1 and len(ib_any) == 1
+
+def select_synthesis_plan(machine):
+    machine_name, machine_info = machine
+    if machine_name == 'one_host_ib_dgx1':
+        return DGX1RelayNodePlan(machine_info)
diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py
new file mode 100644
index 0000000..37635d9
--- /dev/null
+++ b/sccl/autosynth/dgx1_relay_node_plan.py
@@ -0,0 +1,42 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from sccl.topologies import dgx1
+from sccl.collectives import gather, scatter
+from sccl.strategies import solve_least_steps
+from sccl.distributors.gather_scatter_alltoall import synthesize_gather_scatter_distributed_alltoall
+from sccl.isomorphisms import find_isomorphisms
+
+class DGX1RelayNodePlan:
+    def __init__(self, local_topo):
+        self.local_topo = local_topo
+
+    def synthesize(self, world_size, collective_names, logging=False):
+        if world_size % self.local_topo.num_nodes() != 0:
+            raise RuntimeError('Local machine size does not evenly divide world size.')
+        num_machines = world_size // self.local_topo.num_nodes()
+        for name in collective_names:
+            if name == 'Alltoall':
+                yield self._synthesize_alltoall(num_machines, logging)
+
+    def _synthesize_alltoall(self, num_machines, logging):
+        outbound, inbound = self._select_root_nodes()
+        gather_coll = gather(8, outbound)
+        scatter_coll = scatter(8, inbound)
+        gather_algo = solve_least_steps(dgx1(), gather_coll, logging=logging)
+        scatter_algo = solve_least_steps(dgx1(), scatter_coll, logging=logging)
+        synthesize_gather_scatter_distributed_alltoall(num_machines, gather_algo, scatter_algo, logging)
+
+    def _select_root_nodes(self):
+        # TODO: is this always the right thing?
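+        # Note: roots 0 and 1 are directly connected in dgx1(), which the gather-scatter distributor requires of its roots.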
+ return (0,1) + + def local_rank_permutation(self): + isomorphisms = find_isomorphisms(dgx1(), self.local_topo) + if len(isomorphisms) != 4: + raise RuntimeError(f'Expected to find 4 isomorphisms to DGX1 topology, but found {len(isomorphisms)}.') + return self._select_isomorphism(isomorphisms) + + def _select_isomorphism(self, isomorphisms): + # TODO: do the microbenchmarking + return isomorphisms[0] From 255ba45d864b50473b5c397ee01266e475b90337 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 14 Jul 2021 00:17:00 +0000 Subject: [PATCH 009/135] inspector-topo is added --- sccl/autosynth/dgx1_relay_node_plan.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index 37635d9..7942e71 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -6,6 +6,7 @@ from sccl.strategies import solve_least_steps from sccl.distributors.gather_scatter_alltoall import synthesize_gather_scatter_distributed_alltoall from sccl.isomorphisms import find_isomorphisms +import re, subprocess class DGX1RelayNodePlan: def __init__(self, local_topo): @@ -38,5 +39,13 @@ def local_rank_permutation(self): return self._select_isomorphism(isomorphisms) def _select_isomorphism(self, isomorphisms): - # TODO: do the microbenchmarking - return isomorphisms[0] + topo_detect_output = subprocess.run(['inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}).stdout.decode('utf-9') + print(topo_detect_output) + g = re.search("GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) + if g is None: + raise RuntimeError(f'expected to detect a pair of GPUs connected to IB but something went wrong!') + ib_gpus = {g.group(1), g.group(2)} + for iso in isomorphisms: + if ib_gpus.intersection({iso[0],iso[2]}) == None: + return iso + return None From c417aa3492f038a2fd86ee0aacb4c8c821bd6831 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 13 Jul 2021 12:19:37 -0700 Subject: [PATCH 010/135] Autosynth finish top level logic (untested) --- sccl/autosynth/__init__.py | 33 +++++++++++++++++++++++--- sccl/autosynth/dgx1_relay_node_plan.py | 9 ++++--- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index f82d1d2..811f12d 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -3,8 +3,11 @@ from sccl.topologies.nvidia import nvlink_only from sccl.autosynth.dgx1_relay_node_plan import DGX1RelayNodePlan +from sccl.ncclize import ncclize import subprocess import re +import tempfile +import os def init(logging=False): try: @@ -16,15 +19,39 @@ def init(logging=False): size = comm.Get_size() rank = comm.Get_rank() + collective_names = ['Alltoall'] + machine = detect_machine() + plan = select_synthesis_plan(machine) names = comm.gather(machine[0], root=0) if rank == 0: for i in range(len(names) - 1): if names[i] != names[i+1]: raise RuntimeError(f'Rank {i} detected machine as {names[i]} but rank {i+1} detected machine as {names[i+1]}.') - plan = select_synthesis_plan(machine) - for algo in plan.synthesize(size, ['Alltoall'], logging): - pass + efs = [] + for name in collective_names: + algo = plan.synthesize(size, name, logging) + efs.append(ncclize(algo, old_format=True, use_scratch=True)) + else: + efs = None + efs = comm.bcast(efs, root=0) + + tempdir = tempfile.mkdtemp() + ef_files = [] + for name, ef in zip(collective_names, efs): + ef_file = 
os.path.join(tempdir, f'{name}.xml') + ef_files.append(ef_file) + with open(ef_file, 'w') as f: + f.write(ef) + if logging: + print(f'Wrote to {ef_file}') + + if len(ef_files) != 1: + raise RuntimeError(f'Only a single algorithm is supported currently by the NCCL backend, but got {len(efs)}.') + os.environ['SCCL_XML_FILE'] = ef_files[0] + + perm = plan.local_rank_permutation() + os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(perm.nodes) def detect_machine(): machine = _detect_nvidia_machine() diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index 7942e71..439a32d 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -12,13 +12,12 @@ class DGX1RelayNodePlan: def __init__(self, local_topo): self.local_topo = local_topo - def synthesize(self, world_size, collective_names, logging=False): + def synthesize(self, world_size, collective_name, logging=False): if world_size % self.local_topo.num_nodes() != 0: raise RuntimeError('Local machine size does not evenly divide world size.') num_machines = world_size // self.local_topo.num_nodes() - for name in collective_names: - if name == 'Alltoall': - yield self._synthesize_alltoall(num_machines, logging) + if collective_name == 'Alltoall': + return self._synthesize_alltoall(num_machines, logging) def _synthesize_alltoall(self, num_machines, logging): outbound, inbound = self._select_root_nodes() @@ -26,7 +25,7 @@ def _synthesize_alltoall(self, num_machines, logging): scatter_coll = scatter(8, inbound) gather_algo = solve_least_steps(dgx1(), gather_coll, logging=logging) scatter_algo = solve_least_steps(dgx1(), scatter_coll, logging=logging) - synthesize_gather_scatter_distributed_alltoall(num_machines, gather_algo, scatter_algo, logging) + return synthesize_gather_scatter_distributed_alltoall(num_machines, gather_algo, scatter_algo, logging) def _select_root_nodes(self): # TODO: is this always the right thing? From bb608549acc2e10dc04e6dd49972068fa41eac8d Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 13 Jul 2021 13:24:41 -0700 Subject: [PATCH 011/135] Fixes to autosynth --- sccl/__init__.py | 4 ++-- sccl/autosynth/__init__.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sccl/__init__.py b/sccl/__init__.py index 5d84f6b..bb6234c 100644 --- a/sccl/__init__.py +++ b/sccl/__init__.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-import sccl.autosynth as as +import sccl.autosynth as _autosynth def autosynth(): - as.init() + _autosynth.init() diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 811f12d..5726c1b 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -82,3 +82,5 @@ def select_synthesis_plan(machine): machine_name, machine_info = machine if machine_name == 'one_host_ib_dgx1': return DGX1RelayNodePlan(machine_info) + else: + raise RuntimeError(f'Unhandled machine type {machine_name}.') From 03f230f29392d2314957f4426b3e7e2ba1e3e10b Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 13 Jul 2021 13:29:42 -0700 Subject: [PATCH 012/135] Logging in autosynth --- sccl/__init__.py | 4 ++-- sccl/autosynth/__init__.py | 15 +++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/sccl/__init__.py b/sccl/__init__.py index bb6234c..8aa94d7 100644 --- a/sccl/__init__.py +++ b/sccl/__init__.py @@ -3,5 +3,5 @@ import sccl.autosynth as _autosynth -def autosynth(): - _autosynth.init() +def autosynth(logging=False): + _autosynth.init(logging=logging) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 5726c1b..0d177de 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -21,7 +21,7 @@ def init(logging=False): collective_names = ['Alltoall'] - machine = detect_machine() + machine = detect_machine(logging) plan = select_synthesis_plan(machine) names = comm.gather(machine[0], root=0) if rank == 0: @@ -53,25 +53,32 @@ def init(logging=False): perm = plan.local_rank_permutation() os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(perm.nodes) -def detect_machine(): - machine = _detect_nvidia_machine() +def detect_machine(logging): + machine = _detect_nvidia_machine(logging) if machine != None: return machine return ('unknown', None) -def _detect_nvidia_machine(): +def _detect_nvidia_machine(logging): try: smi_topo = subprocess.check_output(['nvidia-smi', 'topo', '-m']).decode("utf-8") except FileNotFoundError: return None except subprocess.CalledProcessError: + if logging: + print('Found nvidia-smi, but got error.') return ('unknown', None) nvlink_topo = nvlink_only(smi_topo) if nvlink_topo.num_nodes == 8: # DGX-1 and DGX A100 like nodes + if logging: + print('8 GPUs, so looks like a DGX-1 or DGX A100.') if _is_one_host_ib_dgx1(): return ('one_host_ib_dgx1', nvlink_topo) + else: + if logging: + print('Unknown network configuration.') def _is_one_host_ib_dgx1(smi_topo): ib_host = re.findall('^mlx\\d_\\d(\s+NODE)*\s+X(\s+NODE)*&', smi_topo, re.MULTILINE) From cf9fcdc9465cd5ceefe9eb37b6d3c9c41510adf1 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 13 Jul 2021 13:42:56 -0700 Subject: [PATCH 013/135] Fix autosynth regexes --- sccl/autosynth/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 0d177de..49024e3 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -81,8 +81,8 @@ def _detect_nvidia_machine(logging): print('Unknown network configuration.') def _is_one_host_ib_dgx1(smi_topo): - ib_host = re.findall('^mlx\\d_\\d(\s+NODE)*\s+X(\s+NODE)*&', smi_topo, re.MULTILINE) - ib_any = re.findall('^mlx\\d_\\d.*&', smi_topo, re.MULTILINE) + ib_host = re.findall('^mlx\\d_\\d(?:\s+NODE)*\s+X(?:\s+NODE)*$', smi_topo, re.MULTILINE) + ib_any = re.findall('^mlx\\d_\\d.*$', smi_topo, re.MULTILINE) return len(ib_host) == 1 and len(ib_any) == 1 def select_synthesis_plan(machine): From 
b30ba4a995ce53ee9f1d117905c7afe1130db5ac Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 13 Jul 2021 13:46:11 -0700 Subject: [PATCH 014/135] Fixes to autosynth --- sccl/autosynth/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 49024e3..d564f26 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -60,9 +60,13 @@ def detect_machine(logging): return ('unknown', None) def _detect_nvidia_machine(logging): + if logging: + print('Checking for NVIDIA machines') try: smi_topo = subprocess.check_output(['nvidia-smi', 'topo', '-m']).decode("utf-8") except FileNotFoundError: + if logging: + print('nvidia-smi not found.') return None except subprocess.CalledProcessError: if logging: @@ -71,14 +75,15 @@ def _detect_nvidia_machine(logging): nvlink_topo = nvlink_only(smi_topo) - if nvlink_topo.num_nodes == 8: # DGX-1 and DGX A100 like nodes + if nvlink_topo.num_nodes() == 8: # DGX-1 and DGX A100 like nodes if logging: print('8 GPUs, so looks like a DGX-1 or DGX A100.') - if _is_one_host_ib_dgx1(): + if _is_one_host_ib_dgx1(smi_topo): return ('one_host_ib_dgx1', nvlink_topo) else: if logging: print('Unknown network configuration.') + return ('unknown', None) def _is_one_host_ib_dgx1(smi_topo): ib_host = re.findall('^mlx\\d_\\d(?:\s+NODE)*\s+X(?:\s+NODE)*$', smi_topo, re.MULTILINE) From 21e619d3bd8b52f3ac920436d98539cf1a7cbc02 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 13 Jul 2021 13:49:43 -0700 Subject: [PATCH 015/135] Autosynth regex fix --- sccl/autosynth/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index d564f26..0c38dc5 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -86,7 +86,7 @@ def _detect_nvidia_machine(logging): return ('unknown', None) def _is_one_host_ib_dgx1(smi_topo): - ib_host = re.findall('^mlx\\d_\\d(?:\s+NODE)*\s+X(?:\s+NODE)*$', smi_topo, re.MULTILINE) + ib_host = re.findall('^mlx\\d_\\d(?:\s+NODE)*\s+X(?:\s+NODE)*\s+$', smi_topo, re.MULTILINE) ib_any = re.findall('^mlx\\d_\\d.*$', smi_topo, re.MULTILINE) return len(ib_host) == 1 and len(ib_any) == 1 From 4136ec9df405c5c31c73fa0cf40328cd1b27748e Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 13 Jul 2021 13:51:35 -0700 Subject: [PATCH 016/135] Autosynth fix CUDA_VISIBLE_DEVICES --- sccl/autosynth/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 0c38dc5..fe7d58d 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -51,7 +51,7 @@ def init(logging=False): os.environ['SCCL_XML_FILE'] = ef_files[0] perm = plan.local_rank_permutation() - os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(perm.nodes) + os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(rank) for rank in perm.nodes) def detect_machine(logging): machine = _detect_nvidia_machine(logging) From 10bc100c52f43a7c4faf5c59cfcafa595f15c751 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 14 Jul 2021 01:09:43 +0000 Subject: [PATCH 017/135] inspector-topo is correct now --- sccl/autosynth/dgx1_relay_node_plan.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index 439a32d..f3668a9 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -38,13 +38,17 @@ def 
local_rank_permutation(self): return self._select_isomorphism(isomorphisms) def _select_isomorphism(self, isomorphisms): - topo_detect_output = subprocess.run(['inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}).stdout.decode('utf-9') - print(topo_detect_output) + print("Running inspector-topo to find the IB placement. This will take a minute...") + topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) + print(topo_detect) + topo_detect_output = topo_detect.stdout.decode('utf-8') g = re.search("GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) if g is None: raise RuntimeError(f'expected to detect a pair of GPUs connected to IB but something went wrong!') - ib_gpus = {g.group(1), g.group(2)} + ib_gpus = {int(g.group(1)), int(g.group(2))} + print(isomorphisms) for iso in isomorphisms: - if ib_gpus.intersection({iso[0],iso[2]}) == None: + if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: return iso + raise RuntimeError(f'expected an isomorphism to match our expectation but none of them did!') return None From 2576874de1e04634ea9e2ea36b9e52c4563e2343 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 14 Jul 2021 01:16:23 +0000 Subject: [PATCH 018/135] inspector-topo has more asserts --- sccl/autosynth/dgx1_relay_node_plan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index f3668a9..dfe1533 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -40,13 +40,13 @@ def local_rank_permutation(self): def _select_isomorphism(self, isomorphisms): print("Running inspector-topo to find the IB placement. This will take a minute...") topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) - print(topo_detect) topo_detect_output = topo_detect.stdout.decode('utf-8') + if topo_detect.returncode != 0: + raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect_output}\n{topo_detect.stderr.decode('utf-8')}') g = re.search("GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) if g is None: raise RuntimeError(f'expected to detect a pair of GPUs connected to IB but something went wrong!') ib_gpus = {int(g.group(1)), int(g.group(2))} - print(isomorphisms) for iso in isomorphisms: if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: return iso From 42a193aa14504c1320f442dc81157f10019953d1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 14 Jul 2021 01:17:23 +0000 Subject: [PATCH 019/135] inspector-topo has more asserts --- sccl/autosynth/dgx1_relay_node_plan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index dfe1533..626018f 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -40,9 +40,9 @@ def local_rank_permutation(self): def _select_isomorphism(self, isomorphisms): print("Running inspector-topo to find the IB placement. 
This will take a minute...") topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) - topo_detect_output = topo_detect.stdout.decode('utf-8') if topo_detect.returncode != 0: - raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect_output}\n{topo_detect.stderr.decode('utf-8')}') + raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') + topo_detect_output = topo_detect.stdout.decode('utf-8') g = re.search("GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) if g is None: raise RuntimeError(f'expected to detect a pair of GPUs connected to IB but something went wrong!') From 126557d7def806e7792bcdc7225fc43871c5ef0a Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 14 Jul 2021 10:41:56 -0700 Subject: [PATCH 020/135] Locking for inspector-topo --- sccl/autosynth/dgx1_relay_node_plan.py | 33 +++++++++++++++----------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index 626018f..ec97848 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -6,7 +6,7 @@ from sccl.strategies import solve_least_steps from sccl.distributors.gather_scatter_alltoall import synthesize_gather_scatter_distributed_alltoall from sccl.isomorphisms import find_isomorphisms -import re, subprocess +import re, subprocess, fcntl, tempfile, os class DGX1RelayNodePlan: def __init__(self, local_topo): @@ -39,16 +39,21 @@ def local_rank_permutation(self): def _select_isomorphism(self, isomorphisms): print("Running inspector-topo to find the IB placement. This will take a minute...") - topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) - if topo_detect.returncode != 0: - raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') - topo_detect_output = topo_detect.stdout.decode('utf-8') - g = re.search("GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) - if g is None: - raise RuntimeError(f'expected to detect a pair of GPUs connected to IB but something went wrong!') - ib_gpus = {int(g.group(1)), int(g.group(2))} - for iso in isomorphisms: - if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: - return iso - raise RuntimeError(f'expected an isomorphism to match our expectation but none of them did!') - return None + + with open(os.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "w") as lock_file: + fcntl.lockf(lock_file, fcntl.LOCK_EX) + try: + topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) + if topo_detect.returncode != 0: + raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') + topo_detect_output = topo_detect.stdout.decode('utf-8') + g = re.search("GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) + if g is None: + raise RuntimeError(f'expected to detect a pair of GPUs connected to IB but something went wrong!') + ib_gpus = {int(g.group(1)), int(g.group(2))} + for iso in isomorphisms: + if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: + return iso + raise RuntimeError(f'expected an isomorphism to match our expectation but none of them did!') + finally: + fcntl.lockf(lock_file, fcntl.LOCK_UN) From 
23006e945a104d94eff7817fe459116785c78b4c Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 14 Jul 2021 10:43:40 -0700 Subject: [PATCH 021/135] Bugfix --- sccl/autosynth/dgx1_relay_node_plan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index ec97848..049a17e 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -40,7 +40,7 @@ def local_rank_permutation(self): def _select_isomorphism(self, isomorphisms): print("Running inspector-topo to find the IB placement. This will take a minute...") - with open(os.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "w") as lock_file: + with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "w") as lock_file: fcntl.lockf(lock_file, fcntl.LOCK_EX) try: topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) From fc7a57f04fc19ac3c51a67003159c2e4d0f8659e Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 14 Jul 2021 10:44:53 -0700 Subject: [PATCH 022/135] Autosynth logging --- sccl/autosynth/dgx1_relay_node_plan.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index 049a17e..8e1bdb7 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -38,11 +38,10 @@ def local_rank_permutation(self): return self._select_isomorphism(isomorphisms) def _select_isomorphism(self, isomorphisms): - print("Running inspector-topo to find the IB placement. This will take a minute...") - with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "w") as lock_file: fcntl.lockf(lock_file, fcntl.LOCK_EX) try: + print("Running inspector-topo to find the IB placement. 
This will take a minute...") topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) if topo_detect.returncode != 0: raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') From 7b8b006674cf0b57679a308460e6b1f2f86f303d Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 14 Jul 2021 12:12:39 -0700 Subject: [PATCH 023/135] Share inspector-topo results --- sccl/autosynth/dgx1_relay_node_plan.py | 46 +++++++++++++++----------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index 8e1bdb7..118a9f3 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -6,19 +6,19 @@ from sccl.strategies import solve_least_steps from sccl.distributors.gather_scatter_alltoall import synthesize_gather_scatter_distributed_alltoall from sccl.isomorphisms import find_isomorphisms -import re, subprocess, fcntl, tempfile, os +import re, subprocess, fcntl, tempfile, os, json class DGX1RelayNodePlan: def __init__(self, local_topo): self.local_topo = local_topo - + def synthesize(self, world_size, collective_name, logging=False): if world_size % self.local_topo.num_nodes() != 0: raise RuntimeError('Local machine size does not evenly divide world size.') num_machines = world_size // self.local_topo.num_nodes() if collective_name == 'Alltoall': return self._synthesize_alltoall(num_machines, logging) - + def _synthesize_alltoall(self, num_machines, logging): outbound, inbound = self._select_root_nodes() gather_coll = gather(8, outbound) @@ -38,21 +38,29 @@ def local_rank_permutation(self): return self._select_isomorphism(isomorphisms) def _select_isomorphism(self, isomorphisms): - with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "w") as lock_file: - fcntl.lockf(lock_file, fcntl.LOCK_EX) + with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "r+") as f: + fcntl.lockf(f, fcntl.LOCK_EX) try: - print("Running inspector-topo to find the IB placement. This will take a minute...") - topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) - if topo_detect.returncode != 0: - raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') - topo_detect_output = topo_detect.stdout.decode('utf-8') - g = re.search("GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) - if g is None: - raise RuntimeError(f'expected to detect a pair of GPUs connected to IB but something went wrong!') - ib_gpus = {int(g.group(1)), int(g.group(2))} - for iso in isomorphisms: - if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: - return iso - raise RuntimeError(f'expected an isomorphism to match our expectation but none of them did!') + f.seek(0, os.SEEK_END) + size = f.tell() + f.seek(0) + if size > 0: + return json.load(f) + else: + print("Running inspector-topo to find the IB placement. 
This will take a minute...") + topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) + if topo_detect.returncode != 0: + raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') + topo_detect_output = topo_detect.stdout.decode('utf-8') + g = re.search("GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) + if g is None: + raise RuntimeError(f'expected to detect a pair of GPUs connected to IB but something went wrong!') + ib_gpus = {int(g.group(1)), int(g.group(2))} + for iso in isomorphisms: + if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: + nodes = iso.nodes + json.dump(nodes, f) + return nodes + raise RuntimeError(f'expected an isomorphism to match our expectation but none of them did!') finally: - fcntl.lockf(lock_file, fcntl.LOCK_UN) + fcntl.lockf(f, fcntl.LOCK_UN) From b998d466c1141921b182f56cbecec9288cb20122 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 14 Jul 2021 12:14:30 -0700 Subject: [PATCH 024/135] Bugfix --- sccl/autosynth/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index fe7d58d..47f8fc0 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -51,7 +51,7 @@ def init(logging=False): os.environ['SCCL_XML_FILE'] = ef_files[0] perm = plan.local_rank_permutation() - os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(rank) for rank in perm.nodes) + os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(rank) for rank in perm) def detect_machine(logging): machine = _detect_nvidia_machine(logging) From 18338f704dd60a1d7ea6c52a29cd886b04b9ca84 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 14 Jul 2021 12:18:27 -0700 Subject: [PATCH 025/135] Improve autosynth prints --- sccl/autosynth/dgx1_relay_node_plan.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index 118a9f3..c546968 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -45,7 +45,9 @@ def _select_isomorphism(self, isomorphisms): size = f.tell() f.seek(0) if size > 0: - return json.load(f) + nodes = json.load(f) + print(f'Read permutation {nodes} from {f.name}') + return nodes else: print("Running inspector-topo to find the IB placement. 
This will take a minute...") topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) @@ -60,6 +62,7 @@ def _select_isomorphism(self, isomorphisms): if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: nodes = iso.nodes json.dump(nodes, f) + print(f'Wrote permutation {nodes} to {f.name}') return nodes raise RuntimeError(f'expected an isomorphism to match our expectation but none of them did!') finally: From e85e89f943461c789bb0cbf5062c30b217c39783 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 14 Jul 2021 16:17:03 -0700 Subject: [PATCH 026/135] Hack to support TDL --- sccl/autosynth/__init__.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 47f8fc0..9b33998 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -4,12 +4,29 @@ from sccl.topologies.nvidia import nvlink_only from sccl.autosynth.dgx1_relay_node_plan import DGX1RelayNodePlan from sccl.ncclize import ncclize -import subprocess -import re -import tempfile -import os +import re, subprocess, fcntl, tempfile, os, json, glob -def init(logging=False): +def init(logging=False, torch_distributed_launch_hack=False): + if torch_distributed_launch_hack: + with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_env.lock'), "r+") as f: + fcntl.lockf(f, fcntl.LOCK_EX) + try: + f.seek(0, os.SEEK_END) + size = f.tell() + f.seek(0) + if size > 0: + env = json.load(f) + else: + env = _autosynth_and_get_env(logging) + json.dump(env, f) + finally: + fcntl.lockf(f, fcntl.LOCK_UN) + else: + env = _autosynth_and_get_env(logging) + + os.environ.update(env) + +def _autosynth_and_get_env(logging): try: from mpi4py import MPI except ImportError as e: @@ -48,10 +65,13 @@ def init(logging=False): if len(ef_files) != 1: raise RuntimeError(f'Only a single algorithm is supported currently by the NCCL backend, but got {len(efs)}.') - os.environ['SCCL_XML_FILE'] = ef_files[0] perm = plan.local_rank_permutation() - os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(rank) for rank in perm) + + return { + 'SCCL_XML_FILE': ef_files[0], + 'CUDA_VISIBLE_DEVICES': ','.join(str(rank) for rank in perm) + } def detect_machine(logging): machine = _detect_nvidia_machine(logging) From 8e5d608e9b8ae47e8a2a61750bfc9ee4e20c46ae Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 14 Jul 2021 16:21:29 -0700 Subject: [PATCH 027/135] Add torch_distributed_launch parm to autosynth --- sccl/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sccl/__init__.py b/sccl/__init__.py index 8aa94d7..60f92e3 100644 --- a/sccl/__init__.py +++ b/sccl/__init__.py @@ -3,5 +3,5 @@ import sccl.autosynth as _autosynth -def autosynth(logging=False): - _autosynth.init(logging=logging) +def autosynth(logging=False, torch_distributed_launch=False): + _autosynth.init(logging=logging, torch_distributed_launch_hack=torch_distributed_launch) From 948ffdcbb45a527c93297f9ffe717f7d0b49213f Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 14 Jul 2021 16:24:44 -0700 Subject: [PATCH 028/135] Bugfix --- sccl/autosynth/__init__.py | 2 +- sccl/autosynth/dgx1_relay_node_plan.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 9b33998..c256542 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -8,7 +8,7 @@ def init(logging=False, 
torch_distributed_launch_hack=False): if torch_distributed_launch_hack: - with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_env.lock'), "r+") as f: + with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_env.lock'), "w+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: f.seek(0, os.SEEK_END) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index c546968..815ba2e 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -38,7 +38,7 @@ def local_rank_permutation(self): return self._select_isomorphism(isomorphisms) def _select_isomorphism(self, isomorphisms): - with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "r+") as f: + with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "w+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: f.seek(0, os.SEEK_END) From 02e28b30e905af57af12ea7b9b7371ecae41fc7a Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 15 Jul 2021 13:19:49 -0700 Subject: [PATCH 029/135] Improve autosynth API autosynth() is now called init() and takes a num_subprocesses parameter. --- sccl/__init__.py | 5 +---- sccl/autosynth/__init__.py | 12 ++++++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/sccl/__init__.py b/sccl/__init__.py index 60f92e3..7638ff6 100644 --- a/sccl/__init__.py +++ b/sccl/__init__.py @@ -1,7 +1,4 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import sccl.autosynth as _autosynth - -def autosynth(logging=False, torch_distributed_launch=False): - _autosynth.init(logging=logging, torch_distributed_launch_hack=torch_distributed_launch) +from sccl.autosynth import init diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index c256542..d8cda4f 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -6,8 +6,8 @@ from sccl.ncclize import ncclize import re, subprocess, fcntl, tempfile, os, json, glob -def init(logging=False, torch_distributed_launch_hack=False): - if torch_distributed_launch_hack: +def init(num_subprocesses=1, logging=True): + if num_subprocesses > 1: with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_env.lock'), "w+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: @@ -17,23 +17,23 @@ def init(logging=False, torch_distributed_launch_hack=False): if size > 0: env = json.load(f) else: - env = _autosynth_and_get_env(logging) + env = _autosynth_and_get_env(num_subprocesses, logging) json.dump(env, f) finally: fcntl.lockf(f, fcntl.LOCK_UN) else: - env = _autosynth_and_get_env(logging) + env = _autosynth_and_get_env(num_subprocesses, logging) os.environ.update(env) -def _autosynth_and_get_env(logging): +def _autosynth_and_get_env(num_subprocesses, logging): try: from mpi4py import MPI except ImportError as e: print('Please install the mpi4py package to use SCCL autosynth.') raise e comm = MPI.COMM_WORLD - size = comm.Get_size() + size = comm.Get_size() * num_subprocesses rank = comm.Get_rank() collective_names = ['Alltoall'] From 91bd3eb92e3dd08a5a86d773d4eb9c1c33938379 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 15 Jul 2021 13:41:30 -0700 Subject: [PATCH 030/135] Fix lock file logic in autosynth --- sccl/autosynth/__init__.py | 5 ++--- sccl/autosynth/dgx1_relay_node_plan.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index d8cda4f..d39e161 100644 --- a/sccl/autosynth/__init__.py +++ 
b/sccl/autosynth/__init__.py @@ -8,13 +8,12 @@ def init(num_subprocesses=1, logging=True): if num_subprocesses > 1: - with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_env.lock'), "w+") as f: + with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_env.lock'), "a+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: - f.seek(0, os.SEEK_END) size = f.tell() - f.seek(0) if size > 0: + f.seek(0) env = json.load(f) else: env = _autosynth_and_get_env(num_subprocesses, logging) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index 815ba2e..a2dfcd8 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -38,13 +38,12 @@ def local_rank_permutation(self): return self._select_isomorphism(isomorphisms) def _select_isomorphism(self, isomorphisms): - with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "w+") as f: + with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "a+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: - f.seek(0, os.SEEK_END) size = f.tell() - f.seek(0) if size > 0: + f.seek(0) nodes = json.load(f) print(f'Read permutation {nodes} from {f.name}') return nodes From 8a84a9f4273d3c9915e14b1274372c5e579f6c90 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 16 Jul 2021 15:45:47 -0700 Subject: [PATCH 031/135] New approach to handling both MPI and launchers --- sccl/autosynth/__init__.py | 83 ++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 22 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index d39e161..54d033a 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -4,49 +4,88 @@ from sccl.topologies.nvidia import nvlink_only from sccl.autosynth.dgx1_relay_node_plan import DGX1RelayNodePlan from sccl.ncclize import ncclize -import re, subprocess, fcntl, tempfile, os, json, glob - -def init(num_subprocesses=1, logging=True): - if num_subprocesses > 1: - with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_env.lock'), "a+") as f: - fcntl.lockf(f, fcntl.LOCK_EX) - try: - size = f.tell() - if size > 0: - f.seek(0) - env = json.load(f) - else: - env = _autosynth_and_get_env(num_subprocesses, logging) - json.dump(env, f) - finally: - fcntl.lockf(f, fcntl.LOCK_UN) +import re, subprocess, tempfile, os, json, atexit, time + +def init(logging=True): + if 'LOCAL_WORLD_SIZE' in os.environ: + has_subprocesses = True + world_size = os.environ['WORLD_SIZE'] + is_mpi_process = os.environ['LOCAL_RANK'] == 0 + if logging: + print(f'SCCL: Found LOCAL_WORLD_SIZE in environment, torch.distributed.run detected, with {os.environ["LOCAL_WORLD_SIZE"]} subprocesses per machine.') else: - env = _autosynth_and_get_env(num_subprocesses, logging) + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int) + args = parser.parse_known_args() + if args.local_rank != None: + has_subprocesses = True + world_size = os.environ['WORLD_SIZE'] + is_mpi_process = args.local_rank == True + if logging: + print('SCCL: Found --local_rank N argument, legacy torch.distributed.launch detected, assuming one subprocess per GPU.') + else: + has_subprocesses = False + world_size = None + is_mpi_process = True + if logging: + print(f'SCCL: No launcher detected, assuming one MPI rank per process.') + # Name environment file by parent PID, which will be shared between subprocesses for torch.distributed.(launch|run) + env_file = 
os.path.join(tempfile.gettempdir(), f'sccl_autosynth_env.{os.getppid()}.lock') + if is_mpi_process: + env = _autosynth_and_get_env(world_size, logging) + if has_subprocesses: + # Make sure the lock file doesn't exist yet + if os.path.exists(env_file): + raise RuntimeError(f'SCCL: Lock file already exists: {env_file}') + # Broadcast algorithm to other subprocesses + with open(env_file, "w") as f: + json.dump(env, f) + # Delete the environment file at local rank 0 exit + atexit.register(os.remove(), env_file) + else: + assert has_subprocesses + # Wait until the environment file is available + elapsed = 0 + while not os.path.exists(env_file): + time.sleep(1) + elapsed += 1 + if elapsed == 10 and logging: + print(f'SCCL: Still waiting to read lock file {env_file}...') + # Load the environment to set from the file + with open(env_file, "r") as f: + env = json.load(f) os.environ.update(env) -def _autosynth_and_get_env(num_subprocesses, logging): + if logging: + print('SCCL: Algorithms installed.') + +def _autosynth_and_get_env(world_size, logging): try: from mpi4py import MPI except ImportError as e: print('Please install the mpi4py package to use SCCL autosynth.') raise e comm = MPI.COMM_WORLD - size = comm.Get_size() * num_subprocesses - rank = comm.Get_rank() + mpi_size = comm.Get_size() + mpi_rank = comm.Get_rank() + + if world_size == None: + world_size = mpi_size collective_names = ['Alltoall'] machine = detect_machine(logging) plan = select_synthesis_plan(machine) names = comm.gather(machine[0], root=0) - if rank == 0: + if mpi_rank == 0: for i in range(len(names) - 1): if names[i] != names[i+1]: raise RuntimeError(f'Rank {i} detected machine as {names[i]} but rank {i+1} detected machine as {names[i+1]}.') efs = [] for name in collective_names: - algo = plan.synthesize(size, name, logging) + algo = plan.synthesize(world_size, name, logging) efs.append(ncclize(algo, old_format=True, use_scratch=True)) else: efs = None From d4fcc2a184946f1f08040c97201cfc0169162f6f Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 16 Jul 2021 15:51:06 -0700 Subject: [PATCH 032/135] Bugfix --- sccl/autosynth/__init__.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 54d033a..10f284c 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -7,12 +7,12 @@ import re, subprocess, tempfile, os, json, atexit, time def init(logging=True): - if 'LOCAL_WORLD_SIZE' in os.environ: + if 'LOCAL_RANK' in os.environ: has_subprocesses = True - world_size = os.environ['WORLD_SIZE'] - is_mpi_process = os.environ['LOCAL_RANK'] == 0 + world_size = int(os.environ['WORLD_SIZE']) + is_mpi_process = int(os.environ['LOCAL_RANK']) == 0 if logging: - print(f'SCCL: Found LOCAL_WORLD_SIZE in environment, torch.distributed.run detected, with {os.environ["LOCAL_WORLD_SIZE"]} subprocesses per machine.') + print(f'SCCL: Found LOCAL_RANK in environment, torch.distributed.run (or launch with --use_env) detected.') else: import argparse parser = argparse.ArgumentParser() @@ -20,10 +20,10 @@ def init(logging=True): args = parser.parse_known_args() if args.local_rank != None: has_subprocesses = True - world_size = os.environ['WORLD_SIZE'] - is_mpi_process = args.local_rank == True + world_size = int(os.environ['WORLD_SIZE']) + is_mpi_process = args.local_rank == 0 if logging: - print('SCCL: Found --local_rank N argument, legacy torch.distributed.launch detected, assuming one subprocess per GPU.') + print('SCCL: Found 
--local_rank N argument, legacy torch.distributed.launch without --use_env detected.') else: has_subprocesses = False world_size = None From 68b9bee377ce89c09ce4220c930f3f69e6d317e2 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 16 Jul 2021 15:53:18 -0700 Subject: [PATCH 033/135] Bugfix --- sccl/autosynth/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 10f284c..6f19722 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -42,7 +42,7 @@ def init(logging=True): with open(env_file, "w") as f: json.dump(env, f) # Delete the environment file at local rank 0 exit - atexit.register(os.remove(), env_file) + atexit.register(os.remove, env_file) else: assert has_subprocesses # Wait until the environment file is available From d758034d449fb21eb1d6a763aac75e4c06122fc3 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 16 Jul 2021 16:01:17 -0700 Subject: [PATCH 034/135] Comments for autosynth logic --- sccl/autosynth/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 6f19722..cfc0b90 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -7,7 +7,9 @@ import re, subprocess, tempfile, os, json, atexit, time def init(logging=True): + # Detect how this process was launched if 'LOCAL_RANK' in os.environ: + # Either torch.distributed.run or legacy run with --use_env has_subprocesses = True world_size = int(os.environ['WORLD_SIZE']) is_mpi_process = int(os.environ['LOCAL_RANK']) == 0 @@ -19,12 +21,14 @@ def init(logging=True): parser.add_argument("--local_rank", type=int) args = parser.parse_known_args() if args.local_rank != None: + # Legacy torch.distributed.launch without --use_env has_subprocesses = True world_size = int(os.environ['WORLD_SIZE']) is_mpi_process = args.local_rank == 0 if logging: print('SCCL: Found --local_rank N argument, legacy torch.distributed.launch without --use_env detected.') else: + # Pure MPI has_subprocesses = False world_size = None is_mpi_process = True @@ -33,7 +37,9 @@ def init(logging=True): # Name environment file by parent PID, which will be shared between subprocesses for torch.distributed.(launch|run) env_file = os.path.join(tempfile.gettempdir(), f'sccl_autosynth_env.{os.getppid()}.lock') if is_mpi_process: + # Synthesize on MPI rank 0 and distribute to all MPI processes env = _autosynth_and_get_env(world_size, logging) + # If there are non-MPI subprocesses, they get the environment through a temporary file if has_subprocesses: # Make sure the lock file doesn't exist yet if os.path.exists(env_file): @@ -41,7 +47,7 @@ def init(logging=True): # Broadcast algorithm to other subprocesses with open(env_file, "w") as f: json.dump(env, f) - # Delete the environment file at local rank 0 exit + # Delete the environment file when the local MPI process exits atexit.register(os.remove, env_file) else: assert has_subprocesses From b286c622329a74258829a5031685c06d4b3b342a Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 16 Jul 2021 16:04:50 -0700 Subject: [PATCH 035/135] Tag autosynth prints with SCCL --- sccl/autosynth/__init__.py | 14 +++++++------- sccl/autosynth/dgx1_relay_node_plan.py | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index cfc0b90..0da0871 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -71,7 +71,7 @@ 
def _autosynth_and_get_env(world_size, logging): try: from mpi4py import MPI except ImportError as e: - print('Please install the mpi4py package to use SCCL autosynth.') + print('SCCL: Please install the mpi4py package to use SCCL\'s automated init function.') raise e comm = MPI.COMM_WORLD mpi_size = comm.Get_size() @@ -105,7 +105,7 @@ def _autosynth_and_get_env(world_size, logging): with open(ef_file, 'w') as f: f.write(ef) if logging: - print(f'Wrote to {ef_file}') + print(f'SCCL: Wrote to {ef_file}') if len(ef_files) != 1: raise RuntimeError(f'Only a single algorithm is supported currently by the NCCL backend, but got {len(efs)}.') @@ -125,28 +125,28 @@ def detect_machine(logging): def _detect_nvidia_machine(logging): if logging: - print('Checking for NVIDIA machines') + print('SCCL: Checking for NVIDIA machines') try: smi_topo = subprocess.check_output(['nvidia-smi', 'topo', '-m']).decode("utf-8") except FileNotFoundError: if logging: - print('nvidia-smi not found.') + print('SCCL: nvidia-smi not found.') return None except subprocess.CalledProcessError: if logging: - print('Found nvidia-smi, but got error.') + print('SCCL: Found nvidia-smi, but got error.') return ('unknown', None) nvlink_topo = nvlink_only(smi_topo) if nvlink_topo.num_nodes() == 8: # DGX-1 and DGX A100 like nodes if logging: - print('8 GPUs, so looks like a DGX-1 or DGX A100.') + print('SCCL: 8 GPUs, so looks like a DGX-1 or DGX A100.') if _is_one_host_ib_dgx1(smi_topo): return ('one_host_ib_dgx1', nvlink_topo) else: if logging: - print('Unknown network configuration.') + print('SCCL: Unknown network configuration.') return ('unknown', None) def _is_one_host_ib_dgx1(smi_topo): diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index a2dfcd8..e8eef60 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -45,10 +45,10 @@ def _select_isomorphism(self, isomorphisms): if size > 0: f.seek(0) nodes = json.load(f) - print(f'Read permutation {nodes} from {f.name}') + print(f'SCCL: Read IB placement from {f.name}') return nodes else: - print("Running inspector-topo to find the IB placement. This will take a minute...") + print('SCCL: Running inspector-topo to find the IB placement. 
This will take a minute...') topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) if topo_detect.returncode != 0: raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') @@ -61,7 +61,7 @@ def _select_isomorphism(self, isomorphisms): if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: nodes = iso.nodes json.dump(nodes, f) - print(f'Wrote permutation {nodes} to {f.name}') + print(f'SCCL: Wrote IB placement to {f.name}') return nodes raise RuntimeError(f'expected an isomorphism to match our expectation but none of them did!') finally: From 8a342df0a9d0beebed21fdcc3261982a973fe321 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 16 Jul 2021 16:23:53 -0700 Subject: [PATCH 036/135] Bugfix --- sccl/autosynth/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 0da0871..57a5176 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -19,7 +19,7 @@ def init(logging=True): import argparse parser = argparse.ArgumentParser() parser.add_argument("--local_rank", type=int) - args = parser.parse_known_args() + args, _ = parser.parse_known_args() if args.local_rank != None: # Legacy torch.distributed.launch without --use_env has_subprocesses = True From 7bc5ac8314cd582fccab8880dabb72075e7d21d5 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Sat, 17 Jul 2021 09:07:10 -0700 Subject: [PATCH 037/135] Improvements to logging in autosynth --- sccl/autosynth/__init__.py | 40 ++++++++++++-------------- sccl/autosynth/dgx1_relay_node_plan.py | 8 ++++-- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 57a5176..809db92 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -6,14 +6,14 @@ from sccl.ncclize import ncclize import re, subprocess, tempfile, os, json, atexit, time -def init(logging=True): +def init(verbose=False): # Detect how this process was launched if 'LOCAL_RANK' in os.environ: # Either torch.distributed.run or legacy run with --use_env has_subprocesses = True world_size = int(os.environ['WORLD_SIZE']) is_mpi_process = int(os.environ['LOCAL_RANK']) == 0 - if logging: + if verbose: print(f'SCCL: Found LOCAL_RANK in environment, torch.distributed.run (or launch with --use_env) detected.') else: import argparse @@ -25,20 +25,20 @@ def init(logging=True): has_subprocesses = True world_size = int(os.environ['WORLD_SIZE']) is_mpi_process = args.local_rank == 0 - if logging: + if verbose: print('SCCL: Found --local_rank N argument, legacy torch.distributed.launch without --use_env detected.') else: # Pure MPI has_subprocesses = False world_size = None is_mpi_process = True - if logging: + if verbose: print(f'SCCL: No launcher detected, assuming one MPI rank per process.') # Name environment file by parent PID, which will be shared between subprocesses for torch.distributed.(launch|run) env_file = os.path.join(tempfile.gettempdir(), f'sccl_autosynth_env.{os.getppid()}.lock') if is_mpi_process: # Synthesize on MPI rank 0 and distribute to all MPI processes - env = _autosynth_and_get_env(world_size, logging) + env = _autosynth_and_get_env(world_size, verbose) # If there are non-MPI subprocesses, they get the environment through a temporary file if has_subprocesses: # Make sure the lock file doesn't exist yet @@ -56,18 +56,16 @@ def init(logging=True): while not 
os.path.exists(env_file): time.sleep(1) elapsed += 1 - if elapsed == 10 and logging: + if elapsed == 10: print(f'SCCL: Still waiting to read lock file {env_file}...') # Load the environment to set from the file with open(env_file, "r") as f: env = json.load(f) os.environ.update(env) + print('SCCL: Algorithms installed.') - if logging: - print('SCCL: Algorithms installed.') - -def _autosynth_and_get_env(world_size, logging): +def _autosynth_and_get_env(world_size, verbose): try: from mpi4py import MPI except ImportError as e: @@ -82,7 +80,7 @@ def _autosynth_and_get_env(world_size, logging): collective_names = ['Alltoall'] - machine = detect_machine(logging) + machine = detect_machine(verbose) plan = select_synthesis_plan(machine) names = comm.gather(machine[0], root=0) if mpi_rank == 0: @@ -91,7 +89,7 @@ def _autosynth_and_get_env(world_size, logging): raise RuntimeError(f'Rank {i} detected machine as {names[i]} but rank {i+1} detected machine as {names[i+1]}.') efs = [] for name in collective_names: - algo = plan.synthesize(world_size, name, logging) + algo = plan.synthesize(world_size, name, verbose) efs.append(ncclize(algo, old_format=True, use_scratch=True)) else: efs = None @@ -104,7 +102,7 @@ def _autosynth_and_get_env(world_size, logging): ef_files.append(ef_file) with open(ef_file, 'w') as f: f.write(ef) - if logging: + if verbose: print(f'SCCL: Wrote to {ef_file}') if len(ef_files) != 1: @@ -117,35 +115,35 @@ def _autosynth_and_get_env(world_size, logging): 'CUDA_VISIBLE_DEVICES': ','.join(str(rank) for rank in perm) } -def detect_machine(logging): - machine = _detect_nvidia_machine(logging) +def detect_machine(verbose): + machine = _detect_nvidia_machine(verbose) if machine != None: return machine return ('unknown', None) -def _detect_nvidia_machine(logging): - if logging: +def _detect_nvidia_machine(verbose): + if verbose: print('SCCL: Checking for NVIDIA machines') try: smi_topo = subprocess.check_output(['nvidia-smi', 'topo', '-m']).decode("utf-8") except FileNotFoundError: - if logging: + if verbose: print('SCCL: nvidia-smi not found.') return None except subprocess.CalledProcessError: - if logging: + if verbose: print('SCCL: Found nvidia-smi, but got error.') return ('unknown', None) nvlink_topo = nvlink_only(smi_topo) if nvlink_topo.num_nodes() == 8: # DGX-1 and DGX A100 like nodes - if logging: + if verbose: print('SCCL: 8 GPUs, so looks like a DGX-1 or DGX A100.') if _is_one_host_ib_dgx1(smi_topo): return ('one_host_ib_dgx1', nvlink_topo) else: - if logging: + if verbose: print('SCCL: Unknown network configuration.') return ('unknown', None) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index e8eef60..25fe3dd 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -37,7 +37,7 @@ def local_rank_permutation(self): raise RuntimeError(f'Expected to find 4 isomorphisms to DGX1 topology, but found {len(isomorphisms)}.') return self._select_isomorphism(isomorphisms) - def _select_isomorphism(self, isomorphisms): + def _select_isomorphism(self, isomorphisms, verbose=False): with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "a+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: @@ -45,7 +45,8 @@ def _select_isomorphism(self, isomorphisms): if size > 0: f.seek(0) nodes = json.load(f) - print(f'SCCL: Read IB placement from {f.name}') + if verbose: + print(f'SCCL: Read IB placement from {f.name}') return nodes else: print('SCCL: Running inspector-topo to find the IB 
placement. This will take a minute...') @@ -61,7 +62,8 @@ def _select_isomorphism(self, isomorphisms): if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: nodes = iso.nodes json.dump(nodes, f) - print(f'SCCL: Wrote IB placement to {f.name}') + if verbose: + print(f'SCCL: Wrote IB placement to {f.name}') return nodes raise RuntimeError(f'expected an isomorphism to match our expectation but none of them did!') finally: From 1c700611bd02dd263de5c68e1fb5775273148b11 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Sat, 17 Jul 2021 09:19:17 -0700 Subject: [PATCH 038/135] Improve autosynth logging --- sccl/autosynth/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 809db92..e17f22b 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -56,14 +56,13 @@ def init(verbose=False): while not os.path.exists(env_file): time.sleep(1) elapsed += 1 - if elapsed == 10: + if elapsed == 60: print(f'SCCL: Still waiting to read lock file {env_file}...') # Load the environment to set from the file with open(env_file, "r") as f: env = json.load(f) os.environ.update(env) - print('SCCL: Algorithms installed.') def _autosynth_and_get_env(world_size, verbose): try: @@ -79,6 +78,8 @@ def _autosynth_and_get_env(world_size, verbose): world_size = mpi_size collective_names = ['Alltoall'] + if mpi_rank == 0: + print(f'SCCL: Synthesizing algorithm(s) for {", ".join(collective_names)}...') machine = detect_machine(verbose) plan = select_synthesis_plan(machine) From 49423f769984a33f254302094d0cec57c0272fb7 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Sat, 17 Jul 2021 09:51:32 -0700 Subject: [PATCH 039/135] Improve autosynth file system locking Use /var/lock instead of /tmp as suggested to ensure strong semantics. Avoid possible race by renaming instead of writing directly. 
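The rename trick relies on rename() being atomic within a single POSIX filesystem: a reader either sees the complete file or no file at all, never a partial write. A minimal sketch of the pattern using only the standard library (publish_json and target_path are illustrative names, not part of sccl):

    import json, os, tempfile

    def publish_json(obj, target_path):
        # Create the temporary file in the target's directory so both paths
        # are on the same filesystem, which is what makes os.rename an
        # atomic replace rather than a copy.
        fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(target_path))
        try:
            with os.fdopen(fd, 'w') as f:
                json.dump(obj, f)
            os.rename(tmp_path, target_path)
        except BaseException:
            os.remove(tmp_path)
            raise

One caveat: a bare tempfile.mkstemp(), as in the change below, defaults to the system temp directory, which may be a different filesystem than /var/lock; os.rename across filesystems fails with OSError (EXDEV), so passing dir= as above is one way to preserve the guarantee.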
--- sccl/autosynth/__init__.py | 5 +++-- sccl/autosynth/dgx1_relay_node_plan.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index e17f22b..550cd31 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -35,7 +35,7 @@ def init(verbose=False): if verbose: print(f'SCCL: No launcher detected, assuming one MPI rank per process.') # Name environment file by parent PID, which will be shared between subprocesses for torch.distributed.(launch|run) - env_file = os.path.join(tempfile.gettempdir(), f'sccl_autosynth_env.{os.getppid()}.lock') + env_file = f'/var/lock/sccl_autosynth_env.{os.getppid()}.lock' if is_mpi_process: # Synthesize on MPI rank 0 and distribute to all MPI processes env = _autosynth_and_get_env(world_size, verbose) @@ -45,8 +45,9 @@ def init(verbose=False): if os.path.exists(env_file): raise RuntimeError(f'SCCL: Lock file already exists: {env_file}') # Broadcast algorithm to other subprocesses - with open(env_file, "w") as f: + with tempfile.mkstemp() as (f, private_file): json.dump(env, f) + os.rename(private_file, env_file) # Delete the environment file when the local MPI process exits atexit.register(os.remove, env_file) else: diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index 25fe3dd..5d8263b 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -38,7 +38,7 @@ def local_rank_permutation(self): return self._select_isomorphism(isomorphisms) def _select_isomorphism(self, isomorphisms, verbose=False): - with open(os.path.join(tempfile.gettempdir(), 'sccl_autosynth_inspector_topo.lock'), "a+") as f: + with open('/var/lock/sccl_autosynth_inspector_topo.lock', "a+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: size = f.tell() From fb529cd299a90af7de5930442f3fd6d7c4ff856a Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Sat, 17 Jul 2021 09:56:46 -0700 Subject: [PATCH 040/135] Bugfix --- sccl/autosynth/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 550cd31..325407d 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -45,7 +45,8 @@ def init(verbose=False): if os.path.exists(env_file): raise RuntimeError(f'SCCL: Lock file already exists: {env_file}') # Broadcast algorithm to other subprocesses - with tempfile.mkstemp() as (f, private_file): + fd, private_file = tempfile.mkstemp() + with open(fd, "w") as f: json.dump(env, f) os.rename(private_file, env_file) # Delete the environment file when the local MPI process exits From 5271c44ed3d8f8dba8539905dd60a502d3712ec4 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 21 Jul 2021 16:52:50 -0700 Subject: [PATCH 041/135] Autosynth assumes Z3 gives same algo on all ranks This is to work around issues with MPI usage outside torch.distributed messing up PyTorch's initialization. 
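The shortcut rests on synthesis being a pure function of the machine type and world size: if Z3 returns the same algorithm for the same query on every rank, each process can synthesize locally and still end up with byte-identical XML, so no MPI broadcast is needed. World size and rank are then read from the launcher's environment instead of a communicator. A condensed sketch of the detection order used below (simplified to default the rank to 0; the actual code instead leaves the rank unset when it cannot be found):

    import os

    def detect_world_size_and_rank():
        # Prefer the PyTorch launcher's variables, fall back to Open MPI's.
        if 'WORLD_SIZE' in os.environ:
            return int(os.environ['WORLD_SIZE']), int(os.environ.get('RANK', 0))
        if 'OMPI_COMM_WORLD_SIZE' in os.environ:
            return (int(os.environ['OMPI_COMM_WORLD_SIZE']),
                    int(os.environ.get('OMPI_COMM_WORLD_RANK', 0)))
        raise RuntimeError('Could not detect world size.')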
--- sccl/autosynth/__init__.py | 55 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 325407d..9b1d719 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -7,6 +7,15 @@ import re, subprocess, tempfile, os, json, atexit, time def init(verbose=False): + env = _autosynth_assume_deterministic_z3_and_ompi(verbose) + os.environ.update(env) + return + + # The code below does not work in all usecases with PyTorch, due to mpi4py calling MPI_Init, which + # some part of PyTorch cannot tolerate. The other way around would work, importing mpi4py after + # torch.distributed has initialized, but currently the SCCL interpreter in NCCL cannot load new algorithms + # after initialization. Once this dynamic loading support lands the code path below can be re-enabled. + # Detect how this process was launched if 'LOCAL_RANK' in os.environ: # Either torch.distributed.run or legacy run with --use_env @@ -66,6 +75,52 @@ def init(verbose=False): os.environ.update(env) +def _autosynth_assume_deterministic_z3_and_ompi(verbose): + rank = None + if 'WORLD_SIZE' in os.environ: + # We're in a PyTorch launcher compatible script + world_size = int(os.environ['WORLD_SIZE']) + if 'RANK' in os.environ: + rank = int(os.environ['RANK']) + else: + if not 'OMPI_COMM_WORLD_SIZE' in os.environ: + print('SCCL: Could not detect world size. Please set either WORLD_SIZE or OMPI_COMM_WORLD_SIZE to total number of processes.') + raise RuntimeError('Could not detect world size.') + world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) + if 'OMPI_COMM_WORLD_RANK' in os.environ: + rank = int(os.environ['OMPI_COMM_WORLD_RANK']) + + collective_names = ['Alltoall'] + if rank == 0: + print(f'SCCL: Synthesizing algorithm(s) for {", ".join(collective_names)}...') + + machine = detect_machine(verbose) + plan = select_synthesis_plan(machine) + efs = [] + for name in collective_names: + algo = plan.synthesize(world_size, name, verbose) + efs.append(ncclize(algo, old_format=True, use_scratch=True)) + + tempdir = tempfile.mkdtemp() + ef_files = [] + for name, ef in zip(collective_names, efs): + ef_file = os.path.join(tempdir, f'{name}.xml') + ef_files.append(ef_file) + with open(ef_file, 'w') as f: + f.write(ef) + if verbose: + print(f'SCCL: Wrote to {ef_file}') + + if len(ef_files) != 1: + raise RuntimeError(f'Only a single algorithm is supported currently by the NCCL backend, but got {len(efs)}.') + + perm = plan.local_rank_permutation() + + return { + 'SCCL_XML_FILE': ef_files[0], + 'CUDA_VISIBLE_DEVICES': ','.join(str(rank) for rank in perm) + } + def _autosynth_and_get_env(world_size, verbose): try: from mpi4py import MPI From 4f78d6052ef807d486e32387cacf5a6100ea093e Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 22 Jul 2021 20:21:48 +0000 Subject: [PATCH 042/135] added the flush necessary for file, f.seek to the end of the file, algo.nchannels --- sccl/autosynth/__init__.py | 4 +++- sccl/autosynth/dgx1_relay_node_plan.py | 9 ++++++--- sccl/ncclize.py | 4 +++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 9b1d719..8fa00ff 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -118,7 +118,9 @@ def _autosynth_assume_deterministic_z3_and_ompi(verbose): return { 'SCCL_XML_FILE': ef_files[0], - 'CUDA_VISIBLE_DEVICES': ','.join(str(rank) for rank in perm) + 'CUDA_VISIBLE_DEVICES': ','.join(str(rank) for rank in perm), + 
'NCCL_NET_SHARED_BUFFERS': '0', + 'NCCL_MIN_NCHANNELS': str(algo.nchannels) } def _autosynth_and_get_env(world_size, verbose): diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index 5d8263b..c5d2294 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -37,10 +37,11 @@ def local_rank_permutation(self): raise RuntimeError(f'Expected to find 4 isomorphisms to DGX1 topology, but found {len(isomorphisms)}.') return self._select_isomorphism(isomorphisms) - def _select_isomorphism(self, isomorphisms, verbose=False): + def _select_isomorphism(self, isomorphisms, verbose=True): with open('/var/lock/sccl_autosynth_inspector_topo.lock', "a+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: + f.seek(0, 2) size = f.tell() if size > 0: f.seek(0) @@ -49,8 +50,9 @@ def _select_isomorphism(self, isomorphisms, verbose=False): print(f'SCCL: Read IB placement from {f.name}') return nodes else: - print('SCCL: Running inspector-topo to find the IB placement. This will take a minute...') - topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEIVCES":"0,1,2,3,4,5,6,7"}) + print('SCCL: Running inspector-topo to find the IB placement. This will take a couple of minutes...') + topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEVICES":"0,1,2,3,4,5,6,7"}) + print('SCCL: Finished running inspector-topo. Finding the permutaion.') if topo_detect.returncode != 0: raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') topo_detect_output = topo_detect.stdout.decode('utf-8') @@ -62,6 +64,7 @@ def _select_isomorphism(self, isomorphisms, verbose=False): if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: nodes = iso.nodes json.dump(nodes, f) + f.flush() if verbose: print(f'SCCL: Wrote IB placement to {f.name}') return nodes diff --git a/sccl/ncclize.py b/sccl/ncclize.py index c0c7514..c98eefe 100644 --- a/sccl/ncclize.py +++ b/sccl/ncclize.py @@ -560,7 +560,9 @@ def expand_mappings(mappings): algo_elem = ET.Element('algo') algo_elem.set('name', algorithm.name) algo_elem.set('proto', 'Simple') - algo_elem.set('nchannels', str(1 + max(max(tb.channel for tb in gpu.threadblocks) for gpu in gpus.values()))) + nchannels = 1 + max(max(tb.channel for tb in gpu.threadblocks) for gpu in gpus.values()) + algorithm.nchannels = nchannels + algo_elem.set('nchannels', str(nchannels)) if old_format: algo_elem.set('nchunksperloop', str(max(max(gpu.input_chunks, gpu.output_chunks) for gpu in gpus.values()))) for rank, gpu in gpus.items(): From 2a736a8dd90594a673907a81c7dbb92689902c47 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 23 Jul 2021 02:49:12 +0000 Subject: [PATCH 043/135] made a launcher for ndv2 --- sccl/__init__.py | 1 + sccl/autosynth/__init__.py | 15 ++++++++++----- sccl/autosynth/dgx1_relay_node_plan.py | 9 +++++---- setup.py | 2 +- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/sccl/__init__.py b/sccl/__init__.py index 7638ff6..8627102 100644 --- a/sccl/__init__.py +++ b/sccl/__init__.py @@ -2,3 +2,4 @@ # Licensed under the MIT License. 
from sccl.autosynth import init +from sccl.autosynth import ndv2_perm diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 8fa00ff..f95cc09 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -75,6 +75,14 @@ def init(verbose=False): os.environ.update(env) +def ndv2_perm(verbose=True): + machine = detect_machine(verbose) + if machine[1] == None: + return + plan = select_synthesis_plan(machine) + plan.local_rank_permutation() + + def _autosynth_assume_deterministic_z3_and_ompi(verbose): rank = None if 'WORLD_SIZE' in os.environ: @@ -99,7 +107,7 @@ def _autosynth_assume_deterministic_z3_and_ompi(verbose): efs = [] for name in collective_names: algo = plan.synthesize(world_size, name, verbose) - efs.append(ncclize(algo, old_format=True, use_scratch=True)) + efs.append(ncclize(algo, old_format=True, use_scratch=True, instances=8)) tempdir = tempfile.mkdtemp() ef_files = [] @@ -114,11 +122,8 @@ def _autosynth_assume_deterministic_z3_and_ompi(verbose): if len(ef_files) != 1: raise RuntimeError(f'Only a single algorithm is supported currently by the NCCL backend, but got {len(efs)}.') - perm = plan.local_rank_permutation() - return { 'SCCL_XML_FILE': ef_files[0], - 'CUDA_VISIBLE_DEVICES': ','.join(str(rank) for rank in perm), 'NCCL_NET_SHARED_BUFFERS': '0', 'NCCL_MIN_NCHANNELS': str(algo.nchannels) } @@ -217,4 +222,4 @@ def select_synthesis_plan(machine): if machine_name == 'one_host_ib_dgx1': return DGX1RelayNodePlan(machine_info) else: - raise RuntimeError(f'Unhandled machine type {machine_name}.') + raise RuntimeError(f'Unhandled machine type {machine_name}.') \ No newline at end of file diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index c5d2294..b9eb892 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -45,10 +45,10 @@ def _select_isomorphism(self, isomorphisms, verbose=True): size = f.tell() if size > 0: f.seek(0) - nodes = json.load(f) + order = f.read() if verbose: print(f'SCCL: Read IB placement from {f.name}') - return nodes + return order else: print('SCCL: Running inspector-topo to find the IB placement. 
This will take a couple of minutes...') topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEVICES":"0,1,2,3,4,5,6,7"}) @@ -63,11 +63,12 @@ def _select_isomorphism(self, isomorphisms, verbose=True): for iso in isomorphisms: if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: nodes = iso.nodes - json.dump(nodes, f) + order = ",".join(str(rank) for rank in nodes) + f.write(order) f.flush() if verbose: print(f'SCCL: Wrote IB placement to {f.name}') - return nodes + return order raise RuntimeError(f'expected an isomorphism to match our expectation but none of them did!') finally: fcntl.lockf(f, fcntl.LOCK_UN) diff --git a/setup.py b/setup.py index 43ecfaf..bc7c78d 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='sccl', - version='2.0.0', + version='2.1.2', packages=find_packages(), entry_points={ 'console_scripts': [ From a5ad03c38898ddf32ed00ed41c7726f63fdde1e5 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 23 Jul 2021 02:54:26 +0000 Subject: [PATCH 044/135] made a launcher for ndv2 --- sccl/autosynth/ndv2_launcher.sh | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100755 sccl/autosynth/ndv2_launcher.sh diff --git a/sccl/autosynth/ndv2_launcher.sh b/sccl/autosynth/ndv2_launcher.sh new file mode 100755 index 0000000..ff49e21 --- /dev/null +++ b/sccl/autosynth/ndv2_launcher.sh @@ -0,0 +1,8 @@ +#!/bin/bash +python -c "import sccl; sccl.ndv2_perm()" +order=/var/lock/sccl_autosynth_inspector_topo.lock +if [ -f "$order" ]; then + echo "Setting CUDA_VISIBLE_DEVICES to: "$CUDA_VISIBLE_DEVICES + export CUDA_VISIBLE_DEVICES=$( Date: Fri, 23 Jul 2021 06:02:35 +0000 Subject: [PATCH 045/135] silly bug --- sccl/autosynth/ndv2_launcher.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sccl/autosynth/ndv2_launcher.sh b/sccl/autosynth/ndv2_launcher.sh index ff49e21..1f6f0c8 100755 --- a/sccl/autosynth/ndv2_launcher.sh +++ b/sccl/autosynth/ndv2_launcher.sh @@ -2,7 +2,7 @@ python -c "import sccl; sccl.ndv2_perm()" order=/var/lock/sccl_autosynth_inspector_topo.lock if [ -f "$order" ]; then - echo "Setting CUDA_VISIBLE_DEVICES to: "$CUDA_VISIBLE_DEVICES export CUDA_VISIBLE_DEVICES=$( Date: Fri, 23 Jul 2021 06:17:51 +0000 Subject: [PATCH 046/135] silly bug --- sccl/autosynth/ndv2_launcher.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sccl/autosynth/ndv2_launcher.sh b/sccl/autosynth/ndv2_launcher.sh index 1f6f0c8..e2bb54b 100755 --- a/sccl/autosynth/ndv2_launcher.sh +++ b/sccl/autosynth/ndv2_launcher.sh @@ -3,6 +3,6 @@ python -c "import sccl; sccl.ndv2_perm()" order=/var/lock/sccl_autosynth_inspector_topo.lock if [ -f "$order" ]; then export CUDA_VISIBLE_DEVICES=$( Date: Fri, 23 Jul 2021 12:49:53 -0700 Subject: [PATCH 047/135] Install sccl_ndv2_launcher.sh as a script --- sccl/autosynth/{ndv2_launcher.sh => sccl_ndv2_launcher.sh} | 0 setup.py | 3 +++ 2 files changed, 3 insertions(+) rename sccl/autosynth/{ndv2_launcher.sh => sccl_ndv2_launcher.sh} (100%) diff --git a/sccl/autosynth/ndv2_launcher.sh b/sccl/autosynth/sccl_ndv2_launcher.sh similarity index 100% rename from sccl/autosynth/ndv2_launcher.sh rename to sccl/autosynth/sccl_ndv2_launcher.sh diff --git a/setup.py b/setup.py index bc7c78d..5a7c47e 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,9 @@ 'sccl = sccl.__main__:main', ], }, + scripts = [ + 'sccl/autosynth/sccl_ndv2_launcher.sh' + ], install_requires=[ 'dataclasses; python_version < "3.7"', 'z3-solver', From 91d0713f3311244e919d628fbe52d96ab5f985e6 Mon Sep 17 
00:00:00 2001 From: Saeed Maleki Date: Sat, 24 Jul 2021 01:13:42 +0000 Subject: [PATCH 048/135] fixed a bug --- sccl/autosynth/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index f95cc09..e6d94d6 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -146,6 +146,9 @@ def _autosynth_and_get_env(world_size, verbose): print(f'SCCL: Synthesizing algorithm(s) for {", ".join(collective_names)}...') machine = detect_machine(verbose) + machine_name, machine_info = machine + if machine_name == "unknown": + return {} plan = select_synthesis_plan(machine) names = comm.gather(machine[0], root=0) if mpi_rank == 0: From 3eddbc1805559ee7cace1a738e6422f5ed70aff4 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 24 Jul 2021 01:37:48 +0000 Subject: [PATCH 049/135] fixed a bug --- sccl/autosynth/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index e6d94d6..b4c42d9 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -92,8 +92,8 @@ def _autosynth_assume_deterministic_z3_and_ompi(verbose): rank = int(os.environ['RANK']) else: if not 'OMPI_COMM_WORLD_SIZE' in os.environ: - print('SCCL: Could not detect world size. Please set either WORLD_SIZE or OMPI_COMM_WORLD_SIZE to total number of processes.') - raise RuntimeError('Could not detect world size.') + print('SCCL: Could not detect world size and import SCCL will be ignored.') + return {} world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) if 'OMPI_COMM_WORLD_RANK' in os.environ: rank = int(os.environ['OMPI_COMM_WORLD_RANK']) From 62e92517264a827bd4c8411958e0b2ced85a6a71 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 24 Jul 2021 18:41:43 +0000 Subject: [PATCH 050/135] bug fix for unknown node type --- sccl/autosynth/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index b4c42d9..f084c6f 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -102,7 +102,10 @@ def _autosynth_assume_deterministic_z3_and_ompi(verbose): if rank == 0: print(f'SCCL: Synthesizing algorithm(s) for {", ".join(collective_names)}...') - machine = detect_machine(verbose) + machine_name, machine_info = machine + if machine_name == "unknown": + print("SCCL could not detect the type of machine. import sccl will be ignored.") + return {} plan = select_synthesis_plan(machine) efs = [] for name in collective_names: From fe59b35bd7aa2c199921b5e90d491d88989cc3e0 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 24 Jul 2021 19:32:19 +0000 Subject: [PATCH 051/135] bug fix for unknown node type --- sccl/autosynth/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index f084c6f..48cc85a 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -102,6 +102,7 @@ def _autosynth_assume_deterministic_z3_and_ompi(verbose): if rank == 0: print(f'SCCL: Synthesizing algorithm(s) for {", ".join(collective_names)}...') + machine = detect_machine(verbose) machine_name, machine_info = machine if machine_name == "unknown": print("SCCL could not detect the type of machine.
import sccl will be ignored.") From 729b68bb4cbca299378d906248ce663e0020cd50 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 26 Jul 2021 18:09:28 +0000 Subject: [PATCH 052/135] adding necessary warning/error messaging --- sccl/autosynth/__init__.py | 20 +++++++++++++------- sccl/autosynth/dgx1_relay_node_plan.py | 4 ++++ sccl/isomorphisms.py | 7 ++++--- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 48cc85a..6b7c37e 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -92,7 +92,7 @@ def _autosynth_assume_deterministic_z3_and_ompi(verbose): rank = int(os.environ['RANK']) else: if not 'OMPI_COMM_WORLD_SIZE' in os.environ: - print('SCCL: Could not detect world size and import SCCL will be ignored.') + print('SCCL info: Could not detect world size and import SCCL will be ignored.') return {} world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) if 'OMPI_COMM_WORLD_RANK' in os.environ: @@ -100,14 +100,20 @@ def _autosynth_assume_deterministic_z3_and_ompi(verbose): collective_names = ['Alltoall'] if rank == 0: - print(f'SCCL: Synthesizing algorithm(s) for {", ".join(collective_names)}...') + print(f'SCCL info: Synthesizing algorithm(s) for {", ".join(collective_names)}...') machine = detect_machine(verbose) machine_name, machine_info = machine if machine_name == "unknown": - print("SCCL could not detect the type of machine. import sccl will be ignored.") + print("SCCL warning: could not detect the type of machine. import sccl will be ignored.") + return {} + if world_size != 16: + print(f'SCCL warning: currently only generates alltoall for 2 ndv2 nodes. import sccl will be ignored.') return {} plan = select_synthesis_plan(machine) + if plan.is_dgx1() == False: + print(f'SCCL warning: the node does seem like a ndv2. import sccl will be ignored.') + return {} efs = [] for name in collective_names: algo = plan.synthesize(world_size, name, verbose) @@ -124,7 +130,7 @@ def _autosynth_assume_deterministic_z3_and_ompi(verbose): print(f'SCCL: Wrote to {ef_file}') if len(ef_files) != 1: - raise RuntimeError(f'Only a single algorithm is supported currently by the NCCL backend, but got {len(efs)}.') + raise RuntimeError(f'SCCL error: only a single algorithm is supported currently by the NCCL backend, but got {len(efs)}.') return { 'SCCL_XML_FILE': ef_files[0], @@ -195,16 +201,16 @@ def detect_machine(verbose): def _detect_nvidia_machine(verbose): if verbose: - print('SCCL: Checking for NVIDIA machines') + print('SCCL info: Checking for NVIDIA machines') try: smi_topo = subprocess.check_output(['nvidia-smi', 'topo', '-m']).decode("utf-8") except FileNotFoundError: if verbose: - print('SCCL: nvidia-smi not found.') + print('SCCL info: nvidia-smi not found.') return None except subprocess.CalledProcessError: if verbose: - print('SCCL: Found nvidia-smi, but got error.') + print('SCCL warning: Found nvidia-smi, but got error.') return ('unknown', None) nvlink_topo = nvlink_only(smi_topo) diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py index b9eb892..181921b 100644 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ b/sccl/autosynth/dgx1_relay_node_plan.py @@ -31,6 +31,10 @@ def _select_root_nodes(self): # TODO: is this always the right thing? 
return (0,1) + def is_dgx1(self): + isomorphisms = find_isomorphisms(dgx1(), self.local_topo) + return len(isomorphisms) == 4 + def local_rank_permutation(self): isomorphisms = find_isomorphisms(dgx1(), self.local_topo) if len(isomorphisms) != 4: diff --git a/sccl/isomorphisms.py b/sccl/isomorphisms.py index 45c2792..9dabe0b 100644 --- a/sccl/isomorphisms.py +++ b/sccl/isomorphisms.py @@ -48,13 +48,14 @@ def find_isomorphisms(topology, target_topology, limit=None, logging=False): Finds all isomorphisms from one topology to a target topology. Returns a list of permutations. ''' if len(topology.switches) > 0: - raise ValueError('Topologies with switches are not supported.') + print('SCCL Warning: Topologies with switches are not supported. import sccl will be ignored.') + return [] if limit != None and limit <= 0: - return [] + raise ValueError('SCCL error: limit was set improperly.') if topology.num_nodes() != target_topology.num_nodes(): - return [] + raise ValueError('SCCL error: target topology does not match with the given topology.') if logging: print(f'Encoding {topology.name} - {target_topology.name} isomorphisms to Z3') From 86da9cc8cb703c53d0dbbab53b383643755ff457 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Mon, 26 Jul 2021 14:23:48 -0700 Subject: [PATCH 053/135] Add ngpus attribute to ncclize XML --- sccl/ncclize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sccl/ncclize.py b/sccl/ncclize.py index c98eefe..85fc4d6 100644 --- a/sccl/ncclize.py +++ b/sccl/ncclize.py @@ -563,6 +563,7 @@ def expand_mappings(mappings): nchannels = 1 + max(max(tb.channel for tb in gpu.threadblocks) for gpu in gpus.values()) algorithm.nchannels = nchannels algo_elem.set('nchannels', str(nchannels)) + algo_elem.set('ngpus', str(len(gpus))) if old_format: algo_elem.set('nchunksperloop', str(max(max(gpu.input_chunks, gpu.output_chunks) for gpu in gpus.values()))) for rank, gpu in gpus.items(): From e3e10a48d3a82d76ff66f3164c899c2f00db43cb Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 20 Aug 2021 11:50:20 -0700 Subject: [PATCH 054/135] Redesign sccl.init/autosynth to be declarative Users now have to specify which machine type they are running on, how many of them, which collectives will be used and their input sizes. --- requirements.txt | 1 + sccl/autosynth/__init__.py | 348 +++++++++---------------- sccl/autosynth/dgx1_plans.py | 25 ++ sccl/autosynth/dgx1_relay_node_plan.py | 78 ------ sccl/autosynth/registry.py | 46 ++++ setup.py | 1 + 6 files changed, 189 insertions(+), 310 deletions(-) create mode 100644 sccl/autosynth/dgx1_plans.py delete mode 100644 sccl/autosynth/dgx1_relay_node_plan.py create mode 100644 sccl/autosynth/registry.py diff --git a/requirements.txt b/requirements.txt index aa95aa6..ccbf125 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ dataclasses; python_version < "3.7" z3-solver argcomplete lxml +humanfriendly pytest pytest-cov pytest-xdist diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 6b7c37e..e48b9af 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -1,238 +1,122 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-from sccl.topologies.nvidia import nvlink_only -from sccl.autosynth.dgx1_relay_node_plan import DGX1RelayNodePlan -from sccl.ncclize import ncclize -import re, subprocess, tempfile, os, json, atexit, time - -def init(verbose=False): - env = _autosynth_assume_deterministic_z3_and_ompi(verbose) - os.environ.update(env) - return - - # The code below does not work in all usecases with PyTorch, due to mpi4py calling MPI_Init, which - # some part of PyTorch cannot tolerate. The other way around would work, importing mpi4py after - # torch.distributed has initialized, but currently the SCCL interpreter in NCCL cannot load new algorithms - # after initialization. Once this dynamic loading support lands the code path below can be re-enabled. - - # Detect how this process was launched - if 'LOCAL_RANK' in os.environ: - # Either torch.distributed.run or legacy run with --use_env - has_subprocesses = True - world_size = int(os.environ['WORLD_SIZE']) - is_mpi_process = int(os.environ['LOCAL_RANK']) == 0 - if verbose: - print(f'SCCL: Found LOCAL_RANK in environment, torch.distributed.run (or launch with --use_env) detected.') - else: - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--local_rank", type=int) - args, _ = parser.parse_known_args() - if args.local_rank != None: - # Legacy torch.distributed.launch without --use_env - has_subprocesses = True - world_size = int(os.environ['WORLD_SIZE']) - is_mpi_process = args.local_rank == 0 - if verbose: - print('SCCL: Found --local_rank N argument, legacy torch.distributed.launch without --use_env detected.') +from sccl.topologies import dgx1 +from sccl.isomorphisms import find_isomorphisms +from sccl.autosynth.registry import synthesis_plans +import re +import subprocess +import fcntl +import tempfile +import os +import subprocess +import tempfile +import atexit +import humanfriendly + +from sccl.autosynth.dgx1_plans import register_dgx1_plans +register_dgx1_plans() + + +def init(num_machines, machine_type, *collectives): + plans_and_sizes = [] + for collective in collectives: + name, size = collective + if isinstance(size, str): + size = humanfriendly.parse_size(size) + candidates = synthesis_plans[(name, machine_type)] + valid_candidates = filter( + _candidate_filter(num_machines, size), candidates) + sorted_candidates = sorted(valid_candidates, key=_candidate_sort_key) + description = f'{name} with size {humanfriendly.format_size(size)}' + if len(sorted_candidates) == 0: + print( + f'SCCL: No synthesis plan found for {description}. 
Falling back to NCCL baseline.') else: - # Pure MPI - has_subprocesses = False - world_size = None - is_mpi_process = True - if verbose: - print(f'SCCL: No launcher detected, assuming one MPI rank per process.') - # Name environment file by parent PID, which will be shared between subprocesses for torch.distributed.(launch|run) - env_file = f'/var/lock/sccl_autosynth_env.{os.getppid()}.lock' - if is_mpi_process: - # Synthesize on MPI rank 0 and distribute to all MPI processes - env = _autosynth_and_get_env(world_size, verbose) - # If there are non-MPI subprocesses, they get the environment through a temporary file - if has_subprocesses: - # Make sure the lock file doesn't exist yet - if os.path.exists(env_file): - raise RuntimeError(f'SCCL: Lock file already exists: {env_file}') - # Broadcast algorithm to other subprocesses - fd, private_file = tempfile.mkstemp() - with open(fd, "w") as f: - json.dump(env, f) - os.rename(private_file, env_file) - # Delete the environment file when the local MPI process exits - atexit.register(os.remove, env_file) - else: - assert has_subprocesses - # Wait until the environment file is available - elapsed = 0 - while not os.path.exists(env_file): - time.sleep(1) - elapsed += 1 - if elapsed == 60: - print(f'SCCL: Still waiting to read lock file {env_file}...') - # Load the environment to set from the file - with open(env_file, "r") as f: - env = json.load(f) - - os.environ.update(env) - -def ndv2_perm(verbose=True): - machine = detect_machine(verbose) - if machine[1] == None: - return - plan = select_synthesis_plan(machine) - plan.local_rank_permutation() - - -def _autosynth_assume_deterministic_z3_and_ompi(verbose): - rank = None - if 'WORLD_SIZE' in os.environ: - # We're in a PyTorch launcher compatible script - world_size = int(os.environ['WORLD_SIZE']) - if 'RANK' in os.environ: - rank = int(os.environ['RANK']) - else: - if not 'OMPI_COMM_WORLD_SIZE' in os.environ: - print('SCCL info: Could not detect world size and import SCCL will be ignored.') - return {} - world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) - if 'OMPI_COMM_WORLD_RANK' in os.environ: - rank = int(os.environ['OMPI_COMM_WORLD_RANK']) - - collective_names = ['Alltoall'] - if rank == 0: - print(f'SCCL info: Synthesizing algorithm(s) for {", ".join(collective_names)}...') - - machine = detect_machine(verbose) - machine_name, machine_info = machine - if machine_name == "unknown": - print("SCCL warning: could not detect the type of machine. import sccl will be ignored.") - return {} - if world_size != 16: - print(f'SCCL warning: currently only generates alltoall for 2 ndv2 nodes. import sccl will be ignored.') - return {} - plan = select_synthesis_plan(machine) - if plan.is_dgx1() == False: - print(f'SCCL warning: the node does seem like a ndv2. 
import sccl will be ignored.') - return {} - efs = [] - for name in collective_names: - algo = plan.synthesize(world_size, name, verbose) - efs.append(ncclize(algo, old_format=True, use_scratch=True, instances=8)) - - tempdir = tempfile.mkdtemp() - ef_files = [] - for name, ef in zip(collective_names, efs): - ef_file = os.path.join(tempdir, f'{name}.xml') - ef_files.append(ef_file) - with open(ef_file, 'w') as f: + name, plan, _, _, _ = sorted_candidates[-1] + print(f'SCCL: Synthesis plan for {description} is {name}') + plans_and_sizes.append((plan, size)) + + envs = {} + for plan, size in plans_and_sizes: + ef, env = plan(num_machines, size) + fd, path = tempfile.mkstemp() + with os.fdopen(fd, 'w') as f: f.write(ef) - if verbose: - print(f'SCCL: Wrote to {ef_file}') - - if len(ef_files) != 1: - raise RuntimeError(f'SCCL error: only a single algorithm is supported currently by the NCCL backend, but got {len(efs)}.') - - return { - 'SCCL_XML_FILE': ef_files[0], - 'NCCL_NET_SHARED_BUFFERS': '0', - 'NCCL_MIN_NCHANNELS': str(algo.nchannels) - } - -def _autosynth_and_get_env(world_size, verbose): - try: - from mpi4py import MPI - except ImportError as e: - print('SCCL: Please install the mpi4py package to use SCCL\'s automated init function.') - raise e - comm = MPI.COMM_WORLD - mpi_size = comm.Get_size() - mpi_rank = comm.Get_rank() - - if world_size == None: - world_size = mpi_size - - collective_names = ['Alltoall'] - if mpi_rank == 0: - print(f'SCCL: Synthesizing algorithm(s) for {", ".join(collective_names)}...') - - machine = detect_machine(verbose) - machine_name, machine_info = machine - if machine_name == "unknown": - return {} - plan = select_synthesis_plan(machine) - names = comm.gather(machine[0], root=0) - if mpi_rank == 0: - for i in range(len(names) - 1): - if names[i] != names[i+1]: - raise RuntimeError(f'Rank {i} detected machine as {names[i]} but rank {i+1} detected machine as {names[i+1]}.') - efs = [] - for name in collective_names: - algo = plan.synthesize(world_size, name, verbose) - efs.append(ncclize(algo, old_format=True, use_scratch=True)) - else: - efs = None - efs = comm.bcast(efs, root=0) - - tempdir = tempfile.mkdtemp() - ef_files = [] - for name, ef in zip(collective_names, efs): - ef_file = os.path.join(tempdir, f'{name}.xml') - ef_files.append(ef_file) - with open(ef_file, 'w') as f: - f.write(ef) - if verbose: - print(f'SCCL: Wrote to {ef_file}') - - if len(ef_files) != 1: - raise RuntimeError(f'Only a single algorithm is supported currently by the NCCL backend, but got {len(efs)}.') - - perm = plan.local_rank_permutation() - - return { - 'SCCL_XML_FILE': ef_files[0], - 'CUDA_VISIBLE_DEVICES': ','.join(str(rank) for rank in perm) - } - -def detect_machine(verbose): - machine = _detect_nvidia_machine(verbose) - if machine != None: - return machine - return ('unknown', None) - -def _detect_nvidia_machine(verbose): - if verbose: - print('SCCL info: Checking for NVIDIA machines') - try: - smi_topo = subprocess.check_output(['nvidia-smi', 'topo', '-m']).decode("utf-8") - except FileNotFoundError: - if verbose: - print('SCCL info: nvidia-smi not found.') - return None - except subprocess.CalledProcessError: - if verbose: - print('SCCL warning: Found nvidia-smi, but got error.') - return ('unknown', None) - - nvlink_topo = nvlink_only(smi_topo) - - if nvlink_topo.num_nodes() == 8: # DGX-1 and DGX A100 like nodes - if verbose: - print('SCCL: 8 GPUs, so looks like a DGX-1 or DGX A100.') - if _is_one_host_ib_dgx1(smi_topo): - return ('one_host_ib_dgx1', nvlink_topo) + 
atexit.register(os.remove, path) + if 'SCCL_XML_FILE' in envs: + envs['SCCL_XML_FILE'] += ',' + path else: - if verbose: - print('SCCL: Unknown network configuration.') - return ('unknown', None) - -def _is_one_host_ib_dgx1(smi_topo): - ib_host = re.findall('^mlx\\d_\\d(?:\s+NODE)*\s+X(?:\s+NODE)*\s+$', smi_topo, re.MULTILINE) - ib_any = re.findall('^mlx\\d_\\d.*$', smi_topo, re.MULTILINE) - return len(ib_host) == 1 and len(ib_any) == 1 - -def select_synthesis_plan(machine): - machine_name, machine_info = machine - if machine_name == 'one_host_ib_dgx1': - return DGX1RelayNodePlan(machine_info) - else: - raise RuntimeError(f'Unhandled machine type {machine_name}.') \ No newline at end of file + envs['SCCL_XML_FILE'] = path + envs.update(env) + + os.environ.update(envs) + + +def _candidate_filter(m, s): + def fun(candidate): + _, _, machines, size_ranges, _ = candidate + size_matches = any(map(lambda x: x[0] <= s and s <= x[1], size_ranges)) + return size_matches and machines(m) + return fun + + +def _candidate_sort_key(candidate): + _, _, _, _, priority = candidate + return priority + + +def ndv2_perm(self): + # This function is used in a hacky way right now. The sccl_ndv2_launcher.sh + # relies on the side effect of _select_isomorphism creating the lock file, + # which is read by the script after calling this function, so the return + # value does't currently get used. If you make changes, please fix or update + # sccl_ndv2_launcher.sh accordingly. + isomorphisms = find_isomorphisms(dgx1(), self.local_topo) + if len(isomorphisms) != 4: + raise RuntimeError( + f'Expected to find 4 isomorphisms to DGX1 topology, but found {len(isomorphisms)}.') + return _select_isomorphism(isomorphisms) + + +def _select_isomorphism(self, isomorphisms, verbose=True): + with open('/var/lock/sccl_autosynth_inspector_topo.lock', "a+") as f: + fcntl.lockf(f, fcntl.LOCK_EX) + try: + f.seek(0, 2) + size = f.tell() + if size > 0: + f.seek(0) + order = f.read() + if verbose: + print(f'SCCL: Read IB placement from {f.name}') + return order + else: + print( + 'SCCL: Running inspector-topo to find the IB placement. This will take a couple of minutes...') + topo_detect = subprocess.run( + ['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"}) + print('SCCL: Finished running inspector-topo. Finding the permutaion.') + if topo_detect.returncode != 0: + raise RuntimeError( + f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') + topo_detect_output = topo_detect.stdout.decode('utf-8') + g = re.search( + "GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) + if g is None: + raise RuntimeError( + f'expected to detect a pair of GPUs connected to IB but something went wrong!') + ib_gpus = {int(g.group(1)), int(g.group(2))} + for iso in isomorphisms: + if len(ib_gpus.intersection({iso.nodes[0], iso.nodes[2]})) == 0: + nodes = iso.nodes + order = ",".join(str(rank) for rank in nodes) + f.write(order) + f.flush() + if verbose: + print(f'SCCL: Wrote IB placement to {f.name}') + return order + raise RuntimeError( + f'expected an isomorphism to match our expectation but none of them did!') + finally: + fcntl.lockf(f, fcntl.LOCK_UN) diff --git a/sccl/autosynth/dgx1_plans.py b/sccl/autosynth/dgx1_plans.py new file mode 100644 index 0000000..084ff65 --- /dev/null +++ b/sccl/autosynth/dgx1_plans.py @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +from sccl.topologies import dgx1 +from sccl.collectives import gather, scatter +from sccl.strategies import solve_least_steps +from sccl.distributors.gather_scatter_alltoall import synthesize_gather_scatter_distributed_alltoall +from sccl.autosynth.registry import register_synthesis_plan +from sccl.ncclize import ncclize + + +def register_dgx1_plans(): + @register_synthesis_plan('alltoall', 'dgx1', machines=lambda x: x >= 2) + def synthesize_dgx1_relay_alltoall(machines, size): + gather_coll = gather(8, 0) + scatter_coll = scatter(8, 1) + gather_algo = solve_least_steps(dgx1(), gather_coll) + scatter_algo = solve_least_steps(dgx1(), scatter_coll) + algo = synthesize_gather_scatter_distributed_alltoall( + machines, gather_algo, scatter_algo) + ef = ncclize(algo, old_format=True, use_scratch=True, instances=8) + return (ef, { + 'NCCL_NET_SHARED_BUFFERS': '0', + 'NCCL_MIN_NCHANNELS': str(algo.nchannels) + }) \ No newline at end of file diff --git a/sccl/autosynth/dgx1_relay_node_plan.py b/sccl/autosynth/dgx1_relay_node_plan.py deleted file mode 100644 index 181921b..0000000 --- a/sccl/autosynth/dgx1_relay_node_plan.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -from sccl.topologies import dgx1 -from sccl.collectives import gather, scatter -from sccl.strategies import solve_least_steps -from sccl.distributors.gather_scatter_alltoall import synthesize_gather_scatter_distributed_alltoall -from sccl.isomorphisms import find_isomorphisms -import re, subprocess, fcntl, tempfile, os, json - -class DGX1RelayNodePlan: - def __init__(self, local_topo): - self.local_topo = local_topo - - def synthesize(self, world_size, collective_name, logging=False): - if world_size % self.local_topo.num_nodes() != 0: - raise RuntimeError('Local machine size does not evenly divide world size.') - num_machines = world_size // self.local_topo.num_nodes() - if collective_name == 'Alltoall': - return self._synthesize_alltoall(num_machines, logging) - - def _synthesize_alltoall(self, num_machines, logging): - outbound, inbound = self._select_root_nodes() - gather_coll = gather(8, outbound) - scatter_coll = scatter(8, inbound) - gather_algo = solve_least_steps(dgx1(), gather_coll, logging=logging) - scatter_algo = solve_least_steps(dgx1(), scatter_coll, logging=logging) - return synthesize_gather_scatter_distributed_alltoall(num_machines, gather_algo, scatter_algo, logging) - - def _select_root_nodes(self): - # TODO: is this always the right thing? - return (0,1) - - def is_dgx1(self): - isomorphisms = find_isomorphisms(dgx1(), self.local_topo) - return len(isomorphisms) == 4 - - def local_rank_permutation(self): - isomorphisms = find_isomorphisms(dgx1(), self.local_topo) - if len(isomorphisms) != 4: - raise RuntimeError(f'Expected to find 4 isomorphisms to DGX1 topology, but found {len(isomorphisms)}.') - return self._select_isomorphism(isomorphisms) - - def _select_isomorphism(self, isomorphisms, verbose=True): - with open('/var/lock/sccl_autosynth_inspector_topo.lock', "a+") as f: - fcntl.lockf(f, fcntl.LOCK_EX) - try: - f.seek(0, 2) - size = f.tell() - if size > 0: - f.seek(0) - order = f.read() - if verbose: - print(f'SCCL: Read IB placement from {f.name}') - return order - else: - print('SCCL: Running inspector-topo to find the IB placement. 
This will take a couple of minutes...') - topo_detect = subprocess.run(['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEVICES":"0,1,2,3,4,5,6,7"}) - print('SCCL: Finished running inspector-topo. Finding the permutaion.') - if topo_detect.returncode != 0: - raise RuntimeError(f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') - topo_detect_output = topo_detect.stdout.decode('utf-8') - g = re.search("GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) - if g is None: - raise RuntimeError(f'expected to detect a pair of GPUs connected to IB but something went wrong!') - ib_gpus = {int(g.group(1)), int(g.group(2))} - for iso in isomorphisms: - if len(ib_gpus.intersection({iso.nodes[0],iso.nodes[2]})) == 0: - nodes = iso.nodes - order = ",".join(str(rank) for rank in nodes) - f.write(order) - f.flush() - if verbose: - print(f'SCCL: Wrote IB placement to {f.name}') - return order - raise RuntimeError(f'expected an isomorphism to match our expectation but none of them did!') - finally: - fcntl.lockf(f, fcntl.LOCK_UN) diff --git a/sccl/autosynth/registry.py b/sccl/autosynth/registry.py new file mode 100644 index 0000000..22ef64f --- /dev/null +++ b/sccl/autosynth/registry.py @@ -0,0 +1,46 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from collections import defaultdict +import math +import humanfriendly + +# The plans are keyed by (collective, machine_type) and each entry is a tuple +# (name, function, machines, size_ranges, priority). +synthesis_plans = defaultdict(list) + + +def register_synthesis_plan(collective, machine_type, machines=lambda x: True, sizes=None, priority=0): + def decorator(fun): + # Parse size_ranges + size_ranges = [] + + def parse_sizes(x): + lower, upper = x + if isinstance(lower, str): + lower = humanfriendly.parse_size(lower) + if isinstance(upper, str): + upper = humanfriendly.parse_size(upper) + if lower == None: + lower = 0 + if upper == None: + upper = math.inf + return (lower, upper) + + if sizes == None: + size_ranges.append((0, math.inf)) + elif isinstance(sizes, list): + for x in sizes: + size_ranges.append(parse_sizes(x)) + else: + size_ranges.append(parse_sizes(sizes)) + # Register entries under all keys that might trigger this plan + entry = (fun.__name__, fun, machines, size_ranges, priority) + if isinstance(machine_type, list): + for mtype in machine_type: + synthesis_plans[(collective, mtype)].append(entry) + else: + synthesis_plans[(collective, machine_type)].append(entry) + # Return the original function to not break other use + return fun + return decorator diff --git a/setup.py b/setup.py index 5a7c47e..af80e4a 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ 'z3-solver', 'argcomplete', 'lxml', + 'humanfriendly' ], python_requires='>=3.6', ) From e7a8d60410d605fb2ffdf37b793f3cc5b8018b7f Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 20 Aug 2021 12:10:54 -0700 Subject: [PATCH 055/135] Add tests for autosynth --- sccl/autosynth/__init__.py | 4 ++-- sccl/autosynth/registry.py | 2 -- tests/test_autosynth.py | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 tests/test_autosynth.py diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index e48b9af..45ba823 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -66,7 +66,7 @@ def _candidate_sort_key(candidate): return priority -def ndv2_perm(self): +def ndv2_perm(self): # pragma: no cover # This 
function is used in a hacky way right now. The sccl_ndv2_launcher.sh # relies on the side effect of _select_isomorphism creating the lock file, # which is read by the script after calling this function, so the return @@ -79,7 +79,7 @@ def ndv2_perm(self): return _select_isomorphism(isomorphisms) -def _select_isomorphism(self, isomorphisms, verbose=True): +def _select_isomorphism(self, isomorphisms, verbose=True): # pragma: no cover with open('/var/lock/sccl_autosynth_inspector_topo.lock', "a+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: diff --git a/sccl/autosynth/registry.py b/sccl/autosynth/registry.py index 22ef64f..bde3b38 100644 --- a/sccl/autosynth/registry.py +++ b/sccl/autosynth/registry.py @@ -21,8 +21,6 @@ def parse_sizes(x): lower = humanfriendly.parse_size(lower) if isinstance(upper, str): upper = humanfriendly.parse_size(upper) - if lower == None: - lower = 0 if upper == None: upper = math.inf return (lower, upper) diff --git a/tests/test_autosynth.py b/tests/test_autosynth.py new file mode 100644 index 0000000..f9ed50b --- /dev/null +++ b/tests/test_autosynth.py @@ -0,0 +1,21 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import pytest +import sccl +from sccl.autosynth.registry import register_synthesis_plan + + +def test_sccl_init(): + sccl.init(4, 'not_a_machine_type', ('alltoall', 0)) + sccl.init(2, 'dgx1', ('alltoall', '1MB')) + + +def test_register_plan(): + @register_synthesis_plan('allgather', 'fancy_machine', sizes=(0, '4MB')) + def dummy_plan(m, s): + pass + + @register_synthesis_plan('allgather', ['m1', 'm2'], sizes=[(0, '4MB'), ('1GiB', None)]) + def dummy_plan(m, s): + pass From 40feb7619d4a2381577b248ffbf9a4129b5b2bc9 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 20 Aug 2021 12:27:22 -0700 Subject: [PATCH 056/135] Bump version to 2.2.0 due to API changes --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index af80e4a..828b8dc 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='sccl', - version='2.1.2', + version='2.2.0', packages=find_packages(), entry_points={ 'console_scripts': [ From b9f3f5c77aca59eb937671a351982af7b46170bc Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 20 Aug 2021 13:20:14 -0700 Subject: [PATCH 057/135] Add a way to register existing SCCL-EF files --- sccl/autosynth/__init__.py | 16 ++------ sccl/autosynth/registry.py | 78 ++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 40 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 45ba823..eb0dfb7 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -7,11 +7,7 @@ import re import subprocess import fcntl -import tempfile import os -import subprocess -import tempfile -import atexit import humanfriendly from sccl.autosynth.dgx1_plans import register_dgx1_plans @@ -31,19 +27,15 @@ def init(num_machines, machine_type, *collectives): description = f'{name} with size {humanfriendly.format_size(size)}' if len(sorted_candidates) == 0: print( - f'SCCL: No synthesis plan found for {description}. Falling back to NCCL baseline.') + f'SCCL: No plan found for {description}. 
Falling back to NCCL baseline.') else: - name, plan, _, _, _ = sorted_candidates[-1] - print(f'SCCL: Synthesis plan for {description} is {name}') + desc, plan, _, _, _ = sorted_candidates[-1] + print(f'SCCL: Plan for {description} is {desc}') plans_and_sizes.append((plan, size)) envs = {} for plan, size in plans_and_sizes: - ef, env = plan(num_machines, size) - fd, path = tempfile.mkstemp() - with os.fdopen(fd, 'w') as f: - f.write(ef) - atexit.register(os.remove, path) + path, env = plan(num_machines, size) if 'SCCL_XML_FILE' in envs: envs['SCCL_XML_FILE'] += ',' + path else: diff --git a/sccl/autosynth/registry.py b/sccl/autosynth/registry.py index bde3b38..6c25f9c 100644 --- a/sccl/autosynth/registry.py +++ b/sccl/autosynth/registry.py @@ -3,6 +3,9 @@ from collections import defaultdict import math +import tempfile +import os +import atexit import humanfriendly # The plans are keyed by (collective, machine_type) and each entry is a tuple @@ -10,35 +13,54 @@ synthesis_plans = defaultdict(list) +def _register_ef_provider(desc, fun, collective, machine_type, machines, sizes, priority): + # Parse size_ranges + size_ranges = [] + + def parse_sizes(x): + lower, upper = x + if isinstance(lower, str): + lower = humanfriendly.parse_size(lower) + if isinstance(upper, str): + upper = humanfriendly.parse_size(upper) + if upper == None: + upper = math.inf + return (lower, upper) + + if sizes == None: + size_ranges.append((0, math.inf)) + elif isinstance(sizes, list): + for x in sizes: + size_ranges.append(parse_sizes(x)) + else: + size_ranges.append(parse_sizes(sizes)) + # Register entries under all keys that might trigger this plan + entry = (desc, fun, machines, size_ranges, priority) + if isinstance(machine_type, list): + for mtype in machine_type: + synthesis_plans[(collective, mtype)].append(entry) + else: + synthesis_plans[(collective, machine_type)].append(entry) + + +def register_ef_file(path, collective, machine_type, num_machines, sizes=None, priority=1): + def provide_ef_path(machines, size): + return path, {} + _register_ef_provider(f'load {path}', provide_ef_path, collective, + machine_type, lambda x: x == num_machines, sizes, priority) + + def register_synthesis_plan(collective, machine_type, machines=lambda x: True, sizes=None, priority=0): def decorator(fun): - # Parse size_ranges - size_ranges = [] - - def parse_sizes(x): - lower, upper = x - if isinstance(lower, str): - lower = humanfriendly.parse_size(lower) - if isinstance(upper, str): - upper = humanfriendly.parse_size(upper) - if upper == None: - upper = math.inf - return (lower, upper) - - if sizes == None: - size_ranges.append((0, math.inf)) - elif isinstance(sizes, list): - for x in sizes: - size_ranges.append(parse_sizes(x)) - else: - size_ranges.append(parse_sizes(sizes)) - # Register entries under all keys that might trigger this plan - entry = (fun.__name__, fun, machines, size_ranges, priority) - if isinstance(machine_type, list): - for mtype in machine_type: - synthesis_plans[(collective, mtype)].append(entry) - else: - synthesis_plans[(collective, machine_type)].append(entry) - # Return the original function to not break other use + def wrapped(machines, size): + ef, env = fun(machines, size) + fd, path = tempfile.mkstemp() + with os.fdopen(fd, 'w') as f: + f.write(ef) + atexit.register(os.remove, path) + return path, env + _register_ef_provider(f'call {fun.__name__}', wrapped, collective, + machine_type, machines, sizes, priority) + # Return the original function to not break other usage return fun return decorator 
From adba54c61039482ff6b50dcfe5727e87ba51a55f Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 20 Aug 2021 15:23:22 -0700 Subject: [PATCH 058/135] Add handwritten alltoall for A100s --- sccl/autosynth/__init__.py | 2 + sccl/autosynth/a100_plans.py | 101 +++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 sccl/autosynth/a100_plans.py diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index eb0dfb7..5bd2646 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -11,7 +11,9 @@ import humanfriendly from sccl.autosynth.dgx1_plans import register_dgx1_plans +from sccl.autosynth.a100_plans import register_a100_plans register_dgx1_plans() +register_a100_plans() def init(num_machines, machine_type, *collectives): diff --git a/sccl/autosynth/a100_plans.py b/sccl/autosynth/a100_plans.py new file mode 100644 index 0000000..3f8642d --- /dev/null +++ b/sccl/autosynth/a100_plans.py @@ -0,0 +1,101 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from sccl.autosynth.registry import register_synthesis_plan + +def register_a100_plans(): + @register_synthesis_plan('alltoall', 'a100', machines=lambda x: x == 9) + def synthesize_a100_hierarchical(machines, size): + xml = "" + nnodes = 9 + assert(machines == nnodes) + ngpuspernode = 8 + instances = 2 + nchunksperloop = nnodes*ngpuspernode*instances + xml += (''.format(nchunksperloop, 2*instances)) + '\n' + + def CrossNodeNghr(node, g): + nghrNode = g if node > g else g+1 + nghrG = node if nghrNode > node else node-1 + return nghrNode, nghrG, nghrNode * ngpuspernode + nghrG + for node in range(nnodes): + for g in range(ngpuspernode): + tbindex = 0 + nghrNode, nghrG, crossnodenghr = CrossNodeNghr(node,g) + xml += (' '.format(node*ngpuspernode+g, nchunksperloop, nchunksperloop, instances*2*ngpuspernode**2)) + '\n' + for ch in range(instances): + xml += (' '.format(tbindex, crossnodenghr, ch)) + '\n' + xml += (' '.format(ch*ngpuspernode**2, instances*ngpuspernode**2+ch*ngpuspernode**2, ngpuspernode**2, instances*(2+2*g)+ch, ngpuspernode)) + '\n' + xml += (' ') + '\n' + tbindex+=1 + for ch in range(instances): + xml += (' '.format(tbindex, crossnodenghr, ch)) + '\n' + xml += (' '.format(ch*ngpuspernode**2, instances*ngpuspernode**2+ch*ngpuspernode**2, ngpuspernode**2)) + '\n' + xml += (' ') + '\n' + tbindex+=1 + for withinnodenghr in range(ngpuspernode): + withinNghrNode, withinNghrG, withinCrossNodeNghr = CrossNodeNghr(node, withinnodenghr) + if withinnodenghr == g: + for ch in range(instances): + step = 0 + xml += (' '.format(tbindex)) + '\n' + xml += (' '.format(step, instances*nghrNode*ngpuspernode+ch*ngpuspernode, instances*g*ngpuspernode+ch*ngpuspernode, ngpuspernode, 1)) + '\n' + step += 1 + for j in range(ch*(ngpuspernode//instances), (ch+1)*(ngpuspernode//instances)): + for k in range(instances): + xml += (' '.format(step, (instances*(2*j+2+1)+k) if j < g else (instances*(2*j+2)+k), 0, 1 if step == 1+ngpuspernode-1 else 0)) + '\n' + step += 1 + xml += (' ') + '\n' + tbindex+=1 + else: + for ch in range(instances): + xml += (' '.format(tbindex, node*ngpuspernode+withinnodenghr, ch)) + '\n' + xml += (' '.format(instances*withinNghrNode*ngpuspernode+ch*ngpuspernode, instances*g*ngpuspernode+ch*ngpuspernode, ngpuspernode)) + '\n' + xml += (' ') + '\n' + tbindex+=1 + for ch in range(instances): + xml += (' '.format(tbindex, node*ngpuspernode+withinnodenghr, ch)) + '\n' + xml += (' '.format(instances*nghrNode*ngpuspernode+ch*ngpuspernode, 
instances*withinnodenghr*ngpuspernode+ch*ngpuspernode, ngpuspernode)) + '\n' + xml += (' ') + '\n' + tbindex+=1 + + # -------------------------------- + for withinnodenghr in range(ngpuspernode): + withinNghrNode, withinNghrG, withinCrossNodeNghr = CrossNodeNghr(node, withinnodenghr) + if withinnodenghr == g: + for ch in range(instances): + xml += (' '.format(tbindex)) + '\n' + step = 0 + xml += (' '.format(step, instances*(node*ngpuspernode+g)+ch, instances*(node*ngpuspernode+g)+ch, 1)) + '\n' + step += 1 + for j in range(ngpuspernode): + xml += (' '.format(step, instances*(ngpuspernode**2+j*ngpuspernode+g)+ch, instances*(nghrNode*ngpuspernode+j)+ch, 1, instances+(instances*(j*ngpuspernode+g)+ch)//((instances*ngpuspernode**2)//instances), 0)) + '\n' + step += 1 + xml += (' ') + '\n' + tbindex+=1 + else: + for ch in range(instances): + xml += (' '.format(tbindex, node*ngpuspernode+withinnodenghr, instances+ch)) + '\n' + step = 0 + xml += (' '.format(step, instances*(node*ngpuspernode+withinnodenghr)+ch, instances*(node*ngpuspernode+g)+ch, 1)) + '\n' + step += 1 + for j in range(ngpuspernode): + xml += (' '.format(step, instances*(ngpuspernode**2+j*ngpuspernode+withinnodenghr)+ch, instances*(nghrNode*ngpuspernode+j)+ch, 1, instances+(instances*(j*ngpuspernode+withinnodenghr)+ch)//((instances*ngpuspernode**2)//instances), 0)) + '\n' + step += 1 + xml += (' ') + '\n' + tbindex+=1 + for ch in range(instances): + xml += (' '.format(tbindex, node*ngpuspernode+withinnodenghr, instances+ch)) + '\n' + step = 0 + xml += (' '.format(step, instances*(node*ngpuspernode+g)+ch, instances*(node*ngpuspernode+withinnodenghr)+ch, 1)) + '\n' + step += 1 + for j in range(ngpuspernode): + xml += (' '.format(step, instances*(ngpuspernode**2+j*ngpuspernode+g)+ch, instances*(withinNghrNode*ngpuspernode+j)+ch, 1)) + '\n' + step += 1 + xml += (' ') + '\n' + tbindex+=1 + xml += (' ') + '\n' + xml += ('') + '\n' + return xml, { + # TODO: NCCL environment variables + } \ No newline at end of file From 9278dd052230080ad8fd43ed17742305a88e703a Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Mon, 23 Aug 2021 14:06:14 -0700 Subject: [PATCH 059/135] A100 autosynth test and more stringent tests --- sccl/autosynth/a100_plans.py | 2 +- tests/test_autosynth.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/a100_plans.py b/sccl/autosynth/a100_plans.py index 3f8642d..ceafbf5 100644 --- a/sccl/autosynth/a100_plans.py +++ b/sccl/autosynth/a100_plans.py @@ -5,7 +5,7 @@ def register_a100_plans(): @register_synthesis_plan('alltoall', 'a100', machines=lambda x: x == 9) - def synthesize_a100_hierarchical(machines, size): + def synthesize_a100_hierarchical_alltoall(machines, size): xml = "" nnodes = 9 assert(machines == nnodes) diff --git a/tests/test_autosynth.py b/tests/test_autosynth.py index f9ed50b..3bceacc 100644 --- a/tests/test_autosynth.py +++ b/tests/test_autosynth.py @@ -6,9 +6,16 @@ from sccl.autosynth.registry import register_synthesis_plan -def test_sccl_init(): +def test_sccl_init(capsys): sccl.init(4, 'not_a_machine_type', ('alltoall', 0)) + out, err = capsys.readouterr() + assert 'No plan found' in out sccl.init(2, 'dgx1', ('alltoall', '1MB')) + out, err = capsys.readouterr() + assert 'synthesize_dgx1_relay_alltoall' in out + sccl.init(9, 'a100', ('alltoall', '1MB')) + out, err = capsys.readouterr() + assert 'synthesize_a100_hierarchical_alltoall' in out def test_register_plan(): From 7718c5abd9c03ed64951ea8b9ff920776640974e Mon Sep 17 00:00:00 2001 From: Olli Saarikivi 
Date: Tue, 24 Aug 2021 18:08:50 -0700 Subject: [PATCH 060/135] Logic for combining NCCL_MIN_CHANNELS requirements Also synthesis plans no longer return an env, just a path to the XML. --- sccl/autosynth/__init__.py | 55 +++++++++++++++++++++++++++++------- sccl/autosynth/a100_plans.py | 4 +-- sccl/autosynth/dgx1_plans.py | 6 +--- sccl/autosynth/registry.py | 6 ++-- 4 files changed, 50 insertions(+), 21 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 5bd2646..514f828 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -35,16 +35,23 @@ def init(num_machines, machine_type, *collectives): print(f'SCCL: Plan for {description} is {desc}') plans_and_sizes.append((plan, size)) - envs = {} + paths = None + max_min_channels = 0 for plan, size in plans_and_sizes: - path, env = plan(num_machines, size) - if 'SCCL_XML_FILE' in envs: - envs['SCCL_XML_FILE'] += ',' + path - else: - envs['SCCL_XML_FILE'] = path - envs.update(env) - - os.environ.update(envs) + path = plan(num_machines, size) + min_channels = _extract_min_channels(path) + if min_channels: + max_min_channels = max(max_min_channels, min_channels) + if paths: + paths += f',{path}' + else: + paths = path + if paths: + os.environ.update({ + 'SCCL_XML_FILE': paths, + 'NCCL_MIN_NCHANNELS': str(max_min_channels), + 'NCCL_NET_SHARED_BUFFERS': '0' + }) def _candidate_filter(m, s): @@ -60,6 +67,34 @@ def _candidate_sort_key(candidate): return priority +def _extract_min_channels(path): + algo_pattern = re.compile(']*>') + nchannels_pattern = re.compile('nchannels=["\'](\\d+)["\']') + with open(path) as f: + # Try with the first line + first_line = f.readline() + match = algo_pattern.search(first_line) + if match: + tag_match = nchannels_pattern.search(match.group(0)) + if not tag_match: + print(f'SCCL: Skipping algorithm, could not read nchannels from tag in {path}') + return None + return int(tag_match.group(1)) + # Try again with the whole file + f.seek(0) + whole_file = f.read() + match = algo_pattern.search(whole_file) + if match: + tag_match = nchannels_pattern.search(match.group(0)) + if not tag_match: + print(f'SCCL: Skipping algorithm, could not read nchannels from tag in {path}') + return None + return int(tag_match.group(1)) + else: + print(f'SCCL: Skipping algorithm, could not find tag in {path}') + return None + + def ndv2_perm(self): # pragma: no cover # This function is used in a hacky way right now. 
The sccl_ndv2_launcher.sh # relies on the side effect of _select_isomorphism creating the lock file, @@ -96,7 +131,7 @@ def _select_isomorphism(self, isomorphisms, verbose=True): # pragma: no cover f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') topo_detect_output = topo_detect.stdout.decode('utf-8') g = re.search( - "GPU pair shared with NIC appears to be (\d) and (\d)", topo_detect_output) + 'GPU pair shared with NIC appears to be (\\d) and (\\d)', topo_detect_output) if g is None: raise RuntimeError( f'expected to detect a pair of GPUs connected to IB but something went wrong!') diff --git a/sccl/autosynth/a100_plans.py b/sccl/autosynth/a100_plans.py index ceafbf5..ce3502f 100644 --- a/sccl/autosynth/a100_plans.py +++ b/sccl/autosynth/a100_plans.py @@ -96,6 +96,4 @@ def CrossNodeNghr(node, g): tbindex+=1 xml += (' ') + '\n' xml += ('') + '\n' - return xml, { - # TODO: NCCL environment variables - } \ No newline at end of file + return xml \ No newline at end of file diff --git a/sccl/autosynth/dgx1_plans.py b/sccl/autosynth/dgx1_plans.py index 084ff65..9793853 100644 --- a/sccl/autosynth/dgx1_plans.py +++ b/sccl/autosynth/dgx1_plans.py @@ -18,8 +18,4 @@ def synthesize_dgx1_relay_alltoall(machines, size): scatter_algo = solve_least_steps(dgx1(), scatter_coll) algo = synthesize_gather_scatter_distributed_alltoall( machines, gather_algo, scatter_algo) - ef = ncclize(algo, old_format=True, use_scratch=True, instances=8) - return (ef, { - 'NCCL_NET_SHARED_BUFFERS': '0', - 'NCCL_MIN_NCHANNELS': str(algo.nchannels) - }) \ No newline at end of file + return ncclize(algo, old_format=True, use_scratch=True, instances=8) \ No newline at end of file diff --git a/sccl/autosynth/registry.py b/sccl/autosynth/registry.py index 6c25f9c..93f77ce 100644 --- a/sccl/autosynth/registry.py +++ b/sccl/autosynth/registry.py @@ -45,7 +45,7 @@ def parse_sizes(x): def register_ef_file(path, collective, machine_type, num_machines, sizes=None, priority=1): def provide_ef_path(machines, size): - return path, {} + return path _register_ef_provider(f'load {path}', provide_ef_path, collective, machine_type, lambda x: x == num_machines, sizes, priority) @@ -53,12 +53,12 @@ def provide_ef_path(machines, size): def register_synthesis_plan(collective, machine_type, machines=lambda x: True, sizes=None, priority=0): def decorator(fun): def wrapped(machines, size): - ef, env = fun(machines, size) + ef = fun(machines, size) fd, path = tempfile.mkstemp() with os.fdopen(fd, 'w') as f: f.write(ef) atexit.register(os.remove, path) - return path, env + return path _register_ef_provider(f'call {fun.__name__}', wrapped, collective, machine_type, machines, sizes, priority) # Return the original function to not break other usage From e084ce66b9ba1079946955d0ff8bfc00c2abd67b Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Mon, 13 Sep 2021 18:09:24 -0700 Subject: [PATCH 061/135] Runtime algorithm selection support in sccl.init User can give a size range instead of a single size for each collective. Selected plans are written in a new XML format with params for NCCL. Plans can no longer be registered with many intervals in the same entry, instead multiple registrations to same plan should be used. Add protocol to plan registry. Remove the size parameter from synthesis plans. 
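As a sketch of the resulting interface (argument order and machine names as of this patch; the byte values and generated attributes below are illustrative, derived from the selection logic rather than copied from the source):

    import sccl
    # A per-collective size range can now be given instead of a single size;
    # one plan is selected for each non-overlapping subinterval of the range.
    sccl.init(2, 'dgx1', ('alltoall', (1024, 33554432)))

The selected plans are written to a temporary XML config whose path is exported as SCCL_CONFIG, roughly of the form <sccl_algos> <load path="..." minsize="1024" maxsize="33554433" proto="Simple"/> </sccl_algos>, with NCCL_MIN_NCHANNELS and NCCL_NET_SHARED_BUFFERS set alongside it.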
--- sccl/autosynth/__init__.py | 124 +++++++++++++++++++++++++---------- sccl/autosynth/a100_plans.py | 2 +- sccl/autosynth/dgx1_plans.py | 2 +- sccl/autosynth/registry.py | 39 +++++------ 4 files changed, 105 insertions(+), 62 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 514f828..fbb0fb0 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -4,10 +4,12 @@ from sccl.topologies import dgx1 from sccl.isomorphisms import find_isomorphisms from sccl.autosynth.registry import synthesis_plans +from lxml import etree as ET import re import subprocess import fcntl import os +import tempfile import humanfriendly from sccl.autosynth.dgx1_plans import register_dgx1_plans @@ -17,53 +19,103 @@ def init(num_machines, machine_type, *collectives): - plans_and_sizes = [] + # Collect and sort all plans that match the collectives and sizes given by the user. + selected_plans = {} for collective in collectives: - name, size = collective - if isinstance(size, str): - size = humanfriendly.parse_size(size) - candidates = synthesis_plans[(name, machine_type)] - valid_candidates = filter( - _candidate_filter(num_machines, size), candidates) - sorted_candidates = sorted(valid_candidates, key=_candidate_sort_key) - description = f'{name} with size {humanfriendly.format_size(size)}' - if len(sorted_candidates) == 0: - print( - f'SCCL: No plan found for {description}. Falling back to NCCL baseline.') + name, sizes = collective + if isinstance(sizes, tuple): + lower, upper = sizes + if isinstance(lower, str): + lower = humanfriendly.parse_size(lower) + if isinstance(upper, str): + upper = humanfriendly.parse_size(upper) + sizes = (lower, upper) else: - desc, plan, _, _, _ = sorted_candidates[-1] - print(f'SCCL: Plan for {description} is {desc}') - plans_and_sizes.append((plan, size)) - - paths = None - max_min_channels = 0 - for plan, size in plans_and_sizes: - path = plan(num_machines, size) - min_channels = _extract_min_channels(path) - if min_channels: - max_min_channels = max(max_min_channels, min_channels) - if paths: - paths += f',{path}' - else: - paths = path - if paths: + if isinstance(sizes, str): + sizes = humanfriendly.parse_size(sizes) + sizes = (sizes, sizes) + candidates = synthesis_plans[(name, machine_type)] + selected_plans[name] = _select_plans(name, candidates, num_machines, sizes) + + if len(selected_plans) > 0: + # Execute the plans to find or synthesize the algorithms and format them in the XML format expected by SCCL-RT. 
+ algos_elem = ET.Element('sccl_algos') + max_min_channels = 0 + for collective_name, plans in selected_plans.items(): + for plan, params in plans: + path = plan(num_machines) + min_channels = _extract_min_channels(path) + # Skip the algorithm if minimum channels could not be determined (corrupted XML for example) + if min_channels: + max_min_channels = max(max_min_channels, min_channels) + + load_elem = ET.SubElement(algos_elem, 'load') + load_elem.set('path', path) + minsize, maxsize, proto = params + load_elem.set('minsize', str(minsize)) + load_elem.set('maxsize', str(maxsize+1)) + load_elem.set('proto', proto) + ET.indent(algos_elem, space=' ') + + fd, path = tempfile.mkstemp() + with os.fdopen(fd, 'w') as f: + f.write(ET.tostring(algos_elem, encoding='unicode')) os.environ.update({ - 'SCCL_XML_FILE': paths, + 'SCCL_CONFIG': path, 'NCCL_MIN_NCHANNELS': str(max_min_channels), 'NCCL_NET_SHARED_BUFFERS': '0' }) + else: + print(f'SCCL: No algorithms were selected.') -def _candidate_filter(m, s): - def fun(candidate): - _, _, machines, size_ranges, _ = candidate - size_matches = any(map(lambda x: x[0] <= s and s <= x[1], size_ranges)) - return size_matches and machines(m) - return fun +def _select_plans(name, candidates, num_machines, sizes): + candidate_intervals = [(sizes, [])] + valid_candidates = list(filter(lambda x: x[2](num_machines), candidates)) + for candidate in valid_candidates: + csizes = candidate[3] + i = 0 + while i < len(candidate_intervals): + ival = candidate_intervals[i] + isizes = ival[0] + if isizes[1] < csizes[0]: + i += 1 + continue + if isizes[0] > csizes[1]: + break + if isizes[0] < csizes[0]: + del candidate_intervals[i] + candidate_intervals.insert(i, ((csizes[0], isizes[1]), ival[1])) + candidate_intervals.insert(i, ((isizes[0], csizes[0]-1), ival[1].copy())) + i += 1 + continue + if isizes[1] > csizes [1]: + del candidate_intervals[i] + candidate_intervals.insert(i, ((csizes[1]+1, isizes[1]), ival[1])) + candidate_intervals.insert(i, ((isizes[0], csizes[1]), ival[1] + [candidate])) + break + ival[1].append(candidate) + csizes = (isizes[1]+1,csizes[1]) + if csizes[0] > csizes[1]: + break + results = [] + for isizes, candidates in candidate_intervals: + sorted_candidates = sorted(candidates, key=_candidate_sort_key) + description = f'{name} with sizes from {humanfriendly.format_size(isizes[0])} to {humanfriendly.format_size(isizes[1])}' + if len(sorted_candidates) == 0: + print(f'SCCL: No plan found for {description}. 
Falling back to NCCL baseline.') + else: + desc, plan, _, _, proto, _ = sorted_candidates[-1] + print(f'SCCL: Plan for {description} is {desc} with {proto} protocol.') + if len(results) > 0 and plan == results[-1][0] and isizes[0] == results[-1][1][1] + 1 and proto == results[-1][1][2]: + results[-1][1][1] = isizes[1] + else: + results.append((plan, [isizes[0], isizes[1], proto])) + return results def _candidate_sort_key(candidate): - _, _, _, _, priority = candidate + _, _, _, _, _, priority = candidate return priority diff --git a/sccl/autosynth/a100_plans.py b/sccl/autosynth/a100_plans.py index ce3502f..01bffd2 100644 --- a/sccl/autosynth/a100_plans.py +++ b/sccl/autosynth/a100_plans.py @@ -5,7 +5,7 @@ def register_a100_plans(): @register_synthesis_plan('alltoall', 'a100', machines=lambda x: x == 9) - def synthesize_a100_hierarchical_alltoall(machines, size): + def synthesize_a100_hierarchical_alltoall(machines): xml = "" nnodes = 9 assert(machines == nnodes) diff --git a/sccl/autosynth/dgx1_plans.py b/sccl/autosynth/dgx1_plans.py index 9793853..9d21cae 100644 --- a/sccl/autosynth/dgx1_plans.py +++ b/sccl/autosynth/dgx1_plans.py @@ -11,7 +11,7 @@ def register_dgx1_plans(): @register_synthesis_plan('alltoall', 'dgx1', machines=lambda x: x >= 2) - def synthesize_dgx1_relay_alltoall(machines, size): + def synthesize_dgx1_relay_alltoall(machines): gather_coll = gather(8, 0) scatter_coll = scatter(8, 1) gather_algo = solve_least_steps(dgx1(), gather_coll) diff --git a/sccl/autosynth/registry.py b/sccl/autosynth/registry.py index 93f77ce..b0842d1 100644 --- a/sccl/autosynth/registry.py +++ b/sccl/autosynth/registry.py @@ -9,33 +9,24 @@ import humanfriendly # The plans are keyed by (collective, machine_type) and each entry is a tuple -# (name, function, machines, size_ranges, priority). +# (name, function, machines, size_range, protocol, priority). 
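# For illustration, a registered entry might look like (hypothetical values):
# ('call synthesize_foo', <wrapped function>, lambda x: x == 2, (1024, math.inf), 'Simple', 0)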
synthesis_plans = defaultdict(list) -def _register_ef_provider(desc, fun, collective, machine_type, machines, sizes, priority): - # Parse size_ranges - size_ranges = [] - - def parse_sizes(x): - lower, upper = x +def _register_ef_provider(desc, fun, collective, machine_type, machines, sizes, protocol, priority): + if sizes == None: + sizes == (0, math.inf) + else: + lower, upper = sizes if isinstance(lower, str): lower = humanfriendly.parse_size(lower) if isinstance(upper, str): upper = humanfriendly.parse_size(upper) if upper == None: upper = math.inf - return (lower, upper) - - if sizes == None: - size_ranges.append((0, math.inf)) - elif isinstance(sizes, list): - for x in sizes: - size_ranges.append(parse_sizes(x)) - else: - size_ranges.append(parse_sizes(sizes)) + sizes = (lower, upper) # Register entries under all keys that might trigger this plan - entry = (desc, fun, machines, size_ranges, priority) + entry = (desc, fun, machines, sizes, protocol, priority) if isinstance(machine_type, list): for mtype in machine_type: synthesis_plans[(collective, mtype)].append(entry) @@ -43,24 +34,24 @@ def parse_sizes(x): synthesis_plans[(collective, machine_type)].append(entry) -def register_ef_file(path, collective, machine_type, num_machines, sizes=None, priority=1): - def provide_ef_path(machines, size): +def register_ef_file(path, collective, machine_type, num_machines, sizes=None, protocol='Simple', priority=0): + def provide_ef_path(machines): return path _register_ef_provider(f'load {path}', provide_ef_path, collective, - machine_type, lambda x: x == num_machines, sizes, priority) + machine_type, lambda x: x == num_machines, sizes, protocol, priority) -def register_synthesis_plan(collective, machine_type, machines=lambda x: True, sizes=None, priority=0): +def register_synthesis_plan(collective, machine_type, machines=lambda x: True, sizes=None, protocol='Simple', priority=0): def decorator(fun): - def wrapped(machines, size): - ef = fun(machines, size) + def wrapped(machines): + ef = fun(machines) fd, path = tempfile.mkstemp() with os.fdopen(fd, 'w') as f: f.write(ef) atexit.register(os.remove, path) return path _register_ef_provider(f'call {fun.__name__}', wrapped, collective, - machine_type, machines, sizes, priority) + machine_type, machines, sizes, protocol, priority) # Return the original function to not break other usage return fun return decorator From 339e66fd6b88e174031aa2c2016c2a791c52688c Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 24 Sep 2021 14:43:24 -0700 Subject: [PATCH 062/135] Fix support for giving a specific size The algorithm overlapping that size will still be used for its entire advantageous range. 
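To make the fixed behavior concrete, here is a minimal sketch of the overlap rule (the helper name overlaps is not from the source; intervals are the inclusive (min, max) pairs used at this point in the series):

    import math

    def overlaps(csizes, sizes):
        # A candidate plan is considered iff its registered size range intersects
        # the user-given range; a selected plan then covers its whole interval.
        return not (csizes[0] > sizes[1] or sizes[0] > csizes[1])

    # A specific size s is expanded to the point interval (s, s), so a plan
    # registered for (0, math.inf) still gets picked and used for its full range.
    assert overlaps((0, math.inf), (2000000, 2000000))
    assert not overlaps((0, 1024), (2000000, 2000000))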
--- sccl/autosynth/__init__.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index fbb0fb0..52f592c 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -9,6 +9,7 @@ import subprocess import fcntl import os +import math import tempfile import humanfriendly @@ -29,6 +30,8 @@ def init(num_machines, machine_type, *collectives): lower = humanfriendly.parse_size(lower) if isinstance(upper, str): upper = humanfriendly.parse_size(upper) + if upper == None: + upper = math.inf sizes = (lower, upper) else: if isinstance(sizes, str): @@ -52,8 +55,10 @@ def init(num_machines, machine_type, *collectives): load_elem = ET.SubElement(algos_elem, 'load') load_elem.set('path', path) minsize, maxsize, proto = params - load_elem.set('minsize', str(minsize)) - load_elem.set('maxsize', str(maxsize+1)) + if minsize != 0: + load_elem.set('minsize', str(minsize)) + if maxsize != math.inf: + load_elem.set('maxsize', str(maxsize+1)) load_elem.set('proto', proto) ET.indent(algos_elem, space=' ') @@ -70,10 +75,13 @@ def init(num_machines, machine_type, *collectives): def _select_plans(name, candidates, num_machines, sizes): - candidate_intervals = [(sizes, [])] + candidate_intervals = [((0, math.inf), [])] valid_candidates = list(filter(lambda x: x[2](num_machines), candidates)) for candidate in valid_candidates: csizes = candidate[3] + # Skip candidate if it does not overlap with user provided sizes + if csizes[0] > sizes[1] or sizes[0] > csizes[1]: + continue i = 0 while i < len(candidate_intervals): ival = candidate_intervals[i] @@ -100,6 +108,9 @@ def _select_plans(name, candidates, num_machines, sizes): break results = [] for isizes, candidates in candidate_intervals: + # Skip interval if it does not overlap with user provided sizes + if isizes[0] > sizes[1] or sizes[0] > isizes[1]: + continue sorted_candidates = sorted(candidates, key=_candidate_sort_key) description = f'{name} with sizes from {humanfriendly.format_size(isizes[0])} to {humanfriendly.format_size(isizes[1])}' if len(sorted_candidates) == 0: From 69ceb74f6f541ffbb2fe10e28ad52b35377e6805 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 24 Sep 2021 14:47:17 -0700 Subject: [PATCH 063/135] Improve printing of infinite sizes --- sccl/autosynth/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 52f592c..4618049 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -74,6 +74,13 @@ def init(num_machines, machine_type, *collectives): print(f'SCCL: No algorithms were selected.') +def _format_size(size): + if size != math.inf: + return humanfriendly.format_size(size) + else: + return 'infinity' + + def _select_plans(name, candidates, num_machines, sizes): candidate_intervals = [((0, math.inf), [])] valid_candidates = list(filter(lambda x: x[2](num_machines), candidates)) @@ -112,7 +119,7 @@ def _select_plans(name, candidates, num_machines, sizes): if isizes[0] > sizes[1] or sizes[0] > isizes[1]: continue sorted_candidates = sorted(candidates, key=_candidate_sort_key) - description = f'{name} with sizes from {humanfriendly.format_size(isizes[0])} to {humanfriendly.format_size(isizes[1])}' + description = f'{name} with sizes from {_format_size(isizes[0])} to {_format_size(isizes[1])}' if len(sorted_candidates) == 0: print(f'SCCL: No plan found for {description}. 
Falling back to NCCL baseline.') else: From 69e9ac9169d1ed8aeeb458f440e1e7fa9443bacc Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 29 Sep 2021 12:56:31 -0700 Subject: [PATCH 064/135] Bugfix concerning infinite sizes --- sccl/autosynth/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 4618049..de6b354 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -113,6 +113,8 @@ def _select_plans(name, candidates, num_machines, sizes): csizes = (isizes[1]+1,csizes[1]) if csizes[0] > csizes[1]: break + if csizes[0] == math.inf: + break results = [] for isizes, candidates in candidate_intervals: # Skip interval if it does not overlap with user provided sizes From 4627fd10193871c08d63ea14236617b616acb00e Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 29 Sep 2021 13:21:24 -0700 Subject: [PATCH 065/135] Add new attributes to ncclize XML --- sccl/collectives.py | 15 ++++++++------- sccl/ncclize.py | 2 ++ sccl/serialization.py | 3 ++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/sccl/collectives.py b/sccl/collectives.py index 347ec23..d49b457 100644 --- a/sccl/collectives.py +++ b/sccl/collectives.py @@ -11,12 +11,13 @@ class Chunk: address: int class Collective: - def __init__(self, name, num_nodes, chunks, triggers = {}): + def __init__(self, name, num_nodes, chunks, triggers = {}, runtime_name= 'custom'): self.name = name self.num_nodes = num_nodes self.num_chunks = len(chunks) self._chunks = chunks self._triggers = triggers + self.runtime_name = runtime_name self.is_combining = False addresses_seen = set() @@ -67,7 +68,7 @@ def remap(addr, i): name = f'{self.name},chunks={div}' return Collective(name, self.num_nodes, new_chunks) -def build_collective(name, num_nodes, num_chunks, precondition, postcondition, address = lambda c: c, trigger = lambda r, c: None): +def build_collective(name, num_nodes, num_chunks, precondition, postcondition, address = lambda c: c, trigger = lambda r, c: None, runtime_name = 'custom'): chunks = [] for chunk in range(num_chunks): chunk_precondition = set(rank for rank in range(num_nodes) if precondition(rank, chunk)) @@ -75,7 +76,7 @@ def build_collective(name, num_nodes, num_chunks, precondition, postcondition, a chunk_address = address(chunk) chunks.append(Chunk(chunk_precondition, chunk_postcondition, chunk_address)) triggers = {(rank, chunk): trigger(rank, chunk) for rank in range(num_nodes) for chunk in range(num_chunks) if trigger(rank, chunk) != None} - return Collective(name, num_nodes, chunks, triggers) + return Collective(name, num_nodes, chunks, triggers, runtime_name) # Common pre- and postconditions def _scattered(num_nodes, chunks = 1): @@ -108,10 +109,10 @@ def gather(num_nodes, root): return build_collective(f'Gather(n={num_nodes},root={root})', num_nodes, num_nodes, _scattered(num_nodes), _root(root)) def allgather(num_nodes): - return build_collective(f'Allgather(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), _all) + return build_collective(f'Allgather(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), _all, runtime_name='allgather') def alltoall(num_nodes): - return build_collective(f'Alltoall(n={num_nodes})', num_nodes, num_nodes * num_nodes, _scattered(num_nodes), _transpose(num_nodes)) + return build_collective(f'Alltoall(n={num_nodes})', num_nodes, num_nodes * num_nodes, _scattered(num_nodes), _transpose(num_nodes), runtime_name='alltoall') # Combining collectives @@ -125,10 +126,10 @@ def 
reduce(num_nodes, root): return build_collective(f'Reduce(n={num_nodes},root={root})', num_nodes, num_nodes, _scattered(num_nodes), _root(root), _single_scattered(num_nodes)) def allreduce(num_nodes): - return build_collective(f'Allreduce(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), _all, _single_scattered(num_nodes)) + return build_collective(f'Allreduce(n={num_nodes})', num_nodes, num_nodes, _scattered(num_nodes), _all, _single_scattered(num_nodes), runtime_name='allreduce') def reduce_scatter(num_nodes): - return build_collective(f'ReduceScatter(n={num_nodes})', num_nodes, num_nodes * num_nodes, _scattered(num_nodes), _transpose(num_nodes), _single_scattered(num_nodes)) + return build_collective(f'ReduceScatter(n={num_nodes})', num_nodes, num_nodes * num_nodes, _scattered(num_nodes), _transpose(num_nodes), _single_scattered(num_nodes), runtime_name='reduce_scatter') def scan(num_nodes): def postcondition(rank, chunk): diff --git a/sccl/ncclize.py b/sccl/ncclize.py index 85fc4d6..59b69ed 100644 --- a/sccl/ncclize.py +++ b/sccl/ncclize.py @@ -564,6 +564,8 @@ def expand_mappings(mappings): algorithm.nchannels = nchannels algo_elem.set('nchannels', str(nchannels)) algo_elem.set('ngpus', str(len(gpus))) + algo_elem.set('inplace', '0') + algo_elem.set('coll', algorithm.collective.runtime_name) if old_format: algo_elem.set('nchunksperloop', str(max(max(gpu.input_chunks, gpu.output_chunks) for gpu in gpus.values()))) for rank, gpu in gpus.items(): diff --git a/sccl/serialization.py b/sccl/serialization.py index b44d137..4480517 100644 --- a/sccl/serialization.py +++ b/sccl/serialization.py @@ -21,7 +21,7 @@ def _sccl_object_hook(o): return Step(o['rounds'], sends) if o['sccl_type'] == 'collective': triggers = { (int(r), int(c)): v for r, rmap in o['triggers'].items() for c, v in rmap.items() } - return Collective(o['name'], o['nodes'], o['chunks'], triggers) + return Collective(o['name'], o['nodes'], o['chunks'], triggers, o['runtime_name']) if o['sccl_type'] == 'chunk': pre = set(o['pre']) post = set(o['post']) @@ -71,6 +71,7 @@ def default(self, o): 'nodes': o.num_nodes, 'chunks': o._chunks, 'triggers': triggers, + 'runtime_name': o.runtime_name, } if isinstance(o, Chunk): return { From c0e4c43807e31fd2142e1bd3d656e2d0461f64ea Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 29 Sep 2021 14:23:50 -0700 Subject: [PATCH 066/135] Update SCCL config attribute names --- sccl/autosynth/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index de6b354..b8ab7d4 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -56,9 +56,9 @@ def init(num_machines, machine_type, *collectives): load_elem.set('path', path) minsize, maxsize, proto = params if minsize != 0: - load_elem.set('minsize', str(minsize)) + load_elem.set('minbytes', str(minsize)) if maxsize != math.inf: - load_elem.set('maxsize', str(maxsize+1)) + load_elem.set('maxbytes', str(maxsize+1)) load_elem.set('proto', proto) ET.indent(algos_elem, space=' ') From b0061f462d635a82ae7a71ffd46f89285a0e4218 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 29 Sep 2021 14:46:30 -0700 Subject: [PATCH 067/135] Bugfix --- sccl/autosynth/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sccl/autosynth/registry.py b/sccl/autosynth/registry.py index b0842d1..2a3f70e 100644 --- a/sccl/autosynth/registry.py +++ b/sccl/autosynth/registry.py @@ -15,7 +15,7 @@ def _register_ef_provider(desc, fun, 
collective, machine_type, machines, sizes, protocol, priority): if sizes == None: - sizes == (0, math.inf) + sizes = (0, math.inf) else: lower, upper = sizes if isinstance(lower, str): From 8cab0d4ff1ef17c88c2576c82c28844acf09e8b3 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 30 Sep 2021 10:48:12 -0700 Subject: [PATCH 068/135] Add sccl.init example and rename machines dgx1 and a100 are now more accurately named ndv2 and ndv4. --- examples/requirements_sccl_init.txt | 1 + examples/sccl_init.py | 57 +++++++++++++++++++ sccl/autosynth/__init__.py | 8 +-- .../{dgx1_plans.py => ndv2_plans.py} | 6 +- .../{a100_plans.py => ndv4_plans.py} | 6 +- 5 files changed, 68 insertions(+), 10 deletions(-) create mode 100644 examples/requirements_sccl_init.txt create mode 100644 examples/sccl_init.py rename sccl/autosynth/{dgx1_plans.py => ndv2_plans.py} (84%) rename sccl/autosynth/{a100_plans.py => ndv4_plans.py} (98%) diff --git a/examples/requirements_sccl_init.txt b/examples/requirements_sccl_init.txt new file mode 100644 index 0000000..c26d6b4 --- /dev/null +++ b/examples/requirements_sccl_init.txt @@ -0,0 +1 @@ +git+https://github.com/parasailteam/sccl-presynth \ No newline at end of file diff --git a/examples/sccl_init.py b/examples/sccl_init.py new file mode 100644 index 0000000..66beca2 --- /dev/null +++ b/examples/sccl_init.py @@ -0,0 +1,57 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import os + +def show(): + print() + print(f"SCCL_CONFIG = {os.environ['SCCL_CONFIG']}") + print(f"NCCL_MIN_NCHANNELS = {os.environ['NCCL_MIN_NCHANNELS']}") + print(f"NCCL_NET_SHARED_BUFFERS = {os.environ['NCCL_NET_SHARED_BUFFERS']}") + print(f"Contents of {os.environ['SCCL_CONFIG']}:") + with open(os.environ['SCCL_CONFIG']) as f: + print(f.read()) + print() + + +print('=== Trigger a builtin synthesis plan ===') + +import sccl +sccl.init(9, 'ndv4', ('alltoall', '1GB')) + +show() + + +print('=== Register additional plans from a library ===') + +import sccl_presynth +sccl.init(3, 'ndv2', + ('alltoall', '1GB'), + ('allgather', (128, '1KB'))) + +show() + + +print('=== Register custom plans ===') + +from sccl.autosynth.registry import register_synthesis_plan + +@register_synthesis_plan('alltoall', 'ndv9000', lambda m: m == 1, ('1MB', None)) +def alltoall_9000(machines): + return """ + ... 
+ """ + +sccl.init(1, 'ndv9000', ('alltoall', '2MB')) + +show() + + +print('=== Overlapping size ranges ===') + +register_synthesis_plan('alltoall', 'ndv9000', lambda m: m == 1, (0, '1KB'), protocol='LL')(alltoall_9000) +register_synthesis_plan('alltoall', 'ndv9000', lambda m: m == 1, ('1KB', '1MB'), protocol='LL128')(alltoall_9000) + +sccl.init(1, 'ndv9000', ('alltoall', ('2KB', None))) + +show() \ No newline at end of file diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index b8ab7d4..cce1ac0 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -13,10 +13,10 @@ import tempfile import humanfriendly -from sccl.autosynth.dgx1_plans import register_dgx1_plans -from sccl.autosynth.a100_plans import register_a100_plans -register_dgx1_plans() -register_a100_plans() +from sccl.autosynth.ndv2_plans import register_ndv2_plans +from sccl.autosynth.ndv4_plans import register_ndv4_plans +register_ndv2_plans() +register_ndv4_plans() def init(num_machines, machine_type, *collectives): diff --git a/sccl/autosynth/dgx1_plans.py b/sccl/autosynth/ndv2_plans.py similarity index 84% rename from sccl/autosynth/dgx1_plans.py rename to sccl/autosynth/ndv2_plans.py index 9d21cae..4e257c3 100644 --- a/sccl/autosynth/dgx1_plans.py +++ b/sccl/autosynth/ndv2_plans.py @@ -9,9 +9,9 @@ from sccl.ncclize import ncclize -def register_dgx1_plans(): - @register_synthesis_plan('alltoall', 'dgx1', machines=lambda x: x >= 2) - def synthesize_dgx1_relay_alltoall(machines): +def register_ndv2_plans(): + @register_synthesis_plan('alltoall', 'ndv2', machines=lambda x: x >= 2) + def synthesize_ndv2_relay_alltoall(machines): gather_coll = gather(8, 0) scatter_coll = scatter(8, 1) gather_algo = solve_least_steps(dgx1(), gather_coll) diff --git a/sccl/autosynth/a100_plans.py b/sccl/autosynth/ndv4_plans.py similarity index 98% rename from sccl/autosynth/a100_plans.py rename to sccl/autosynth/ndv4_plans.py index 01bffd2..efe10a1 100644 --- a/sccl/autosynth/a100_plans.py +++ b/sccl/autosynth/ndv4_plans.py @@ -3,9 +3,9 @@ from sccl.autosynth.registry import register_synthesis_plan -def register_a100_plans(): - @register_synthesis_plan('alltoall', 'a100', machines=lambda x: x == 9) - def synthesize_a100_hierarchical_alltoall(machines): +def register_ndv4_plans(): + @register_synthesis_plan('alltoall', 'ndv4', machines=lambda x: x == 9) + def synthesize_ndv4_hierarchical_alltoall(machines): xml = "" nnodes = 9 assert(machines == nnodes) From 6f02cf303ceada9edc0792362512f86419f8b451 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 30 Sep 2021 11:15:11 -0700 Subject: [PATCH 069/135] Make autosynth intervals [min,max) --- sccl/autosynth/__init__.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index cce1ac0..660d056 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -36,7 +36,7 @@ def init(num_machines, machine_type, *collectives): else: if isinstance(sizes, str): sizes = humanfriendly.parse_size(sizes) - sizes = (sizes, sizes) + sizes = (sizes, sizes+1) candidates = synthesis_plans[(name, machine_type)] selected_plans[name] = _select_plans(name, candidates, num_machines, sizes) @@ -58,7 +58,7 @@ def init(num_machines, machine_type, *collectives): if minsize != 0: load_elem.set('minbytes', str(minsize)) if maxsize != math.inf: - load_elem.set('maxbytes', str(maxsize+1)) + load_elem.set('maxbytes', str(maxsize)) load_elem.set('proto', proto) ET.indent(algos_elem, space=' 
') @@ -87,38 +87,38 @@ def _select_plans(name, candidates, num_machines, sizes): for candidate in valid_candidates: csizes = candidate[3] # Skip candidate if it does not overlap with user provided sizes - if csizes[0] > sizes[1] or sizes[0] > csizes[1]: + if csizes[0] >= sizes[1] or sizes[0] >= csizes[1]: continue i = 0 while i < len(candidate_intervals): ival = candidate_intervals[i] isizes = ival[0] - if isizes[1] < csizes[0]: + if isizes[1] <= csizes[0]: i += 1 continue - if isizes[0] > csizes[1]: + if isizes[0] >= csizes[1]: break if isizes[0] < csizes[0]: del candidate_intervals[i] candidate_intervals.insert(i, ((csizes[0], isizes[1]), ival[1])) - candidate_intervals.insert(i, ((isizes[0], csizes[0]-1), ival[1].copy())) + candidate_intervals.insert(i, ((isizes[0], csizes[0]), ival[1].copy())) i += 1 continue if isizes[1] > csizes [1]: del candidate_intervals[i] - candidate_intervals.insert(i, ((csizes[1]+1, isizes[1]), ival[1])) + candidate_intervals.insert(i, ((csizes[1], isizes[1]), ival[1])) candidate_intervals.insert(i, ((isizes[0], csizes[1]), ival[1] + [candidate])) break ival[1].append(candidate) - csizes = (isizes[1]+1,csizes[1]) - if csizes[0] > csizes[1]: + csizes = (isizes[1],csizes[1]) + if csizes[0] >= csizes[1]: break if csizes[0] == math.inf: break results = [] for isizes, candidates in candidate_intervals: # Skip interval if it does not overlap with user provided sizes - if isizes[0] > sizes[1] or sizes[0] > isizes[1]: + if isizes[0] >= sizes[1] or sizes[0] >= isizes[1]: continue sorted_candidates = sorted(candidates, key=_candidate_sort_key) description = f'{name} with sizes from {_format_size(isizes[0])} to {_format_size(isizes[1])}' From e16ff526df9270c0f99dad5cb1c7add10f459135 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 30 Sep 2021 11:18:48 -0700 Subject: [PATCH 070/135] Fix autosynth tests --- tests/test_autosynth.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_autosynth.py b/tests/test_autosynth.py index 3bceacc..51715d6 100644 --- a/tests/test_autosynth.py +++ b/tests/test_autosynth.py @@ -10,12 +10,12 @@ def test_sccl_init(capsys): sccl.init(4, 'not_a_machine_type', ('alltoall', 0)) out, err = capsys.readouterr() assert 'No plan found' in out - sccl.init(2, 'dgx1', ('alltoall', '1MB')) + sccl.init(2, 'ndv2', ('alltoall', '1MB')) out, err = capsys.readouterr() - assert 'synthesize_dgx1_relay_alltoall' in out - sccl.init(9, 'a100', ('alltoall', '1MB')) + assert 'synthesize_ndv2_relay_alltoall' in out + sccl.init(9, 'ndv4', ('alltoall', '1MB')) out, err = capsys.readouterr() - assert 'synthesize_a100_hierarchical_alltoall' in out + assert 'synthesize_ndv4_hierarchical_alltoall' in out def test_register_plan(): From 695bbd7689dbbafd39e5a8c4b2284371421649fa Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 5 Oct 2021 11:14:48 -0700 Subject: [PATCH 071/135] Various ncclize improvements (#6) * Do more sophisticated index sorting in ncclize. Now sets of addresses that result in larger counts when contiguously allocated are greedily allocated first. Simple scratch sorting is still the default. A CLI flag is available to use the greedy version. * Move logic for dependency filtering earlier This fixes a problem where nop ops with no dependencies were created. * Add support for cpy operations in ncclize Any addresses that are both input and output are now copied with the new instruction in threadblocks. 
The cpy operations hook into the dependency tracking of the main algorithm, which should minimize the synchronization they need. The greedy scratch allocation algorithm will also insert copies to make input/output addresses contiguous in scratch. * Overlapping in gather-scatter distributor Also improve greedy scratch sorting to work better with this. * Make ncclize channel allocation counts aware Now contiguous sends are split up as necessary to match concurrency. Also remove MaxConcurrency channel policy. * Get rid of --old-format The old format has become entrenched. * Make --use-scratch the default New option is --no-scratch --- sccl/autosynth/ndv2_plans.py | 2 +- sccl/cli/ncclize.py | 8 +- sccl/distributors/gather_scatter_alltoall.py | 18 +- sccl/ncclize.py | 387 +++++++++++-------- tests/test_cli.py | 2 +- 5 files changed, 239 insertions(+), 178 deletions(-) diff --git a/sccl/autosynth/ndv2_plans.py b/sccl/autosynth/ndv2_plans.py index 4e257c3..c7218ae 100644 --- a/sccl/autosynth/ndv2_plans.py +++ b/sccl/autosynth/ndv2_plans.py @@ -18,4 +18,4 @@ def synthesize_ndv2_relay_alltoall(machines): scatter_algo = solve_least_steps(dgx1(), scatter_coll) algo = synthesize_gather_scatter_distributed_alltoall( machines, gather_algo, scatter_algo) - return ncclize(algo, old_format=True, use_scratch=True, instances=8) \ No newline at end of file + return ncclize(algo, instances=8) \ No newline at end of file diff --git a/sccl/cli/ncclize.py b/sccl/cli/ncclize.py index 733a4d6..0a0cb7e 100644 --- a/sccl/cli/ncclize.py +++ b/sccl/cli/ncclize.py @@ -13,8 +13,8 @@ def make_handle_ncclize(cmd_parsers): remap_scratch_grp.add_argument('--no-remap-scratch', action='store_false', dest='remap_scratch', help='don\'t remap scratch buffer indices into free input/output indices') cmd.add_argument('--no-merge-contiguous', action='store_true', help='don\'t merge sends/receives from/to contiguous memory') cmd.add_argument('--no-pretty-print', action='store_true', help='don\'t pretty print the generated XML') - cmd.add_argument('--old-format', action='store_true', help='use the old format') - cmd.add_argument('--use-scratch', action='store_true', help='use the scratch buffer instead of extra space at the end of output buffer') + cmd.add_argument('--greedy-scratch-sorting', action='store_true', help='sort scratch buffer indices greedily to increase contiguous operations') + cmd.add_argument('--no-scratch', action='store_true', help='use extra space at the end of output buffer instead of the scratch buffer') cmd.add_argument('--channel-policy', type=ChannelPolicy, choices=list(ChannelPolicy), default=ChannelPolicy.MatchTopology, help='channel allocation policy') cmd.add_argument('--instances', type=int, default=1, help='number of interleaved instances of the algorithm to make') @@ -30,9 +30,9 @@ def handle(args, command): remap_scratch=args.remap_scratch, channel_policy=args.channel_policy, pretty_print=not args.no_pretty_print, - old_format=args.old_format, - use_scratch=args.use_scratch, + use_scratch=not args.no_scratch, merge_contiguous=not args.no_merge_contiguous, + greedy_scratch_sorting=args.greedy_scratch_sorting, instances=args.instances, logging=True) diff --git a/sccl/distributors/gather_scatter_alltoall.py b/sccl/distributors/gather_scatter_alltoall.py index 4ba688c..9cb3bc3 100644 --- a/sccl/distributors/gather_scatter_alltoall.py +++ b/sccl/distributors/gather_scatter_alltoall.py @@ -94,7 +94,9 @@ def nth_chunk_for_pair(src, dst, idx): steps = [] - for local_step in gather_algo.steps: + chunk_end = 
defaultdict(lambda: 0) + + for step_idx, local_step in enumerate(gather_algo.steps): sends = [] # Translate copies of the local Gather to the new space of ranks @@ -119,10 +121,13 @@ def to_dist(rank): sends.append((dist_chunk, to_dist(src), to_dist(dst))) assert to_dist(src) != to_dist(dst) + # Update the latest step this chunk was touched on + chunk_end[dist_chunk] = max(chunk_end[dist_chunk], step_idx+1) + steps.append(Step(local_step.rounds * nodes, sends)) # Perform transpose between local root nodes - transpose_sends = [] + transpose_sends = [[] for _ in range(len(gather_algo.steps) + 1)] for src in range(nodes): for dst in range(nodes): # Sends are needed for the chunks going from src to dst if they are in different copies or if the @@ -139,9 +144,14 @@ def to_dist(rank): # Calculate the local root ranks' global indices root_src = (src // local_nodes) * local_nodes + gather_root root_dst = (dst // local_nodes) * local_nodes + scatter_root - transpose_sends.append((chunk, root_src, root_dst)) + transpose_sends[chunk_end[chunk]].append((chunk, root_src, root_dst)) assert root_src != root_dst - steps.append(Step(chunks * local_nodes * local_nodes, transpose_sends)) + for i, sends in enumerate(transpose_sends): + if i < len(gather_algo.steps): + steps[i].sends.extend(sends) + steps[i].rounds = max(steps[i].rounds, chunks * local_nodes * local_nodes) + else: + steps.append(Step(chunks * local_nodes * local_nodes, sends)) #TODO: integrate into above if gather_root != scatter_root and local_topology.link(gather_root, scatter_root) == 0: diff --git a/sccl/ncclize.py b/sccl/ncclize.py index 59b69ed..f1016ad 100644 --- a/sccl/ncclize.py +++ b/sccl/ncclize.py @@ -5,19 +5,20 @@ from collections import defaultdict from dataclasses import dataclass, field, replace import math -import threading, queue +import threading, queue, itertools, bisect from enum import Enum from z3 import * @dataclass class _Gpu: - copies: list + precopies: list + postcopies: list inputs: dict outputs: dict input_chunks: int output_chunks: int scratch: dict = field(default_factory=dict) - threadbloks: list = field(default_factory=list) + threadblocks: list = field(default_factory=list) def scratch_size(self): return max((idx for addr, idx in self.scratch.items()), default=-1) + 1 @@ -32,11 +33,6 @@ class _Threadblock: # The steps may expand into multiple operations here ops: list = field(default_factory=list) -@dataclass -class _Copy: - input_offset: int - output_offset: int - @dataclass class _Op: gpu: int @@ -67,14 +63,11 @@ def _analyze_liveness(gpus, algorithm): output_livenesses = {rank: [[(math.inf,math.inf)] for _ in range(gpu.output_chunks)] for rank, gpu in gpus.items()} scratch_livenesses = {rank: [[(math.inf,-1)] for addr, idx in gpu.scratch.items()] for rank, gpu in gpus.items()} - # For copies reserve the index in the output buffer from the very beginning - for rank, gpu in gpus.items(): - for copy in gpu.copies: - output_livenesses[rank][copy.output_offset] = [(-1,math.inf)] - def update_liveness(rank, addr, step_idx): gpu = gpus[rank] # Find the relevant buffer and livenesses for the address + # Addresses in both input and output are treated as input (as currently postcopies are inserted). + # TODO: This is a bit dangerous, as changing the other bit of code to do precopies would silently break this. 
if addr in gpu.inputs: buffer = gpu.inputs liveness = input_livenesses[rank] @@ -211,86 +204,86 @@ def optimize(q): max_scratch_overhead = max(gpu.scratch_size() / (gpu.input_chunks + gpu.output_chunks) for gpu in gpus.values()) print(f'Maximum scratch overhead is {max_scratch_overhead * 100:.0f}%') -def _allocate_channels_max_concurrency(op_sets, logging): - # This function solves a coloring problem to ops to a minimal set of channels - ctx = Context() - - def chan(idx): - return Int(f'chan_{idx}', ctx=ctx) - max_channels = Int('max_channels', ctx=ctx) - - constraints = [] - - # Add basic constraints and find conflicting sets of operations - conflict_groups = defaultdict(set) - for idx, op_set in enumerate(op_sets): - for op in op_set: - # Two operations conflict if they use the same src-dst edge on the same step - conflict_groups[(op.gpu, op.is_send, op.peer, op.step)].add(idx) - constraints.append(chan(idx) >= 0) - constraints.append(chan(idx) < max_channels) - - # Require channels within the conflict groups to be disjoint - for grp in conflict_groups.values(): - constraints.append(Distinct([chan(idx) for idx in grp])) - - opt = Optimize(ctx=ctx) - opt.add(constraints) - opt.minimize(max_channels) - - t = threading.Thread(target=opt.check) - t.start() - t.join(1) - main_ctx().interrupt() - t.join() - - try: - model = opt.model() - except Z3Exception: - # TODO: This altenate process does not guarantee that channels are contiguous - s = Solver(ctx=ctx) - s.add(constraints) - s.check() - model = s.model() - - if logging: - print(f'Using up to {model[max_channels].as_long()} channels') - - # Group the operations by which channels they use - ops_by_channel = defaultdict(list) - for idx, op_set in enumerate(op_sets): - ops = ops_by_channel[model[chan(idx)].as_long()] - ops.extend(op_set) - - return ops_by_channel - -def _allocate_channels_match_topology(op_sets, topology, instances, logging): - if len(topology.switches) > 0 and logging: - print('Warning: Switches in the topology are ignored for the channel policy MatchTopology.') - - ops_by_channel = defaultdict(list) - next_channel = defaultdict(lambda: 0) - for op_set in op_sets: - send = op_set[0] - assert send.op_type == 's' - src = send.gpu - dst = send.peer - ops_by_channel[next_channel[(src,dst)]].extend(op_set) - link = topology.link(src,dst) * instances - assert link > 0, 'Encountered send on non-existent link' - next_channel[(src,dst)] = (next_channel[(src,dst)] + 1) % link - - return ops_by_channel +def _greedy_scratch_sort(algorithm, gpus): + # Sort scratch mappings in an attempt to make more of them contiguous (this is of course a heuristic). + # The procedure first figures out the sets of addresses that would result in combined operations if + # the source and destination indices were contiguously allocated. These are then greedily allocated + # starting with the largest sets. Afterwards any remaining scratch mappings are allocated in order. 
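# A hypothetical illustration of the heuristic: if one step sends addresses
# {8, 9, 10} from the same src to the same dst, they form a cset of size 3;
# giving those addresses contiguous indices on both ranks lets the three sends
# later merge into a single cnt=3 operation, so larger csets are allocated first.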
+ tosort = { rank: set(gpu.scratch.keys()).union(gpu.inputs.keys()).union(gpu.outputs.keys()) for rank, gpu in gpus.items() } + csets = defaultdict(set) + for idx, step in enumerate(algorithm.steps): + for addr, src, dst in step.sends: + if addr in tosort[src] and addr in tosort[dst]: + csets[(idx, src, dst)].add(addr) + for gpu in gpus.values(): + gpu.scratch = {} + for key in sorted(csets, key=lambda x: len(csets[x]), reverse=True): + idx, src, dst = key + cset = csets[key] + + def contiguous_in(buffer): + if not cset.issubset(buffer.keys()): + return False + for i in range(1, len(addrs)): + if buffer[addrs[i]] != buffer[addrs[i-1]] + 1: + return False + return True + + # Check if either side is already contiguous + addrs = sorted(cset) + src_input_contig = contiguous_in(gpus[src].inputs) + skip_src = src_input_contig or contiguous_in(gpus[src].outputs) or contiguous_in(gpus[src].scratch) + dst_input_contig = contiguous_in(gpus[dst].inputs) + skip_dst = dst_input_contig or contiguous_in(gpus[dst].outputs) or contiguous_in(gpus[dst].scratch) + + if (cset.issubset(tosort[src]) or skip_src) and (cset.issubset(tosort[dst]) or skip_dst): + # Block these addresses from being sorted again on both GPUs + tosort[src].difference_update(cset) + tosort[dst].difference_update(cset) + + for addr in addrs: + def alloc(rank, skip, prefer_input): + gpu = gpus[rank] + if skip: + # If not allocating in scratch, check if we need to make a copy and do a precopy if that allows + # maintaining contiguity. + if addr in gpu.inputs and addr in gpu.outputs: + copy = _Op(rank, None, -1, False, 'cpy', 'i', gpu.inputs[addr], 'o', gpu.outputs[addr], 1, []) + if prefer_input: + gpu.postcopies.append(copy) + del gpu.outputs[addr] + else: + gpu.precopies.append(copy) + del gpu.inputs[addr] + else: + # Reallocate address in scratch and insert necessary copies for input/output addresses + gpu.scratch[addr] = len(gpu.scratch) + if addr in gpu.inputs: + gpu.precopies.append(_Op(src, None, -1, False, 'cpy', + 'i', gpu.inputs[addr], 's', gpu.scratch[addr], 1, [])) + del gpu.inputs[addr] + if addr in gpu.outputs: + gpu.postcopies.append(_Op(src, None, -1, False, 'cpy', + 's', gpu.scratch[addr], 'o', gpu.outputs[addr], 1, [])) + del gpu.outputs[addr] + alloc(src, skip_src, src_input_contig) + alloc(dst, skip_dst, dst_input_contig) + + # Allocate any remaining addresses that aren't already input or output + for rank in tosort: + gpu = gpus[rank] + for addr in sorted(tosort[rank]): + if not addr in gpu.inputs and not addr in gpu.outputs: + gpu.scratch[addr] = len(gpu.scratch) class ChannelPolicy(Enum): One = 'One' - MaxConcurrency = 'MaxConcurrency' MatchTopology = 'MatchTopology' def __str__(self): return self.value -def ncclize(algorithm, remap_scratch = None, channel_policy=ChannelPolicy.MatchTopology, pretty_print = True, old_format=False, use_scratch=False, merge_contiguous=True, instances=1, logging=False): +def ncclize(algorithm, remap_scratch = None, channel_policy=ChannelPolicy.MatchTopology, pretty_print = True, use_scratch=True, merge_contiguous=True, greedy_scratch_sorting=False, instances=1, logging=False): ''' Generate the XML format used by the NCCL SCCL backend. 
@@ -323,14 +316,9 @@ def ncclize(algorithm, remap_scratch = None, channel_policy=ChannelPolicy.MatchT if rank in algorithm.output_map: outputs.update({ addr: idx for idx, addr in enumerate(sorted(algorithm.output_map[rank])) }) inputs = {} - copies = [] if rank in algorithm.input_map: - for idx, addr in enumerate(sorted(algorithm.input_map[rank])): - if addr in outputs: - copies.append(_Copy(idx, outputs[addr])) - else: - inputs[addr] = idx - gpus[rank] = _Gpu(copies, inputs, outputs, len(inputs) + len(copies), len(outputs)) + inputs.update({ addr: idx for idx, addr in enumerate(sorted(algorithm.input_map[rank])) }) + gpus[rank] = _Gpu([], [], inputs, outputs, len(inputs), len(outputs)) # Create scratch buffer mappings if necessary def allocate_scratch(gpu, addr): @@ -342,14 +330,49 @@ def allocate_scratch(gpu, addr): allocate_scratch(gpus[src], addr) allocate_scratch(gpus[dst], addr) - # Analyze liveness of indices in buffers and remap scratch into input/output as possible if remap_scratch: + # Analyze liveness of indices in buffers and remap scratch into input/output as possible liveness = _analyze_liveness(gpus, algorithm) _remap_scratch_into_input_output(liveness, gpus, logging) + elif greedy_scratch_sorting: + _greedy_scratch_sort(algorithm, gpus) + else: + # Sort scratch mappings in an attempt to make more of them contiguous (this is of course a heuristic). + for gpu in gpus.values(): + gpu.scratch = { addr: idx for idx, addr in enumerate(sorted(gpu.scratch)) } - # Sort scratch mappings in an attemp to make more of them contiguous (this is of course a heuristic). - for gpu in gpus.values(): - gpu.scratch = { addr: idx for idx, addr in enumerate(sorted(gpu.scratch)) } + # Add any copies from input to output that weren't already added + for rank, gpu in gpus.items(): + for addr in gpu.inputs: + if addr in gpu.outputs: + gpu.postcopies.append(_Op(rank, None, -1, False, 'cpy', + 'i', gpu.inputs[addr], 'o', gpu.outputs[addr], 1, [])) + del gpu.outputs[addr] + + # Sort and combine contiguous copy operations + for rank, gpu in gpus.items(): + def combine_copies(copies): + copies.sort(key=lambda x: (x.src_buffer, x.dst_buffer, x.src_offset, x.dst_offset)) + i = 0 + while i < len(copies) - 1: + c1 = copies[i] + c2 = copies[i+1] + if (c1.src_buffer == c2.src_buffer and c1.dst_buffer == c2.dst_buffer and + c1.src_offset + c1.cnt == c2.src_offset and c1.dst_offset + c1.cnt == c2.dst_offset): + c1.cnt += c2.cnt + del copies[i+1] + else: + i += 1 + combine_copies(gpu.precopies) + combine_copies(gpu.postcopies) + + # Expand copies by instances if necessary + if instances > 1: + for rank, gpu in gpus.items(): + for copy in itertools.chain(gpu.precopies, gpu.postcopies): + copy.src_offset *= instances + copy.dst_offset *= instances + copy.cnt *= instances def get_buffer_and_offset(gpu, addr): # Map an address to one of the named buffers @@ -399,11 +422,19 @@ def make_interval(a,b): yield (srcbuff, srcoff, dstbuff, dstoff, 1) # Turn all steps of the algorithm into operations - op_sets = [] + ops_by_channel = defaultdict(list) # Track the latest op that wrote to each buffer index writers = defaultdict(list) # Track all the reads since the last write to each buffer index readers = defaultdict(list) + + # Initialize readers and writers for precopies + for rank, gpu in gpus.items(): + for op in gpu.precopies: + for i in range(op.cnt): + readers[(rank,op.src_buffer,op.src_offset+i)].append(op) + writers[(rank,op.dst_buffer,op.dst_offset+i)].append(op) + for step_idx, step in enumerate(algorithm.steps): 
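# For each step: collect the step's sends grouped by (src, dst), merge them
# into contiguous intervals split across channels, then emit paired send/recv
# ops whose dependencies come from the writers/readers tracking above.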
new_writers = defaultdict(list) new_readers = defaultdict(list) @@ -416,15 +447,56 @@ def make_interval(a,b): # Combine sends into intervals and create multiple instances if necessary sends = [] for (src, dst), addrs in grouped_sends.items(): - for src_buf, src_off, dst_buf, dst_off, cnt in make_intervals(src, dst, addrs): + intervals = list(make_intervals(src, dst, addrs)) + if channel_policy == ChannelPolicy.One: + num_chans = 1 + channeled_intervals = [ (src_buf, src_off, dst_buf, dst_off, cnt, 0) for src_buf, src_off, dst_buf, dst_off, cnt in intervals ] + elif channel_policy == ChannelPolicy.MatchTopology: + # Divide sends onto channels matching the topology (assume bw is ideal concurrency) + # Sends are split to balance channels if necessary + num_chans = algorithm.topology.link(src,dst) + channeled_intervals = [] + + intervals.sort(key=lambda x: x[4]) + counts = [x[4] for x in intervals] + total = sum(counts) + targets = [(total//num_chans) + (1 if i < (total%num_chans) else 0) for i in range(num_chans)] + + chan = 0 + while len(intervals) > 0: + if targets[chan] >= counts[-1]: + i = -1 + else: + i = bisect.bisect_left(counts, targets[chan]) + if i == len(counts) or counts[i] != targets[chan]: + i = -1 + src_buf, src_off, dst_buf, dst_off, cnt = intervals[i] + del intervals[i] + del counts[i] + if cnt > targets[chan]: + rem = cnt - targets[chan] + cnt = targets[chan] + j = bisect.bisect_left(counts, rem) + intervals.insert(j, (src_buf, src_off + cnt, dst_buf, dst_off + cnt, rem)) + counts.insert(j, rem) + + channeled_intervals.append((src_buf, src_off, dst_buf, dst_off, cnt, chan)) + targets[chan] -= cnt + assert targets[chan] >= 0 + if targets[chan] == 0: + chan += 1 + else: + assert False, 'Unhandled channel policy' + + for src_buf, src_off, dst_buf, dst_off, cnt, chan in channeled_intervals: for i in range(instances): new_src_off = src_off * instances + i * cnt new_dst_off = dst_off * instances + i * cnt - send = (src, dst, src_buf, new_src_off, dst_buf, new_dst_off, cnt) + send = (src, dst, src_buf, new_src_off, dst_buf, new_dst_off, cnt, chan * instances + i) sends.append(send) # Perform dependency tracking and create _Op instances - for src, dst, src_buf, src_off, dst_buf, dst_off, cnt in sends: + for src, dst, src_buf, src_off, dst_buf, dst_off, cnt, chan in sends: read_keys = [(src,src_buf,src_off+i) for i in range(cnt)] # A send must wait for the previous recv (if any) to finish send_depends = list(set(d for k in read_keys for d in writers[k])) @@ -436,7 +508,7 @@ def make_interval(a,b): send_op = _Op(src, dst, step_idx, True, 's', src_buf, src_off, dst_buf, dst_off, cnt, send_depends) recv_op = _Op(dst, src, step_idx, False, 'r', src_buf, src_off, dst_buf, dst_off, cnt, recv_depends) # Record the send and receive as a set of operations that must happen on the same channel - op_sets.append([send_op, recv_op]) + ops_by_channel[chan].extend([send_op, recv_op]) # Mark writers and readers to be added for the next step for k in write_keys: @@ -454,18 +526,18 @@ def make_interval(a,b): for key, deps in new_readers.items(): readers[key].extend(deps) + # Add dependencies for postcopies + for rank, gpu in gpus.items(): + for op in gpu.postcopies: + for i in range(op.cnt): + op.depends.extend(writers[(rank,op.src_buffer,op.src_offset+i)]) + op.depends.extend(readers[(rank,op.dst_buffer,op.dst_offset+i)]) + op.depends.extend(writers[(rank,op.dst_buffer,op.dst_offset+i)]) + # Fixup everything to match the instanced sends when multiple instances are generated if instances > 1: - for gpu 
in gpus.values(): - # Create instances copies of the copies. - new_copies = [] - for copy in gpu.copies: - for i in range(instances): - new_copy = _Copy(copy.input_offset * instances + i, copy.output_offset * instances + i) - new_copies.append(new_copy) - gpu.copies = new_copies - - # Multiply the other metadata with instances + for rank, gpu in gpus.items(): + # Multiply metadata with instances def expand_mappings(mappings): return { addr * instances + i: idx * instances + i for addr, idx in mappings.items() for i in range(instances) } gpu.inputs = expand_mappings(gpu.inputs) @@ -474,16 +546,6 @@ def expand_mappings(mappings): gpu.output_chunks *= instances gpu.scratch = expand_mappings(gpu.scratch) - # Allocate channels and group operations by channel - if channel_policy == ChannelPolicy.One: - ops_by_channel = {0: [op for op_set in op_sets for op in op_set]} - elif channel_policy == ChannelPolicy.MaxConcurrency: - ops_by_channel = _allocate_channels_max_concurrency(op_sets, logging) - elif channel_policy == ChannelPolicy.MatchTopology: - ops_by_channel = _allocate_channels_match_topology(op_sets, algorithm.topology, instances, logging) - else: - assert False, 'Unhandled channel policy' - # Group by which operations need to be in the same threadblock tb_groups = defaultdict(list) for chan, chan_ops in ops_by_channel.items(): @@ -524,11 +586,28 @@ def expand_mappings(mappings): for i, tb in enumerate(gpu.threadblocks): tb.rbid = i + # Add all copies into extra threadblocks + for rank, gpu in gpus.items(): + cpy_tb = _Threadblock(0) + cpy_tb.rbid = len(gpu.threadblocks) + cpy_tb.steps = gpu.precopies + gpu.postcopies + gpu.threadblocks.append(cpy_tb) + + # Filter out dependencies within the same threadblock and mark all ops that have a dependence on them + for rank, gpu in gpus.items(): + for tb in gpu.threadblocks: + for op in tb.steps: + op.block_rbid = tb.rbid + for rank, gpu in gpus.items(): + for tb in gpu.threadblocks: + for op in tb.steps: + op.depends = list(filter(lambda d: d.block_rbid != op.block_rbid, op.depends)) + for dep in op.depends: + dep.has_dependence = True + # Do some additional postprocessing of operations: # - Expand operations with extra dependencies with no-ops # - Mark the index of each operation taking any extra no-ops into account - # - Record the threadblock rbids for each operation - all_ops = [] for rank, gpu in gpus.items(): for tb in gpu.threadblocks: tb.steps.sort(key=lambda op: op.step) @@ -543,18 +622,6 @@ def expand_mappings(mappings): tb.ops[-1].idx = len(tb.ops) - 1 tb.ops.append(op) tb.ops[-1].idx = len(tb.ops) - 1 - for op in tb.ops: - op.block_rbid = tb.rbid - all_ops.extend(tb.ops) - - # Filter out dependencies within the same threadblock - for op in all_ops: - op.depends = list(filter(lambda d: d.block_rbid != op.block_rbid, op.depends)) - - # Mark all ops that have a dependence on them - for op in all_ops: - for dep in op.depends: - dep.has_dependence = True # Generate the XML structure algo_elem = ET.Element('algo') @@ -566,18 +633,13 @@ def expand_mappings(mappings): algo_elem.set('ngpus', str(len(gpus))) algo_elem.set('inplace', '0') algo_elem.set('coll', algorithm.collective.runtime_name) - if old_format: - algo_elem.set('nchunksperloop', str(max(max(gpu.input_chunks, gpu.output_chunks) for gpu in gpus.values()))) + algo_elem.set('nchunksperloop', str(max(max(gpu.input_chunks, gpu.output_chunks) for gpu in gpus.values()))) for rank, gpu in gpus.items(): gpu_elem = ET.SubElement(algo_elem, 'gpu') gpu_elem.set('id', str(rank)) 
gpu_elem.set('i_chunks', str(gpu.input_chunks)) gpu_elem.set('o_chunks', str(gpu.output_chunks)) gpu_elem.set('s_chunks', str(gpu.scratch_size())) - for copy in gpu.copies: - copy_elem = ET.SubElement(gpu_elem, 'copy') - copy_elem.set('i_off', str(copy.input_offset)) - copy_elem.set('o_off', str(copy.output_offset)) for tb in gpu.threadblocks: tb_elem = ET.SubElement(gpu_elem, 'tb') tb_elem.set('id', str(tb.rbid)) @@ -585,8 +647,8 @@ def expand_mappings(mappings): tb_elem.set('recv', str(tb.recv)) tb_elem.set('chan', str(tb.channel)) for op in tb.ops: - op_elem = ET.SubElement(tb_elem, 'op' if not old_format else 'step') - op_elem.set('step' if not old_format else 's', str(op.idx)) + op_elem = ET.SubElement(tb_elem, 'step') + op_elem.set('s', str(op.idx)) op_elem.set('type', op.op_type) # The NCCL backend currently wants scratch at the end of output @@ -598,40 +660,29 @@ def expand_mappings(mappings): op.dst_buffer = 'o' op.dst_offset += gpu.output_chunks - if old_format: - if op.src_buffer is not None: - op_elem.set('srcbuf', op.src_buffer) - op_elem.set('srcoff', str(op.src_offset)) - else: - op_elem.set('srcbuf', 'i') - op_elem.set('srcoff', '-1') - if op.dst_buffer is not None: - op_elem.set('dstbuf', op.dst_buffer) - op_elem.set('dstoff', str(op.dst_offset)) - else: - op_elem.set('dstbuf', 'o') - op_elem.set('dstoff', '-1') + if op.src_buffer is not None: + op_elem.set('srcbuf', op.src_buffer) + op_elem.set('srcoff', str(op.src_offset)) else: - if op.is_send: - if op.src_buffer is not None: - op_elem.set('buf', op.src_buffer) - op_elem.set('off', str(op.src_offset)) - else: - if op.dst_buffer is not None: - op_elem.set('buf', op.dst_buffer) - op_elem.set('off', str(op.dst_offset)) - if op.cnt > 1 or old_format: - op_elem.set('cnt', str(op.cnt)) + op_elem.set('srcbuf', 'i') + op_elem.set('srcoff', '-1') + if op.dst_buffer is not None: + op_elem.set('dstbuf', op.dst_buffer) + op_elem.set('dstoff', str(op.dst_offset)) + else: + op_elem.set('dstbuf', 'o') + op_elem.set('dstoff', '-1') + op_elem.set('cnt', str(op.cnt)) assert len(op.depends) <= 1 if len(op.depends) == 1: op_elem.set('depid', str(op.depends[0].block_rbid)) op_elem.set('deps', str(op.depends[0].idx)) - elif old_format: + else: op_elem.set('depid', '-1') op_elem.set('deps', '-1') if op.has_dependence: op_elem.set('hasdep', '1') - elif old_format: + else: op_elem.set('hasdep', '0') if pretty_print: diff --git a/tests/test_cli.py b/tests/test_cli.py index 6520733..6c72607 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -68,11 +68,11 @@ def test_ncclize(): assert 0 == os.system('sccl ncclize algo.json -o ncclized1.sccl.xml') assert os.path.exists('ncclized1.sccl.xml') assert 0 == os.system('sccl ncclize algo.json -f --channel-policy One') - assert 0 == os.system('sccl ncclize algo.json -f --channel-policy MaxConcurrency') assert 0 == os.system('sccl ncclize algo.json -f --channel-policy MatchTopology') assert 0 == os.system('sccl ncclize algo.json -f --no-merge-contiguous') assert 0 == os.system('sccl solve instance Star Alltoall --nodes 4 --steps 2 --rounds 4 -o algo_scratch.json') assert 0 == os.system('sccl ncclize algo_scratch.json -f --remap-scratch') + assert 0 == os.system('sccl ncclize algo_scratch.json -f --greedy-scratch-sorting') def test_custom_topology_and_collective(): with in_tempdir(): From b7f8579232af50cb99d868ee63b3f4f38f724460 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 7 Oct 2021 11:01:11 -0700 Subject: [PATCH 072/135] Changes to sccl.init from feedback (#8) * Switch first two 
arguments of sccl.init The number of machines might become optional in the future. * Add enum for collectives usable with sccl.init * Update sccl.init example to use Collective enum --- examples/sccl_init.py | 18 +++++++++--------- sccl/__init__.py | 1 + sccl/autosynth/__init__.py | 17 ++++++++++++++++- tests/test_autosynth.py | 6 +++--- 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/examples/sccl_init.py b/examples/sccl_init.py index 66beca2..f8cee84 100644 --- a/examples/sccl_init.py +++ b/examples/sccl_init.py @@ -17,7 +17,7 @@ def show(): print('=== Trigger a builtin synthesis plan ===') import sccl -sccl.init(9, 'ndv4', ('alltoall', '1GB')) +sccl.init('ndv4', 9, (sccl.Collective.alltoall, '1GB')) show() @@ -25,9 +25,9 @@ def show(): print('=== Register additional plans from a library ===') import sccl_presynth -sccl.init(3, 'ndv2', - ('alltoall', '1GB'), - ('allgather', (128, '1KB'))) +sccl.init('ndv2', 3, + (sccl.Collective.alltoall, '1GB'), + (sccl.Collective.allgather, (128, '1KB'))) show() @@ -36,22 +36,22 @@ def show(): from sccl.autosynth.registry import register_synthesis_plan -@register_synthesis_plan('alltoall', 'ndv9000', lambda m: m == 1, ('1MB', None)) +@register_synthesis_plan(sccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, ('1MB', None)) def alltoall_9000(machines): return """ ... """ -sccl.init(1, 'ndv9000', ('alltoall', '2MB')) +sccl.init('ndv9000', 1, (sccl.Collective.alltoall, '2MB')) show() print('=== Overlapping size ranges ===') -register_synthesis_plan('alltoall', 'ndv9000', lambda m: m == 1, (0, '1KB'), protocol='LL')(alltoall_9000) -register_synthesis_plan('alltoall', 'ndv9000', lambda m: m == 1, ('1KB', '1MB'), protocol='LL128')(alltoall_9000) +register_synthesis_plan(sccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, (0, '1KB'), protocol='LL')(alltoall_9000) +register_synthesis_plan(sccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, ('1KB', '1MB'), protocol='LL128')(alltoall_9000) -sccl.init(1, 'ndv9000', ('alltoall', ('2KB', None))) +sccl.init('ndv9000', 1, (sccl.Collective.alltoall, ('2KB', None))) show() \ No newline at end of file diff --git a/sccl/__init__.py b/sccl/__init__.py index 8627102..745d1ef 100644 --- a/sccl/__init__.py +++ b/sccl/__init__.py @@ -3,3 +3,4 @@ from sccl.autosynth import init from sccl.autosynth import ndv2_perm +from sccl.autosynth import Collective diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 660d056..e7f5d3f 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -12,6 +12,7 @@ import math import tempfile import humanfriendly +from enum import Enum from sccl.autosynth.ndv2_plans import register_ndv2_plans from sccl.autosynth.ndv4_plans import register_ndv4_plans @@ -19,11 +20,25 @@ register_ndv4_plans() -def init(num_machines, machine_type, *collectives): +class Collective(Enum): + allreduce = 'allreduce' + allgather = 'allgather' + reduce = 'reduce' + broadcast = 'broadcast' + alltoall = 'alltoall' + reduce_scatter = 'reduce_scatter' + + def __str__(self): + return self.value + + +def init(machine_type, num_machines, *collectives): # Collect and sort all plans that match the collectives and sizes given by the user. 
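    # Each entry of `collectives` is a (name, sizes) pair. A sketch of the accepted
    # forms, based on the parsing below and the examples in examples/sccl_init.py:
    #   (Collective.alltoall, '1GB')          - a single size the plan should handle
    #   ('allgather', ('1KB', '1MB'))         - a (lower, upper) size range
    #   (Collective.alltoall, ('1MB', None))  - None leaves the upper bound open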
selected_plans = {} for collective in collectives: name, sizes = collective + if isinstance(name, Collective): + name = str(name) if isinstance(sizes, tuple): lower, upper = sizes if isinstance(lower, str): diff --git a/tests/test_autosynth.py b/tests/test_autosynth.py index 51715d6..ec74f39 100644 --- a/tests/test_autosynth.py +++ b/tests/test_autosynth.py @@ -7,13 +7,13 @@ def test_sccl_init(capsys): - sccl.init(4, 'not_a_machine_type', ('alltoall', 0)) + sccl.init('not_a_machine_type', 4, ('alltoall', 0)) out, err = capsys.readouterr() assert 'No plan found' in out - sccl.init(2, 'ndv2', ('alltoall', '1MB')) + sccl.init('ndv2', 2, ('alltoall', '1MB')) out, err = capsys.readouterr() assert 'synthesize_ndv2_relay_alltoall' in out - sccl.init(9, 'ndv4', ('alltoall', '1MB')) + sccl.init('ndv4', 9, (sccl.Collective.alltoall, '1MB')) out, err = capsys.readouterr() assert 'synthesize_ndv4_hierarchical_alltoall' in out From 09a126d8c8e96d902a37b680c55a1511b6998e41 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 13 Oct 2021 15:32:09 -0700 Subject: [PATCH 073/135] Bump version to 2.3.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 828b8dc..7c76b1a 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='sccl', - version='2.2.0', + version='2.3.0', packages=find_packages(), entry_points={ 'console_scripts': [ From e2d566e89c2bd13038f809a85cc3633967a5d86f Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 20 Oct 2021 10:47:52 -0700 Subject: [PATCH 074/135] Fix ndv2_perm that I had improperly copy-pasted --- sccl/autosynth/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index e7f5d3f..fca057d 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from sccl.topologies import dgx1 +from sccl.topologies import dgx1, nvlink_only from sccl.isomorphisms import find_isomorphisms from sccl.autosynth.registry import synthesis_plans from lxml import etree as ET @@ -182,13 +182,13 @@ def _extract_min_channels(path): return None -def ndv2_perm(self): # pragma: no cover +def ndv2_perm(): # pragma: no cover # This function is used in a hacky way right now. The sccl_ndv2_launcher.sh # relies on the side effect of _select_isomorphism creating the lock file, # which is read by the script after calling this function, so the return # value does't currently get used. If you make changes, please fix or update # sccl_ndv2_launcher.sh accordingly. 
- isomorphisms = find_isomorphisms(dgx1(), self.local_topo) + isomorphisms = find_isomorphisms(dgx1(), nvlink_only()) if len(isomorphisms) != 4: raise RuntimeError( f'Expected to find 4 isomorphisms to DGX1 topology, but found {len(isomorphisms)}.') From 0c9c93cb5c44ef667d47e4eda008794bc7262da9 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 20 Oct 2021 16:17:43 -0700 Subject: [PATCH 075/135] Remove stray self parameter --- sccl/autosynth/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index fca057d..4fca5b6 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -195,7 +195,7 @@ def ndv2_perm(): # pragma: no cover return _select_isomorphism(isomorphisms) -def _select_isomorphism(self, isomorphisms, verbose=True): # pragma: no cover +def _select_isomorphism(isomorphisms, verbose=True): # pragma: no cover with open('/var/lock/sccl_autosynth_inspector_topo.lock', "a+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: From 2d1a34e45982471d3df32ad2974ba86657186517 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 3 Nov 2021 12:41:29 -0700 Subject: [PATCH 076/135] Update readme and split synthesis out --- README.md | 163 ++++++++++++++------------------------- SYNTHESIS.md | 102 ++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 101 deletions(-) create mode 100644 SYNTHESIS.md diff --git a/README.md b/README.md index 212a8c2..5fd0874 100644 --- a/README.md +++ b/README.md @@ -1,141 +1,102 @@ # SCCL -The Synthesized Collective Communication Library is a tool for synthesizing collective algorithms tailored to a particular hardware topology. +SCCL is a programmable GPU communication library that offers synthesis tools and a programming language, SCCLang, for +building collective algorithms tailored to particular hardware and workloads. ## Installation ### Python package and tool -To install: +### Python package and tool + +To install, either clone this repo and run `pip install .`, or run: ``` -pip install . +pip install git+https://github.com/microsoft/sccl.git ``` This installs the Python package and the `sccl` command line tool. -To enable Bash completion for `sccl`: +To enable Bash completion for the `sccl` tool: ``` echo 'eval "$(register-python-argcomplete sccl)"' >> ~/.bashrc ``` -## Synthesizing Algorithms +### Runtime -At its core SCCL answers synthesis queries is there an algorithm for a given *topology* that implements a given *collective* in a given number of steps, bandwidth usage, memory limits, etc. These additional parameters are called the *instance*. +SCCL's algorithms run in [a modified version of NCCL that includes an interpreter](https://github.com/microsoft/msccl), +which is API-compatible with NCCL and is installed as normal. See https://github.com/microsoft/msccl for instructions. -SCCL groups its solver strategies under the `sccl solve` subcommand. For example, to synthesize a specific `instance` of an Allgather algorithm for the [NVIDIA DGX-1](https://www.nvidia.com/en-us/data-center/dgx-1/) that completes in 4 steps: +To use SCCL with PyTorch, the built-in NCCL submodule has to be replaced with SCCL's version. Additionally, to expose +the new native Alltoall support that SCCL adds, PyTorch's `torch.distributed` package can optionally be patched. The +following commands perform these steps and install PyTorch with SCCL: ``` -$ sccl solve instance DGX1 Allgather --steps 4 -Solving instance steps=4... synthesized!
(0.7s) -Wrote to Allgather.n8-DGX1-steps4.sccl.json +git clone https://github.com/pytorch/pytorch.git +cd pytorch     +git checkout tags/v1.9.0 -b v1.9.0_sccl +perl -p -i -e  's/url = https:\/\/github\.com\/NVIDIA\/nccl/url = https:\/\/github\.com\/microsoft\/msccl/g' .gitmodules +git submodule sync third_party/nccl +git submodule update --init --recursive +git submodule update --init --recursive --remote third_party/nccl +git apply third_party/nccl/nccl/patches/nccl.cpp.patch +python setup.py install ``` -The instance is satisfiable and `sccl` saves it to a file. -Four steps is not necessarily the least number of steps required. To find the least steps: -``` -$ sccl solve least-steps DGX1 Allgather -Algorithms need at least 2 steps. -Solving instance steps=2... synthesized! (0.2s) -Wrote to Allgather.n8-DGX1-steps2.sccl.json -``` -The `least-steps` strategy statically determines that any Allgather in a DGX-1 requires at least 2 steps and starting from that finds the smallest satisfiable number of steps. +## Usage -While this two step algorithm is a latency-optimal one, there may be other algorithms that achieve higher bandwidth. The `pareto-optimal` strategy searches through different latency-bandwidth tradeoffs: -``` -$ sccl solve pareto-optimal DGX1 Allgather -Algorithms need at least 2 steps. -Algorithms need at least 7/6 rounds per chunk. -Solving instance steps=2... synthesized! (0.5s) -Solving instance steps=2,rounds=3,chunks=2... synthesized! (0.9s) -Solving instance steps=2,rounds=4,chunks=3... unsatisfiable. (1.1s) -Assuming 2 step algorithms need at least 4/3 rounds per chunk. -Solving instance steps=3,rounds=4,chunks=3... synthesized! (2.9s) -Solving instance steps=3,rounds=5,chunks=4... synthesized! (6.5s) -Solving instance steps=3,rounds=6,chunks=5... synthesized! (44.0s) -Solving instance steps=3,rounds=7,chunks=6... synthesized! (56.1s) -Bandwidth optimal algorithm found! -Found 2 Pareto optimal algorithms. Pruned 4 non-optimal algorithms. -Wrote to Allgather.n8-DGX1-steps2.rounds3.chunks2.sccl.json -Wrote to Allgather.n8-DGX1-steps3.rounds7.chunks6.sccl.json -``` +The SCCL Python package ships with a registry of synthesis strategies and hand optimized algorithms. These can be loaded +into [the runtime](https://github.com/parasailteam/msccl) through the `sccl.init` function, which must be called before +the application creates its NCCL communicator. For PyTorch this means before `torch.distributed` is initialized. -## Collectives - -SCCL includes a number of built in common collectives. - -| Collective | Arguments | Description | Kind | -| - | - | - | - | -| Broadcast | `--root N` | Send data from root to all nodes. | NC | -| Reduce | `--root N` | Combine data from all nodes to root. | CR | -| Scatter | `--root N` | Send slices of data from root to all nodes. | NC | -| Gather | `--root N` | Send slices of data from all nodes to root. | NC | -| Allgather | | Send slices of data from all nodes to all nodes. | NC | -| Allreduce | | Combine data from all nodes to all nodes. | CNR | -| Alltoall | | Transpose data between all nodes. | NC | -| ReduceScatter | | Combine slices of data to all nodes. | CR | -| Scan | | Combine partial prefixes of data to all nodes in sequence. | CNR | -| MultirootBroadcast | `--roots N [N ...]` | Like Broadcast, but set of nodes have slices of input. | NC | -| MultirootScatter | `--roots N [N ...]` | Like Scatter, but set of nodes have slices of input. 
| NC | -| MultirootGather | `--roots N [N ...]` | Like Gather, but output is sent in slices to a set of nodes. | NC | -| custom | `--collective-file` | Arbitrary collective serialized by the user. | ? | - -Custom collectives may be defined by instantiating the `Collective` class, which is easiest through the `build_collective` function. For example, a send from rank 2 to rank 7 in an 8 node topology can be defined and saved with: +The following snippet requests `sccl.init` to provide an Alltoall algorithm in a configuration of 2 Azure NDv2 machines: ``` -from sccl.collectives import build_collective -from sccl.serialization import save_sccl_object - -precondition = lambda r, c: r == 2 -postcondition = lambda r, c: r == 7 -coll = build_collective('Send', 8, 1, precondition, postcondition) -save_sccl_object(coll, 'send.json') +import sccl +sccl.init('ndv2', 2, (sccl.Collective.alltoall, ('1MB'))) ``` +The call will find an algorithm provider that can create an Alltoall algorithm that is expected to be good with 1MB of +data. That provider will call a synthesis routine that writes the algorithm to disk. `sccl.init` will then pass a configuration +file pointing to this algorithm to the runtime through environment variables. - -The *kind* of the collective determines support for some features of SCCL: -- **NC** are non-combining collectives, and are always supported. -- **CR** are combining collectives that have a non-combining dual collective, and are supported through a reduction. -- **CNR** are combining collectives with no dual, which may not always be supported. +See [the examples](examples/sccl_init.py) for more on `sccl.init` usage. -Currently the rounds per chunk analysis described below can not support CNR collectives. +Refer to the next section on availability of algorithms with `sccl.init`. -## Steps and Rounds +### Note on Azure NDv2 -SCCL uses two related concepts, *steps and rounds*, to talk about the running time of algorithms. *Steps* is how many sequential sets of sends the algorithm consists of, where all sends inside a step execute in parallel. The number of sends between two nodes in a single step is limited by the bandwidth available in the topology. However, a step may consist of multiple *rounds*, which acts as a multiplier for all links in the topology during that step. +Azure NDv2 does not expose the true PCIe topology of the machines to the VM and, worse, does not assign PCIe devices +consistently to the virtual paths in the VM. As SCCL is generating topology-aware algorithms, this device ordering must +be fixed. The [sccl_ndv2_launcher.sh](sccl/autosynth/sccl_ndv2_launcher.sh) script can be used to fix this problem. The +script solves the automorphisms from the local VM's NVLink topology to the reference topology and selects one of the 4 +automorphisms based on measured placement of the Infiniband card such that GPU 0 is close to the NIC. A tool called +[inspector-topo](https://github.com/microsoft/inspector-topo) needs to be available for the latter step. -How much data a single round corresponds to depends on what is the actual size of a chunk at runtime, and how many chunks a collective uses can change (e.g. you can control this directly in the `instance` strategy by setting `--chunks N`). Thus for each collective the total data usage of different algorithms implementing it can be measured with their *rounds per chunk*. ## Available Algorithms -SCCL provides a standalone analysis to find a lower bound for the *rounds per chunk* required by any instance.
For example, to find the least rouds per chunk for an Alltoall in a DGX-1: -``` -$ sccl analyze rounds DGX1 Gather -Gather(n=8,root=0) algorithms need at least 7/6 rounds in DGX1 topology. -``` -In this case the bound happens to be tight and the `pareto-optimal` strategy would use it to detect that it has found a bandwidth optimal algorithm. +SCCL's built-in algorithm providers currently include an efficient Alltoall algorithm for Azure NDv2 nodes. Stay tuned -## Distributed Algorithms +for more algorithms coming soon! -SCCL provides routines to synthesize algorithms for distributed topologies under the `sccl distribute` subcommand. These work by using algorithms for a local collective and stitcing instances of it together to create a distributed one. +https://github.com/parasailteam/sccl-presynth offers additional algorithms that have been pre-synthesized for fixed +configurations. To enable them, install the package and import it before the call to `sccl.init`. -**Alltoall from Gather and Scatter:** `alltoall-gather-scatter` combines a Gather and a Scatter algorithm with a transpose step in the middle to form a distributed Alltoall algorithm. For example, an Alltoall algorithm for a cluster of 4 DGX-1 machines can be created with: +## Synthesis -``` -sccl solve least-steps DGX1 Gather -o gather.json -sccl solve least-steps DGX1 Scatter -o scatter.json --root 1 -sccl distribute alltoall-gather-scatter gather.json scatter.json --copies 4 -o alltoall.json -``` -This distributor works with any Gather and Scatter algorithm, as long as their roots have a direct connection in the topology. SCCL also provides multi-root versions of Gather and Scatter that can be substituted here. +SCCL started out as a synthesizer for collective algorithms, and has since expanded to cover a broader range of +programmability. See [this readme](SYNTHESIS.md) for using SCCL as a synthesizer. ## Contributing -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. +This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License +Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For +details, visit https://cla.opensource.microsoft.com. -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. +When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate +the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only +need to do this once across all repos using our CLA. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). -For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or -contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
+For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact
+[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. ## Trademarks -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow -[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). -Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. -Any use of third-party trademarks or logos are subject to those third-party's policies. +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks +or logos is subject to and must follow [Microsoft's Trademark & Brand +Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft +trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any +use of third-party trademarks or logos are subject to those third-party's policies. diff --git a/SYNTHESIS.md b/SYNTHESIS.md new file mode 100644 index 0000000..9b1b2e6 --- /dev/null +++ b/SYNTHESIS.md @@ -0,0 +1,102 @@ +## Synthesizing Algorithms + +SCCL can synthesize an algorithm for a given *topology* that implements a given *collective* within a given number of steps, bandwidth usage, memory limits, etc. These additional parameters are called the *instance*. + +SCCL groups its solver strategies under the `sccl solve` subcommand. For example, to synthesize a specific `instance` of an Allgather algorithm for the [NVIDIA DGX-1](https://www.nvidia.com/en-us/data-center/dgx-1/) that completes in 4 steps: +``` +$ sccl solve instance DGX1 Allgather --steps 4 +Solving instance steps=4... synthesized! (0.7s) +Wrote to Allgather.n8-DGX1-steps4.sccl.json +``` +The instance is satisfiable and `sccl` saves it to a file. + +Four steps is not necessarily the least number of steps required. To find the least steps: +``` +$ sccl solve least-steps DGX1 Allgather +Algorithms need at least 2 steps. +Solving instance steps=2... synthesized! (0.2s) +Wrote to Allgather.n8-DGX1-steps2.sccl.json +``` +The `least-steps` strategy statically determines that any Allgather in a DGX-1 requires at least 2 steps and, starting from that, finds the smallest satisfiable number of steps. + +While this two-step algorithm is a latency-optimal one, there may be other algorithms that achieve higher bandwidth. The `pareto-optimal` strategy searches through different latency-bandwidth tradeoffs: +``` +$ sccl solve pareto-optimal DGX1 Allgather +Algorithms need at least 2 steps. +Algorithms need at least 7/6 rounds per chunk. +Solving instance steps=2... synthesized! (0.5s) +Solving instance steps=2,rounds=3,chunks=2... synthesized! (0.9s) +Solving instance steps=2,rounds=4,chunks=3... unsatisfiable. (1.1s) +Assuming 2 step algorithms need at least 4/3 rounds per chunk. +Solving instance steps=3,rounds=4,chunks=3... synthesized! (2.9s) +Solving instance steps=3,rounds=5,chunks=4... synthesized! (6.5s) +Solving instance steps=3,rounds=6,chunks=5... synthesized! (44.0s) +Solving instance steps=3,rounds=7,chunks=6... synthesized! (56.1s) +Bandwidth optimal algorithm found! +Found 2 Pareto optimal algorithms. Pruned 4 non-optimal algorithms.
+Wrote to Allgather.n8-DGX1-steps2.rounds3.chunks2.sccl.json +Wrote to Allgather.n8-DGX1-steps3.rounds7.chunks6.sccl.json +``` + +## Collectives + +SCCL includes a number of built-in common collectives. + +| Collective | Arguments | Description | Kind | +| - | - | - | - | +| Broadcast | `--root N` | Send data from root to all nodes. | NC | +| Reduce | `--root N` | Combine data from all nodes to root. | CR | +| Scatter | `--root N` | Send slices of data from root to all nodes. | NC | +| Gather | `--root N` | Send slices of data from all nodes to root. | NC | +| Allgather | | Send slices of data from all nodes to all nodes. | NC | +| Allreduce | | Combine data from all nodes to all nodes. | CNR | +| Alltoall | | Transpose data between all nodes. | NC | +| ReduceScatter | | Combine slices of data to all nodes. | CR | +| Scan | | Combine partial prefixes of data to all nodes in sequence. | CNR | +| MultirootBroadcast | `--roots N [N ...]` | Like Broadcast, but set of nodes have slices of input. | NC | +| MultirootScatter | `--roots N [N ...]` | Like Scatter, but set of nodes have slices of input. | NC | +| MultirootGather | `--roots N [N ...]` | Like Gather, but output is sent in slices to a set of nodes. | NC | +| custom | `--collective-file` | Arbitrary collective serialized by the user. | ? | + +Custom collectives may be defined by instantiating the `Collective` class, which is easiest through the `build_collective` function. For example, a send from rank 2 to rank 7 in an 8-node topology can be defined and saved with: +``` +from sccl.collectives import build_collective +from sccl.serialization import save_sccl_object + +precondition = lambda r, c: r == 2 +postcondition = lambda r, c: r == 7 +coll = build_collective('Send', 8, 1, precondition, postcondition) +save_sccl_object(coll, 'send.json') +``` + +The *kind* of the collective determines support for some features of SCCL: +- **NC** are non-combining collectives, and are always supported. +- **CR** are combining collectives that have a non-combining dual collective, and are supported through a reduction. +- **CNR** are combining collectives with no dual, which may not always be supported. + +Currently the rounds per chunk analysis described below cannot support CNR collectives. + +## Steps and Rounds + +SCCL uses two related concepts, *steps and rounds*, to talk about the running time of algorithms. *Steps* is how many sequential sets of sends the algorithm consists of, where all sends inside a step execute in parallel. The number of sends between two nodes in a single step is limited by the bandwidth available in the topology. However, a step may consist of multiple *rounds*, which act as a multiplier for all links in the topology during that step. + +How much data a single round corresponds to depends on the actual size of a chunk at runtime, and how many chunks a collective uses can change (e.g. you can control this directly in the `instance` strategy by setting `--chunks N`). Thus for each collective the total data usage of different algorithms implementing it can be measured with their *rounds per chunk*. + +SCCL provides a standalone analysis to find a lower bound for the *rounds per chunk* required by any instance. For example, to find the least rounds per chunk for a Gather in a DGX-1: +``` +$ sccl analyze rounds DGX1 Gather +Gather(n=8,root=0) algorithms need at least 7/6 rounds in DGX1 topology.
+``` +In this case the bound happens to be tight and the `pareto-optimal` strategy would use it to detect that it has found a bandwidth optimal algorithm. + +## Distributed Algorithms + +SCCL provides routines to synthesize algorithms for distributed topologies under the `sccl distribute` subcommand. These work by using algorithms for a local collective and stitching instances of it together to create a distributed one. + +**Alltoall from Gather and Scatter:** `alltoall-gather-scatter` combines a Gather and a Scatter algorithm with a transpose step in the middle to form a distributed Alltoall algorithm. For example, an Alltoall algorithm for a cluster of 4 DGX-1 machines can be created with: +``` +sccl solve least-steps DGX1 Gather -o gather.json +sccl solve least-steps DGX1 Scatter -o scatter.json --root 1 +sccl distribute alltoall-gather-scatter gather.json scatter.json --copies 4 -o alltoall.json +``` +This distributor works with any Gather and Scatter algorithm, as long as their roots have a direct connection in the topology. SCCL also provides multi-root versions of Gather and Scatter that can be substituted here. From 5174599df93819dd17cafdd63bacabcb6109ae52 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 3 Feb 2022 17:46:01 -0800 Subject: [PATCH 077/135] Fix issue with empty SCCL_CONFIG being set Condition to check that no plans were selected was wrong. Also remove NCCL_MIN_NCHANNELS as the runtime no longer requires it. Improve tests to check that SCCL_CONFIG is not set when it shouldn't be. --- sccl/autosynth/__init__.py | 67 ++++++++++---------------------- tests/test_autosynth.py | 3 ++ 2 files changed, 20 insertions(+), 50 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 4fca5b6..a917566 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -55,34 +55,29 @@ def init(machine_type, num_machines, *collectives): candidates = synthesis_plans[(name, machine_type)] selected_plans[name] = _select_plans(name, candidates, num_machines, sizes) - if len(selected_plans) > 0: - # Execute the plans to find or synthesize the algorithms and format them in the XML format expected by SCCL-RT. - algos_elem = ET.Element('sccl_algos') - max_min_channels = 0 - for collective_name, plans in selected_plans.items(): - for plan, params in plans: - path = plan(num_machines) - min_channels = _extract_min_channels(path) - # Skip the algorithm if minimum channels could not be determined (corrupted XML for example) - if min_channels: - max_min_channels = max(max_min_channels, min_channels) - - load_elem = ET.SubElement(algos_elem, 'load') - load_elem.set('path', path) - minsize, maxsize, proto = params - if minsize != 0: - load_elem.set('minbytes', str(minsize)) - if maxsize != math.inf: - load_elem.set('maxbytes', str(maxsize)) - load_elem.set('proto', proto) - ET.indent(algos_elem, space=' ') + # Execute the plans to find or synthesize the algorithms and format them in the XML format expected by SCCL-RT.
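+    # A sketch of the emitted config, with illustrative values (the real path comes
+    # from tempfile.mkstemp below and the size bounds from the selected plans):
+    #   <sccl_algos>
+    #     <load path="/tmp/tmpXXXXXX" minbytes="1024" proto="LL128"/>
+    #   </sccl_algos>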
+ algos_elem = ET.Element('sccl_algos') + any_selected = False + for collective_name, plans in selected_plans.items(): + for plan, params in plans: + path = plan(num_machines) + load_elem = ET.SubElement(algos_elem, 'load') + load_elem.set('path', path) + minsize, maxsize, proto = params + if minsize != 0: + load_elem.set('minbytes', str(minsize)) + if maxsize != math.inf: + load_elem.set('maxbytes', str(maxsize)) + load_elem.set('proto', proto) + any_selected = True + ET.indent(algos_elem, space=' ') + if any_selected: fd, path = tempfile.mkstemp() with os.fdopen(fd, 'w') as f: f.write(ET.tostring(algos_elem, encoding='unicode')) os.environ.update({ 'SCCL_CONFIG': path, - 'NCCL_MIN_NCHANNELS': str(max_min_channels), 'NCCL_NET_SHARED_BUFFERS': '0' }) else: @@ -154,34 +149,6 @@ def _candidate_sort_key(candidate): return priority -def _extract_min_channels(path): - algo_pattern = re.compile('<algo [^>]*>') - nchannels_pattern = re.compile('nchannels=["\'](\\d+)["\']') - with open(path) as f: - # Try with the first line - first_line = f.readline() - match = algo_pattern.search(first_line) - if match: - tag_match = nchannels_pattern.search(match.group(0)) - if not tag_match: - print(f'SCCL: Skipping algorithm, could not read nchannels from <algo> tag in {path}') - return None - return int(tag_match.group(1)) - # Try again with the whole file - f.seek(0) - whole_file = f.read() - match = algo_pattern.search(whole_file) - if match: - tag_match = nchannels_pattern.search(match.group(0)) - if not tag_match: - print(f'SCCL: Skipping algorithm, could not read nchannels from <algo> tag in {path}') - return None - return int(tag_match.group(1)) - else: - print(f'SCCL: Skipping algorithm, could not find <algo> tag in {path}') - return None - - def ndv2_perm(): # pragma: no cover # This function is used in a hacky way right now.
The sccl_ndv2_launcher.sh # relies on the side effect of _select_isomorphism creating the lock file, diff --git a/tests/test_autosynth.py b/tests/test_autosynth.py index ec74f39..5c7e86a 100644 --- a/tests/test_autosynth.py +++ b/tests/test_autosynth.py @@ -3,6 +3,7 @@ import pytest import sccl +import os from sccl.autosynth.registry import register_synthesis_plan @@ -10,9 +11,11 @@ def test_sccl_init(capsys): sccl.init('not_a_machine_type', 4, ('alltoall', 0)) out, err = capsys.readouterr() assert 'No plan found' in out + assert not 'SCCL_CONFIG' in os.environ sccl.init('ndv2', 2, ('alltoall', '1MB')) out, err = capsys.readouterr() assert 'synthesize_ndv2_relay_alltoall' in out + assert 'SCCL_CONFIG' in os.environ sccl.init('ndv4', 9, (sccl.Collective.alltoall, '1MB')) out, err = capsys.readouterr() assert 'synthesize_ndv4_hierarchical_alltoall' in out From fe1022745f3efe5328f665d024203b5585e46477 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Fri, 4 Feb 2022 13:00:41 -0800 Subject: [PATCH 078/135] Language and compiler (#10) Adds a language and compiler to manually write SCCL programs: - Registers programs in sccl.init - Examples programs for writing collectives - Optimization passes, auto-instances Co-authored-by: Olli Saarikivi Co-authored-by: Madan Musuvathi Co-authored-by: Saeed Maleki --- examples/sccl_init.py | 31 ++ examples/scclang/allreduce_a100_allpairs.py | 48 +++ examples/scclang/allreduce_a100_ring.py | 45 +++ examples/scclang/allreduce_a100_tree.py | 87 ++++ examples/scclang/allreduce_dgx1.py | 63 +++ examples/scclang/allreduce_ndv2.py | 51 +++ examples/scclang/alltoall_a100.py | 135 +++++++ examples/scclang/alltoall_a100_mesh.py | 117 ++++++ examples/scclang/alltoall_a100_yifan.py | 59 +++ examples/scclang/alltoall_allpairs.py | 28 ++ examples/scclang/alltonext_backward.py | 93 +++++ examples/scclang/alltonext_forward.py | 96 +++++ examples/scclang/reducegather.py | 98 +++++ examples/scclang/simple/allgather_ring.py | 49 +++ examples/scclang/simple/allreduce_ring.py | 40 ++ examples/scclang/simple/custom_collective.py | 85 ++++ requirements.txt | 1 + sccl/autosynth/ndv4_plans.py | 16 +- sccl/autosynth/registry.py | 40 ++ sccl/language/__init__.py | 378 +++++++++++++++++ sccl/language/buffer.py | 37 ++ sccl/language/chunk.py | 59 +++ sccl/language/collectives.py | 222 ++++++++++ sccl/language/ir.py | 313 +++++++++++++++ sccl/language/passes.py | 46 +++ sccl/language/rank_dag.py | 401 +++++++++++++++++++ sccl/language/tb_assignment.py | 185 +++++++++ sccl/language/visualize.py | 103 +++++ sccl/programs/__init__.py | 1 + sccl/programs/allreduce_a100_ring.py | 29 ++ sccl/programs/alltoall_a100_yifan.py | 44 ++ tests/test_language.py | 240 +++++++++++ 32 files changed, 3239 insertions(+), 1 deletion(-) create mode 100644 examples/scclang/allreduce_a100_allpairs.py create mode 100644 examples/scclang/allreduce_a100_ring.py create mode 100644 examples/scclang/allreduce_a100_tree.py create mode 100644 examples/scclang/allreduce_dgx1.py create mode 100644 examples/scclang/allreduce_ndv2.py create mode 100644 examples/scclang/alltoall_a100.py create mode 100644 examples/scclang/alltoall_a100_mesh.py create mode 100644 examples/scclang/alltoall_a100_yifan.py create mode 100644 examples/scclang/alltoall_allpairs.py create mode 100644 examples/scclang/alltonext_backward.py create mode 100644 examples/scclang/alltonext_forward.py create mode 100644 examples/scclang/reducegather.py create mode 100644 examples/scclang/simple/allgather_ring.py create mode 100644 
examples/scclang/simple/allreduce_ring.py create mode 100644 examples/scclang/simple/custom_collective.py create mode 100644 sccl/language/__init__.py create mode 100644 sccl/language/buffer.py create mode 100644 sccl/language/chunk.py create mode 100644 sccl/language/collectives.py create mode 100644 sccl/language/ir.py create mode 100644 sccl/language/passes.py create mode 100644 sccl/language/rank_dag.py create mode 100644 sccl/language/tb_assignment.py create mode 100644 sccl/language/visualize.py create mode 100644 sccl/programs/__init__.py create mode 100644 sccl/programs/allreduce_a100_ring.py create mode 100644 sccl/programs/alltoall_a100_yifan.py create mode 100644 tests/test_language.py diff --git a/examples/sccl_init.py b/examples/sccl_init.py index f8cee84..56bb4f8 100644 --- a/examples/sccl_init.py +++ b/examples/sccl_init.py @@ -54,4 +54,35 @@ def alltoall_9000(machines): sccl.init('ndv9000', 1, (sccl.Collective.alltoall, ('2KB', None))) +show() + + +print('=== SCCLang program ===') + +from sccl.autosynth.registry import register_sccl_program +from sccl.topologies import line +from sccl.language import * + +@register_sccl_program(line(2), 'allgather', 'two_gpus', machines= lambda m: m == 1) +def trivial_allgather(prog, nodes): + chunk(Buffer.input, 0, 0).send(0, Buffer.output, 0).send(1) + chunk(Buffer.input, 1, 0).send(1, Buffer.output, 1).send(0) + +sccl.init('two_gpus', 1, (sccl.Collective.allgather, (0, None))) + +show() + + +print('=== SCCLang program example ====') + +from sccl.topologies import fully_connected +from sccl.programs.allreduce_a100_ring import allreduce_ring + +@register_sccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, + instances=4, protocol='LL128', threadblock_policy=ThreadblockPolicy.manual, machines=lambda x: x == 1) +def ndv4_ring_allreduce(prog, nodes): + allreduce_ring(size=8, channels=8) + +sccl.init('ndv4', 1, (sccl.Collective.allreduce, (0, None))) + show() \ No newline at end of file diff --git a/examples/scclang/allreduce_a100_allpairs.py b/examples/scclang/allreduce_a100_allpairs.py new file mode 100644 index 0000000..1835529 --- /dev/null +++ b/examples/scclang/allreduce_a100_allpairs.py @@ -0,0 +1,48 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
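+# Hypothetical invocation (the script's one required argument is the instance count):
+#   python allreduce_a100_allpairs.py 1
+# This prints the generated XML and checks the program for correctness.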
+ +import argparse +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllReduce + +def allreduce_allpairs(instances): + size = 8 + chunksperloop = 8 + topology = fully_connected(size) + collective = AllReduce(size, chunksperloop, True) + with SCCLProgram("allreduce_pairs", topology, collective, instances, protocol="LL", + interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual): + + # Each rank sends the nth chunk to the nth rank into scratch space + for r1 in range(size): + for r2 in range(size): + if r1 != r2: + index = r2 + c = chunk(r1, Buffer.input, index) + c.send(r2, 'scratch', sendtb=r2, recvtb=r1, ch=0) + + # Each rank performs a local reduction on the nth chunk + for r in range(size): + for index in range(0, 7): + c = chunk('scratch', r, index) + c.reduce(r, Buffer.input, r, sendtb=r, ch=0) + + # Each rank sends the fully reduced nth chunk to all other gpus + for r1 in range(size): + for r2 in range(size): + if r1 != r2: + index = r1 + c = chunk(r1, Buffer.input, index) + c.send(r2, Buffer.input, index, sendtb=r2, recvtb=r1, ch=0) + + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('instances', type=int, help='number of instances') +# parser.add_argument('threadblocks', type=int, default=0, help='number of threadblocks per instance') + +args = parser.parse_args() + +allreduce_allpairs(args.instances) \ No newline at end of file diff --git a/examples/scclang/allreduce_a100_ring.py b/examples/scclang/allreduce_a100_ring.py new file mode 100644 index 0000000..37ae767 --- /dev/null +++ b/examples/scclang/allreduce_a100_ring.py @@ -0,0 +1,45 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import argparse +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllReduce + +# Ring all reduce for A100s +# Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. 
+# channels=1 is standard ring, all chunks are assigned to the same tb/channel +# channels=8 devotes 1 tb/channel to handling 1 chunk of the data +def allreduce_ring(instances, channels): + size = 8 + topology = fully_connected(size) + collective = AllReduce(size, size, True) + with SCCLProgram(f"allreduce_ring_{channels}channelsperring", topology, collective, instances, + protocol="LL128", threadblock_policy=ThreadblockPolicy.manual): + # Reduce ring + for step in range(0, size-1): + for index in range(0, size): + rank = (index + step) % size + c = chunk(rank, Buffer.input, index) + next_rank = (index + step + 1) % size + channel = index%channels + c = c.reduce(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) + # Propagate ring + for step in range(-1, size-2): + for index in range(0, size): + rank = (index + step) % size + c = chunk(rank, Buffer.input, index) + next_rank = (index + step + 1) % size + channel = index%channels + c = c.send(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) + + XML() + Check() + + +parser = argparse.ArgumentParser() +parser.add_argument('channels', type=int, help='Number of channels to use for 1 instance of the ring [1-8]') +parser.add_argument('instances', type=int, help='number of instances') +args = parser.parse_args() + +allreduce_ring(args.instances, args.channels) diff --git a/examples/scclang/allreduce_a100_tree.py b/examples/scclang/allreduce_a100_tree.py new file mode 100644 index 0000000..28e8e36 --- /dev/null +++ b/examples/scclang/allreduce_a100_tree.py @@ -0,0 +1,87 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Halving-doubling implementation of allreduce + +import argparse + +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllReduce + + +def allreduce(ways, instances): + topology = fully_connected(8) + size = topology.num_nodes() # Number of gpus + logical_chunk = 8 * ways + tb_per_channel = 12 + collective = AllReduce(size, logical_chunk, True) + with SCCLProgram("allreduce_a100_tree", topology, collective, instances, 'Simple', interleaved_replication=False): + # 1 reduction between pairs of gpus of count + def reduce_tree(pairs, count, next_index, lc, sendtb, recvtb): + current_index = next_index.copy() + for r in range(size): + next = r ^ pairs + offset = (count if r <= next else 0) + next_index[next] += offset + # Split the reduce into two separate reduces to enable fused instructions + block = 2 ** pairs + for x in range(count): + index = current_index[r] + offset + lc*8 + x + c = chunk(r, Buffer.input, index) + c.reduce(next, Buffer.input, index, ch=lc, sendtb=sendtb, recvtb=recvtb) + + + # Propagates reduced chunks in reverse order + def propagate_tree(pairs, count, next_index, lc, sendtb, recvtb): + current_index = next_index.copy() + for r in range(size): + next = r ^ pairs + offset = (count if r > next else 0) + next_index[r] -= offset + index = current_index[r] + lc*8 + c = chunk(r, Buffer.input, index, count) + c.send(next, Buffer.input, index, ch=lc, sendtb=sendtb, recvtb=recvtb) + + next_index = [0] * 8 + reduce_tree(1, 4, next_index, 0, 0, 1) + reduce_tree(2, 2, next_index, 0, 1, 2) + reduce_tree(4, 1, next_index, 0, 2, 3) + + propagate_tree(4, 1, next_index, 0, 2, 3) + propagate_tree(2, 2, next_index, 0, 1, 2) + propagate_tree(1, 4, next_index, 0, 0, 1) + + if ways > 1: + next_index = [0] * 8 + lc = 1 + reduce_tree(4, 4, next_index, lc, 8, 9) + reduce_tree(2, 2, next_index, lc, 9, 10) + 
reduce_tree(1, 1, next_index, lc, 10, 11) + + propagate_tree(1, 1, next_index, lc, 10, 11) + propagate_tree(2, 2, next_index, lc, 9, 10) + propagate_tree(4, 4, next_index, lc, 8, 9) + + if ways > 2: + next_index = [0] * 8 + lc = 2 + reduce_tree(2, 4, next_index, lc, 4, 5) + reduce_tree(1, 2, next_index, lc, 5, 6) + reduce_tree(4, 1, next_index, lc, 6, 7) + + + propagate_tree(4, 1, next_index, lc, 6, 7) + propagate_tree(1, 2, next_index, lc, 5, 6) + propagate_tree(2, 4, next_index, lc, 4, 5) + + + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('ways', type=int, help='number of parallel trees to perform reduction min:1 max:2') +parser.add_argument('instances', type=int, help='number of instances') +args = parser.parse_args() +assert args.ways >=0 and args.ways <= 3 +allreduce(args.ways, args.instances) diff --git a/examples/scclang/allreduce_dgx1.py b/examples/scclang/allreduce_dgx1.py new file mode 100644 index 0000000..2318687 --- /dev/null +++ b/examples/scclang/allreduce_dgx1.py @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import argparse + +from sccl.language import * +from sccl.topologies.distributed import * +from sccl.topologies.nvidia import * +from sccl.language.collectives import AllReduce + +def allreduce(num_nodes, instances): + local_topology = dgx1() + num_local_gpus = 8 + remote_bw = 1 + topology = distributed_fully_connected(local_topology, num_nodes, remote_bw) + size = topology.num_nodes() + collective = AllReduce(size, instances, True) + local_ring_order = [1,3,2,6,7,5,4,0] # Reductions will happen locally within a node in this order. + + def rank(n, g): + return local_ring_order[g] + n * num_local_gpus + + with SCCLProgram("allreduce_ring_dgx1", topology, collective, 1, threadblock_policy=ThreadblockPolicy.manual): + + # Chunks travels around local rings being reduced (local_gpus-1 hops) starting at local gpu 1 + # At the end of the most reduced chunk ends up on local gpu 0 every each node + for ch in range(instances): + for n in range(num_nodes): + r = rank(n, 0) # Start at local gpu 1 (index 0 in local_ring_order) + c = chunk(r, Buffer.input, ch) + for g in range(1, 8): + next = rank(n, g) + c = c.reduce(next, buffer=Buffer.input, index=ch, ch=ch, sendtb=0+3*ch, recvtb=0+3*ch) + + # At this point gpu0 and gpu8 have the two most reduced chunks + # 1 IB send to fully reduce chunk + 1 IB send to update other node + c0 = chunk(0, Buffer.input, ch) + c0 = c0.send(9, buffer=Buffer.input, index=ch, ch=ch, sendtb=0+3*ch, recvtb=0+3*ch) + c1 = chunk(8, Buffer.input, ch) + c1 = c1.send(1, buffer=Buffer.input, index=ch, ch=ch, sendtb=0+3*ch, recvtb=0+3*ch) + + c0 = c0.reduce(8, buffer=Buffer.input, index=ch, ch=ch, sendtb=2+3*ch, recvtb=2+3*ch) # Completely reduced chunk on node 1, gpu0 + c1 = c1.reduce(0, buffer=Buffer.input, index=ch, ch=ch, sendtb=2+3*ch, recvtb=2+3*ch) # Completely reduced chunk on node 0, gpu0 + + # Propagate the fully reduced chunks going backwards around the ring + for n in range(num_nodes): + r = rank(n, -1) + c = chunk(r, Buffer.input, ch) + for g in range(6, -1, -1): + next = rank(n, g) + c = c.send(next, buffer=Buffer.input, index=ch, ch=ch, sendtb=2+3*ch, recvtb=2+3*ch) + + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('num_nodes', type=int, help='number of nodes') +parser.add_argument('instances', type=int, help='number of instances') +args = parser.parse_args() + +assert args.num_nodes > 1, "Number of nodes must be greater than 1" + 
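+# Hypothetical invocation: 'python allreduce_dgx1.py 2 1' emits the XML for 2 nodes with 1 instance.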
+allreduce(args.num_nodes, args.instances) diff --git a/examples/scclang/allreduce_ndv2.py b/examples/scclang/allreduce_ndv2.py new file mode 100644 index 0000000..a178923 --- /dev/null +++ b/examples/scclang/allreduce_ndv2.py @@ -0,0 +1,51 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import argparse + +from sccl.language import * +from sccl.topologies.distributed import * +from sccl.topologies.nvidia import * +from sccl.language.collectives import AllReduce + +def allreduce(instances): + topology = dgx1() + num_local_gpus = 8 + size = topology.num_nodes() # Number of gpus + logical_chunk = 8 + collective = AllReduce(size, logical_chunk, True) + with SCCLProgram("allreduce_ndv2", topology, collective, instances, interleaved_replication=False): + # local reduce_scatter + instances = 1 + for lc in range(num_local_gpus//2): + for r in range(num_local_gpus): + if lc == (r % (num_local_gpus//2)): + continue + within_socket_nghr = lc + (4 if (r >= num_local_gpus//2) else 0) + index = lc * 2 + c = chunk(r, Buffer.input, index, 2) + c.reduce(within_socket_nghr, buffer=Buffer.input, index=index) + # cross-socket reduce_scatter + for r in range(num_local_gpus): + index = (r % (num_local_gpus//2)) * 2 + if r >= num_local_gpus // 2: + index += 1 # Handle the odd chunk + lc = chunk(r, Buffer.input, index) + lc = lc.reduce((r+num_local_gpus//2) % num_local_gpus, buffer=Buffer.input, index=index) + lc.send(r, Buffer.input, index, ch=1) # Reduce and send should be on different tbs + # local all_gather + for r in range(num_local_gpus): + index = (r % (num_local_gpus//2)) * 2 + lc = chunk(r, Buffer.input, index, 2) + for t in range(num_local_gpus//2): + local_nghr = t + (num_local_gpus//2 if (r >= num_local_gpus//2) else 0) + if local_nghr == r: + continue + lc.send(local_nghr, buffer=Buffer.input, index=index) + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('instances', type=int, help='number of instances') +args = parser.parse_args() +allreduce(args.instances) diff --git a/examples/scclang/alltoall_a100.py b/examples/scclang/alltoall_a100.py new file mode 100644 index 0000000..5a295ef --- /dev/null +++ b/examples/scclang/alltoall_a100.py @@ -0,0 +1,135 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# For AllToAll on 9 A100 nodes +# alltoall_a100.py 9 8 2 +# For AllToAll on 16 A100 nodes +# alltoall_a100.py 16 8 2 --ib_connections 1 + +import argparse + +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllToAll + +def alltoall_hierarchical(num_nodes, gpus_per_node, instances, ib_connections): + num_ranks = num_nodes * gpus_per_node + + # (node, local gpu) to rank + # (n, g) => r + def RankFromNodeGpuPair(n, g): + return n*gpus_per_node + g + + # For cross node traffic from node n1 to node n2, returns the ranks g + # gpus on n1 and n2 that handle that traffic. 
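+    # (That is, the pair (r1, r2) of sender and receiver ranks below that carry the n1->n2 traffic.)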
+ def CrossNodeGpus(n1, n2): + def LocalRank(n1, n2): + return (n2 if n1 > n2 else n2-1) % gpus_per_node + r1 = RankFromNodeGpuPair(n1, LocalRank(n1, n2)) + r2 = RankFromNodeGpuPair(n2, LocalRank(n2, n1)) + return (r1, r2) + + # Groups chunk reference into one large chunk reference (used for IB) + # Save them under a key in the dictionary ib_chunks + def AddChunk(ib_chunks, key, c): + if key in ib_chunks: + ib_chunks[key] = ib_chunks[key].group(c) + else: + ib_chunks[key] = c + + + topology = fully_connected(num_ranks) + collective = AllToAll(num_ranks, instances, inplace=False) + + with SCCLProgram("hierarchical_all_to_all", topology, collective, 1): + ib_chunks = {} # Keeps track of chunks going over IB buffer buffer name -> chunk + + # Local Gathers + # for n1 in range(num_nodes): + # for g1 in range(gpus_per_node): + # for n2 in range(num_nodes): + # for g2 in range(gpus_per_node): + # for ch in range(instances): + # r1 = RankFromNodeGpuPair(n1, g1) + # r2 = RankFromNodeGpuPair(n2, g2) + # # Rank(r) gives accesses the rth rank of the program + # # input(i) gives a reference to ith chunk + # c = Rank(r1).input(r2 * instances + ch) + + # if (n1 != n2): + # # Gather chunks destined for cross node ranks in scratch to route through IB + # gather_rank, _ = CrossNodeGpus(n1, n2) + # buffer_key = (n1, n2) + # # Send chunk to the gather_rank. Send returns a chunk reference to the + # # receiver's chunk + # c = c.send(gather_rank, buffer=buffer_key, ch=ch) + # # Group the chunks using a particular IB pair into one large chunk reference + # AddChunk(ib_chunks, buffer_key, c) + # else: + # # Directly send chunks destined for ranks within the node or + # # copy chunks destined for current rank into the output buffer + # c.send(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch) + + for n1 in range(num_nodes): + for g1 in range(gpus_per_node): + for ch in range(instances): + for n2 in range(num_nodes): + r1 = RankFromNodeGpuPair(n1, g1) + if (n1 != n2): + # Send over all chunks destined for that node to the peer gpu that handles chunks to that node + c = chunk(r1, Buffer.input, n2 * gpus_per_node * instances + ch * gpus_per_node, gpus_per_node) + # Gather chunks destined for cross node ranks in scratch to route through IB + gather_rank, _ = CrossNodeGpus(n1, n2) + buffer_key = (n1, n2) + # Send chunk to the gather_rank. Send returns a chunk reference to the + # receiver's chunk + c = c.send(gather_rank, buffer=buffer_key, ch=ch*2) + # Group the chunks using a particular IB pair into one large chunk reference + AddChunk(ib_chunks, buffer_key, c) + else: + # Within a node - direct send/copy the chunks over nvlink to the output buffer. + # Use a different channel to ensure that we don't get in the way of sends/receives above + # which are on the critical path. + for g2 in range(gpus_per_node): + r2 = RankFromNodeGpuPair(n2, g2) + c = chunk(r1, Buffer.input, r2 * instances + ch) + c.send(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch*2) + + + + # IB Send and local scatters + for buffer_key, ib_chunk in ib_chunks.items(): + (n1, n2) = buffer_key + _, scatter_rank = CrossNodeGpus(n1, n2) + # IB send divided across multiple parallel channels + chunks = ib_chunk.split(ib_connections) + for ch, c in enumerate(chunks): + # Note: If we are only going to use 1 IB connection for each IB send + # alternate between channels 0 and 1 to utilize both IB links. 
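+                    # (With ib_connections > 1, each split chunk instead gets its own channel ch.)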
+ if ib_connections == 1: + ib_channel = c.rank % 2 + else: + ib_channel = ch + c = c.send(scatter_rank, buffer=buffer_key, ch=ib_channel) + # Local scatter + cs = c.split(gpus_per_node * gpus_per_node) + for i, c in enumerate(cs): + # Access the chunk's destination rank and index to route it to its final place + final_rank = c.get_dst_rank() + index = c.get_dst_index() + c.send(final_rank, buffer=Buffer.output, index=index, ch=ch*2 + 1) + + XML() # Prints the XML + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('num_nodes', type=int, help ='number of nodes') +parser.add_argument('gpus_per_node', type=int, help ='gpus per node') +parser.add_argument('instances', type=int, help='number of instances') +parser.add_argument('--ib_connections', type=int, default=-1, help='Number of connections used for each IB send. Default: number of instances') +args = parser.parse_args() + +if args.ib_connections == -1: + args.ib_connections = args.instances + +alltoall_hierarchical(args.num_nodes, args.gpus_per_node, args.instances, args.ib_connections) \ No newline at end of file diff --git a/examples/scclang/alltoall_a100_mesh.py b/examples/scclang/alltoall_a100_mesh.py new file mode 100644 index 0000000..311c44e --- /dev/null +++ b/examples/scclang/alltoall_a100_mesh.py @@ -0,0 +1,117 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import argparse +from typing import Callable, List + +import sccl.language +import sccl.topologies +from sccl.language.collectives import AllToAll + + +def alltoall_mesh(nnodes: int, ngpus: int, nchannels: int, threadblocks: int) -> None: + """Generate XML for 4-phase mesh alltoall algorithm. + + Args: + nnodes (int): Number of nodes. + ngpus (int): Number of GPUs per node. + nchannels (int): Number of channels/instances. 
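+        threadblocks (int): Number of threadblocks; 0 leaves the assignment to SCCLang.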
+ """ + nranks: int = nnodes * ngpus + node_rank: Callable[[int], int] = lambda r: r // ngpus + local_rank: Callable[[int], int] = lambda r: r % ngpus + stride_idx: Callable[[int, int, int], int] = lambda r, step, n: n // step * (r % step) + r // step + shift_channel: Callable[[int, int], int] = lambda chunk_idx, ch: chunk_idx + nranks * ch + + topology = sccl.topologies.fully_connected(nranks) + collective = AllToAll(nranks, nchannels, inplace=False, name='alltoall') + with sccl.language.SCCLProgram('alltoall_mesh', topology, collective, instances=1, protocol='Simple', threadblocks=threadblocks): + # get device on all ranks + devices: List[sccl.language.Process] = list(map(lambda r: sccl.language.Rank(r), range(nranks))) + + for ch in range(nchannels): + # phase-0: per-gpu (step=ngpus) stride copy + for r in range(nranks): + for peer in range(nranks): + chunk = devices[r].input(peer * nchannels + ch) + chunk.send(r, buffer='phase-0', index=shift_channel(stride_idx(peer, ngpus, nranks), ch), ch=ch) + + # phase-1: intra-node alltoall + for r in range(nranks): + for g in range(ngpus): + peer = g + node_rank(r) * ngpus + chunk = devices[r].scratch('phase-0', shift_channel(g * nnodes, ch), size=nnodes) + chunk.send(peer, buffer='phase-1', index=shift_channel(local_rank(r) * nnodes, ch), ch=ch) + + # phase-2: per-gpu (step=nnodes) stride copy + for r in range(nranks): + for peer in range(nranks): + chunk = devices[r].scratch('phase-1', shift_channel(peer, ch)) + chunk.send(r, buffer='phase-2', index=shift_channel(stride_idx(peer, nnodes, nranks), ch), ch=ch) + + # phase-3: inter-node alltoall + for r in range(nranks): + for n in range(nnodes): + peer = local_rank(r) + n * ngpus + chunk = devices[r].scratch('phase-2', shift_channel(n * ngpus, ch), size=ngpus) + if nchannels > 1: + chunk.send(peer, buffer='phase-3', index=shift_channel(node_rank(r) * ngpus, ch), ch=ch) + else: + chunk.send( + peer, + buffer=sccl.language.Buffer.output, + index=shift_channel(node_rank(r) * ngpus, ch), + ch=ch + ) + + # re-order chunks in channels + if nchannels <= 1: + continue + for r in range(nranks): + for peer in range(nranks): + chunk = devices[r].scratch('phase-3', shift_channel(peer, ch)) + chunk.send( + r, + buffer=sccl.language.Buffer.output, + index=stride_idx(peer, nranks, nranks * nchannels) + ch, + ch=ch + ) + + sccl.language.XML() + sccl.language.Check() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '-n', + '--num_nodes', + type=int, + default=2, + help='number of nodes', + ) + parser.add_argument( + '-g', + '--gpus_per_node', + type=int, + default=4, + help='gpus per node', + ) + parser.add_argument( + '-c', + '--channels', + type=int, + default=2, + help='number of channels', + ) + + parser.add_argument( + '-t', + '--threadblocks', + type=int, + default=0, + help='number of threadblocks. 
Default: 0, SCCLang controlled',
+    )
+    args = parser.parse_args()
+
+    alltoall_mesh(args.num_nodes, args.gpus_per_node, args.channels, args.threadblocks)
\ No newline at end of file
diff --git a/examples/scclang/alltoall_a100_yifan.py b/examples/scclang/alltoall_a100_yifan.py
new file mode 100644
index 0000000..c4dbf77
--- /dev/null
+++ b/examples/scclang/alltoall_a100_yifan.py
@@ -0,0 +1,59 @@
+import argparse
+
+from sccl.language import *
+from sccl.topologies import *
+from sccl.language.collectives import AllToAll
+
+
+def alltoall_hierarchical(num_nodes, gpus_per_node, protocol):
+    num_ranks = num_nodes * gpus_per_node
+    topology = fully_connected(num_ranks)
+    collective = AllToAll(num_ranks, 1, inplace=False)
+
+
+    with SCCLProgram("hierarchical_all_to_all", topology, collective, 1, protocol=protocol):
+        for n1 in range(num_nodes):
+            for r in range(1,num_nodes):
+                n2 = (n1 + r) % num_nodes
+                # print(f"r {r} n1 {n1} n2 {n2}")
+
+                # Gather all local chunks for the node neighbor
+                for g1 in range(gpus_per_node):
+                    rank1 = n1 * gpus_per_node + g1
+
+                    for g2 in range(gpus_per_node):
+                        rank2 = n1 * gpus_per_node + g2
+                        # chunk to send: g2 on n2
+                        index = n2 * gpus_per_node + g2
+                        c = chunk(rank1, Buffer.input, index)
+                        c = c.send(rank2, f'send_{n2}')
+
+            for r in range(1,num_nodes):
+                n2 = (n1 + r) % num_nodes
+                # IB send
+                for g1 in range(gpus_per_node):
+                    rank = n1 * gpus_per_node + g1
+                    ib_peer = n2 * gpus_per_node + g1
+                    c = chunk(rank, f'send_{n2}', 0, 8)
+                    c = c.send(ib_peer, Buffer.output, c.get_dst_index(), ch=(n2 % 8)*2+(rank%2)+2)
+
+
+        # Handle local chunks within a node
+        for rank in range(num_ranks):
+            for g in range(gpus_per_node):
+                index = (rank // gpus_per_node) * gpus_per_node + g
+                c = chunk(rank, Buffer.input, index)
+                c.send(c.get_dst_rank(), Buffer.output, c.get_dst_index())
+
+        XML() # Prints the XML
+        Check()
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_nodes', type=int, help ='number of nodes')
+parser.add_argument('gpus_per_node', type=int, help ='gpus per node')
+parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple')
+args = parser.parse_args()
+
+
+alltoall_hierarchical(args.num_nodes, args.gpus_per_node, args.protocol)
diff --git a/examples/scclang/alltoall_allpairs.py b/examples/scclang/alltoall_allpairs.py
new file mode 100644
index 0000000..c037149
--- /dev/null
+++ b/examples/scclang/alltoall_allpairs.py
@@ -0,0 +1,28 @@
+import argparse
+
+from sccl.language import *
+from sccl.topologies import *
+from sccl.language.collectives import AllToAll
+
+# One-step AllToAll program
+# Each gpu sends and receives a chunk from every other gpu
+
+def alltoall(num_ranks, instances, protocol):
+    topology = fully_connected(num_ranks)
+    collective = AllToAll(num_ranks, 1, inplace=False)
+
+    with SCCLProgram("alltoall_allpairs", topology, collective, instances=instances, protocol=protocol):
+        for r in range(num_ranks):
+            for index in range(num_ranks):
+                chunk(r, Buffer.input, index).send(index, Buffer.output, r)
+        XML()
+        Check()
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_gpus', type=int, help ='number of gpus')
+parser.add_argument('instances', type=int, help ='number of instances')
+parser.add_argument('protocol', type=str, nargs='?', default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple')
+args = parser.parse_args()
+
+alltoall(args.num_gpus, args.instances, args.protocol)
diff --git a/examples/scclang/alltonext_backward.py b/examples/scclang/alltonext_backward.py
new file mode 100644
index 0000000..fb3e12b
--- /dev/null
+++ b/examples/scclang/alltonext_backward.py
@@ -0,0 +1,93 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+
+from sccl.language import *
+from sccl.topologies.distributed import *
+from sccl.topologies import *
+from sccl.language.collectives import Collective
+
+class Pipeline(Collective):
+    def init_buffers(self):
+        chunks_per_node = self.chunk_factor
+        rank_buffers = []
+        for r in range(self.num_ranks):
+            input_buffer = [None] * chunks_per_node
+            output_buffer = [None] * chunks_per_node
+            if r != 0:
+                for c in range(chunks_per_node):
+                    input_buffer[c] = Chunk(r, c, r-1, c)
+            buffers = {Buffer.input : input_buffer,
+                       Buffer.output : output_buffer}
+            rank_buffers.append(buffers)
+        return rank_buffers
+
+
+    # Final state chunks on rank(i) end up on rank(i-1)
+    def check(self, prog):
+        correct = True
+        for r in range(0, self.num_ranks-1):
+            output = prog.buffers[r][Buffer.output]
+            for c in range(self.chunk_factor):
+                chunk = output[c]
+                if chunk is None or chunk.origin_rank != r+1 or chunk.origin_index != c:
+                    print(f'Rank {r} chunk {c} is incorrect should be ({r+1}, {c}) given {chunk}')
+                    correct = False
+        return correct
+
+
+def pipeline(num_nodes, instances):
+    num_local_gpus = 8
+    chunks = num_local_gpus
+    chunk_factor = chunks
+    remote_bw = 1
+    size = num_local_gpus * num_nodes
+    topology = fully_connected(size)
+    collective = Pipeline(size, chunk_factor, False)
+
+    def rank(node, local_rank):
+        return node * num_local_gpus + local_rank
+
+    with SCCLProgram("alltonext-backwards", topology, collective, instances):
+
+        for n in range(num_nodes):
+            for g in range(num_local_gpus):
+                r = rank(n, g)
+
+                # Do nothing for first gpu - end of pipeline
+                if r == 0:
+                    continue
+
+                # Cross node send - cooperative
+                if g == 0:
+                    for ch in range(chunks):
+                        c = chunk(r, Buffer.input, ch)
+                        if ch == 0:
+                            # 2 steps: IB send to (node-1, ch) then gather onto (node-1, num_local_gpus-1)
+                            c = c.send(rank(n-1, ch), 'gather', 0, ch=ch%2)
+                        elif ch == num_local_gpus-1:
+                            # 2 steps: Scatter - send to (node, num_local_gpus-1), IB send to (node-1, num_local_gpus-1)
+                            c = c.send(rank(n, ch), 'scatter', 0, ch=ch%2)
+                        else:
+                            # 3 steps: Scatter - send to (node, ch), IB send to (node-1, ch), gather onto (node-1, num_local_gpus-1)
+                            c = c.send(rank(n, ch), 'scatter', 0, ch=ch%2)
+                            c = c.send(rank(n-1, ch), 'gather', 0, ch=ch%2)
+                        c.send(r-1, Buffer.output, c.get_dst_index(), ch=ch%2)
+
+                # Normal send - directly
+                else:
+                    c = chunk(r, Buffer.input, 0, chunks)
+                    c.send(r-1, Buffer.output, 0, ch=g%2)
+
+        Check()
+        XML()
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('num_nodes', type=int, help ='number of nodes')
+    parser.add_argument('instances', type=int, help ='number of instances')
+
+    args = parser.parse_args()
+
+    pipeline(args.num_nodes, args.instances)
\ No newline at end of file
diff --git a/examples/scclang/alltonext_forward.py b/examples/scclang/alltonext_forward.py
new file mode 100644
index 0000000..2edc419
--- /dev/null
+++ b/examples/scclang/alltonext_forward.py
@@ -0,0 +1,96 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
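+
+# alltonext (forward): every rank's input chunks move to rank+1. Within a node
+# a chunk is copied directly; across nodes it is scattered over the sending
+# node's GPUs (the 'scatter' scratch buffer), sent over IB in parallel, then
+# gathered onto the receiving GPU (the 'gather' scratch buffer).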
+ +import argparse + +from sccl.language import * +from sccl.topologies.distributed import * +from sccl.topologies import * +from sccl.language.collectives import Collective + +class Pipeline(Collective): + def init_buffers(self): + chunks_per_node = self.chunk_factor + rank_buffers = [] + for r in range(self.num_ranks): + input_buffer = [None] * chunks_per_node + output_buffer = [None] * chunks_per_node + if r != self.num_ranks -1: + for c in range(chunks_per_node): + # Chunk(starting rank, starting index, ending rank, ending index) + input_buffer[c] = Chunk(r, c, r+1, c) + buffers = {Buffer.input : input_buffer, + Buffer.output : output_buffer} + rank_buffers.append(buffers) + return rank_buffers + + + # Final state chunks on rank(i) end up on rank(i+1) + def check(self, prog): + correct = True + for r in range(1, self.num_ranks): + output = prog.buffers[r][Buffer.output] + for c in range(self.chunk_factor): + chunk = output[c] + # Check we got the previous rank's chunks + if chunk is None or chunk.origin_rank != r-1 or chunk.origin_index != c: + print(f'Rank {r} chunk {c} is incorrect should be ({r-1}, {c}) given {chunk}') + correct = False + return correct + + +def pipeline(num_nodes, instances): + num_local_gpus = 8 + chunks = num_local_gpus + total_chunks_per_loop = chunks + remote_bw = 1 + size = num_local_gpus * num_nodes + topology = fully_connected(size) + collective = Pipeline(size, total_chunks_per_loop, True) + + def rank(node, local_rank): + return node * num_local_gpus + local_rank + + with SCCLProgram("alltonext-forward", topology, collective, instances): + + for n in range(num_nodes): + for g in range(num_local_gpus): + r = rank(n, g) + + # Do nothing for last gpu - end of pipeline + if r == size - 1: + continue + + # Cross node send - cooperative + if g == num_local_gpus -1: + for ch in range(chunks): + c = chunk(r, Buffer.input, ch) + if ch == 0: # 2 steps: Scatter - send to (node, 0), IB send to (node+1, 0) + c = c.send(rank(n, ch), 'scatter', 0, ch=ch%2) + + elif ch == num_local_gpus-1: + # 2 steps: IB send to (node+1, g) then gather onto (node+1, 0) + c = c.send(rank(n+1, ch), 'gather', 0, ch=ch%2) + else: + # 3 steps: Scatter - send to (node, g), IB send to (node+1, g), gather onto (node+1, 0) + c = c.send(rank(n, ch), 'scatter', 0, ch=ch%2) + c = c.send(rank(n+1, ch), 'gather', 0, ch=ch%2) + + c.send(r+1, Buffer.output, c.get_dst_index(), ch=ch%2) + + # Normal send - directly + else: + c = chunk(r, Buffer.input, 0, chunks) + c.send(r+1, Buffer.output, 0, ch=g%2) + + Check() + XML() + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('num_nodes', type=int, help ='number of nodes') + parser.add_argument('instances', type=int, help ='number of instances') + + args = parser.parse_args() + + pipeline(args.num_nodes, args.instances) \ No newline at end of file diff --git a/examples/scclang/reducegather.py b/examples/scclang/reducegather.py new file mode 100644 index 0000000..a1001b8 --- /dev/null +++ b/examples/scclang/reducegather.py @@ -0,0 +1,98 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
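+
+# ReduceGather over `groups` reduction groups of num_ranks//groups GPUs each:
+# first a reduce-scatter within every group (accumulated in the input buffer),
+# then a ring allgather over all ranks, leaving every rank with all num_ranks
+# reduced chunks in its output buffer.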
+ +import argparse + +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import Collective + +class ReduceGather(Collective): + def __init__(self, num_ranks, chunk_factor, inplace, groups): + Collective.__init__(self, num_ranks, chunk_factor, inplace) + self.groups = groups + self.gpus_per_group = num_ranks // groups + assert chunk_factor == 1, "Only supports chunks == number of ranks" + + def init_buffers(self): + assert self.chunk_factor == 1 + rank_buffers = [] + chunks_per_node = self.num_ranks + for r in range(self.num_ranks): + input_buffer = [None] * self.gpus_per_group + output_buffer = [None] * chunks_per_node + for c in range(self.groups): + input_buffer[c] = Chunk(r, c, -1, c) + buffers = {Buffer.input : input_buffer, + Buffer.output : output_buffer} + rank_buffers.append(buffers) + return rank_buffers + + + def check(self, prog): + expected_chunks = [] + for r in range(self.num_ranks): + chunk = ReduceChunk([]) + for x in range(self.groups): + y = r // self.groups + next = y * self.groups + x + chunk = chunk.reduce(Chunk(next, r % self.gpus_per_group)) + expected_chunks.append(chunk) + + correct = True + for r in range(self.num_ranks): + output = prog.buffers[r][Buffer.output] + for c in range(self.num_ranks): + chunk = output[c] + if chunk is None or chunk != expected_chunks[c]: + print(f'Rank {r} chunk {c} is incorrect should be {expected_chunks[c]} given {chunk}') + correct = False + return correct + + +def program(num_ranks, groups, instances, protocol): + gpus_per_group = num_ranks // groups + topology = fully_connected(num_ranks) + chunk_factor = 1 + inplace = False + collective = ReduceGather(num_ranks, chunk_factor, inplace, groups) + + with SCCLProgram("reduce-gather", topology, collective, instances, protocol, threadblock_policy=ThreadblockPolicy.manual): + + # Per group reduce scatter + for y in range(groups): + for x in range(gpus_per_group): + output_index = y * groups + x + input_index = x + gpu = y * groups + (x+1) % gpus_per_group + c = chunk(gpu, Buffer.input, input_index) + # Use the input buffer to perform reduction across groups + for x_ in range(1, gpus_per_group): + c = c.reduce(y * groups + (x + 1 + x_) % gpus_per_group, Buffer.input, input_index, sendtb=0, recvtb=0, ch=0) + # Copy reduced chunk into the output buffer + c = c.send(c.rank, Buffer.output, output_index, sendtb=0, recvtb=0, ch=0) + + + # Ring Allgather + for r in range(num_ranks): + c = chunk(r, Buffer.output, r) + next = (r + 1) % num_ranks + while next != r: + c = c.send(next, Buffer.output, r, sendtb=1, recvtb=1, ch=1) + next = (next + 1) % num_ranks + + Check() + XML() + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('num_ranks', type=int, help ='number of ranks') + parser.add_argument('groups', type=int, help='number of reduction groups') + parser.add_argument('--instances', type=int, default=1, help='number of instances') + parser.add_argument('--protocol', type=str, default='Simple', + choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol') + args = parser.parse_args() + + assert args.num_ranks % args.groups == 0 + + program(args.num_ranks, args.groups, args.instances, args.protocol) diff --git a/examples/scclang/simple/allgather_ring.py b/examples/scclang/simple/allgather_ring.py new file mode 100644 index 0000000..8ce40ed --- /dev/null +++ b/examples/scclang/simple/allgather_ring.py @@ -0,0 +1,49 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
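+
+# Ring allgather: rank r's input chunk is first copied into output[r] locally,
+# then forwarded around the ring one hop per step. E.g. for size=4, rank 0's
+# chunk reaches ranks 1, 2 and 3 after steps 1, 2 and 3; after size-1 steps
+# every rank holds all size chunks.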
+ +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllGather + +def allgather_ring(size): + topology = fully_connected(size) + collective = AllGather(size, 1, False) + with SCCLProgram("allgather_ring", topology, collective, 1): + # Loop over each chunk's root + for r in range(size): + # Get the chunk at rank r, input[r] + c = chunk(r, Buffer.input, 0) + # Copy chunk to the output buffer + c = c.send(r, buffer=Buffer.output, index=r, sendtb=0) + + next = (r + 1) % size + while next != r: + # For each rank in the ring, send the chunk to the next rank + # Setting sender's tb and receiver's tb to be 0 so that send/receives on the + # same rank can be merged into a receive-copy-send + c = c.send(next, buffer=Buffer.output, index=r) + next = (next + 1) % size + XML() + Check() + +def allgather_ring_inplace(size): + topology = fully_connected(size) + collective = AllGather(size, 1, True) + with SCCLProgram("allgather_ring", topology, collective, 1): + # Loop over each chunk's root + for r in range(size): + # Get the chunk at rank r, input[r] + c = chunk(r, Buffer.input, 0) + + next = (r + 1) % size + while next != r: + # For each rank in the ring, send the chunk to the next rank + # Setting sender's tb and receiver's tb to be 0 so that send/receives on the + # same rank can be merged into a receive-copy-send + c = c.send(next, buffer=Buffer.output, index=r) + next = (next + 1) % size + XML() + Check() + +# allgather_ring(4) +allgather_ring_inplace(4) \ No newline at end of file diff --git a/examples/scclang/simple/allreduce_ring.py b/examples/scclang/simple/allreduce_ring.py new file mode 100644 index 0000000..1f32b78 --- /dev/null +++ b/examples/scclang/simple/allreduce_ring.py @@ -0,0 +1,40 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import argparse + +from sccl.language import * +from sccl.topologies import * +from sccl.collectives import * +from sccl.language.collectives import AllReduce + + +def allreduce_ring(size, instances, threadblocks): + topology = fully_connected(size) + collective = AllReduce(size, size, inplace=True) + with SCCLProgram("allreduce_ring_inplace", topology, collective, instances, threadblocks): + for r in range(size): + index = r + c = chunk(r, Buffer.input, index) + next = (r + 1) % size + # Chunk travels around the ring being reduced + while next != r: + c = c.reduce(next, buffer=Buffer.input, index=r) + next = (next + 1) % size + + # Send the fully reduced chunk around the ring + while next != (r - 1) % size: + c = c.send(next, buffer=Buffer.input, index=r) + next = (next + 1) % size + + Check() + XML() + +parser = argparse.ArgumentParser() +parser.add_argument('num_gpus', type=int, help ='number of gpus') +parser.add_argument('instances', type=int, help='number of instances') +parser.add_argument('threadblocks', type=int, help='number of threadblocks per instance') + +args = parser.parse_args() + +allreduce_ring(args.num_gpus, args.instances, args.threadblocks) diff --git a/examples/scclang/simple/custom_collective.py b/examples/scclang/simple/custom_collective.py new file mode 100644 index 0000000..5d18e78 --- /dev/null +++ b/examples/scclang/simple/custom_collective.py @@ -0,0 +1,85 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+
+# Example of a simple custom collective where Rank 0 sends a chunk to Ranks 1 and 2
+
+from sccl.language import *
+from sccl.topologies import *
+from sccl.language.collectives import Collective
+
+# For custom collectives you need to define a new collective class.
+# This is used by scclang to initialize buffers with chunks (pre-condition)
+# and provide a checker to check that chunks satisfy the post-condition of the collective.
+class CollEx(Collective):
+    # Initial state: chunk0 is on rank0 in the input buffer
+    def init_buffers(self):
+        chunks_per_node = self.chunk_factor
+        rank_buffers = []
+        for r in range(self.num_ranks):
+            input_buffer = [None] * chunks_per_node
+            output_buffer = [None] * chunks_per_node
+            if r == 0:
+                for c in range(chunks_per_node):
+                    # Format for specifying a chunk:
+                    # Chunk(starting rank, starting index, ending rank, ending index)
+                    # Because this chunk ends up on multiple ranks the ending rank is set to -1
+                    input_buffer[c] = Chunk(r, c, -1, c)
+            buffers = {Buffer.input : input_buffer,
+                       Buffer.output : output_buffer}
+            rank_buffers.append(buffers)
+        return rank_buffers
+
+
+    # Final state: chunk0 from rank0 is in the output buffer of rank1 and rank2
+    def check(self, prog):
+        correct = True
+        for r in range(1, self.num_ranks):
+            output = prog.buffers[r][Buffer.output]
+            for c in range(self.chunk_factor):
+                chunk = output[c]
+                # Check that we got chunk 0 from rank 0
+                if chunk is None or chunk.origin_rank != 0 or chunk.origin_index != 0:
+                    print(f'Rank {r} chunk {c} is incorrect should be ({0}, {0}) given {chunk}')
+                    correct = False
+        return correct
+
+
+def custom_example1():
+    # SCCLang programs take in a name for the program, the topology of the network,
+    # the collective being implemented, the chunksperloop of the collective, and optionally the NCCL protocol to be used
+    size = 3
+    topology = fully_connected(size)
+    # Collectives take in the number of ranks in the network, the chunksperloop of the collective, and whether it is inplace
+    collective = CollEx(size, 1, inplace=False)
+    with SCCLProgram("allgather_ring", topology, collective, 1, protocol="Simple"):
+        # Get the chunk at rank 0 index 0 of the input buffer
+        c = chunk(0, Buffer.input, 0)
+        # Send chunks to 1 and 2
+        # Can specify the sender's tb, receiver's tb, and channel for the send operation
+        # SCCLang provides a default threadblock assignment if they aren't specified
+        # SCCLang will also check the tb/channel combos are valid
+        c.send(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0)
+        c.send(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1)
+
+        XML() # Generates the XML for this collective
+        Check() # Checks the routes defined for each chunk are correct.
Currently doesn't check XML correct + +def custom_example2(): + + size = 3 + topology = fully_connected(size) + + collective = CollEx(size, 1, inplace=False) + with SCCLProgram("allgather_ring", topology, collective, 1, protocol="Simple"): + c = chunk(0, Buffer.input, 0) + # This is the same program as above but instead of rank 0 sending to 1 and 2 + # 0 sends to 1 which sends to 2 + # send returns the chunk on the receiver's side + c = c.send(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0) + c.send(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1) + + XML() + Check() + +custom_example1() +custom_example2() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ccbf125..1d14d8c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ z3-solver argcomplete lxml humanfriendly +igraph pytest pytest-cov pytest-xdist diff --git a/sccl/autosynth/ndv4_plans.py b/sccl/autosynth/ndv4_plans.py index efe10a1..079d0ba 100644 --- a/sccl/autosynth/ndv4_plans.py +++ b/sccl/autosynth/ndv4_plans.py @@ -1,9 +1,23 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from sccl.autosynth.registry import register_synthesis_plan +from sccl.autosynth.registry import register_synthesis_plan, register_sccl_program +from sccl.programs.allreduce_a100_ring import allreduce_ring +from sccl.programs.alltoall_a100_yifan import alltoall_hierarchical +from sccl.topologies import fully_connected +from sccl.language.ir import ThreadblockPolicy def register_ndv4_plans(): + + @register_sccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, + instances=4, protocol='LL128', threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) + def ndv4_ring_allreduce(prog, nodes): + allreduce_ring(size=8, channels=8) + + @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', machines=lambda x: x == 8 or x == 16 or x == 32) + def ndv4_alltoall(prog, nodes): + alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) + @register_synthesis_plan('alltoall', 'ndv4', machines=lambda x: x == 9) def synthesize_ndv4_hierarchical_alltoall(machines): xml = "" diff --git a/sccl/autosynth/registry.py b/sccl/autosynth/registry.py index 2a3f70e..9d85dca 100644 --- a/sccl/autosynth/registry.py +++ b/sccl/autosynth/registry.py @@ -8,6 +8,11 @@ import atexit import humanfriendly +from sccl.language import SCCLProgram, ir_to_xml +from sccl.language.ir import ThreadblockPolicy +import sccl.language.collectives as lang_collectives +from sccl.topologies import distributed_fully_connected + # The plans are keyed by (collective, machine_type) and each entry is a tuple # (name, function, machines, size_range, protocol, priority). 
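+# For example, after registration the ndv4 alltoall entry might look like
+# (values illustrative):
+#   synthesis_plans[('alltoall', 'ndv4')] ==
+#       [('run ndv4_alltoall', <wrapped fn>, <machines predicate>, None, 'LL128', 0)]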
synthesis_plans = defaultdict(list) @@ -55,3 +60,38 @@ def wrapped(machines): # Return the original function to not break other usage return fun return decorator + + +def register_sccl_program(local_topology, collective, machine_type, machines=lambda x: True, sizes=None, protocol='Simple', + chunk_factor=1, priority=0, collective_obj=None, instances=1, inplace=False, threadblock_policy=ThreadblockPolicy.auto): + def decorator(fun): + name = fun.__name__ + def wrapped(machines): + topology = distributed_fully_connected(local_topology, machines, 1) + co = collective_obj + if co == None: + if collective == 'allreduce': + co = lang_collectives.AllReduce(topology.num_nodes(), chunk_factor, inplace) + elif collective == 'allgather': + co = lang_collectives.AllGather(topology.num_nodes(), chunk_factor, inplace) + elif collective == 'alltoall': + co = lang_collectives.AllToAll(topology.num_nodes(), chunk_factor, inplace) + elif collective == 'reduce_scatter': + co = lang_collectives.ReduceScatter(topology.num_nodes(), chunk_factor, inplace) + else: + raise RuntimeError(f'No collective_obj in sccl.language.collectives known for "{collective}"') + prog = SCCLProgram(name, topology, co, instances, protocol, threadblock_policy) + with prog: + fun(prog, machines) + prog.check() + ef = ir_to_xml(prog.lower()) + fd, path = tempfile.mkstemp() + with os.fdopen(fd, 'w') as f: + f.write(ef) + atexit.register(os.remove, path) + return path + _register_ef_provider(f'run {name}', wrapped, collective, + machine_type, machines, sizes, protocol, priority) + # Return the original function to not break other usage + return fun + return decorator \ No newline at end of file diff --git a/sccl/language/__init__.py b/sccl/language/__init__.py new file mode 100644 index 0000000..bdfeff3 --- /dev/null +++ b/sccl/language/__init__.py @@ -0,0 +1,378 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from dataclasses import dataclass +from enum import Enum +import functools +from sccl.language.ir import * +from sccl.language.passes import * +from sccl.language.tb_assignment import * +from sccl.language.chunk import * +from sccl.language.buffer import * +from sccl.language.rank_dag import * +from sccl.language.visualize import * +import sccl.collectives as collectives + + +_current_program = None +def _curr(): + global _current_program + if _current_program == None: + raise RuntimeError("No Program in context") + return _current_program + +class SCCLProgram: + def __init__(self, name, topo, collective, instances, protocol='Simple', \ + threadblock_policy=ThreadblockPolicy.auto, interleaved_replication=True): + self.name = name + self.topo = topo + self.collective = collective + self.num_ranks = topo.num_nodes() + self.instances = instances + self.protocol = protocol + self.threadblock_policy = threadblock_policy + self.interleaved_replication = interleaved_replication + assert protocol == 'Simple' or protocol == 'LL' or protocol == 'LL128', \ + f'Given protocol: {protocol}. 
Must be either Simple, LL, LL128' + self.run_opt = True # Runs optimization passes + # Initialize the input buffers + self.chunk_dag = ChunkDAG() + self.buffers = collective.init_buffers() + self.rank_dag = RankDAG(self.num_ranks, self.buffers) + for r in range(self.num_ranks): + for index, chunk in enumerate(self.buffers[r][Buffer.input]): + ref = self.get_ref(r, Buffer.input, index, 1) + self.chunk_dag.init_chunk(chunk, ref) + + def __enter__(self): + global _current_program + if _current_program != None: + raise RuntimeError("There is already a SCCL Program in context") + _current_program = self + + def __exit__(self, exc_type, exc_value, exc_traceback): + global _current_program + if _current_program != self: + raise RuntimeError("This program is not currently in context") + _current_program = None + + def add_send(self, src, src_buffer, src_index, dst, dst_buffer, dst_index, size): + src_buffer, src_index = self.collective.get_buffer_index(src, src_buffer, src_index) + dst_buffer, dst_index = self.collective.get_buffer_index(dst, dst_buffer, dst_index) + sb = self.buffers[src][src_buffer] + db = self.buffers[dst][dst_buffer] + for i in range(size): + db[dst_index + i] = sb[src_index + i] + + def add_reduce(self, src, src_buffer, src_index, dst, dst_buffer, dst_index, size): + src_buffer, src_index = self.collective.get_buffer_index(src, src_buffer, src_index) + dst_buffer, dst_index = self.collective.get_buffer_index(dst, dst_buffer, dst_index) + sb = self.buffers[src][src_buffer] + db = self.buffers[dst][dst_buffer] + for i in range(size): + reduce_chunk = db[dst_index + i] + sent_chunk = sb[src_index + i] + db[dst_index + i] = reduce_chunk.reduce(sent_chunk) + + def get_ref(self, rank, buffer, index, size): + buffer, index = self.collective.get_buffer_index(rank, buffer, index) + return Ref(rank, buffer, index, size, self) + + def get_chunks(self, rank, buffer, index, size=1): + chunks = [None] * size + for i in range(index, index+size): + chunks[i-index] = self.buffers[rank][buffer][i] + return chunks + + def check_buffer_exists(self, rank, name): + if name not in self.buffers[rank]: + self.buffers[rank][name] = BufferSlice(Buffer.scratch, name) + + # Checks that all chunks that should be on each rank + # are present in the output buffer. 
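+    # Returns True iff the collective's postcondition holds; this is what the
+    # top-level Check() helper invokes.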
+    def check(self):
+        return self.collective.check(self)
+
+    # Lower program to XML
+    def lower(self):
+        self.chunk_dag._complete_metadata()
+        self.chunk_dag.lower_rank_dag(self.rank_dag)
+
+        self.rank_dag.optimize()
+        if self.threadblock_policy == ThreadblockPolicy.manual:
+            manual_assign_tbs(self.rank_dag)
+        else:
+            create_base_tbs(self.rank_dag)
+            auto_assign_tbs(self.rank_dag)
+        self.rank_dag.lower_pt1(self.instances)
+        gpu_prgms = self.rank_dag.lower_pt2(self.instances, self.interleaved_replication)
+        check_dependency_cycles(self.rank_dag.tbs)
+        check_threadblock_ordering(self.rank_dag)
+        return Program(self.name, self.collective.name, self.collective.inplace, self.protocol, gpu_prgms)
+
+    def print_chunk_dag(self):
+        visualize_chunk_dag(self.chunk_dag.chunk_paths)
+
+    def print_rank_dags(self, rank):
+        if rank == -1:
+            for r in range(len(self.ranks)):
+                visualize_rank_dag(self.rank_dags[r].operations)
+        else:
+            visualize_rank_dag(self.rank_dags[rank].operations)
+
+def Print():
+    _curr().print_chunk_dag()
+
+def chunk(rank, buffer, index, size=1):
+    return _curr().get_ref(rank, buffer, index, size)
+
+def create_scratch(rank, name):
+    return _curr().create_scratch(rank, name)
+
+def XML():
+    print(ir_to_xml(_curr().lower()))
+
+def Check():
+    return _curr().check()
+
+@dataclass
+class Ref(ChunkRef):
+    prog: SCCLProgram
+
+    def __repr__(self):
+        return f'Ref(Buffer:{self.buffer}, Index:{self.index}, Size:{self.size}, Rank:{self.rank})'
+
+    def _end(self):
+        return self.index + self.size
+
+    def _get_chunk(self, index):
+        return self.prog.buffers[self.rank][self.buffer][index]
+
+    def split(self, num):
+        assert (self.size % num == 0), f'Trying to split a chunk of {self.size} elements into {num} parts'
+        chunks = [None] * num
+        size = self.size // num
+        for i in range(num):
+            index = self.index + i * size
+            chunks[i] = self.prog.get_ref(self.rank, self.buffer, index, size)
+        return chunks
+
+    def group(self, other):
+        assert (self.rank == other.rank), f'Trying to concatenate chunks on ranks {self.rank} and {other.rank}'
+        assert (self.buffer == other.buffer), f'Trying to concatenate chunks in {self.buffer} and {other.buffer}'
+        if self.index < other.index:
+            first = self
+            second = other
+        else:
+            first = other
+            second = self
+
+        end = max(first._end(), second._end())
+        return Ref(self.rank, self.buffer, first.index, end - first.index, self.prog)
+
+
+    def send(self, dst, buffer=None, index=-1, sendtb=-1, recvtb=-1, ch=-1):
+        self.prog.check_buffer_exists(dst, buffer)
+
+        # If index is not specified assume it is going to the same place in the next gpu
+        if index == -1 and buffer == None:
+            index = self.index
+            buffer = self.buffer
+        elif index == -1 and buffer is not Buffer.input and buffer is not Buffer.output:
+            index = self.prog.buffers[dst][buffer].instance_size()
+
+        # Some inplace collectives have custom logic for buffers and index (ReduceScatter, AllGather)
+        buffer, index = self.prog.collective.get_buffer_index(self.rank, buffer, index)
+
+        # Direct send
+        assert (self.prog.topo.link(self.rank, dst) or dst == self.rank), f'No link from {self.rank} to {dst}'
+        dst_chunkref = self.prog.get_ref(dst, buffer, index, self.size)
+
+        self.prog.add_send(self.rank, self.buffer, self.index, dst, buffer, index, self.size)
+
+        chunks = self.prog.get_chunks(self.rank, self.buffer, self.index, self.size)
+        self.prog.chunk_dag.add_send(chunks, self, dst_chunkref, sendtb, recvtb, ch)
+
+        return dst_chunkref
+
+    def reduce(self, dst, buffer, index=-1, sendtb=-1, recvtb=-1, ch=0):
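+        # Reduces the chunk(s) referenced by this Ref into dst's buffer at
+        # index, and returns a Ref to the reduced chunk(s) on the receiver.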
self.prog.check_buffer_exists(dst, buffer) + + # Some inplace collectives have custom logic for buffers and index (ReduceScatter, AllGather) + buffer, index = self.prog.collective.get_buffer_index(self.rank, buffer, index) + + # Receive reduce copy + assert (self.prog.topo.link(self.rank, dst) or dst == self.rank), f'No link from {self.rank} to {dst}' + dst_chunkref = self.prog.get_ref(dst, buffer, index, self.size) + + chunks1 = self.prog.get_chunks(self.rank, self.buffer, self.index, self.size) + chunks2 = self.prog.get_chunks(dst, buffer, index, self.size) + + self.prog.add_reduce(self.rank, self.buffer, self.index, dst, buffer, index, self.size) + + reduce_chunks = self.prog.get_chunks(dst, buffer, index, self.size) + self.prog.chunk_dag.add_reduce(chunks1, chunks2, reduce_chunks, self, dst_chunkref, sendtb, recvtb, ch) + return dst_chunkref + + def get_origin_index(self, index=0): + return self._get_chunk(index + self.index).origin_index + + def get_origin_rank(self, index=0): + return self._get_chunk(index + self.index).origin_rank + + def get_dst_index(self, index=0): + return self._get_chunk(index + self.index).dst_index + + def get_dst_rank(self, index=0): + return self._get_chunk(index + self.index).dst_rank + + def print_chunk_info(self, index=0): + print(self._get_chunk(index + self.index)) + + +@dataclass +class ChunkOp(): + inst: ChunkInstruction + src: Ref # Ref Chunk acted on + dst: Ref # Ref Chunk created + sendtb: int = -1# For lowering to RankInstructions + recvtb: int = -1# For lowering to RankInstructions + ch: int = -1 # For lowering to RankInstructions + steps_from_start:int = -1 + steps_to_end: int = -1 + prev: list = field(default_factory=list) # Previous ChunkOps + next: list = field(default_factory=list) # Next ChunkOps + visited = False + num = -1 + + def __repr__(self): + return f'ChunkOp({self.inst} {self.dst.rank} {self.dst.buffer} {self.dst.index})' + + def __lt__(self, other): + return self.steps_from_start < other.steps_from_start + + def __hash__(self): + return hash((self.inst, self.dst.rank, self.dst.index, self.dst.buffer)) # TODO + +def same_slot(ref1, ref2): + return ref1.rank == ref2.rank and ref1.buffer == ref2.buffer and ref1.index == ref2.index + +# Returns if there is overlap between the refs +def overlap_refs(ref1, ref2): + same_location = ref1.rank == ref2.rank and ref1.buffer == ref2.buffer + contained1 = ref1.index >= ref2.index and (ref1.index + ref1.size) <= (ref2.index + ref2.size) + contained2 = ref2.index >= ref1.index and (ref2.index + ref2.size) <= (ref1.index + ref1.size) + return same_location and (contained1 or contained2) + +class ChunkDAG: + + def __init__(self): + self.chunks = [] + self.chunk_paths = {} # chunk -> ChunkOp. 
Stores the entry point to where every chunk is created + self.max_hops = -1 + + # Initialize the ChunkDAG with starting chunks + def init_chunk(self, chunk, ref): + op = ChunkOp(ChunkInstruction.start, None, ref, steps_from_start=-1) + self.chunks.append(chunk) + self.chunk_paths[chunk] = op + + def _find_prev_op_for_chunk(self, chunk, ref): + prev_op = None + frontier = [self.chunk_paths[chunk]] + while len(frontier) > 0: + current_op = frontier[0] + if overlap_refs(ref, current_op.dst): + prev_op = current_op + frontier = frontier[1:] + current_op.next + return prev_op + + def add_send(self, chunks, src, dst, sendtb, recvtb, ch): + # Find the previous operation for these chunks + prev_ops = [] + steps_from_start = 0 + for chunk in chunks: + prev_op = self._find_prev_op_for_chunk(chunk, src) + steps_from_start = max(steps_from_start, prev_op.steps_from_start) + prev_ops.append(prev_op) + op = ChunkOp(ChunkInstruction.send, src, dst, sendtb, recvtb, ch, steps_from_start+1) + + for prev_op in prev_ops: + prev_op.next.append(op) + op.prev = prev_ops + + def add_reduce(self, chunks1, chunks2, reduce_chunks, src, dst, sendtb, recvtb, ch): + # self.chunks.append(reduce_chunks) + prev_ops = [] + steps_from_start = 0 + # Find the previous operations that reduce builds off + for chunk1, chunk2 in zip(chunks1, chunks2): + prev_op_src = self._find_prev_op_for_chunk(chunk1, src) + prev_op_dst = self._find_prev_op_for_chunk(chunk2, dst) + steps_from_start = max(prev_op_src.steps_from_start, prev_op_dst.steps_from_start, steps_from_start) + prev_ops.append(prev_op_src) + prev_ops.append(prev_op_dst) + + op = ChunkOp(ChunkInstruction.reduce, src, dst, sendtb, recvtb, ch, steps_from_start+1) + + for prev_op in prev_ops: + prev_op.next.append(op) + op.prev.append(prev_op) + + # Reduce operations create new chunks, so keep a pointer to a new chunk + for rc in reduce_chunks: + self.chunk_paths[rc] = op + + def _complete_metadata(self): + def dfs(op): + if len(op.next) == 0: + op.steps_to_end = 0 + else: + for o in op.next: + dfs(o) + op.steps_to_end = functools.reduce(lambda cur, x: max(cur, x.steps_to_end+1), op.next, 0) + + for chunk, op in self.chunk_paths.items(): + if op.inst == ChunkInstruction.start: + dfs(op) + + def lower_rank_dag(self, rank_dag): + frontier = [] + visited = set() + + for chunk, op in self.chunk_paths.items(): + if len(op.prev) == 0: + heapq.heappush(frontier, op) + + while len(frontier) > 0: + op = heapq.heappop(frontier) + if op not in visited: + sendtb = op.sendtb + recvtb = op.recvtb + ch = op.ch + if op.inst == ChunkInstruction.start: + rank = op.dst.rank + rank_dag.add_start(rank, op.dst.buffer, op.dst.index, op.dst) + elif op.inst == ChunkInstruction.send: + sender = op.src.rank + receiver = op.dst.rank + if sender != receiver: + sop = rank_dag.add_send(sender, op.src, op.dst, op.steps_from_start*2, op.steps_to_end*2+1, sendtb, ch) + rop = rank_dag.add_recv(receiver, op.src, op.dst, op.steps_from_start*2+1, op.steps_to_end*2, recvtb, ch) + sop.match = [rop] + else: + rank_dag.add_copy(sender, op.src, op.dst, op.steps_from_start*2, op.steps_to_end*2, sendtb) + elif op.inst == ChunkInstruction.reduce: + sender = op.src.rank + receiver = op.dst.rank + if sender != receiver: + sop = rank_dag.add_send(sender, op.src, op.dst, op.steps_from_start*2,op.steps_to_end*2+1, sendtb, ch) + rop = rank_dag.add_recv_reduce_copy(receiver, op.src, op.dst, op.steps_from_start*2+1, op.steps_to_end*2, recvtb, ch) + sop.match = [rop] + else: + rank_dag.add_reduce(sender, op.src, op.dst, 
op.steps_from_start*2, op.steps_to_end*2, sendtb)
+
+                for o in op.next:
+                    heapq.heappush(frontier, o)
+                visited.add(op)
+        rank_dag.convert_set_list() # Pre-emptively convert sets to lists
\ No newline at end of file
diff --git a/sccl/language/buffer.py b/sccl/language/buffer.py
new file mode 100644
index 0000000..6f2a266
--- /dev/null
+++ b/sccl/language/buffer.py
@@ -0,0 +1,37 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+# Scratch buffer slice with manual indexing
+class BufferSlice:
+    def __init__(self, buf, name):
+        self.name = name
+        self.buf = buf
+        self.offset = -1 # Offset into the global scratch buffer
+        self.chunks = []
+
+    # Returns the global index into the scratch buffer
+    def get_global_index(self, index):
+        assert (self.offset > -1), 'set_offset needs to be called first'
+        return self.offset + index
+
+    def get_buffer(self):
+        return self.buf
+
+    def instance_size(self):
+        return len(self.chunks)
+
+    def set_offset(self, offset):
+        self.offset = offset
+
+    def __getitem__(self, index):
+        return self.chunks[index]
+
+    def __setitem__(self, index, value):
+        current_size = len(self.chunks)
+        while index > current_size:
+            self.chunks.append(None)
+            current_size = len(self.chunks)
+        if index == current_size:
+            self.chunks.append(value)
+        else:
+            self.chunks[index] = value
\ No newline at end of file
diff --git a/sccl/language/chunk.py b/sccl/language/chunk.py
new file mode 100644
index 0000000..6418b1c
--- /dev/null
+++ b/sccl/language/chunk.py
@@ -0,0 +1,59 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+
+from dataclasses import dataclass
+from sccl.language.ir import *
+
+@dataclass
+class Chunk:
+    origin_rank: int # Rank the chunk initially started at
+    origin_index: int # Index the chunk initially started at
+    dst_rank: int = -1
+    dst_index: int = -1
+
+    def reduce(self, chunk):
+        if type(chunk) is ReduceChunk:
+            return chunk.reduce(self)
+        elif type(chunk) is Chunk:
+            chunks = [self, chunk]
+            return ReduceChunk(chunks)
+        else:
+            assert False, "Trying to reduce with chunk of None"
+            return None
+
+    def __hash__(self):
+        return hash((self.origin_rank, self.origin_index))
+
+    def __eq__(self, other):
+        return type(other) is Chunk and self.origin_rank == other.origin_rank and self.origin_index == other.origin_index
+
+    def __lt__(self, other):
+        return self.origin_rank < other.origin_rank or \
+               (self.origin_rank == other.origin_rank and self.origin_index < other.origin_index)
+
+@dataclass
+class ReduceChunk:
+    chunks: list # List of chunks reduced
+
+    def reduce(self, chunk):
+        if type(chunk) is ReduceChunk:
+            chunks = self.chunks + chunk.chunks
+        elif type(chunk) is Chunk:
+            chunks = self.chunks + [chunk]
+        else:
+            assert False, "Trying to reduce with chunk of None"
+        return ReduceChunk(chunks)
+
+    def sort(self):
+        self.chunks.sort()
+
+    def __hash__(self):
+        return hash(tuple(self.chunks))
+
+    # Two reduce chunks are equal if they contain the same list of
+    # chunks being reduced
+    def __eq__(self, other):
+        self.sort()
+        other.sort()
+        return self.chunks == other.chunks
diff --git a/sccl/language/collectives.py b/sccl/language/collectives.py
new file mode 100644
index 0000000..bce39d2
--- /dev/null
+++ b/sccl/language/collectives.py
@@ -0,0 +1,222 @@
+from dataclasses import dataclass, field
+from sccl.language.ir import Buffer
+from sccl.language import *
+
+class Collective():
+    def __init__(self, num_ranks, chunk_factor, inplace):
+        self.num_ranks = num_ranks
+        self.chunk_factor = chunk_factor
self.inplace = inplace + self.name = "custom" + + def init_buffers(self): + pass + + def check(self, prog): + pass + + def get_buffer_index(self, rank, buffer, index): + return buffer, index + + +class AllToAll(Collective): + + def __init__(self, num_ranks, chunk_factor, inplace): + Collective.__init__(self, num_ranks, chunk_factor, inplace) + self.name = 'alltoall' + + def init_buffers(self): + chunks_per_node = self.num_ranks * self.chunk_factor + rank_buffers = [] + for r in range(self.num_ranks): + input_buffer = [None] * chunks_per_node + output_buffer = [None] * chunks_per_node + for index in range(chunks_per_node): + chunk = Chunk(r, index, index//self.chunk_factor, index % self.chunk_factor + r*self.chunk_factor) + input_buffer[index] = chunk + buffers = {Buffer.input : input_buffer, + Buffer.output : output_buffer} + rank_buffers.append(buffers) + return rank_buffers + + # Expected output buffer for alltoall + def check(self, prog): + chunks_per_node = self.num_ranks * self.chunk_factor + correct = True + for r in range(self.num_ranks): + output = prog.buffers[r][Buffer.output] + for i in range(self.num_ranks): + for ch in range(self.chunk_factor): + index = ch + i * self.chunk_factor + chunk = output[index] + expected_origin_index = ch + r * self.chunk_factor + if chunk is None or chunk.origin_rank != i or chunk.origin_index != expected_origin_index: + print(f'Rank {r} chunk {index} is incorrect should be chunk({i},{expected_origin_index}) given {chunk}') + correct = False + return correct + + +class AllGather(Collective): + def __init__(self, num_ranks, chunk_factor, inplace): + Collective.__init__(self, num_ranks, chunk_factor, inplace) + self.name = 'allgather' + + # Initializes input buffer for an allgather + def init_buffers(self): + rank_buffers = [] + if self.inplace: + # Inplace AllGather only uses the output buffer + for r in range(self.num_ranks): + output_buffer = [None] * (self.num_ranks * self.chunk_factor) + for ch in range(self.chunk_factor): + output_buffer[r*self.chunk_factor+ch] = Chunk(r, ch, -1, r*self.chunk_factor+ch) + buffers = {Buffer.input : output_buffer[r*self.chunk_factor:(r+1)*self.chunk_factor], + Buffer.output : output_buffer} + rank_buffers.append(buffers) + else: + for r in range(self.num_ranks): + input_buffer = [None] * self.chunk_factor + output_buffer = [None] * (self.num_ranks * self.chunk_factor) + for ch in range(self.chunk_factor): + input_buffer[ch] = Chunk(r, ch, -1, r*self.chunk_factor+ch) + buffers = {Buffer.input : input_buffer, + Buffer.output : output_buffer} + rank_buffers.append(buffers) + return rank_buffers + + # Expected output buffer for allgather + def check(self, prog): + correct = True + buf = Buffer.output + for r in range(self.num_ranks): + output = prog.buffers[r][buf] + for i in range(self.num_ranks): + for ch in range(self.chunk_factor): + index = i*self.chunk_factor + ch + chunk = output[index] + if chunk is None: + print(f'Rank {r} chunk {index} is incorrect should be ({i}, {ch}) given None') + correct = False + elif chunk.origin_rank != i or chunk.origin_index != ch: + print(f'Rank {r} chunk {index} is incorrect should be ({i}, {ch}) given ({chunk.origin_rank}, {chunk.origin_index})') + correct = False + return correct + + + def get_buffer_index(self, rank, buffer, index): + # For inplace AllGathers, the input buffer points into the output buffer + if self.inplace and buffer == Buffer.input: + return Buffer.output, index + rank * self.chunk_factor + else: + return buffer, index + + + +class AllReduce(Collective): + 
+ def __init__(self, num_ranks, chunk_factor, inplace): + Collective.__init__(self, num_ranks, chunk_factor, inplace) + self.name = 'allreduce' + + def init_buffers(self): + chunks_per_node = self.chunk_factor + rank_buffers = [] + for r in range(self.num_ranks): + input_buffer = [] + output_buffer = [None] * chunks_per_node + for c in range(chunks_per_node): + # Chunks start at rank r index c, and ends on all ranks (-1) at index r + input_buffer.append(Chunk(r, c, -1, c)) + # Input and output buffer are the same. + if self.inplace: + buffers = {Buffer.input : input_buffer, + Buffer.output : input_buffer} + else: + buffers = {Buffer.input : input_buffer, + Buffer.output : output_buffer} + rank_buffers.append(buffers) + return rank_buffers + + def check(self, prog): + chunks_per_node = self.chunk_factor + expected_chunks = [] + buf = Buffer.input if self.inplace else Buffer.output + + for c in range(chunks_per_node): + chunk = ReduceChunk([]) + for r in range(self.num_ranks): + chunk = chunk.reduce(Chunk(r, c)) + expected_chunks.append(chunk) + + correct = True + for r in range(self.num_ranks): + output = prog.buffers[r][buf] + for c in range(chunks_per_node): + chunk = output[c] + if chunk is None or chunk != expected_chunks[c]: + print(f'Rank {r} chunk {c} is incorrect should be ReduceChunk index {c} from all ranks, given {chunk}') + correct = False + return correct + + def get_buffer_index(self, rank, buffer, index): + if self.inplace and buffer == Buffer.output: + return Buffer.input, index + else: + return buffer, index + + +class ReduceScatter(Collective): + def __init__(self, num_ranks, chunk_factor, inplace): + Collective.__init__(self, num_ranks, chunk_factor, inplace) + self.name = 'reduce_scatter' + + def init_buffers(self): + rank_buffers = [] + for r in range(self.num_ranks): + if self.inplace: + input_buffer = [] + for i in range(self.num_ranks): + for c in range(self.chunk_factor): + input_buffer.append(Chunk(r, i*self.chunk_factor + c, i, c)) + buffers = {Buffer.input : input_buffer} + rank_buffers.append(buffers) + else: + input_buffer = [] + output_buffer = [None] * self.chunk_factor + for i in range(self.num_ranks): + for c in range(self.chunk_factor): + input_buffer.append(Chunk(r, i*self.chunk_factor + c, i, c)) + buffers = {Buffer.input : input_buffer, + Buffer.output : output_buffer} + rank_buffers.append(buffers) + return rank_buffers + + def check(self, prog): + expected_chunks = [] + buf = Buffer.input if self.inplace else Buffer.output + for c in range(self.num_ranks * self.chunk_factor): + chunk = ReduceChunk([]) + for r in range(self.num_ranks): + chunk = chunk.reduce(Chunk(r, c)) + expected_chunks.append(chunk) + + correct = True + for r in range(self.num_ranks): + output = prog.buffers[r][buf] + for c in range(self.chunk_factor): + correct_idx = r * self.chunk_factor + c + if self.inplace: + c = correct_idx + chunk = output[c] + if chunk is None or chunk != expected_chunks[correct_idx]: + print(f'Rank {r} chunk {c} is incorrect should be index {correct_idx} from all ranks given {chunk}') + correct = False + return correct + + def get_buffer_index(self, rank, buffer, index): + # For inplace ReduceScatter the output buffer is a pointer into the input buffer + if self.inplace and buffer == Buffer.output: + return Buffer.input, index + rank * self.chunk_factor + else: + return buffer, index + diff --git a/sccl/language/ir.py b/sccl/language/ir.py new file mode 100644 index 0000000..df8751d --- /dev/null +++ b/sccl/language/ir.py @@ -0,0 +1,313 @@ +# Copyright (c) 
Microsoft Corporation. +# Licensed under the MIT License. + +from lxml import etree as ET +from dataclasses import dataclass, field +from enum import Enum +from collections import defaultdict + + +@dataclass +class Program: + name: str + collective: str + inplace: bool + protocol: str + gpus: list = field(default_factory=list) + + +@dataclass +class Gpu: + rank: int + threadblocks: list = field(default_factory=list) + + +@dataclass +class Threadblock: + channel: int = -1 + send: int = -1 + recv: int = -1 + ops: list = field(default_factory=list) + + def __eq__(self, other): + return self is other + + def __hash__(self): + return id(self) + + +class ChunkInstruction(Enum): + start = 'start' + reduce = 'reduce' + send = 'send' + + def __str__(self): + return self.value + + +class ThreadblockPolicy(Enum): + auto = 'auto' + manual = 'manual' + + def __str__(self): + return self.value + + +class Instruction(Enum): + nop = 'nop' + send = 's' + recv = 'r' + recv_copy_send = 'rcs' + recv_reduce_send = 'rrs' + recv_reduce_copy = 'rrc' + recv_reduce_copy_send = 'rrcs' + copy = 'cpy' + reduce = 're' + delete = 'd' + start = 'st' + + def __str__(self): + return self.value + + +class Buffer(Enum): + input = 'i' + output = 'o' + scratch = 's' + + def __str__(self): + return self.value + + +@dataclass +class ChunkRef: + rank: int + buffer: Buffer + index: int + size: int + + def __hash__(self): + return hash((self.rank, self.buffer, self.index, self.size)) + + +@dataclass +class Op: + inst: Instruction + rank: int + src: ChunkRef + dst: ChunkRef + depends: list = field(default_factory=list) + step: int = -1 # Step in the TB + tb: int = -1 # TB this op is assigned to + prev: list = field(default_factory=list) + next: list = field(default_factory=list) + num: int = -1 + chunk_step: int = -1 + priority: int = -1 + match: list = field(default_factory=list) # This should be another Op + channel: int = -1 + + def cnt(self): + if self.src: + if self.dst: + assert self.src.size == self.dst.size + return self.src.size + elif self.dst: + return self.dst.size + else: + return 0 + + def is_send(self): + return self.inst == Instruction.send or \ + self.inst == Instruction.recv_reduce_copy_send or \ + self.inst == Instruction.recv_copy_send or \ + self.inst == Instruction.recv_reduce_send + + def is_recv(self): + return self.inst == Instruction.recv or \ + self.inst == Instruction.recv_reduce_copy or \ + self.inst == Instruction.recv_reduce_copy_send or \ + self.inst == Instruction.recv_copy_send or \ + self.inst == Instruction.recv_reduce_send + + def __eq__(self, other): + return self is other + + def __lt__(self, other): + # Ordering of operations + # 1. Lower chunk step 2. Higher priority 3. 
Lower src index + if self.chunk_step == other.chunk_step: + if self.priority == other.priority: + return self.src.index < other.src.index + return self.priority > other.priority + return self.chunk_step < other.chunk_step + + def __gt__(self, other): + return not self < other + + def __hash__(self): + return id(self) + + def __repr__(self): + return f'Op({self.inst}, {self.rank}, {self.src}, {self.dst}, step:{self.step}, tb:{self.tb})' + + +# Instructions where src is on local GPU +_local_src_insts = {Instruction.send, Instruction.copy, Instruction.reduce} +# Instructions where dst is on local GPU +_local_dst_insts = {Instruction.recv, Instruction.recv_copy_send, Instruction.recv_reduce_send, + Instruction.recv_reduce_copy, Instruction.copy, Instruction.reduce, + Instruction.recv_reduce_copy_send} + + +def ir_to_xml(program: Program, old_format=True, use_scratch=True, pretty_print=True): + # Figure out sizes of buffers based on usage + buffer_sizes = defaultdict(lambda: 0) + for gpu in program.gpus: + for tb in gpu.threadblocks: + for op in tb.ops: + if op.inst in _local_src_insts: + key = (gpu.rank, op.src.buffer) + buffer_sizes[key] = max( + buffer_sizes[key], op.src.index + op.src.size) + if op.inst in _local_dst_insts: + key = (gpu.rank, op.dst.buffer) + buffer_sizes[key] = max( + buffer_sizes[key], op.dst.index + op.dst.size) + + tb_id = {} + # Sort threadblocks in each GPU by peers and then the channel + # This is important as in NCCL threadblocks using the same NVLink concurrently should be close together + for gpu in program.gpus: + gpu.threadblocks = sorted(gpu.threadblocks, + key=lambda tb: (tb.send, tb.recv, tb.channel)) + for i, tb in enumerate(gpu.threadblocks): + tb_id[tb] = i + + # Filter out dependencies within the same threadblock + op_tb_id = {} + for gpu in program.gpus: + for tb in gpu.threadblocks: + for op in tb.ops: + op_tb_id[op] = tb_id[tb] + for gpu in program.gpus: + for tb in gpu.threadblocks: + for op in tb.ops: + op.depends = list( + filter(lambda dep: op_tb_id[dep] != tb_id[tb], op.depends)) + # Filter out redundant dependencies + # e.g. 
if op1 and op2 depend on op, and op1 happens before op2
+    # then op2 does not need to explicitly depend on op
+    for gpu in program.gpus:
+        for tb in gpu.threadblocks:
+            running_depends = []
+            for op in tb.ops:
+                op.depends = list(
+                    filter(lambda dep: dep not in running_depends, op.depends))
+                running_depends = running_depends + op.depends
+
+    # Mark all ops that have a dependence on them
+    has_dependence = set()
+    for gpu in program.gpus:
+        for tb in gpu.threadblocks:
+            for op in tb.ops:
+                has_dependence.update(op.depends)
+
+    # Do some additional postprocessing of operations:
+    # - Expand operations with extra dependencies with no-ops
+    # - Mark the index of each operation taking any extra no-ops into account
+    op_idx = {}
+    for gpu in program.gpus:
+        for tb in gpu.threadblocks:
+            new_ops = []
+            for op in tb.ops:
+                # Expand extra dependencies into nop operations
+                if len(op.depends) > 1:
+                    extra_deps = op.depends[1:]
+                    op.depends = op.depends[:1]
+                    for i, dep in enumerate(extra_deps):
+                        new_ops.append(Op(Instruction.nop, -1, None, None, [dep]))
+                        op_idx[new_ops[-1]] = len(new_ops) - 1
+                        #op_tb_id[new_ops[-1]] = op_tb_id[op]
+                new_ops.append(op)
+                op_idx[new_ops[-1]] = len(new_ops) - 1
+            tb.ops = new_ops
+
+    # Generate the XML structure
+    algo_elem = ET.Element('algo')
+    algo_elem.set('name', program.name)
+    algo_elem.set('proto', program.protocol)
+    algo_elem.set('nchannels', str(
+        1 + max(max(tb.channel for tb in gpu.threadblocks) for gpu in program.gpus)))
+    if old_format:
+        algo_elem.set('nchunksperloop', str(
+            max(max(buffer_sizes[(gpu.rank, Buffer.input)], buffer_sizes[(gpu.rank, Buffer.output)]) for gpu in program.gpus)))
+        algo_elem.set('ngpus', str(len(program.gpus)))
+        algo_elem.set('coll', program.collective)
+        algo_elem.set('inplace', str(1 if program.inplace else 0))
+    for gpu in program.gpus:
+        gpu_elem = ET.SubElement(algo_elem, 'gpu')
+        gpu_elem.set('id', str(gpu.rank))
+        gpu_elem.set('i_chunks', str(buffer_sizes[(gpu.rank, Buffer.input)]))
+        gpu_elem.set('o_chunks', str(buffer_sizes[(gpu.rank, Buffer.output)]))
+        gpu_elem.set('s_chunks', str(buffer_sizes[(gpu.rank, Buffer.scratch)]))
+        for tb in gpu.threadblocks:
+            tb_elem = ET.SubElement(gpu_elem, 'tb')
+            tb_elem.set('id', str(tb_id[tb]))
+            tb_elem.set('send', str(tb.send))
+            tb_elem.set('recv', str(tb.recv))
+            tb_elem.set('chan', str(tb.channel))
+            for op in tb.ops:
+                op_elem = ET.SubElement(
+                    tb_elem, 'op' if not old_format else 'step')
+                op_elem.set('step' if not old_format else 's', str(op_idx[op]))
+                op_elem.set('type', str(op.inst))
+
+                # The NCCL backend currently wants scratch at the end of output
+                if not use_scratch:
+                    if op.src.buffer == Buffer.scratch:
+                        op.src.buffer = Buffer.output
+                        op.src.index += buffer_sizes[(gpu.rank, Buffer.output)]
+                    if op.dst.buffer == Buffer.scratch:
+                        op.dst.buffer = Buffer.output
+                        op.dst.index += buffer_sizes[(gpu.rank, Buffer.output)]
+
+                if old_format:
+                    if op.src is not None:
+                        op_elem.set('srcbuf', str(op.src.buffer))
+                        op_elem.set('srcoff', str(op.src.index))
+                    else:
+                        op_elem.set('srcbuf', 'i')
+                        op_elem.set('srcoff', '-1')
+                    if op.dst is not None:
+                        op_elem.set('dstbuf', str(op.dst.buffer))
+                        op_elem.set('dstoff', str(op.dst.index))
+                    else:
+                        op_elem.set('dstbuf', 'o')
+                        op_elem.set('dstoff', '-1')
+                else:
+                    if op.is_send():
+                        if op.src is not None:
+                            op_elem.set('buf', str(op.src.buffer))
+                            op_elem.set('off', str(op.src.index))
+                    else:
+                        if op.dst is not None:
+                            op_elem.set('buf', str(op.dst.buffer))
+                            op_elem.set('off', str(op.dst.index))
+                if op.cnt() > 1 or old_format:
+                    op_elem.set('cnt', str(op.cnt()))
+                assert len(op.depends) <= 1
+                if len(op.depends) == 1:
+                    op_elem.set('depid', str(op_tb_id[op.depends[0]]))
+                    op_elem.set('deps', str(op_idx[op.depends[0]]))
+                elif old_format:
+                    op_elem.set('depid', '-1')
+                    op_elem.set('deps', '-1')
+                if op in has_dependence:
+                    op_elem.set('hasdep', '1')
+                elif old_format:
+                    op_elem.set('hasdep', '0')
+
+    if pretty_print:
+        ET.indent(algo_elem, space=' ')
+    return ET.tostring(algo_elem, encoding='unicode')
diff --git a/sccl/language/passes.py b/sccl/language/passes.py
new file mode 100644
index 0000000..be3eda3
--- /dev/null
+++ b/sccl/language/passes.py
@@ -0,0 +1,46 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import sys
+from sccl.language.ir import *
+
+# Check that there are no cyclic dependencies within a Rank
+def check_dependency_cycles(tbs):
+    for rank, rank_tbs in enumerate(tbs):
+        for tbid, tb in rank_tbs.items():
+            for op in tb.ops:
+                deps = op.depends
+                chain = [op]
+                # DFS to check for cycles
+                while len(deps) > 0:
+                    dep = deps[0]
+                    if dep in chain:
+                        print(f"Cyclic dependency in rank {rank} threadblock {tbid} at {op}")
+                        for op in chain:
+                            print("  ", op)
+                        sys.exit(1)
+                    next_depends = dep.depends
+                    if len(next_depends) > 0:
+                        chain.append(dep)
+                    else:
+                        chain = [op]
+                    deps = next_depends + deps[1:]
+
+
+# Check there are no ordering violations between threadblocks across ranks
+def check_threadblock_ordering(rank_dag):
+    for rank in range(rank_dag.num_ranks):
+        for tb in rank_dag.tbs[rank].values():
+            prev_steps = {} # tbid -> step of last recv from tbid
+            # Check that sends and their corresponding receives between two threadblocks
+            # happen in the same order.
+            for op_step, op in enumerate(tb.ops):
+                if op.is_send():
+                    match = op.match[0]
+                    if match.is_recv():
+                        assert op.dst.rank == match.rank, "Bug in SCCLang: Sends don't match receives"
+
+                    other_tbid = match.tb
+                    if other_tbid in prev_steps:
+                        assert match.step > prev_steps[other_tbid], f"Rank {rank} sends op1 then op2 but {match.rank} receives op2 then op1"
+                    prev_steps[other_tbid] = match.step
diff --git a/sccl/language/rank_dag.py b/sccl/language/rank_dag.py
new file mode 100644
index 0000000..7badea9
--- /dev/null
+++ b/sccl/language/rank_dag.py
@@ -0,0 +1,401 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
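+
+# RankDAG records, per (rank, buffer, index) slot, the DAG of operations that
+# touch that slot. The sends/receives added below get matched up, assigned to
+# threadblocks, and lowered into the per-GPU programs that ir_to_xml
+# serializes.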
diff --git a/sccl/language/rank_dag.py b/sccl/language/rank_dag.py
new file mode 100644
index 0000000..7badea9
--- /dev/null
+++ b/sccl/language/rank_dag.py
@@ -0,0 +1,401 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from dataclasses import dataclass
+from enum import Enum
+import heapq
+
+from sccl.language.ir import *
+from sccl.language.passes import *
+
+# Returns whether an operation writes to a particular slot
+def writes_to_slot(op, slot):
+    # If the instruction is a copy or reduce, check to see if the destination matches the slot
+    if op.inst == Instruction.copy or op.inst == Instruction.reduce:
+        cpy_src = op.src
+        _, buffer, index = slot
+        # The op writes the slot unless the slot lies inside the copy's source range
+        return buffer != cpy_src.buffer or index < cpy_src.index or index >= (cpy_src.index + cpy_src.size)
+    return op.inst != Instruction.send
+
+def remove_op(op):
+    for p in op.prev:
+        p.next.remove(op)
+        p.next += op.next
+
+    for n in op.next:
+        n.prev.remove(op)
+        n.prev = op.prev.union(n.prev)
+
+def same_tb(op1, op2):
+    return op1.tb == op2.tb
+
+def same_count(op1, op2):
+    return op1.cnt() == op2.cnt()
+
+def same_buf_dst(op1, op2):
+    return op1.dst.buffer == op2.dst.buffer and op1.dst.index == op2.dst.index
+
+class RankDAG:
+    def __init__(self, num_ranks, buffers):
+        self.num_ranks = num_ranks
+        self.buffers = buffers
+        self.slots = [] # slot = (rank, buffer, index)
+        self.operations = {} # slot -> operations
+        self.tbs = []
+        for _ in range(num_ranks):
+            self.tbs.append({})
+        self.tb_mapping = {}
+
+    def add_start(self, rank, buffer, index, ref):
+        slot = (rank, buffer, index)
+        self.slots.append(slot)
+
+        op = Op(Instruction.start, rank, ref, ref, next=set(), prev=set())
+        self.operations[slot] = op
+
+    # Find the last write to happen on this slot
+    def find_last_recv(self, slot):
+        def dfs(op):
+            # Found the last operation on the slot
+            if len(op.next) == 0:
+                return writes_to_slot(op, slot), op
+            else:
+                last_recvs = False
+                # Check if any of the children is the last write
+                for o in op.next:
+                    is_last_recv, recv_op = dfs(o)
+                    if is_last_recv:
+                        return True, recv_op
+                    last_recvs = last_recvs or is_last_recv
+                # Check if we are the last write
+                if writes_to_slot(op, slot) and not last_recvs:
+                    return True, op
+                return False, op
+
+        result, op = dfs(self.operations[slot])
+        assert result
+        return op
+
+    # Find the last set of operations that happened on this slot
+    # There may be multiple as sends can happen in parallel
+    def find_last_ops(self, slot):
+        frontier = [self.operations[slot]]
+        last_ops = []
+        while len(frontier) > 0:
+            op = frontier[0]
+            if len(op.next) == 0:
+                last_ops.append(op)
+            frontier = frontier[1:] + list(op.next)
+        return last_ops
+
+    def add_copy(self, rank, send_ref, recv_ref, step, priority, tb):
+        op = Op(Instruction.copy, rank, send_ref, recv_ref, chunk_step=step, priority=priority, next=set(), prev=set(), tb=tb)
+        dstbuffer = recv_ref.buffer
+        dstindex = recv_ref.index
+        srcbuffer = send_ref.buffer
+        srcindex = send_ref.index
+        size = recv_ref.size
+        prev_ops = []
+
+        # Sending part of copy
+        for i in range(srcindex, srcindex+size):
+            slot = (rank, srcbuffer, i)
+            prev_op = self.find_last_recv(slot) # All operations that need to happen before
+            prev_op.next.add(op)
+            op.prev.add(prev_op)
+
+        # Receiving part of copy
+        prev_ops = set()
+        for i in range(dstindex, dstindex+size):
+            slot = (rank, dstbuffer, i)
+            if slot in self.operations:
+                prev_ops.update(self.find_last_ops(slot)) # All operations that need to happen before
+            else:
+                self.operations[slot] = op
+
+        for prev_op in prev_ops:
+            if op not in prev_op.next:
+                prev_op.next.add(op)
+                op.prev.add(prev_op)
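For intuition about the slot bookkeeping used above, here is a minimal, self-contained sketch of the corrected writes_to_slot predicate (stub tuples instead of the real Op and ChunkRef classes; the stub names are assumptions for illustration only):

    from collections import namedtuple

    StubRef = namedtuple('StubRef', ['buffer', 'index', 'size'])
    StubOp = namedtuple('StubOp', ['inst', 'src'])

    def stub_writes_to_slot(op, slot):
        if op.inst in ('copy', 'reduce'):
            _, buffer, index = slot
            src = op.src
            return buffer != src.buffer or index < src.index or index >= src.index + src.size
        return op.inst != 'send'

    copy = StubOp('copy', StubRef('input', 4, 2))          # reads input slots 4-5
    assert not stub_writes_to_slot(copy, (0, 'input', 4))  # source slots are only read
    assert stub_writes_to_slot(copy, (0, 'input', 6))      # any other slot is written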
+    def add_reduce(self, rank, send_ref, recv_ref, step, priority, tb):
+        op = Op(Instruction.reduce, rank, send_ref, recv_ref, chunk_step=step, priority=priority, next=set(), prev=set(), tb=tb)
+        dstbuffer = recv_ref.buffer
+        dstindex = recv_ref.index
+        srcbuffer = send_ref.buffer
+        srcindex = send_ref.index
+        size = recv_ref.size
+        prev_ops = []
+
+        # Sending (source) side of the reduce
+        for i in range(srcindex, srcindex+size):
+            slot = (rank, srcbuffer, i)
+            prev_op = self.find_last_recv(slot) # All operations that need to happen before
+            prev_op.next.add(op)
+            op.prev.add(prev_op)
+
+        # Receiving (accumulating) side of the reduce
+        prev_ops = []
+        for i in range(dstindex, dstindex+size):
+            slot = (rank, dstbuffer, i)
+            if slot in self.operations:
+                prev_op = self.find_last_ops(slot)
+                prev_ops = prev_ops + prev_op # All operations that need to happen before
+
+        for prev_op in prev_ops:
+            if op not in prev_op.next:
+                prev_op.next.add(op)
+                op.prev.add(prev_op)
+
+    def add_send(self, rank, send_ref, recv_ref, step, priority, tb, ch):
+        op = Op(Instruction.send, rank, send_ref, recv_ref, chunk_step=step, priority=priority, next=set(), prev=set(), tb=tb, channel=ch)
+        buffer = send_ref.buffer
+        index = send_ref.index
+        size = send_ref.size
+        prev_ops = []
+        for i in range(index, index+size):
+            slot = (rank, buffer, i)
+            prev_op = self.find_last_recv(slot)
+            prev_ops.append(prev_op) # All operations that need to happen before
+
+        for prev_op in prev_ops:
+            if op not in prev_op.next:
+                prev_op.next.add(op)
+                op.prev.add(prev_op)
+        return op
+
+    def add_recv(self, rank, send_ref, recv_ref, step, priority, tb, ch):
+        op = Op(Instruction.recv, rank, send_ref, recv_ref, chunk_step=step, priority=priority, next=set(), prev=set(), tb=tb, channel=ch)
+        buffer = recv_ref.buffer
+        index = recv_ref.index
+        size = recv_ref.size
+
+        prev_ops = set()
+        for i in range(index, index+size):
+            slot = (rank, buffer, i)
+            if slot in self.operations:
+                slot_prev_ops = self.find_last_ops(slot) # All operations that need to happen before
+                prev_ops = prev_ops.union(slot_prev_ops)
+            else:
+                self.operations[slot] = op
+        if len(prev_ops) > 0:
+            for prev_op in prev_ops:
+                prev_op.next.add(op)
+                op.prev.add(prev_op)
+        return op
+
+    def add_recv_reduce_copy(self, rank, send_ref, recv_ref, step, priority, tb, ch):
+        op = Op(Instruction.recv_reduce_copy, rank, send_ref, recv_ref, chunk_step=step, priority=priority, next=set(), prev=set(), tb=tb, channel=ch)
+        buffer = recv_ref.buffer
+        index = recv_ref.index
+        size = recv_ref.size
+
+        prev_ops = set()
+        for i in range(index, index+size):
+            slot = (rank, buffer, i)
+            if slot in self.operations:
+                slot_prev_ops = self.find_last_ops(slot) # All operations that need to happen before
+                prev_ops = prev_ops.union(slot_prev_ops)
+            else:
+                self.operations[slot] = op
+        if len(prev_ops) > 0:
+            for prev_op in prev_ops:
+                prev_op.next.add(op)
+                op.prev.add(prev_op)
+        return op
+
+    def convert_set_list(self):
+        ops = []
+        for slot, op in self.operations.items():
+            if op.inst == Instruction.start:
+                op.next = list(op.next)
+                for o in op.next:
+                    ops.append(o)
+            elif op.inst != Instruction.copy:
+                ops.append(op)
+
+        visited = set()
+        i = 0
+        while i < len(ops):
+            op = ops[i]
+            if op not in visited:
+                visited.add(op)
+                op.next = list(op.next)
+                ops += op.next
+            i += 1
+
+    def optimize(self):
+        self._optimize_rrcs_rrs()
+        self._optimize_rcs()
+
+    # Given the set of operations that operate on a particular slot (rank, buffer, idx),
+    # try to replace pairs of operations with pipelined ops such as receive-copy-send (rcs),
+    # receive-reduce-send (rrs) and receive-reduce-copy-send (rrcs).
+    # Rules:
+    #   recv-copy-send:
+    #     recv(src, sbuf, si, _, _, _ ) send(_, _, _, dst, dbuf, di) -> recv_copy_send(src, sbuf, si, dst, dbuf, di)
+    def _optimize_rcs(self):
+        for slot, ops in self.operations.items():
+            frontier = [ops]
+            while len(frontier) > 0:
+                op = frontier[0]
+                if len(op.next) == 1:
+                    next_op = op.next[0]
+                    if op.inst == Instruction.recv and next_op.inst == Instruction.send and same_tb(op, next_op) and same_count(op, next_op) and same_buf_dst(op, next_op):
+                        op.inst = Instruction.recv_copy_send
+                        op.dst = next_op.dst
+                        op.match = op.match + next_op.match
+                        remove_op(next_op)
+                frontier = frontier[1:] + op.next
+
+    def _optimize_rrcs_rrs(self):
+        # RRC/S -> RRS
+        for slot, ops in self.operations.items():
+            frontier = [ops]
+            while len(frontier) > 0:
+                op = frontier[0]
+                if len(op.next) == 1:
+                    next_op = op.next[0]
+                    if len(next_op.next) == 1:
+                        nnext_op = next_op.next[0]
+                        if op.inst == Instruction.recv_reduce_copy and next_op.inst == Instruction.send and nnext_op.inst == Instruction.recv and same_tb(op, next_op) and same_count(op, next_op):
+                            op.inst = Instruction.recv_reduce_send
+                            op.dst = next_op.dst
+                            op.match = op.match + next_op.match
+                            remove_op(next_op)
+
+                    if op.inst == Instruction.recv_reduce_copy and next_op.inst == Instruction.send and same_tb(op, next_op) and same_count(op, next_op):
+                        op.inst = Instruction.recv_reduce_copy_send
+                        op.dst = next_op.dst
+                        op.match = op.match + next_op.match
+                        remove_op(next_op)
+                frontier = frontier[1:] + op.next
+
+    def lower_pt1(self, instances):
+        self.infer_dependencies()
+        self.lower_buffers(instances)
+
+    def lower_pt2(self, instances, interleaved):
+        self.replicate(instances, interleaved)
+        return self.lower_tbs()
+
+    def infer_dependencies(self):
+        for slot, ops in self.operations.items():
+            frontier = [ops]
+            while len(frontier) > 0:
+                op = frontier[0]
+                # The dependencies of every op are the ops stored in prev
+                # Filter out dependencies that are satisfied by tbs executing ops sequentially
+                # If there are multiple dependent ops from the same tb, keep the one that happens last
+                depends = {}
+                for dep_op in list(op.prev):
+                    if dep_op.inst != Instruction.start:
+                        tb = dep_op.tb
+                        if tb not in depends or dep_op.step > depends[tb].step:
+                            depends[tb] = dep_op
+                op.depends = list(depends.values())
+                frontier = frontier[1:] + op.next
+
+    # Convert local scratch buffers to index into one global scratch buffer
+    def lower_chunk(self, chunk):
+        if chunk.buffer is not Buffer.input and chunk.buffer is not Buffer.output:
+            buffer = self.buffers[chunk.rank][chunk.buffer].get_buffer()
+            index = self.buffers[chunk.rank][chunk.buffer].get_global_index(chunk.index)
+            return ChunkRef(chunk.rank, buffer, index, chunk.size)
+        return chunk
+
+    # Assigns each scratch buffer an offset into the global scratch buffer
+    def lower_buffers(self, instances):
+        for rank_buffers in self.buffers:
+            offset = 0
+            for key, buf in rank_buffers.items():
+                if key is not Buffer.input and key is not Buffer.output:
+                    buf.set_offset(offset)
+                    offset += buf.instance_size() * instances
+
+    # Preprocess the threadblocks for lowering into xml
+    def lower_tbs(self):
+        gpus = []
+        for rank, rank_tbs in enumerate(self.instanced_tbs):
+            lowered_tbs = {}
+            for tbid, tb in rank_tbs.items():
+                for op in tb.ops:
+                    op.src = self.lower_chunk(op.src)
+                    op.dst = self.lower_chunk(op.dst)
+                lowered_tbs[tbid] = tb
+            gpus.append(Gpu(rank, list(lowered_tbs.values())))
+        return gpus
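To make the rcs fusion rule above concrete, here is a deliberately simplified sketch (plain dicts instead of the real per-slot Op graph; the real pass walks op.next chains and also checks counts and destination buffers) of the rewrite it performs:

    def fuse_rcs(ops):
        # Collapse a recv immediately followed by a send on the same threadblock
        fused, i = [], 0
        while i < len(ops):
            a = ops[i]
            b = ops[i + 1] if i + 1 < len(ops) else None
            if b and a['inst'] == 'recv' and b['inst'] == 'send' and a['tb'] == b['tb']:
                fused.append({'inst': 'recv_copy_send', 'tb': a['tb']})
                i += 2
            else:
                fused.append(a)
                i += 1
        return fused

    assert [o['inst'] for o in fuse_rcs([{'inst': 'recv', 'tb': 0}, {'inst': 'send', 'tb': 0}])] == ['recv_copy_send']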
+
+    # Automatically replicates the algorithm instance number of times
+    # interleaved sets the replication policy
+    # if True chunks are split as: ChunkA ChunkB -> ChunkA0 ChunkA1 ... ChunkB0 ChunkB1 ...
+    # if False chunks are divided as ChunkA0 ChunkB0 ChunkA1 ChunkB1 ...
+    # For collectives where chunks are designated for a particular GPU (e.g. AllToAll)
+    # only interleaved replication will be correct
+    # Interleaved policy only supports single count sends/receives from the input/output buffer
+    # (multicount ops are fine between scratch)
+    def replicate(self, instances, interleaved):
+        if instances == 1:
+            self.instanced_tbs = self.tbs
+            return
+
+        self.instanced_tbs = []
+        for _ in range(self.num_ranks):
+            self.instanced_tbs.append({})
+
+        def is_scratch(buffer):
+            return buffer != Buffer.input and buffer != Buffer.output
+
+        def get_new_index(rank, buffer, index, size, i):
+            # Scratch buffers always use batched
+            if is_scratch(buffer):
+                buf_instance_len = self.buffers[rank][buffer].instance_size()
+                return buf_instance_len * i + index
+            # If this is operating on the input/output buffer then the replication strategy can be either interleaved or batched
+            # This is to fit with the semantics of certain collectives
+            elif interleaved:
+                return index * instances + i * size
+            else:
+                return len(self.buffers[rank][buffer]) * i + index
+
+        def get_instance_ref(ref):
+            iindex = get_new_index(ref.rank, ref.buffer, ref.index, ref.size, i)
+            iref = ChunkRef(ref.rank, ref.buffer, iindex, ref.size)
+            return iref
+
+        for i in range(instances):
+            # Generate all the threadblocks and ops
+            for rank, rank_tbs in enumerate(self.tbs):
+                rank_channels = self.num_channels[rank]
+                for tbid, tb in rank_tbs.items():
+                    instance_channel = rank_channels * i + tb.channel
+                    itb = Threadblock(instance_channel, tb.send, tb.recv)
+                    itbid = tbid * instances + i
+                    itb.ops = [None] * len(tb.ops)
+                    for s, op in enumerate(tb.ops):
+                        isrc = get_instance_ref(op.src)
+                        idst = get_instance_ref(op.dst)
+                        idepends = []
+                        # Note: We don't need to fill out the rest of the metadata since replication is the last optimization
+                        iop = Op(op.inst, op.rank, isrc, idst, idepends, op.step, itbid)
+                        itb.ops[s] = iop
+                    self.instanced_tbs[op.rank][itbid] = itb
+
+        # Redo dependency analysis
+        for rank, rank_tbs in enumerate(self.tbs):
+            for tbid, tb in rank_tbs.items():
+                for i in range(instances):
+                    itbid = tbid * instances + i
+                    itb = self.instanced_tbs[rank][itbid]
+                    for op, iop in zip(tb.ops, itb.ops):
+                        iop.depends = [None] * len(op.depends)
+                        for s, dep in enumerate(op.depends):
+                            dep_tbid = dep.tb
+                            dep_itbid = dep_tbid * instances + i
+                            dep_step = dep.step
+                            iop.depends[s] = self.instanced_tbs[op.rank][dep_itbid].ops[dep_step]
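The two replication layouts described in replicate's comment can be checked with a small worked example (assumed values: a 2-chunk buffer with chunks A=0 and B=1, two instances, size-1 sends), mirroring the arithmetic in get_new_index:

    instances, size, buf_len = 2, 1, 2
    interleaved = {(c, i): c * instances + i * size for c in range(buf_len) for i in range(instances)}
    batched = {(c, i): buf_len * i + c for c in range(buf_len) for i in range(instances)}
    assert interleaved == {(0, 0): 0, (0, 1): 1, (1, 0): 2, (1, 1): 3}  # A0 A1 B0 B1
    assert batched == {(0, 0): 0, (0, 1): 2, (1, 0): 1, (1, 1): 3}      # A0 B0 A1 B1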
diff --git a/sccl/language/tb_assignment.py b/sccl/language/tb_assignment.py
new file mode 100644
index 0000000..497633c
--- /dev/null
+++ b/sccl/language/tb_assignment.py
@@ -0,0 +1,185 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from dataclasses import dataclass
+from enum import Enum
+import heapq
+
+from sccl.language.ir import *
+from sccl.language.rank_dag import *
+
+
+def _verify_tb_op_compatible(tb, op):
+    s = op.dst.rank if op.is_send() else -1
+    r = op.src.rank if op.is_recv() else -1
+
+    sends_ok = tb.send == s or s == -1 or tb.send == -1
+    recvs_ok = tb.recv == r or r == -1 or tb.recv == -1
+    channel_ok = tb.channel == op.channel or tb.channel == -1 or op.channel == -1
+    return sends_ok and recvs_ok and channel_ok
+
+# Manual threadblock, channel assignment
+def manual_assign_tbs(rank_dag):
+    ops = []
+    for slot, op in rank_dag.operations.items():
+        if op.inst == Instruction.start:
+            for o in op.next:
+                if o.inst == Instruction.send or o.inst == Instruction.copy:
+                    heapq.heappush(ops, o)
+
+    rank_dag.num_channels = [1] * rank_dag.num_ranks
+    visited = set()
+    while len(ops) > 0:
+        op = heapq.heappop(ops)
+        if op not in visited:
+            visited.add(op)
+            rank = op.rank
+            tbid = op.tb
+            if tbid not in rank_dag.tbs[rank]:
+                rank_dag.tbs[rank][tbid] = Threadblock()
+            tb = rank_dag.tbs[rank][tbid]
+            if _verify_tb_op_compatible(tb, op):
+                tb.ops.append(op)
+                tb.channel = op.channel if op.channel != -1 else 0
+                tb.send = op.dst.rank if op.is_send() else tb.send
+                tb.recv = op.src.rank if op.is_recv() else tb.recv
+                op.step = len(tb.ops)-1
+                rank_dag.num_channels[rank] = max(op.channel+1, rank_dag.num_channels[rank])
+            else:
+                raise Exception(f"Illegal threadblock assignment. Trying to add {op} to threadblock {tbid}\n" \
+                    f"Threadblock {tbid} send:{tb.send} recv:{tb.recv} channel:{tb.channel}\n" \
+                    f"Operation send:{op.dst.rank if op.is_send() else -1} recv:{op.src.rank if op.is_recv() else -1} channel:{op.channel}")
+
+            for o in op.next:
+                heapq.heappush(ops, o)
+            for o in op.match:
+                heapq.heappush(ops, o)
+
+
+def _get_tb_options(mapping, send, recv, channel, num_tbs, num_channels):
+    if send == -1 and recv == -1: # Can go anywhere
+        return list(i for i in range(0, num_tbs))
+    if channel == -1: # Can go on any channel that matches to send, recv
+        options = []
+        for ch in range(num_channels):
+            if (send, recv, ch) in mapping:
+                options.append(mapping[(send, recv, ch)])
+        return options
+    elif (send, recv, channel) in mapping:
+        return [mapping[(send, recv, channel)]]
+    # Double up if necessary
+    else:
+        options = []
+        for requirements, tbid in mapping.items():
+            tb_s, tb_r, tb_c = requirements
+            sender_ok = send == -1 or tb_s == -1 or tb_s == send
+            receiver_ok = recv == -1 or tb_r == -1 or tb_r == recv
+            channel_ok = channel == -1 or channel == tb_c
+            if sender_ok and receiver_ok and channel_ok:
+                options.append(tbid)
+        return options
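As a quick illustration of _get_tb_options (the mapping contents here are assumed values; create_base_tbs below is what actually builds this mapping from the DAG):

    mapping = {(1, -1, 0): 0, (-1, 1, 0): 1}  # tb0 sends to rank 1, tb1 receives from rank 1
    # A send to rank 1 with no channel preference matches tb0 via channel 0:
    assert _get_tb_options(mapping, 1, -1, -1, 2, 1) == [0]
    # A purely local op (no send or recv peer) may go on any threadblock:
    assert _get_tb_options(mapping, -1, -1, -1, 2, 1) == [0, 1]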
+
+def create_base_tbs(rank_dag):
+    ops = []
+    tbid = [0] * rank_dag.num_ranks
+    tb_assignments = [] # rank -> (sender, receiver, channel) -> tbid
+    for _ in range(rank_dag.num_ranks):
+        tb_assignments.append({})
+    num_channels = [1] * rank_dag.num_ranks
+
+    for slot, op in rank_dag.operations.items():
+        if op.inst == Instruction.start:
+            for o in op.next:
+                ops.append(o)
+        elif op.inst != Instruction.copy:
+            ops.append(op)
+
+    visited = set()
+    i = 0
+    while i < len(ops):
+        op = ops[i]
+        if op not in visited:
+            visited.add(op)
+            rank = op.rank
+            s = op.dst.rank if op.is_send() else -1
+            r = op.src.rank if op.is_recv() else -1
+            channel = 0 if op.channel == -1 else op.channel
+            if op.channel >= num_channels[rank]:
+                num_channels[rank] = op.channel + 1
+
+            if (s != -1 or r != -1) and (s,r,channel) not in tb_assignments[rank]:
+                rank_dag.tbs[rank][tbid[rank]] = Threadblock(send=s, recv=r, channel=channel)
+                tb_assignments[rank][(s,r,channel)] = tbid[rank]
+                tbid[rank] += 1
+            ops += op.next
+        i += 1
+
+    rank_dag.tb_assignments = tb_assignments
+    rank_dag.num_channels = num_channels
+
+
+def auto_assign_tbs(rank_dag):
+    # Allocate the base set of TBs
+    tb_assignments = rank_dag.tb_assignments
+    num_channels = rank_dag.num_channels
+    current_num_tb = []
+    for rank_tbs in rank_dag.tbs:
+        current_num_tb.append(len(rank_tbs))
+    current_tb_step = []
+    for rank_tbs in rank_dag.tbs:
+        tb_step = {}
+        for tbid in rank_tbs.keys():
+            tb_step[tbid] = 0
+        current_tb_step.append(tb_step)
+
+    ops = []
+    for slot, op in rank_dag.operations.items():
+        if op.inst == Instruction.start:
+            for o in op.next:
+                if o.inst == Instruction.send or o.inst == Instruction.copy:
+                    heapq.heappush(ops, o)
+    heapq.heapify(ops)
+
+    visited = set()
+    while len(ops) > 0:
+        op = heapq.heappop(ops)
+        if op not in visited:
+            visited.add(op)
+            rank = op.rank
+            s = op.dst.rank if op.is_send() else -1
+            r = op.src.rank if op.is_recv() else -1
+            # Get all possible TBs this can be mapped to
+            tb_options = _get_tb_options(tb_assignments[rank], s, r, op.channel, current_num_tb[rank], num_channels[rank])
+            # If there are multiple options choose the TB at the lowest step
+            tbid = tb_options[0]
+            if len(tb_options) > 1:
+                for tbid_opt in tb_options:
+                    if current_tb_step[rank][tbid_opt] < current_tb_step[rank][tbid] and _verify_tb_op_compatible(rank_dag.tbs[rank][tbid_opt], op):
+                        tbid = tbid_opt
+
+            tb = rank_dag.tbs[rank][tbid]
+            assert _verify_tb_op_compatible(tb, op), f"Failing: Channel {op.channel}, send {s} recv {r} {op}\n" \
+                f"Threadblock send:{tb.send} recv:{tb.recv} channel:{tb.channel}"
+
+            tb.ops.append(op)
+            tb.send = op.dst.rank if op.is_send() else tb.send
+            tb.recv = op.src.rank if op.is_recv() else tb.recv
+
+            op.step = len(tb.ops)-1
+            op.channel = tb.channel
+            op.tb = tbid
+            current_tb_step[rank][tbid] = op.chunk_step
+
+            # For correctness make certain the matching sends and receives
+            # happen on the same channel
+            for match in op.match:
+                match.channel = tb.channel
+
+            for o in op.next:
+                heapq.heappush(ops, o)
+            for o in op.match:
+                heapq.heappush(ops, o)
\ No newline at end of file
diff --git a/sccl/language/visualize.py b/sccl/language/visualize.py
new file mode 100644
index 0000000..5ffca4e
--- /dev/null
+++ b/sccl/language/visualize.py
@@ -0,0 +1,103 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import igraph as ig
+from sccl.language.ir import *
+from sccl.language.rank_dag import *
+
+def visualize_chunk_dag(chunk_paths): # pragma: no cover
+    frontier = []
+    nnodes = 0
+    vertex_label = []
+    vertex_colors = []
+    edges = []
+    visited = set()
+
+    def add_node(op, nnodes, vertex_label, vertex_colors):
+        if op.num == -1:
+            op.num = nnodes
+            nnodes += 1
+            if op.inst == ChunkInstruction.start:
+                vertex_label.append(f'Start at {op.dst.rank}, {op.dst.index}.')
+                vertex_colors.append('yellow')
+            elif op.inst == ChunkInstruction.send:
+                vertex_label.append(f'Send to Rank {op.dst.rank} {op.dst.index}. {op.steps_to_end}, {op.steps_from_start}')
+                vertex_colors.append('blue')
+            elif op.inst == ChunkInstruction.reduce:
+                vertex_label.append(f'Reduce with {op.dst.rank} {op.dst.index}. 
{op.steps_to_end}, {op.steps_from_start}') + vertex_colors.append('green') + return nnodes + + for chunk, op in chunk_paths.items(): + if len(op.prev) == 0: + frontier.append(op) + + while len(frontier) > 0: + op = frontier[0] + if op in visited: + frontier = frontier[1:] + else: + nnodes = add_node(op, nnodes, vertex_label, vertex_colors) + for next_op in op.next: + nnodes = add_node(next_op, nnodes, vertex_label, vertex_colors) + edges.append([op.num, next_op.num]) + frontier = frontier[1:] + op.next + visited.add(op) + + g = ig.Graph(nnodes, edges, directed=True) + layout = g.layout(layout=ig.Graph.layout_grid) + ig.plot(g, vertex_label=vertex_label, vertex_color=vertex_colors, layout='auto') + +def visualize_rank_dag(operations): # pragma: no cover + frontier = [] + nnodes = 0 + vertex_label = [] + vertex_colors = [] + edges = [] + visited = set() + colors = ['red', 'green', 'blue', 'yellow', 'teal', 'pink', 'purple', 'orange'] + + def add_node(op, nnodes, vertex_label, vertex_colors): + if op.num == -1: + op.num = nnodes + nnodes += 1 + # Add new node to graph + if op.inst == Instruction.start: + vertex_label.append(f'Chunk {op.src.index} Rank {op.src.rank}') + elif op.inst == Instruction.send: + vertex_label.append(f'S to Rank {op.dst.rank}') + elif op.inst == Instruction.recv: + vertex_label.append(f'R from {op.src.rank}') + elif op.inst == Instruction.recv_reduce_copy: + vertex_label.append(f'RRC from {op.src.rank}') + else: + vertex_label.append(f'{op.inst}') + + # Add colors + if op.inst == Instruction.start: + vertex_colors.append('gray') + else: + vertex_colors.append(colors[op.tb % len(colors)]) + return nnodes + + for slot, op in operations.items(): + if len(op.prev) == 0: + frontier.append(op) + + while len(frontier) > 0: + op = frontier[0] + + if op in visited: + frontier = frontier[1:] + else: + nnodes = add_node(op, nnodes, vertex_label, vertex_colors) + + for next_op in op.next: + nnodes = add_node(next_op, nnodes, vertex_label, vertex_colors) + edges.append([op.num, next_op.num]) + frontier = frontier[1:] + list(op.next) + visited.add(op) + + g = ig.Graph(nnodes, edges, directed=True) + layout = g.layout(layout=ig.Graph.layout_grid) + ig.plot(g, vertex_label=vertex_label, vertex_color=vertex_colors, layout='rt') \ No newline at end of file diff --git a/sccl/programs/__init__.py b/sccl/programs/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/sccl/programs/__init__.py @@ -0,0 +1 @@ + diff --git a/sccl/programs/allreduce_a100_ring.py b/sccl/programs/allreduce_a100_ring.py new file mode 100644 index 0000000..a3feb20 --- /dev/null +++ b/sccl/programs/allreduce_a100_ring.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import argparse +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllReduce + +# Ring all reduce for A100s +# Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. 
+# channels=1 is standard ring, all chunks are assigned to the same tb/channel +# channels=8 devotes 1 tb/channel to handling 1 chunk of the data +def allreduce_ring(size, channels): + # Reduce ring + for step in range(0, size-1): + for index in range(0, size): + rank = (index + step) % size + c = chunk(rank, Buffer.input, index) + next_rank = (index + step + 1) % size + channel = index%channels + c = c.reduce(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) + # Propagate ring + for step in range(-1, size-2): + for index in range(0, size): + rank = (index + step) % size + c = chunk(rank, Buffer.input, index) + next_rank = (index + step + 1) % size + channel = index%channels + c = c.send(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) \ No newline at end of file diff --git a/sccl/programs/alltoall_a100_yifan.py b/sccl/programs/alltoall_a100_yifan.py new file mode 100644 index 0000000..730c560 --- /dev/null +++ b/sccl/programs/alltoall_a100_yifan.py @@ -0,0 +1,44 @@ +import argparse + +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllToAll + + +def alltoall_hierarchical(num_nodes, gpus_per_node): + num_ranks = num_nodes * gpus_per_node + + for n1 in range(num_nodes): + for r in range(1,num_nodes): + n2 = (n1 + r) % num_nodes + # print(f"r {r} n1 {n1} n2 {n2}") + + # Gather all local chunks for the node neighbor + for g1 in range(gpus_per_node): + rank1 = n1 * gpus_per_node + g1 + + for g2 in range(gpus_per_node): + rank2 = n1 * gpus_per_node + g2 + # chunk to send: g2 on n2 + index = n2 * gpus_per_node + g2 + c = chunk(rank1, Buffer.input, index) + c = c.send(rank2, f'send_{n2}') + + for r in range(1,num_nodes): + n2 = (n1 + r) % num_nodes + # IB send + for g1 in range(gpus_per_node): + rank = n1 * gpus_per_node + g1 + ib_peer = n2 * gpus_per_node + g1 + c = chunk(rank, f'send_{n2}', 0, 8) + c = c.send(ib_peer, Buffer.output, c.get_dst_index(), ch=(n2 % 8)*2+(rank%2)+2) + + + # Handle local chunks within a node + for rank in range(num_ranks): + for g in range(gpus_per_node): + index = (rank // gpus_per_node) * gpus_per_node + g + c = chunk(rank, Buffer.input, index) + c.send(c.get_dst_rank(), Buffer.output, c.get_dst_index()) + + diff --git a/tests/test_language.py b/tests/test_language.py new file mode 100644 index 0000000..027a39c --- /dev/null +++ b/tests/test_language.py @@ -0,0 +1,240 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import sccl +from sccl.topologies import line, fully_connected +from sccl.language import * +from sccl.language.collectives import * +import os +import pytest + +class Send(Collective): + # Initial state is chunk0 is on rank0 in the input buffer + def init_buffers(self): + chunks_per_node = self.chunk_factor + rank_buffers = [] + for r in range(self.num_ranks): + input_buffer = [None] * chunks_per_node + output_buffer = [None] * chunks_per_node + if r == 0: + for c in range(chunks_per_node): + input_buffer[c] = Chunk(r, c, 2, c) + buffers = {Buffer.input : input_buffer, + Buffer.output : output_buffer} + rank_buffers.append(buffers) + return rank_buffers + + + # Final state chunk0 from rank0 is in the output buffer of rank2 + def check(self, prog): + correct = True + output = prog.buffers[2][Buffer.output] + for c in range(self.chunk_factor): + chunk = output[c] + # Check that we got chunk 0 from rank 0 + if chunk is None or chunk.origin_rank != 0 or chunk.origin_index != 0: + print(f'Rank 2 chunk {c} is incorrect should be ({0}, {0}) given {chunk}') + correct = False + return correct + +class Reduce(Collective): + # Initial state is chunk0,0 is on rank0 in the input buffer + # and chunk0,1 is on rank1 in the input buffer, etc. + def init_buffers(self): + chunks_per_node = self.chunk_factor + rank_buffers = [] + for r in range(self.num_ranks): + input_buffer = [None] * chunks_per_node + output_buffer = [None] * chunks_per_node + for c in range(chunks_per_node): + input_buffer[c] = Chunk(r, c, -1, c) + buffers = {Buffer.input : input_buffer, + Buffer.output : output_buffer} + rank_buffers.append(buffers) + return rank_buffers + + + # Final state rank2 has a fully reduced chunk from gpus 0, 1, and 2 + def check(self, prog): + expected_chunk = ReduceChunk([]) + for r in range(self.num_ranks): + expected_chunk = expected_chunk.reduce(Chunk(r, 0)) + + correct = True + chunk = prog.buffers[2][Buffer.input][0] + if chunk is None or chunk != expected_chunk: + print(f'Rank 2 chunk 0 is incorrect should be ReduceChunk index 0 from all ranks, given {chunk}') + correct = False + return correct + +def test_send(): + num_gpus = 3 + topology = line(num_gpus) + + chunksperloop = 1 + instances = 1 + collective = Send(num_gpus, chunksperloop, inplace=False) + with SCCLProgram("send", topology, collective, instances): + chunk(0, Buffer.input, 0).send(1, 'scratch').send(2, Buffer.output, 0) + assert Check() + +def test_reduce(): + num_gpus = 3 + topology = line(num_gpus) + + chunksperloop = 1 + instances = 1 + collective = Reduce(num_gpus, chunksperloop, inplace=True) + with SCCLProgram("reduce", topology, collective, instances): + chunk(0, Buffer.input, 0).reduce(1, Buffer.input, 0).reduce(2, Buffer.input, 0) + assert Check() + +def test_local_copy(): + num_gpus = 3 + topology = fully_connected(num_gpus) + + chunksperloop = 1 + instances = 1 + collective = Send(num_gpus, chunksperloop, inplace=False) + with SCCLProgram("cpy", topology, collective, instances): + chunk(0, Buffer.input, 0).send(2, 'scratch').send(2, Buffer.output, 0) + assert Check() + +def test_local_reduce(): + num_gpus = 3 + topology = line(num_gpus) + + chunksperloop = 1 + instances = 1 + collective = Reduce(num_gpus, chunksperloop, inplace=True) + with SCCLProgram("local-reduce", topology, collective, instances): + chunk(0, Buffer.input, 0).reduce(1, Buffer.input, 0).send(2, 'scratch', 0).reduce(2, Buffer.input, 0) + + XML() + assert Check() + +def test_scratch_buffers(): + num_gpus = 3 + topology = fully_connected(num_gpus) + + 
chunksperloop = num_gpus + instances = 1 + collective = AllReduce(num_gpus, chunksperloop, inplace=False) + with SCCLProgram("test", topology, collective, instances): + chunk(0, Buffer.input, 0).send(2, 'scratch', 2) + c = chunk(2, 'scratch', 2) + assert c.index == 2 + c = chunk(1, Buffer.input, 0).send(2, 'scratch') + assert c.index == 3 + XML() + +def test_allgather(): + topology = fully_connected(2) + collective = AllGather(2, 1, True) + with SCCLProgram("allgather", topology, collective, 1): + chunk(0, Buffer.input, 0).send(1, Buffer.output, 0) + chunk(1, Buffer.input, 0).send(0, Buffer.output, 1) + assert Check() + +def test_reducescatter(): + topology = fully_connected(2) + collective = ReduceScatter(2, 1, True) + with SCCLProgram("reducescatter", topology, collective, 1): + chunk(0, Buffer.input, 1).reduce(1, Buffer.input, 1) + chunk(1, Buffer.input, 0).reduce(0, Buffer.input, 0) + assert Check() + + +def test_alltoall(): + topology = fully_connected(2) + collective = AllToAll(2, 1, False) + with SCCLProgram("alltoall", topology, collective, 1): + chunk(0, Buffer.input, 0).send(0, Buffer.output, 0) + chunk(0, Buffer.input, 1).send(1, Buffer.output, 0) + chunk(1, Buffer.input, 0).send(0, Buffer.output, 1) + chunk(1, Buffer.input, 1).send(1, Buffer.output, 1) + assert Check() + +def test_allreduce(): + topology = fully_connected(2) + collective = AllReduce(2, 2, True) + with SCCLProgram("allreduce", topology, collective, 1): + chunk(0, Buffer.input, 0).reduce(1, Buffer.output, 0).send(0, Buffer.input, 0) + chunk(1, Buffer.input, 1).reduce(0, Buffer.input, 1).send(1, Buffer.input, 1) + assert Check() + +def test_instruction_fusion(): + topology = fully_connected(3) + collective = AllReduce(3, 3, True) + prgm = SCCLProgram("allreduce", topology, collective, 1, threadblock_policy=ThreadblockPolicy.manual) + with prgm: + c = chunk(0, Buffer.input, 0, 3).reduce(1, Buffer.input, 0,sendtb=0, recvtb=0).reduce(2, Buffer.input, 0, sendtb=0, recvtb=0) + c.send(0, Buffer.input, 0, sendtb=0, recvtb=0).send(1, Buffer.input, 0, sendtb=0, recvtb=0) + assert Check() + lowered_prgm = prgm.lower() + assert lowered_prgm.gpus[0].threadblocks[0].ops[0].inst == Instruction.send + assert lowered_prgm.gpus[0].threadblocks[0].ops[1].inst == Instruction.recv_copy_send + assert lowered_prgm.gpus[1].threadblocks[0].ops[0].inst == Instruction.recv_reduce_send + assert lowered_prgm.gpus[1].threadblocks[0].ops[1].inst == Instruction.recv + assert lowered_prgm.gpus[2].threadblocks[0].ops[0].inst == Instruction.recv_reduce_copy_send + +def test_replication(): + topology = fully_connected(2) + collective = AllToAll(2, 1, False) + prgm = SCCLProgram("alltoall", topology, collective, 1) + with prgm: + chunk(0, Buffer.input, 0).send(0, Buffer.output, 0) + chunk(0, Buffer.input, 1).send(1, Buffer.output, 0) + chunk(1, Buffer.input, 0).send(0, Buffer.output, 1) + chunk(1, Buffer.input, 1).send(1, Buffer.output, 1) + + instances = 2 + replicated_prgm = SCCLProgram("alltoall", topology, collective, instances) + with replicated_prgm: + chunk(0, Buffer.input, 0).send(0, Buffer.output, 0) + chunk(0, Buffer.input, 1).send(1, Buffer.output, 0) + chunk(1, Buffer.input, 0).send(0, Buffer.output, 1) + chunk(1, Buffer.input, 1).send(1, Buffer.output, 1) + + lowered_prgm = prgm.lower() + lowered_replicated_prgm = replicated_prgm.lower() + + for gpu1, gpu2 in zip(lowered_prgm.gpus, lowered_replicated_prgm.gpus): + assert len(gpu1.threadblocks) * instances == len(gpu2.threadblocks) + +def test_illegal_tb_assignment(): + num_gpus = 3 + 
topology = fully_connected(num_gpus) + collective = AllToAll(num_gpus, 1, False) + prgm = SCCLProgram("alltoall", topology, collective, 1, threadblock_policy=ThreadblockPolicy.manual) + with prgm: + with pytest.raises(Exception): + # Cannot send to two different gpus on the same threadblock + chunk(0, Buffer.input, 0).send(1, Buffer.output, 0, sendtb=0, recvtb=1) + chunk(0, Buffer.input, 1).send(2, Buffer.output, 0, sendtb=0, recvtb=1) + XML() + +def test_registered_alltoall(): + from sccl.programs.alltoall_a100_yifan import alltoall_hierarchical + + num_nodes = 4 + gpus_per_node = 8 + num_ranks = num_nodes * gpus_per_node + topology = fully_connected(num_ranks) + collective = AllToAll(num_ranks, 1, inplace=False) + with SCCLProgram("hierarchical_all_to_all", topology, collective, 1): + alltoall_hierarchical(num_nodes, gpus_per_node) + assert Check() + +def test_registered_allreduce(): + from sccl.programs.allreduce_a100_ring import allreduce_ring + + num_ranks = 8 + instances = 4 + topology = fully_connected(num_ranks) + collective = AllReduce(num_ranks, num_ranks, inplace=True) + with SCCLProgram(f"allreduce", topology, collective, instances, + protocol="LL128", threadblock_policy=ThreadblockPolicy.manual): + allreduce_ring(num_ranks, num_ranks) + assert Check() + XML() \ No newline at end of file From 0a97e877ebde70d3b08ad2abb266660bbd55290b Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Mon, 7 Feb 2022 17:59:01 -0800 Subject: [PATCH 079/135] Speedup compilation (#11) --- sccl/language/rank_dag.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sccl/language/rank_dag.py b/sccl/language/rank_dag.py index 7badea9..37ed867 100644 --- a/sccl/language/rank_dag.py +++ b/sccl/language/rank_dag.py @@ -217,15 +217,15 @@ def convert_set_list(self): ops.append(op) visited = set() - i = 0 - while i < len(ops): - op = ops[i] + while len(ops) > 0: + op = ops[0] if op not in visited: visited.add(op) op.next = list(op.next) - ops += op.next - i += 1 - + ops = ops[1:] + op.next + else: + ops = ops[1:] + def optimize(self): self._optimize_rrcs_rrs() self._optimize_rcs() From 5deb88bfd34bfb2b8e8ac20896b891755e7f63de Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Mon, 7 Feb 2022 18:03:29 -0800 Subject: [PATCH 080/135] Set NCCL_IB_AR_THRESHOLD for large ndv4 alltoalls --- sccl/autosynth/__init__.py | 16 ++++++++++++---- tests/test_autosynth.py | 7 +++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index a917566..c178c8e 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -53,7 +53,9 @@ def init(machine_type, num_machines, *collectives): sizes = humanfriendly.parse_size(sizes) sizes = (sizes, sizes+1) candidates = synthesis_plans[(name, machine_type)] - selected_plans[name] = _select_plans(name, candidates, num_machines, sizes) + plans = _select_plans(name, candidates, num_machines, sizes) + if len(plans) > 0: + selected_plans[name] = plans # Execute the plans to find or synthesize the algorithms and format them in the XML format expected by SCCL-RT. 
algos_elem = ET.Element('sccl_algos') @@ -76,10 +78,16 @@ def init(machine_type, num_machines, *collectives): fd, path = tempfile.mkstemp() with os.fdopen(fd, 'w') as f: f.write(ET.tostring(algos_elem, encoding='unicode')) - os.environ.update({ + + # Set environment variables + env = { 'SCCL_CONFIG': path, - 'NCCL_NET_SHARED_BUFFERS': '0' - }) + 'NCCL_NET_SHARED_BUFFERS': '0', + } + if machine_type == 'ndv4' and num_machines >= 16 and 'alltoall' in selected_plans: + print(f'SCCL: Setting NCCL_IB_AR_THRESHOLD=0 (reason: alltoall and at least 16 ndv4 machines)') + env['NCCL_IB_AR_THRESHOLD'] = '0' + os.environ.update(env) else: print(f'SCCL: No algorithms were selected.') diff --git a/tests/test_autosynth.py b/tests/test_autosynth.py index 5c7e86a..7fcf03f 100644 --- a/tests/test_autosynth.py +++ b/tests/test_autosynth.py @@ -12,14 +12,21 @@ def test_sccl_init(capsys): out, err = capsys.readouterr() assert 'No plan found' in out assert not 'SCCL_CONFIG' in os.environ + sccl.init('ndv2', 2, ('alltoall', '1MB')) out, err = capsys.readouterr() assert 'synthesize_ndv2_relay_alltoall' in out assert 'SCCL_CONFIG' in os.environ + sccl.init('ndv4', 9, (sccl.Collective.alltoall, '1MB')) out, err = capsys.readouterr() assert 'synthesize_ndv4_hierarchical_alltoall' in out + sccl.init('ndv4', 16, (sccl.Collective.alltoall, '1MB')) + out, err = capsys.readouterr() + assert 'ndv4_alltoall' in out + assert 'NCCL_IB_AR_THRESHOLD' in os.environ + def test_register_plan(): @register_synthesis_plan('allgather', 'fancy_machine', sizes=(0, '4MB')) From 4226aefb3b423076313d20cdc8c2fe3cd6ab5fd9 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 9 Feb 2022 13:29:30 -0800 Subject: [PATCH 081/135] adding a dockerfile for pytorch + msccl + sccl --- dockerfiles/Dockerfile | 96 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 dockerfiles/Dockerfile diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile new file mode 100644 index 0000000..323b58c --- /dev/null +++ b/dockerfiles/Dockerfile @@ -0,0 +1,96 @@ +FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-devel + +############################################################################## +# Temporary Installation Directory +############################################################################## +ENV STAGE_DIR=/tmp +RUN mkdir -p ${STAGE_DIR} + + +############################################################################## +# Installation/Basic Utilities +############################################################################## +RUN apt-get update && \ + apt-get install -y --allow-change-held-packages --no-install-recommends \ + software-properties-common \ + build-essential autotools-dev cmake g++ gcc \ + openssh-client openssh-server \ + nfs-common pdsh curl sudo net-tools \ + vim iputils-ping wget perl unzip + +############################################################################## +# Installation Latest Git +############################################################################## +RUN add-apt-repository ppa:git-core/ppa -y && \ + apt-get update && \ + apt-get install -y git && \ + git --version + +############################################################################## +# Pip +############################################################################## +# pip version <= 20.1.1 is needed for the ruamel.yaml installation conflict +# between conda and pip. ruamel.yaml is needed by azureml. +# https://github.com/Azure/MachineLearningNotebooks/issues/1110 for more info. 
+ENV PIP_VERSION=20.1.1 +RUN conda install -y pip=${PIP_VERSION} && \ + # Print python and pip version + python -V && pip -V + +############################################################################## +# MPI +############################################################################## +RUN cd ${STAGE_DIR} && mkdir openmpi/ && cd openmpi && wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.1.tar.gz && \ + tar zxf openmpi-4.0.1.tar.gz && \ + cd openmpi-4.0.1 && \ + ./configure --enable-orterun-prefix-by-default && \ + make -j $(nproc) all && \ + make install && \ + ldconfig && \ + rm -rf ${STAGE_DIR}/openmpi/ + +############################################################################## +# SCCL +############################################################################## + +# update NCCL in pytorch, install SCCL interpreter +RUN pip uninstall torch -y + +RUN pip install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses + +RUN conda install -c pytorch magma-cuda111 -y + +ENV CMAKE_PREFIX_PATH=/opt/conda + +# Change NCCL to SCCL Runtime version 0.3.1 +RUN cd ${STAGE_DIR}/ && \ + git clone https://github.com/pytorch/pytorch.git && \ + cd pytorch && \ + git checkout tags/v1.9.0 -b v1.9.0_sccl && \ + perl -p -i -e 's/url = https:\/\/github\.com\/NVIDIA\/nccl/url = https:\/\/github\.com\/microsoft\/msccl/g' .gitmodules && \ + git submodule sync third_party/nccl && \ + git submodule update --init --recursive && \ + git submodule update --init --recursive --remote third_party/nccl && \ + cd third_party/nccl/nccl/ && \ + git checkout master && \ + cd ../../../ && \ + git apply third_party/nccl/nccl/patches/nccl.cpp.patch && \ + python setup.py install && \ + cd ${STAGE_DIR} && \ + rm -rf ${STAGE_DIR}/pytorch + +# Install SCCL Synthesizer version 2.1.2 +RUN cd ${STAGE_DIR}// && \ + git clone https://github.com/microsoft/sccl.git && \ + cd sccl/ && python setup.py install && \ + cd ${STAGE_DIR} && \ + rm -rf ${STAGE_DIR}/sccl/ + +############################################################################## +# inspector-topo +############################################################################## + +RUN apt-get install libibverbs-dev libnuma-dev -y +RUN cd ${STAGE_DIR}/ && git clone https://github.com/microsoft/inspector-topo.git && \ + cd inspector-topo/ && make && make install + From 5df3fbe349a925cd5ecdd0d42783bab45c2529da Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 9 Feb 2022 14:28:04 -0800 Subject: [PATCH 082/135] Fix sccl.init to set NCCL_ALGOS (#12) * Make sccl.init set NCCL_ALGOS * Add tests for NCCL_ALGOS logic --- sccl/autosynth/__init__.py | 6 ++++++ tests/test_autosynth.py | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index c178c8e..ad67263 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -84,6 +84,12 @@ def init(machine_type, num_machines, *collectives): 'SCCL_CONFIG': path, 'NCCL_NET_SHARED_BUFFERS': '0', } + if 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] != '': + existing_algos = os.environ['NCCL_ALGOS'] + if 'SCCL' not in existing_algos: + os.environ['NCCL_ALGOS'] = 'SCCL,' + existing_algos + else: + env['NCCL_ALGOS'] = 'SCCL,RING,TREE' if machine_type == 'ndv4' and num_machines >= 16 and 'alltoall' in selected_plans: print(f'SCCL: Setting NCCL_IB_AR_THRESHOLD=0 (reason: alltoall and at least 16 ndv4 machines)') env['NCCL_IB_AR_THRESHOLD'] = '0' os.environ.update(env) else: print(f'SCCL: No algorithms were selected.') diff --git 
a/tests/test_autosynth.py b/tests/test_autosynth.py index 7fcf03f..34bdb48 100644 --- a/tests/test_autosynth.py +++ b/tests/test_autosynth.py @@ -17,15 +17,20 @@ def test_sccl_init(capsys): out, err = capsys.readouterr() assert 'synthesize_ndv2_relay_alltoall' in out assert 'SCCL_CONFIG' in os.environ + assert 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] == 'SCCL,RING,TREE' + os.environ['NCCL_ALGOS'] = 'RING' sccl.init('ndv4', 9, (sccl.Collective.alltoall, '1MB')) out, err = capsys.readouterr() assert 'synthesize_ndv4_hierarchical_alltoall' in out + assert 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] == 'SCCL,RING' + os.environ['NCCL_ALGOS'] = 'HELLO,SCCL,WORLD' sccl.init('ndv4', 16, (sccl.Collective.alltoall, '1MB')) out, err = capsys.readouterr() assert 'ndv4_alltoall' in out assert 'NCCL_IB_AR_THRESHOLD' in os.environ + assert 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] == 'HELLO,SCCL,WORLD' def test_register_plan(): From 3e89bb076f3d4a3b5dd6c00f8594ace42c7f7bfa Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 9 Feb 2022 15:08:14 -0800 Subject: [PATCH 083/135] Set sizes for algorithm registrations (#13) * Set sizes for algorithm registrations * Fix tests for removal of legacy algorithm * Make NCCL_ALGOS logic more robust * Add additional registration for 64 NDV4 nodes --- sccl/autosynth/__init__.py | 2 +- sccl/autosynth/ndv2_plans.py | 2 +- sccl/autosynth/ndv4_plans.py | 102 +++-------------------------------- tests/test_autosynth.py | 12 +++-- 4 files changed, 17 insertions(+), 101 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index ad67263..8b9632e 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -86,7 +86,7 @@ def init(machine_type, num_machines, *collectives): } if 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] != '': existing_algos = os.environ['NCCL_ALGOS'] - if 'SCCL' not in existing_algos: + if 'SCCL' not in existing_algos.split(','): os.environ['NCCL_ALGOS'] = 'SCCL,' + existing_algos else: env['NCCL_ALGOS'] = 'SCCL,RING,TREE' diff --git a/sccl/autosynth/ndv2_plans.py b/sccl/autosynth/ndv2_plans.py index c7218ae..cbce8ae 100644 --- a/sccl/autosynth/ndv2_plans.py +++ b/sccl/autosynth/ndv2_plans.py @@ -10,7 +10,7 @@ def register_ndv2_plans(): - @register_synthesis_plan('alltoall', 'ndv2', machines=lambda x: x >= 2) + @register_synthesis_plan('alltoall', 'ndv2', sizes=('1MB', None), machines=lambda x: x >= 2) def synthesize_ndv2_relay_alltoall(machines): gather_coll = gather(8, 0) scatter_coll = scatter(8, 1) diff --git a/sccl/autosynth/ndv4_plans.py b/sccl/autosynth/ndv4_plans.py index 079d0ba..ea31dd9 100644 --- a/sccl/autosynth/ndv4_plans.py +++ b/sccl/autosynth/ndv4_plans.py @@ -10,104 +10,18 @@ def register_ndv4_plans(): @register_sccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, - instances=4, protocol='LL128', threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) + instances=4, protocol='LL128', sizes=('256KB', '20MB'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) def ndv4_ring_allreduce(prog, nodes): allreduce_ring(size=8, channels=8) - @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', machines=lambda x: x == 8 or x == 16 or x == 32) + @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '32MB'), machines=lambda x: x == 8 or x == 16 or x == 32) def ndv4_alltoall(prog, nodes): 
alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) - @register_synthesis_plan('alltoall', 'ndv4', machines=lambda x: x == 9) - def synthesize_ndv4_hierarchical_alltoall(machines): - xml = "" - nnodes = 9 - assert(machines == nnodes) - ngpuspernode = 8 - instances = 2 - nchunksperloop = nnodes*ngpuspernode*instances - xml += (''.format(nchunksperloop, 2*instances)) + '\n' - - def CrossNodeNghr(node, g): - nghrNode = g if node > g else g+1 - nghrG = node if nghrNode > node else node-1 - return nghrNode, nghrG, nghrNode * ngpuspernode + nghrG - for node in range(nnodes): - for g in range(ngpuspernode): - tbindex = 0 - nghrNode, nghrG, crossnodenghr = CrossNodeNghr(node,g) - xml += (' '.format(node*ngpuspernode+g, nchunksperloop, nchunksperloop, instances*2*ngpuspernode**2)) + '\n' - for ch in range(instances): - xml += (' '.format(tbindex, crossnodenghr, ch)) + '\n' - xml += (' '.format(ch*ngpuspernode**2, instances*ngpuspernode**2+ch*ngpuspernode**2, ngpuspernode**2, instances*(2+2*g)+ch, ngpuspernode)) + '\n' - xml += (' ') + '\n' - tbindex+=1 - for ch in range(instances): - xml += (' '.format(tbindex, crossnodenghr, ch)) + '\n' - xml += (' '.format(ch*ngpuspernode**2, instances*ngpuspernode**2+ch*ngpuspernode**2, ngpuspernode**2)) + '\n' - xml += (' ') + '\n' - tbindex+=1 - for withinnodenghr in range(ngpuspernode): - withinNghrNode, withinNghrG, withinCrossNodeNghr = CrossNodeNghr(node, withinnodenghr) - if withinnodenghr == g: - for ch in range(instances): - step = 0 - xml += (' '.format(tbindex)) + '\n' - xml += (' '.format(step, instances*nghrNode*ngpuspernode+ch*ngpuspernode, instances*g*ngpuspernode+ch*ngpuspernode, ngpuspernode, 1)) + '\n' - step += 1 - for j in range(ch*(ngpuspernode//instances), (ch+1)*(ngpuspernode//instances)): - for k in range(instances): - xml += (' '.format(step, (instances*(2*j+2+1)+k) if j < g else (instances*(2*j+2)+k), 0, 1 if step == 1+ngpuspernode-1 else 0)) + '\n' - step += 1 - xml += (' ') + '\n' - tbindex+=1 - else: - for ch in range(instances): - xml += (' '.format(tbindex, node*ngpuspernode+withinnodenghr, ch)) + '\n' - xml += (' '.format(instances*withinNghrNode*ngpuspernode+ch*ngpuspernode, instances*g*ngpuspernode+ch*ngpuspernode, ngpuspernode)) + '\n' - xml += (' ') + '\n' - tbindex+=1 - for ch in range(instances): - xml += (' '.format(tbindex, node*ngpuspernode+withinnodenghr, ch)) + '\n' - xml += (' '.format(instances*nghrNode*ngpuspernode+ch*ngpuspernode, instances*withinnodenghr*ngpuspernode+ch*ngpuspernode, ngpuspernode)) + '\n' - xml += (' ') + '\n' - tbindex+=1 + @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 8 or x == 16 or x == 32) + def ndv4_alltoall(prog, nodes): + alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) - # -------------------------------- - for withinnodenghr in range(ngpuspernode): - withinNghrNode, withinNghrG, withinCrossNodeNghr = CrossNodeNghr(node, withinnodenghr) - if withinnodenghr == g: - for ch in range(instances): - xml += (' '.format(tbindex)) + '\n' - step = 0 - xml += (' '.format(step, instances*(node*ngpuspernode+g)+ch, instances*(node*ngpuspernode+g)+ch, 1)) + '\n' - step += 1 - for j in range(ngpuspernode): - xml += (' '.format(step, instances*(ngpuspernode**2+j*ngpuspernode+g)+ch, instances*(nghrNode*ngpuspernode+j)+ch, 1, instances+(instances*(j*ngpuspernode+g)+ch)//((instances*ngpuspernode**2)//instances), 0)) + '\n' - step += 1 - xml += (' ') + '\n' - tbindex+=1 - else: - for ch in range(instances): - xml += (' 
'.format(tbindex, node*ngpuspernode+withinnodenghr, instances+ch)) + '\n' - step = 0 - xml += (' '.format(step, instances*(node*ngpuspernode+withinnodenghr)+ch, instances*(node*ngpuspernode+g)+ch, 1)) + '\n' - step += 1 - for j in range(ngpuspernode): - xml += (' '.format(step, instances*(ngpuspernode**2+j*ngpuspernode+withinnodenghr)+ch, instances*(nghrNode*ngpuspernode+j)+ch, 1, instances+(instances*(j*ngpuspernode+withinnodenghr)+ch)//((instances*ngpuspernode**2)//instances), 0)) + '\n' - step += 1 - xml += (' ') + '\n' - tbindex+=1 - for ch in range(instances): - xml += (' '.format(tbindex, node*ngpuspernode+withinnodenghr, instances+ch)) + '\n' - step = 0 - xml += (' '.format(step, instances*(node*ngpuspernode+g)+ch, instances*(node*ngpuspernode+withinnodenghr)+ch, 1)) + '\n' - step += 1 - for j in range(ngpuspernode): - xml += (' '.format(step, instances*(ngpuspernode**2+j*ngpuspernode+g)+ch, instances*(withinNghrNode*ngpuspernode+j)+ch, 1)) + '\n' - step += 1 - xml += (' ') + '\n' - tbindex+=1 - xml += (' ') + '\n' - xml += ('') + '\n' - return xml \ No newline at end of file + @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '128MB'), machines=lambda x: x == 64) + def ndv4_alltoall(prog, nodes): + alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) diff --git a/tests/test_autosynth.py b/tests/test_autosynth.py index 34bdb48..15e7d0a 100644 --- a/tests/test_autosynth.py +++ b/tests/test_autosynth.py @@ -12,6 +12,7 @@ def test_sccl_init(capsys): out, err = capsys.readouterr() assert 'No plan found' in out assert not 'SCCL_CONFIG' in os.environ + assert 'NCCL_ALGOS' not in os.environ sccl.init('ndv2', 2, ('alltoall', '1MB')) out, err = capsys.readouterr() @@ -19,14 +20,15 @@ def test_sccl_init(capsys): assert 'SCCL_CONFIG' in os.environ assert 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] == 'SCCL,RING,TREE' - os.environ['NCCL_ALGOS'] = 'RING' - sccl.init('ndv4', 9, (sccl.Collective.alltoall, '1MB')) + os.environ['NCCL_ALGOS'] = 'RING,FAKE_SCCL' + sccl.init('ndv4', 8, (sccl.Collective.alltoall, '2MB')) out, err = capsys.readouterr() - assert 'synthesize_ndv4_hierarchical_alltoall' in out - assert 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] == 'SCCL,RING' + assert 'ndv4_alltoall' in out + assert 'NCCL_IB_AR_THRESHOLD' not in os.environ + assert 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] == 'SCCL,RING,FAKE_SCCL' os.environ['NCCL_ALGOS'] = 'HELLO,SCCL,WORLD' - sccl.init('ndv4', 16, (sccl.Collective.alltoall, '1MB')) + sccl.init('ndv4', 16, (sccl.Collective.alltoall, '35MB')) out, err = capsys.readouterr() assert 'ndv4_alltoall' in out assert 'NCCL_IB_AR_THRESHOLD' in os.environ From 28f62704d9802c5185eaed2f5e4f9b7501684e52 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Wed, 9 Feb 2022 15:18:37 -0800 Subject: [PATCH 084/135] AllToAll (#14) Register 8k plus 1 alltoall --- sccl/autosynth/ndv4_plans.py | 5 ++ sccl/language/__init__.py | 2 +- sccl/programs/alltoall_a100_8kp1.py | 85 +++++++++++++++++++++++++++++ tests/test_language.py | 17 +++++- 4 files changed, 106 insertions(+), 3 deletions(-) create mode 100644 sccl/programs/alltoall_a100_8kp1.py diff --git a/sccl/autosynth/ndv4_plans.py b/sccl/autosynth/ndv4_plans.py index ea31dd9..e8063ba 100644 --- a/sccl/autosynth/ndv4_plans.py +++ b/sccl/autosynth/ndv4_plans.py @@ -4,6 +4,7 @@ from sccl.autosynth.registry import register_synthesis_plan, register_sccl_program from sccl.programs.allreduce_a100_ring import allreduce_ring from 
sccl.programs.alltoall_a100_yifan import alltoall_hierarchical +from sccl.programs.alltoall_a100_8kp1 import alltoall_three_step from sccl.topologies import fully_connected from sccl.language.ir import ThreadblockPolicy @@ -25,3 +26,7 @@ def ndv4_alltoall(prog, nodes): @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '128MB'), machines=lambda x: x == 64) def ndv4_alltoall(prog, nodes): alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) + + @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('128MB', None), machines=lambda x: x == 64) + def ndv4_alltoall(prog, nodes): + alltoall_three_step(num_nodes=nodes, gpus_per_node=8) diff --git a/sccl/language/__init__.py b/sccl/language/__init__.py index bdfeff3..77588ea 100644 --- a/sccl/language/__init__.py +++ b/sccl/language/__init__.py @@ -154,7 +154,7 @@ def split(self, num): size = self.size // num for i in range(num): index = self.index + i * size - chunks[i] = self.prog.get_ref(self.buffer, self.rank, index, size) + chunks[i] = self.prog.get_ref(self.rank, self.buffer, index, size) return chunks def group(self, other): diff --git a/sccl/programs/alltoall_a100_8kp1.py b/sccl/programs/alltoall_a100_8kp1.py new file mode 100644 index 0000000..38c03cd --- /dev/null +++ b/sccl/programs/alltoall_a100_8kp1.py @@ -0,0 +1,85 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllToAll + +def alltoall_three_step(num_nodes, gpus_per_node, instances=1, ib_connections=1): + num_ranks = num_nodes * gpus_per_node + + # (node, local gpu) to rank + # (n, g) => r + def RankFromNodeGpuPair(n, g): + return n*gpus_per_node + g + + # For cross node traffic from node n1 to node n2, returns the ranks of the + # gpus on n1 and n2 that handle that traffic. + def CrossNodeGpus(n1, n2): + def LocalRank(n1, n2): + return (n2 if n1 > n2 else n2-1) % gpus_per_node + r1 = RankFromNodeGpuPair(n1, LocalRank(n1, n2)) + r2 = RankFromNodeGpuPair(n2, LocalRank(n2, n1)) + return (r1, r2) + + # Groups chunk references into one large chunk reference (used for IB) + # Saves them under a key in the dictionary ib_chunks + def AddChunk(ib_chunks, key, c): + if key in ib_chunks: + ib_chunks[key] = ib_chunks[key].group(c) + else: + ib_chunks[key] = c + + + topology = fully_connected(num_ranks) + collective = AllToAll(num_ranks, instances, inplace=False) + + ib_chunks = {} # Keeps track of chunks going over IB; buffer name -> chunk + for n1 in range(num_nodes): + for g1 in range(gpus_per_node): + for ch in range(instances): + for n2 in range(num_nodes): + r1 = RankFromNodeGpuPair(n1, g1) + if (n1 != n2): + # Send over all chunks destined for that node to the peer gpu that handles chunks to that node + c = chunk(r1, Buffer.input, n2 * gpus_per_node * instances + ch * gpus_per_node, gpus_per_node) + # Gather chunks destined for cross node ranks in scratch to route through IB + gather_rank, _ = CrossNodeGpus(n1, n2) + buffer_key = (n1, n2) + # Send chunk to the gather_rank. Send returns a chunk reference to the + # receiver's chunk + c = c.send(gather_rank, buffer=buffer_key, ch=ch*2) + # Group the chunks using a particular IB pair into one large chunk reference + AddChunk(ib_chunks, buffer_key, c) + else: + # Within a node - direct send/copy the chunks over nvlink to the output buffer. 
+ # Use a different channel to ensure that we don't get in the way of sends/receives above + # which are on the critical path. + for g2 in range(gpus_per_node): + r2 = RankFromNodeGpuPair(n2, g2) + c = chunk(r1, Buffer.input, r2 * instances + ch) + c.send(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch*2) + + + + # IB Send and local scatters + for buffer_key, ib_chunk in ib_chunks.items(): + (n1, n2) = buffer_key + _, scatter_rank = CrossNodeGpus(n1, n2) + # IB send divided across multiple parallel channels + chunks = ib_chunk.split(ib_connections) + for ch, c in enumerate(chunks): + # Note: If we are only going to use 1 IB connection for each IB send + # alternate between channels 0 and 1 to utilize both IB links. + if ib_connections == 1: + ib_channel = c.rank % 2 + else: + ib_channel = ch + c = c.send(scatter_rank, buffer=buffer_key, ch=ib_channel) + # Local scatter + cs = c.split(gpus_per_node * gpus_per_node) + for i, c in enumerate(cs): + # Access the chunk's destination rank and index to route it to its final place + final_rank = c.get_dst_rank() + index = c.get_dst_index() + c.send(final_rank, buffer=Buffer.output, index=index, ch=ch*2 + 1) \ No newline at end of file diff --git a/tests/test_language.py b/tests/test_language.py index 027a39c..ffc852f 100644 --- a/tests/test_language.py +++ b/tests/test_language.py @@ -214,7 +214,7 @@ def test_illegal_tb_assignment(): chunk(0, Buffer.input, 1).send(2, Buffer.output, 0, sendtb=0, recvtb=1) XML() -def test_registered_alltoall(): +def test_registered_alltoall_yifan(): from sccl.programs.alltoall_a100_yifan import alltoall_hierarchical num_nodes = 4 @@ -226,6 +226,19 @@ def test_registered_alltoall(): alltoall_hierarchical(num_nodes, gpus_per_node) assert Check() +def test_registered_alltoall_8kp1(): + from sccl.programs.alltoall_a100_8kp1 import alltoall_three_step + + num_nodes = 9 + gpus_per_node = 8 + num_ranks = num_nodes * gpus_per_node + topology = fully_connected(num_ranks) + collective = AllToAll(num_ranks, 1, inplace=False) + with SCCLProgram("hierarchical_all_to_all", topology, collective, 1): + alltoall_three_step(num_nodes, gpus_per_node) + assert Check() + XML() + def test_registered_allreduce(): from sccl.programs.allreduce_a100_ring import allreduce_ring @@ -237,4 +250,4 @@ def test_registered_allreduce(): protocol="LL128", threadblock_policy=ThreadblockPolicy.manual): allreduce_ring(num_ranks, num_ranks) assert Check() - XML() \ No newline at end of file + XML() From e25f449a6ace11b82b0ffd6f496ac40728188a7f Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Wed, 9 Feb 2022 15:28:11 -0800 Subject: [PATCH 085/135] Update ranges for AllToAll (#15) --- sccl/autosynth/ndv4_plans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sccl/autosynth/ndv4_plans.py b/sccl/autosynth/ndv4_plans.py index e8063ba..caa715e 100644 --- a/sccl/autosynth/ndv4_plans.py +++ b/sccl/autosynth/ndv4_plans.py @@ -23,10 +23,10 @@ def ndv4_alltoall(prog, nodes): def ndv4_alltoall(prog, nodes): alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) - @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '128MB'), machines=lambda x: x == 64) + @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '32MB'), machines=lambda x: x == 64) def ndv4_alltoall(prog, nodes): alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) - @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('128MB', None), machines=lambda 
x: x == 64) + @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 64) def ndv4_alltoall(prog, nodes): alltoall_three_step(num_nodes=nodes, gpus_per_node=8) From ee340cefa030929eee8ac2c195c6d910f8f91012 Mon Sep 17 00:00:00 2001 From: Abhinav Jangda Date: Wed, 9 Feb 2022 23:47:05 +0000 Subject: [PATCH 086/135] dockerfile is ready --- dockerfiles/Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index 323b58c..1464325 100644 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -62,8 +62,8 @@ RUN conda install -c pytorch magma-cuda111 -y ENV CMAKE_PREFIX_PATH=/opt/conda -# Change NCCL to SCCL Runtime version 0.3.1 -RUN cd ${STAGE_DIR}/ && \ +# Change NCCL to SCCL Runtime +RUN cd ${STAGE_DIR} && \ git clone https://github.com/pytorch/pytorch.git && \ cd pytorch && \ git checkout tags/v1.9.0 -b v1.9.0_sccl && \ @@ -79,8 +79,8 @@ RUN cd ${STAGE_DIR}/ && \ cd ${STAGE_DIR} && \ rm -rf ${STAGE_DIR}/pytorch -# Install SCCL Synthesizer version 2.1.2 -RUN cd ${STAGE_DIR}// && \ +# Install SCCL +RUN cd ${STAGE_DIR}/ && \ git clone https://github.com/microsoft/sccl.git && \ cd sccl/ && python setup.py install && \ cd ${STAGE_DIR} && \ From 0ec0cd48e964b31d3c41a2eadeae300ac6282730 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Wed, 9 Feb 2022 16:18:35 -0800 Subject: [PATCH 087/135] Update setup.py --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7c76b1a..7df8aec 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,8 @@ 'z3-solver', 'argcomplete', 'lxml', - 'humanfriendly' + 'humanfriendly', + 'igraph' ], python_requires='>=3.6', ) From af5078c48bfd7d27540414254288d0dffe6af0d9 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 9 Feb 2022 17:24:00 -0800 Subject: [PATCH 088/135] Correct NCCL_ALGOS to NCCL_ALGO (#18) --- sccl/autosynth/__init__.py | 8 ++++---- tests/test_autosynth.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 8b9632e..e7b68fc 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -84,12 +84,12 @@ def init(machine_type, num_machines, *collectives): 'SCCL_CONFIG': path, 'NCCL_NET_SHARED_BUFFERS': '0', } - if 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] != '': - existing_algos = os.environ['NCCL_ALGOS'] + if 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] != '': + existing_algos = os.environ['NCCL_ALGO'] if 'SCCL' not in existing_algos.split(','): - os.environ['NCCL_ALGOS'] = 'SCCL,' + existing_algos + os.environ['NCCL_ALGO'] = 'SCCL,' + existing_algos else: - env['NCCL_ALGOS'] = 'SCCL,RING,TREE' + env['NCCL_ALGO'] = 'SCCL,RING,TREE' if machine_type == 'ndv4' and num_machines >= 16 and 'alltoall' in selected_plans: print(f'SCCL: Setting NCCL_IB_AR_THRESHOLD=0 (reason: alltoall and at least 16 ndv4 machines)') env['NCCL_IB_AR_THRESHOLD'] = '0' diff --git a/tests/test_autosynth.py b/tests/test_autosynth.py index 15e7d0a..bcd73ba 100644 --- a/tests/test_autosynth.py +++ b/tests/test_autosynth.py @@ -12,27 +12,27 @@ def test_sccl_init(capsys): out, err = capsys.readouterr() assert 'No plan found' in out assert not 'SCCL_CONFIG' in os.environ - assert 'NCCL_ALGOS' not in os.environ + assert 'NCCL_ALGO' not in os.environ sccl.init('ndv2', 2, ('alltoall', '1MB')) out, err = capsys.readouterr() assert 'synthesize_ndv2_relay_alltoall' in out 
assert 'SCCL_CONFIG' in os.environ - assert 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] == 'SCCL,RING,TREE' + assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'SCCL,RING,TREE' - os.environ['NCCL_ALGOS'] = 'RING,FAKE_SCCL' + os.environ['NCCL_ALGO'] = 'RING,FAKE_SCCL' sccl.init('ndv4', 8, (sccl.Collective.alltoall, '2MB')) out, err = capsys.readouterr() assert 'ndv4_alltoall' in out assert 'NCCL_IB_AR_THRESHOLD' not in os.environ - assert 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] == 'SCCL,RING,FAKE_SCCL' + assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'SCCL,RING,FAKE_SCCL' - os.environ['NCCL_ALGOS'] = 'HELLO,SCCL,WORLD' + os.environ['NCCL_ALGO'] = 'HELLO,SCCL,WORLD' sccl.init('ndv4', 16, (sccl.Collective.alltoall, '35MB')) out, err = capsys.readouterr() assert 'ndv4_alltoall' in out assert 'NCCL_IB_AR_THRESHOLD' in os.environ - assert 'NCCL_ALGOS' in os.environ and os.environ['NCCL_ALGOS'] == 'HELLO,SCCL,WORLD' + assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'HELLO,SCCL,WORLD' def test_register_plan(): From d7e591507fede8bf7d2a8942cc2cd5e3f55f5aaf Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Thu, 10 Feb 2022 09:55:05 -0800 Subject: [PATCH 089/135] Remove igraph --- requirements.txt | 1 - sccl/language/__init__.py | 19 ++++--- sccl/language/visualize.py | 103 ------------------------------------- setup.py | 1 - 4 files changed, 9 insertions(+), 115 deletions(-) delete mode 100644 sccl/language/visualize.py diff --git a/requirements.txt b/requirements.txt index 1d14d8c..ccbf125 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ z3-solver argcomplete lxml humanfriendly -igraph pytest pytest-cov pytest-xdist diff --git a/sccl/language/__init__.py b/sccl/language/__init__.py index 77588ea..7479563 100644 --- a/sccl/language/__init__.py +++ b/sccl/language/__init__.py @@ -10,7 +10,6 @@ from sccl.language.chunk import * from sccl.language.buffer import * from sccl.language.rank_dag import * -from sccl.language.visualize import * import sccl.collectives as collectives @@ -110,15 +109,15 @@ def lower(self): check_threadblock_ordering(self.rank_dag) return Program(self.name, self.collective.name, self.collective.inplace, self.protocol, gpu_prgms) - def print_chunk_dag(self): - visualize_chunk_dag(self.chunk_dag.chunk_paths) + # def print_chunk_dag(self): + # visualize_chunk_dag(self.chunk_dag.chunk_paths) - def print_rank_dags(self, rank): - if rank == -1: - for r in range(len(self.ranks)): - visualize_rank_dag(self.rank_dags[r].operations) - else: - visualize_rank_dag(self.rank_dags[rank].operations) + # def print_rank_dags(self, rank): + # if rank == -1: + # for r in range(len(self.ranks)): + # visualize_rank_dag(self.rank_dags[r].operations) + # else: + # visualize_rank_dag(self.rank_dags[rank].operations) def Print(): _curr().print_chunk_dag() @@ -375,4 +374,4 @@ def lower_rank_dag(self, rank_dag): for o in op.next: heapq.heappush(frontier, o) visited.add(op) - rank_dag.convert_set_list() # Pre-emptively convert sets to lists \ No newline at end of file + rank_dag.convert_set_list() # Pre-emptively convert sets to lists diff --git a/sccl/language/visualize.py b/sccl/language/visualize.py deleted file mode 100644 index 5ffca4e..0000000 --- a/sccl/language/visualize.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -import igraph as ig -from sccl.language.ir import * -from sccl.language.rank_dag import * - -def visualize_chunk_dag(chunk_paths): # pragma: no cover - frontier = [] - nnodes = 0 - vertex_label = [] - vertex_colors = [] - edges = [] - visited = set() - - def add_node(op, nnodes, vertex_label, vertex_colors): - if op.num == -1: - op.num = nnodes - nnodes += 1 - if op.inst == ChunkInstruction.start: - vertex_label.append(f'Start at {op.dst.rank}, {op.dst.index}.') - vertex_colors.append('yellow') - elif op.inst == ChunkInstruction.send: - vertex_label.append(f'Send to Rank {op.dst.rank} {op.dst.index}. {op.steps_to_end}, {op.steps_from_start}') - vertex_colors.append('blue') - elif op.inst == ChunkInstruction.reduce: - vertex_label.append(f'Reduce with {op.dst.rank} {op.dst.index}. {op.steps_to_end}, {op.steps_from_start}') - vertex_colors.append('green') - return nnodes - - for chunk, op in chunk_paths.items(): - if len(op.prev) == 0: - frontier.append(op) - - while len(frontier) > 0: - op = frontier[0] - if op in visited: - frontier = frontier[1:] - else: - nnodes = add_node(op, nnodes, vertex_label, vertex_colors) - for next_op in op.next: - nnodes = add_node(next_op, nnodes, vertex_label, vertex_colors) - edges.append([op.num, next_op.num]) - frontier = frontier[1:] + op.next - visited.add(op) - - g = ig.Graph(nnodes, edges, directed=True) - layout = g.layout(layout=ig.Graph.layout_grid) - ig.plot(g, vertex_label=vertex_label, vertex_color=vertex_colors, layout='auto') - -def visualize_rank_dag(operations): # pragma: no cover - frontier = [] - nnodes = 0 - vertex_label = [] - vertex_colors = [] - edges = [] - visited = set() - colors = ['red', 'green', 'blue', 'yellow', 'teal', 'pink', 'purple', 'orange'] - - def add_node(op, nnodes, vertex_label, vertex_colors): - if op.num == -1: - op.num = nnodes - nnodes += 1 - # Add new node to graph - if op.inst == Instruction.start: - vertex_label.append(f'Chunk {op.src.index} Rank {op.src.rank}') - elif op.inst == Instruction.send: - vertex_label.append(f'S to Rank {op.dst.rank}') - elif op.inst == Instruction.recv: - vertex_label.append(f'R from {op.src.rank}') - elif op.inst == Instruction.recv_reduce_copy: - vertex_label.append(f'RRC from {op.src.rank}') - else: - vertex_label.append(f'{op.inst}') - - # Add colors - if op.inst == Instruction.start: - vertex_colors.append('gray') - else: - vertex_colors.append(colors[op.tb % len(colors)]) - return nnodes - - for slot, op in operations.items(): - if len(op.prev) == 0: - frontier.append(op) - - while len(frontier) > 0: - op = frontier[0] - - if op in visited: - frontier = frontier[1:] - else: - nnodes = add_node(op, nnodes, vertex_label, vertex_colors) - - for next_op in op.next: - nnodes = add_node(next_op, nnodes, vertex_label, vertex_colors) - edges.append([op.num, next_op.num]) - frontier = frontier[1:] + list(op.next) - visited.add(op) - - g = ig.Graph(nnodes, edges, directed=True) - layout = g.layout(layout=ig.Graph.layout_grid) - ig.plot(g, vertex_label=vertex_label, vertex_color=vertex_colors, layout='rt') \ No newline at end of file diff --git a/setup.py b/setup.py index 7df8aec..e46ffef 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,6 @@ 'argcomplete', 'lxml', 'humanfriendly', - 'igraph' ], python_requires='>=3.6', ) From 909b49bda6e9713b3439fa8988e0cd86a4ec6126 Mon Sep 17 00:00:00 2001 From: Saeed Maleki <30272783+saeedmaleki@users.noreply.github.com> Date: Thu, 10 Feb 2022 15:22:12 -0800 Subject: [PATCH 090/135] NET_SHARED_BUFFER=0 is no longer needed --- 
 sccl/autosynth/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py
index e7b68fc..cbaaac0 100644
--- a/sccl/autosynth/__init__.py
+++ b/sccl/autosynth/__init__.py
@@ -82,7 +82,6 @@ def init(machine_type, num_machines, *collectives):
     # Set environment variables
     env = {
         'SCCL_CONFIG': path,
-        'NCCL_NET_SHARED_BUFFERS': '0',
     }
     if 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] != '':
         existing_algos = os.environ['NCCL_ALGO']

From 3ec5e3dd7c62addd38150daeb040b6b71966e439 Mon Sep 17 00:00:00 2001
From: Meghan Cowan
Date: Thu, 10 Feb 2022 17:24:42 -0800
Subject: [PATCH 091/135] Tb fix (#21)

Fix for a2a tb assignment
---
 sccl/language/passes.py        | 13 +++++++++++--
 sccl/language/tb_assignment.py | 16 ++++++----------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/sccl/language/passes.py b/sccl/language/passes.py
index be3eda3..eeda2c9 100644
--- a/sccl/language/passes.py
+++ b/sccl/language/passes.py
@@ -42,5 +42,14 @@ def check_threadblock_ordering(rank_dag):
                 other_tbid = match.tb
                 if other_tbid in prev_steps:
-                    assert match.step > prev_steps[other_tbid], f"Rank {self.rank} sends op1 then op2 but {match.rank} receives op2 then op1"
-                prev_steps[other_tbid] = match.step
+                    if match.step <= prev_steps[other_tbid].step:
+                        print("Offending Steps", match.step, prev_steps[other_tbid].step)
+                        print("Sending tb")
+                        for op in tb.ops:
+                            print(f'{op.step}: {op} priority:{(op.chunk_step, op.priority)}')
+                        print("Receiving tb")
+                        for op in rank_dag.tbs[match.rank][other_tbid].ops:
+                            print(f'{op.step}: {op} priority:{(op.chunk_step, op.priority)}')
+                    assert match.step > prev_steps[other_tbid].step, f"Rank {op.rank} sends op1 then op2 but {match.rank} receives op2 then op1"
+
+                prev_steps[other_tbid] = match

diff --git a/sccl/language/tb_assignment.py b/sccl/language/tb_assignment.py
index 497633c..201f96c 100644
--- a/sccl/language/tb_assignment.py
+++ b/sccl/language/tb_assignment.py
@@ -137,16 +137,11 @@ def auto_assign_tbs(rank_dag):
         if op.inst == Instruction.start:
             for o in op.next:
                 if o.inst == Instruction.send or o.inst == Instruction.copy:
-                    heapq.heappush(ops, o)
-    heapq.heapify(ops)
-
-    for o in ops:
-        if o.inst == Instruction.recv:
-            print(o)
+                    heapq.heappush(ops, ((o.chunk_step, o.priority, o.dst.index), o))

     visited = set()
     while len(ops) > 0:
-        op = heapq.heappop(ops)
+        _, op = heapq.heappop(ops)
         if op not in visited:
             visited.add(op)
             rank = op.rank
@@ -179,7 +174,8 @@ def auto_assign_tbs(rank_dag):
             for match in op.match:
                 match.channel = tb.channel

-            for o in op.next:
-                heapq.heappush(ops, o)
             for o in op.match:
-                heapq.heappush(ops, o)
\ No newline at end of file
+                heapq.heappush(ops, ((o.chunk_step, o.priority, o.dst.index), o))
+            for o in op.next:
+                heapq.heappush(ops, ((o.chunk_step, o.priority, o.dst.index), o))
+
\ No newline at end of file

From 11eacf47bf76efefb23639492978fa8bd0f75c20 Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Fri, 11 Feb 2022 19:47:21 +0000
Subject: [PATCH 092/135] adding Relaxed ordering flag
---
 sccl/autosynth/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py
index cbaaac0..6041af4 100644
--- a/sccl/autosynth/__init__.py
+++ b/sccl/autosynth/__init__.py
@@ -92,6 +92,9 @@ def init(machine_type, num_machines, *collectives):
     if machine_type == 'ndv4' and num_machines >= 16 and 'alltoall' in selected_plans:
         print(f'SCCL: Setting NCCL_IB_AR_THRESHOLD=0 (reason: alltoall and at least 16 ndv4 machines)')
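+        # Per NCCL's documentation, NCCL_IB_AR_THRESHOLD is the message size above which
+        # InfiniBand traffic may be sent on adaptive-routing queue pairs; setting it to 0
+        # opts all alltoall traffic into adaptive routing.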
         env['NCCL_IB_AR_THRESHOLD'] = '0'
+    if machine_type == 'ndv4':
+        print(f'SCCL: Setting NCCL_IB_PCI_RELAXED_ORDERING=1 (reason: it is necessary for ndv4 to have a relaxed ordering for PCIe)')
+        env['NCCL_IB_PCI_RELAXED_ORDERING'] = '1'
     os.environ.update(env)
 else:
     print(f'SCCL: No algorithms were selected.')

From 99ae14f3f6044c49fe69c4f18fc476dc1f3f9439 Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Fri, 11 Feb 2022 19:49:50 +0000
Subject: [PATCH 093/135] adding Relaxed ordering flag
---
 sccl/autosynth/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py
index 6041af4..a6779c8 100644
--- a/sccl/autosynth/__init__.py
+++ b/sccl/autosynth/__init__.py
@@ -89,7 +89,7 @@ def init(machine_type, num_machines, *collectives):
             os.environ['NCCL_ALGO'] = 'SCCL,' + existing_algos
     else:
         env['NCCL_ALGO'] = 'SCCL,RING,TREE'
-    if machine_type == 'ndv4' and num_machines >= 16 and 'alltoall' in selected_plans:
+    if machine_type == 'ndv4' and num_machines >= 8 and 'alltoall' in selected_plans:
         print(f'SCCL: Setting NCCL_IB_AR_THRESHOLD=0 (reason: alltoall and at least 16 ndv4 machines)')
         env['NCCL_IB_AR_THRESHOLD'] = '0'
     if machine_type == 'ndv4':

From a060c38f7ae69311a267b36ccad596d286a2cc7f Mon Sep 17 00:00:00 2001
From: Olli Saarikivi
Date: Fri, 11 Feb 2022 12:01:16 -0800
Subject: [PATCH 094/135] Fix tests for changed NCCL_IB_AR_THRESHOLD logic
---
 tests/test_autosynth.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_autosynth.py b/tests/test_autosynth.py
index bcd73ba..8eeb2d5 100644
--- a/tests/test_autosynth.py
+++ b/tests/test_autosynth.py
@@ -18,20 +18,20 @@ def test_sccl_init(capsys):
     out, err = capsys.readouterr()
     assert 'synthesize_ndv2_relay_alltoall' in out
     assert 'SCCL_CONFIG' in os.environ
+    assert 'NCCL_IB_AR_THRESHOLD' not in os.environ
     assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'SCCL,RING,TREE'

     os.environ['NCCL_ALGO'] = 'RING,FAKE_SCCL'
     sccl.init('ndv4', 8, (sccl.Collective.alltoall, '2MB'))
     out, err = capsys.readouterr()
     assert 'ndv4_alltoall' in out
-    assert 'NCCL_IB_AR_THRESHOLD' not in os.environ
+    assert 'NCCL_IB_AR_THRESHOLD' in os.environ
     assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'SCCL,RING,FAKE_SCCL'

     os.environ['NCCL_ALGO'] = 'HELLO,SCCL,WORLD'
     sccl.init('ndv4', 16, (sccl.Collective.alltoall, '35MB'))
     out, err = capsys.readouterr()
     assert 'ndv4_alltoall' in out
-    assert 'NCCL_IB_AR_THRESHOLD' in os.environ
     assert 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] == 'HELLO,SCCL,WORLD'

From 54e20f6b8eddab06d9f54d4c6ec7139c09d26cb6 Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Sat, 12 Feb 2022 03:18:51 +0000
Subject: [PATCH 095/135] more necessary flags for A100s
---
 sccl/autosynth/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py
index a6779c8..0600125 100644
--- a/sccl/autosynth/__init__.py
+++ b/sccl/autosynth/__init__.py
@@ -93,8 +93,10 @@ def init(machine_type, num_machines, *collectives):
         print(f'SCCL: Setting NCCL_IB_AR_THRESHOLD=0 (reason: alltoall and at least 16 ndv4 machines)')
         env['NCCL_IB_AR_THRESHOLD'] = '0'
     if machine_type == 'ndv4':
-        print(f'SCCL: Setting NCCL_IB_PCI_RELAXED_ORDERING=1 (reason: it is necessary for ndv4 to have a relaxed ordering for PCIe)')
+        print(f'SCCL: Setting relaxed ordering, topo file and visible devices order')
         env['NCCL_IB_PCI_RELAXED_ORDERING'] = '1'
+        env['NCCL_TOPO_FILE'] =
'/opt/msft/topo.xml' + env['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' os.environ.update(env) else: print(f'SCCL: No algorithms were selected.') From 37f314dfb51c30b77c75fe76bd3ed3611e96aea4 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Mon, 14 Feb 2022 14:02:14 -0800 Subject: [PATCH 096/135] Balance alltoall channels over infiniband (#22) --- examples/scclang/alltoall_a100_yifan.py | 2 +- sccl/language/__init__.py | 15 ++++++++++----- sccl/programs/alltoall_a100_yifan.py | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/scclang/alltoall_a100_yifan.py b/examples/scclang/alltoall_a100_yifan.py index c4dbf77..d6793c4 100644 --- a/examples/scclang/alltoall_a100_yifan.py +++ b/examples/scclang/alltoall_a100_yifan.py @@ -35,7 +35,7 @@ def alltoall_hierarchical(num_nodes, gpus_per_node, protocol): rank = n1 * gpus_per_node + g1 ib_peer = n2 * gpus_per_node + g1 c = chunk(rank, f'send_{n2}', 0, 8) - c = c.send(ib_peer, Buffer.output, c.get_dst_index(), ch=(n2 % 8)*2+(rank%2)+2) + c = c.send(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % 8)*2+(rank%2)+2) # Handle local chunks within a node diff --git a/sccl/language/__init__.py b/sccl/language/__init__.py index 7479563..2183501 100644 --- a/sccl/language/__init__.py +++ b/sccl/language/__init__.py @@ -22,7 +22,8 @@ def _curr(): class SCCLProgram: def __init__(self, name, topo, collective, instances, protocol='Simple', \ - threadblock_policy=ThreadblockPolicy.auto, interleaved_replication=True): + threadblock_policy=ThreadblockPolicy.auto, interleaved_replication=True, + check_xml=True): self.name = name self.topo = topo self.collective = collective @@ -31,6 +32,7 @@ def __init__(self, name, topo, collective, instances, protocol='Simple', \ self.protocol = protocol self.threadblock_policy = threadblock_policy self.interleaved_replication = interleaved_replication + self.check_xml = check_xml assert protocol == 'Simple' or protocol == 'LL' or protocol == 'LL128', \ f'Given protocol: {protocol}. 
Must be either Simple, LL, LL128' self.run_opt = True # Runs optimization passes @@ -105,8 +107,11 @@ def lower(self): auto_assign_tbs(self.rank_dag) self.rank_dag.lower_pt1(self.instances) gpu_prgms = self.rank_dag.lower_pt2(self.instances, self.interleaved_replication) - check_dependency_cycles(self.rank_dag.tbs) - check_threadblock_ordering(self.rank_dag) + if self.check_xml: + # Check generated SCCL-EF for correctness - no circular dependencies, sends and receives are ordered + # For very large programs, turn off check_xml when shipping + check_dependency_cycles(self.rank_dag.tbs) + check_threadblock_ordering(self.rank_dag) return Program(self.name, self.collective.name, self.collective.inplace, self.protocol, gpu_prgms) # def print_chunk_dag(self): @@ -119,8 +124,8 @@ def lower(self): # else: # visualize_rank_dag(self.rank_dags[rank].operations) -def Print(): - _curr().print_chunk_dag() +# def Print(): +# _curr().print_chunk_dag() def chunk(rank, buffer, index, size=1): return _curr().get_ref(rank, buffer, index, size) diff --git a/sccl/programs/alltoall_a100_yifan.py b/sccl/programs/alltoall_a100_yifan.py index 730c560..1a79cac 100644 --- a/sccl/programs/alltoall_a100_yifan.py +++ b/sccl/programs/alltoall_a100_yifan.py @@ -31,7 +31,7 @@ def alltoall_hierarchical(num_nodes, gpus_per_node): rank = n1 * gpus_per_node + g1 ib_peer = n2 * gpus_per_node + g1 c = chunk(rank, f'send_{n2}', 0, 8) - c = c.send(ib_peer, Buffer.output, c.get_dst_index(), ch=(n2 % 8)*2+(rank%2)+2) + c = c.send(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % 8)*2+(rank%2)+2) # Handle local chunks within a node From 30798dd7fe03eb1ced1f6c38c21b19afe1e17bea Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 15 Feb 2022 18:34:05 -0800 Subject: [PATCH 097/135] Add printing available plans (#24) Available through sccl.print_plans() and sccl plans list in the CLI --- requirements.txt | 1 + sccl/__init__.py | 2 +- sccl/__main__.py | 1 + sccl/autosynth/__init__.py | 46 ++++++++++++++++++++++++++++++++++++++ sccl/cli/__init__.py | 1 + sccl/cli/plans.py | 23 +++++++++++++++++++ setup.py | 1 + 7 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 sccl/cli/plans.py diff --git a/requirements.txt b/requirements.txt index ccbf125..9146468 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ z3-solver argcomplete lxml humanfriendly +tabulate pytest pytest-cov pytest-xdist diff --git a/sccl/__init__.py b/sccl/__init__.py index 745d1ef..2e640be 100644 --- a/sccl/__init__.py +++ b/sccl/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-from sccl.autosynth import init +from sccl.autosynth import init, tabulate_plans, print_plans from sccl.autosynth import ndv2_perm from sccl.autosynth import Collective diff --git a/sccl/__main__.py b/sccl/__main__.py index f97f616..34e2197 100644 --- a/sccl/__main__.py +++ b/sccl/__main__.py @@ -24,6 +24,7 @@ def main(): handlers.append(make_distributors(cmd_parsers)) handlers.append(make_analyses(cmd_parsers)) handlers.append(make_handle_ncclize(cmd_parsers)) + handlers.append(make_plans(cmd_parsers)) argcomplete.autocomplete(parser) args = parser.parse_args() diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py index 0600125..5359ca6 100644 --- a/sccl/autosynth/__init__.py +++ b/sccl/autosynth/__init__.py @@ -12,6 +12,7 @@ import math import tempfile import humanfriendly +from tabulate import tabulate from enum import Enum from sccl.autosynth.ndv2_plans import register_ndv2_plans @@ -221,3 +222,48 @@ def _select_isomorphism(isomorphisms, verbose=True): # pragma: no cover f'expected an isomorphism to match our expectation but none of them did!') finally: fcntl.lockf(f, fcntl.LOCK_UN) + + +_max_described_machines = 2048 +def _describe_machines(machines): + ranges = [] + lower = None + for i in range(_max_described_machines): + if machines(i): + if lower is None: + lower = i + else: + if lower is not None: + if lower == i-1: + ranges.append(str(i-1)) + else: + ranges.append(f'{lower}-{i-1}') + lower = None + if lower is not None: + ranges.append(f'>={lower}') + if len(ranges) > 0: + return ','.join(ranges) + else: + return '???' + + +def _list_plan_parameters(): + headers = ['Machine', 'Collective', '# machines', 'From', 'To', 'Protocol', 'Priority', 'Plan name'] + rows = [] + for key, plans in synthesis_plans.items(): + collective, machine_type = key + for name, function, machines, (low, high), protocol, priority in plans: + # First tuple is the key to sort by, second is the actual columns + rows.append(((machine_type,collective,low,high,protocol,priority,name), + (machine_type, collective, _describe_machines(machines), _format_size(low), _format_size(high), protocol, priority, name))) + rows = [columns for _, columns in sorted(rows, key=lambda x: x[0])] + return headers, rows + + +def tabulate_plans(): + headers, rows = _list_plan_parameters() + return tabulate(rows, headers=headers, tablefmt='github') + + +def print_plans(): + print(tabulate_plans()) \ No newline at end of file diff --git a/sccl/cli/__init__.py b/sccl/cli/__init__.py index db5cbfc..734b440 100644 --- a/sccl/cli/__init__.py +++ b/sccl/cli/__init__.py @@ -5,3 +5,4 @@ from .distribute import * from .analyze import * from .ncclize import * +from .plans import * diff --git a/sccl/cli/plans.py b/sccl/cli/plans.py new file mode 100644 index 0000000..63732aa --- /dev/null +++ b/sccl/cli/plans.py @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
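+#
+# Implements the `sccl plans` CLI category; its single `list` subcommand prints
+# the built-in plan registry via print_plans().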
+ +from .common import * +from sccl.autosynth import * + +def make_plans(cmd_parsers): + handler_funcs = [] + handler_funcs.append(make_handle_list) + + return make_cmd_category(cmd_parsers, 'plans', 'subcommand', handler_funcs) + +def make_handle_list(cmd_parsers): + cmd = cmd_parsers.add_parser('list') + + def handle(args, command): + if command != 'list': + return False + + print_plans() + return True + + return handle diff --git a/setup.py b/setup.py index e46ffef..33dcbfc 100644 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ 'argcomplete', 'lxml', 'humanfriendly', + 'tabulate', ], python_requires='>=3.6', ) From 0e0c0f559368378847694542c735e93ee6532dbd Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 1 Apr 2022 14:24:57 -0700 Subject: [PATCH 098/135] Add notebook for DGX-1 Allgather solving --- examples/dgx1_allgather.ipynb | 421 ++++++++++++++++++++++++++++++++++ 1 file changed, 421 insertions(+) create mode 100644 examples/dgx1_allgather.ipynb diff --git a/examples/dgx1_allgather.ipynb b/examples/dgx1_allgather.ipynb new file mode 100644 index 0000000..5a23691 --- /dev/null +++ b/examples/dgx1_allgather.ipynb @@ -0,0 +1,421 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Topologies are modeled in terms of the relative bandwidths of the links. In this version of the problem, we assume that all per-send latencies are uniform, which is mostly true over NVLinks." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0, 2, 1, 1, 2, 0, 0, 0],\n", + " [2, 0, 1, 2, 0, 1, 0, 0],\n", + " [1, 1, 0, 2, 0, 0, 2, 0],\n", + " [1, 2, 2, 0, 0, 0, 0, 1],\n", + " [2, 0, 0, 0, 0, 2, 1, 1],\n", + " [0, 1, 0, 0, 2, 0, 1, 2],\n", + " [0, 0, 2, 0, 1, 1, 0, 2],\n", + " [0, 0, 0, 1, 1, 2, 2, 0]]\n" + ] + } + ], + "source": [ + "from sccl.topologies import dgx1\n", + "from pprint import pprint\n", + "topology = dgx1()\n", + "pprint(topology.links)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The collective is the specification for where chunks start at and where they need to go. Here we instantiate allgather for this topology." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from sccl.collectives import allgather\n", + "collective = allgather(topology.num_nodes())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the precondition. We can see that all ranks start with one chunk." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 1, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 1, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 1, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 1, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 1, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 1, 0],\n", + " [0, 0, 0, 0, 0, 0, 0, 1]]\n" + ] + } + ], + "source": [ + "pprint([[1 if collective.precondition(rank, chunk) else 0 for chunk in range(collective.num_chunks)] for rank in range(collective.num_nodes)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the postcondition. All ranks need to get all chunks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1, 1, 1, 1, 1, 1, 1, 1],\n", + " [1, 1, 1, 1, 1, 1, 1, 1],\n", + " [1, 1, 1, 1, 1, 1, 1, 1],\n", + " [1, 1, 1, 1, 1, 1, 1, 1],\n", + " [1, 1, 1, 1, 1, 1, 1, 1],\n", + " [1, 1, 1, 1, 1, 1, 1, 1],\n", + " [1, 1, 1, 1, 1, 1, 1, 1],\n", + " [1, 1, 1, 1, 1, 1, 1, 1]]\n" + ] + } + ], + "source": [ + "pprint([[1 if collective.postcondition(rank, chunk) else 0 for chunk in range(collective.num_chunks)] for rank in range(collective.num_nodes)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lets try to actually solve this for a specific number of steps. `sccl.strategies` offers entry points into the solver. We'll use one that just does a single solver call for now. The encoding itself lives in [path_encoding.py](../sccl/path_encoding.py). As expected, 1 step is not enough, because some ranks aren't directly connected." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Solving instance steps=1... unsatisfiable. (0.2s)\n" + ] + } + ], + "source": [ + "from sccl.strategies import solve_instance\n", + "from sccl.instance import Instance\n", + "algo = solve_instance(topology, collective, Instance(steps=1), logging=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But 2 steps is." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Solving instance steps=2... synthesized! (0.3s)\n" + ] + } + ], + "source": [ + "algo = solve_instance(topology, collective, Instance(steps=2), logging=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The algorithm is composed of the sends to perform in each global step in `(chunk, source, destination)` form. The `rounds` is how many multiples of the topology's available bandwidth is needed for that step." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Step(rounds=1, sends=[(0, 0, 1), (0, 0, 3), (0, 0, 4), (1, 1, 2), (1, 1, 3), (1, 1, 5), (2, 2, 1), (2, 2, 3), (2, 2, 6), (3, 3, 1), (3, 3, 2), (3, 3, 7), (4, 4, 0), (4, 4, 6), (4, 4, 7), (5, 5, 1), (5, 5, 7), (6, 6, 4), (6, 6, 5), (6, 6, 7), (7, 7, 3), (7, 7, 4)]),\n", + " Step(rounds=1, sends=[(0, 3, 2), (0, 4, 5), (0, 4, 6), (0, 4, 7), (1, 2, 0), (1, 2, 6), (1, 3, 7), (1, 5, 4), (2, 1, 0), (2, 6, 4), (2, 6, 5), (2, 6, 7), (3, 1, 5), (3, 2, 6), (3, 3, 0), (3, 7, 4), (4, 0, 1), (4, 0, 2), (4, 0, 3), (4, 7, 5), (5, 1, 0), (5, 1, 2), (5, 1, 3), (5, 5, 4), (5, 5, 6), (6, 4, 0), (6, 5, 1), (6, 6, 2), (6, 7, 3), (7, 3, 1), (7, 3, 2), (7, 4, 0), (7, 7, 5), (7, 7, 6)])]\n" + ] + } + ], + "source": [ + "pprint(algo.steps)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Neither of those instances considered dividing the chunks into smaller ones for more fine grained routing. That can be achieved by passing `chunks=N` to the `Instance`. The bandwidths in the topology are stated relative to the chunk size, so when the chunks parameter goes up, more steps may be needed. 
For example:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Solving instance steps=2,chunks=2... unsatisfiable. (0.4s)\n", + "Solving instance steps=3,chunks=2... synthesized! (0.7s)\n" + ] + } + ], + "source": [ + "algo = solve_instance(topology, collective, Instance(steps=2, chunks=2), logging=True)\n", + "algo = solve_instance(topology, collective, Instance(steps=3, chunks=2), logging=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, it turns out that 2 steps is enough *if we allow one step to take double the time*. The solver can be give these \"extra rounds\" of bandwidth to allocate to the steps with an `extra_rounds` parameter:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Solving instance steps=2,rounds=3,chunks=2... synthesized! (0.7s)\n" + ] + } + ], + "source": [ + "algo = solve_instance(topology, collective, Instance(steps=2, chunks=2, extra_rounds=1), logging=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In an alpha+beta cost model, `steps` is essentially how many times the alpha cost is paid, while the multiple for beta is `size*rounds/chunks`, where `size` is the size of the input. We've automated searching over different tradeoffs between steps, rounds and chunks in a `solve_all_latency_bandwidth_tradeoffs` strategy:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Algorithms need at least 2 steps.\n", + "Algorithms need at least 7/6 rounds per chunk.\n", + "Solving instance steps=2... synthesized! (0.3s)\n", + "Solving instance steps=2,rounds=3,chunks=2... synthesized! (0.6s)\n", + "Solving instance steps=2,rounds=4,chunks=3... unsatisfiable. (0.8s)\n", + "Solving instance steps=3,rounds=4,chunks=3... synthesized! (1.5s)\n", + "Solving instance steps=2,rounds=5,chunks=4... unsatisfiable. (1.3s)\n", + "Solving instance steps=3,rounds=5,chunks=4... synthesized! (6.8s)\n", + "Solving instance steps=2,rounds=6,chunks=5... unsatisfiable. (1.8s)\n", + "Solving instance steps=3,rounds=6,chunks=5... synthesized! (13.1s)\n", + "Solving instance steps=2,rounds=7,chunks=6... unsatisfiable. (2.9s)\n", + "Solving instance steps=3,rounds=7,chunks=6... synthesized! (124.1s)\n", + "Bandwidth optimal algorithm found!\n" + ] + } + ], + "source": [ + "from sccl.strategies import solve_all_latency_bandwidth_tradeoffs\n", + "algos = list(solve_all_latency_bandwidth_tradeoffs(topology, collective, logging=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Two preprocessing steps are performed:\n", + "- The minimum number of steps required is lower bound based on the maximum of the shortest paths for each chunk considering the topology.\n", + "- A minimum number of rounds per chunk is lower bound using a kind of multi-commodity flow encoding in [rounds_bound.py](../sccl/rounds_bound.py).\n", + "\n", + "Then all relevant trade-offs are iterated until a bandwidth optimal algorithm is found (if the rounds per chunk lower bound happens to be exact).\n", + "\n", + "The synthesized algorithms contain many non-Pareto-optimal algorithms, which are dominated by some other algorithm for all input sizes. 
We can filter those out:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from sccl.strategies import prune_pareto_optimal\n", + "algos = prune_pareto_optimal(algos)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lets set up a function to analyze the performance of the remaining algorithms. Here we assume that alpha=1 and beta=1." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from fractions import Fraction\n", + "def print_perf(size):\n", + " print(f'Input size is {size}')\n", + " for algo in algos:\n", + " print(f'\\n{algo.name}')\n", + " chunk_size = Fraction(1, algo.instance.chunks)\n", + " print(f'Chunk size: 1/chunks = {chunk_size} = {float(chunk_size)}')\n", + " bw_mult = algo.instance.rounds() * chunk_size\n", + " print(f'BW multiples: rounds/chunks = {bw_mult} = {float(bw_mult)}')\n", + " time = algo.instance.steps + size * bw_mult\n", + " print(f'Time: steps + size*rounds/chunks = {time}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When the input size is large, the second algorithm is better:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input size is 10.0\n", + "\n", + "Allgather(n=8)-DGX1-steps=2,rounds=3,chunks=2\n", + "Chunk size: 1/chunks = 1/2 = 0.5\n", + "BW multiples: rounds/chunks = 3/2 = 1.5\n", + "Time: steps + size*rounds/chunks = 17.0\n", + "\n", + "Allgather(n=8)-DGX1-steps=3,rounds=7,chunks=6\n", + "Chunk size: 1/chunks = 1/6 = 0.16666666666666666\n", + "BW multiples: rounds/chunks = 7/6 = 1.1666666666666667\n", + "Time: steps + size*rounds/chunks = 14.666666666666668\n" + ] + } + ], + "source": [ + "print_perf(10.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For small inputs the first one is:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input size is 0.1\n", + "\n", + "Allgather(n=8)-DGX1-steps=2,rounds=3,chunks=2\n", + "Chunk size: 1/chunks = 1/2 = 0.5\n", + "BW multiples: rounds/chunks = 3/2 = 1.5\n", + "Time: steps + size*rounds/chunks = 2.15\n", + "\n", + "Allgather(n=8)-DGX1-steps=3,rounds=7,chunks=6\n", + "Chunk size: 1/chunks = 1/6 = 0.16666666666666666\n", + "BW multiples: rounds/chunks = 7/6 = 1.1666666666666667\n", + "Time: steps + size*rounds/chunks = 3.1166666666666667\n" + ] + } + ], + "source": [ + "print_perf(0.1)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + }, + "kernelspec": { + "display_name": "Python 3.6.9 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From e3584185f316fd51e10f2c8a6c09dd33aa38c8c5 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 8 Apr 2022 11:14:55 -0700 Subject: [PATCH 099/135] Change capture_output to stdout/stderr This fixes sccl_ndv2_launcher.sh for Python 3.6, which didn't have that parameter yet. 
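For reference, a minimal sketch of the equivalence (the command below is an arbitrary stand-in, not the launcher's actual invocation):

```python
import subprocess

# Python 3.7+ shorthand:
#   result = subprocess.run(['uname', '-a'], capture_output=True)

# Python 3.6-compatible spelling, as used in this change:
result = subprocess.run(['uname', '-a'],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print(result.stdout.decode())
```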
---
 sccl/autosynth/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py
index 5359ca6..f36bf45 100644
--- a/sccl/autosynth/__init__.py
+++ b/sccl/autosynth/__init__.py
@@ -197,7 +197,7 @@ def _select_isomorphism(isomorphisms, verbose=True): # pragma: no cover
         print(
             'SCCL: Running inspector-topo to find the IB placement. This will take a couple of minutes...')
         topo_detect = subprocess.run(
-            ['/usr/local/bin/inspector-topo'], capture_output=True, env={"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"})
+            ['/usr/local/bin/inspector-topo'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env={"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"})
         print('SCCL: Finished running inspector-topo. Finding the permutation.')
         if topo_detect.returncode != 0:
             raise RuntimeError(

From a10af333a5ab9ecc30e7cea6a0b6160e661f4844 Mon Sep 17 00:00:00 2001
From: Olli Saarikivi
Date: Fri, 8 Apr 2022 12:57:44 -0700
Subject: [PATCH 100/135] Avoid overwriting env when calling inspector-topo
---
 sccl/autosynth/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py
index f36bf45..9952e64 100644
--- a/sccl/autosynth/__init__.py
+++ b/sccl/autosynth/__init__.py
@@ -196,8 +196,10 @@ def _select_isomorphism(isomorphisms, verbose=True): # pragma: no cover
     else:
         print(
             'SCCL: Running inspector-topo to find the IB placement. This will take a couple of minutes...')
+        env = os.environ.copy()
+        env['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7'
         topo_detect = subprocess.run(
-            ['/usr/local/bin/inspector-topo'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env={"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"})
+            ['/usr/local/bin/inspector-topo'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
         print('SCCL: Finished running inspector-topo. Finding the permutation.')
         if topo_detect.returncode != 0:
             raise RuntimeError(

From ce1cd1159b1a4a26708c6fb466a1f8a22bd8c98d Mon Sep 17 00:00:00 2001
From: Olli Saarikivi
Date: Fri, 22 Apr 2022 14:50:55 -0700
Subject: [PATCH 101/135] Update readme and cleanup algorithm registrations
---
 README.md                    | 110 ++++++++++++++++++++++-------------
 sccl/autosynth/ndv4_plans.py |  12 ++--
 2 files changed, 74 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index 5fd0874..bbebee3 100644
--- a/README.md
+++ b/README.md
@@ -1,27 +1,89 @@
 # SCCL
-SCCL is a programmable GPU communication library that offers synthesis tools and a programming language, SCCLang, for
-building collective algorithms tailored to a particular hardware and workload.
+SCCL is a tool stack for programmable communication on GPUs. Algorithms created with SCCL can:
+- Implement either MPI-style collectives like Allreduce, or any application specific communication pattern.
+- Target specific hardware and interconnect topologies, unlocking their full potential.
+- Optimize for the data sizes in your application, making the best tradeoff between latency and bandwidth utilization.
+
+SCCL ships with algorithms targeting various Azure multi-GPU VM types. See the [Available Algorithms section](#available-algorithms) to find out what is currently available.
+
+SCCL has two ways of creating new algorithms:
+1. MSCCLang, a high-level DSL that talks about communication in an intuitive chunk-oriented form. See the [MSCCLang
+section](#mscclang) for how to get started.
+2. Synthesis, which automatically solves optimal algorithms for a given hardware topology. Making synthesis general
Making synthesis general +enough for common use cases is an on-going research project See [the synthesis readme](SYNTHESIS.md) for an +introduction. + +## Usage + +The SCCL Python package ships with a registry of synthesis strategies and hand optimized algorithms. These can be loaded +into [the runtime](https://github.com/parasailteam/msccl) through the `sccl.init` function, which must be called before +the application creates its NCCL communicator. For PyTorch this means before `torch.distributed` is initialized. + +The following snippet requests `sccl.init` to provide an Alltoall algorithm in a configuration of 2 Azure NDv2 machines: +``` +import sccl +sccl.init('ndv2', 2, (sccl.Collective.alltoall, ('1MB'))) +``` +This will find an algorithm provider that can create an Alltoall algorithm that is expected to be good with 1MB of data. +That will call a synthesis routine that writes the algorithm to disk. `sccl.init` will then pass a configuration file +pointing to this algorithm to the runtime through environment variables. + +See [the examples](examples/sccl_init.py) for more on `sccl.init` usage. + +## Available Algorithms + +To list the algorithms currently in SCCL's built-in registry, run `sccl plans list` on the command line. This will print out the following table (on 4/22/2022): + +| Machine | Collective | # machines | From | To | Protocol | Priority | Plan name | +|-----------|--------------|--------------|--------|----------|------------|------------|-------------------------------------| +| ndv2 | alltoall | >=2 | 1 MB | infinity | Simple | 0 | call synthesize_ndv2_relay_alltoall | +| ndv4 | allreduce | 1 | 256 KB | 20 MB | LL128 | 0 | run ndv4_ring_allreduce | +| ndv4 | alltoall | 8,16,32,64 | 1 MB | 32 MB | LL128 | 0 | run ndv4_alltoall_hierarchical | +| ndv4 | alltoall | 8,16,32 | 32 MB | infinity | Simple | 0 | run ndv4_alltoall_hierarchical | +| ndv4 | alltoall | 64 | 32 MB | infinity | Simple | 0 | run ndv4_alltoall_three_step | + +Each line lists an algorithm registration and the conditions under which it is triggered. For example, the +`ndv4_alltoall_hierarchical` algorithm will be used with NCCL's lower latency LL128 protocol when: +- the user has called Alltoall, +- there are 1, 2, 4 or 8 Azure NDv4 machines, and +- the data size is from 1 MB to 32 MB. + +The repository [parasailteam/sccl-presynth](https://github.com/parasailteam/sccl-presynth) repository offers additional algorithms that have been +pre-synthesized for fixed configurations. To enable them install the package and import it before the call to +`sccl.init`. + +## MSCCLang + +MSCCLang is a high-level language for specifying collective communication algorithms in an intuitive chunk-oriented form. The language is available as a Python-integrated DSL. + +The language is still under development and lacks comprehensive documentation. For now, please refer to our [the pre-print of our upcoming paper](https://arxiv.org/pdf/2201.11840.pdf) and the examples in [examples/scclang](examples/scclang/). + +## Synthesis + +SCCL started out as a synthesizer for collective algorithms, and general synthesis of collective algorithms is an +on-going research project. See [this readme](SYNTHESIS.md) for using SCCL as a synthesizer. ## Installation -### Python package and tool +### Python Package Installation To install either clone this repo and run "`pip install .`" or run: ``` pip install git+https://github.com/microsoft/sccl.git ``` -This installs the Python package and the `sccl` command line tool. 
-To enable Bash completion for the `sccl` tool: +Installing the SCCL Python package also installs the `sccl` command line tool. To enable Bash completion for the `sccl` +tool: ``` echo 'eval "$(register-python-argcomplete sccl)"' >> ~/.bashrc ``` -### Runtime +### Runtime Installation -SCCL's algorithms run in [a modified version of NCCL that includes an interpreter](https://github.com/microsoft/msccl), -which is API compatible with NCCL and is installed as normal. See https://github.com/microsoft/msccl for instructions. +SCCL's algorithms are executed by the [Microsoft Collective Communication Library +(MSCCL)](https://github.com/microsoft/msccl), which is API compatible with NCCL. See https://github.com/microsoft/msccl +for instructions. To use SCCL with PyTorch, the built in NCCL submodule has to be replaced with SCCL's version. Additionally, to expose the new native Alltoall support that SCCL adds, PyTorch's `torch.distributed` package can optionally be patched. The @@ -38,25 +100,6 @@ git apply third_party/nccl/nccl/patches/nccl.cpp.patch python setup.py install ``` -## Usage - -The SCCL Python package ships with a registry of synthesis strategies and hand optimized algorithms. These can be loaded -into [the runtime](https://github.com/parasailteam/msccl) through the `sccl.init` function, which must be called before -the application creates its NCCL communicator. For PyTorch this means before `torch.distributed` is initialized. - -The following snippet requests `sccl.init` to provide an Alltoall algorithm in a configuration of 2 Azure NDv2 machines: -``` -import sccl -sccl.init('ndv2', 2, (sccl.Collective.alltoall, ('1MB'))) -``` -The call will finds an algorithm provider that can create an Alltoall algorithm that is expected to be good with 1MB of -data. That will call a synthesis routine that writes the algorithm to disk. `sccl.init` will then pass a configuration -file pointing to this algorithm to the runtime through environment variables. - -See [the examples](examples/sccl_init.py) for more on `sccl.init` usage. - -Refer to the next section on availability of algorithms with `sccl.init`. - ### Note on Azure NDv2 Azure NDv2 does not expose the true PCIe topology of the machines to the VM and worse, does not assign PCIe devices @@ -66,19 +109,6 @@ script solves the automorphisms from the local VM's NVLink topology to the refer automorphisms based on measured placement of the Infiniband card such that GPU 0 is close to the NIC. A tool called [inspector-topo](https://github.com/microsoft/inspector-topo) needs to be available for the latter step. -## Available Algorithms - -SCCL's built-in algorithm providers currently includes an efficient Alltoall algorithm for Azure NDv2 nodes. Stay tuned -for more algorithms coming soon! - -https://github.com/parasailteam/sccl-presynth offers additional algorithms that have been pre-synthesized for fixed -configurations. To enable them install the package and import it before the call to `sccl.init`. - -## Synthesis - -SCCL started out as a synthesizer for collective algorithms, and has since expanded to cover a broader range of -programmability. See [this readme](SYNTHESIS.md) for using SCCL as a synthesizer. - ## Contributing This project welcomes contributions and suggestions. 
Most contributions require you to agree to a Contributor License diff --git a/sccl/autosynth/ndv4_plans.py b/sccl/autosynth/ndv4_plans.py index caa715e..03a2911 100644 --- a/sccl/autosynth/ndv4_plans.py +++ b/sccl/autosynth/ndv4_plans.py @@ -15,18 +15,14 @@ def register_ndv4_plans(): def ndv4_ring_allreduce(prog, nodes): allreduce_ring(size=8, channels=8) - @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '32MB'), machines=lambda x: x == 8 or x == 16 or x == 32) - def ndv4_alltoall(prog, nodes): + @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '32MB'), machines=lambda x: x == 8 or x == 16 or x == 32 or x == 64) + def ndv4_alltoall_hierarchical(prog, nodes): alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 8 or x == 16 or x == 32) - def ndv4_alltoall(prog, nodes): - alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) - - @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '32MB'), machines=lambda x: x == 64) - def ndv4_alltoall(prog, nodes): + def ndv4_alltoall_hierarchical(prog, nodes): alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 64) - def ndv4_alltoall(prog, nodes): + def ndv4_alltoall_three_step(prog, nodes): alltoall_three_step(num_nodes=nodes, gpus_per_node=8) From 24d8931439bfa53fbcf7dcb01099b5b57b0ea6be Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 22 Apr 2022 15:02:33 -0700 Subject: [PATCH 102/135] Typo and remark --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bbebee3..3990368 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,9 @@ See [the examples](examples/sccl_init.py) for more on `sccl.init` usage. ## Available Algorithms -To list the algorithms currently in SCCL's built-in registry, run `sccl plans list` on the command line. This will print out the following table (on 4/22/2022): +SCCL's built-in algorithms are registered for combinations of hardware configuration and size of input data where we +have benchmarked them to provide speedup over NCCL. To list the algorithms currently in SCCL's built-in registry, run +`sccl plans list` on the command line. This will print out the following table (on 4/22/2022): | Machine | Collective | # machines | From | To | Protocol | Priority | Plan name | |-----------|--------------|--------------|--------|----------|------------|------------|-------------------------------------| @@ -57,7 +59,7 @@ pre-synthesized for fixed configurations. To enable them install the package and MSCCLang is a high-level language for specifying collective communication algorithms in an intuitive chunk-oriented form. The language is available as a Python-integrated DSL. -The language is still under development and lacks comprehensive documentation. For now, please refer to our [the pre-print of our upcoming paper](https://arxiv.org/pdf/2201.11840.pdf) and the examples in [examples/scclang](examples/scclang/). +The language is still under development and lacks comprehensive documentation. For now, please refer to [the pre-print of our upcoming paper](https://arxiv.org/pdf/2201.11840.pdf) and the examples in [examples/scclang](examples/scclang/). 
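To give a feel for the DSL, here is a minimal sketch that drives one of the shipped MSCCLang programs, mirroring the usage in tests/test_language.py (topology size and program choice are illustrative):

```python
from sccl.language import *
from sccl.topologies import fully_connected
from sccl.language.collectives import AllToAll
from sccl.programs.alltoall_a100_yifan import alltoall_hierarchical

num_nodes, gpus_per_node = 4, 8
num_ranks = num_nodes * gpus_per_node
topology = fully_connected(num_ranks)
collective = AllToAll(num_ranks, 1, inplace=False)
with SCCLProgram("hierarchical_all_to_all", topology, collective, 1):
    alltoall_hierarchical(num_nodes, gpus_per_node)
    assert Check()  # verify every chunk reaches its destination
    XML()           # emit the XML consumed by the MSCCL runtime
```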
## Synthesis From 341a8353aba906fb236b0b7f33ceeeadfc0c58f6 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Mon, 2 May 2022 16:06:11 -0700 Subject: [PATCH 103/135] Fix machine count mismatch in README.md (#27) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3990368..5074370 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ have benchmarked them to provide speedup over NCCL. To list the algorithms curre Each line lists an algorithm registration and the conditions under which it is triggered. For example, the `ndv4_alltoall_hierarchical` algorithm will be used with NCCL's lower latency LL128 protocol when: - the user has called Alltoall, -- there are 1, 2, 4 or 8 Azure NDv4 machines, and +- there are 8, 16, 32 or 64 Azure NDv4 machines, and - the data size is from 1 MB to 32 MB. The repository [parasailteam/sccl-presynth](https://github.com/parasailteam/sccl-presynth) repository offers additional algorithms that have been From d11da9cb373190c4889ec52a8fb3aa63edfec9e9 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 10 May 2022 13:50:51 -0700 Subject: [PATCH 104/135] Fix sccl init example (#28) --- examples/sccl_init.py | 54 +++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/examples/sccl_init.py b/examples/sccl_init.py index 56bb4f8..44ca7ad 100644 --- a/examples/sccl_init.py +++ b/examples/sccl_init.py @@ -4,14 +4,13 @@ import os def show(): - print() - print(f"SCCL_CONFIG = {os.environ['SCCL_CONFIG']}") - print(f"NCCL_MIN_NCHANNELS = {os.environ['NCCL_MIN_NCHANNELS']}") - print(f"NCCL_NET_SHARED_BUFFERS = {os.environ['NCCL_NET_SHARED_BUFFERS']}") - print(f"Contents of {os.environ['SCCL_CONFIG']}:") - with open(os.environ['SCCL_CONFIG']) as f: - print(f.read()) - print() + if 'SCCL_CONFIG' in os.environ: + print() + print(f"SCCL_CONFIG = {os.environ['SCCL_CONFIG']}") + print(f"Contents of {os.environ['SCCL_CONFIG']}:") + with open(os.environ['SCCL_CONFIG']) as f: + print(f.read()) + print() print('=== Trigger a builtin synthesis plan ===') @@ -57,32 +56,33 @@ def alltoall_9000(machines): show() -print('=== SCCLang program ===') +# TODO: Update the following programs to use the new syntax +# print('=== SCCLang program ===') -from sccl.autosynth.registry import register_sccl_program -from sccl.topologies import line -from sccl.language import * +# from sccl.autosynth.registry import register_sccl_program +# from sccl.topologies import line +# from sccl.language import * -@register_sccl_program(line(2), 'allgather', 'two_gpus', machines= lambda m: m == 1) -def trivial_allgather(prog, nodes): - chunk(Buffer.input, 0, 0).send(0, Buffer.output, 0).send(1) - chunk(Buffer.input, 1, 0).send(1, Buffer.output, 1).send(0) +# @register_sccl_program(line(2), 'allgather', 'two_gpus', machines= lambda m: m == 1) +# def trivial_allgather(prog, nodes): +# chunk(Buffer.input, 0, 0).send(0, Buffer.output, 0).send(1) +# chunk(Buffer.input, 1, 0).send(1, Buffer.output, 1).send(0) -sccl.init('two_gpus', 1, (sccl.Collective.allgather, (0, None))) +# sccl.init('two_gpus', 1, (sccl.Collective.allgather, (0, None))) -show() +# show() -print('=== SCCLang program example ====') +# print('=== SCCLang program example ====') -from sccl.topologies import fully_connected -from sccl.programs.allreduce_a100_ring import allreduce_ring +# from sccl.topologies import fully_connected +# from sccl.programs.allreduce_a100_ring import allreduce_ring -@register_sccl_program(fully_connected(8), 
'allreduce', 'ndv4', chunk_factor=8, inplace=True, - instances=4, protocol='LL128', threadblock_policy=ThreadblockPolicy.manual, machines=lambda x: x == 1) -def ndv4_ring_allreduce(prog, nodes): - allreduce_ring(size=8, channels=8) +# @register_sccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, +# instances=4, protocol='LL128', threadblock_policy=ThreadblockPolicy.manual, machines=lambda x: x == 1) +# def ndv4_ring_allreduce(prog, nodes): +# allreduce_ring(size=8, channels=8) -sccl.init('ndv4', 1, (sccl.Collective.allreduce, (0, None))) +# sccl.init('ndv4', 1, (sccl.Collective.allreduce, (0, None))) -show() \ No newline at end of file +# show() \ No newline at end of file From 43edba7e5a342351796d329f2690b5295fc8a38c Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 20 May 2022 13:07:00 -0700 Subject: [PATCH 105/135] Make tests trigger on pull requests --- .github/workflows/tests.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 971e0d0..999c966 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -1,6 +1,9 @@ name: Tests -on: [push] +on: + push: + pull_request: + branches: [ main ] jobs: test: From 18f3e69922d93d484de5e0d3fe8b948e3f18ff27 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Fri, 20 May 2022 13:17:01 -0700 Subject: [PATCH 106/135] MSCCL DSL updates (#29) 1. Syntactic updates to the DSL 2. Optimized XML generation for AllReduce AllPairs algorithm 3. More algorithms for AllReduce, AllGather, and AllToAll 4. Faster compilation times 5. Bug fixes in threadblock/channel assignment --- .github/workflows/codeql.yml | 0 .github/workflows/tests.yaml | 0 .gitignore | 0 CODE_OF_CONDUCT.md | 0 CONTRIBUTING.md | 0 LICENSE | 0 README.md | 0 SECURITY.md | 0 SUPPORT.md | 0 SYNTHESIS.md | 0 dockerfiles/Dockerfile | 0 examples/requirements_sccl_init.txt | 0 examples/sccl_init.py | 0 .../scclang/allgather_recursive_doubling.py | 33 ++ examples/scclang/allgather_ring.py | 36 ++ examples/scclang/allreduce_a100_allpairs.py | 34 +- ...lreduce_a100_recursive_doubling_halving.py | 88 ++++ examples/scclang/allreduce_a100_ring.py | 16 +- examples/scclang/allreduce_a100_tree.py | 87 ---- examples/scclang/allreduce_binomial_tree.py | 63 +++ examples/scclang/allreduce_dgx1.py | 53 +- examples/scclang/allreduce_ndv2.py | 11 +- .../allreduce_recursive_doubling_halving.py | 46 ++ examples/scclang/alltoall_a100.py | 24 +- examples/scclang/alltoall_a100_mesh.py | 117 ----- examples/scclang/alltoall_a100_yifan.py | 12 +- examples/scclang/alltoall_allpairs.py | 4 +- examples/scclang/alltonext_backward.py | 22 +- examples/scclang/alltonext_forward.py | 22 +- examples/scclang/reducegather.py | 0 examples/scclang/simple/allgather_ring.py | 10 +- examples/scclang/simple/allreduce_ring.py | 16 +- examples/scclang/simple/custom_collective.py | 20 +- examples/send.py | 0 examples/unpermute_dgx1.py | 0 pytest.ini | 0 requirements.txt | 0 sccl/__init__.py | 0 sccl/__main__.py | 0 sccl/algorithm.py | 0 sccl/autosynth/__init__.py | 0 sccl/autosynth/ndv2_plans.py | 0 sccl/autosynth/ndv4_plans.py | 0 sccl/autosynth/registry.py | 0 sccl/cli/__init__.py | 0 sccl/cli/analyze.py | 0 sccl/cli/common.py | 0 sccl/cli/distribute.py | 0 sccl/cli/known_collectives.py | 0 sccl/cli/known_distributed_topologies.py | 0 sccl/cli/known_topologies.py | 0 sccl/cli/known_transformers.py | 0 sccl/cli/ncclize.py | 0 sccl/cli/plans.py | 0 sccl/cli/solve.py | 0 sccl/collectives.py | 0 
sccl/distributors/__init__.py | 0 sccl/distributors/alltoall_subproblem.py | 0 sccl/distributors/gather_scatter_alltoall.py | 0 sccl/distributors/greedy_alltoall.py | 0 sccl/instance.py | 0 sccl/isomorphisms.py | 0 sccl/language/__init__.py | 461 ++++++++++-------- sccl/language/buffer.py | 5 +- sccl/language/chunk.py | 14 +- sccl/language/collectives.py | 16 +- sccl/language/ir.py | 100 +++- sccl/language/passes.py | 6 +- sccl/language/rank_dag.py | 276 +++++------ sccl/language/routines.py | 28 ++ sccl/language/tb_assignment.py | 325 ++++++------ sccl/language/visualize.py | 103 ++++ sccl/ncd_reduction.py | 0 sccl/path_encoding.py | 0 sccl/programs/__init__.py | 0 sccl/programs/allreduce_a100_ring.py | 14 +- sccl/programs/alltoall_a100_8kp1.py | 16 +- sccl/programs/alltoall_a100_yifan.py | 15 +- sccl/rounds_bound.py | 0 sccl/serialization.py | 0 sccl/steps_bound.py | 0 sccl/strategies.py | 0 sccl/topologies/__init__.py | 0 sccl/topologies/amd.py | 0 sccl/topologies/distributed.py | 0 sccl/topologies/generic.py | 0 sccl/topologies/nvidia.py | 0 sccl/topologies/topology.py | 0 sccl/topologies/transformers.py | 0 setup.py | 1 + tests/__init__.py | 0 tests/common.py | 0 tests/test_algorithm.py | 0 tests/test_analyses.py | 0 tests/test_autosynth.py | 0 tests/test_cli.py | 0 tests/test_distributors.py | 0 tests/test_language.py | 137 ++++-- tests/test_path_encoding.py | 0 tests/test_serialization.py | 0 tests/test_topologies.py | 0 101 files changed, 1339 insertions(+), 892 deletions(-) mode change 100644 => 100755 .github/workflows/codeql.yml mode change 100644 => 100755 .github/workflows/tests.yaml mode change 100644 => 100755 .gitignore mode change 100644 => 100755 CODE_OF_CONDUCT.md mode change 100644 => 100755 CONTRIBUTING.md mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.md mode change 100644 => 100755 SECURITY.md mode change 100644 => 100755 SUPPORT.md mode change 100644 => 100755 SYNTHESIS.md mode change 100644 => 100755 dockerfiles/Dockerfile mode change 100644 => 100755 examples/requirements_sccl_init.txt mode change 100644 => 100755 examples/sccl_init.py create mode 100755 examples/scclang/allgather_recursive_doubling.py create mode 100755 examples/scclang/allgather_ring.py mode change 100644 => 100755 examples/scclang/allreduce_a100_allpairs.py create mode 100755 examples/scclang/allreduce_a100_recursive_doubling_halving.py mode change 100644 => 100755 examples/scclang/allreduce_a100_ring.py delete mode 100644 examples/scclang/allreduce_a100_tree.py create mode 100755 examples/scclang/allreduce_binomial_tree.py mode change 100644 => 100755 examples/scclang/allreduce_dgx1.py mode change 100644 => 100755 examples/scclang/allreduce_ndv2.py create mode 100755 examples/scclang/allreduce_recursive_doubling_halving.py mode change 100644 => 100755 examples/scclang/alltoall_a100.py delete mode 100644 examples/scclang/alltoall_a100_mesh.py mode change 100644 => 100755 examples/scclang/alltoall_a100_yifan.py mode change 100644 => 100755 examples/scclang/alltoall_allpairs.py mode change 100644 => 100755 examples/scclang/alltonext_backward.py mode change 100644 => 100755 examples/scclang/alltonext_forward.py mode change 100644 => 100755 examples/scclang/reducegather.py mode change 100644 => 100755 examples/scclang/simple/allgather_ring.py mode change 100644 => 100755 examples/scclang/simple/allreduce_ring.py mode change 100644 => 100755 examples/scclang/simple/custom_collective.py mode change 100644 => 100755 examples/send.py mode change 100644 => 100755 
examples/unpermute_dgx1.py mode change 100644 => 100755 pytest.ini mode change 100644 => 100755 requirements.txt mode change 100644 => 100755 sccl/__init__.py mode change 100644 => 100755 sccl/__main__.py mode change 100644 => 100755 sccl/algorithm.py mode change 100644 => 100755 sccl/autosynth/__init__.py mode change 100644 => 100755 sccl/autosynth/ndv2_plans.py mode change 100644 => 100755 sccl/autosynth/ndv4_plans.py mode change 100644 => 100755 sccl/autosynth/registry.py mode change 100644 => 100755 sccl/cli/__init__.py mode change 100644 => 100755 sccl/cli/analyze.py mode change 100644 => 100755 sccl/cli/common.py mode change 100644 => 100755 sccl/cli/distribute.py mode change 100644 => 100755 sccl/cli/known_collectives.py mode change 100644 => 100755 sccl/cli/known_distributed_topologies.py mode change 100644 => 100755 sccl/cli/known_topologies.py mode change 100644 => 100755 sccl/cli/known_transformers.py mode change 100644 => 100755 sccl/cli/ncclize.py mode change 100644 => 100755 sccl/cli/plans.py mode change 100644 => 100755 sccl/cli/solve.py mode change 100644 => 100755 sccl/collectives.py mode change 100644 => 100755 sccl/distributors/__init__.py mode change 100644 => 100755 sccl/distributors/alltoall_subproblem.py mode change 100644 => 100755 sccl/distributors/gather_scatter_alltoall.py mode change 100644 => 100755 sccl/distributors/greedy_alltoall.py mode change 100644 => 100755 sccl/instance.py mode change 100644 => 100755 sccl/isomorphisms.py mode change 100644 => 100755 sccl/language/__init__.py mode change 100644 => 100755 sccl/language/buffer.py mode change 100644 => 100755 sccl/language/chunk.py mode change 100644 => 100755 sccl/language/collectives.py mode change 100644 => 100755 sccl/language/ir.py mode change 100644 => 100755 sccl/language/passes.py mode change 100644 => 100755 sccl/language/rank_dag.py create mode 100644 sccl/language/routines.py mode change 100644 => 100755 sccl/language/tb_assignment.py create mode 100755 sccl/language/visualize.py mode change 100644 => 100755 sccl/ncd_reduction.py mode change 100644 => 100755 sccl/path_encoding.py mode change 100644 => 100755 sccl/programs/__init__.py mode change 100644 => 100755 sccl/programs/allreduce_a100_ring.py mode change 100644 => 100755 sccl/programs/alltoall_a100_8kp1.py mode change 100644 => 100755 sccl/programs/alltoall_a100_yifan.py mode change 100644 => 100755 sccl/rounds_bound.py mode change 100644 => 100755 sccl/serialization.py mode change 100644 => 100755 sccl/steps_bound.py mode change 100644 => 100755 sccl/strategies.py mode change 100644 => 100755 sccl/topologies/__init__.py mode change 100644 => 100755 sccl/topologies/amd.py mode change 100644 => 100755 sccl/topologies/distributed.py mode change 100644 => 100755 sccl/topologies/generic.py mode change 100644 => 100755 sccl/topologies/nvidia.py mode change 100644 => 100755 sccl/topologies/topology.py mode change 100644 => 100755 sccl/topologies/transformers.py mode change 100644 => 100755 setup.py mode change 100644 => 100755 tests/__init__.py mode change 100644 => 100755 tests/common.py mode change 100644 => 100755 tests/test_algorithm.py mode change 100644 => 100755 tests/test_analyses.py mode change 100644 => 100755 tests/test_autosynth.py mode change 100644 => 100755 tests/test_cli.py mode change 100644 => 100755 tests/test_distributors.py mode change 100644 => 100755 tests/test_language.py mode change 100644 => 100755 tests/test_path_encoding.py mode change 100644 => 100755 tests/test_serialization.py mode change 100644 => 100755 
tests/test_topologies.py diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml old mode 100644 new mode 100755 diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml old mode 100644 new mode 100755 diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md old mode 100644 new mode 100755 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/SECURITY.md b/SECURITY.md old mode 100644 new mode 100755 diff --git a/SUPPORT.md b/SUPPORT.md old mode 100644 new mode 100755 diff --git a/SYNTHESIS.md b/SYNTHESIS.md old mode 100644 new mode 100755 diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile old mode 100644 new mode 100755 diff --git a/examples/requirements_sccl_init.txt b/examples/requirements_sccl_init.txt old mode 100644 new mode 100755 diff --git a/examples/sccl_init.py b/examples/sccl_init.py old mode 100644 new mode 100755 diff --git a/examples/scclang/allgather_recursive_doubling.py b/examples/scclang/allgather_recursive_doubling.py new file mode 100755 index 0000000..5cd4f2e --- /dev/null +++ b/examples/scclang/allgather_recursive_doubling.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import argparse +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllGather + +# https://web.cels.anl.gov/~thakur/papers/mpi-coll.pdf +def allgather_recursive_doubling(size, instances, protocol): + topology = fully_connected(size) + collective = AllGather(size, instances, True) + with SCCLProgram("allgather_recursive_doubling", topology, collective, 1, protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): + count = 1 + while count < size: + # Every rank exchanges count chunks with neighbor count away + for rank in range(size): + for i in range(instances): + peer = rank ^ count + index = ((rank // count) * count) * instances + i * count + chunk(rank, Buffer.output, index, size=count).copy(peer, Buffer.output, index, sendtb=peer, recvtb=rank) + count *= 2 + + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('num_gpus', type=int, help ='number of gpus') +parser.add_argument('instances', type=int, help ='number of instances') +parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') +args = parser.parse_args() + +allgather_recursive_doubling(args.num_gpus, args.instances, args.protocol) diff --git a/examples/scclang/allgather_ring.py b/examples/scclang/allgather_ring.py new file mode 100755 index 0000000..8a6aaa0 --- /dev/null +++ b/examples/scclang/allgather_ring.py @@ -0,0 +1,36 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import argparse +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllGather + +# Ring allgather for A100s +# Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. 
+# channels=1 is standard ring, all chunks are assigned to the same tb/channel +# channels=8 devotes 1 tb/channel to handling 1 chunk of the data +def allgather_ring(size, channels, instances, protocol): + topology = fully_connected(size) + collective = AllGather(size, 1, True) + with SCCLProgram(f"allgather_ring_{channels}channelsperring", topology, collective, instances, + protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): + for step in range(0, size-1): + for index in range(0, size): + rank = (index + step) % size + c = chunk(rank, Buffer.output, index) + next_rank = (index + step + 1) % size + channel = index%channels + c = c.copy(next_rank, Buffer.output, index, sendtb=channel, recvtb=channel, ch=channel) + XML() + Check() + + +parser = argparse.ArgumentParser() +parser.add_argument('num_gpus', type=int, help ='number of gpus') +parser.add_argument('channels', type=int, help='Number of channels to use for 1 instance of the ring [1-8]') +parser.add_argument('instances', type=int, help='number of instances') +parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') +args = parser.parse_args() + +allgather_ring(args.num_gpus, args.channels, args.instances, args.protocol) diff --git a/examples/scclang/allreduce_a100_allpairs.py b/examples/scclang/allreduce_a100_allpairs.py old mode 100644 new mode 100755 index 1835529..97fe9d5 --- a/examples/scclang/allreduce_a100_allpairs.py +++ b/examples/scclang/allreduce_a100_allpairs.py @@ -6,43 +6,45 @@ from sccl.topologies import * from sccl.language.collectives import AllReduce -def allreduce_allpairs(instances): - size = 8 - chunksperloop = 8 +def allreduce_allpairs(gpus, instances, protocol): + size = gpus + chunksperloop = gpus * gpus topology = fully_connected(size) collective = AllReduce(size, chunksperloop, True) - with SCCLProgram("allreduce_pairs", topology, collective, instances, protocol="LL", - interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual): + with SCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, + interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): # Each rank sends the nth chunk to the nth rank into scratch space for r1 in range(size): for r2 in range(size): if r1 != r2: - index = r2 - c = chunk(r1, Buffer.input, index) - c.send(r2, 'scratch', sendtb=r2, recvtb=r1, ch=0) + index = r2 * size + c = chunk(r1, Buffer.input, index, size=size) + c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) # Each rank performs a local reduction on the nth chunk + # Utilize 8 threadblocks for this reduction for better parallelism for r in range(size): - for index in range(0, 7): - c = chunk('scratch', r, index) - c.reduce(r, Buffer.input, r, sendtb=r, ch=0) + for index in range(0, size * (size-1)): + c = chunk(r, Buffer.input, r*size + (index % size)) + c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) # Each rank sends the fully reduced nth chunk to all other gpus for r1 in range(size): for r2 in range(size): if r1 != r2: - index = r1 - c = chunk(r1, Buffer.input, index) - c.send(r2, Buffer.input, index, sendtb=r2, recvtb=r1, ch=0) + index = r1 * size + c = chunk(r1, Buffer.input, index, size) + c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) XML() Check() parser = argparse.ArgumentParser() +parser.add_argument('num_gpus', type=int, help ='number of gpus') parser.add_argument('instances', type=int, help='number of instances') -# 
parser.add_argument('threadblocks', type=int, default=0, help='number of threadblocks per instance') +parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') args = parser.parse_args() -allreduce_allpairs(args.instances) \ No newline at end of file +allreduce_allpairs(args.num_gpus, args.instances, args.protocol) \ No newline at end of file diff --git a/examples/scclang/allreduce_a100_recursive_doubling_halving.py b/examples/scclang/allreduce_a100_recursive_doubling_halving.py new file mode 100755 index 0000000..81e4cb7 --- /dev/null +++ b/examples/scclang/allreduce_a100_recursive_doubling_halving.py @@ -0,0 +1,88 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Halving-doubling implementation of allreduce + +import argparse + +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllReduce + + +def allreduce(ways, instances, protocol): + topology = fully_connected(8) + size = topology.num_nodes() # Number of gpus + logical_chunk = 8 * ways + collective = AllReduce(size, logical_chunk, True) + with SCCLProgram("allreduce_a100_recursive_doubling_halving", topology, collective, instances, protocol, interleaved_replication=False): + # 1 reduction between pairs of gpus of count + def recursive_doubling(pairs, count, next_index, lc, sendtb, recvtb): + current_index = next_index.copy() + for r in range(size): + next = r ^ pairs + offset = (count if r <= next else 0) + next_index[next] += offset + # Split the reduce into two separate reduces to enable fused instructions + block = 2 ** pairs + for x in range(count): + index = current_index[r] + offset + lc*8 + x + c1 = chunk(r, Buffer.input, index) + c = chunk(next, Buffer.input, index) + c.reduce(c1, ch=lc, sendtb=sendtb, recvtb=recvtb) + + + # Propagates reduced chunks in reverse order + def recursive_halving(pairs, count, next_index, lc, sendtb, recvtb): + current_index = next_index.copy() + for r in range(size): + next = r ^ pairs + offset = (count if r > next else 0) + next_index[r] -= offset + index = current_index[r] + lc*8 + c = chunk(r, Buffer.input, index, count) + c.copy(next, Buffer.input, index, ch=lc, sendtb=sendtb, recvtb=recvtb) + + next_index = [0] * 8 + recursive_doubling(1, 4, next_index, 0, 0, 1) + recursive_doubling(2, 2, next_index, 0, 1, 2) + recursive_doubling(4, 1, next_index, 0, 2, 3) + + recursive_halving(4, 1, next_index, 0, 2, 3) + recursive_halving(2, 2, next_index, 0, 1, 2) + recursive_halving(1, 4, next_index, 0, 0, 1) + + if ways > 1: + next_index = [0] * 8 + lc = 1 + recursive_doubling(4, 4, next_index, lc, 8, 9) + recursive_doubling(2, 2, next_index, lc, 9, 10) + recursive_doubling(1, 1, next_index, lc, 10, 11) + + recursive_halving(1, 1, next_index, lc, 10, 11) + recursive_halving(2, 2, next_index, lc, 9, 10) + recursive_halving(4, 4, next_index, lc, 8, 9) + + if ways > 2: + next_index = [0] * 8 + lc = 2 + recursive_doubling(2, 4, next_index, lc, 4, 5) + recursive_doubling(1, 2, next_index, lc, 5, 6) + recursive_doubling(4, 1, next_index, lc, 6, 7) + + + recursive_halving(4, 1, next_index, lc, 6, 7) + recursive_halving(1, 2, next_index, lc, 5, 6) + recursive_halving(2, 4, next_index, lc, 4, 5) + + + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('ways', type=int, help='number of parallel trees to perform reduction min:1 max:3') +parser.add_argument('instances', type=int, help='number of instances') +parser.add_argument('--protocol', type=str, default='LL',
choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL') +args = parser.parse_args() +assert args.ways >= 1 and args.ways <= 3 +allreduce(args.ways, args.instances, args.protocol) diff --git a/examples/scclang/allreduce_a100_ring.py b/examples/scclang/allreduce_a100_ring.py old mode 100644 new mode 100755 index 37ae767..3906a99 --- a/examples/scclang/allreduce_a100_ring.py +++ b/examples/scclang/allreduce_a100_ring.py @@ -10,20 +10,19 @@ # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. # channels=1 is standard ring, all chunks are assigned to the same tb/channel # channels=8 devotes 1 tb/channel to handling 1 chunk of the data -def allreduce_ring(instances, channels): - size = 8 +def allreduce_ring(size, instances, channels, protocol): topology = fully_connected(size) collective = AllReduce(size, size, True) with SCCLProgram(f"allreduce_ring_{channels}channelsperring", topology, collective, instances, - protocol="LL128", threadblock_policy=ThreadblockPolicy.manual): + protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): # Reduce ring for step in range(0, size-1): for index in range(0, size): rank = (index + step) % size - c = chunk(rank, Buffer.input, index) next_rank = (index + step + 1) % size channel = index%channels - c = c.reduce(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) + c = chunk(next_rank, Buffer.input, index) + c.reduce(chunk(rank, Buffer.input, index), ch=channel, recvtb=channel, sendtb=channel) # Propagate ring for step in range(-1, size-2): for index in range(0, size): @@ -31,15 +30,16 @@ def allreduce_ring(instances, channels): c = chunk(rank, Buffer.input, index) next_rank = (index + step + 1) % size channel = index%channels - c = c.send(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) + c = c.copy(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) XML() Check() - parser = argparse.ArgumentParser() +parser.add_argument('num_gpus', type=int, help ='number of gpus') parser.add_argument('channels', type=int, help='Number of channels to use for 1 instance of the ring [1-8]') parser.add_argument('instances', type=int, help='number of instances') +parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128') args = parser.parse_args() -allreduce_ring(args.instances, args.channels) +allreduce_ring(args.num_gpus, args.instances, args.channels, args.protocol) diff --git a/examples/scclang/allreduce_a100_tree.py b/examples/scclang/allreduce_a100_tree.py deleted file mode 100644 index 28e8e36..0000000 --- a/examples/scclang/allreduce_a100_tree.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License.
- -# Halving-doubling implementation of allreduce - -import argparse - -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllReduce - - -def allreduce(ways, instances): - topology = fully_connected(8) - size = topology.num_nodes() # Number of gpus - logical_chunk = 8 * ways - tb_per_channel = 12 - collective = AllReduce(size, logical_chunk, True) - with SCCLProgram("allreduce_a100_tree", topology, collective, instances, 'Simple', interleaved_replication=False): - # 1 reduction between pairs of gpus of count - def reduce_tree(pairs, count, next_index, lc, sendtb, recvtb): - current_index = next_index.copy() - for r in range(size): - next = r ^ pairs - offset = (count if r <= next else 0) - next_index[next] += offset - # Split the reduce into two separate reduces to enable fused instructions - block = 2 ** pairs - for x in range(count): - index = current_index[r] + offset + lc*8 + x - c = chunk(r, Buffer.input, index) - c.reduce(next, Buffer.input, index, ch=lc, sendtb=sendtb, recvtb=recvtb) - - - # Propagates reduced chunks in reverse order - def propagate_tree(pairs, count, next_index, lc, sendtb, recvtb): - current_index = next_index.copy() - for r in range(size): - next = r ^ pairs - offset = (count if r > next else 0) - next_index[r] -= offset - index = current_index[r] + lc*8 - c = chunk(r, Buffer.input, index, count) - c.send(next, Buffer.input, index, ch=lc, sendtb=sendtb, recvtb=recvtb) - - next_index = [0] * 8 - reduce_tree(1, 4, next_index, 0, 0, 1) - reduce_tree(2, 2, next_index, 0, 1, 2) - reduce_tree(4, 1, next_index, 0, 2, 3) - - propagate_tree(4, 1, next_index, 0, 2, 3) - propagate_tree(2, 2, next_index, 0, 1, 2) - propagate_tree(1, 4, next_index, 0, 0, 1) - - if ways > 1: - next_index = [0] * 8 - lc = 1 - reduce_tree(4, 4, next_index, lc, 8, 9) - reduce_tree(2, 2, next_index, lc, 9, 10) - reduce_tree(1, 1, next_index, lc, 10, 11) - - propagate_tree(1, 1, next_index, lc, 10, 11) - propagate_tree(2, 2, next_index, lc, 9, 10) - propagate_tree(4, 4, next_index, lc, 8, 9) - - if ways > 2: - next_index = [0] * 8 - lc = 2 - reduce_tree(2, 4, next_index, lc, 4, 5) - reduce_tree(1, 2, next_index, lc, 5, 6) - reduce_tree(4, 1, next_index, lc, 6, 7) - - - propagate_tree(4, 1, next_index, lc, 6, 7) - propagate_tree(1, 2, next_index, lc, 5, 6) - propagate_tree(2, 4, next_index, lc, 4, 5) - - - XML() - Check() - -parser = argparse.ArgumentParser() -parser.add_argument('ways', type=int, help='number of parallel trees to perform reduction min:1 max:2') -parser.add_argument('instances', type=int, help='number of instances') -args = parser.parse_args() -assert args.ways >=0 and args.ways <= 3 -allreduce(args.ways, args.instances) diff --git a/examples/scclang/allreduce_binomial_tree.py b/examples/scclang/allreduce_binomial_tree.py new file mode 100755 index 0000000..8abe9e4 --- /dev/null +++ b/examples/scclang/allreduce_binomial_tree.py @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import argparse +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllReduce + +# Binomial tree and mirrored binomial tree +# Mirrored trees adopted from: http://algo2.iti.kit.edu/documents/2tree.pdf +def allreduce_binomial_tree(size, instances, trees, protocol): + topology = fully_connected(size) + collective = AllReduce(size, trees, True) + with SCCLProgram("allreduce_binomial_tree", topology, collective, instances, protocol=protocol): + distance = 1 + # Reduce tree - reducing onto Rank 0 + while distance <= size // 2: + # Reduce onto the left neighbor that is distance away + for rank in range(0, size, distance*2): + peer = rank + distance + c1 = chunk(peer, Buffer.input, 0) + chunk(rank, Buffer.input, 0).reduce(c1) + distance *= 2 + # Broadcast tree - root is Rank 0 + distance = distance // 2 + while distance >= 1: + # Copy to the right neighbor that is distance away + for rank in range(0, size, distance*2): + peer = rank + distance + chunk(rank, Buffer.input, 0).copy(peer, Buffer.input, 0) + distance = distance // 2 + + # Mirrored version of the tree + # Reduce tree - reducing onto Rank N-1 + if trees == 2: + distance = 1 + while distance <= size // 2: + # Reduce onto the right neighbor that is distance away + for rank in range(size-1, 0, -distance*2): + peer = rank - distance + c1 = chunk(peer, Buffer.input, 1) + chunk(rank, Buffer.input, 1).reduce(c1) + distance *= 2 + # Broadcast tree - root is Rank N-1 + distance = distance // 2 + while distance >= 1: + # Copy to the left neighbor that is distance away + for rank in range(size-1, 0, -distance*2): + peer = rank - distance + chunk(rank, Buffer.input, 1).copy(peer, Buffer.input, 1) + distance = distance // 2 + + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('num_gpus', type=int, help ='number of gpus') +parser.add_argument('trees', type=int, choices=[1, 2], help ='number of trees') +parser.add_argument('instances', type=int, help ='number of instances') + +parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') +args = parser.parse_args() +allreduce_binomial_tree(args.num_gpus, args.instances, args.trees, args.protocol) \ No newline at end of file diff --git a/examples/scclang/allreduce_dgx1.py b/examples/scclang/allreduce_dgx1.py old mode 100644 new mode 100755 index 2318687..79157bd --- a/examples/scclang/allreduce_dgx1.py +++ b/examples/scclang/allreduce_dgx1.py @@ -14,41 +14,36 @@ def allreduce(num_nodes, instances): remote_bw = 1 topology = distributed_fully_connected(local_topology, num_nodes, remote_bw) size = topology.num_nodes() - collective = AllReduce(size, instances, True) - local_ring_order = [1,3,2,6,7,5,4,0] # Reductions will happen locally within a node in this order.
+ collective = AllReduce(size, 1, True) + local_ring_order = [1,3,2,6,7,5,4,0] def rank(n, g): return local_ring_order[g] + n * num_local_gpus - with SCCLProgram("allreduce_ring_dgx1", topology, collective, 1, threadblock_policy=ThreadblockPolicy.manual): + with SCCLProgram("allreduce_ring_dgx1", topology, collective, 1): # Chunks travels around local rings being reduced (local_gpus-1 hops) starting at local gpu 1 # At the end of the most reduced chunk ends up on local gpu 0 every each node - for ch in range(instances): - for n in range(num_nodes): - r = rank(n, 0) # Start at local gpu 1 (index 0 in local_ring_order) - c = chunk(r, Buffer.input, ch) - for g in range(1, 8): - next = rank(n, g) - c = c.reduce(next, buffer=Buffer.input, index=ch, ch=ch, sendtb=0+3*ch, recvtb=0+3*ch) - - # At this point gpu0 and gpu8 have the two most reduced chunks - # 1 IB send to fully reduce chunk + 1 IB send to update other node - c0 = chunk(0, Buffer.input, ch) - c0 = c0.send(9, buffer=Buffer.input, index=ch, ch=ch, sendtb=0+3*ch, recvtb=0+3*ch) - c1 = chunk(8, Buffer.input, ch) - c1 = c1.send(1, buffer=Buffer.input, index=ch, ch=ch, sendtb=0+3*ch, recvtb=0+3*ch) - - c0 = c0.reduce(8, buffer=Buffer.input, index=ch, ch=ch, sendtb=2+3*ch, recvtb=2+3*ch) # Completely reduced chunk on node 1, gpu0 - c1 = c1.reduce(0, buffer=Buffer.input, index=ch, ch=ch, sendtb=2+3*ch, recvtb=2+3*ch) # Completely reduced chunk on node 0, gpu0 - - # Propagate the fully reduced chunks going backwards around the ring - for n in range(num_nodes): - r = rank(n, -1) - c = chunk(r, Buffer.input, ch) - for g in range(6, -1, -1): - next = rank(n, g) - c = c.send(next, buffer=Buffer.input, index=ch, ch=ch, sendtb=2+3*ch, recvtb=2+3*ch) + for n in range(num_nodes): + r = rank(n, 0) # Start at local gpu 1 (index 0 in local_ring_order) + c = chunk(r, Buffer.input, 0) + for g in range(1, 8): + c = c.reduce(rank(n,g), Buffer.input, 0) + + # At this point gpu0 and gpu8 have the two most reduced chunks + # 1 IB send to fully reduce chunk + 1 IB send to update other node + + chunk(0, Buffer.input, 0).send(9, Buffer.input, 0) + chunk(8, Buffer.input, 0).send(1, Buffer.input, 0).reduce(0, Buffer.input, 0) + chunk(9, Buffer.input, 0).reduce(8, Buffer.input, 0) + + # Propagate the fully reduced chunks going backwards around the ring + for n in range(num_nodes): + r = rank(n, 7) + c = chunk(r, Buffer.input, 0) + for g in range(6, -1, -1): + next = rank(n, g) + c = c.send(next, Buffer.input, 0) XML() Check() @@ -58,6 +53,6 @@ def rank(n, g): parser.add_argument('instances', type=int, help='number of instances') args = parser.parse_args() -assert args.num_nodes > 1, "Number of nodes must be greater than 1" +assert args.num_nodes == 2, "Only works for 2 nodes right now" allreduce(args.num_nodes, args.instances) diff --git a/examples/scclang/allreduce_ndv2.py b/examples/scclang/allreduce_ndv2.py old mode 100644 new mode 100755 index a178923..12ba399 --- a/examples/scclang/allreduce_ndv2.py +++ b/examples/scclang/allreduce_ndv2.py @@ -2,17 +2,18 @@ # Licensed under the MIT License. 
import argparse - from sccl.language import * from sccl.topologies.distributed import * from sccl.topologies.nvidia import * +from sccl.topologies import * from sccl.language.collectives import AllReduce def allreduce(instances): - topology = dgx1() - num_local_gpus = 8 - size = topology.num_nodes() # Number of gpus - logical_chunk = 8 + size = 8 + num_local_gpus = size + topology = fully_connected(size) + # size = topology.num_nodes() # Number of gpus + logical_chunk = size collective = AllReduce(size, logical_chunk, True) with SCCLProgram("allreduce_ndv2", topology, collective, instances, interleaved_replication=False): # local reduce_scatter diff --git a/examples/scclang/allreduce_recursive_doubling_halving.py b/examples/scclang/allreduce_recursive_doubling_halving.py new file mode 100755 index 0000000..dde2cb4 --- /dev/null +++ b/examples/scclang/allreduce_recursive_doubling_halving.py @@ -0,0 +1,46 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +import argparse + +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import AllReduce + +def reduce_scatter_vector_halving_distance_doubling(size): + count = size // 2 + while count >= 1: + for rank in range(size): + peer = rank ^ count + index = ((peer // count) * count) + c1 = chunk(rank, Buffer.input, index, size=count) + chunk(peer, Buffer.output, index, size=count).reduce(c1, sendtb=peer, recvtb=rank, ch=0) + count //= 2 + +def allgather_recursive_vector_doubling_distance_halving(size): + count = 1 + while count < size: + for rank in range(size): + peer = rank ^ count + index = ((rank // count) * count) + chunk(rank, Buffer.output, index, size=count).copy(peer, Buffer.output, index, sendtb=peer, recvtb=rank, ch=0) + count *= 2 + +def allreduce(size, instances, protocol): + topology = fully_connected(size) + logical_chunk = size + collective = AllReduce(size, logical_chunk, True) + with SCCLProgram("allreduce_recursive_doubling_halving", topology, collective, instances, protocol, + interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual): + reduce_scatter_vector_halving_distance_doubling(size) + allgather_recursive_vector_doubling_distance_halving(size) + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('num_gpus', type=int, help ='number of gpus') +parser.add_argument('instances', type=int, help='number of instances') +parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL') +args = parser.parse_args() +allreduce(args.num_gpus, args.instances, args.protocol) diff --git a/examples/scclang/alltoall_a100.py b/examples/scclang/alltoall_a100.py old mode 100644 new mode 100755 index 5a295ef..27a615e --- a/examples/scclang/alltoall_a100.py +++ b/examples/scclang/alltoall_a100.py @@ -62,13 +62,13 @@ def AddChunk(ib_chunks, key, c): # buffer_key = (n1, n2) # # Send chunk to the gather_rank.
Send returns a chunk reference to the # # receiver's chunk - # c = c.send(gather_rank, buffer=buffer_key, ch=ch) + # c = c.copy(gather_rank, buffer=buffer_key, ch=ch) # # Group the chunks using a particular IB pair into one large chunk reference # AddChunk(ib_chunks, buffer_key, c) # else: - # # Directly send chunks destined for ranks within the node or + # # Directly copy chunks destined for ranks within the node or # # copy chunks destined for current rank into the output buffer - # c.send(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch) + # c.copy(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch) for n1 in range(num_nodes): for g1 in range(gpus_per_node): @@ -83,17 +83,17 @@ def AddChunk(ib_chunks, key, c): buffer_key = (n1, n2) # Send chunk to the gather_rank. Send returns a chunk reference to the # receiver's chunk - c = c.send(gather_rank, buffer=buffer_key, ch=ch*2) + c = c.copy(gather_rank, buffer=buffer_key, ch=ch*2) # Group the chunks using a particular IB pair into one large chunk reference AddChunk(ib_chunks, buffer_key, c) else: - # Within a node - direct send/copy the chunks over nvlink to the output buffer. - # Use a different channel to ensure that we don't get in the way of sends/receives above + # Within a node - directly copy the chunks over nvlink to the output buffer. + # Use a different channel to ensure that we don't get in the way of copies/receives above which are on the critical path. for g2 in range(gpus_per_node): r2 = RankFromNodeGpuPair(n2, g2) c = chunk(r1, Buffer.input, r2 * instances + ch) - c.send(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch*2) + c.copy(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch*2) @@ -101,23 +101,23 @@ def AddChunk(ib_chunks, key, c): for buffer_key, ib_chunk in ib_chunks.items(): (n1, n2) = buffer_key _, scatter_rank = CrossNodeGpus(n1, n2) - # IB send divided across multiple parallel channels + # IB copy divided across multiple parallel channels chunks = ib_chunk.split(ib_connections) for ch, c in enumerate(chunks): - # Note: If we are only going to use 1 IB connection for each IB send + # Note: If we are only going to use 1 IB connection for each IB copy # alternate between channels 0 and 1 to utilize both IB links. if ib_connections == 1: ib_channel = c.rank % 2 else: ib_channel = ch - c = c.send(scatter_rank, buffer=buffer_key, ch=ib_channel) + c = c.copy(scatter_rank, buffer=buffer_key, ch=ib_channel) # Local scatter cs = c.split(gpus_per_node * gpus_per_node) for i, c in enumerate(cs): # Access the chunk's destination rank and index to route it to its final place final_rank = c.get_dst_rank() index = c.get_dst_index() - c.send(final_rank, buffer=Buffer.output, index=index, ch=ch*2 + 1) + c.copy(final_rank, buffer=Buffer.output, index=index, ch=ch*2 + 1) XML() # Prints the XML Check() @@ -126,7 +126,7 @@ def AddChunk(ib_chunks, key, c): parser.add_argument('num_nodes', type=int, help ='number of nodes') parser.add_argument('gpus_per_node', type=int, help ='gpus per node') parser.add_argument('instances', type=int, help='number of instances') -parser.add_argument('--ib_connections', type=int, default=-1, help='Number of connections used for each IB send. Default: number of instances') +parser.add_argument('--ib_connections', type=int, default=-1, help='Number of connections used for each IB copy.
Default: number of instances') args = parser.parse_args() if args.ib_connections == -1: diff --git a/examples/scclang/alltoall_a100_mesh.py b/examples/scclang/alltoall_a100_mesh.py deleted file mode 100644 index 311c44e..0000000 --- a/examples/scclang/alltoall_a100_mesh.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import argparse -from typing import Callable, List - -import sccl.language -import sccl.topologies -from sccl.language.collectives import AllToAll - - -def alltoall_mesh(nnodes: int, ngpus: int, nchannels: int, threadblocks: int) -> None: - """Generate XML for 4-phase mesh alltoall algorithm. - - Args: - nnodes (int): Number of nodes. - ngpus (int): Number of GPUs per node. - nchannels (int): Number of channels/instances. - """ - nranks: int = nnodes * ngpus - node_rank: Callable[[int], int] = lambda r: r // ngpus - local_rank: Callable[[int], int] = lambda r: r % ngpus - stride_idx: Callable[[int, int, int], int] = lambda r, step, n: n // step * (r % step) + r // step - shift_channel: Callable[[int, int], int] = lambda chunk_idx, ch: chunk_idx + nranks * ch - - topology = sccl.topologies.fully_connected(nranks) - collective = AllToAll(nranks, nchannels, inplace=False, name='alltoall') - with sccl.language.SCCLProgram('alltoall_mesh', topology, collective, instances=1, protocol='Simple', threadblocks=threadblocks): - # get device on all ranks - devices: List[sccl.language.Process] = list(map(lambda r: sccl.language.Rank(r), range(nranks))) - - for ch in range(nchannels): - # phase-0: per-gpu (step=ngpus) stride copy - for r in range(nranks): - for peer in range(nranks): - chunk = devices[r].input(peer * nchannels + ch) - chunk.send(r, buffer='phase-0', index=shift_channel(stride_idx(peer, ngpus, nranks), ch), ch=ch) - - # phase-1: intra-node alltoall - for r in range(nranks): - for g in range(ngpus): - peer = g + node_rank(r) * ngpus - chunk = devices[r].scratch('phase-0', shift_channel(g * nnodes, ch), size=nnodes) - chunk.send(peer, buffer='phase-1', index=shift_channel(local_rank(r) * nnodes, ch), ch=ch) - - # phase-2: per-gpu (step=nnodes) stride copy - for r in range(nranks): - for peer in range(nranks): - chunk = devices[r].scratch('phase-1', shift_channel(peer, ch)) - chunk.send(r, buffer='phase-2', index=shift_channel(stride_idx(peer, nnodes, nranks), ch), ch=ch) - - # phase-3: inter-node alltoall - for r in range(nranks): - for n in range(nnodes): - peer = local_rank(r) + n * ngpus - chunk = devices[r].scratch('phase-2', shift_channel(n * ngpus, ch), size=ngpus) - if nchannels > 1: - chunk.send(peer, buffer='phase-3', index=shift_channel(node_rank(r) * ngpus, ch), ch=ch) - else: - chunk.send( - peer, - buffer=sccl.language.Buffer.output, - index=shift_channel(node_rank(r) * ngpus, ch), - ch=ch - ) - - # re-order chunks in channels - if nchannels <= 1: - continue - for r in range(nranks): - for peer in range(nranks): - chunk = devices[r].scratch('phase-3', shift_channel(peer, ch)) - chunk.send( - r, - buffer=sccl.language.Buffer.output, - index=stride_idx(peer, nranks, nranks * nchannels) + ch, - ch=ch - ) - - sccl.language.XML() - sccl.language.Check() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '-n', - '--num_nodes', - type=int, - default=2, - help='number of nodes', - ) - parser.add_argument( - '-g', - '--gpus_per_node', - type=int, - default=4, - help='gpus per node', - ) - parser.add_argument( - '-c', - '--channels', - type=int, - default=2, - 
help='number of channels', - ) - - parser.add_argument( - '-t', - '--threadblocks', - type=int, - default=0, - help='number of threadblocks. Default: 0, SCCLang controlled', - ) - args = parser.parse_args() - - alltoall_mesh(args.num_nodes, args.gpus_per_node, args.channels, args.threadblocks) \ No newline at end of file diff --git a/examples/scclang/alltoall_a100_yifan.py b/examples/scclang/alltoall_a100_yifan.py old mode 100644 new mode 100755 index d6793c4..3d94484 --- a/examples/scclang/alltoall_a100_yifan.py +++ b/examples/scclang/alltoall_a100_yifan.py @@ -23,19 +23,19 @@ def alltoall_hierarchical(num_nodes, gpus_per_node, protocol): for g2 in range(gpus_per_node): rank2 = n1 * gpus_per_node + g2 - # chunk to send: g2 on n2 + # chunk to copy: g2 on n2 index = n2 * gpus_per_node + g2 c = chunk(rank1, Buffer.input, index) - c = c.send(rank2, f'send_{n2}') + c = c.copy(rank2, f'copy_{n2}') for r in range(1,num_nodes): n2 = (n1 + r) % num_nodes - # IB send + # IB copy for g1 in range(gpus_per_node): rank = n1 * gpus_per_node + g1 ib_peer = n2 * gpus_per_node + g1 - c = chunk(rank, f'send_{n2}', 0, 8) - c = c.send(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % 8)*2+(rank%2)+2) + c = chunk(rank, f'copy_{n2}', 0, 8) + c = c.copy(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % 8)*2+(rank%2)+2) # Handle local chunks within a node @@ -43,7 +43,7 @@ def alltoall_hierarchical(num_nodes, gpus_per_node, protocol): for g in range(gpus_per_node): index = (rank // gpus_per_node) * gpus_per_node + g c = chunk(rank, Buffer.input, index) - c.send(c.get_dst_rank(), Buffer.output, c.get_dst_index()) + c.copy(c.get_dst_rank(), Buffer.output, c.get_dst_index()) XML() # Prints the XML Check() diff --git a/examples/scclang/alltoall_allpairs.py b/examples/scclang/alltoall_allpairs.py old mode 100644 new mode 100755 index c037149..3a93797 --- a/examples/scclang/alltoall_allpairs.py +++ b/examples/scclang/alltoall_allpairs.py @@ -14,7 +14,7 @@ def alltoall(num_ranks, instances, protocol): with SCCLProgram("alltoall_allpairs", topology, collective, instances=instances, protocol=protocol): for r in range(num_ranks): for index in range(num_ranks): - chunk(r, Buffer.input, index).send(index, Buffer.output, r) + chunk(r, Buffer.input, index).copy(index, Buffer.output, r) XML() Check() @@ -22,7 +22,7 @@ def alltoall(num_ranks, instances, protocol): parser = argparse.ArgumentParser() parser.add_argument('num_gpus', type=int, help ='number of gpus') parser.add_argument('instances', type=int, help ='number of instances') -parser.add_argument('protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') +parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. 
Default: Simple') args = parser.parse_args() alltoall(args.num_gpus, args.instances, args.protocol) diff --git a/examples/scclang/alltonext_backward.py b/examples/scclang/alltonext_backward.py old mode 100644 new mode 100755 index fb3e12b..f56b56d --- a/examples/scclang/alltonext_backward.py +++ b/examples/scclang/alltonext_backward.py @@ -59,26 +59,26 @@ def rank(node, local_rank): if r == 0: continue - # Cross node send - cooperative + # Cross node copy - cooperative if g == 0: for ch in range(chunks): c = chunk(r, Buffer.input, ch) if ch == 0: - # 2 steps: IB send to (node-1, g) then gather onto (node+1, num_local_gpus-1) - c = c.send(rank(n-1, ch), 'gather', 0, ch=ch%2) + # 2 steps: IB copy to (node-1, g) then gather onto (node+1, num_local_gpus-1) + c = c.copy(rank(n-1, ch), f's{n}->{n+1}', 0, ch=ch%2) elif ch == num_local_gpus-1: - # 2 steps: Scatter - send to (node, num_local_gpus-1), IB send to (node+1, num_local_gpus-1) - c = c.send(rank(n, ch), 'scatter', 0, ch=ch%2) + # 2 steps: Scatter - copy to (node, num_local_gpus-1), IB copy to (node+1, num_local_gpus-1) + c = c.copy(rank(n, ch), f's{n}->{n+1}', 0, ch=ch%2) else: - # 3 steps: Scatter - send to (node, g), IB send to (node-1, g), gather onto (node-1, num_local_gpus-1) - c = c.send(rank(n, ch), 'scatter', 0, ch=ch%2) - c = c.send(rank(n-1, ch), 'gather', 0, ch=ch%2) - c.send(r-1, Buffer.output, c.get_dst_index(), ch=ch%2) + # 3 steps: Scatter - copy to (node, g), IB copy to (node-1, g), gather onto (node-1, num_local_gpus-1) + c = c.copy(rank(n, ch), f's{n}->{n+1}', 0, ch=ch%2) + c = c.copy(rank(n-1, ch), f's{n}->{n+1}', 0, ch=ch%2) + c.copy(r-1, Buffer.output, c.get_dst_index(), ch=ch%2) - # Normal send - directly + # Normal copy - directly else: c = chunk(r, Buffer.input, 0, chunks) - c.send(r-1, Buffer.output, 0, ch=g%2) + c.copy(r-1, Buffer.output, 0, ch=g%2) Check() XML() diff --git a/examples/scclang/alltonext_forward.py b/examples/scclang/alltonext_forward.py old mode 100644 new mode 100755 index 2edc419..3579dd4 --- a/examples/scclang/alltonext_forward.py +++ b/examples/scclang/alltonext_forward.py @@ -61,27 +61,27 @@ def rank(node, local_rank): if r == size - 1: continue - # Cross node send - cooperative + # Cross node copy - cooperative if g == num_local_gpus -1: for ch in range(chunks): c = chunk(r, Buffer.input, ch) - if ch == 0: # 2 steps: Scatter - send to (node, 0), IB send to (node+1, 0) - c = c.send(rank(n, ch), 'scatter', 0, ch=ch%2) + if ch == 0: # 2 steps: Scatter - copy to (node, 0), IB copy to (node+1, 0) + c = c.copy(rank(n, ch), f's{n}->{n+1}', 0, ch=ch%2) elif ch == num_local_gpus-1: - # 2 steps: IB send to (node+1, g) then gather onto (node+1, 0) - c = c.send(rank(n+1, ch), 'gather', 0, ch=ch%2) + # 2 steps: IB copy to (node+1, g) then gather onto (node+1, 0) + c = c.copy(rank(n+1, ch), f's{n}->{n+1}', 0, ch=ch%2) else: - # 3 steps: Scatter - send to (node, g), IB send to (node+1, g), gather onto (node+1, 0) - c = c.send(rank(n, ch), 'scatter', 0, ch=ch%2) - c = c.send(rank(n+1, ch), 'gather', 0, ch=ch%2) + # 3 steps: Scatter - copy to (node, g), IB copy to (node+1, g), gather onto (node+1, 0) + c = c.copy(rank(n, ch), f's{n}->{n+1}', 0, ch=ch%2) + c = c.copy(rank(n+1, ch), f's{n}->{n+1}', 0, ch=ch%2) - c.send(r+1, Buffer.output, c.get_dst_index(), ch=ch%2) + c.copy(r+1, Buffer.output, c.get_dst_index(), ch=ch%2) - # Normal send - directly + # Normal copy - directly else: c = chunk(r, Buffer.input, 0, chunks) - c.send(r+1, Buffer.output, 0, ch=g%2) + c.copy(r+1, Buffer.output, 0, ch=g%2) Check() XML() 
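The running change in these examples is the DSL's syntactic update: `send` is now `copy`, and `reduce` is now called on the destination chunk with the source chunk as its argument. A minimal before/after sketch of the pattern (illustrative only; `rank`, `next_rank` and `index` are placeholders, and the `ch`/`sendtb`/`recvtb` keywords remain optional as in the diffs above):

    c = chunk(rank, Buffer.input, index)          # reference a chunk on a rank
    c = c.copy(next_rank, Buffer.input, index)    # was: c.send(next_rank, ...); returns the receiver-side chunk
    dst = chunk(next_rank, Buffer.input, index)
    dst.reduce(chunk(rank, Buffer.input, index))  # was: c.reduce(next_rank, ...); dst += source chunk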
diff --git a/examples/scclang/reducegather.py b/examples/scclang/reducegather.py old mode 100644 new mode 100755 diff --git a/examples/scclang/simple/allgather_ring.py b/examples/scclang/simple/allgather_ring.py old mode 100644 new mode 100755 index 8ce40ed..6d82031 --- a/examples/scclang/simple/allgather_ring.py +++ b/examples/scclang/simple/allgather_ring.py @@ -14,14 +14,14 @@ def allgather_ring(size): # Get the chunk at rank r, input[r] c = chunk(r, Buffer.input, 0) # Copy chunk to the output buffer - c = c.send(r, buffer=Buffer.output, index=r, sendtb=0) + c = c.copy(r, buffer=Buffer.output, index=r, sendtb=0) next = (r + 1) % size while next != r: # For each rank in the ring, send the chunk to the next rank # Setting sender's tb and receiver's tb to be 0 so that send/receives on the # same rank can be merged into a receive-copy-send - c = c.send(next, buffer=Buffer.output, index=r) + c = c.copy(next, buffer=Buffer.output, index=r) next = (next + 1) % size XML() Check() @@ -40,10 +40,10 @@ def allgather_ring_inplace(size): # For each rank in the ring, send the chunk to the next rank # Setting sender's tb and receiver's tb to be 0 so that send/receives on the # same rank can be merged into a receive-copy-send - c = c.send(next, buffer=Buffer.output, index=r) + c = c.copy(next, buffer=Buffer.output, index=r) next = (next + 1) % size XML() Check() -# allgather_ring(4) -allgather_ring_inplace(4) \ No newline at end of file +allgather_ring(4) +# allgather_ring_inplace(4) \ No newline at end of file diff --git a/examples/scclang/simple/allreduce_ring.py b/examples/scclang/simple/allreduce_ring.py old mode 100644 new mode 100755 index 1f32b78..7dc2566 --- a/examples/scclang/simple/allreduce_ring.py +++ b/examples/scclang/simple/allreduce_ring.py @@ -9,22 +9,27 @@ from sccl.language.collectives import AllReduce -def allreduce_ring(size, instances, threadblocks): +def allreduce_ring(size, instances): + # Logical topology topology = fully_connected(size) collective = AllReduce(size, size, inplace=True) - with SCCLProgram("allreduce_ring_inplace", topology, collective, instances, threadblocks): + + with SCCLProgram("allreduce_ring_inplace", topology, collective, instances): for r in range(size): index = r + # (rank, buffer, index) c = chunk(r, Buffer.input, index) next = (r + 1) % size # Chunk travels around the ring being reduced while next != r: - c = c.reduce(next, buffer=Buffer.input, index=r) + c1 = chunk(next, buffer=Buffer.input, index=r) + # c1 += c + c = c1.reduce(c) next = (next + 1) % size # Send the fully reduced chunk around the ring while next != (r - 1) % size: - c = c.send(next, buffer=Buffer.input, index=r) + c = c.copy(next, buffer=Buffer.input, index=r) next = (next + 1) % size Check() @@ -33,8 +38,7 @@ def allreduce_ring(size, instances, threadblocks): parser = argparse.ArgumentParser() parser.add_argument('num_gpus', type=int, help ='number of gpus') parser.add_argument('instances', type=int, help='number of instances') -parser.add_argument('threadblocks', type=int, help='number of threadblocks per instance') args = parser.parse_args() -allreduce_ring(args.num_gpus, args.instances, args.threadblocks) +allreduce_ring(args.num_gpus, args.instances) diff --git a/examples/scclang/simple/custom_collective.py b/examples/scclang/simple/custom_collective.py old mode 100644 new mode 100755 index 5d18e78..3c64a8d --- a/examples/scclang/simple/custom_collective.py +++ b/examples/scclang/simple/custom_collective.py @@ -13,7 +13,7 @@ class CollEx(Collective): # Initial state is chunk0 is 
on rank0 in the input buffer def init_buffers(self): - chunks_per_node = self.instances + chunks_per_node = self.chunk_factor rank_buffers = [] for r in range(self.num_ranks): input_buffer = [None] * chunks_per_node @@ -35,7 +35,7 @@ def check(self, prog): correct = True for r in range(1, self.num_ranks): output = prog.buffers[r][Buffer.output] - for c in range(self.instances): + for c in range(self.chunk_factor): chunk = output[c] # Check that we got chunk 0 from rank 0 if chunk is None or chunk.origin_rank != 0 or chunk.origin_index != 0: @@ -50,16 +50,16 @@ def custom_example1(): size = 3 topology = fully_connected(size) # Collectives take in number of ranks in the network, chunksperloop of the collective, whether it is inplace, - collective = CollEx(size, 1, inplace=False") - with SCCLProgram("allgather_ring", topology, collective, 1, protocol="Simple"): + collective = CollEx(size, 1, inplace=False) + with SCCLProgram("allgather_ring", topology, collective, instances=1, protocol="Simple"): # Get the chunk at rank 0 index 0 of the input buffer c = chunk(0, Buffer.input, 0) # Send chunks to 1 and 2 # Can specify the sender's tb, receiver's tb, and channel for the send operation # SCCLang provides a default threadblock assignment if they aren't specified # SCCLang will also check the tb/channel combos are valid - c.send(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0) - c.send(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1) + c.copy(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0) + c.copy(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1) XML() # Generates the XML for this collective Check() # Checks the routes defined for each chunk are correct. Currently doesn't check XML correct @@ -70,16 +70,16 @@ def custom_example2(): topology = fully_connected(size) collective = CollEx(size, 1, inplace=False) - with SCCLProgram("allgather_ring", topology, collective, 1, protocol="Simple"): + with SCCLProgram("allgather_ring", topology, collective, instances=1, protocol="Simple"): c = chunk(0, Buffer.input, 0) # This is the same program as above but instead of rank 0 sending to 1 and 2 # 0 sends to 1 which sends to 2 # send returns the chunk on the receiver's side - c = c.send(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0) - c.send(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1) + c = c.copy(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0) + c.copy(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1) XML() Check() custom_example1() -custom_example2() \ No newline at end of file +custom_example2() diff --git a/examples/send.py b/examples/send.py old mode 100644 new mode 100755 diff --git a/examples/unpermute_dgx1.py b/examples/unpermute_dgx1.py old mode 100644 new mode 100755 diff --git a/pytest.ini b/pytest.ini old mode 100644 new mode 100755 diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 diff --git a/sccl/__init__.py b/sccl/__init__.py old mode 100644 new mode 100755 diff --git a/sccl/__main__.py b/sccl/__main__.py old mode 100644 new mode 100755 diff --git a/sccl/algorithm.py b/sccl/algorithm.py old mode 100644 new mode 100755 diff --git a/sccl/autosynth/__init__.py b/sccl/autosynth/__init__.py old mode 100644 new mode 100755 diff --git a/sccl/autosynth/ndv2_plans.py b/sccl/autosynth/ndv2_plans.py old mode 100644 new mode 100755 diff --git a/sccl/autosynth/ndv4_plans.py b/sccl/autosynth/ndv4_plans.py old mode 100644 new mode 100755 diff --git a/sccl/autosynth/registry.py 
b/sccl/autosynth/registry.py old mode 100644 new mode 100755 diff --git a/sccl/cli/__init__.py b/sccl/cli/__init__.py old mode 100644 new mode 100755 diff --git a/sccl/cli/analyze.py b/sccl/cli/analyze.py old mode 100644 new mode 100755 diff --git a/sccl/cli/common.py b/sccl/cli/common.py old mode 100644 new mode 100755 diff --git a/sccl/cli/distribute.py b/sccl/cli/distribute.py old mode 100644 new mode 100755 diff --git a/sccl/cli/known_collectives.py b/sccl/cli/known_collectives.py old mode 100644 new mode 100755 diff --git a/sccl/cli/known_distributed_topologies.py b/sccl/cli/known_distributed_topologies.py old mode 100644 new mode 100755 diff --git a/sccl/cli/known_topologies.py b/sccl/cli/known_topologies.py old mode 100644 new mode 100755 diff --git a/sccl/cli/known_transformers.py b/sccl/cli/known_transformers.py old mode 100644 new mode 100755 diff --git a/sccl/cli/ncclize.py b/sccl/cli/ncclize.py old mode 100644 new mode 100755 diff --git a/sccl/cli/plans.py b/sccl/cli/plans.py old mode 100644 new mode 100755 diff --git a/sccl/cli/solve.py b/sccl/cli/solve.py old mode 100644 new mode 100755 diff --git a/sccl/collectives.py b/sccl/collectives.py old mode 100644 new mode 100755 diff --git a/sccl/distributors/__init__.py b/sccl/distributors/__init__.py old mode 100644 new mode 100755 diff --git a/sccl/distributors/alltoall_subproblem.py b/sccl/distributors/alltoall_subproblem.py old mode 100644 new mode 100755 diff --git a/sccl/distributors/gather_scatter_alltoall.py b/sccl/distributors/gather_scatter_alltoall.py old mode 100644 new mode 100755 diff --git a/sccl/distributors/greedy_alltoall.py b/sccl/distributors/greedy_alltoall.py old mode 100644 new mode 100755 diff --git a/sccl/instance.py b/sccl/instance.py old mode 100644 new mode 100755 diff --git a/sccl/isomorphisms.py b/sccl/isomorphisms.py old mode 100644 new mode 100755 diff --git a/sccl/language/__init__.py b/sccl/language/__init__.py old mode 100644 new mode 100755 index 2183501..57c19bf --- a/sccl/language/__init__.py +++ b/sccl/language/__init__.py @@ -11,7 +11,7 @@ from sccl.language.buffer import * from sccl.language.rank_dag import * import sccl.collectives as collectives - +# from sccl.language.visualize import * _current_program = None def _curr(): @@ -23,7 +23,7 @@ def _curr(): class SCCLProgram: def __init__(self, name, topo, collective, instances, protocol='Simple', \ threadblock_policy=ThreadblockPolicy.auto, interleaved_replication=True, - check_xml=True): + instr_fusion=True, check_xml=True, dependence_nop=False): self.name = name self.topo = topo self.collective = collective @@ -32,18 +32,22 @@ def __init__(self, name, topo, collective, instances, protocol='Simple', \ self.protocol = protocol self.threadblock_policy = threadblock_policy self.interleaved_replication = interleaved_replication + self.instr_fusion = instr_fusion self.check_xml = check_xml + self.dependence_nop = dependence_nop assert protocol == 'Simple' or protocol == 'LL' or protocol == 'LL128', \ f'Given protocol: {protocol}. 
Must be either Simple, LL, LL128' self.run_opt = True # Runs optimization passes # Initialize the input buffers - self.chunk_dag = ChunkDAG() + # self.chunk_dag = ChunkDAG() self.buffers = collective.init_buffers() - self.rank_dag = RankDAG(self.num_ranks, self.buffers) + self.instr_dag = InstructionDAG(self.num_ranks, self.buffers) for r in range(self.num_ranks): for index, chunk in enumerate(self.buffers[r][Buffer.input]): - ref = self.get_ref(r, Buffer.input, index, 1) - self.chunk_dag.init_chunk(chunk, ref) + buffer, index = self.collective.get_buffer_index(r, Buffer.input, index) + ref = self.get_ref(r, buffer, index, 1) + # self.chunk_dag.init_chunk(chunk, ref) + self.instr_dag.add_start(r, buffer, index, ref) def __enter__(self): global _current_program @@ -57,7 +61,8 @@ def __exit__(self, exc_type, exc_value, exc_traceback): raise RuntimeError("This program is not currently in context") _current_program = None - def add_send(self, src, src_buffer, src_index, dst, dst_buffer, dst_index, size): + # Tracks a send operation on the buffers + def apply_send(self, src, src_buffer, src_index, dst, dst_buffer, dst_index, size): src_buffer, src_index = self.collective.get_buffer_index(src, src_buffer, src_index) dst_buffer, dst_index = self.collective.get_buffer_index(dst, dst_buffer, dst_index) sb = self.buffers[src][src_buffer] @@ -65,7 +70,8 @@ def add_send(self, src, src_buffer, src_index, dst, dst_buffer, dst_index, size) for i in range(size): db[dst_index + i] = sb[src_index + i] - def add_reduce(self, src, src_buffer, src_index, dst, dst_buffer, dst_index, size): + # Tracks a reduce operation on the buffers + def apply_reduce(self, src, src_buffer, src_index, dst, dst_buffer, dst_index, size): src_buffer, src_index = self.collective.get_buffer_index(src, src_buffer, src_index) dst_buffer, dst_index = self.collective.get_buffer_index(dst, dst_buffer, dst_index) sb = self.buffers[src][src_buffer] @@ -73,7 +79,7 @@ def add_reduce(self, src, src_buffer, src_index, dst, dst_buffer, dst_index, siz for i in range(size): reduce_chunk = db[dst_index + i] sent_chunk = sb[src_index + i] - db[dst_index + i] = reduce_chunk.reduce(sent_chunk) + db[dst_index + i] = reduce_chunk.reduce(dst, sent_chunk) def get_ref(self, rank, buffer, index, size): buffer, index = self.collective.get_buffer_index(rank, buffer, index) @@ -81,8 +87,11 @@ def get_ref(self, rank, buffer, index, size): def get_chunks(self, rank, buffer, index, size=1): chunks = [None] * size - for i in range(index, index+size): - chunks[i-index] = self.buffers[rank][buffer][i] + for i in range(0, size): + if self.buffers[rank][buffer] and index+i < len(self.buffers[rank][buffer]): + chunks[i] = self.buffers[rank][buffer][index+i] + else: + chunks[i] = None return chunks def check_buffer_exists(self, rank, name): @@ -96,36 +105,41 @@ def check(self): # Lower program to XML def lower(self): - self.chunk_dag._complete_metadata() - self.chunk_dag.lower_rank_dag(self.rank_dag) - - self.rank_dag.optimize() + # self.chunk_dag._complete_metadata() + # self.chunk_dag.channel_assignment() + # self.chunk_dag.lower_instr_dag(self.instr_dag) + self.instr_dag.convert_set_list() # Pre-emptively convert sets to lists + if self.instr_fusion: + self.instr_dag.optimize() + self.instr_dag._complete_metadata() if self.threadblock_policy == ThreadblockPolicy.manual: - manual_assign_tbs(self.rank_dag) + manual_assign_tbs(self.instr_dag) else: - create_base_tbs(self.rank_dag) - auto_assign_tbs(self.rank_dag) - self.rank_dag.lower_pt1(self.instances) - gpu_prgms = 
self.rank_dag.lower_pt2(self.instances, self.interleaved_replication) + auto_assign_tbs(self.instr_dag) + self.instr_dag.lower_pt1(self.instances) + gpu_prgms = self.instr_dag.lower_pt2(self.instances, self.interleaved_replication) if self.check_xml: - # Check generated SCCL-EF for correctness - no circular dependencies, sends and receives are ordered + # Check generated SCCL-IR for correctness - no circular dependencies, sends and receives are ordered # For very large programs, turn off check_xml when shipping - check_dependency_cycles(self.rank_dag.tbs) - check_threadblock_ordering(self.rank_dag) + check_dependency_cycles(self.instr_dag.tbs) + check_threadblock_ordering(self.instr_dag) return Program(self.name, self.collective.name, self.collective.inplace, self.protocol, gpu_prgms) - # def print_chunk_dag(self): - # visualize_chunk_dag(self.chunk_dag.chunk_paths) + def generate_xml(self): + return ir_to_xml(self.lower(), dependence_nop=self.dependence_nop) + + def print_chunk_dag(self): + visualize_chunk_dag(self.chunk_dag.chunk_paths) - # def print_rank_dags(self, rank): - # if rank == -1: - # for r in range(len(self.ranks)): - # visualize_rank_dag(self.rank_dags[r].operations) - # else: - # visualize_rank_dag(self.rank_dags[rank].operations) + def print_instr_dags(self, rank): + if rank == 0: + for r in range(len(self.ranks)): + visualize_instr_dag(self.instr_dags[r].operations) + else: + visualize_instr_dag(self.instr_dags[rank].operations) -# def Print(): -# _curr().print_chunk_dag() +def Print(): + _curr().print_chunk_dag() def chunk(rank, buffer, index, size=1): return _curr().get_ref(rank, buffer, index, size) @@ -134,7 +148,7 @@ def create_scratch(rank, name): return _curr().create_scratch(rank, name) def XML(): - print(ir_to_xml(_curr().lower())) + print(_curr().generate_xml()) def Check(): return _curr().check() @@ -174,8 +188,8 @@ def group(self, other): end = max(first._end(), second._end()) return Ref(self.rank, self.buffer, first.index, end - first.index, self.prog) - - def send(self, dst, buffer=None, index=-1, sendtb=-1, recvtb=-1, ch=-1): + # Copies the chunk(s) referenced by this chunkref onto Rank dst at location (buffer, index) + def copy(self, dst, buffer=None, index=-1, sendtb=-1, recvtb=-1, ch=-1): self.prog.check_buffer_exists(dst, buffer) # If index is not specified assume it is going to the same place in the next gpu @@ -192,31 +206,50 @@ def send(self, dst, buffer=None, index=-1, sendtb=-1, recvtb=-1, ch=-1): assert (self.prog.topo.link(self.rank, dst) or dst == self.rank), f'No link from {self.rank} to {dst}' dst_chunkref = self.prog.get_ref(dst, buffer, index, self.size) - self.prog.add_send(self.rank, self.buffer, self.index, dst, buffer, index, self.size) + # Check if we are copying the chunk to the same index (easy mistake when we are using inplace) + if dst_chunkref == self: + return - chunks = self.prog.get_chunks(self.rank, self.buffer, self.index, self.size) - self.prog.chunk_dag.add_send(chunks, self, dst_chunkref, sendtb, recvtb, ch) + # chunks = self.prog.get_chunks(self.rank, self.buffer, self.index, self.size) + # overwritten_chunks = self.prog.get_chunks(dst, buffer, index, self.size) + + self.prog.apply_send(self.rank, self.buffer, self.index, dst, buffer, index, self.size) + + # self.prog.chunk_dag.add_send(chunks, overwritten_chunks, self, dst_chunkref, sendtb, recvtb, ch) + sender = self.rank + receiver = dst + if sender != receiver: + sop = self.prog.instr_dag.add_send(sender, self, dst_chunkref, sendtb, ch) + rop = 
self.prog.instr_dag.add_recv(receiver, self, dst_chunkref, recvtb, ch, sop) + sop.recv_match = rop + else: + self.prog.instr_dag.add_copy(sender, self, dst_chunkref, sendtb, ch) return dst_chunkref - def reduce(self, dst, buffer, index=-1, sendtb=-1, recvtb=-1, ch=0): - self.prog.check_buffer_exists(dst, buffer) - - # Some inplace collectives have custom logic for buffers and index (ReduceScatter, AllGather) - buffer, index = self.prog.collective.get_buffer_index(self.rank, buffer, index) - + # Reduces the chunk(s) referenced by other_chunkref into the chunk(s) referenced by this chunkref + def reduce(self, other_chunkref, sendtb=-1, recvtb=-1, ch=-1): # Receive reduce copy - assert (self.prog.topo.link(self.rank, dst) or dst == self.rank), f'No link from {self.rank} to {dst}' - dst_chunkref = self.prog.get_ref(dst, buffer, index, self.size) - - chunks1 = self.prog.get_chunks(self.rank, self.buffer, self.index, self.size) - chunks2 = self.prog.get_chunks(dst, buffer, index, self.size) - - self.prog.add_reduce(self.rank, self.buffer, self.index, dst, buffer, index, self.size) + dst = self.rank + src = other_chunkref.rank + assert (self.prog.topo.link(src, dst) or src == dst), f'No link from {src} to {dst}' + # dst_chunkref = self.prog.get_ref(dst, buffer, index, self.size) + + # chunks1 = self.prog.get_chunks(self.rank, self.buffer, self.index, self.size) + # chunks2 = self.prog.get_chunks(other_chunkref.rank, other_chunkref.buffer, other_chunkref.index self.size) + + self.prog.apply_reduce(src, other_chunkref.buffer, other_chunkref.index, dst, self.buffer, self.index, self.size) + + # reduce_chunks = self.prog.get_chunks(dst, buffer, index, self.size) + # self.prog.chunk_dag.add_reduce(chunks1, chunks2, reduce_chunks, self, dst_chunkref, sendtb, recvtb, ch) + if src != dst: + sop = self.prog.instr_dag.add_send(src, other_chunkref, self, sendtb, ch) + rop = self.prog.instr_dag.add_recv_reduce_copy(dst, other_chunkref, self, recvtb, ch, sop) + sop.recv_match = rop + else: + self.prog.instr_dag.add_reduce(src, other_chunkref, self, sendtb, ch) - reduce_chunks = self.prog.get_chunks(dst, buffer, index, self.size) - self.prog.chunk_dag.add_reduce(chunks1, chunks2, reduce_chunks, self, dst_chunkref, sendtb, recvtb, ch) - return dst_chunkref + return self def get_origin_index(self, index=0): return self._get_chunk(index + self.index).origin_index @@ -234,149 +267,181 @@ def print_chunk_info(self, index=0): print(self._get_chunk(index + self.index)) -@dataclass -class ChunkOp(): - inst: ChunkInstruction - src: Ref # Ref Chunk acted on - dst: Ref # Ref Chunk created - sendtb: int = -1# For lowering to RankInstructions - recvtb: int = -1# For lowering to RankInstructions - ch: int = -1 # For lowering to RankInstructions - steps_from_start:int = -1 - steps_to_end: int = -1 - prev: list = field(default_factory=list) # Previous ChunkOps - next: list = field(default_factory=list) # Next ChunkOps - visited = False - num = -1 - - def __repr__(self): - return f'ChunkOp({self.inst} {self.dst.rank} {self.dst.buffer} {self.dst.index})' - - def __lt__(self, other): - return self.steps_from_start < other.steps_from_start - - def __hash__(self): - return hash((self.inst, self.dst.rank, self.dst.index, self.dst.buffer)) # TODO - -def same_slot(ref1, ref2): - return ref1.rank == ref2.rank and ref1.buffer == ref2.buffer and ref1.index == ref2.index - -# Returns if there is overlap between the refs -def overlap_refs(ref1, ref2): - same_location = ref1.rank == ref2.rank and ref1.buffer == ref2.buffer - contained1 = 
ref1.index >= ref2.index and (ref1.index + ref1.size) <= (ref2.index + ref2.size) - contained2 = ref2.index >= ref1.index and (ref2.index + ref2.size) <= (ref1.index + ref1.size) - return same_location and (contained1 or contained2) - -class ChunkDAG: - - def __init__(self): - self.chunks = [] - self.chunk_paths = {} # chunk -> ChunkOp. Stores the entry point to where every chunk is created - self.max_hops = -1 - - # Initialize the ChunkDAG with starting chunks - def init_chunk(self, chunk, ref): - op = ChunkOp(ChunkInstruction.start, None, ref, steps_from_start=-1) - self.chunks.append(chunk) - self.chunk_paths[chunk] = op - - def _find_prev_op_for_chunk(self, chunk, ref): - prev_op = None - frontier = [self.chunk_paths[chunk]] - while len(frontier) > 0: - current_op = frontier[0] - if overlap_refs(ref, current_op.dst): - prev_op = current_op - frontier = frontier[1:] + current_op.next - return prev_op - - def add_send(self, chunks, src, dst, sendtb, recvtb, ch): - # Find the previous operation for these chunks - prev_ops = [] - steps_from_start = 0 - for chunk in chunks: - prev_op = self._find_prev_op_for_chunk(chunk, src) - steps_from_start = max(steps_from_start, prev_op.steps_from_start) - prev_ops.append(prev_op) - op = ChunkOp(ChunkInstruction.send, src, dst, sendtb, recvtb, ch, steps_from_start+1) +# @dataclass +# class ChunkOp(): +# inst: ChunkInstruction +# src: Ref # Ref Chunk acted on +# dst: Ref # Ref Chunk created +# sendtb: int = -1# For lowering to RankInstructions +# recvtb: int = -1# For lowering to RankInstructions +# ch: int = -1 # For lowering to RankInstructions +# steps_from_start:int = -1 +# steps_to_end: int = -1 +# prev: list = field(default_factory=list) # Previous ChunkOps +# next: list = field(default_factory=list) # Next ChunkOps +# visited = False +# num = -1 + +# def __repr__(self): +# return f'ChunkOp({self.inst} {self.dst.rank} {self.dst.buffer} {self.dst.index})' + +# def __lt__(self, other): +# return self.steps_from_start < other.steps_from_start + +# def __hash__(self): +# return hash((self.inst, self.dst.rank, self.dst.index, self.dst.buffer)) # TODO + +# def same_slot(ref1, ref2): +# return ref1.rank == ref2.rank and ref1.buffer == ref2.buffer and ref1.index == ref2.index + +# # Returns if there is overlap between the refs +# def overlap_refs(ref1, ref2): +# same_location = ref1.rank == ref2.rank and ref1.buffer == ref2.buffer +# if same_location: +# ref1_range = (ref1.index, ref1.index + ref1.size) +# ref2_range = (ref2.index, ref2.index + ref2.size) +# if ref1_range < ref2_range: +# return ref1_range[0] < ref2_range[1] +# else: +# return ref2_range[0] < ref1_range[1] +# return False + +# class ChunkDAG: + +# def __init__(self): +# self.chunks = [] +# self.chunk_paths = {} # chunk -> ChunkOp. 
Stores the entry point to where every chunk is created +# self.max_hops = -1 + +# # Initialize the ChunkDAG with starting chunks +# def init_chunk(self, chunk, ref): +# op = ChunkOp(ChunkInstruction.start, None, ref, steps_from_start=-1) +# self.chunks.append(chunk) +# self.chunk_paths[chunk] = op + +# def _find_prev_op_for_chunk(self, chunk, ref): +# prev_op = None +# frontier = [self.chunk_paths[chunk]] +# while len(frontier) > 0: +# current_op = frontier[0] +# if overlap_refs(ref, current_op.dst): +# prev_op = current_op +# frontier = frontier[1:] + current_op.next +# return prev_op + +# def add_send(self, chunks, overwritten_chunks, src, dst, sendtb, recvtb, ch): +# # Find the previous operation for these chunks +# prev_ops = [] +# steps_from_start = 0 +# for chunk1, chunk2 in zip(chunks, overwritten_chunks): +# prev_op_src = self._find_prev_op_for_chunk(chunk1, src) +# if chunk2 is None: +# steps_from_start = max(steps_from_start, prev_op_src.steps_from_start) +# else: +# prev_op_dst = self._find_prev_op_for_chunk(chunk2, dst) # In case we overwrite +# steps_from_start = max(prev_op_src.steps_from_start, prev_op_dst.steps_from_start, steps_from_start) +# prev_ops.append(prev_op_dst) +# prev_ops.append(prev_op_src) +# # prev_op = self._find_prev_op_for_chunk(chunk, src) +# # steps_from_start = max(steps_from_start, prev_op.steps_from_start) +# # prev_ops.append(prev_op) +# op = ChunkOp(ChunkInstruction.send, src, dst, sendtb, recvtb, ch, steps_from_start+1) - for prev_op in prev_ops: - prev_op.next.append(op) - op.prev = prev_ops - - def add_reduce(self, chunks1, chunks2, reduce_chunks, src, dst, sendtb, recvtb, ch): - # self.chunks.append(reduce_chunks) - prev_ops = [] - steps_from_start = 0 - # Find the previous operations that reduce builds off - for chunk1, chunk2 in zip(chunks1, chunks2): - prev_op_src = self._find_prev_op_for_chunk(chunk1, src) - prev_op_dst = self._find_prev_op_for_chunk(chunk2, dst) - steps_from_start = max(prev_op_src.steps_from_start, prev_op_dst.steps_from_start, steps_from_start) - prev_ops.append(prev_op_src) - prev_ops.append(prev_op_dst) +# for prev_op in prev_ops: +# prev_op.next.append(op) +# op.prev = prev_ops + +# def add_reduce(self, chunks1, chunks2, reduce_chunks, src, dst, sendtb, recvtb, ch): +# # self.chunks.append(reduce_chunks) +# prev_ops = [] +# steps_from_start = 0 +# # Find the previous operations that reduce builds off +# for chunk1, chunk2 in zip(chunks1, chunks2): +# prev_op_src = self._find_prev_op_for_chunk(chunk1, src) +# prev_op_dst = self._find_prev_op_for_chunk(chunk2, dst) +# steps_from_start = max(prev_op_src.steps_from_start, prev_op_dst.steps_from_start, steps_from_start) +# prev_ops.append(prev_op_src) +# prev_ops.append(prev_op_dst) - op = ChunkOp(ChunkInstruction.reduce, src, dst, sendtb, recvtb, ch, steps_from_start+1) - - for prev_op in prev_ops: - prev_op.next.append(op) - op.prev.append(prev_op) - - # Reduce operations create new chunks, so keep a pointer to a new chunk - for rc in reduce_chunks: - self.chunk_paths[rc] = op - - def _complete_metadata(self): - def dfs(op): - if len(op.next) == 0: - op.steps_to_end = 0 - else: - for o in op.next: - dfs(o) - op.steps_to_end = functools.reduce(lambda cur, x: max(cur, x.steps_to_end+1), op.next, 0) - - for chunk, op in self.chunk_paths.items(): - if op.inst == ChunkInstruction.start: - dfs(op) +# op = ChunkOp(ChunkInstruction.reduce, src, dst, sendtb, recvtb, ch, steps_from_start+1) + +# for prev_op in prev_ops: +# prev_op.next.append(op) +# op.prev.append(prev_op) + +# # 
Reduce operations create new chunks, so keep a pointer to a new chunk +# for rc in reduce_chunks: +# self.chunk_paths[rc] = op + +# def _complete_metadata(self): +# def dfs(op): +# if len(op.next) == 0: +# op.steps_to_end = 0 +# else: +# for o in op.next: +# dfs(o) +# op.steps_to_end = functools.reduce(lambda cur, x: max(cur, x.steps_to_end+1), op.next, 0) + +# for chunk, op in self.chunk_paths.items(): +# if op.inst == ChunkInstruction.start: +# dfs(op) - def lower_rank_dag(self, rank_dag): - frontier = [] - visited = set() - - for chunk, op in self.chunk_paths.items(): - if len(op.prev) == 0: - heapq.heappush(frontier, op) - - while len(frontier) > 0: - op = heapq.heappop(frontier) - if op not in visited: - sendtb = op.sendtb - recvtb = op.recvtb - ch = op.ch - if op.inst == ChunkInstruction.start: - rank = op.dst.rank - rank_dag.add_start(rank, op.dst.buffer, op.dst.index, op.dst) - elif op.inst == ChunkInstruction.send: - sender = op.src.rank - receiver = op.dst.rank - if sender != receiver: - sop = rank_dag.add_send(sender, op.src, op.dst, op.steps_from_start*2, op.steps_to_end*2+1, sendtb, ch) - rop = rank_dag.add_recv(receiver, op.src, op.dst, op.steps_from_start*2+1, op.steps_to_end*2, recvtb, ch) - sop.match = [rop] - else: - rank_dag.add_copy(sender, op.src, op.dst, op.steps_from_start*2, op.steps_to_end*2, sendtb) - elif op.inst == ChunkInstruction.reduce: - sender = op.src.rank - receiver = op.dst.rank - if sender != receiver: - sop = rank_dag.add_send(sender, op.src, op.dst, op.steps_from_start*2,op.steps_to_end*2+1, sendtb, ch) - rop = rank_dag.add_recv_reduce_copy(receiver, op.src, op.dst, op.steps_from_start*2+1, op.steps_to_end*2, recvtb, ch) - sop.match = [rop] - else: - rank_dag.add_reduce(sender, op.src, op.dst, op.steps_from_start*2, op.steps_to_end*2, sendtb) - - for o in op.next: - heapq.heappush(frontier, o) - visited.add(op) - rank_dag.convert_set_list() # Pre-emptively convert sets to lists + +# # Assigns each send and a reduce a channel for communication based of policies +# def channel_assignment(self, channel_policy='zero'): +# frontier = [] +# visited = set() +# for chunk, op in self.chunk_paths.items(): +# if len(op.prev) == 0: +# heapq.heappush(frontier, op) + +# # If an op isn't annotated with a channel set it to 0 +# if channel_policy == 'zero': +# while len(frontier) > 0: +# op = heapq.heappop(frontier) +# if op not in visited: +# op.ch = 0 if op.ch == -1 else op.ch +# for o in op.next: +# heapq.heappush(frontier, o) +# visited.add(op) + +# def lower_instr_dag(self, instr_dag): +# frontier = [] +# visited = set() + +# for chunk, op in self.chunk_paths.items(): +# if len(op.prev) == 0: +# heapq.heappush(frontier, ((op.steps_from_start, op.steps_to_end), op)) + +# while len(frontier) > 0: +# _, op = heapq.heappop(frontier) +# if op not in visited: +# sendtb = op.sendtb +# recvtb = op.recvtb +# ch = op.ch +# if op.inst == ChunkInstruction.start: +# rank = op.dst.rank +# instr_dag.add_start(rank, op.dst.buffer, op.dst.index, op.dst) +# elif op.inst == ChunkInstruction.send: +# sender = op.src.rank +# receiver = op.dst.rank +# if sender != receiver: +# sop = instr_dag.add_send(sender, op.src, op.dst, op.steps_from_start*2, op.steps_to_end*2+1, sendtb, ch) +# rop = instr_dag.add_recv(receiver, op.src, op.dst, op.steps_from_start*2+1, op.steps_to_end*2, recvtb, ch) +# sop.match = [rop] +# else: +# instr_dag.add_copy(sender, op.src, op.dst, op.steps_from_start*2, op.steps_to_end*2, sendtb, ch) +# elif op.inst == ChunkInstruction.reduce: +# sender = op.src.rank +# 
receiver = op.dst.rank +# if sender != receiver: +# sop = instr_dag.add_send(sender, op.src, op.dst, op.steps_from_start*2,op.steps_to_end*2+1, sendtb, ch) +# rop = instr_dag.add_recv_reduce_copy(receiver, op.src, op.dst, op.steps_from_start*2+1, op.steps_to_end*2, recvtb, ch) +# sop.match = [rop] +# else: +# instr_dag.add_reduce(sender, op.src, op.dst, op.steps_from_start*2, op.steps_to_end*2, sendtb, ch) + +# for o in op.next: +# heapq.heappush(frontier, ((o.steps_from_start, o.steps_to_end), o)) +# visited.add(op) +# instr_dag.convert_set_list() # Pre-emptively convert sets to lists diff --git a/sccl/language/buffer.py b/sccl/language/buffer.py old mode 100644 new mode 100755 index 6f2a266..c0ab297 --- a/sccl/language/buffer.py +++ b/sccl/language/buffer.py @@ -34,4 +34,7 @@ def __setitem__(self, index, value): if index == current_size: self.chunks.append(value) else: - self.chunks[index] = value \ No newline at end of file + self.chunks[index] = value + + def __len__(self): + return len(self.chunks) \ No newline at end of file diff --git a/sccl/language/chunk.py b/sccl/language/chunk.py old mode 100644 new mode 100755 index 6418b1c..2d4458a --- a/sccl/language/chunk.py +++ b/sccl/language/chunk.py @@ -12,12 +12,12 @@ class Chunk: dst_rank: int = -1 dst_index: int = -1 - def reduce(self, chunk): + def reduce(self, dst, chunk): if type(chunk) is ReduceChunk: - return chunk.reduce(self) + return chunk.reduce(dst, self) elif type(chunk) is Chunk: chunks = [self, chunk] - return ReduceChunk(chunks) + return ReduceChunk(dst, chunks) else: assert True, "Trying to reduce with chunk of None" return None @@ -34,22 +34,24 @@ def __lt__(self, other): @dataclass class ReduceChunk: + creation_rank: int # Rank the Reduce Chunk is created. Necessary since the same ReduceChunk can be created on multiple ranks independently chunks: list # List of chunks reduced - def reduce(self, chunk): + def reduce(self, dst, chunk): if type(chunk) is ReduceChunk: chunks = self.chunks + chunk.chunks elif type(chunk) is Chunk: chunks =self.chunks + [chunk] else: assert True, "Trying to reduce with chunk of None" - return ReduceChunk(chunks) + return ReduceChunk(self.creation_rank, chunks) def sort(self): self.chunks.sort() def __hash__(self): - return hash(tuple(self.chunks)) + self.sort() + return hash((self.creation_rank,) + tuple(self.chunks)) # Two reduce chunks are equal if they contain the same list of # chunks being reduced diff --git a/sccl/language/collectives.py b/sccl/language/collectives.py old mode 100644 new mode 100755 index bce39d2..8654665 --- a/sccl/language/collectives.py +++ b/sccl/language/collectives.py @@ -34,8 +34,12 @@ def init_buffers(self): for index in range(chunks_per_node): chunk = Chunk(r, index, index//self.chunk_factor, index % self.chunk_factor + r*self.chunk_factor) input_buffer[index] = chunk - buffers = {Buffer.input : input_buffer, - Buffer.output : output_buffer} + if self.inplace: + buffers = {Buffer.input : input_buffer, + Buffer.output : input_buffer} + else: + buffers = {Buffer.input : input_buffer, + Buffer.output : output_buffer} rank_buffers.append(buffers) return rank_buffers @@ -143,9 +147,9 @@ def check(self, prog): buf = Buffer.input if self.inplace else Buffer.output for c in range(chunks_per_node): - chunk = ReduceChunk([]) + chunk = ReduceChunk(-1, []) for r in range(self.num_ranks): - chunk = chunk.reduce(Chunk(r, c)) + chunk = chunk.reduce(-1, Chunk(r, c)) expected_chunks.append(chunk) correct = True @@ -195,9 +199,9 @@ def check(self, prog): expected_chunks = [] 
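# --- Editor's note (sketch, not part of this patch): why ReduceChunk sorts its
# chunks inside __hash__. Reduction is commutative, so two ReduceChunks built in
# a different order should hash (and compare) equal. Assuming the Chunk and
# ReduceChunk definitions from sccl/language/chunk.py above:
#   a, b = Chunk(0, 0), Chunk(1, 0)
#   r1 = ReduceChunk(-1, []).reduce(-1, a).reduce(-1, b)
#   r2 = ReduceChunk(-1, []).reduce(-1, b).reduce(-1, a)
#   assert hash(r1) == hash(r2)  # sort() inside __hash__ makes order irrelevant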
buf = Buffer.input if self.inplace else Buffer.output for c in range(self.num_ranks * self.chunk_factor): - chunk = ReduceChunk([]) + chunk = ReduceChunk(-1, []) for r in range(self.num_ranks): - chunk = chunk.reduce(Chunk(r, c)) + chunk = chunk.reduce(-1, Chunk(r, c)) expected_chunks.append(chunk) correct = True diff --git a/sccl/language/ir.py b/sccl/language/ir.py old mode 100644 new mode 100755 index df8751d..1cbfade --- a/sccl/language/ir.py +++ b/sccl/language/ir.py @@ -21,6 +21,18 @@ class Gpu: rank: int threadblocks: list = field(default_factory=list) + # From ncclize + precopies: list = field(default_factory=list) + postcopies: list = field(default_factory=list) + inputs: dict = field(default_factory=dict) + outputs: dict = field(default_factory=dict) + input_chunks: int = 0 + output_chunks: int = 0 + scratch: dict = field(default_factory=dict) + + def scratch_size(self): + return max((idx for addr, idx in self.scratch.items()), default=-1) + 1 + @dataclass class Threadblock: @@ -28,6 +40,7 @@ class Threadblock: send: int = -1 recv: int = -1 ops: list = field(default_factory=list) + rbid: int = -1 # threadblock id of the receiver def __eq__(self, other): return self is other @@ -78,6 +91,13 @@ class Buffer(Enum): def __str__(self): return self.value + def __lt__(self, other): + return self.value < other.value + + def __gt__(self, other): + return self.value > other.value + + @dataclass class ChunkRef: @@ -99,12 +119,13 @@ class Op: depends: list = field(default_factory=list) step: int = -1 # Step in the TB tb: int = -1 # TB this op is assigned to - prev: list = field(default_factory=list) - next: list = field(default_factory=list) + prev: list = field(default_factory=list) # List of instructions that happen before + next: list = field(default_factory=list) # List of instructions that happen after num: int = -1 chunk_step: int = -1 priority: int = -1 - match: list = field(default_factory=list) # This should be another Op + recv_match = None + send_match = None channel: int = -1 def cnt(self): @@ -130,6 +151,33 @@ def is_recv(self): self.inst == Instruction.recv_copy_send or \ self.inst == Instruction.recv_reduce_send + def is_fused(self): + return self.inst == Instruction.recv_reduce_copy_send or \ + self.inst == Instruction.recv_copy_send or \ + self.inst == Instruction.recv_reduce_send + + def is_local(self): + return self.inst == Instruction.copy or \ + self.inst == Instruction.reduce + + def peer(self): + if self.inst == Instruction.send: + return self.dst.rank + elif self.inst == Instruction.recv: + return self.src.rank + else: + return None + + def send_peer(self): + if self.is_send(): + return self.dst.rank + return -1 + + def recv_peer(self): + if self.is_recv(): + return self.src.rank + return -1 + def __eq__(self, other): return self is other @@ -160,7 +208,7 @@ def __repr__(self): Instruction.recv_reduce_copy_send} -def ir_to_xml(program: Program, old_format=True, use_scratch=True, pretty_print=True): +def ir_to_xml(program: Program, old_format=True, use_scratch=True, pretty_print=True, dependence_nop=False): # Figure out sizes of buffers based on usage buffer_sizes = defaultdict(lambda: 0) for gpu in program.gpus: @@ -213,6 +261,35 @@ for op in tb.ops: has_dependence.update(op.depends) + if dependence_nop: + for gpu in program.gpus: + for tb in gpu.threadblocks: + pre_ops = [] + after_ops = [] + first_re = None + first_dep = None + for i, op in enumerate(tb.ops): + # Expand extra 
dependencies into nop operations + num_depends = len(op.depends) + if op.inst is Instruction.reduce: + if num_depends > 0: + for dep in op.depends: + if first_dep is None: + first_dep = dep + else: + pre_ops.append(Op(Instruction.nop, -1, None, None, [dep])) + op.depends = [] + if first_re is None: + first_re = op + + if first_re is not None: + after_ops.append(op) + else: + pre_ops.append(op) + if first_dep is not None: + first_re.depends = [first_dep] + tb.ops = pre_ops + after_ops + # Do some additional postprocessing of operations: # - Expand operations with extra dependencies with no-ops # - Mark the index of each operation taking any extra no-ops into account @@ -233,12 +310,17 @@ def ir_to_xml(program: Program, old_format=True, use_scratch=True, pretty_print= op_idx[new_ops[-1]] = len(new_ops) - 1 tb.ops = new_ops + nchannels = 0 + for gpu in program.gpus: + max_tb_channels = 0 + if len(gpu.threadblocks) > 0: + max_tb_channels = max(tb.channel+1 for tb in gpu.threadblocks) + nchannels = max(nchannels, max_tb_channels) # Generate the XML structure algo_elem = ET.Element('algo') algo_elem.set('name', program.name) algo_elem.set('proto', program.protocol) - algo_elem.set('nchannels', str( - 1 + max(max(tb.channel for tb in gpu.threadblocks) for gpu in program.gpus))) + algo_elem.set('nchannels', str(nchannels)) if old_format: algo_elem.set('nchunksperloop', str( max(max(buffer_sizes[(gpu.rank, Buffer.input)], buffer_sizes[(gpu.rank, Buffer.output)]) for gpu in program.gpus))) @@ -248,9 +330,9 @@ def ir_to_xml(program: Program, old_format=True, use_scratch=True, pretty_print= for gpu in program.gpus: gpu_elem = ET.SubElement(algo_elem, 'gpu') gpu_elem.set('id', str(gpu.rank)) - gpu_elem.set('i_chunks', str(buffer_sizes[(gpu.rank, Buffer.input)])) - gpu_elem.set('o_chunks', str(buffer_sizes[(gpu.rank, Buffer.output)])) - gpu_elem.set('s_chunks', str(buffer_sizes[(gpu.rank, Buffer.scratch)])) + gpu_elem.set('i_chunks', str(max(buffer_sizes[(gpu.rank, Buffer.input)], gpu.input_chunks))) + gpu_elem.set('o_chunks', str(max(buffer_sizes[(gpu.rank, Buffer.output)], gpu.output_chunks))) + gpu_elem.set('s_chunks', str(max(buffer_sizes[(gpu.rank, Buffer.scratch)], gpu.scratch_size()))) for tb in gpu.threadblocks: tb_elem = ET.SubElement(gpu_elem, 'tb') tb_elem.set('id', str(tb_id[tb])) diff --git a/sccl/language/passes.py b/sccl/language/passes.py old mode 100644 new mode 100755 index eeda2c9..6f71edd --- a/sccl/language/passes.py +++ b/sccl/language/passes.py @@ -36,7 +36,7 @@ def check_threadblock_ordering(rank_dag): # happen in the same order. 
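# --- Editor's note (sketch, not part of this patch): the failure mode this pass
# catches. If one threadblock on rank 0 issues send(X) then send(Y), but the
# matching receives on rank 1 land in a single threadblock as recv(Y) then
# recv(X), both GPUs block forever. With the new recv_match links the invariant
# amounts to:
#   prev = {}  # receiving tbid -> step of the last matched recv
#   for op in tb.ops:
#       if op.is_send():
#           assert op.recv_match.step > prev.get(op.recv_match.tb, -1)
#           prev[op.recv_match.tb] = op.recv_match.step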
for op_step, op in enumerate(tb.ops): if op.is_send(): - match = op.match[0] + match = op.recv_match if match.is_recv(): assert op.dst.rank == match.rank, f"Bug in SCCLang: Sends don't match receives" @@ -46,10 +46,10 @@ def check_threadblock_ordering(rank_dag): print("Offending Steps", match.step, prev_steps[other_tbid].step) print("Sending tb") for op in tb.ops: - print(f'{op.step}: {op} priority:{(op.chunk_step, op.priority)}') + print(f'{op.step}: Recv step: {op.recv_match.step if op.is_send() else -1} {op} priority:{(op.chunk_step, op.priority, op.dst.index)}') print("Receiving tb") for op in rank_dag.tbs[match.rank][other_tbid].ops: - print(f'{op.step}: {op} priority:{(op.chunk_step, op.priority)}') + print(f'{op.step}: {op} priority:{(op.chunk_step, op.priority, op.dst.index)}') assert match.step > prev_steps[other_tbid].step, f"Rank {op.rank} sends op1 then op2 but {match.rank} receives op2 then op1" prev_steps[other_tbid] = match diff --git a/sccl/language/rank_dag.py b/sccl/language/rank_dag.py old mode 100644 new mode 100755 index 37ed867..45a52e9 --- a/sccl/language/rank_dag.py +++ b/sccl/language/rank_dag.py @@ -4,19 +4,11 @@ from dataclasses import dataclass from enum import Enum import heapq +import functools from sccl.language.ir import * from sccl.language.passes import * -# Returns whether an operation writes to a particular slot -def writes_to_slot(op, slot): - # If the instruction is a copy or reduce, check to see if the destination matches the slot - if op.inst == Instruction.copy or op.inst == Instruction.reduce: - cpy_src = op.src - _, buffer, index = slot - return buffer != cpy_src.buffer or (index < cpy_src.index and index > (cpy_src.index + cpy_src.size)) - return op.inst != Instruction.send - def remove_op(op): for p in op.prev: p.next.remove(op) @@ -27,7 +19,7 @@ def remove_op(op): n.prev = op.prev.union(n.prev) def same_tb(op1, op2): - return op1.tb == op2.tb + return op1.tb == op2.tb and op1.channel == op2.channel def same_count(op1, op2): return op1.cnt() == op2.cnt() @@ -35,175 +27,130 @@ def same_count(op1, op2): def same_buf_dst(op1, op2): return op1.dst.buffer == op2.dst.buffer and op1.dst.index == op2.dst.index -class RankDAG: +class InstructionDAG: def __init__(self, num_ranks, buffers): self.num_ranks = num_ranks self.buffers = buffers - self.slots = [] # slot = (rank, buffer, index) + # State for the actual instruction DAG self.operations = {} # slot -> operations + self.last_writer = {} # slot -> last writing op + self.last_readers = defaultdict(list) # slot -> list of last reading ops + # State for the MSCCL-IR self.tbs = [] for _ in range(num_ranks): self.tbs.append({}) self.tb_mapping = {} + self.num_channels = [1] * num_ranks + + # InstructionDAG helper - identifies the dependencies for a write-type operation (recv, copy, rrc, reduce) + def _write(self, rank, buffer, index, size, op, read=False): + prev_ops = set() + for i in range(index, index+size): + slot = (rank, buffer, i) + if read: + assert slot in self.last_writer, f"Destination slot has never been written before a reduce {op}" + # First write to this slot + if slot not in self.operations: + self.operations[slot] = op + + # If there are active readers - these are the previous operations + # Else the previous operation is the last write (if there is one) + readers = self.last_readers[slot] + if len(readers) > 0: + prev_ops.update(readers) + elif slot in self.last_writer: + prev_ops.add(self.last_writer[slot]) + + # Set the last_writer to this op, and clear all readers + 
self.last_writer[slot] = op + self.last_readers[slot] = [] + + # Update the next pointer of the previous ops + for prev_op in prev_ops: + prev_op.next.add(op) + op.prev.add(prev_op) + + # InstructionDAG helper - identifies the dependencies for read-type operations (send, copy, reduce) + def _read(self, rank, buffer, index, size, op): + prev_ops = set() + for i in range(index, index+size): + slot = (rank, buffer, i) + assert slot in self.last_writer, f"Slot has never been written before a read-type {op}" + # The previous operation for a reader is the last write to the slot + writer = self.last_writer[slot] + prev_ops.add(writer) + self.last_readers[slot].append(op) + + # Update the next pointer of the previous ops + for prev_op in prev_ops: + prev_op.next.add(op) + op.prev.add(prev_op) + + # InstructionDAG - builds the roots of the DAG def add_start(self, rank, buffer, index, ref): slot = (rank, buffer, index) - self.slots.append(slot) - - op = Op(Instruction.start, rank, ref, ref, next=set(), prev=set()) + op = Op(Instruction.start, rank, ref, ref, next=set(), prev=set(), chunk_step=-1) self.operations[slot] = op + self.last_writer[slot] = op - # Find the last write to happen on this slot - def find_last_recv(self, slot): - def dfs(op): - # Found the last operation on the slot - if len(op.next) == 0: - return writes_to_slot(op, slot), op - else: - last_recvs = False - # Check if any of the children is the last write - for o in op.next: - is_last_recv, recv_op = dfs(o) - if is_last_recv: - return True, recv_op - last_recvs = last_recvs or is_last_recv - # Check if we are the last write - if writes_to_slot(op, slot) and not last_recvs: - return True, op - return False, op - - result, op = dfs(self.operations[slot]) - assert result - return op - - # Find the last set of operations that happened on this slot - # There may be multiple as sends can happen in parallel - def find_last_ops(self, slot): - frontier = [self.operations[slot]] - last_ops = [] - while len(frontier) > 0: - op = frontier[0] - if len(op.next) == 0: - last_ops.append(op) - frontier = frontier[1:] + list(op.next) - return last_ops - - def add_copy(self, rank, send_ref, recv_ref, step, priority, tb): - op = Op(Instruction.copy, rank, send_ref, recv_ref, chunk_step=step, priority=priority, next=set(), prev=set(), tb=tb) + # InstructionDAG - adds a copy node + def add_copy(self, rank, send_ref, recv_ref, tb, ch): + op = Op(Instruction.copy, rank, send_ref, recv_ref, next=set(), prev=set(), tb=tb, channel=ch) dstbuffer = recv_ref.buffer dstindex = recv_ref.index srcbuffer = send_ref.buffer srcindex = send_ref.index size = recv_ref.size - prev_ops = [] - - # Sending part of copy - for i in range(srcindex, srcindex+size): - slot = (rank, srcbuffer, i) - prev_op = self.find_last_recv(slot) # All operations that need to happen before - prev_op.next.add(op) - op.prev.add(prev_op) - - # Receiving part of copy - prev_ops = set() - for i in range(dstindex, dstindex+size): - slot = (rank, dstbuffer, i) - if slot in self.operations: - prev_op = self.find_last_ops(slot) - prev_ops.append(prev_op) # All operations that need to happen before - else: - self.operations[slot] = op - - for prev_op in prev_ops: - if op not in prev_op.next: - prev_op.next.add(op) - op.prev.add(prev_op) + # Sending part of copy [Read] + self._read(rank, srcbuffer, srcindex, size, op) + # Receiving part of copy [Write] + self._write(rank, dstbuffer, dstindex, size, op) + return op - def add_reduce(self, rank, send_ref, recv_ref, step, priority, tb): - op = 
Op(Instruction.reduce, rank, send_ref, recv_ref, chunk_step=step, priority=priority, next=set(), prev=set(), tb=tb) + # InstructionDAG - adds a reduce node + def add_reduce(self, rank, send_ref, recv_ref, tb, ch): + op = Op(Instruction.reduce, rank, send_ref, recv_ref, next=set(), prev=set(), tb=tb, channel=ch) dstbuffer = recv_ref.buffer dstindex = recv_ref.index srcbuffer = send_ref.buffer srcindex = send_ref.index size = recv_ref.size prev_ops = [] + # Sending part of reduce + self._read(rank, srcbuffer, srcindex, size, op) + # Reduce part of reduce + self._write(rank, dstbuffer, dstindex, size, op, read=True) + return op - # B - for i in range(srcindex, srcindex+size): - slot = (rank, srcbuffer, i) - prev_op = self.find_last_recv(slot) # All operations that need to happen before - prev_op.next.add(op) - op.prev.add(prev_op) - - # A - prev_ops = [] - for i in range(dstindex, dstindex+size): - slot = (rank, dstbuffer, i) - if slot in self.operations: - prev_op = self.find_last_ops(slot) - prev_ops = prev_ops + prev_op # All operations that need to happen before - - for prev_op in prev_ops: - if op not in prev_op.next: - prev_op.next.add(op) - op.prev.add(prev_op) - - def add_send(self, rank, send_ref, recv_ref, step, priority, tb, ch): - op = Op(Instruction.send, rank, send_ref, recv_ref, chunk_step=step, priority=priority, next=set(), prev=set(), tb=tb, channel=ch) + # InstructionDAG - adds a send node + def add_send(self, rank, send_ref, recv_ref, tb, ch): + op = Op(Instruction.send, rank, send_ref, recv_ref, next=set(), prev=set(), tb=tb, channel=ch) buffer = send_ref.buffer index = send_ref.index size = send_ref.size - prev_ops = [] - for i in range(index, index+size): - slot = (rank, buffer, i) - prev_op = self.find_last_recv(slot) - prev_ops.append(prev_op) # All operations that need to happen before - - for prev_op in prev_ops: - if op not in prev_op.next: - prev_op.next.add(op) - op.prev.add(prev_op) + self._read(rank, buffer, index, size, op) return op - def add_recv(self, rank, send_ref, recv_ref, step, priority, tb, ch): - op = Op(Instruction.recv, rank, send_ref, recv_ref, chunk_step=step, priority=priority, next=set(), prev=set(), tb=tb, channel=ch) + # InstructionDAG - adds a recv node + def add_recv(self, rank, send_ref, recv_ref, tb, ch, send_op): + op = Op(Instruction.recv, rank, send_ref, recv_ref, next=set(), prev=set(), tb=tb, channel=ch) buffer = recv_ref.buffer index = recv_ref.index size = recv_ref.size - - prev_ops = set() - for i in range(index, index+size): - slot = (rank, buffer, i) - if slot in self.operations: - slot_prev_ops = self.find_last_ops(slot) # All operations that need to happen before - prev_ops = prev_ops.union(slot_prev_ops) - else: - self.operations[slot] = op - if len(prev_ops) > 0: - for prev_op in prev_ops: - prev_op.next.add(op) - op.prev.add(prev_op) + self._write(rank, buffer, index, size, op) + op.send_match = send_op return op - def add_recv_reduce_copy(self, rank, send_ref, recv_ref, step, priority, tb, ch): - op = Op(Instruction.recv_reduce_copy, rank, send_ref, recv_ref, chunk_step=step, priority=priority, next=set(), prev=set(), tb=tb, channel=ch) + # InstructionDAG - adds a rrc node + def add_recv_reduce_copy(self, rank, send_ref, recv_ref, tb, ch, send_op): + op = Op(Instruction.recv_reduce_copy, rank, send_ref, recv_ref, next=set(), prev=set(), tb=tb, channel=ch) buffer = recv_ref.buffer index = recv_ref.index size = recv_ref.size - - prev_ops = set() - for i in range(index, index+size): - slot = (rank, buffer, i) - if slot in 
self.operations: - slot_prev_ops = self.find_last_ops(slot) # All operations that need to happen before - prev_ops = prev_ops.union(slot_prev_ops) - else: - self.operations[slot] = op - if len(prev_ops) > 0: - for prev_op in prev_ops: - prev_op.next.add(op) - op.prev.add(prev_op) + self._write(rank, buffer, index, size, op, read=True) + op.send_match = send_op return op def convert_set_list(self): @@ -229,6 +176,29 @@ def convert_set_list(self): def optimize(self): self._optimize_rrcs_rrs() self._optimize_rcs() + + # Completes metadata for chunk_steps (number of steps from a start op) and priority (number of steps to the last op) + def _complete_metadata(self): + def dfs(op, cs): + op.chunk_step = max(op.chunk_step, cs+1) + + if len(op.next) == 0 and op.recv_match is None: + op.priority = 0 + else: + for o in op.next: + dfs(o, op.chunk_step) + # Priority = +1 of the highest priority child + if len(op.next) > 0: + highest_next_priority = max([x.priority+1 for x in op.next]) + op.priority = max(highest_next_priority, op.priority) + if op.is_send(): + dfs(op.recv_match, op.chunk_step) + op.priority = max(op.priority, op.recv_match.priority+1) + + for chunk, op in self.operations.items(): + if op.inst == Instruction.start: + dfs(op,-2) # Start instructions should start at -1 + # Given the set of operations that operate over a particular slot (rank, buffer, idx) fixed # Try and replace operations with pipelined ops like receive copy send (rcs) @@ -241,15 +211,20 @@ def _optimize_rcs(self): frontier = [ops] while len(frontier) > 0: op = frontier[0] - if len(op.next) == 1: - next_op = op.next[0] + for next_op in op.next: if op.inst == Instruction.recv and next_op.inst == Instruction.send and same_tb(op, next_op) and same_count(op, next_op) and same_buf_dst(op, next_op): + # recv -> rcs, remove send op.inst = Instruction.recv_copy_send op.dst = next_op.dst - op.match = op.match + next_op.match + next_op.recv_match.send_match = op + op.recv_match = next_op.recv_match remove_op(next_op) + break frontier = frontier[1:] + op.next - + # recv-reduce-send - A rrc followed by a send that gets overwritten + # rrc(src, sbuf, si, ...) send(_, _, _, dst, dbuf, di) recv(_, _, _, dst, dbuf, di) + # recv-reduce-copy-send - A rrc followed by a send that does not get overwritten + # rrc(src, sbuf, si, ...) 
send(_, _, _, dst, dbuf, di) def _optimize_rrcs_rrs(self): # RRC/S -> RRS for slot, ops in self.operations.items(): @@ -260,16 +235,18 @@ def _optimize_rrcs_rrs(self): next_op = op.next[0] if len(next_op.next) == 1: nnext_op = next_op.next[0] - if op.inst == Instruction.recv_reduce_copy and next_op.inst == Instruction.send and nnext_op.inst == Instruction.recv and same_tb(op, next_op) and same_count(op, next_op): + if op.inst == Instruction.recv_reduce_copy and next_op.inst == Instruction.send and nnext_op.inst is Instruction.recv and same_tb(op, next_op) and same_count(op, next_op) and same_buf_dst(op, next_op): op.inst = Instruction.recv_reduce_send op.dst = next_op.dst - op.match = op.match + next_op.match + next_op.recv_match.send_match = op + op.recv_match = next_op.recv_match remove_op(next_op) - if op.inst == Instruction.recv_reduce_copy and next_op.inst == Instruction.send and same_tb(op, next_op) and same_count(op, next_op): + if op.inst == Instruction.recv_reduce_copy and next_op.inst == Instruction.send and same_tb(op, next_op) and same_count(op, next_op) and same_buf_dst(op, next_op): op.inst = Instruction.recv_reduce_copy_send op.dst = next_op.dst - op.match = op.match + next_op.match + next_op.recv_match.send_match = op + op.recv_match = next_op.recv_match remove_op(next_op) frontier = frontier[1:] + op.next @@ -367,12 +344,13 @@ def get_instance_ref(ref): iref = ChunkRef(ref.rank, ref.buffer, iindex, ref.size) return iref + max_channels = max(self.num_channels) for i in range(instances): # Generate all the threadblocks and ops for rank, rank_tbs in enumerate(self.tbs): - rank_channels = self.num_channels [rank] + # rank_channels = self.num_channels[rank] for tbid, tb in rank_tbs.items(): - instance_channel = rank_channels * i + tb.channel + instance_channel = max_channels * i + tb.channel itb = Threadblock(instance_channel, tb.send, tb.recv) itbid = tbid * instances + i itb.ops = [None] * len(tb.ops) diff --git a/sccl/language/routines.py b/sccl/language/routines.py new file mode 100644 index 0000000..6cb8898 --- /dev/null +++ b/sccl/language/routines.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
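# --- Editor's note (usage sketch, not part of this patch): these routines are
# building blocks meant to run inside an SCCLProgram context, along the lines
# of (the collective constructed here is an assumption, not from this commit):
#   with SCCLProgram("allgather_ring", fully_connected(4), collective, instances=1):
#       allgather_ring_inplace(4)
#       XML()
#       Check()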
+ +from sccl.language import * +from sccl.topologies import * +from sccl.language.collectives import * + +def allgather_ring_inplace(gpus, gpu_offset=0, index_offset=0, ch=0): + for rank in range(gpu_offset, gpu_offset+gpus): + index = index_offset + rank - gpu_offset + c = chunk(rank, Buffer.input, 0) + for r_next in range(1, gpus): + next_rank = (rank + r_next) % gpus + gpu_offset + c = c.copy(next_rank, Buffer.output, index, ch=ch) + +def allreduce_ring_inplace(gpus, gpu_offset=0, index_offset=0, ch=0): + for rank in range(gpu_offset, gpu_offset+gpus): + index = index_offset + rank - gpu_offset + c = chunk(rank, Buffer.input, index) + # Reduce ring + for r_next in range(1, gpus): + next_rank = (rank + r_next) % gpus + gpu_offset + c = chunk(next_rank, Buffer.input, index).reduce(c, ch=ch) + # Propagate ring + for r_next in range(0, gpus-1): + next_rank = (rank + r_next) % gpus + gpu_offset + c = c.copy(next_rank, Buffer.input, index, ch=ch) + \ No newline at end of file diff --git a/sccl/language/tb_assignment.py b/sccl/language/tb_assignment.py old mode 100644 new mode 100755 index 201f96c..e1761e8 --- a/sccl/language/tb_assignment.py +++ b/sccl/language/tb_assignment.py @@ -20,162 +20,207 @@ def _verify_tb_op_compatible(tb, op): # Manual threadblock, channel assignment def manual_assign_tbs(rank_dag): - ops = [] - for slot, op in rank_dag.operations.items(): - if op.inst == Instruction.start: - for o in op.next: - if o.inst == Instruction.send or o.inst == Instruction.copy: - heapq.heappush(ops, o) - - rank_dag.num_channels = [1] * rank_dag.num_ranks - visited = set() - while len(ops) > 0: - op = heapq.heappop(ops) - if op not in visited: - visited.add(op) - rank = op.rank - tbid = op.tb - if tbid not in rank_dag.tbs[rank]: - rank_dag.tbs[rank][tbid] = Threadblock() - tb = rank_dag.tbs[rank][tbid] - if _verify_tb_op_compatible(tb, op): - tb.ops.append(op) - tb.channel = op.channel if op.channel != -1 else 0 - tb.send = op.dst.rank if op.is_send() else tb.send - tb.recv = op.src.rank if op.is_recv() else tb.recv - op.step = len(tb.ops)-1 - rank_dag.num_channels[rank] = max(op.channel+1, rank_dag.num_channels[rank] ) - else: - raise Exception(f"Illegal threadblock assignment. 
Trying to add {op} to threadblock {tbid}\n" \ - f"Threadblock {tbid} send:{tb.send} recv:{tb.recv} channel:{tb.channel}\n" \ - f"Operation send:{op.dst.rank if op.is_send() else -1} recv:{op.dst.rank if op.is_recv() else -1} channel:{op.channel}") - - for o in op.next: - heapq.heappush(ops, o) - for o in op.match: - heapq.heappush(ops, o) - - -def _get_tb_options(mapping, send, recv, channel, num_tbs, num_channels): - if send == -1 and recv == -1: # Can go anywhere - return list(i for i in range(0, num_tbs)) - if channel == -1: # Can go on any channel that matches to send, recv - options = [] - for ch in range(num_channels): - if (send, recv, ch) in mapping: - options.append(mapping[(send, recv, ch)]) - return options - elif (send, recv, channel) in mapping: - return [mapping[(send, recv, channel)]] - # Double up if necessary - else: - options = [] - for requirements, tbid in mapping.items(): - tb_s, tb_r, tb_c = requirements - sender_ok = send == -1 or tb_s == -1 or tb_s == send - receiver_ok = recv == -1 or tb_r == -1 or tb_r == recv - channel_ok = channel == -1 or channel == tb_c - if sender_ok and receiver_ok and channel_ok: - options.append(tbid) - return options - -def create_base_tbs(rank_dag): - ops = [] - tbid = [0] * rank_dag.num_ranks - tb_assignments = [] # rank -> (sender, receiver, channel) -> tbid - for _ in range(rank_dag.num_ranks): - tb_assignments.append({}) - num_channels = [1] * rank_dag.num_ranks - - for slot, op in rank_dag.operations.items(): - if op.inst == Instruction.start: - for o in op.next: - ops.append(o) - elif op.inst != Instruction.copy: - ops.append(op) - - visited = set() - i = 0 - while i < len(ops): - op = ops[i] - if op not in visited: - visited.add(op) - rank = op.rank - s = op.dst.rank if op.is_send() else -1 - r = op.src.rank if op.is_recv() else -1 - channel = 0 if op.channel == -1 else op.channel - if op.channel >= num_channels[rank]: - num_channels[rank] = op.channel + 1 - - if (s != -1 or r != -1) and (s,r,channel) not in tb_assignments[rank]: - rank_dag.tbs[rank][tbid[rank]] = Threadblock(send=s, recv=r, channel=channel) - tb_assignments[rank][(s,r,channel)] = tbid[rank] - tbid[rank] += 1 - ops += op.next - i += 1 - - rank_dag.tb_assignments = tb_assignments - rank_dag.num_channels = num_channels - + instrs = topo_sort_instrs(rank_dag) + for op in instrs: + + rank = op.rank + tbid = op.tb + if tbid not in rank_dag.tbs[rank]: + rank_dag.tbs[rank][tbid] = Threadblock() + tb = rank_dag.tbs[rank][tbid] + if _verify_tb_op_compatible(tb, op): + tb.ops.append(op) + tb.channel = op.channel if op.channel != -1 else 0 + tb.send = op.dst.rank if op.is_send() else tb.send + tb.recv = op.src.rank if op.is_recv() else tb.recv + op.step = len(tb.ops)-1 + rank_dag.num_channels[rank] = max(op.channel+1, rank_dag.num_channels[rank] ) + else: + raise Exception(f"Illegal threadblock assignment. Trying to add {op} to threadblock {tbid}\n" \ + f"Threadblock {tbid} send:{tb.send} recv:{tb.recv} channel:{tb.channel}\n" \ + f"Operation send:{op.dst.rank if op.is_send() else -1} recv:{op.dst.rank if op.is_recv() else -1} channel:{op.channel}") + +def _get_tb_options(mapping, send, recv, channel, num_tbs): + options = [] + for tbid, tb in mapping.items(): + tb_s = tb.send + tb_r = tb.recv + tb_c = tb.channel + sender_ok = send == -1 or tb_s == send + receiver_ok = recv == -1 or tb_r == recv + channel_ok = channel == -1 or channel == tb_c + # For correctness - if one of the peer's channels is already allocated we must use it. 
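# --- Editor's illustration (not part of this patch): if threadblock 2 already
# has send=1 on channel 0, a later op that sends to rank 1 on channel 0 must be
# placed on threadblock 2 - the early return below enforces that a given
# (peer, channel) connection is owned by exactly one threadblock.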
+ if channel_ok and ((tb_s == send and send != -1) or (tb_r == recv and recv != -1)): + return [tbid] + if sender_ok and receiver_ok and channel_ok: + options.append(tbid) + return options def auto_assign_tbs(rank_dag): - # Allocate the base set of TBs - tb_assignments = rank_dag.tb_assignments - num_channels = rank_dag.num_channels - current_num_tb = [] - for rank_tbs in rank_dag.tbs: - current_num_tb.append(len(rank_tbs)) + instrs = topo_sort_instrs(rank_dag) + channel_assignment(instrs, rank_dag) + rank_tbids = [0] * rank_dag.num_ranks current_tb_step = [] for rank_tbs in rank_dag.tbs: - tb_step = {} - for tbid in rank_tbs.keys(): - tb_step[tbid] = 0 - current_tb_step.append(tb_step) + current_tb_step.append({}) + + for op in instrs: + rank = op.rank + s = op.send_peer() + r = op.recv_peer() + channel = 0 if op.channel == -1 else op.channel + # Get all possible TBs this can be mapped to + tb_options = _get_tb_options(rank_dag.tbs[rank], s, r, channel, rank_tbids[rank]) + if len(tb_options) == 0: # If there are no options, create a new threadblock + tbid = rank_tbids[rank] + rank_dag.tbs[rank][tbid] = Threadblock(send=s, recv=r, channel=channel) + rank_tbids[rank] += 1 + else: + tbid = tb_options[0] + for tbid_opt in tb_options: + if current_tb_step[rank][tbid_opt] < current_tb_step[rank][tbid] and _verify_tb_op_compatible(rank_dag.tbs[rank][tbid], op): + tbid = tbid_opt + + tb = rank_dag.tbs[rank][tbid] + assert _verify_tb_op_compatible(tb, op), f"Failing: Operation uses channel {op.channel}, send:{s} recv:{r} {op}\n" \ + f"Threadblock uses send:{tb.send} recv:{tb.recv} channel:{tb.channel}" + + rank_dag.num_channels[rank] = max(rank_dag.num_channels[rank], channel+1) + + tb.ops.append(op) + tb.send = op.dst.rank if op.is_send() else tb.send + tb.recv = op.src.rank if op.is_recv() else tb.recv + + op.step = len(tb.ops)-1 + op.tb = tbid + current_tb_step[rank][tbid] = op.chunk_step +# Topologically orders instructions so that (1): Sends occur before their receives +# (2): Dependent instructions occur before the instructions that depend on them +def topo_sort_instrs(rank_dag): + def priority(op): + return ((op.chunk_step, -op.priority, op.dst.index)) + + visited = set() ops = [] + ordered = [] for slot, op in rank_dag.operations.items(): if op.inst == Instruction.start: + visited.add(op) for o in op.next: if o.inst == Instruction.send or o.inst == Instruction.copy: - heapq.heappush(ops, ((o.chunk_step, o.priority, o.dst.index), o)) + heapq.heappush(ops, (priority(o), o)) - visited = set() while len(ops) > 0: _, op = heapq.heappop(ops) if op not in visited: + rmatch = op.recv_match + ordered.append(op) visited.add(op) + + # Add a matching receive if one exists and its dependencies are satisfied + if rmatch is not None and all([x in visited for x in rmatch.prev]): + heapq.heappush(ops, (priority(rmatch), rmatch)) + # Add other operations that have their dependencies satisfied + for o in op.next: + if all([x in visited for x in o.prev]): + heapq.heappush(ops, (priority(o), o)) + return ordered + +def channel_assignment(instrs, rank_dag): + def all_channels(): + return set([x for x in range(32)]) # First handle flows - if an instruction at Rx is fused Rw->Rx->Ry and takes c + # Then flow Rw->Rx->Rz must be in a different channel c' where c!=c' + rank2sendch = [defaultdict(all_channels) for _ in range(rank_dag.num_ranks)] + rank2recvch = [defaultdict(all_channels) for _ in range(rank_dag.num_ranks)] + + # DFS through the InstructionDAG identifying flows + def valid_send_ch(sender, receiver, ch): + return ch in 
rank2sendch[sender][receiver] + def valid_recv_ch(sender, receiver, ch): + return ch in rank2recvch[receiver][sender] + + # Returns a channel this flow can be scheduled on, else -1 + def is_matching_flow(flow): + if flow in flows: + return flow_channels[flows.index(flow)] + return -1 + + def reserve_channel(sender, receiver, ch): + if ch in rank2sendch[sender][receiver]: + rank2sendch[sender][receiver].remove(ch) + if ch in rank2recvch[receiver][sender]: + rank2recvch[receiver][sender].remove(ch) + flows = [] + flow_channels = [] + + def create_flow(f): + flow = set() + for i in range(1, len(f)): + flow.add((f[i-1], f[i])) + return flow + + def dfs(op, channels, f): + if op.is_local(): + op.channel = 0 + elif op.is_send(): + match = op.recv_match + sender = op.rank + receiver = match.rank + # Available channels + channels = rank2sendch[sender][receiver].intersection(rank2recvch[receiver][sender]).intersection(channels) + f.append(op.rank) + # If not a fused op use the first possible channel (send, recv/rrc) + if not match.is_fused(): + f.append(match.rank) + flow = create_flow(f) + # If the user has already manually scheduled this onto a channel, respect it + if op.channel != -1: + ch = op.channel + else: + ch = is_matching_flow(flow) + if ch == -1: # No flow matched - use the smallest available channel + ch = min(channels) + flows.append(flow) + flow_channels.append(ch) + + op.channel = ch + match.channel = ch + reserve_channel(sender, receiver, ch) + else: + dfs(match, channels, f) + ch = match.channel + op.channel = ch + reserve_channel(sender, receiver, ch) + + # Assign channels to flows + for op in instrs: + if op.inst == Instruction.send and op.recv_match.is_fused(): + dfs(op, all_channels(), []) + + # Iterate through and make certain the sends and receives between a pair of GPUs is consistent + # Shift a (s,r) pair to another channel if the ordering isn't consistent + repeat = True + while repeat: + repeat = False + pending_recv = defaultdict(list) # (sender, receiver, ch) -> pending receive + for op in instrs: rank = op.rank - s = op.dst.rank if op.is_send() else -1 - r = op.src.rank if op.is_recv() else -1 - # Get all possible TBs this can be mapped to - tb_options = _get_tb_options(tb_assignments[rank], s, r, op.channel, current_num_tb[rank], num_channels[rank]) - # If there are multiple options choose the TB at the lowest step - tbid = tb_options[0] - if len(tb_options) > 1: - for tbid_opt in tb_options: - if current_tb_step[rank][tbid_opt] < current_tb_step[rank][tbid] and _verify_tb_op_compatible(rank_dag.tbs[rank][tbid], op): - tbid = tbid_opt - - tb = rank_dag.tbs[rank][tbid] - assert _verify_tb_op_compatible(tb, op), f"Failing: Channel {op.channel}, send {s} recv {r} {op}\n" \ - f"Threadblock send:{tb.send} recv:{tb.recv} channel{tb.channel}" - - tb.ops.append(op) - tb.send = op.dst.rank if op.is_send() else tb.send - tb.recv = op.src.rank if op.is_recv() else tb.recv + channel = 0 if op.channel == -1 else op.channel + if op.is_send(): + dst = op.dst.rank + pending_recv[(rank, dst, channel)].append(op.recv_match) - op.step = len(tb.ops)-1 - op.channel = tb.channel - op.tb = tbid - current_tb_step[rank][tbid] = op.chunk_step + if op.is_recv(): + src = op.src.rank + pr = pending_recv[(src, rank, channel)] + if op in pr: + if pr[0] is op: + del pr[0] + else: + repeat = True + op.channel += 1 + op.send_match.channel += 1 + pr.remove(op) - # For correctness make certain the matching sends and receives - # happen on the same channel - for match in op.match: - match.channel = 
tb.channel - for o in op.match: - heapq.heappush(ops, ((o.chunk_step, o.priority, o.dst.index), o)) - for o in op.next: - heapq.heappush(ops, ((o.chunk_step, o.priority, o.dst.index), o)) - \ No newline at end of file diff --git a/sccl/language/visualize.py b/sccl/language/visualize.py new file mode 100755 index 0000000..5ffca4e --- /dev/null +++ b/sccl/language/visualize.py @@ -0,0 +1,103 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import igraph as ig +from sccl.language.ir import * +from sccl.language.rank_dag import * + +def visualize_chunk_dag(chunk_paths): # pragma: no cover + frontier = [] + nnodes = 0 + vertex_label = [] + vertex_colors = [] + edges = [] + visited = set() + + def add_node(op, nnodes, vertex_label, vertex_colors): + if op.num == -1: + op.num = nnodes + nnodes += 1 + if op.inst == ChunkInstruction.start: + vertex_label.append(f'Start at {op.dst.rank}, {op.dst.index}.') + vertex_colors.append('yellow') + elif op.inst == ChunkInstruction.send: + vertex_label.append(f'Send to Rank {op.dst.rank} {op.dst.index}. {op.steps_to_end}, {op.steps_from_start}') + vertex_colors.append('blue') + elif op.inst == ChunkInstruction.reduce: + vertex_label.append(f'Reduce with {op.dst.rank} {op.dst.index}. {op.steps_to_end}, {op.steps_from_start}') + vertex_colors.append('green') + return nnodes + + for chunk, op in chunk_paths.items(): + if len(op.prev) == 0: + frontier.append(op) + + while len(frontier) > 0: + op = frontier[0] + if op in visited: + frontier = frontier[1:] + else: + nnodes = add_node(op, nnodes, vertex_label, vertex_colors) + for next_op in op.next: + nnodes = add_node(next_op, nnodes, vertex_label, vertex_colors) + edges.append([op.num, next_op.num]) + frontier = frontier[1:] + op.next + visited.add(op) + + g = ig.Graph(nnodes, edges, directed=True) + layout = g.layout(layout=ig.Graph.layout_grid) + ig.plot(g, vertex_label=vertex_label, vertex_color=vertex_colors, layout='auto') + +def visualize_rank_dag(operations): # pragma: no cover + frontier = [] + nnodes = 0 + vertex_label = [] + vertex_colors = [] + edges = [] + visited = set() + colors = ['red', 'green', 'blue', 'yellow', 'teal', 'pink', 'purple', 'orange'] + + def add_node(op, nnodes, vertex_label, vertex_colors): + if op.num == -1: + op.num = nnodes + nnodes += 1 + # Add new node to graph + if op.inst == Instruction.start: + vertex_label.append(f'Chunk {op.src.index} Rank {op.src.rank}') + elif op.inst == Instruction.send: + vertex_label.append(f'S to Rank {op.dst.rank}') + elif op.inst == Instruction.recv: + vertex_label.append(f'R from {op.src.rank}') + elif op.inst == Instruction.recv_reduce_copy: + vertex_label.append(f'RRC from {op.src.rank}') + else: + vertex_label.append(f'{op.inst}') + + # Add colors + if op.inst == Instruction.start: + vertex_colors.append('gray') + else: + vertex_colors.append(colors[op.tb % len(colors)]) + return nnodes + + for slot, op in operations.items(): + if len(op.prev) == 0: + frontier.append(op) + + while len(frontier) > 0: + op = frontier[0] + + if op in visited: + frontier = frontier[1:] + else: + nnodes = add_node(op, nnodes, vertex_label, vertex_colors) + + for next_op in op.next: + nnodes = add_node(next_op, nnodes, vertex_label, vertex_colors) + edges.append([op.num, next_op.num]) + frontier = frontier[1:] + list(op.next) + visited.add(op) + + g = ig.Graph(nnodes, edges, directed=True) + layout = g.layout(layout=ig.Graph.layout_grid) + ig.plot(g, vertex_label=vertex_label, vertex_color=vertex_colors, layout='rt') \ No newline at 
end of file diff --git a/sccl/ncd_reduction.py b/sccl/ncd_reduction.py old mode 100644 new mode 100755 diff --git a/sccl/path_encoding.py b/sccl/path_encoding.py old mode 100644 new mode 100755 diff --git a/sccl/programs/__init__.py b/sccl/programs/__init__.py old mode 100644 new mode 100755 diff --git a/sccl/programs/allreduce_a100_ring.py b/sccl/programs/allreduce_a100_ring.py old mode 100644 new mode 100755 index a3feb20..1a06bf8 --- a/sccl/programs/allreduce_a100_ring.py +++ b/sccl/programs/allreduce_a100_ring.py @@ -13,12 +13,12 @@ def allreduce_ring(size, channels): # Reduce ring for step in range(0, size-1): - for index in range(0, size): - rank = (index + step) % size - c = chunk(rank, Buffer.input, index) - next_rank = (index + step + 1) % size - channel = index%channels - c = c.reduce(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) + for index in range(0, size): + rank = (index + step) % size + next_rank = (index + step + 1) % size + channel = index%channels + c = chunk(next_rank, Buffer.input, index) + c.reduce(chunk(rank, Buffer.input, index), ch=channel, recvtb=channel, sendtb=channel) # Propagate ring for step in range(-1, size-2): for index in range(0, size): @@ -26,4 +26,4 @@ def allreduce_ring(size, channels): c = chunk(rank, Buffer.input, index) next_rank = (index + step + 1) % size channel = index%channels - c = c.send(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) \ No newline at end of file + c = c.copy(next_rank, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel) \ No newline at end of file diff --git a/sccl/programs/alltoall_a100_8kp1.py b/sccl/programs/alltoall_a100_8kp1.py old mode 100644 new mode 100755 index 38c03cd..aca4795 --- a/sccl/programs/alltoall_a100_8kp1.py +++ b/sccl/programs/alltoall_a100_8kp1.py @@ -48,17 +48,17 @@ def AddChunk(ib_chunks, key, c): buffer_key = (n1, n2) # Send chunk to the gather_rank. Send returns a chunk reference to the # receiver's chunk - c = c.send(gather_rank, buffer=buffer_key, ch=ch*2) + c = c.copy(gather_rank, buffer=buffer_key, ch=ch*2) # Group the chunks using a particular IB pair into one large chunk reference AddChunk(ib_chunks, buffer_key, c) else: - # Within a node - direct send/copy the chunks over nvlink to the output buffer. - # Use a different channel to ensure that we don't get in the way of sends/receives above + # Within a node - direct copy the chunks over nvlink to the output buffer. + # Use a different channel to ensure that we don't get in the way of copies/receives above # which are on the critical path. for g2 in range(gpus_per_node): r2 = RankFromNodeGpuPair(n2, g2) c = chunk(r1, Buffer.input, r2 * instances + ch) - c.send(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch*2) + c.copy(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch*2) @@ -66,20 +66,20 @@ def AddChunk(ib_chunks, key, c): for buffer_key, ib_chunk in ib_chunks.items(): (n1, n2) = buffer_key _, scatter_rank = CrossNodeGpus(n1, n2) - # IB send divided across multiple parallel channels + # IB copy divided across multiple parallel channels chunks = ib_chunk.split(ib_connections) for ch, c in enumerate(chunks): - # Note: If we are only going to use 1 IB connection for each IB send + # Note: If we are only going to use 1 IB connection for each IB copy # alternate between channels 0 and 1 to utilize both IB links. 
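 # e.g. when ib_connections == 1, a chunk sitting on an even gather rank goes over channel 0 and one on an odd gather rank over channel 1 (c.rank % 2 below).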
if ib_connections == 1: ib_channel = c.rank % 2 else: ib_channel = ch - c = c.send(scatter_rank, buffer=buffer_key, ch=ib_channel) + c = c.copy(scatter_rank, buffer=buffer_key, ch=ib_channel) # Local scatter cs = c.split(gpus_per_node * gpus_per_node) for i, c in enumerate(cs): # Access the chunk's destination rank and index to route it to its final place final_rank = c.get_dst_rank() index = c.get_dst_index() - c.send(final_rank, buffer=Buffer.output, index=index, ch=ch*2 + 1) \ No newline at end of file + c.copy(final_rank, buffer=Buffer.output, index=index, ch=ch*2 + 1) \ No newline at end of file diff --git a/sccl/programs/alltoall_a100_yifan.py b/sccl/programs/alltoall_a100_yifan.py old mode 100644 new mode 100755 index 1a79cac..4d30efd --- a/sccl/programs/alltoall_a100_yifan.py +++ b/sccl/programs/alltoall_a100_yifan.py @@ -11,7 +11,6 @@ def alltoall_hierarchical(num_nodes, gpus_per_node): for n1 in range(num_nodes): for r in range(1,num_nodes): n2 = (n1 + r) % num_nodes - # print(f"r {r} n1 {n1} n2 {n2}") # Gather all local chunks for the node neighbor for g1 in range(gpus_per_node): @@ -19,19 +18,19 @@ def alltoall_hierarchical(num_nodes, gpus_per_node): for g2 in range(gpus_per_node): rank2 = n1 * gpus_per_node + g2 - # chunk to send: g2 on n2 + # chunk to copy: g2 on n2 index = n2 * gpus_per_node + g2 c = chunk(rank1, Buffer.input, index) - c = c.send(rank2, f'send_{n2}') + c = c.copy(rank2, f'copy_{n2}') for r in range(1,num_nodes): n2 = (n1 + r) % num_nodes - # IB send + # IB copy for g1 in range(gpus_per_node): rank = n1 * gpus_per_node + g1 ib_peer = n2 * gpus_per_node + g1 - c = chunk(rank, f'send_{n2}', 0, 8) - c = c.send(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % 8)*2+(rank%2)+2) + c = chunk(rank, f'copy_{n2}', 0, 8) + c = c.copy(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % 8)*2+(rank%2)+2) # Handle local chunks within a node @@ -39,6 +38,4 @@ def alltoall_hierarchical(num_nodes, gpus_per_node): for g in range(gpus_per_node): index = (rank // gpus_per_node) * gpus_per_node + g c = chunk(rank, Buffer.input, index) - c.send(c.get_dst_rank(), Buffer.output, c.get_dst_index()) - - + c.copy(c.get_dst_rank(), Buffer.output, c.get_dst_index()) diff --git a/sccl/rounds_bound.py b/sccl/rounds_bound.py old mode 100644 new mode 100755 diff --git a/sccl/serialization.py b/sccl/serialization.py old mode 100644 new mode 100755 diff --git a/sccl/steps_bound.py b/sccl/steps_bound.py old mode 100644 new mode 100755 diff --git a/sccl/strategies.py b/sccl/strategies.py old mode 100644 new mode 100755 diff --git a/sccl/topologies/__init__.py b/sccl/topologies/__init__.py old mode 100644 new mode 100755 diff --git a/sccl/topologies/amd.py b/sccl/topologies/amd.py old mode 100644 new mode 100755 diff --git a/sccl/topologies/distributed.py b/sccl/topologies/distributed.py old mode 100644 new mode 100755 diff --git a/sccl/topologies/generic.py b/sccl/topologies/generic.py old mode 100644 new mode 100755 diff --git a/sccl/topologies/nvidia.py b/sccl/topologies/nvidia.py old mode 100644 new mode 100755 diff --git a/sccl/topologies/topology.py b/sccl/topologies/topology.py old mode 100644 new mode 100755 diff --git a/sccl/topologies/transformers.py b/sccl/topologies/transformers.py old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 33dcbfc..9bac3b3 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ 'lxml', 'humanfriendly', 'tabulate', + 'igraph' ], python_requires='>=3.6', ) diff --git a/tests/__init__.py b/tests/__init__.py 
old mode 100644 new mode 100755 diff --git a/tests/common.py b/tests/common.py old mode 100644 new mode 100755 diff --git a/tests/test_algorithm.py b/tests/test_algorithm.py old mode 100644 new mode 100755 diff --git a/tests/test_analyses.py b/tests/test_analyses.py old mode 100644 new mode 100755 diff --git a/tests/test_autosynth.py b/tests/test_autosynth.py old mode 100644 new mode 100755 diff --git a/tests/test_cli.py b/tests/test_cli.py old mode 100644 new mode 100755 diff --git a/tests/test_distributors.py b/tests/test_distributors.py old mode 100644 new mode 100755 diff --git a/tests/test_language.py b/tests/test_language.py old mode 100644 new mode 100755 index ffc852f..3125484 --- a/tests/test_language.py +++ b/tests/test_language.py @@ -4,6 +4,7 @@ import sccl from sccl.topologies import line, fully_connected from sccl.language import * +from sccl.language.routines import * from sccl.language.collectives import * import os import pytest @@ -56,9 +57,9 @@ def init_buffers(self): # Final state rank2 has a fully reduced chunk from gpus 0, 1, and 2 def check(self, prog): - expected_chunk = ReduceChunk([]) + expected_chunk = ReduceChunk(-1, []) for r in range(self.num_ranks): - expected_chunk = expected_chunk.reduce(Chunk(r, 0)) + expected_chunk = expected_chunk.reduce(-1, Chunk(r, 0)) correct = True chunk = prog.buffers[2][Buffer.input][0] @@ -75,7 +76,7 @@ def test_send(): instances = 1 collective = Send(num_gpus, chunksperloop, inplace=False) with SCCLProgram("send", topology, collective, instances): - chunk(0, Buffer.input, 0).send(1, 'scratch').send(2, Buffer.output, 0) + chunk(0, Buffer.input, 0).copy(1, 'scratch').copy(2, Buffer.output, 0) assert Check() def test_reduce(): @@ -86,7 +87,8 @@ def test_reduce(): instances = 1 collective = Reduce(num_gpus, chunksperloop, inplace=True) with SCCLProgram("reduce", topology, collective, instances): - chunk(0, Buffer.input, 0).reduce(1, Buffer.input, 0).reduce(2, Buffer.input, 0) + c10 = chunk(1, Buffer.input, 0).reduce(chunk(0, Buffer.input, 0)) + chunk(2, Buffer.input, 0).reduce(c10) assert Check() def test_local_copy(): @@ -97,7 +99,7 @@ def test_local_copy(): instances = 1 collective = Send(num_gpus, chunksperloop, inplace=False) with SCCLProgram("cpy", topology, collective, instances): - chunk(0, Buffer.input, 0).send(2, 'scratch').send(2, Buffer.output, 0) + chunk(0, Buffer.input, 0).copy(2, 'scratch').copy(2, Buffer.output, 0) assert Check() def test_local_reduce(): @@ -108,8 +110,9 @@ def test_local_reduce(): instances = 1 collective = Reduce(num_gpus, chunksperloop, inplace=True) with SCCLProgram("local-reduce", topology, collective, instances): - chunk(0, Buffer.input, 0).reduce(1, Buffer.input, 0).send(2, 'scratch', 0).reduce(2, Buffer.input, 0) - + c = chunk(1, Buffer.input, 0).reduce(chunk(0, Buffer.input, 0)) + c = c.copy(2, 'scratch', 0) + chunk(2, Buffer.input, 0).reduce(c) XML() assert Check() @@ -121,27 +124,46 @@ def test_scratch_buffers(): instances = 1 collective = AllReduce(num_gpus, chunksperloop, inplace=False) with SCCLProgram("test", topology, collective, instances): - chunk(0, Buffer.input, 0).send(2, 'scratch', 2) + chunk(0, Buffer.input, 0).copy(2, 'scratch', 2) c = chunk(2, 'scratch', 2) assert c.index == 2 - c = chunk(1, Buffer.input, 0).send(2, 'scratch') + c = chunk(1, Buffer.input, 0).copy(2, 'scratch') assert c.index == 3 XML() +def test_program_order(): + num_gpus = 2 + topology = fully_connected(num_gpus) + + chunksperloop = num_gpus + instances = 1 + collective = AllReduce(num_gpus, chunksperloop, 
inplace=False) + prgm = SCCLProgram("test", topology, collective, instances) + with prgm: + chunk(1, Buffer.input, 0).copy(0, 'sc', 1) + # This send should depend on the send above finishing + chunk(0, Buffer.input, 0).copy(1, Buffer.input, 0) + slot = (1, Buffer.input, 0) + prgm.lower() + op = prgm.instr_dag.operations[slot] + assert op.inst == Instruction.start + assert op.next[0].inst == Instruction.send + assert op.next[0].next[0].inst == Instruction.recv + def test_allgather(): topology = fully_connected(2) collective = AllGather(2, 1, True) with SCCLProgram("allgather", topology, collective, 1): - chunk(0, Buffer.input, 0).send(1, Buffer.output, 0) - chunk(1, Buffer.input, 0).send(0, Buffer.output, 1) + chunk(0, Buffer.input, 0).copy(1, Buffer.output, 0) + chunk(1, Buffer.input, 0).copy(0, Buffer.output, 1) assert Check() def test_reducescatter(): topology = fully_connected(2) collective = ReduceScatter(2, 1, True) with SCCLProgram("reducescatter", topology, collective, 1): - chunk(0, Buffer.input, 1).reduce(1, Buffer.input, 1) - chunk(1, Buffer.input, 0).reduce(0, Buffer.input, 0) + chunk(1, Buffer.input, 1).reduce(chunk(0, Buffer.input, 1)) + chunk(0, Buffer.input, 0).reduce(chunk(1, Buffer.input, 0)) assert Check() @@ -149,18 +171,18 @@ def test_alltoall(): topology = fully_connected(2) collective = AllToAll(2, 1, False) with SCCLProgram("alltoall", topology, collective, 1): - chunk(0, Buffer.input, 0).send(0, Buffer.output, 0) - chunk(0, Buffer.input, 1).send(1, Buffer.output, 0) - chunk(1, Buffer.input, 0).send(0, Buffer.output, 1) - chunk(1, Buffer.input, 1).send(1, Buffer.output, 1) + chunk(0, Buffer.input, 0).copy(0, Buffer.output, 0) + chunk(0, Buffer.input, 1).copy(1, Buffer.output, 0) + chunk(1, Buffer.input, 0).copy(0, Buffer.output, 1) + chunk(1, Buffer.input, 1).copy(1, Buffer.output, 1) assert Check() def test_allreduce(): topology = fully_connected(2) collective = AllReduce(2, 2, True) with SCCLProgram("allreduce", topology, collective, 1): - chunk(0, Buffer.input, 0).reduce(1, Buffer.output, 0).send(0, Buffer.input, 0) - chunk(1, Buffer.input, 1).reduce(0, Buffer.input, 1).send(1, Buffer.input, 1) + chunk(1, Buffer.output, 0).reduce(chunk(0, Buffer.input, 0)).copy(0, Buffer.input, 0) + chunk(0, Buffer.input, 1).reduce(chunk(1, Buffer.input, 1)).copy(1, Buffer.input, 1) assert Check() def test_instruction_fusion(): @@ -168,8 +190,9 @@ def test_instruction_fusion(): collective = AllReduce(3, 3, True) prgm = SCCLProgram("allreduce", topology, collective, 1, threadblock_policy=ThreadblockPolicy.manual) with prgm: - c = chunk(0, Buffer.input, 0, 3).reduce(1, Buffer.input, 0,sendtb=0, recvtb=0).reduce(2, Buffer.input, 0, sendtb=0, recvtb=0) - c.send(0, Buffer.input, 0, sendtb=0, recvtb=0).send(1, Buffer.input, 0, sendtb=0, recvtb=0) + c01 = chunk(1, Buffer.input, 0, 3).reduce(chunk(0, Buffer.input, 0, 3), sendtb=0, recvtb=0, ch=0) + c012 = chunk(2, Buffer.input, 0, 3).reduce(c01, sendtb=0, recvtb=0, ch=0) + c012.copy(0, Buffer.input, 0, sendtb=0, recvtb=0, ch=0).copy(1, Buffer.input, 0, sendtb=0, recvtb=0, ch=0) assert Check() lowered_prgm = prgm.lower() assert lowered_prgm.gpus[0].threadblocks[0].ops[0].inst == Instruction.send @@ -183,18 +206,18 @@ def test_replication(): collective = AllToAll(2, 1, False) prgm = SCCLProgram("alltoall", topology, collective, 1) with prgm: - chunk(0, Buffer.input, 0).send(0, Buffer.output, 0) - chunk(0, Buffer.input, 1).send(1, Buffer.output, 0) - chunk(1, Buffer.input, 0).send(0, Buffer.output, 1) - chunk(1, Buffer.input, 1).send(1, 
Buffer.output, 1) + chunk(0, Buffer.input, 0).copy(0, Buffer.output, 0) + chunk(0, Buffer.input, 1).copy(1, Buffer.output, 0) + chunk(1, Buffer.input, 0).copy(0, Buffer.output, 1) + chunk(1, Buffer.input, 1).copy(1, Buffer.output, 1) instances = 2 replicated_prgm = SCCLProgram("alltoall", topology, collective, instances) with replicated_prgm: - chunk(0, Buffer.input, 0).send(0, Buffer.output, 0) - chunk(0, Buffer.input, 1).send(1, Buffer.output, 0) - chunk(1, Buffer.input, 0).send(0, Buffer.output, 1) - chunk(1, Buffer.input, 1).send(1, Buffer.output, 1) + chunk(0, Buffer.input, 0).copy(0, Buffer.output, 0) + chunk(0, Buffer.input, 1).copy(1, Buffer.output, 0) + chunk(1, Buffer.input, 0).copy(0, Buffer.output, 1) + chunk(1, Buffer.input, 1).copy(1, Buffer.output, 1) lowered_prgm = prgm.lower() lowered_replicated_prgm = replicated_prgm.lower() @@ -210,8 +233,8 @@ def test_illegal_tb_assignment(): with prgm: with pytest.raises(Exception): # Cannot send to two different gpus on the same threadblock - chunk(0, Buffer.input, 0).send(1, Buffer.output, 0, sendtb=0, recvtb=1) - chunk(0, Buffer.input, 1).send(2, Buffer.output, 0, sendtb=0, recvtb=1) + chunk(0, Buffer.input, 0).copy(1, Buffer.output, 0, sendtb=0, recvtb=1) + chunk(0, Buffer.input, 1).copy(2, Buffer.output, 0, sendtb=0, recvtb=1) XML() def test_registered_alltoall_yifan(): @@ -251,3 +274,55 @@ def test_registered_allreduce(): allreduce_ring(num_ranks, num_ranks) assert Check() XML() + +def test_routines_allgather_ring_inplace(): + size = 4 + topology = fully_connected(size) + collective = AllGather(size, 1, True) + with SCCLProgram("allgather_ring", topology, collective, 1): + allgather_ring_inplace(size) + assert Check() + +def test_routines_allgather_ring_nodes(): + size = 8 + topology = fully_connected(size) + collective = AllGather(size, 1, True) + with SCCLProgram("allgather_multi", topology, collective, 1): + # Two parallel rings [0-4] and [4-8] + allgather_ring_inplace(4, 0, 0) + allgather_ring_inplace(4, 4, 4) + # Exchange between peers (0,4) (1,5) etc. + for r in range(0,8): + peer = (r+4)%size + exchange_index = 0 if r < 4 else 4 + chunk(r, Buffer.output, exchange_index, 4).copy(peer, Buffer.output, exchange_index) + assert Check() + +def test_routines_allreduce_ring_inplace(): + size = 4 + topology = fully_connected(size) + collective = AllReduce(size, size, True) + with SCCLProgram("allreduce_ring", topology, collective, 1): + allreduce_ring_inplace(size) + assert Check() + +def test_routines_allreduce_nodes(): + size = 8 + topology = fully_connected(size) + collective = AllReduce(size, size, True) + with SCCLProgram("allreduce_multi", topology, collective, 1): + # Two parallel rings [0-4] and [4-8] + allreduce_ring_inplace(4, 0, 0) + allreduce_ring_inplace(4, 0, 4, ch=1) + + allreduce_ring_inplace(4, 4, 4) + allreduce_ring_inplace(4, 4, 0, ch=1) + # Reduction between peers (0,4) (1,5) etc. 
+ for r in range(0,8): + peer = (r+4)%size + exchange_index = 0 if r < 4 else 4 + c = chunk(peer, Buffer.output, exchange_index, 4) + c.reduce(chunk(r, Buffer.output, exchange_index, 4)) + c = c.copy(r, Buffer.output, exchange_index) + XML() + assert Check() \ No newline at end of file diff --git a/tests/test_path_encoding.py b/tests/test_path_encoding.py old mode 100644 new mode 100755 diff --git a/tests/test_serialization.py b/tests/test_serialization.py old mode 100644 new mode 100755 diff --git a/tests/test_topologies.py b/tests/test_topologies.py old mode 100644 new mode 100755 From e46d5a79c603d4394366ee54aba5c53322c7aaa2 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Mon, 23 May 2022 10:30:38 -0700 Subject: [PATCH 107/135] Rename to MSCCL (#30) --- .github/workflows/tests.yaml | 2 +- .gitignore | 6 +- README.md | 79 ++++++++-------- SYNTHESIS.md | 46 +++++----- dockerfiles/Dockerfile | 16 ++-- examples/dgx1_allgather.ipynb | 16 ++-- .../allgather_recursive_doubling.py | 8 +- .../{scclang => mscclang}/allgather_ring.py | 8 +- .../allreduce_a100_allpairs.py | 8 +- ...lreduce_a100_recursive_doubling_halving.py | 8 +- .../allreduce_a100_ring.py | 8 +- .../allreduce_binomial_tree.py | 8 +- .../{scclang => mscclang}/allreduce_dgx1.py | 10 +- .../{scclang => mscclang}/allreduce_ndv2.py | 12 +-- .../allreduce_recursive_doubling_halving.py | 8 +- .../{scclang => mscclang}/alltoall_a100.py | 8 +- .../alltoall_a100_yifan.py | 8 +- .../alltoall_allpairs.py | 8 +- .../alltonext_backward.py | 10 +- .../alltonext_forward.py | 10 +- .../{scclang => mscclang}/reducegather.py | 8 +- .../simple/allgather_ring.py | 10 +- .../simple/allreduce_ring.py | 10 +- .../simple/custom_collective.py | 18 ++-- examples/requirements_sccl_init.txt | 2 +- examples/sccl_init.py | 54 +++++------ examples/send.py | 6 +- examples/unpermute_dgx1.py | 6 +- msccl/__init__.py | 6 ++ {sccl => msccl}/__main__.py | 10 +- {sccl => msccl}/algorithm.py | 0 {sccl => msccl}/autosynth/__init__.py | 46 +++++----- msccl/autosynth/msccl_ndv2_launcher.sh | 8 ++ {sccl => msccl}/autosynth/ndv2_plans.py | 12 +-- msccl/autosynth/ndv4_plans.py | 28 ++++++ {sccl => msccl}/autosynth/registry.py | 14 +-- {sccl => msccl}/cli/__init__.py | 0 {sccl => msccl}/cli/analyze.py | 4 +- {sccl => msccl}/cli/common.py | 22 ++--- {sccl => msccl}/cli/distribute.py | 4 +- {sccl => msccl}/cli/known_collectives.py | 6 +- .../cli/known_distributed_topologies.py | 2 +- {sccl => msccl}/cli/known_topologies.py | 6 +- {sccl => msccl}/cli/known_transformers.py | 2 +- {sccl => msccl}/cli/ncclize.py | 4 +- {sccl => msccl}/cli/plans.py | 2 +- {sccl => msccl}/cli/solve.py | 4 +- {sccl => msccl}/collectives.py | 0 {sccl => msccl}/distributors/__init__.py | 0 .../distributors/alltoall_subproblem.py | 8 +- .../distributors/gather_scatter_alltoall.py | 8 +- .../distributors/greedy_alltoall.py | 6 +- {sccl => msccl}/instance.py | 0 {sccl => msccl}/isomorphisms.py | 6 +- {sccl => msccl}/language/__init__.py | 24 ++--- {sccl => msccl}/language/buffer.py | 0 {sccl => msccl}/language/chunk.py | 2 +- {sccl => msccl}/language/collectives.py | 4 +- {sccl => msccl}/language/ir.py | 0 {sccl => msccl}/language/passes.py | 4 +- {sccl => msccl}/language/rank_dag.py | 4 +- {sccl => msccl}/language/routines.py | 6 +- {sccl => msccl}/language/tb_assignment.py | 4 +- {sccl => msccl}/language/visualize.py | 4 +- {sccl => msccl}/ncclize.py | 2 +- {sccl => msccl}/ncd_reduction.py | 6 +- {sccl => msccl}/path_encoding.py | 4 +- {sccl => msccl}/programs/__init__.py | 0 
.../programs/allreduce_a100_ring.py | 6 +- .../programs/alltoall_a100_8kp1.py | 6 +- .../programs/alltoall_a100_yifan.py | 6 +- {sccl => msccl}/rounds_bound.py | 4 +- {sccl => msccl}/serialization.py | 52 +++++------ {sccl => msccl}/steps_bound.py | 0 {sccl => msccl}/strategies.py | 8 +- {sccl => msccl}/topologies/__init__.py | 0 {sccl => msccl}/topologies/amd.py | 0 {sccl => msccl}/topologies/distributed.py | 0 {sccl => msccl}/topologies/generic.py | 0 {sccl => msccl}/topologies/nvidia.py | 0 {sccl => msccl}/topologies/topology.py | 0 {sccl => msccl}/topologies/transformers.py | 0 pytest.ini | 2 +- sccl/__init__.py | 6 -- sccl/autosynth/ndv4_plans.py | 28 ------ sccl/autosynth/sccl_ndv2_launcher.sh | 8 -- setup.py | 6 +- tests/common.py | 2 +- tests/test_algorithm.py | 6 +- tests/test_analyses.py | 6 +- tests/test_autosynth.py | 28 +++--- tests/test_cli.py | 92 +++++++++---------- tests/test_distributors.py | 10 +- tests/test_language.py | 58 ++++++------ tests/test_path_encoding.py | 8 +- tests/test_serialization.py | 12 +-- tests/test_topologies.py | 2 +- 97 files changed, 517 insertions(+), 512 deletions(-) rename examples/{scclang => mscclang}/allgather_recursive_doubling.py (82%) rename examples/{scclang => mscclang}/allgather_ring.py (87%) rename examples/{scclang => mscclang}/allreduce_a100_allpairs.py (90%) rename examples/{scclang => mscclang}/allreduce_a100_recursive_doubling_halving.py (93%) rename examples/{scclang => mscclang}/allreduce_a100_ring.py (90%) rename examples/{scclang => mscclang}/allreduce_binomial_tree.py (92%) rename examples/{scclang => mscclang}/allreduce_dgx1.py (89%) rename examples/{scclang => mscclang}/allreduce_ndv2.py (86%) rename examples/{scclang => mscclang}/allreduce_recursive_doubling_halving.py (88%) rename examples/{scclang => mscclang}/alltoall_a100.py (97%) rename examples/{scclang => mscclang}/alltoall_a100_yifan.py (91%) rename examples/{scclang => mscclang}/alltoall_allpairs.py (79%) rename examples/{scclang => mscclang}/alltonext_backward.py (93%) rename examples/{scclang => mscclang}/alltonext_forward.py (93%) rename examples/{scclang => mscclang}/reducegather.py (93%) rename examples/{scclang => mscclang}/simple/allgather_ring.py (87%) rename examples/{scclang => mscclang}/simple/allreduce_ring.py (83%) rename examples/{scclang => mscclang}/simple/custom_collective.py (84%) create mode 100755 msccl/__init__.py rename {sccl => msccl}/__main__.py (80%) rename {sccl => msccl}/algorithm.py (100%) rename {sccl => msccl}/autosynth/__init__.py (85%) create mode 100755 msccl/autosynth/msccl_ndv2_launcher.sh rename {sccl => msccl}/autosynth/ndv2_plans.py (64%) create mode 100755 msccl/autosynth/ndv4_plans.py rename {sccl => msccl}/autosynth/registry.py (87%) rename {sccl => msccl}/cli/__init__.py (100%) rename {sccl => msccl}/cli/analyze.py (93%) rename {sccl => msccl}/cli/common.py (91%) rename {sccl => msccl}/cli/distribute.py (97%) rename {sccl => msccl}/cli/known_collectives.py (95%) rename {sccl => msccl}/cli/known_distributed_topologies.py (97%) rename {sccl => msccl}/cli/known_topologies.py (95%) rename {sccl => msccl}/cli/known_transformers.py (94%) rename {sccl => msccl}/cli/ncclize.py (96%) rename {sccl => msccl}/cli/plans.py (94%) rename {sccl => msccl}/cli/solve.py (97%) rename {sccl => msccl}/collectives.py (100%) rename {sccl => msccl}/distributors/__init__.py (100%) rename {sccl => msccl}/distributors/alltoall_subproblem.py (98%) rename {sccl => msccl}/distributors/gather_scatter_alltoall.py (98%) rename {sccl => 
msccl}/distributors/greedy_alltoall.py (98%) rename {sccl => msccl}/instance.py (100%) rename {sccl => msccl}/isomorphisms.py (90%) rename {sccl => msccl}/language/__init__.py (97%) rename {sccl => msccl}/language/buffer.py (100%) rename {sccl => msccl}/language/chunk.py (98%) rename {sccl => msccl}/language/collectives.py (99%) rename {sccl => msccl}/language/ir.py (100%) rename {sccl => msccl}/language/passes.py (97%) rename {sccl => msccl}/language/rank_dag.py (99%) rename {sccl => msccl}/language/routines.py (91%) rename {sccl => msccl}/language/tb_assignment.py (99%) rename {sccl => msccl}/language/visualize.py (98%) rename {sccl => msccl}/ncclize.py (99%) rename {sccl => msccl}/ncd_reduction.py (95%) rename {sccl => msccl}/path_encoding.py (99%) rename {sccl => msccl}/programs/__init__.py (100%) rename {sccl => msccl}/programs/allreduce_a100_ring.py (91%) rename {sccl => msccl}/programs/alltoall_a100_8kp1.py (97%) rename {sccl => msccl}/programs/alltoall_a100_yifan.py (92%) rename {sccl => msccl}/rounds_bound.py (96%) rename {sccl => msccl}/serialization.py (74%) rename {sccl => msccl}/steps_bound.py (100%) rename {sccl => msccl}/strategies.py (97%) rename {sccl => msccl}/topologies/__init__.py (100%) rename {sccl => msccl}/topologies/amd.py (100%) rename {sccl => msccl}/topologies/distributed.py (100%) rename {sccl => msccl}/topologies/generic.py (100%) rename {sccl => msccl}/topologies/nvidia.py (100%) rename {sccl => msccl}/topologies/topology.py (100%) rename {sccl => msccl}/topologies/transformers.py (100%) delete mode 100755 sccl/__init__.py delete mode 100755 sccl/autosynth/ndv4_plans.py delete mode 100755 sccl/autosynth/sccl_ndv2_launcher.sh diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 999c966..97e9dea 100755 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -21,7 +21,7 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - name: Install sccl and dependencies + - name: Install msccl and dependencies run: | pip install --upgrade pip pip install -r requirements.txt diff --git a/.gitignore b/.gitignore index 7bc2779..21f3b4c 100755 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ -# SCCL specific -*.sccl.json -*.sccl.xml +# MSCCL specific +*.msccl.json +*.msccl.xml # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index 5074370..9c1a118 100755 --- a/README.md +++ b/README.md @@ -1,13 +1,16 @@ -# SCCL +# MSCCL-tools -SCCL is a tool stack for programmable communication on GPUs. Algorithms created with SCCL can: +This repo contains the developer tool stack of the [Microsoft Collective Communication Library +(MSCCL)](https://github.com/microsoft/msccl), a platform for programmable communication on GPUs. Algorithms created with +MSCCL can: - Implement either MPI-style collectives like Allreduce, or any application specific communication pattern. - Target specific hardware and interconnect topologies, unlocking their full potential. - Optimize for the data sizes in your application, making the best tradeoff between latency and bandwidth utilization. -SCCL ships with algorithms targeting various Azure multi-GPU VM types. See the [Available Algorithms section](#available-algorithms) to find out what is currently available. +MSCCL-tools also contains pre-made algorithms targeting various Azure multi-GPU VM types. See the [Available Algorithms +section](#available-algorithms) to find out what is currently available. 
-SCCL has two ways of creating new algorithms: +MSCCL has two ways of creating new algorithms: 1. MSCCLang, a high-level DSL that talks about communication in an intuitive chunk-oriented form. See the [MSCCLang section](#mscclang) for how to get started. 2. Synthesis, which automatically solves optimal algorithms for a given hardware topology. Making synthesis general @@ -16,26 +19,26 @@ introduction. ## Usage -The SCCL Python package ships with a registry of synthesis strategies and hand optimized algorithms. These can be loaded -into [the runtime](https://github.com/parasailteam/msccl) through the `sccl.init` function, which must be called before -the application creates its NCCL communicator. For PyTorch this means before `torch.distributed` is initialized. +The MSCCL Python package ships with a registry of synthesis strategies and hand optimized algorithms. These can be +loaded into [the runtime](https://github.com/parasailteam/msccl) through the `msccl.init` function, which must be called +before the application creates its NCCL communicator. For PyTorch this means before `torch.distributed` is initialized. -The following snippet requests `sccl.init` to provide an Alltoall algorithm in a configuration of 2 Azure NDv2 machines: +The following snippet requests `msccl.init` to provide an Alltoall algorithm in a configuration of 2 Azure NDv2 machines: ``` -import sccl -sccl.init('ndv2', 2, (sccl.Collective.alltoall, ('1MB'))) +import msccl +msccl.init('ndv2', 2, (msccl.Collective.alltoall, ('1MB'))) ``` This will find an algorithm provider that can create an Alltoall algorithm that is expected to be good with 1MB of data. -That will call a synthesis routine that writes the algorithm to disk. `sccl.init` will then pass a configuration file +That will call a synthesis routine that writes the algorithm to disk. `msccl.init` will then pass a configuration file pointing to this algorithm to the runtime through environment variables. -See [the examples](examples/sccl_init.py) for more on `sccl.init` usage. +See [the examples](examples/msccl_init.py) for more on `msccl.init` usage. ## Available Algorithms -SCCL's built-in algorithms are registered for combinations of hardware configuration and size of input data where we -have benchmarked them to provide speedup over NCCL. To list the algorithms currently in SCCL's built-in registry, run -`sccl plans list` on the command line. This will print out the following table (on 4/22/2022): +MSCCL's built-in algorithms are registered for combinations of hardware configuration and size of input data where we +have benchmarked them to provide speedup over NCCL. To list the algorithms currently in MSCCL's built-in registry, run +`msccl plans list` on the command line. This will print out the following table (on 4/22/2022): | Machine | Collective | # machines | From | To | Protocol | Priority | Plan name | |-----------|--------------|--------------|--------|----------|------------|------------|-------------------------------------| @@ -51,20 +54,23 @@ Each line lists an algorithm registration and the conditions under which it is t - there are 8, 16, 32 or 64 Azure NDv4 machines, and - the data size is from 1 MB to 32 MB. -The repository [parasailteam/sccl-presynth](https://github.com/parasailteam/sccl-presynth) repository offers additional algorithms that have been -pre-synthesized for fixed configurations. To enable them install the package and import it before the call to -`sccl.init`. 
+The [parasailteam/msccl-presynth](https://github.com/parasailteam/msccl-presynth) repository offers +additional algorithms that have been pre-synthesized for fixed configurations. To enable them, install the package and +import it before the call to `msccl.init`. ## MSCCLang -MSCCLang is a high-level language for specifying collective communication algorithms in an intuitive chunk-oriented form. The language is available as a Python-integrated DSL. +MSCCLang is a high-level language for specifying collective communication algorithms in an intuitive chunk-oriented +form. The language is available as a Python-integrated DSL. -The language is still under development and lacks comprehensive documentation. For now, please refer to [the pre-print of our upcoming paper](https://arxiv.org/pdf/2201.11840.pdf) and the examples in [examples/scclang](examples/scclang/). +The language is still under development and lacks comprehensive documentation. For now, please refer to [the pre-print +of our upcoming paper](https://arxiv.org/pdf/2201.11840.pdf) and the examples in +[examples/mscclang](examples/mscclang/). ## Synthesis -SCCL started out as a synthesizer for collective algorithms, and general synthesis of collective algorithms is an -on-going research project. See [this readme](SYNTHESIS.md) for using SCCL as a synthesizer. +MSCCL started out as a synthesizer for collective algorithms, and general synthesis of collective algorithms is an +on-going research project. See [this readme](SYNTHESIS.md) for using MSCCL as a synthesizer. ## Installation @@ -72,28 +78,27 @@ on-going research project. See [this readme](SYNTHESIS.md) for using SCCL as a s To install, either clone this repo and run "`pip install .`" or run: ``` -pip install git+https://github.com/microsoft/sccl.git +pip install git+https://github.com/microsoft/msccl.git ``` -Installing the SCCL Python package also installs the `sccl` command line tool. To enable Bash completion for the `sccl` -tool: +Installing the MSCCL Python package also installs the `msccl` command line tool. To enable Bash completion for the +`msccl` tool: ``` -echo 'eval "$(register-python-argcomplete sccl)"' >> ~/.bashrc +echo 'eval "$(register-python-argcomplete msccl)"' >> ~/.bashrc ``` ### Runtime Installation -SCCL's algorithms are executed by the [Microsoft Collective Communication Library -(MSCCL)](https://github.com/microsoft/msccl), which is API compatible with NCCL. See https://github.com/microsoft/msccl -for instructions. +Algorithms are executed by the [Microsoft Collective Communication Library (MSCCL)](https://github.com/microsoft/msccl), +which is API compatible with NCCL. See https://github.com/microsoft/msccl for instructions. -To use SCCL with PyTorch, the built in NCCL submodule has to be replaced with SCCL's version. Additionally, to expose -the new native Alltoall support that SCCL adds, PyTorch's `torch.distributed` package can optionally be patched. The -following commands perform these steps and install PyTorch with SCCL: +To use MSCCL with PyTorch, the built-in NCCL submodule has to be replaced with MSCCL's version. Additionally, to expose +the new native Alltoall support that MSCCL adds, PyTorch's `torch.distributed` package can optionally be patched. 
The +following commands perform these steps and install PyTorch with MSCCL: ``` git clone https://github.com/pytorch/pytorch.git cd pytorch     -git checkout tags/v1.9.0 -b v1.9.0_sccl +git checkout tags/v1.9.0 -b v1.9.0_msccl perl -p -i -e  's/url = https:\/\/github\.com\/NVIDIA\/nccl/url = https:\/\/github\.com\/microsoft\/msccl/g' .gitmodules git submodule sync third_party/nccl git submodule update --init --recursive @@ -105,10 +110,10 @@ python setup.py install ### Note on Azure NDv2 Azure NDv2 does not expose the true PCIe topology of the machines to the VM and worse, does not assign PCIe devices -consistently to the virtual paths in the VM. As SCCL is generating topology-aware algorithms, this device ordering must -be fixed. The [sccl_ndv2_launcher.sh](sccl/autosynth/sccl_ndv2_launcher.sh) script can be used to fix this problem. The -script solves the automorphisms from the local VM's NVLink topology to the reference topology and selects one of the 4 -automorphisms based on measured placement of the Infiniband card such that GPU 0 is close to the NIC. A tool called +consistently to the virtual paths in the VM. As MSCCL is generating topology-aware algorithms, this device ordering must +be fixed. The [msccl_ndv2_launcher.sh](msccl/autosynth/msccl_ndv2_launcher.sh) script can be used to fix this problem. +The script solves the automorphisms from the local VM's NVLink topology to the reference topology and selects one of the +4 automorphisms based on measured placement of the Infiniband card such that GPU 0 is close to the NIC. A tool called [inspector-topo](https://github.com/microsoft/inspector-topo) needs to be available for the latter step. ## Contributing diff --git a/SYNTHESIS.md b/SYNTHESIS.md index 9b1b2e6..9f920ee 100755 --- a/SYNTHESIS.md +++ b/SYNTHESIS.md @@ -1,27 +1,27 @@ ## Synthesizing Algorithms -SCCL can synthesize algorithms for a given *topology* that implements a given *collective* in a given number of steps, bandwidth usage, memory limits, etc. These additional parameters are called the *instance*. +MSCCL can synthesize algorithms for a given *topology* that implements a given *collective* in a given number of steps, bandwidth usage, memory limits, etc. These additional parameters are called the *instance*. -SCCL groups its solver strategies under the `sccl solve` subcommand. For example, to synthesize a specific `instance` of an Allgather algorithm for the [NVIDIA DGX-1](https://www.nvidia.com/en-us/data-center/dgx-1/) that completes in 4 steps: +MSCCL groups its solver strategies under the `msccl solve` subcommand. For example, to synthesize a specific `instance` of an Allgather algorithm for the [NVIDIA DGX-1](https://www.nvidia.com/en-us/data-center/dgx-1/) that completes in 4 steps: ``` -$ sccl solve instance DGX1 Allgather --steps 4 +$ msccl solve instance DGX1 Allgather --steps 4 Solving instance steps=4... synthesized! (0.7s) -Wrote to Allgather.n8-DGX1-steps4.sccl.json +Wrote to Allgather.n8-DGX1-steps4.msccl.json ``` -The instance is satisfiable and `sccl` saves it to a file. +The instance is satisfiable and `msccl` saves it to a file. Four steps is not necessarily the least number of steps required. To find the least steps: ``` -$ sccl solve least-steps DGX1 Allgather +$ msccl solve least-steps DGX1 Allgather Algorithms need at least 2 steps. Solving instance steps=2... synthesized! 
(0.2s) -Wrote to Allgather.n8-DGX1-steps2.sccl.json +Wrote to Allgather.n8-DGX1-steps2.msccl.json ``` The `least-steps` strategy statically determines that any Allgather in a DGX-1 requires at least 2 steps and starting from that finds the smallest satisfiable number of steps. While this two step algorithm is a latency-optimal one, there may be other algorithms that achieve higher bandwidth. The `pareto-optimal` strategy searches through different latency-bandwidth tradeoffs: ``` -$ sccl solve pareto-optimal DGX1 Allgather +$ msccl solve pareto-optimal DGX1 Allgather Algorithms need at least 2 steps. Algorithms need at least 7/6 rounds per chunk. Solving instance steps=2... synthesized! (0.5s) @@ -34,13 +34,13 @@ Solving instance steps=3,rounds=6,chunks=5... synthesized! (44.0s) Solving instance steps=3,rounds=7,chunks=6... synthesized! (56.1s) Bandwidth optimal algorithm found! Found 2 Pareto optimal algorithms. Pruned 4 non-optimal algorithms. -Wrote to Allgather.n8-DGX1-steps2.rounds3.chunks2.sccl.json -Wrote to Allgather.n8-DGX1-steps3.rounds7.chunks6.sccl.json +Wrote to Allgather.n8-DGX1-steps2.rounds3.chunks2.msccl.json +Wrote to Allgather.n8-DGX1-steps3.rounds7.chunks6.msccl.json ``` ## Collectives -SCCL includes a number of built in common collectives. +MSCCL includes a number of built in common collectives. | Collective | Arguments | Description | Kind | | - | - | - | - | @@ -60,16 +60,16 @@ SCCL includes a number of built in common collectives. Custom collectives may be defined by instantiating the `Collective` class, which is easiest through the `build_collective` function. For example, a send from rank 2 to rank 7 in an 8 node topology can be defined and saved with: ``` -from sccl.collectives import build_collective -from sccl.serialization import save_sccl_object +from msccl.collectives import build_collective +from msccl.serialization import save_msccl_object precondition = lambda r, c: r == 2 postcondition = lambda r, c: r == 7 coll = build_collective('Send', 8, 1, precondition, postcondition) -save_sccl_object(coll, 'send.json') +save_msccl_object(coll, 'send.json') ``` -The *kind* of the collective determines support for some features of SCCL: +The *kind* of the collective determines support for some features of MSCCL: - **NC** are non-combining collectives, and are always supported. - **CR** are combining collectives that have a non-combining dual collective, and are supported through a reduction. - **CNR** are combining collectives with no dual, which may not always be supported. @@ -78,25 +78,25 @@ Currently the rounds per chunk analysis described below can not support CNR coll ## Steps and Rounds -SCCL uses two related concepts, *steps and rounds*, to talk about the running time of algorithms. *Steps* is how many sequential sets of sends the algorithm consists of, where all sends inside a step execute in parallel. The number of sends between two nodes in a single step is limited by the bandwidth available in the topology. However, a step may consist of multiple *rounds*, which acts as a multiplier for all links in the topology during that step. +MSCCL uses two related concepts, *steps and rounds*, to talk about the running time of algorithms. *Steps* is how many sequential sets of sends the algorithm consists of, where all sends inside a step execute in parallel. The number of sends between two nodes in a single step is limited by the bandwidth available in the topology. 
However, a step may consist of multiple *rounds*, which act as a multiplier for all links in the topology during that step. How much data a single round corresponds to depends on the actual size of a chunk at runtime, and how many chunks a collective uses can change (e.g. you can control this directly in the `instance` strategy by setting `--chunks N`). Thus for each collective the total data usage of different algorithms implementing it can be measured with their *rounds per chunk*. For example, the three-step DGX-1 Allgather found by `pareto-optimal` above uses 7 rounds to move 6 chunks, i.e. 7/6 rounds per chunk. -SCCL provides a standalone analysis to find a lower bound for the *rounds per chunk* required by any instance. For example, to find the least rouds per chunk for an Alltoall in a DGX-1: +MSCCL provides a standalone analysis to find a lower bound for the *rounds per chunk* required by any instance. For example, to find the least rounds per chunk for a Gather in a DGX-1: ``` -$ sccl analyze rounds DGX1 Gather +$ msccl analyze rounds DGX1 Gather Gather(n=8,root=0) algorithms need at least 7/6 rounds in DGX1 topology. ``` In this case the bound happens to be tight and the `pareto-optimal` strategy would use it to detect that it has found a bandwidth optimal algorithm. ## Distributed Algorithms -SCCL provides routines to synthesize algorithms for distributed topologies under the `sccl distribute` subcommand. These work by using algorithms for a local collective and stitcing instances of it together to create a distributed one. +MSCCL provides routines to synthesize algorithms for distributed topologies under the `msccl distribute` subcommand. These work by using algorithms for a local collective and stitching instances of it together to create a distributed one. **Alltoall from Gather and Scatter:** `alltoall-gather-scatter` combines a Gather and a Scatter algorithm with a transpose step in the middle to form a distributed Alltoall algorithm. For example, an Alltoall algorithm for a cluster of 4 DGX-1 machines can be created with: ``` -sccl solve least-steps DGX1 Gather -o gather.json -sccl solve least-steps DGX1 Scatter -o scatter.json --root 1 -sccl distribute alltoall-gather-scatter gather.json scatter.json --copies 4 -o alltoall.json +msccl solve least-steps DGX1 Gather -o gather.json +msccl solve least-steps DGX1 Scatter -o scatter.json --root 1 +msccl distribute alltoall-gather-scatter gather.json scatter.json --copies 4 -o alltoall.json ``` -This distributor works with any Gather and Scatter algorithm, as long as their roots have a direct connection in the topology. SCCL also provides multi-root versions of Gather and Scatter that can be substituted here. +This distributor works with any Gather and Scatter algorithm, as long as their roots have a direct connection in the topology. MSCCL also provides multi-root versions of Gather and Scatter that can be substituted here. 
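+ +The distributed algorithm can then be converted into an XML file for the MSCCL runtime with the `ncclize` subcommand. A minimal sketch, assuming the default options (see `msccl ncclize --help` for the exact flags): + +``` +msccl ncclize alltoall.json +```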
diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index 1464325..21ee9be 100755 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -50,10 +50,10 @@ RUN cd ${STAGE_DIR} && mkdir openmpi/ && cd openmpi && wget https://www.open-mpi rm -rf ${STAGE_DIR}/openmpi/ ############################################################################## -# SCCL +# MSCCL ############################################################################## -# update NCCL in pytorch, install SCCL interpreter +# update NCCL in pytorch, install MSCCL interpreter RUN pip uninstall torch -y RUN pip install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses @@ -62,11 +62,11 @@ RUN conda install -c pytorch magma-cuda111 -y ENV CMAKE_PREFIX_PATH=/opt/conda -# Change NCCL to SCCL Runtime +# Change NCCL to MSCCL Runtime RUN cd ${STAGE_DIR} && \ git clone https://github.com/pytorch/pytorch.git && \ cd pytorch && \ - git checkout tags/v1.9.0 -b v1.9.0_sccl && \ + git checkout tags/v1.9.0 -b v1.9.0_msccl && \ perl -p -i -e 's/url = https:\/\/github\.com\/NVIDIA\/nccl/url = https:\/\/github\.com\/microsoft\/msccl/g' .gitmodules && \ git submodule sync third_party/nccl && \ git submodule update --init --recursive && \ @@ -79,12 +79,12 @@ RUN cd ${STAGE_DIR} && \ cd ${STAGE_DIR} && \ rm -rf ${STAGE_DIR}/pytorch -# Install SCCL +# Install MSCCL RUN cd ${STAGE_DIR}/ && \ - git clone https://github.com/microsoft/sccl.git && \ - cd sccl/ && python setup.py install && \ + git clone https://github.com/microsoft/msccl.git && \ + cd msccl/ && python setup.py install && \ cd ${STAGE_DIR} && \ - rm -rf ${STAGE_DIR}/sccl/ + rm -rf ${STAGE_DIR}/msccl/ ############################################################################## # inspector-topo diff --git a/examples/dgx1_allgather.ipynb b/examples/dgx1_allgather.ipynb index 5a23691..61d7363 100644 --- a/examples/dgx1_allgather.ipynb +++ b/examples/dgx1_allgather.ipynb @@ -28,7 +28,7 @@ } ], "source": [ - "from sccl.topologies import dgx1\n", + "from msccl.topologies import dgx1\n", "from pprint import pprint\n", "topology = dgx1()\n", "pprint(topology.links)" ] }, @@ -47,7 +47,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sccl.collectives import allgather\n", + "from msccl.collectives import allgather\n", "collective = allgather(topology.num_nodes())" ] }, @@ -117,7 +117,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Lets try to actually solve this for a specific number of steps. `sccl.strategies` offers entry points into the solver. We'll use one that just does a single solver call for now. The encoding itself lives in [path_encoding.py](../sccl/path_encoding.py). As expected, 1 step is not enough, because some ranks aren't directly connected." + "Let's try to actually solve this for a specific number of steps. `msccl.strategies` offers entry points into the solver. We'll use one that just does a single solver call for now. The encoding itself lives in [path_encoding.py](../msccl/path_encoding.py). As expected, 1 step is not enough, because some ranks aren't directly connected."
] }, { @@ -134,8 +134,8 @@ } ], "source": [ - "from sccl.strategies import solve_instance\n", - "from sccl.instance import Instance\n", + "from msccl.strategies import solve_instance\n", + "from msccl.instance import Instance\n", "algo = solve_instance(topology, collective, Instance(steps=1), logging=True)" ] }, @@ -271,7 +271,7 @@ } ], "source": [ - "from sccl.strategies import solve_all_latency_bandwidth_tradeoffs\n", + "from msccl.strategies import solve_all_latency_bandwidth_tradeoffs\n", "algos = list(solve_all_latency_bandwidth_tradeoffs(topology, collective, logging=True))" ] }, @@ -281,7 +281,7 @@ "source": [ "Two preprocessing steps are performed:\n", "- The minimum number of steps required is lower-bounded based on the maximum of the shortest paths for each chunk considering the topology.\n", - "- A minimum number of rounds per chunk is lower bound using a kind of multi-commodity flow encoding in [rounds_bound.py](../sccl/rounds_bound.py).\n", + "- A minimum number of rounds per chunk is lower-bounded using a kind of multi-commodity flow encoding in [rounds_bound.py](../msccl/rounds_bound.py).\n", "\n", "Then all relevant trade-offs are iterated until a bandwidth optimal algorithm is found (if the rounds per chunk lower bound happens to be exact).\n", "\n", @@ -294,7 +294,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sccl.strategies import prune_pareto_optimal\n", + "from msccl.strategies import prune_pareto_optimal\n", "algos = prune_pareto_optimal(algos)" ] }, diff --git a/examples/scclang/allgather_recursive_doubling.py b/examples/mscclang/allgather_recursive_doubling.py similarity index 82% rename from examples/scclang/allgather_recursive_doubling.py rename to examples/mscclang/allgather_recursive_doubling.py index 5cd4f2e..a00074c 100755 --- a/examples/scclang/allgather_recursive_doubling.py +++ b/examples/mscclang/allgather_recursive_doubling.py @@ -2,15 +2,15 @@ # Licensed under the MIT License. import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllGather +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllGather # https://web.cels.anl.gov/~thakur/papers/mpi-coll.pdf def allgather_recursive_doubling(size, instances, protocol): topology = fully_connected(size) collective = AllGather(size, instances, True) - with SCCLProgram("allgather_recursive_doubling", topology, collective, 1, protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): + with MSCCLProgram("allgather_recursive_doubling", topology, collective, 1, protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): count = 1 while count < size: # Every rank exchanges count chunks with neighbor count away diff --git a/examples/scclang/allgather_ring.py b/examples/mscclang/allgather_ring.py similarity index 87% rename from examples/scclang/allgather_ring.py rename to examples/mscclang/allgather_ring.py index 8a6aaa0..3067bab 100755 --- a/examples/scclang/allgather_ring.py +++ b/examples/mscclang/allgather_ring.py @@ -2,9 +2,9 @@ # Licensed under the MIT License. import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllGather +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllGather # Ring allgather for A100s # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. 
@@ -13,7 +13,7 @@ def allgather_ring(size, channels, instances, protocol): topology = fully_connected(size) collective = AllGather(size, 1, True) - with SCCLProgram(f"allgather_ring_{channels}channelsperring", topology, collective, instances, + with MSCCLProgram(f"allgather_ring_{channels}channelsperring", topology, collective, instances, protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): for step in range(0, size-1): for index in range(0, size): diff --git a/examples/scclang/allreduce_a100_allpairs.py b/examples/mscclang/allreduce_a100_allpairs.py similarity index 90% rename from examples/scclang/allreduce_a100_allpairs.py rename to examples/mscclang/allreduce_a100_allpairs.py index 97fe9d5..e6ec80d 100755 --- a/examples/scclang/allreduce_a100_allpairs.py +++ b/examples/mscclang/allreduce_a100_allpairs.py @@ -2,16 +2,16 @@ # Licensed under the MIT License. import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllReduce +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllReduce def allreduce_allpairs(gpus, instances, protocol): size = gpus chunksperloop = gpus * gpus topology = fully_connected(size) collective = AllReduce(size, chunksperloop, True) - with SCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, + with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): # Each rank sends the nth chunk to the nth rank into scratch space diff --git a/examples/scclang/allreduce_a100_recursive_doubling_halving.py b/examples/mscclang/allreduce_a100_recursive_doubling_halving.py similarity index 93% rename from examples/scclang/allreduce_a100_recursive_doubling_halving.py rename to examples/mscclang/allreduce_a100_recursive_doubling_halving.py index 81e4cb7..67a06eb 100755 --- a/examples/scclang/allreduce_a100_recursive_doubling_halving.py +++ b/examples/mscclang/allreduce_a100_recursive_doubling_halving.py @@ -5,9 +5,9 @@ import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllReduce +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllReduce def allreduce(ways, instances, protocol): @@ -15,7 +15,7 @@ def allreduce(ways, instances, protocol): size = topology.num_nodes() # Number of gpus logical_chunk = 8 * ways collective = AllReduce(size, logical_chunk, True) - with SCCLProgram("allreduce_a100_recursive_doubling_halving", topology, collective, instances, protocol, interleaved_replication=False): + with MSCCLProgram("allreduce_a100_recursive_doubling_halving", topology, collective, instances, protocol, interleaved_replication=False): # 1 reduction between pairs of gpus of count def recursive_doubling(pairs, count, next_index, lc, sendtb, recvtb): current_index = next_index.copy() diff --git a/examples/scclang/allreduce_a100_ring.py b/examples/mscclang/allreduce_a100_ring.py similarity index 90% rename from examples/scclang/allreduce_a100_ring.py rename to examples/mscclang/allreduce_a100_ring.py index 3906a99..6c5886f 100755 --- a/examples/scclang/allreduce_a100_ring.py +++ b/examples/mscclang/allreduce_a100_ring.py @@ -2,9 +2,9 @@ # Licensed under the MIT License. 
import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllReduce +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllReduce # Ring all reduce for A100s # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. @@ -13,7 +13,7 @@ def allreduce_ring(size, instances, channels, protocol): topology = fully_connected(size) collective = AllReduce(size, size, True) - with SCCLProgram(f"allreduce_ring_{channels}channelsperring", topology, collective, instances, + with MSCCLProgram(f"allreduce_ring_{channels}channelsperring", topology, collective, instances, protocol=protocol, threadblock_policy=ThreadblockPolicy.manual): # Reduce ring for step in range(0, size-1): diff --git a/examples/scclang/allreduce_binomial_tree.py b/examples/mscclang/allreduce_binomial_tree.py similarity index 92% rename from examples/scclang/allreduce_binomial_tree.py rename to examples/mscclang/allreduce_binomial_tree.py index 8abe9e4..c8740e4 100755 --- a/examples/scclang/allreduce_binomial_tree.py +++ b/examples/mscclang/allreduce_binomial_tree.py @@ -2,16 +2,16 @@ # Licensed under the MIT License. import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllReduce +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllReduce # Binomial tree and mirrored binomial tree # Mirrored trees adopted from: http://algo2.iti.kit.edu/documents/2tree.pdf def allreduce_binomial_tree(size, instances, trees, protocol): topology = fully_connected(size) collective = AllReduce(size, trees, True) - with SCCLProgram("allreduce_binomial_tree", topology, collective, instances, protocol=protocol): + with MSCCLProgram("allreduce_binomial_tree", topology, collective, instances, protocol=protocol): distance = 1 # Reduce tree - reducing onto Rank 0 while distance <= size // 2: diff --git a/examples/scclang/allreduce_dgx1.py b/examples/mscclang/allreduce_dgx1.py similarity index 89% rename from examples/scclang/allreduce_dgx1.py rename to examples/mscclang/allreduce_dgx1.py index 79157bd..5a31708 100755 --- a/examples/scclang/allreduce_dgx1.py +++ b/examples/mscclang/allreduce_dgx1.py @@ -3,10 +3,10 @@ import argparse -from sccl.language import * -from sccl.topologies.distributed import * -from sccl.topologies.nvidia import * -from sccl.language.collectives import AllReduce +from msccl.language import * +from msccl.topologies.distributed import * +from msccl.topologies.nvidia import * +from msccl.language.collectives import AllReduce def allreduce(num_nodes, instances): local_topology = dgx1() @@ -20,7 +20,7 @@ def allreduce(num_nodes, instances): def rank(n, g): return local_ring_order[g] + n * num_local_gpus - with SCCLProgram("allreduce_ring_dgx1", topology, collective, 1): + with MSCCLProgram("allreduce_ring_dgx1", topology, collective, 1): # Chunks travel around local rings being reduced (local_gpus-1 hops), starting at local gpu 1. # At the end, the fully reduced chunk ends up on local gpu 0 of each node diff --git a/examples/scclang/allreduce_ndv2.py b/examples/mscclang/allreduce_ndv2.py similarity index 86% rename from examples/scclang/allreduce_ndv2.py rename to examples/mscclang/allreduce_ndv2.py index 12ba399..703c98c 100755 --- a/examples/scclang/allreduce_ndv2.py +++ b/examples/mscclang/allreduce_ndv2.py @@ -2,11 +2,11 @@ # Licensed under the MIT License.
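The binomial reduce tree in allreduce_binomial_tree above follows the classic distance-doubling pattern. A hedged standalone sketch of that schedule (the tb/channel assignments of the real program are omitted):

def binomial_reduce_schedule(size):
    # At distance d = 1, 2, 4, ..., every rank at an odd multiple of d
    # sends its partial result to rank - d; rank 0 ends with the full sum.
    sends, distance = [], 1
    while distance <= size // 2:
        for r in range(distance, size, 2 * distance):
            sends.append((r, r - distance))       # (source, destination)
        distance *= 2
    return sends

print(binomial_reduce_schedule(8))
# [(1, 0), (3, 2), (5, 4), (7, 6), (2, 0), (6, 4), (4, 0)]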
import argparse -from sccl.language import * -from sccl.topologies.distributed import * -from sccl.topologies.nvidia import * -from sccl.topologies import * -from sccl.language.collectives import AllReduce +from msccl.language import * +from msccl.topologies.distributed import * +from msccl.topologies.nvidia import * +from msccl.topologies import * +from msccl.language.collectives import AllReduce def allreduce(instances): size = 8 @@ -15,7 +15,7 @@ def allreduce(instances): # size = topology.num_nodes() # Number of gpus logical_chunk = size collective = AllReduce(size, logical_chunk, True) - with SCCLProgram("allreduce_ndv2", topology, collective, instances, interleaved_replication=False): + with MSCCLProgram("allreduce_ndv2", topology, collective, instances, interleaved_replication=False): # local reduce_scatter instances = 1 for lc in range(num_local_gpus//2): diff --git a/examples/scclang/allreduce_recursive_doubling_halving.py b/examples/mscclang/allreduce_recursive_doubling_halving.py similarity index 88% rename from examples/scclang/allreduce_recursive_doubling_halving.py rename to examples/mscclang/allreduce_recursive_doubling_halving.py index dde2cb4..f955c0c 100755 --- a/examples/scclang/allreduce_recursive_doubling_halving.py +++ b/examples/mscclang/allreduce_recursive_doubling_halving.py @@ -4,9 +4,9 @@ import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllReduce +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllReduce def reduce_scatter_vector_halving_distance_doubling(size): count = size // 2 @@ -31,7 +31,7 @@ def allreduce(size, instances, protocol): topology = fully_connected(size) logical_chunk = size collective = AllReduce(size, logical_chunk, True) - with SCCLProgram("allreduce_recursive_doubling_halving", topology, collective, instances, protocol, + with MSCCLProgram("allreduce_recursive_doubling_halving", topology, collective, instances, protocol, interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual): reduce_scatter_vector_halving_distance_doubling(size) allgather_recursive_vector_doubling_distance_halving(size) diff --git a/examples/scclang/alltoall_a100.py b/examples/mscclang/alltoall_a100.py similarity index 97% rename from examples/scclang/alltoall_a100.py rename to examples/mscclang/alltoall_a100.py index 27a615e..549e65c 100755 --- a/examples/scclang/alltoall_a100.py +++ b/examples/mscclang/alltoall_a100.py @@ -8,9 +8,9 @@ import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllToAll +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllToAll def alltoall_hierarchical(num_nodes, gpus_per_node, instances, ib_connections): num_ranks = num_nodes * gpus_per_node @@ -41,7 +41,7 @@ def AddChunk(ib_chunks, key, c): topology = fully_connected(num_ranks) collective = AllToAll(num_ranks, instances, inplace=False) - with SCCLProgram("hierarchical_all_to_all", topology, collective, 1): + with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): ib_chunks = {} # Keeps track of chunks going over IB buffer buffer name -> chunk # Local Gathers diff --git a/examples/scclang/alltoall_a100_yifan.py b/examples/mscclang/alltoall_a100_yifan.py similarity index 91% rename from examples/scclang/alltoall_a100_yifan.py rename to examples/mscclang/alltoall_a100_yifan.py index 3d94484..0a1921a 100755 --- 
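allreduce_recursive_doubling_halving above chains reduce_scatter_vector_halving_distance_doubling with allgather_recursive_vector_doubling_distance_halving. A hedged standalone sketch of just the round structure those names describe (the chunk bookkeeping of the real program is omitted):

def rdh_rounds(size):
    # Assumes size is a power of two; at every round each rank pairs with
    # rank ^ distance and works on `count` chunks.
    rounds = []
    count, distance = size // 2, 1
    while distance < size:                        # reduce-scatter phase
        rounds.append(('reduce-scatter', distance, count))
        count, distance = count // 2, distance * 2
    count, distance = 1, size // 2
    while distance >= 1:                          # allgather phase
        rounds.append(('allgather', distance, count))
        count, distance = count * 2, distance // 2
    return rounds

print(rdh_rounds(8))
# [('reduce-scatter', 1, 4), ('reduce-scatter', 2, 2), ('reduce-scatter', 4, 1),
#  ('allgather', 4, 1), ('allgather', 2, 2), ('allgather', 1, 4)]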
a/examples/scclang/alltoall_a100_yifan.py +++ b/examples/mscclang/alltoall_a100_yifan.py @@ -1,8 +1,8 @@ import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllToAll +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllToAll def alltoall_hierarchical(num_nodes, gpus_per_node, protocol): @@ -11,7 +11,7 @@ def alltoall_hierarchical(num_nodes, gpus_per_node, protocol): collective = AllToAll(num_ranks, 1, inplace=False) - with SCCLProgram("hierarchical_all_to_all", topology, collective, 1, protocol=protocol): + with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1, protocol=protocol): for n1 in range(num_nodes): for r in range(1,num_nodes): n2 = (n1 + r) % num_nodes diff --git a/examples/scclang/alltoall_allpairs.py b/examples/mscclang/alltoall_allpairs.py similarity index 79% rename from examples/scclang/alltoall_allpairs.py rename to examples/mscclang/alltoall_allpairs.py index 3a93797..89c6a64 100755 --- a/examples/scclang/alltoall_allpairs.py +++ b/examples/mscclang/alltoall_allpairs.py @@ -1,8 +1,8 @@ import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllToAll +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllToAll # One-step AllToAll program # Each gpu sends and receives a chunk from every other gpu def alltoall(num_ranks, instances, protocol): topology = fully_connected(num_ranks) collective = AllToAll(num_ranks, 1, inplace=False) - with SCCLProgram("alltoall_allpairs", topology, collective, instances=instances, protocol=protocol): + with MSCCLProgram("alltoall_allpairs", topology, collective, instances=instances, protocol=protocol): for r in range(num_ranks): for index in range(num_ranks): chunk(r, Buffer.input, index).copy(index, Buffer.output, r) diff --git a/examples/scclang/alltonext_backward.py b/examples/mscclang/alltonext_backward.py similarity index 93% rename from examples/scclang/alltonext_backward.py rename to examples/mscclang/alltonext_backward.py index f56b56d..5cb34da 100755 --- a/examples/scclang/alltonext_backward.py +++ b/examples/mscclang/alltonext_backward.py @@ -3,10 +3,10 @@ import argparse -from sccl.language import * -from sccl.topologies.distributed import * -from sccl.topologies import * -from sccl.language.collectives import Collective +from msccl.language import * +from msccl.topologies.distributed import * +from msccl.topologies import * +from msccl.language.collectives import Collective class Pipeline(Collective): def init_buffers(self): @@ -49,7 +49,7 @@ def pipeline(num_nodes, instances): def rank(node, local_rank): return node * num_local_gpus + local_rank - with SCCLProgram("alltonext-backwards", topology, collective, instances): + with MSCCLProgram("alltonext-backwards", topology, collective, instances): for n in range(num_nodes): for g in range(num_local_gpus): diff --git a/examples/scclang/alltonext_forward.py b/examples/mscclang/alltonext_forward.py similarity index 93% rename from examples/scclang/alltonext_forward.py rename to examples/mscclang/alltonext_forward.py index 3579dd4..fe48770 100755 --- a/examples/scclang/alltonext_forward.py +++ b/examples/mscclang/alltonext_forward.py @@ -3,10 +3,10 @@ import argparse -from sccl.language import * -from sccl.topologies.distributed import * -from sccl.topologies import * -from sccl.language.collectives import Collective +from
msccl.language import * +from msccl.topologies.distributed import * +from msccl.topologies import * +from msccl.language.collectives import Collective class Pipeline(Collective): def init_buffers(self): @@ -51,7 +51,7 @@ def pipeline(num_nodes, instances): def rank(node, local_rank): return node * num_local_gpus + local_rank - with SCCLProgram("alltonext-forward", topology, collective, instances): + with MSCCLProgram("alltonext-forward", topology, collective, instances): for n in range(num_nodes): for g in range(num_local_gpus): diff --git a/examples/scclang/reducegather.py b/examples/mscclang/reducegather.py similarity index 93% rename from examples/scclang/reducegather.py rename to examples/mscclang/reducegather.py index a1001b8..cde2bef 100755 --- a/examples/scclang/reducegather.py +++ b/examples/mscclang/reducegather.py @@ -3,9 +3,9 @@ import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import Collective +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import Collective class ReduceGather(Collective): def __init__(self, num_ranks, chunk_factor, inplace, groups): @@ -57,7 +57,7 @@ def program(num_ranks, groups, instances, protocol): inplace = False collective = ReduceGather(num_ranks, chunk_factor, inplace, groups) - with SCCLProgram("reduce-gather", topology, collective, instances, protocol, threadblock_policy=ThreadblockPolicy.manual): + with MSCCLProgram("reduce-gather", topology, collective, instances, protocol, threadblock_policy=ThreadblockPolicy.manual): # Per group reduce scatter for y in range(groups): diff --git a/examples/scclang/simple/allgather_ring.py b/examples/mscclang/simple/allgather_ring.py similarity index 87% rename from examples/scclang/simple/allgather_ring.py rename to examples/mscclang/simple/allgather_ring.py index 6d82031..caea712 100755 --- a/examples/scclang/simple/allgather_ring.py +++ b/examples/mscclang/simple/allgather_ring.py @@ -1,14 +1,14 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
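Looking back at alltoall_allpairs above, the single-step program is a transpose of the (rank, index) chunk grid, which its chunk(r, Buffer.input, index).copy(index, Buffer.output, r) line spells out pairwise. A plain-Python model:

def alltoall(inputs):
    size = len(inputs)
    # Output slot `src` on rank `dst` receives input slot `dst` of rank `src`.
    return [[inputs[src][dst] for src in range(size)] for dst in range(size)]

inputs = [[(src, dst) for dst in range(4)] for src in range(4)]
outputs = alltoall(inputs)
assert outputs[2][1] == (1, 2)   # rank 2, slot 1 came from rank 1's slot 2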
-from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllGather +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllGather def allgather_ring(size): topology = fully_connected(size) collective = AllGather(size, 1, False) - with SCCLProgram("allgather_ring", topology, collective, 1): + with MSCCLProgram("allgather_ring", topology, collective, 1): # Loop over each chunk's root for r in range(size): # Get the chunk at rank r, input[r] @@ -29,7 +29,7 @@ def allgather_ring(size): def allgather_ring_inplace(size): topology = fully_connected(size) collective = AllGather(size, 1, True) - with SCCLProgram("allgather_ring", topology, collective, 1): + with MSCCLProgram("allgather_ring", topology, collective, 1): # Loop over each chunk's root for r in range(size): # Get the chunk at rank r, input[r] diff --git a/examples/scclang/simple/allreduce_ring.py b/examples/mscclang/simple/allreduce_ring.py similarity index 83% rename from examples/scclang/simple/allreduce_ring.py rename to examples/mscclang/simple/allreduce_ring.py index 7dc2566..067bb3b 100755 --- a/examples/scclang/simple/allreduce_ring.py +++ b/examples/mscclang/simple/allreduce_ring.py @@ -3,10 +3,10 @@ import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.collectives import * -from sccl.language.collectives import AllReduce +from msccl.language import * +from msccl.topologies import * +from msccl.collectives import * +from msccl.language.collectives import AllReduce def allreduce_ring(size, instances): @@ -14,7 +14,7 @@ def allreduce_ring(size, instances): topology = fully_connected(size) collective = AllReduce(size, size, inplace=True) - with SCCLProgram("allreduce_ring_inplace", topology, collective, instances): + with MSCCLProgram("allreduce_ring_inplace", topology, collective, instances): for r in range(size): index = r # (rank, buffer, index) diff --git a/examples/scclang/simple/custom_collective.py b/examples/mscclang/simple/custom_collective.py similarity index 84% rename from examples/scclang/simple/custom_collective.py rename to examples/mscclang/simple/custom_collective.py index 3c64a8d..5feb0bc 100755 --- a/examples/scclang/simple/custom_collective.py +++ b/examples/mscclang/simple/custom_collective.py @@ -3,12 +3,12 @@ # Example of a simple custom collective where Rank 0 sends a chunk to Ranks 1 and 2 -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import Collective +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import Collective # For custom collectives you need to define a new collective class -# this is used by scclang to initialize buffers with chunks (pre-condition) +# this is used by mscclang to initialize buffers with chunks (pre-condition) # and provide a checker to check that chunks satisfy the post-condition of the collective. 
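The in-place ring allreduce in simple/allreduce_ring.py above composes a reduce ring with a gather ring. A standalone numeric simulation (plain lists and sequential loops standing in for parallel GPU sends, not the MSCCLang schedule itself):

def ring_allreduce(values):
    size = len(values)
    buf = [list(v) for v in values]               # buf[rank][chunk]
    for step in range(size - 1):                  # reduce phase
        for r in range(size):
            idx = (r - step) % size               # partial sum rank r holds
            buf[(r + 1) % size][idx] += buf[r][idx]
    for step in range(size - 1):                  # gather phase
        for r in range(size):
            idx = (r + 1 - step) % size           # finished chunk to forward
            buf[(r + 1) % size][idx] = buf[r][idx]
    return buf

out = ring_allreduce([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
assert all(row == [28, 32, 36, 40] for row in out)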
class CollEx(Collective): # Initial state is chunk0 is on rank0 in the input buffer @@ -45,19 +45,19 @@ def check(self, prog): def custom_example1(): - # SCCLang programs take in a name for hte program, the topology of the network, + # MSCCLang programs take in a name for the program, the topology of the network, # collective being implemented, chunksperloop of the collective, and optionally the NCCL protocol to be used size = 3 topology = fully_connected(size) # Collectives take in number of ranks in the network, chunksperloop of the collective, whether it is inplace, collective = CollEx(size, 1, inplace=False) - with SCCLProgram("allgather_ring", topology, collective, instances=1, protocol="Simple"): + with MSCCLProgram("allgather_ring", topology, collective, instances=1, protocol="Simple"): # Get the chunk at rank 0 index 0 of the input buffer c = chunk(0, Buffer.input, 0) # Send chunks to 1 and 2 # Can specify the sender's tb, receiver's tb, and channel for the send operation - # SCCLang provides a default threadblock assignment if they aren't specified - # SCCLang will also check the tb/channel combos are valid + # MSCCLang provides a default threadblock assignment if they aren't specified + # MSCCLang will also check the tb/channel combos are valid c.copy(1, buffer=Buffer.output, index=0, sendtb=1, recvtb=1, ch=0) c.copy(2, buffer=Buffer.output, index=0, sendtb=2, recvtb=1, ch=1) @@ -70,7 +70,7 @@ def custom_example2(): topology = fully_connected(size) collective = CollEx(size, 1, inplace=False) - with SCCLProgram("allgather_ring", topology, collective, instances=1, protocol="Simple"): + with MSCCLProgram("allgather_ring", topology, collective, instances=1, protocol="Simple"): c = chunk(0, Buffer.input, 0) # This is the same program as above but instead of rank 0 sending to 1 and 2 # 0 sends to 1 which sends to 2 diff --git a/examples/requirements_sccl_init.txt b/examples/requirements_sccl_init.txt index c26d6b4..129d716 100755 --- a/examples/requirements_sccl_init.txt +++ b/examples/requirements_sccl_init.txt @@ -1 +1 @@ -git+https://github.com/parasailteam/sccl-presynth \ No newline at end of file +git+https://github.com/parasailteam/msccl-presynth \ No newline at end of file diff --git a/examples/sccl_init.py b/examples/sccl_init.py index 44ca7ad..ffc0bc5 100755 --- a/examples/sccl_init.py +++ b/examples/sccl_init.py @@ -4,85 +4,85 @@ import os def show(): - if 'SCCL_CONFIG' in os.environ: + if 'MSCCL_CONFIG' in os.environ: print() - print(f"SCCL_CONFIG = {os.environ['SCCL_CONFIG']}") - print(f"Contents of {os.environ['SCCL_CONFIG']}:") - with open(os.environ['SCCL_CONFIG']) as f: + print(f"MSCCL_CONFIG = {os.environ['MSCCL_CONFIG']}") + print(f"Contents of {os.environ['MSCCL_CONFIG']}:") + with open(os.environ['MSCCL_CONFIG']) as f: + print(f.read()) print() print('=== Trigger a builtin synthesis plan ===') -import sccl -sccl.init('ndv4', 9, (sccl.Collective.alltoall, '1GB')) +import msccl +msccl.init('ndv4', 9, (msccl.Collective.alltoall, '1GB')) show() print('=== Register additional plans from a library ===') -import sccl_presynth -sccl.init('ndv2', 3, - (sccl.Collective.alltoall, '1GB'), - (sccl.Collective.allgather, (128, '1KB'))) +import msccl_presynth +msccl.init('ndv2', 3, + (msccl.Collective.alltoall, '1GB'), + (msccl.Collective.allgather, (128, '1KB'))) show() print('=== Register custom plans ===') -from sccl.autosynth.registry import register_synthesis_plan -@register_synthesis_plan(sccl.Collective.alltoall,
'ndv9000', lambda m: m == 1, ('1MB', None)) +@register_synthesis_plan(msccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, ('1MB', None)) def alltoall_9000(machines): return """ ... """ -sccl.init('ndv9000', 1, (sccl.Collective.alltoall, '2MB')) +msccl.init('ndv9000', 1, (msccl.Collective.alltoall, '2MB')) show() print('=== Overlapping size ranges ===') -register_synthesis_plan(sccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, (0, '1KB'), protocol='LL')(alltoall_9000) -register_synthesis_plan(sccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, ('1KB', '1MB'), protocol='LL128')(alltoall_9000) +register_synthesis_plan(msccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, (0, '1KB'), protocol='LL')(alltoall_9000) +register_synthesis_plan(msccl.Collective.alltoall, 'ndv9000', lambda m: m == 1, ('1KB', '1MB'), protocol='LL128')(alltoall_9000) -sccl.init('ndv9000', 1, (sccl.Collective.alltoall, ('2KB', None))) +msccl.init('ndv9000', 1, (msccl.Collective.alltoall, ('2KB', None))) show() # TODO: Update the following programs to use the new syntax -# print('=== SCCLang program ===') +# print('=== MSCCLang program ===') -# from sccl.autosynth.registry import register_sccl_program -# from sccl.topologies import line -# from sccl.language import * +# from msccl.autosynth.registry import register_msccl_program +# from msccl.topologies import line +# from msccl.language import * -# @register_sccl_program(line(2), 'allgather', 'two_gpus', machines= lambda m: m == 1) +# @register_msccl_program(line(2), 'allgather', 'two_gpus', machines= lambda m: m == 1) # def trivial_allgather(prog, nodes): # chunk(Buffer.input, 0, 0).send(0, Buffer.output, 0).send(1) # chunk(Buffer.input, 1, 0).send(1, Buffer.output, 1).send(0) -# sccl.init('two_gpus', 1, (sccl.Collective.allgather, (0, None))) +# msccl.init('two_gpus', 1, (msccl.Collective.allgather, (0, None))) # show() -# print('=== SCCLang program example ====') +# print('=== MSCCLang program example ====') -# from sccl.topologies import fully_connected -# from sccl.programs.allreduce_a100_ring import allreduce_ring +# from msccl.topologies import fully_connected +# from msccl.programs.allreduce_a100_ring import allreduce_ring -# @register_sccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, +# @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, # instances=4, protocol='LL128', threadblock_policy=ThreadblockPolicy.manual, machines=lambda x: x == 1) # def ndv4_ring_allreduce(prog, nodes): # allreduce_ring(size=8, channels=8) -# sccl.init('ndv4', 1, (sccl.Collective.allreduce, (0, None))) +# msccl.init('ndv4', 1, (msccl.Collective.allreduce, (0, None))) # show() \ No newline at end of file diff --git a/examples/send.py b/examples/send.py index c4c1b46..fcb8090 100755 --- a/examples/send.py +++ b/examples/send.py @@ -3,10 +3,10 @@ # This script defines and saves a custom collective to send from rank 2 to rank 7 -from sccl.collectives import build_collective -from sccl.serialization import save_sccl_object +from msccl.collectives import build_collective +from msccl.serialization import save_msccl_object precondition = lambda r, c: r == 2 postcondition = lambda r, c: r == 7 coll = build_collective('Send', 8, 1, precondition, postcondition) -save_sccl_object(coll, 'send.json') +save_msccl_object(coll, 'send.json') diff --git a/examples/unpermute_dgx1.py b/examples/unpermute_dgx1.py index 74f2735..c0261f0 100755 --- a/examples/unpermute_dgx1.py +++ b/examples/unpermute_dgx1.py @@ -1,10 
+1,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -# This script shows how to use SCCL to find a way to permute the nodes of a DGX1 to match the default order. +# This script shows how to use MSCCL to find a way to permute the nodes of a DGX1 to match the default order. -from sccl.topologies import * -from sccl.isomorphisms import find_isomorphisms +from msccl.topologies import * +from msccl.isomorphisms import find_isomorphisms def solve_dgx1_permutation(): local = nvlink_only() diff --git a/msccl/__init__.py b/msccl/__init__.py new file mode 100755 index 0000000..68951dd --- /dev/null +++ b/msccl/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from msccl.autosynth import init, tabulate_plans, print_plans +from msccl.autosynth import ndv2_perm +from msccl.autosynth import Collective diff --git a/sccl/__main__.py b/msccl/__main__.py similarity index 80% rename from sccl/__main__.py rename to msccl/__main__.py index 34e2197..3551657 100755 --- a/sccl/__main__.py +++ b/msccl/__main__.py @@ -4,17 +4,17 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import sccl.collectives as collectives -import sccl.topologies as topologies -import sccl.strategies as strategies -from sccl.cli import * +import msccl.collectives as collectives +import msccl.topologies as topologies +import msccl.strategies as strategies +from msccl.cli import * import argparse import argcomplete import sys def main(): - parser = argparse.ArgumentParser('sccl') + parser = argparse.ArgumentParser('msccl') cmd_parsers = parser.add_subparsers(title='command', dest='command') cmd_parsers.required = True diff --git a/sccl/algorithm.py b/msccl/algorithm.py similarity index 100% rename from sccl/algorithm.py rename to msccl/algorithm.py diff --git a/sccl/autosynth/__init__.py b/msccl/autosynth/__init__.py similarity index 85% rename from sccl/autosynth/__init__.py rename to msccl/autosynth/__init__.py index 9952e64..7c11e24 100755 --- a/sccl/autosynth/__init__.py +++ b/msccl/autosynth/__init__.py @@ -1,9 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from sccl.topologies import dgx1, nvlink_only -from sccl.isomorphisms import find_isomorphisms -from sccl.autosynth.registry import synthesis_plans +from msccl.topologies import dgx1, nvlink_only +from msccl.isomorphisms import find_isomorphisms +from msccl.autosynth.registry import synthesis_plans from lxml import etree as ET import re import subprocess @@ -15,8 +15,8 @@ from tabulate import tabulate from enum import Enum -from sccl.autosynth.ndv2_plans import register_ndv2_plans -from sccl.autosynth.ndv4_plans import register_ndv4_plans +from msccl.autosynth.ndv2_plans import register_ndv2_plans +from msccl.autosynth.ndv4_plans import register_ndv4_plans register_ndv2_plans() register_ndv4_plans() @@ -58,8 +58,8 @@ def init(machine_type, num_machines, *collectives): if len(plans) > 0: selected_plans[name] = plans - # Execute the plans to find or synthesize the algorithms and format them in the XML format expected by SCCL-RT. - algos_elem = ET.Element('sccl_algos') + # Execute the plans to find or synthesize the algorithms and format them in the XML format expected by MSCCL-RT. 
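For orientation, the init flow above is driven exactly as examples/sccl_init.py shows: call msccl.init before the communication backend starts so the environment is in place when NCCL loads. A minimal usage sketch taken from that example:

import msccl
# Select or synthesize an alltoall plan for 9 ndv4 machines at 1GB.
msccl.init('ndv4', 9, (msccl.Collective.alltoall, '1GB'))
# From here on, MSCCL_CONFIG points at the generated XML and NCCL_ALGO
# includes 'MSCCL', so an MSCCL-enabled NCCL build will pick the plan up.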
+ algos_elem = ET.Element('msccl_algos') any_selected = False for collective_name, plans in selected_plans.items(): for plan, params in plans: @@ -82,25 +82,25 @@ def init(machine_type, num_machines, *collectives): # Set environment variables env = { - 'SCCL_CONFIG': path, + 'MSCCL_CONFIG': path, } if 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] != '': existing_algos = os.environ['NCCL_ALGO'] - if 'SCCL' not in existing_algos.split(','): - os.environ['NCCL_ALGO'] = 'SCCL,' + existing_algos + if 'MSCCL' not in existing_algos.split(','): + os.environ['NCCL_ALGO'] = 'MSCCL,' + existing_algos else: - env['NCCL_ALGO'] = 'SCCL,RING,TREE' + env['NCCL_ALGO'] = 'MSCCL,RING,TREE' if machine_type == 'ndv4' and num_machines >= 8 and 'alltoall' in selected_plans: - print(f'SCCL: Setting NCCL_IB_AR_THRESHOLD=0 (reason: alltoall and at least 16 ndv4 machines)') + print(f'MSCCL: Setting NCCL_IB_AR_THRESHOLD=0 (reason: alltoall and at least 8 ndv4 machines)') env['NCCL_IB_AR_THRESHOLD'] = '0' if machine_type == 'ndv4': - print(f'SCCL: Setting relaxed orderin, topo file and visible devices order') + print(f'MSCCL: Setting relaxed ordering, topo file and visible devices order') env['NCCL_IB_PCI_RELAXED_ORDERING'] = '1' env['NCCL_TOPO_FILE'] = '/opt/msft/topo.xml' env['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' os.environ.update(env) else: - print(f'SCCL: No algorithms were selected.') + print(f'MSCCL: No algorithms were selected.') def _format_size(size): @@ -152,10 +152,10 @@ def _select_plans(name, candidates, num_machines, sizes): sorted_candidates = sorted(candidates, key=_candidate_sort_key) description = f'{name} with sizes from {_format_size(isizes[0])} to {_format_size(isizes[1])}' if len(sorted_candidates) == 0: - print(f'SCCL: No plan found for {description}. Falling back to NCCL baseline.') + print(f'MSCCL: No plan found for {description}. Falling back to NCCL baseline.') else: desc, plan, _, _, proto, _ = sorted_candidates[-1] - print(f'SCCL: Plan for {description} is {desc} with {proto} protocol.') + print(f'MSCCL: Plan for {description} is {desc} with {proto} protocol.') if len(results) > 0 and plan == results[-1][0] and isizes[0] == results[-1][1][1] + 1 and proto == results[-1][1][2]: results[-1][1][1] = isizes[1] else: @@ -169,11 +169,11 @@ def _candidate_sort_key(candidate): def ndv2_perm(): # pragma: no cover - # This function is used in a hacky way right now. The sccl_ndv2_launcher.sh + # This function is used in a hacky way right now. The msccl_ndv2_launcher.sh # relies on the side effect of _select_isomorphism creating the lock file, # which is read by the script after calling this function, so the return # value doesn't currently get used. If you make changes, please fix or update - sccl_ndv2_launcher.sh accordingly. + msccl_ndv2_launcher.sh accordingly.
isomorphisms = find_isomorphisms(dgx1(), nvlink_only()) if len(isomorphisms) != 4: raise RuntimeError( @@ -182,7 +182,7 @@ def ndv2_perm(): # pragma: no cover def _select_isomorphism(isomorphisms, verbose=True): # pragma: no cover - with open('/var/lock/sccl_autosynth_inspector_topo.lock', "a+") as f: + with open('/var/lock/msccl_autosynth_inspector_topo.lock', "a+") as f: fcntl.lockf(f, fcntl.LOCK_EX) try: f.seek(0, 2) @@ -191,16 +191,16 @@ def _select_isomorphism(isomorphisms, verbose=True): # pragma: no cover f.seek(0) order = f.read() if verbose: - print(f'SCCL: Read IB placement from {f.name}') + print(f'MSCCL: Read IB placement from {f.name}') return order else: print( - 'SCCL: Running inspector-topo to find the IB placement. This will take a couple of minutes...') + 'MSCCL: Running inspector-topo to find the IB placement. This will take a couple of minutes...') env = os.environ.copy() env['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' topo_detect = subprocess.run( ['/usr/local/bin/inspector-topo'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - print('SCCL: Finished running inspector-topo. Finding the permutaion.') + print('MSCCL: Finished running inspector-topo. Finding the permutation.') if topo_detect.returncode != 0: raise RuntimeError( f'inspector-topo had a failure:\n{topo_detect.stdout}\n{topo_detect.stderr}') @@ -218,7 +218,7 @@ def _select_isomorphism(isomorphisms, verbose=True): # pragma: no cover f.write(order) f.flush() if verbose: - print(f'SCCL: Wrote IB placement to {f.name}') + print(f'MSCCL: Wrote IB placement to {f.name}') return order raise RuntimeError( f'expected an isomorphism to match our expectation but none of them did!') diff --git a/msccl/autosynth/msccl_ndv2_launcher.sh b/msccl/autosynth/msccl_ndv2_launcher.sh new file mode 100755 index 0000000..186087d --- /dev/null +++ b/msccl/autosynth/msccl_ndv2_launcher.sh @@ -0,0 +1,8 @@ +#!/bin/bash +python -c "import msccl; msccl.ndv2_perm()" +order=/var/lock/msccl_autosynth_inspector_topo.lock +if [ -f "$order" ]; then + export CUDA_VISIBLE_DEVICES=$( 0: - print('SCCL Warning: Topologies with switches are not supported. import sccl will be ignored.')
import msccl will be ignored.') return [] if limit != None and limit <= 0: - raise ValueError('SCCL error: limit was set improperly.') + raise ValueError('MSCCL error: limit was set improperly.') if topology.num_nodes() != target_topology.num_nodes(): - raise ValueError('SCCL error: target topology does not match with the given topology.') + raise ValueError('MSCCL error: target topology does not match with the given topology.') if logging: print(f'Encoding {topology.name} - {target_topology.name} isomorphisms to Z3') diff --git a/sccl/language/__init__.py b/msccl/language/__init__.py similarity index 97% rename from sccl/language/__init__.py rename to msccl/language/__init__.py index 57c19bf..7e3bf95 100755 --- a/sccl/language/__init__.py +++ b/msccl/language/__init__.py @@ -4,14 +4,14 @@ from dataclasses import dataclass from enum import Enum import functools -from sccl.language.ir import * -from sccl.language.passes import * -from sccl.language.tb_assignment import * -from sccl.language.chunk import * -from sccl.language.buffer import * -from sccl.language.rank_dag import * -import sccl.collectives as collectives -# from sccl.language.visualize import * +from msccl.language.ir import * +from msccl.language.passes import * +from msccl.language.tb_assignment import * +from msccl.language.chunk import * +from msccl.language.buffer import * +from msccl.language.rank_dag import * +import msccl.collectives as collectives +# from msccl.language.visualize import * _current_program = None def _curr(): @@ -20,7 +20,7 @@ def _curr(): raise RuntimeError("No Program in context") return _current_program -class SCCLProgram: +class MSCCLProgram: def __init__(self, name, topo, collective, instances, protocol='Simple', \ threadblock_policy=ThreadblockPolicy.auto, interleaved_replication=True, instr_fusion=True, check_xml=True, dependence_nop=False): @@ -52,7 +52,7 @@ def __init__(self, name, topo, collective, instances, protocol='Simple', \ def __enter__(self): global _current_program if _current_program != None: - raise RuntimeError("There is already a SCCL Program in context") + raise RuntimeError("There is already a MSCCL Program in context") _current_program = self def __exit__(self, exc_type, exc_value, exc_traceback): @@ -119,7 +119,7 @@ def lower(self): self.instr_dag.lower_pt1(self.instances) gpu_prgms = self.instr_dag.lower_pt2(self.instances, self.interleaved_replication) if self.check_xml: - # Check generated SCCL-IR for correctness - no circular dependencies, sends and receives are ordered + # Check generated MSCCL-IR for correctness - no circular dependencies, sends and receives are ordered # For very large programs, turn off check_xml when shipping check_dependency_cycles(self.instr_dag.tbs) check_threadblock_ordering(self.instr_dag) @@ -155,7 +155,7 @@ def Check(): @dataclass class Ref(ChunkRef): - prog: SCCLProgram + prog: MSCCLProgram def __repr__(self): return f'Ref(Buffer:{self.buffer}, Index:{self.index}, Size:{self.size}, Rank:{self.rank})' diff --git a/sccl/language/buffer.py b/msccl/language/buffer.py similarity index 100% rename from sccl/language/buffer.py rename to msccl/language/buffer.py diff --git a/sccl/language/chunk.py b/msccl/language/chunk.py similarity index 98% rename from sccl/language/chunk.py rename to msccl/language/chunk.py index 2d4458a..8ed5bc3 100755 --- a/sccl/language/chunk.py +++ b/msccl/language/chunk.py @@ -3,7 +3,7 @@ from dataclasses import dataclass -from sccl.language.ir import * +from msccl.language.ir import * @dataclass class Chunk: diff --git 
a/sccl/language/collectives.py b/msccl/language/collectives.py similarity index 99% rename from sccl/language/collectives.py rename to msccl/language/collectives.py index 8654665..ddbacaa 100755 --- a/sccl/language/collectives.py +++ b/msccl/language/collectives.py @@ -1,6 +1,6 @@ from dataclasses import dataclass, field -from sccl.language.ir import Buffer -from sccl.language import * +from msccl.language.ir import Buffer +from msccl.language import * class Collective(): def __init__(self, num_ranks, chunk_factor, inplace): diff --git a/sccl/language/ir.py b/msccl/language/ir.py similarity index 100% rename from sccl/language/ir.py rename to msccl/language/ir.py diff --git a/sccl/language/passes.py b/msccl/language/passes.py similarity index 97% rename from sccl/language/passes.py rename to msccl/language/passes.py index 6f71edd..07ee396 100755 --- a/sccl/language/passes.py +++ b/msccl/language/passes.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. import sys -from sccl.language.ir import * +from msccl.language.ir import * # Check that there are no cyclic dependencies within a Rank def check_dependency_cycles(tbs): @@ -38,7 +38,7 @@ def check_threadblock_ordering(rank_dag): if op.is_send(): match = op.recv_match if match.is_recv(): - assert op.dst.rank == match.rank, f"Bug in SCCLang: Sends don't match receives" + assert op.dst.rank == match.rank, f"Bug in MSCCLang: Sends don't match receives" other_tbid = match.tb if other_tbid in prev_steps: diff --git a/sccl/language/rank_dag.py b/msccl/language/rank_dag.py similarity index 99% rename from sccl/language/rank_dag.py rename to msccl/language/rank_dag.py index 45a52e9..ae00960 100755 --- a/sccl/language/rank_dag.py +++ b/msccl/language/rank_dag.py @@ -6,8 +6,8 @@ import heapq import functools -from sccl.language.ir import * -from sccl.language.passes import * +from msccl.language.ir import * +from msccl.language.passes import * def remove_op(op): for p in op.prev: diff --git a/sccl/language/routines.py b/msccl/language/routines.py similarity index 91% rename from sccl/language/routines.py rename to msccl/language/routines.py index 6cb8898..86fa0e4 100644 --- a/sccl/language/routines.py +++ b/msccl/language/routines.py @@ -1,9 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import * +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import * def allgather_ring_inplace(gpus, gpu_offset=0, index_offset=0, ch=0): for rank in range(gpu_offset, gpu_offset+gpus): diff --git a/sccl/language/tb_assignment.py b/msccl/language/tb_assignment.py similarity index 99% rename from sccl/language/tb_assignment.py rename to msccl/language/tb_assignment.py index e1761e8..9760be2 100755 --- a/sccl/language/tb_assignment.py +++ b/msccl/language/tb_assignment.py @@ -5,8 +5,8 @@ from enum import Enum import heapq -from sccl.language.ir import * -from sccl.language.rank_dag import * +from msccl.language.ir import * +from msccl.language.rank_dag import * def _verify_tb_op_compatible(tb, op): diff --git a/sccl/language/visualize.py b/msccl/language/visualize.py similarity index 98% rename from sccl/language/visualize.py rename to msccl/language/visualize.py index 5ffca4e..e24710a 100755 --- a/sccl/language/visualize.py +++ b/msccl/language/visualize.py @@ -2,8 +2,8 @@ # Licensed under the MIT License. 
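To see the renamed language pieces working together, here is a minimal sketch of a complete MSCCLang program assembled from the examples in this patch; whether the local self-copy is needed for a non-inplace AllGather is an assumption based on the simple allgather example above:

from msccl.language import *
from msccl.topologies import fully_connected
from msccl.language.collectives import AllGather

size = 2
topology = fully_connected(size)
collective = AllGather(size, 1, False)        # 1 chunk per rank, out-of-place
with MSCCLProgram('allgather_two_gpus', topology, collective, instances=1):
    for r in range(size):
        c = chunk(r, Buffer.input, 0)
        c.copy(r, Buffer.output, r)           # place own chunk in own output
        c.copy(1 - r, Buffer.output, r)       # send it to the other rank
    assert Check()                            # verify the post-condition
    XML()                                     # emit the MSCCL-IR XML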
import igraph as ig -from sccl.language.ir import * -from sccl.language.rank_dag import * +from msccl.language.ir import * +from msccl.language.rank_dag import * def visualize_chunk_dag(chunk_paths): # pragma: no cover frontier = [] diff --git a/sccl/ncclize.py b/msccl/ncclize.py similarity index 99% rename from sccl/ncclize.py rename to msccl/ncclize.py index f1016ad..6515bd3 100644 --- a/sccl/ncclize.py +++ b/msccl/ncclize.py @@ -285,7 +285,7 @@ def __str__(self): def ncclize(algorithm, remap_scratch = None, channel_policy=ChannelPolicy.MatchTopology, pretty_print = True, use_scratch=True, merge_contiguous=True, greedy_scratch_sorting=False, instances=1, logging=False): ''' - Generate the XML format used by the NCCL SCCL backend. + Generate the XML format used by the NCCL MSCCL backend. Sends are split into send/recv operations and grouped by the rank executing them. Within each rank operations are grouped under tags, which handle 1) a single peer, 2) a single type of operation, and 3) at most one diff --git a/sccl/ncd_reduction.py b/msccl/ncd_reduction.py similarity index 95% rename from sccl/ncd_reduction.py rename to msccl/ncd_reduction.py index e079761..3c623f5 100755 --- a/sccl/ncd_reduction.py +++ b/msccl/ncd_reduction.py @@ -1,9 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from sccl.collectives import * -from sccl.topologies import reverse_topology -from sccl.algorithm import Algorithm, Step +from msccl.collectives import * +from msccl.topologies import reverse_topology +from msccl.algorithm import Algorithm, Step from collections import defaultdict class ReductionNotApplicableError(ValueError): diff --git a/sccl/path_encoding.py b/msccl/path_encoding.py similarity index 99% rename from sccl/path_encoding.py rename to msccl/path_encoding.py index 44838b3..23d085d 100755 --- a/sccl/path_encoding.py +++ b/msccl/path_encoding.py @@ -1,8 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from sccl.algorithm import * -from sccl.ncd_reduction import wrap_try_ncd_reduction +from msccl.algorithm import * +from msccl.ncd_reduction import wrap_try_ncd_reduction from z3 import * from collections import defaultdict diff --git a/sccl/programs/__init__.py b/msccl/programs/__init__.py similarity index 100% rename from sccl/programs/__init__.py rename to msccl/programs/__init__.py diff --git a/sccl/programs/allreduce_a100_ring.py b/msccl/programs/allreduce_a100_ring.py similarity index 91% rename from sccl/programs/allreduce_a100_ring.py rename to msccl/programs/allreduce_a100_ring.py index 1a06bf8..03d507c 100755 --- a/sccl/programs/allreduce_a100_ring.py +++ b/msccl/programs/allreduce_a100_ring.py @@ -2,9 +2,9 @@ # Licensed under the MIT License. import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllReduce +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllReduce # Ring all reduce for A100s # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. diff --git a/sccl/programs/alltoall_a100_8kp1.py b/msccl/programs/alltoall_a100_8kp1.py similarity index 97% rename from sccl/programs/alltoall_a100_8kp1.py rename to msccl/programs/alltoall_a100_8kp1.py index aca4795..32b1ed1 100755 --- a/sccl/programs/alltoall_a100_8kp1.py +++ b/msccl/programs/alltoall_a100_8kp1.py @@ -1,9 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllToAll +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllToAll def alltoall_three_step(num_nodes, gpus_per_node, instances=1, ib_connections=1): num_ranks = num_nodes * gpus_per_node diff --git a/sccl/programs/alltoall_a100_yifan.py b/msccl/programs/alltoall_a100_yifan.py similarity index 92% rename from sccl/programs/alltoall_a100_yifan.py rename to msccl/programs/alltoall_a100_yifan.py index 4d30efd..d2196cf 100755 --- a/sccl/programs/alltoall_a100_yifan.py +++ b/msccl/programs/alltoall_a100_yifan.py @@ -1,8 +1,8 @@ import argparse -from sccl.language import * -from sccl.topologies import * -from sccl.language.collectives import AllToAll +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllToAll def alltoall_hierarchical(num_nodes, gpus_per_node): diff --git a/sccl/rounds_bound.py b/msccl/rounds_bound.py similarity index 96% rename from sccl/rounds_bound.py rename to msccl/rounds_bound.py index eaaa417..77e43ea 100755 --- a/sccl/rounds_bound.py +++ b/msccl/rounds_bound.py @@ -1,8 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from sccl.ncd_reduction import non_combining_dual -from sccl.topologies import reverse_topology +from msccl.ncd_reduction import non_combining_dual +from msccl.topologies import reverse_topology from z3 import * from fractions import Fraction diff --git a/sccl/serialization.py b/msccl/serialization.py similarity index 74% rename from sccl/serialization.py rename to msccl/serialization.py index 4480517..69c6fd0 100755 --- a/sccl/serialization.py +++ b/msccl/serialization.py @@ -1,41 +1,41 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-from sccl.algorithm import Algorithm, Step -from sccl.topologies import Topology -from sccl.instance import Instance -from sccl.collectives import Collective, Chunk +from msccl.algorithm import Algorithm, Step +from msccl.topologies import Topology +from msccl.instance import Instance +from msccl.collectives import Collective, Chunk import json import warnings -def _sccl_object_hook(o): - if not 'sccl_type' in o: +def _msccl_object_hook(o): + if not 'msccl_type' in o: return o - if o['sccl_type'] == 'algorithm': + if o['msccl_type'] == 'algorithm': input_map = { int(k): set(v) for k, v in o['input_map'].items() } output_map = { int(k): set(v) for k, v in o['output_map'].items() } return Algorithm(o['name'], o['collective'], o['topology'], o['instance'], o['steps'], input_map, output_map) - if o['sccl_type'] == 'step': + if o['msccl_type'] == 'step': sends = [(addr, src, dst) for addr, src, dst in o['sends']] return Step(o['rounds'], sends) - if o['sccl_type'] == 'collective': + if o['msccl_type'] == 'collective': triggers = { (int(r), int(c)): v for r, rmap in o['triggers'].items() for c, v in rmap.items() } return Collective(o['name'], o['nodes'], o['chunks'], triggers, o['runtime_name']) - if o['sccl_type'] == 'chunk': + if o['msccl_type'] == 'chunk': pre = set(o['pre']) post = set(o['post']) return Chunk(pre, post, o['addr']) - if o['sccl_type'] == 'topology': + if o['msccl_type'] == 'topology': return Topology(o['name'], o['links'], o['switches']) - if o['sccl_type'] == 'instance': + if o['msccl_type'] == 'instance': return Instance(o['steps'], o['extra_rounds'], o['chunks'], o['pipeline'], o['extra_memory'], o['allow_exchange']) - warnings.warn('Unhandled sccl_type in JSON') + warnings.warn('Unhandled msccl_type in JSON') -def SCCLDecoder(): - return json.JSONDecoder(object_hook=_sccl_object_hook) +def MSCCLDecoder(): + return json.JSONDecoder(object_hook=_msccl_object_hook) -class SCCLEncoder(json.JSONEncoder): +class MSCCLEncoder(json.JSONEncoder): def __init__(self): super().__init__() @@ -44,7 +44,7 @@ def default(self, o): input_map = { k: list(v) for k, v in o.input_map.items() } output_map = { k: list(v) for k, v in o.output_map.items() } return { - 'sccl_type': 'algorithm', + 'msccl_type': 'algorithm', 'name': o.name, 'instance': o.instance, 'input_map': input_map, @@ -55,7 +55,7 @@ def default(self, o): } if isinstance(o, Step): return { - 'sccl_type': 'step', + 'msccl_type': 'step', 'rounds': o.rounds, 'sends': o.sends, } @@ -66,7 +66,7 @@ def default(self, o): triggers[r] = {} triggers[r][c] = v return { - 'sccl_type': 'collective', + 'msccl_type': 'collective', 'name': o.name, 'nodes': o.num_nodes, 'chunks': o._chunks, @@ -75,21 +75,21 @@ def default(self, o): } if isinstance(o, Chunk): return { - 'sccl_type': 'chunk', + 'msccl_type': 'chunk', 'pre': list(o.precondition), 'post': list(o.postcondition), 'addr': o.address, } if isinstance(o, Topology): return { - 'sccl_type': 'topology', + 'msccl_type': 'topology', 'name': o.name, 'switches': o.switches, 'links': o.links, } if isinstance(o, Instance): return { - 'sccl_type': 'instance', + 'msccl_type': 'instance', 'steps': o.steps, 'extra_rounds': o.extra_rounds, 'chunks': o.chunks, @@ -99,10 +99,10 @@ def default(self, o): } return json.JSONEncoder.default(self, o) -def save_sccl_object(obj, filename): +def save_msccl_object(obj, filename): with open(filename, 'w') as f: - f.write(SCCLEncoder().encode(obj)) + f.write(MSCCLEncoder().encode(obj)) -def load_sccl_object(filename): +def load_msccl_object(filename): with 
open(filename) as f: - return SCCLDecoder().decode(f.read()) + return MSCCLDecoder().decode(f.read()) diff --git a/sccl/steps_bound.py b/msccl/steps_bound.py similarity index 100% rename from sccl/steps_bound.py rename to msccl/steps_bound.py diff --git a/sccl/strategies.py b/msccl/strategies.py similarity index 97% rename from sccl/strategies.py rename to msccl/strategies.py index aa9dbc8..77e046e 100755 --- a/sccl/strategies.py +++ b/msccl/strategies.py @@ -1,10 +1,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from sccl.instance import Instance -from sccl.path_encoding import PathEncoding -from sccl.rounds_bound import lower_bound_rounds -from sccl.steps_bound import lower_bound_steps +from msccl.instance import Instance +from msccl.path_encoding import PathEncoding +from msccl.rounds_bound import lower_bound_rounds +from msccl.steps_bound import lower_bound_steps import time import math diff --git a/sccl/topologies/__init__.py b/msccl/topologies/__init__.py similarity index 100% rename from sccl/topologies/__init__.py rename to msccl/topologies/__init__.py diff --git a/sccl/topologies/amd.py b/msccl/topologies/amd.py similarity index 100% rename from sccl/topologies/amd.py rename to msccl/topologies/amd.py diff --git a/sccl/topologies/distributed.py b/msccl/topologies/distributed.py similarity index 100% rename from sccl/topologies/distributed.py rename to msccl/topologies/distributed.py diff --git a/sccl/topologies/generic.py b/msccl/topologies/generic.py similarity index 100% rename from sccl/topologies/generic.py rename to msccl/topologies/generic.py diff --git a/sccl/topologies/nvidia.py b/msccl/topologies/nvidia.py similarity index 100% rename from sccl/topologies/nvidia.py rename to msccl/topologies/nvidia.py diff --git a/sccl/topologies/topology.py b/msccl/topologies/topology.py similarity index 100% rename from sccl/topologies/topology.py rename to msccl/topologies/topology.py diff --git a/sccl/topologies/transformers.py b/msccl/topologies/transformers.py similarity index 100% rename from sccl/topologies/transformers.py rename to msccl/topologies/transformers.py diff --git a/pytest.ini b/pytest.ini index cb4173f..d68bf05 100755 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -addopts = --cov=sccl --cov-report term-missing:skip-covered --cov-fail-under 90 -n auto +addopts = --cov=msccl --cov-report term-missing:skip-covered --cov-fail-under 90 -n auto diff --git a/sccl/__init__.py b/sccl/__init__.py deleted file mode 100755 index 2e640be..0000000 --- a/sccl/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -from sccl.autosynth import init, tabulate_plans, print_plans -from sccl.autosynth import ndv2_perm -from sccl.autosynth import Collective diff --git a/sccl/autosynth/ndv4_plans.py b/sccl/autosynth/ndv4_plans.py deleted file mode 100755 index 03a2911..0000000 --- a/sccl/autosynth/ndv4_plans.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
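The renamed serialization helpers round-trip any of the msccl_type-tagged objects above; a usage sketch mirroring examples/send.py (the filename is illustrative):

from msccl.collectives import build_collective
from msccl.serialization import save_msccl_object, load_msccl_object

# A custom 8-rank collective sending chunk 0 from rank 2 to rank 7.
coll = build_collective('Send', 8, 1, lambda r, c: r == 2, lambda r, c: r == 7)
save_msccl_object(coll, 'send.json')
restored = load_msccl_object('send.json')
assert restored.name == coll.name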
- -from sccl.autosynth.registry import register_synthesis_plan, register_sccl_program -from sccl.programs.allreduce_a100_ring import allreduce_ring -from sccl.programs.alltoall_a100_yifan import alltoall_hierarchical -from sccl.programs.alltoall_a100_8kp1 import alltoall_three_step -from sccl.topologies import fully_connected -from sccl.language.ir import ThreadblockPolicy - -def register_ndv4_plans(): - - @register_sccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, - instances=4, protocol='LL128', sizes=('256KB', '20MB'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) - def ndv4_ring_allreduce(prog, nodes): - allreduce_ring(size=8, channels=8) - - @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '32MB'), machines=lambda x: x == 8 or x == 16 or x == 32 or x == 64) - def ndv4_alltoall_hierarchical(prog, nodes): - alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) - - @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 8 or x == 16 or x == 32) - def ndv4_alltoall_hierarchical(prog, nodes): - alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) - - @register_sccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 64) - def ndv4_alltoall_three_step(prog, nodes): - alltoall_three_step(num_nodes=nodes, gpus_per_node=8) diff --git a/sccl/autosynth/sccl_ndv2_launcher.sh b/sccl/autosynth/sccl_ndv2_launcher.sh deleted file mode 100755 index e2bb54b..0000000 --- a/sccl/autosynth/sccl_ndv2_launcher.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -python -c "import sccl; sccl.ndv2_perm()" -order=/var/lock/sccl_autosynth_inspector_topo.lock -if [ -f "$order" ]; then - export CUDA_VISIBLE_DEVICES=$( Date: Mon, 23 May 2022 16:03:51 -0700 Subject: [PATCH 108/135] Register allpairs (#31) --- msccl/autosynth/ndv4_plans.py | 20 ++++++++++++++++-- msccl/programs/allreduce_a100_ring.py | 3 --- msccl/programs/allreduce_allpairs.py | 30 +++++++++++++++++++++++++++ msccl/programs/alltoall_a100_8kp1.py | 6 ------ msccl/programs/alltoall_a100_yifan.py | 6 ++---- tests/test_language.py | 17 +++++++++++++-- 6 files changed, 65 insertions(+), 17 deletions(-) create mode 100644 msccl/programs/allreduce_allpairs.py diff --git a/msccl/autosynth/ndv4_plans.py b/msccl/autosynth/ndv4_plans.py index 98e3a05..376aea2 100755 --- a/msccl/autosynth/ndv4_plans.py +++ b/msccl/autosynth/ndv4_plans.py @@ -3,6 +3,7 @@ from msccl.autosynth.registry import register_synthesis_plan, register_msccl_program from msccl.programs.allreduce_a100_ring import allreduce_ring +from msccl.programs.allreduce_allpairs import allreduce_allpairs from msccl.programs.alltoall_a100_yifan import alltoall_hierarchical from msccl.programs.alltoall_a100_8kp1 import alltoall_three_step from msccl.topologies import fully_connected @@ -11,9 +12,24 @@ def register_ndv4_plans(): @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, - instances=4, protocol='LL128', sizes=('256KB', '20MB'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) + instances=2, protocol='LL', sizes=('512B', '82944B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) def ndv4_ring_allreduce(prog, nodes): - allreduce_ring(size=8, channels=8) + allreduce_allpairs(8) + + @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, 
+ instances=4, protocol='LL', sizes=('82944B', '458752B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) + def ndv4_ring_allreduce(prog, nodes): + allreduce_allpairs(8) + + @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, + instances=8, protocol='LL', sizes=('458752B', '2129920B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) + def ndv4_ring_allreduce(prog, nodes): + allreduce_ring(size=8, channels=4) + + @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, + instances=8, protocol='LL128', sizes=('2129920B', '22806528B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) + def ndv4_ring_allreduce(prog, nodes): + allreduce_ring(size=8, channels=4) @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '32MB'), machines=lambda x: x == 8 or x == 16 or x == 32 or x == 64) def ndv4_alltoall_hierarchical(prog, nodes): diff --git a/msccl/programs/allreduce_a100_ring.py b/msccl/programs/allreduce_a100_ring.py index 03d507c..c0f9464 100755 --- a/msccl/programs/allreduce_a100_ring.py +++ b/msccl/programs/allreduce_a100_ring.py @@ -1,10 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import argparse from msccl.language import * -from msccl.topologies import * -from msccl.language.collectives import AllReduce # Ring all reduce for A100s # Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs. diff --git a/msccl/programs/allreduce_allpairs.py b/msccl/programs/allreduce_allpairs.py new file mode 100644 index 0000000..86eb108 --- /dev/null +++ b/msccl/programs/allreduce_allpairs.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from msccl.language import * + +def allreduce_allpairs(size): + # Each rank sends the nth chunk to the nth rank into scratch space + for r1 in range(size): + for r2 in range(size): + if r1 != r2: + index = r2 * size + c = chunk(r1, Buffer.input, index, size=size) + c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) + + # Each rank performs a local reduction on the nth chunk + # Utilize 8 threadblocks for this reduction for better parallelism + for r in range(size): + for index in range(0, size * (size-1)): + c = chunk(r, Buffer.input, r*size + (index % size)) + c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) + + # Each rank sends the fully reduced nth chunk to all other gpus + for r1 in range(size): + for r2 in range(size): + if r1 != r2: + index = r1 * size + c = chunk(r1, Buffer.input, index, size) + c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) + + \ No newline at end of file diff --git a/msccl/programs/alltoall_a100_8kp1.py b/msccl/programs/alltoall_a100_8kp1.py index 32b1ed1..b9e8ed8 100755 --- a/msccl/programs/alltoall_a100_8kp1.py +++ b/msccl/programs/alltoall_a100_8kp1.py @@ -2,8 +2,6 @@ # Licensed under the MIT License. 
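A plain-Python model of the three phases of allreduce_allpairs above (scatter the nth slice into rank n's scratch, reduce locally, broadcast the result), showing why every rank converges to the same totals; lists stand in for the GPU buffers:

def allpairs_allreduce(values):
    size = len(values)
    # Phases 1-2: rank r gathers everyone's r-th slice and reduces it.
    reduced = [sum(values[src][r] for src in range(size)) for r in range(size)]
    # Phase 3: every rank receives every reduced slice.
    return [list(reduced) for _ in range(size)]

out = allpairs_allreduce([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
assert out == [[12, 15, 18]] * 3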
from msccl.language import * -from msccl.topologies import * -from msccl.language.collectives import AllToAll def alltoall_three_step(num_nodes, gpus_per_node, instances=1, ib_connections=1): num_ranks = num_nodes * gpus_per_node @@ -29,10 +27,6 @@ def AddChunk(ib_chunks, key, c): ib_chunks[key] = ib_chunks[key].group(c) else: ib_chunks[key] = c - - - topology = fully_connected(num_ranks) - collective = AllToAll(num_ranks, instances, inplace=False) ib_chunks = {} # Keeps track of chunks going over IB buffer buffer name -> chunk for n1 in range(num_nodes): diff --git a/msccl/programs/alltoall_a100_yifan.py b/msccl/programs/alltoall_a100_yifan.py index d2196cf..f20efb7 100755 --- a/msccl/programs/alltoall_a100_yifan.py +++ b/msccl/programs/alltoall_a100_yifan.py @@ -1,9 +1,7 @@ -import argparse +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. from msccl.language import * -from msccl.topologies import * -from msccl.language.collectives import AllToAll - def alltoall_hierarchical(num_nodes, gpus_per_node): num_ranks = num_nodes * gpus_per_node diff --git a/tests/test_language.py b/tests/test_language.py index c812c66..5ee93a6 100755 --- a/tests/test_language.py +++ b/tests/test_language.py @@ -262,19 +262,32 @@ def test_registered_alltoall_8kp1(): assert Check() XML() -def test_registered_allreduce(): +def test_registered_allreduce_ring(): from msccl.programs.allreduce_a100_ring import allreduce_ring num_ranks = 8 instances = 4 topology = fully_connected(num_ranks) collective = AllReduce(num_ranks, num_ranks, inplace=True) - with MSCCLProgram(f"allreduce", topology, collective, instances, + with MSCCLProgram(f"allreduce_ring", topology, collective, instances, protocol="LL128", threadblock_policy=ThreadblockPolicy.manual): allreduce_ring(num_ranks, num_ranks) assert Check() XML() +def test_registered_allreduce_allpairs(): + from msccl.programs.allreduce_allpairs import allreduce_allpairs + + num_ranks = 8 + instances = 2 + topology = fully_connected(num_ranks) + collective = AllReduce(num_ranks, num_ranks*num_ranks, inplace=True) + with MSCCLProgram(f"allreduce_allpairs", topology, collective, instances, + protocol="LL", threadblock_policy=ThreadblockPolicy.manual): + allreduce_allpairs(num_ranks) + assert Check() + XML() + def test_routines_allgather_ring_inplace(): size = 4 topology = fully_connected(size) From cfb9bd2fb74f3891c8441112f723f700e8807310 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Wed, 8 Jun 2022 09:57:03 -0700 Subject: [PATCH 109/135] registered allreduce fix plus tests --- msccl/autosynth/ndv4_plans.py | 16 +++++------ tests/test_language.py | 51 ----------------------------------- 2 files changed, 8 insertions(+), 59 deletions(-) diff --git a/msccl/autosynth/ndv4_plans.py b/msccl/autosynth/ndv4_plans.py index 376aea2..39f4710 100755 --- a/msccl/autosynth/ndv4_plans.py +++ b/msccl/autosynth/ndv4_plans.py @@ -11,32 +11,32 @@ def register_ndv4_plans(): - @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, + @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=64, inplace=True, instances=2, protocol='LL', sizes=('512B', '82944B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) - def ndv4_ring_allreduce(prog, nodes): + def ndv4_allpairs_allreduce_config1(prog, nodes): allreduce_allpairs(8) - @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, + @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', 
chunk_factor=64, inplace=True, instances=4, protocol='LL', sizes=('82944B', '458752B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) - def ndv4_ring_allreduce(prog, nodes): + def ndv4_allpairs_allreduce_config2(prog, nodes): allreduce_allpairs(8) @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, instances=8, protocol='LL', sizes=('458752B', '2129920B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) - def ndv4_ring_allreduce(prog, nodes): + def ndv4_ring_allreduce_config1(prog, nodes): allreduce_ring(size=8, channels=4) @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=8, inplace=True, instances=8, protocol='LL128', sizes=('2129920B', '22806528B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) - def ndv4_ring_allreduce(prog, nodes): + def ndv4_ring_allreduce_config2(prog, nodes): allreduce_ring(size=8, channels=4) @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='LL128', sizes=('1MB', '32MB'), machines=lambda x: x == 8 or x == 16 or x == 32 or x == 64) - def ndv4_alltoall_hierarchical(prog, nodes): + def ndv4_alltoall_hierarchical_config1(prog, nodes): alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 8 or x == 16 or x == 32) - def ndv4_alltoall_hierarchical(prog, nodes): + def ndv4_alltoall_hierarchical_config2(prog, nodes): alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 64) diff --git a/tests/test_language.py b/tests/test_language.py index 5ee93a6..9c2a38d 100755 --- a/tests/test_language.py +++ b/tests/test_language.py @@ -237,57 +237,6 @@ def test_illegal_tb_assignment(): chunk(0, Buffer.input, 1).copy(2, Buffer.output, 0, sendtb=0, recvtb=1) XML() -def test_registered_alltoall_yifan(): - from msccl.programs.alltoall_a100_yifan import alltoall_hierarchical - - num_nodes = 4 - gpus_per_node = 8 - num_ranks = num_nodes * gpus_per_node - topology = fully_connected(num_ranks) - collective = AllToAll(num_ranks, 1, inplace=False) - with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): - alltoall_hierarchical(num_nodes, gpus_per_node) - assert Check() - -def test_registered_alltoall_8kp1(): - from msccl.programs.alltoall_a100_8kp1 import alltoall_three_step - - num_nodes = 9 - gpus_per_node = 8 - num_ranks = num_nodes * gpus_per_node - topology = fully_connected(num_ranks) - collective = AllToAll(num_ranks, 1, inplace=False) - with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): - alltoall_three_step(num_nodes, gpus_per_node) - assert Check() - XML() - -def test_registered_allreduce_ring(): - from msccl.programs.allreduce_a100_ring import allreduce_ring - - num_ranks = 8 - instances = 4 - topology = fully_connected(num_ranks) - collective = AllReduce(num_ranks, num_ranks, inplace=True) - with MSCCLProgram(f"allreduce_ring", topology, collective, instances, - protocol="LL128", threadblock_policy=ThreadblockPolicy.manual): - allreduce_ring(num_ranks, num_ranks) - assert Check() - XML() - -def test_registered_allreduce_allpairs(): - from msccl.programs.allreduce_allpairs import allreduce_allpairs - - num_ranks = 8 - instances = 2 - topology = fully_connected(num_ranks) - collective = AllReduce(num_ranks, 
num_ranks*num_ranks, inplace=True) - with MSCCLProgram(f"allreduce_allpairs", topology, collective, instances, - protocol="LL", threadblock_policy=ThreadblockPolicy.manual): - allreduce_allpairs(num_ranks) - assert Check() - XML() - def test_routines_allgather_ring_inplace(): size = 4 topology = fully_connected(size) From 3461f795f651bfbd357e5b438504735b482d52e4 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Wed, 8 Jun 2022 10:06:20 -0700 Subject: [PATCH 110/135] extra tests --- tests/test_programs.py | 90 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 tests/test_programs.py diff --git a/tests/test_programs.py b/tests/test_programs.py new file mode 100644 index 0000000..224a72a --- /dev/null +++ b/tests/test_programs.py @@ -0,0 +1,90 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import msccl +from msccl.topologies import fully_connected +from msccl.language.collectives import * +import os +import pytest + +def test_registered_alltoall_yifan(): + from msccl.programs.alltoall_a100_yifan import alltoall_hierarchical + + num_nodes = 4 + gpus_per_node = 8 + num_ranks = num_nodes * gpus_per_node + topology = fully_connected(num_ranks) + collective = AllToAll(num_ranks, 1, inplace=False) + with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): + alltoall_hierarchical(num_nodes, gpus_per_node) + assert Check() + +def test_registered_alltoall_8kp1(): + from msccl.programs.alltoall_a100_8kp1 import alltoall_three_step + + num_nodes = 9 + gpus_per_node = 8 + num_ranks = num_nodes * gpus_per_node + topology = fully_connected(num_ranks) + collective = AllToAll(num_ranks, 1, inplace=False) + with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): + alltoall_three_step(num_nodes, gpus_per_node) + assert Check() + XML() + +def test_registered_allreduce_ring(): + from msccl.programs.allreduce_a100_ring import allreduce_ring + + num_ranks = 8 + instances = 4 + topology = fully_connected(num_ranks) + collective = AllReduce(num_ranks, num_ranks, inplace=True) + with MSCCLProgram(f"allreduce_ring", topology, collective, instances, + protocol="LL128", threadblock_policy=ThreadblockPolicy.manual): + allreduce_ring(num_ranks, num_ranks) + assert Check() + XML() + +def test_registered_allreduce_allpairs(): + from msccl.programs.allreduce_allpairs import allreduce_allpairs + + num_ranks = 8 + instances = 2 + topology = fully_connected(num_ranks) + collective = AllReduce(num_ranks, num_ranks*num_ranks, inplace=True) + with MSCCLProgram(f"allreduce_allpairs", topology, collective, instances, + protocol="LL", threadblock_policy=ThreadblockPolicy.manual): + allreduce_allpairs(num_ranks) + assert Check() + XML() + +def test_registered_ndv4_allreduce(capsys): + msccl.init('ndv4', 1, (msccl.Collective.allreduce, (512, 1024))) + out, err = capsys.readouterr() + assert 'ndv4_allpairs_allreduce_config1 with LL protocol' in out + + msccl.init('ndv4', 1, (msccl.Collective.allreduce, (82944, 458752))) + out, err = capsys.readouterr() + assert 'ndv4_allpairs_allreduce_config2 with LL protocol' in out + + msccl.init('ndv4', 1, (msccl.Collective.allreduce, (458752, 2129920))) + out, err = capsys.readouterr() + assert 'ndv4_ring_allreduce_config1 with LL protocol' in out + + msccl.init('ndv4', 1, (msccl.Collective.allreduce, (2129920, 22806528))) + out, err = capsys.readouterr() + assert 'ndv4_ring_allreduce_config2 with LL128 protocol' in out + + +def test_registered_ndv4_alltoall(capsys): + msccl.init('ndv4', 8, 
(msccl.Collective.alltoall, ('1MB', '32MB'))) + out, err = capsys.readouterr() + assert 'ndv4_alltoall_hierarchical_config1 with LL128 protocol' in out + + msccl.init('ndv4', 8, (msccl.Collective.alltoall, ('32MB', '64MB'))) + out, err = capsys.readouterr() + assert 'ndv4_alltoall_hierarchical_config2 with Simple protocol' in out + + # msccl.init('ndv4', 64, (msccl.Collective.alltoall, ('32MB', '64MB'))) + # out, err = capsys.readouterr() + # assert 'ndv4_alltoall_three_step with Simple protocol' in out From 8efe0ee2d10f2e66e5d45a863fb4b605dbeb8be7 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Wed, 8 Jun 2022 12:19:38 -0700 Subject: [PATCH 111/135] change registry xml generator, add more flags (#33) --- msccl/autosynth/ndv4_plans.py | 4 ++-- msccl/autosynth/registry.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/msccl/autosynth/ndv4_plans.py b/msccl/autosynth/ndv4_plans.py index 39f4710..add3cc7 100755 --- a/msccl/autosynth/ndv4_plans.py +++ b/msccl/autosynth/ndv4_plans.py @@ -12,12 +12,12 @@ def register_ndv4_plans(): @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=64, inplace=True, - instances=2, protocol='LL', sizes=('512B', '82944B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) + instances=2, protocol='LL', sizes=('512B', '82944B'), threadblock_policy=ThreadblockPolicy.manual, interleaved_replication=False, dependence_nop=True, machines= lambda x: x == 1) def ndv4_allpairs_allreduce_config1(prog, nodes): allreduce_allpairs(8) @register_msccl_program(fully_connected(8), 'allreduce', 'ndv4', chunk_factor=64, inplace=True, - instances=4, protocol='LL', sizes=('82944B', '458752B'), threadblock_policy=ThreadblockPolicy.manual, machines= lambda x: x == 1) + instances=4, protocol='LL', sizes=('82944B', '458752B'), threadblock_policy=ThreadblockPolicy.manual, interleaved_replication=False, dependence_nop=True, machines= lambda x: x == 1) def ndv4_allpairs_allreduce_config2(prog, nodes): allreduce_allpairs(8) diff --git a/msccl/autosynth/registry.py b/msccl/autosynth/registry.py index 0edc7cc..5dc89c6 100755 --- a/msccl/autosynth/registry.py +++ b/msccl/autosynth/registry.py @@ -63,7 +63,8 @@ def wrapped(machines): def register_msccl_program(local_topology, collective, machine_type, machines=lambda x: True, sizes=None, protocol='Simple', - chunk_factor=1, priority=0, collective_obj=None, instances=1, inplace=False, threadblock_policy=ThreadblockPolicy.auto): + chunk_factor=1, priority=0, collective_obj=None, instances=1, inplace=False, threadblock_policy=ThreadblockPolicy.auto, + interleaved_replication=True, dependence_nop=False): def decorator(fun): name = fun.__name__ def wrapped(machines): @@ -80,11 +81,12 @@ def wrapped(machines): co = lang_collectives.ReduceScatter(topology.num_nodes(), chunk_factor, inplace) else: raise RuntimeError(f'No collective_obj in msccl.language.collectives known for "{collective}"') - prog = MSCCLProgram(name, topology, co, instances, protocol, threadblock_policy) + prog = MSCCLProgram(name, topology, co, instances, protocol, threadblock_policy=threadblock_policy, + interleaved_replication=interleaved_replication, dependence_nop=dependence_nop) with prog: fun(prog, machines) prog.check() - ef = ir_to_xml(prog.lower()) + ef = prog.generate_xml() fd, path = tempfile.mkstemp() with os.fdopen(fd, 'w') as f: f.write(ef) From 9c6ea088935fb2ddfb65ad6999fad0481e3ce408 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 8 Jun 2022 19:21:38 +0000 Subject: [PATCH 112/135] ndv4 
works --- msccl/autosynth/__init__.py | 16 ++++++++++++++-- msccl/topologies/nvidia.py | 7 +++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/msccl/autosynth/__init__.py b/msccl/autosynth/__init__.py index 7c11e24..9bcfd60 100755 --- a/msccl/autosynth/__init__.py +++ b/msccl/autosynth/__init__.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from msccl.topologies import dgx1, nvlink_only +from msccl.topologies import dgx1, dgx_a100, nvlink_only from msccl.isomorphisms import find_isomorphisms from msccl.autosynth.registry import synthesis_plans from lxml import etree as ET @@ -34,6 +34,18 @@ def __str__(self): def init(machine_type, num_machines, *collectives): + # first detect the machine type in case auto was passed in + if machine_type == "auto": + nvlink_matrix = nvlink_only() + isomorphisms = find_isomorphisms(dgx1(), nvlink_matrix) + if len(isomorphisms) == 4: + machine_type = "ndv2" + elif nvlink_matrix.links == dgx_a100().links: + machine_type = "ndv4" + else: + raise RuntimeError( + f'Did not recognize the SKU type automatically. If you are sure about the SKU, try replacing "auto" with your explicit SKU name.') + # Collect and sort all plans that match the collectives and sizes given by the user. selected_plans = {} for collective in collectives: @@ -268,4 +280,4 @@ def tabulate_plans(): def print_plans(): - print(tabulate_plans()) \ No newline at end of file + print(tabulate_plans()) diff --git a/msccl/topologies/nvidia.py b/msccl/topologies/nvidia.py index 57f2d46..d2f9a19 100755 --- a/msccl/topologies/nvidia.py +++ b/msccl/topologies/nvidia.py @@ -40,6 +40,13 @@ def dgx1(): return Topology('DGX1', links) +def dgx_a100(): + links = [[12]*8 for i in range(8)] + for i in range(8): + links[i][i] = 0 + + return Topology('DGX_A100', links) + def nvlink_only(nvidia_smi_topo=None): if nvidia_smi_topo == None: nvidia_smi_topo = _get_nvidia_smi_topo() From 0018bfe10e967ec0deba4ded955f7044a68446b9 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 8 Jun 2022 21:21:25 +0000 Subject: [PATCH 113/135] more log during detection --- msccl/autosynth/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/msccl/autosynth/__init__.py b/msccl/autosynth/__init__.py index 9bcfd60..040e081 100755 --- a/msccl/autosynth/__init__.py +++ b/msccl/autosynth/__init__.py @@ -45,6 +45,7 @@ def init(machine_type, num_machines, *collectives): else: raise RuntimeError( f'Did not recognize the SKU type automatically. If you are sure about the SKU, try replacing "auto" with your explicit SKU name.') + print(f"The auto-detected SKU is a {machine_type}.") # Collect and sort all plans that match the collectives and sizes given by the user. selected_plans = {} @@ -71,7 +72,7 @@ def init(machine_type, num_machines, *collectives): selected_plans[name] = plans # Execute the plans to find or synthesize the algorithms and format them in the XML format expected by MSCCL-RT. 
- algos_elem = ET.Element('msccl_algos') + algos_elem = ET.Element('sccl_algos') any_selected = False for collective_name, plans in selected_plans.items(): for plan, params in plans: @@ -94,14 +95,14 @@ def init(machine_type, num_machines, *collectives): # Set environment variables env = { - 'MSCCL_CONFIG': path, + 'SCCL_CONFIG': path, } if 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] != '': existing_algos = os.environ['NCCL_ALGO'] if 'MSCCL' not in existing_algos.split(','): - os.environ['NCCL_ALGO'] = 'MSCCL,' + existing_algos + os.environ['NCCL_ALGO'] = 'SCCL,' + existing_algos else: - env['NCCL_ALGO'] = 'MSCCL,RING,TREE' + env['NCCL_ALGO'] = 'SCCL,RING,TREE' if machine_type == 'ndv4' and num_machines >= 8 and 'alltoall' in selected_plans: print(f'MSCCL: Setting NCCL_IB_AR_THRESHOLD=0 (reason: alltoall and at least 16 ndv4 machines)') env['NCCL_IB_AR_THRESHOLD'] = '0' From 7a55090be9604a3ca6560c0e98c1a909dd2b6117 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 8 Jun 2022 21:22:45 +0000 Subject: [PATCH 114/135] going back to msccl --- msccl/autosynth/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/msccl/autosynth/__init__.py b/msccl/autosynth/__init__.py index 040e081..9b7216d 100755 --- a/msccl/autosynth/__init__.py +++ b/msccl/autosynth/__init__.py @@ -72,7 +72,7 @@ def init(machine_type, num_machines, *collectives): selected_plans[name] = plans # Execute the plans to find or synthesize the algorithms and format them in the XML format expected by MSCCL-RT. - algos_elem = ET.Element('sccl_algos') + algos_elem = ET.Element('msccl_algos') any_selected = False for collective_name, plans in selected_plans.items(): for plan, params in plans: @@ -95,14 +95,14 @@ def init(machine_type, num_machines, *collectives): # Set environment variables env = { - 'SCCL_CONFIG': path, + 'MSCCL_CONFIG': path, } if 'NCCL_ALGO' in os.environ and os.environ['NCCL_ALGO'] != '': existing_algos = os.environ['NCCL_ALGO'] if 'MSCCL' not in existing_algos.split(','): - os.environ['NCCL_ALGO'] = 'SCCL,' + existing_algos + os.environ['NCCL_ALGO'] = 'MSCCL,' + existing_algos else: - env['NCCL_ALGO'] = 'SCCL,RING,TREE' + env['NCCL_ALGO'] = 'MSCCL,RING,TREE' if machine_type == 'ndv4' and num_machines >= 8 and 'alltoall' in selected_plans: print(f'MSCCL: Setting NCCL_IB_AR_THRESHOLD=0 (reason: alltoall and at least 16 ndv4 machines)') env['NCCL_IB_AR_THRESHOLD'] = '0' From 989a327aa503ded8954ef0bce1219d5e246d74ee Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 8 Jun 2022 21:33:38 +0000 Subject: [PATCH 115/135] safe fallback to NCCL in case the SKU is not detected --- README.md | 3 ++- msccl/autosynth/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9c1a118..b43f8d3 100755 --- a/README.md +++ b/README.md @@ -30,7 +30,8 @@ msccl.init('ndv2', 2, (msccl.Collective.alltoall, ('1MB'))) ``` This will find an algorithm provider that can create an Alltoall algorithm that is expected to be good with 1MB of data. That will call a synthesis routine that writes the algorithm to disk. `msccl.init` will then pass a configuration file -pointing to this algorithm to the runtime through environment variables. +pointing to this algorithm to the runtime through environment variables. If the SKU is unknown, ```'auto'``` can be passed +in instead. See [the examples](examples/msccl_init.py) for more on `msccl.init` usage. 
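A minimal sketch of the `'auto'` path this README change documents (illustrative machine count and size range; the collective tuple follows the `msccl.init` calls used in the tests earlier in this series):

import msccl

# 'auto' probes the local NVLink matrix: a DGX-1 isomorphism selects
# 'ndv2', a DGX-A100 link matrix selects 'ndv4'; anything else prints a
# warning and returns, so the application silently continues on stock NCCL.
msccl.init('auto', 2, (msccl.Collective.alltoall, ('1MB', '32MB')))
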
diff --git a/msccl/autosynth/__init__.py b/msccl/autosynth/__init__.py index 9b7216d..57a909d 100755 --- a/msccl/autosynth/__init__.py +++ b/msccl/autosynth/__init__.py @@ -43,8 +43,8 @@ def init(machine_type, num_machines, *collectives): elif nvlink_matrix.links == dgx_a100().links: machine_type = "ndv4" else: - raise RuntimeError( - f'Did not recognize the SKU type automatically. If you are sure about the SKU, try replacing "auto" with your explicit SKU name.') + print(f'Did not recognize the SKU type automatically. If you are sure about the SKU, try replacing "auto" with your explicit SKU name. Falling back to NCCL.') + return print(f"The auto-detected SKU is a {machine_type}.") # Collect and sort all plans that match the collectives and sizes given by the user. From 7ec860f661200553ccfde7fc4fe5fc21308b715c Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 10 Jun 2022 19:50:27 +0000 Subject: [PATCH 116/135] adding msccl 2D algorithms for 2,4 NDv4 nodes as well --- msccl/autosynth/ndv4_plans.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/msccl/autosynth/ndv4_plans.py b/msccl/autosynth/ndv4_plans.py index add3cc7..a5453c0 100755 --- a/msccl/autosynth/ndv4_plans.py +++ b/msccl/autosynth/ndv4_plans.py @@ -42,3 +42,9 @@ def ndv4_alltoall_hierarchical_config2(prog, nodes): @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('32MB', None), machines=lambda x: x == 64) def ndv4_alltoall_three_step(prog, nodes): alltoall_three_step(num_nodes=nodes, gpus_per_node=8) + + @register_msccl_program(fully_connected(8), 'alltoall', 'ndv4', protocol='Simple', sizes=('1KB', None), machines=lambda x: x == 2 or x == 4) + def ndv4_alltoall_hierarchical_config2(prog, nodes): + alltoall_hierarchical(num_nodes=nodes, gpus_per_node=8) + + From aa4ab59674849f958ea43fd1912f9ecb0c3c5e78 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Fri, 24 Jun 2022 12:29:16 -0700 Subject: [PATCH 117/135] Rename alltoalls and parameterize two step (#35) --- examples/mscclang/alltoall_a100.py | 135 ----------------------- examples/mscclang/alltoall_a100_yifan.py | 59 ---------- 2 files changed, 194 deletions(-) delete mode 100755 examples/mscclang/alltoall_a100.py delete mode 100755 examples/mscclang/alltoall_a100_yifan.py diff --git a/examples/mscclang/alltoall_a100.py b/examples/mscclang/alltoall_a100.py deleted file mode 100755 index 549e65c..0000000 --- a/examples/mscclang/alltoall_a100.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -# For AllToAll on 9 A100 nodes -# alltoall_a100.py 9 8 2 -# For AllToAll on 16 A100 nodes -# alltoall_a100.py 16 8 2 --ib_connections 1 - -import argparse - -from msccl.language import * -from msccl.topologies import * -from msccl.language.collectives import AllToAll - -def alltoall_hierarchical(num_nodes, gpus_per_node, instances, ib_connections): - num_ranks = num_nodes * gpus_per_node - - # (node, local gpu) to rank - # (n, g) => r - def RankFromNodeGpuPair(n, g): - return n*gpus_per_node + g - - # For cross node traffic from node n1 to node n2, returns the ranks g - # gpus on n1 and n2 that handle that traffic. 
- def CrossNodeGpus(n1, n2): - def LocalRank(n1, n2): - return (n2 if n1 > n2 else n2-1) % gpus_per_node - r1 = RankFromNodeGpuPair(n1, LocalRank(n1, n2)) - r2 = RankFromNodeGpuPair(n2, LocalRank(n2, n1)) - return (r1, r2) - - # Groups chunk reference into one large chunk reference (used for IB) - # Save them under a key in the dictionary ib_chunks - def AddChunk(ib_chunks, key, c): - if key in ib_chunks: - ib_chunks[key] = ib_chunks[key].group(c) - else: - ib_chunks[key] = c - - - topology = fully_connected(num_ranks) - collective = AllToAll(num_ranks, instances, inplace=False) - - with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): - ib_chunks = {} # Keeps track of chunks going over IB buffer buffer name -> chunk - - # Local Gathers - # for n1 in range(num_nodes): - # for g1 in range(gpus_per_node): - # for n2 in range(num_nodes): - # for g2 in range(gpus_per_node): - # for ch in range(instances): - # r1 = RankFromNodeGpuPair(n1, g1) - # r2 = RankFromNodeGpuPair(n2, g2) - # # Rank(r) gives accesses the rth rank of the program - # # input(i) gives a reference to ith chunk - # c = Rank(r1).input(r2 * instances + ch) - - # if (n1 != n2): - # # Gather chunks destined for cross node ranks in scratch to route through IB - # gather_rank, _ = CrossNodeGpus(n1, n2) - # buffer_key = (n1, n2) - # # Send chunk to the gather_rank. Send returns a chunk reference to the - # # receiver's chunk - # c = c.copy(gather_rank, buffer=buffer_key, ch=ch) - # # Group the chunks using a particular IB pair into one large chunk reference - # AddChunk(ib_chunks, buffer_key, c) - # else: - # # Directly copy chunks destined for ranks within the node or - # # copy chunks destined for current rank into the output buffer - # c.copy(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch) - - for n1 in range(num_nodes): - for g1 in range(gpus_per_node): - for ch in range(instances): - for n2 in range(num_nodes): - r1 = RankFromNodeGpuPair(n1, g1) - if (n1 != n2): - # Send over all chunks destined for that node to the peer gpu that handles chunks to that node - c = chunk(r1, Buffer.input, n2 * gpus_per_node * instances + ch * gpus_per_node, gpus_per_node) - # Gather chunks destined for cross node ranks in scratch to route through IB - gather_rank, _ = CrossNodeGpus(n1, n2) - buffer_key = (n1, n2) - # Send chunk to the gather_rank. Send returns a chunk reference to the - # receiver's chunk - c = c.copy(gather_rank, buffer=buffer_key, ch=ch*2) - # Group the chunks using a particular IB pair into one large chunk reference - AddChunk(ib_chunks, buffer_key, c) - else: - # Within a node - direct copy/copy the chunks over nvlink to the output buffer. - # Use a different channel to ensure that we don't get in the way of copys/receives above - # which are on the critical path. - for g2 in range(gpus_per_node): - r2 = RankFromNodeGpuPair(n2, g2) - c = chunk(r1, Buffer.input, r2 * instances + ch) - c.copy(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch*2) - - - - # IB Send and local scatters - for buffer_key, ib_chunk in ib_chunks.items(): - (n1, n2) = buffer_key - _, scatter_rank = CrossNodeGpus(n1, n2) - # IB copy divided across multiple parallel channels - chunks = ib_chunk.split(ib_connections) - for ch, c in enumerate(chunks): - # Note: If we are only going to use 1 IB connection for each IB copy - # alternate between channels 0 and 1 to utilize both IB links. 
- if ib_connections == 1: - ib_channel = c.rank % 2 - else: - ib_channel = ch - c = c.copy(scatter_rank, buffer=buffer_key, ch=ib_channel) - # Local scatter - cs = c.split(gpus_per_node * gpus_per_node) - for i, c in enumerate(cs): - # Access the chunk's destination rank and index to route it to its final place - final_rank = c.get_dst_rank() - index = c.get_dst_index() - c.copy(final_rank, buffer=Buffer.output, index=index, ch=ch*2 + 1) - - XML() # Prints the XML - Check() - -parser = argparse.ArgumentParser() -parser.add_argument('num_nodes', type=int, help ='number of nodes') -parser.add_argument('gpus_per_node', type=int, help ='gpus per node') -parser.add_argument('instances', type=int, help='number of instances') -parser.add_argument('--ib_connections', type=int, default=-1, help='Number of connections used for each IB copy. Default: number of instances') -args = parser.parse_args() - -if args.ib_connections == -1: - args.ib_connections = args.instances - -alltoall_hierarchical(args.num_nodes, args.gpus_per_node, args.instances, args.ib_connections) \ No newline at end of file diff --git a/examples/mscclang/alltoall_a100_yifan.py b/examples/mscclang/alltoall_a100_yifan.py deleted file mode 100755 index 0a1921a..0000000 --- a/examples/mscclang/alltoall_a100_yifan.py +++ /dev/null @@ -1,59 +0,0 @@ -import argparse - -from msccl.language import * -from msccl.topologies import * -from msccl.language.collectives import AllToAll - - -def alltoall_hierarchical(num_nodes, gpus_per_node, protocol): - num_ranks = num_nodes * gpus_per_node - topology = fully_connected(num_ranks) - collective = AllToAll(num_ranks, 1, inplace=False) - - - with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1, protocol=protocol): - for n1 in range(num_nodes): - for r in range(1,num_nodes): - n2 = (n1 + r) % num_nodes - # print(f"r {r} n1 {n1} n2 {n2}") - - # Gather all local chunks for the node neighbor - for g1 in range(gpus_per_node): - rank1 = n1 * gpus_per_node + g1 - - for g2 in range(gpus_per_node): - rank2 = n1 * gpus_per_node + g2 - # chunk to copy: g2 on n2 - index = n2 * gpus_per_node + g2 - c = chunk(rank1, Buffer.input, index) - c = c.copy(rank2, f'copy_{n2}') - - for r in range(1,num_nodes): - n2 = (n1 + r) % num_nodes - # IB copy - for g1 in range(gpus_per_node): - rank = n1 * gpus_per_node + g1 - ib_peer = n2 * gpus_per_node + g1 - c = chunk(rank, f'copy_{n2}', 0, 8) - c = c.copy(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % 8)*2+(rank%2)+2) - - - # Handle local chunks within a node - for rank in range(num_ranks): - for g in range(gpus_per_node): - index = (rank // gpus_per_node) * gpus_per_node + g - c = chunk(rank, Buffer.input, index) - c.copy(c.get_dst_rank(), Buffer.output, c.get_dst_index()) - - XML() # Prints the XML - Check() - - -parser = argparse.ArgumentParser() -parser.add_argument('num_nodes', type=int, help ='number of nodes') -parser.add_argument('gpus_per_node', type=int, help ='gpus per node') -parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. 
Default: Simple') -args = parser.parse_args() - - -alltoall_hierarchical(args.num_nodes, args.gpus_per_node, args.protocol) From 55fde5a1e890dd23ecec132f37b2d40574f092c2 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Fri, 24 Jun 2022 16:20:07 -0700 Subject: [PATCH 118/135] Alltoall rename (#37) * Rename alltoalls and parameterize two step * Renamed alltoalls --- examples/mscclang/alltoall_a100_three_step.py | 135 ++++++++++++++++++ examples/mscclang/alltoall_a100_two_step.py | 58 ++++++++ 2 files changed, 193 insertions(+) create mode 100755 examples/mscclang/alltoall_a100_three_step.py create mode 100755 examples/mscclang/alltoall_a100_two_step.py diff --git a/examples/mscclang/alltoall_a100_three_step.py b/examples/mscclang/alltoall_a100_three_step.py new file mode 100755 index 0000000..549e65c --- /dev/null +++ b/examples/mscclang/alltoall_a100_three_step.py @@ -0,0 +1,135 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# For AllToAll on 9 A100 nodes +# alltoall_a100.py 9 8 2 +# For AllToAll on 16 A100 nodes +# alltoall_a100.py 16 8 2 --ib_connections 1 + +import argparse + +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllToAll + +def alltoall_hierarchical(num_nodes, gpus_per_node, instances, ib_connections): + num_ranks = num_nodes * gpus_per_node + + # (node, local gpu) to rank + # (n, g) => r + def RankFromNodeGpuPair(n, g): + return n*gpus_per_node + g + + # For cross node traffic from node n1 to node n2, returns the ranks g + # gpus on n1 and n2 that handle that traffic. + def CrossNodeGpus(n1, n2): + def LocalRank(n1, n2): + return (n2 if n1 > n2 else n2-1) % gpus_per_node + r1 = RankFromNodeGpuPair(n1, LocalRank(n1, n2)) + r2 = RankFromNodeGpuPair(n2, LocalRank(n2, n1)) + return (r1, r2) + + # Groups chunk reference into one large chunk reference (used for IB) + # Save them under a key in the dictionary ib_chunks + def AddChunk(ib_chunks, key, c): + if key in ib_chunks: + ib_chunks[key] = ib_chunks[key].group(c) + else: + ib_chunks[key] = c + + + topology = fully_connected(num_ranks) + collective = AllToAll(num_ranks, instances, inplace=False) + + with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1): + ib_chunks = {} # Keeps track of chunks going over IB buffer buffer name -> chunk + + # Local Gathers + # for n1 in range(num_nodes): + # for g1 in range(gpus_per_node): + # for n2 in range(num_nodes): + # for g2 in range(gpus_per_node): + # for ch in range(instances): + # r1 = RankFromNodeGpuPair(n1, g1) + # r2 = RankFromNodeGpuPair(n2, g2) + # # Rank(r) gives accesses the rth rank of the program + # # input(i) gives a reference to ith chunk + # c = Rank(r1).input(r2 * instances + ch) + + # if (n1 != n2): + # # Gather chunks destined for cross node ranks in scratch to route through IB + # gather_rank, _ = CrossNodeGpus(n1, n2) + # buffer_key = (n1, n2) + # # Send chunk to the gather_rank. 
Send returns a chunk reference to the + # # receiver's chunk + # c = c.copy(gather_rank, buffer=buffer_key, ch=ch) + # # Group the chunks using a particular IB pair into one large chunk reference + # AddChunk(ib_chunks, buffer_key, c) + # else: + # # Directly copy chunks destined for ranks within the node or + # # copy chunks destined for current rank into the output buffer + # c.copy(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch) + + for n1 in range(num_nodes): + for g1 in range(gpus_per_node): + for ch in range(instances): + for n2 in range(num_nodes): + r1 = RankFromNodeGpuPair(n1, g1) + if (n1 != n2): + # Send over all chunks destined for that node to the peer gpu that handles chunks to that node + c = chunk(r1, Buffer.input, n2 * gpus_per_node * instances + ch * gpus_per_node, gpus_per_node) + # Gather chunks destined for cross node ranks in scratch to route through IB + gather_rank, _ = CrossNodeGpus(n1, n2) + buffer_key = (n1, n2) + # Send chunk to the gather_rank. Send returns a chunk reference to the + # receiver's chunk + c = c.copy(gather_rank, buffer=buffer_key, ch=ch*2) + # Group the chunks using a particular IB pair into one large chunk reference + AddChunk(ib_chunks, buffer_key, c) + else: + # Within a node - direct copy/copy the chunks over nvlink to the output buffer. + # Use a different channel to ensure that we don't get in the way of copys/receives above + # which are on the critical path. + for g2 in range(gpus_per_node): + r2 = RankFromNodeGpuPair(n2, g2) + c = chunk(r1, Buffer.input, r2 * instances + ch) + c.copy(r2, buffer=Buffer.output, index=c.get_dst_index(), ch=ch*2) + + + + # IB Send and local scatters + for buffer_key, ib_chunk in ib_chunks.items(): + (n1, n2) = buffer_key + _, scatter_rank = CrossNodeGpus(n1, n2) + # IB copy divided across multiple parallel channels + chunks = ib_chunk.split(ib_connections) + for ch, c in enumerate(chunks): + # Note: If we are only going to use 1 IB connection for each IB copy + # alternate between channels 0 and 1 to utilize both IB links. + if ib_connections == 1: + ib_channel = c.rank % 2 + else: + ib_channel = ch + c = c.copy(scatter_rank, buffer=buffer_key, ch=ib_channel) + # Local scatter + cs = c.split(gpus_per_node * gpus_per_node) + for i, c in enumerate(cs): + # Access the chunk's destination rank and index to route it to its final place + final_rank = c.get_dst_rank() + index = c.get_dst_index() + c.copy(final_rank, buffer=Buffer.output, index=index, ch=ch*2 + 1) + + XML() # Prints the XML + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('num_nodes', type=int, help ='number of nodes') +parser.add_argument('gpus_per_node', type=int, help ='gpus per node') +parser.add_argument('instances', type=int, help='number of instances') +parser.add_argument('--ib_connections', type=int, default=-1, help='Number of connections used for each IB copy. 
Default: number of instances') +args = parser.parse_args() + +if args.ib_connections == -1: + args.ib_connections = args.instances + +alltoall_hierarchical(args.num_nodes, args.gpus_per_node, args.instances, args.ib_connections) \ No newline at end of file diff --git a/examples/mscclang/alltoall_a100_two_step.py b/examples/mscclang/alltoall_a100_two_step.py new file mode 100755 index 0000000..3544e68 --- /dev/null +++ b/examples/mscclang/alltoall_a100_two_step.py @@ -0,0 +1,58 @@ +import argparse + +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllToAll + + +def alltoall_hierarchical(num_nodes, gpus_per_node, protocol): + num_ranks = num_nodes * gpus_per_node + topology = fully_connected(num_ranks) + collective = AllToAll(num_ranks, 1, inplace=False) + + + with MSCCLProgram("hierarchical_all_to_all", topology, collective, 1, protocol=protocol): + for n1 in range(num_nodes): + for r in range(1,num_nodes): + n2 = (n1 + r) % num_nodes + + # Gather all local chunks for the node neighbor + for g1 in range(gpus_per_node): + rank1 = n1 * gpus_per_node + g1 + + for g2 in range(gpus_per_node): + rank2 = n1 * gpus_per_node + g2 + # chunk to copy: g2 on n2 + index = n2 * gpus_per_node + g2 + c = chunk(rank1, Buffer.input, index) + c = c.copy(rank2, f'copy_{n2}') + + for r in range(1,num_nodes): + n2 = (n1 + r) % num_nodes + # IB copy + for g1 in range(gpus_per_node): + rank = n1 * gpus_per_node + g1 + ib_peer = n2 * gpus_per_node + g1 + c = chunk(rank, f'copy_{n2}', 0, gpus_per_node) + c = c.copy(ib_peer, Buffer.output, c.get_dst_index(), ch=((n1+n2) % gpus_per_node)*2+(rank%2)+2) + + + # Handle local chunks within a node + for rank in range(num_ranks): + for g in range(gpus_per_node): + index = (rank // gpus_per_node) * gpus_per_node + g + c = chunk(rank, Buffer.input, index) + c.copy(c.get_dst_rank(), Buffer.output, c.get_dst_index()) + + XML() # Prints the XML + Check() + + +parser = argparse.ArgumentParser() +parser.add_argument('num_nodes', type=int, help ='number of nodes') +parser.add_argument('gpus_per_node', type=int, help ='gpus per node') +parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple') +args = parser.parse_args() + + +alltoall_hierarchical(args.num_nodes, args.gpus_per_node, args.protocol) From a8656a36106a9072f276cb0b57a031d11f75b31f Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Fri, 24 Jun 2022 16:29:15 -0700 Subject: [PATCH 119/135] Hierarchical AllReduce (#38) * Hierarchical allreduce for two nodes --- examples/mscclang/hierarchical_allreduce.py | 89 +++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 examples/mscclang/hierarchical_allreduce.py diff --git a/examples/mscclang/hierarchical_allreduce.py b/examples/mscclang/hierarchical_allreduce.py new file mode 100644 index 0000000..7ad8727 --- /dev/null +++ b/examples/mscclang/hierarchical_allreduce.py @@ -0,0 +1,89 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import argparse + +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllReduce + +# Blue Connect style AllReduce https://proceedings.mlsys.org/paper/2019/file/9b8619251a19057cff70779273e95aa6-Paper.pdf +# Assumes only two-level switches + +def ring_reduce_scatter(size, rank_offset=0, rank_step=1, local_chunk_size=1, chunk_offset=0, chunk_stride=1, chan=-1): + for ch in range(0, size): + index = ch * chunk_stride * local_chunk_size + chunk_offset + for step in range(0, size-1): + other = chunk(((step+1+ch) % size)*rank_step +rank_offset, Buffer.input, index, local_chunk_size) + c = chunk(((step+2+ch) % size)*rank_step+rank_offset, Buffer.input, index, local_chunk_size) + c.reduce(other, ch=chan) + +def ring_all_gather(size, rank_offset=0, rank_step=1, local_chunk_size=1, chunk_offset=0, chunk_stride=1, chan=-1): + for ch in range(0, size): + index = ch * chunk_stride * local_chunk_size + chunk_offset + for step in range(0, size-1): + c = chunk(((step+ch) % size)*rank_step + rank_offset, Buffer.input, index, local_chunk_size) + c.copy(((step+1+ch) % size)*rank_step + rank_offset, Buffer.input, index, ch=chan) + +def hierarchical_allreduce(num_local_gpus, num_nodes, instances, protocol, schedule): + num_gpus = num_local_gpus * num_nodes + topology = fully_connected(num_gpus) + collective = AllReduce(num_gpus, num_gpus, True) + + with MSCCLProgram("hierarchical_allreduce", topology, collective, instances, protocol=protocol, + interleaved_replication=False): + + local_chunk_size = num_nodes + if schedule == 'auto': + for n in range(num_nodes): + for offset in range(num_nodes): + ring_reduce_scatter(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes) + + # Cross node Reduce-Scatter + for g in range(num_local_gpus): + ring_reduce_scatter(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes) + + # Cross node All-gather + for g in range(num_local_gpus): + ring_all_gather(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes) + + + # All gather within each node + for n in range(num_nodes): + for offset in range(num_nodes): + ring_all_gather(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes) + + else: + # Reduce Scatter within each node + for n in range(num_nodes): + for offset in range(num_nodes): + ring_reduce_scatter(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes, chan=offset) + + # Cross node Reduce-Scatter + for g in range(num_local_gpus): + ring_reduce_scatter(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes, chan=g%2+num_nodes*2) + + # Cross node All-gather + for g in range(num_local_gpus): + ring_all_gather(num_nodes, rank_offset=g, rank_step=num_local_gpus, chunk_offset=g*num_nodes, chan=g%2+num_nodes*2) + + + # All gather within each node + for n in range(num_nodes): + for offset in range(num_nodes): + ring_all_gather(num_local_gpus, rank_offset=n * num_local_gpus, chunk_offset=offset, chunk_stride=num_nodes, chan=offset+num_nodes) + + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('num_gpus', type=int, help='number of gpus per node') +parser.add_argument('num_nodes', type=int, help='number of nodes') +parser.add_argument('instances', type=int, help='number of instances') +parser.add_argument('--protocol', type=str, default='Simple', choices=['Simple', 'LL128', 'LL'], help='Protocol') 
+parser.add_argument('--schedule', type=str, default='auto', choices=['auto', 'manual'], help='Scheduling') + +args = parser.parse_args() + +hierarchical_allreduce(args.num_gpus, args.num_nodes, args.instances, args.protocol, args.schedule) + From 9b7adbfa9c13aa05e39d6033925eb481a9f15608 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 13 Jul 2022 04:09:13 +0000 Subject: [PATCH 120/135] 1 step allreduce --- examples/mscclang/allreduce_1step.py | 49 ++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 examples/mscclang/allreduce_1step.py diff --git a/examples/mscclang/allreduce_1step.py b/examples/mscclang/allreduce_1step.py new file mode 100644 index 0000000..45c75e1 --- /dev/null +++ b/examples/mscclang/allreduce_1step.py @@ -0,0 +1,49 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import argparse +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllReduce + +def allreduce_allpairs(gpus, instances, protocol): + size = gpus + chunksperloop = gpus + topology = fully_connected(size) + collective = AllReduce(size, chunksperloop, True) + with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, + interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True): + + # Each rank sends the nth chunk to the nth rank into scratch space + for rank in range(size): + tb = 0 + for nghr in range(size): + if rank != nghr: + c = chunk(rank, Buffer.input, index=0, size=size) + c.copy(nghr, 'scratch', sendtb=nghr, recvtb=rank) + tb += 1 + + # Each rank performs a local reduction on the nth chunk + # Utilize 8 threadblocks for this reduction for better parallelism + for rank in range(size): + index = 0 + tb = 0 + for nghr in range(size): + if rank != nghr: + for s in range(size): + c = chunk(rank, Buffer.input, s) + c.reduce(chunk(rank, 'scratch', index), sendtb=s) + index += 1 + tb += 1 + + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('num_gpus', type=int, help ='number of gpus') +parser.add_argument('instances', type=int, help='number of instances') +parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') + +args = parser.parse_args() + +allreduce_allpairs(args.num_gpus, args.instances, args.protocol) \ No newline at end of file From d55321cc8d1a44c5f9e2c6c166f380a2c91744fc Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 10 Aug 2022 06:12:32 +0000 Subject: [PATCH 121/135] new allpair allreduce --- .../mscclang/allreduce_a100_allpairs_v2.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100755 examples/mscclang/allreduce_a100_allpairs_v2.py diff --git a/examples/mscclang/allreduce_a100_allpairs_v2.py b/examples/mscclang/allreduce_a100_allpairs_v2.py new file mode 100755 index 0000000..11d7c1d --- /dev/null +++ b/examples/mscclang/allreduce_a100_allpairs_v2.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import argparse +from msccl.language import * +from msccl.topologies import * +from msccl.language.collectives import AllReduce +import math + +def allreduce_allpairs(gpus, instances, protocol): + size = gpus + chunksperloop = gpus + topology = fully_connected(size) + collective = AllReduce(size, chunksperloop, True) + with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol, + interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=False): + + # Each rank sends the nth chunk to the nth rank into scratch space + for r1 in range(size): + for r2 in range(size): + if r1 != r2: + index = r2 + c = chunk(r1, Buffer.input, index) + c.copy(r2, 'scratch', sendtb=r2, recvtb=r1) + + # Each rank performs a local reduction on the nth chunk + # Utilize 8 threadblocks for this reduction for better parallelism + for r in range(size): + for k in range(1,int(math.log2(size)+1)): + level = 2**k + for index in range(0, size//level): + if index == 0: + c = chunk(r, Buffer.input, r) + else: + c = chunk(r, 'scratch', (index-1)) + c.reduce(chunk(r, 'scratch', (index+size//level-1)), sendtb=index) + #c = chunk(r, Buffer.input, r*size + (index % size)) + #c.reduce(chunk(r, 'scratch', index), sendtb=(index % size)) + + # Each rank sends the fully reduced nth chunk to all other gpus + for r1 in range(size): + for r2 in range(size): + if r1 != r2: + index = r1 + c = chunk(r1, Buffer.input, index) + c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1) + + XML() + Check() + +parser = argparse.ArgumentParser() +parser.add_argument('num_gpus', type=int, help ='number of gpus') +parser.add_argument('instances', type=int, help='number of instances') +parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol') + +args = parser.parse_args() + +allreduce_allpairs(args.num_gpus, args.instances, args.protocol) From 6af8aad61ebd28d238a468be857a26890674261d Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 7 Sep 2022 13:02:20 -0700 Subject: [PATCH 122/135] Add support for reductions in ncclize (#40) Add support for reduction in ncclize, including concurrent reduction support. 
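To make the concurrent-reduction rule concrete before the diff, a standalone toy version of the categorization step (a simplified sketch, not the patched function itself; raw addresses stand in for the (buffer, offset) pairs the real code resolves). The first write into an uninitialized destination index stays a plain receive ('r'); any later write to the same index, and any write into already-initialized data, becomes a receive-reduce ('rrc'). The idx field is what the patch uses to serialize concurrent reductions into separate steps.

from collections import defaultdict

def categorize(sends, initialized):
    # sends: iterable of (addr, src, dst); initialized: dst -> set of addrs
    by_dest = defaultdict(list)
    for addr, src, dst in sends:
        by_dest[(dst, addr)].append(src)
    for (dst, addr), srcs in by_dest.items():
        for idx, src in enumerate(srcs):
            # first arrival into fresh memory is a plain receive ('r'),
            # every other arrival reduces into the destination ('rrc')
            op_type = 'r' if idx == 0 and addr not in initialized[dst] else 'rrc'
            yield addr, src, dst, op_type, idx

initialized = defaultdict(set)
initialized[2].add(1)                        # addr 1 already live on rank 2
sends = [(0, 1, 2), (0, 3, 2), (1, 0, 2)]    # two concurrent writes to addr 0
for op in categorize(sends, initialized):
    print(op)
# (0, 1, 2, 'r', 0), (0, 3, 2, 'rrc', 1), (1, 0, 2, 'rrc', 0)
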
--- msccl/ncclize.py | 222 +++++++++++++++++++++++++++-------------------- 1 file changed, 127 insertions(+), 95 deletions(-) diff --git a/msccl/ncclize.py b/msccl/ncclize.py index 6515bd3..f2ac2b8 100644 --- a/msccl/ncclize.py +++ b/msccl/ncclize.py @@ -334,6 +334,8 @@ def allocate_scratch(gpu, addr): # Analyze liveness of indices in buffers and remap scratch into input/output as possible liveness = _analyze_liveness(gpus, algorithm) _remap_scratch_into_input_output(liveness, gpus, logging) + if algorithm.collective.is_combining: + raise RuntimeError('Combining collectives are not supported yet with scratch remapping.') elif greedy_scratch_sorting: _greedy_scratch_sort(algorithm, gpus) else: @@ -385,15 +387,27 @@ def get_buffer_and_offset(gpu, addr): else: raise RuntimeError('Address is not mapped to a buffer') + def categorize_ops(sends, initialized): + sends_by_dest = defaultdict(set) + for addr, src, dst in sends: + dstbuf, dstoff = get_buffer_and_offset(gpus[dst], addr) + sends_by_dest[(dst, dstbuf, dstoff)].add((src, addr)) + for key in sends_by_dest: + dst, dstbuf, dstoff = key + for idx, (src, addr) in enumerate(sends_by_dest[key]): + # Receives into initialized buffer indices turn into reductions + op_type = 'r' if idx == 0 and not (dstbuf, dstoff) in initialized[dst] else 'rrc' + yield (addr, src, dst, op_type, idx) + def make_intervals(src, dst, addrs_set): if len(addrs_set) == 0: return buffs_and_offs = [] - for addr in addrs_set: + for addr, dst_op_type in addrs_set: srcbuff, srcoff = get_buffer_and_offset(gpus[src], addr) dstbuff, dstoff = get_buffer_and_offset(gpus[dst], addr) - buffs_and_offs.append((srcbuff, srcoff, dstbuff, dstoff)) + buffs_and_offs.append((srcbuff, srcoff, dstbuff, dstoff, dst_op_type)) if merge_contiguous: # Sort sends by both buffers and offsets and merge sends into larger intervals when both the source and @@ -404,10 +418,10 @@ def make_intervals(src, dst, addrs_set): def make_interval(a,b): cnt = b[1] - a[1] + 1 assert cnt == b[3] - a[3] + 1, 'Source and destination count mismatch' - return (a[0], a[1], a[2], a[3], cnt) + return (a[0], a[1], a[2], a[3], a[4], cnt) for x in buffs_and_offs[1:]: - if x[0] == prev[0] and x[1] == prev[1] + 1 and x[2] == prev[2] and x[3] == prev[3] + 1: + if x[0] == prev[0] and x[1] == prev[1] + 1 and x[2] == prev[2] and x[3] == prev[3] + 1 and x[4] == prev[4]: # Merge into previous interval if buffers match and the new offsets are at the end of the interval prev = x else: @@ -418,8 +432,8 @@ def make_interval(a,b): yield make_interval(start, prev) else: # Just yield size 1 intervals if merging is disabled - for srcbuff, srcoff, dstbuff, dstoff in buffs_and_offs: - yield (srcbuff, srcoff, dstbuff, dstoff, 1) + for srcbuff, srcoff, dstbuff, dstoff, dst_op_type in buffs_and_offs: + yield (srcbuff, srcoff, dstbuff, dstoff, dst_op_type, 1) # Turn all steps of the algorithm into operations ops_by_channel = defaultdict(list) @@ -427,6 +441,9 @@ def make_interval(a,b): writers = defaultdict(list) # Track all the reads since the last write to each buffer index readers = defaultdict(list) + # Track which addresses are initialized on each rank + initialized = [set(itertools.chain((('i', offset) for offset in gpu.inputs.values()), + ((copy.dst_buffer, copy.dst_offset) for copy in gpu.precopies))) for gpu in gpus.values()] # Initialize readers and writers for precopies for rank, gpu in gpus.items(): @@ -435,96 +452,111 @@ def make_interval(a,b): readers[(rank,op.src_buffer,op.src_offset+i)].append(op) 
writers[(rank,op.dst_buffer,op.dst_offset+i)].append(op) - for step_idx, step in enumerate(algorithm.steps): - new_writers = defaultdict(list) - new_readers = defaultdict(list) - - # Group sent addresses by edge - grouped_sends = defaultdict(set) - for addr, src, dst in step.sends: - grouped_sends[(src,dst)].add(addr) - - # Combine sends into intervals and create multiple instances if necessary - sends = [] - for (src, dst), addrs in grouped_sends.items(): - intervals = list(make_intervals(src, dst, addrs)) - if channel_policy == ChannelPolicy.One: - num_chans = 1 - channeled_intervals = [ (src_buf, src_off, dst_buf, dst_off, cnt, 0) for src_buf, src_off, dst_buf, dst_off, cnt in intervals ] - elif channel_policy == ChannelPolicy.MatchTopology: - # Divide sends onto channels matching the topology (assume bw is ideal concurrency) - # Sends are split to balance channels if necessary - num_chans = algorithm.topology.link(src,dst) - channeled_intervals = [] - - intervals.sort(key=lambda x: x[4]) - counts = [x[4] for x in intervals] - total = sum(counts) - targets = [(total//num_chans) + (1 if i < (total%num_chans) else 0) for i in range(num_chans)] - - chan = 0 - while len(intervals) > 0: - if targets[chan] >= counts[-1]: - i = -1 - else: - i = bisect.bisect_left(counts, targets[chan]) - if i == len(counts) or counts[i] != targets[chan]: + step_idx = 0 + for algo_step in algorithm.steps: + # Categorize and serialize sends + serialized_steps = [] + for addr, src, dst, dst_op_type, idx in categorize_ops(algo_step.sends, initialized): + if idx >= len(serialized_steps): + serialized_steps.extend([] for _ in range(idx - len(serialized_steps) + 1)) + serialized_steps[idx].append((addr, src, dst, dst_op_type)) + + for categorized_sends in serialized_steps: + new_writers = defaultdict(list) + new_readers = defaultdict(list) + + # Group sent addresses by edge + grouped_sends = defaultdict(set) + for addr, src, dst, dst_op_type in categorized_sends: + grouped_sends[(src,dst)].add((addr, dst_op_type)) + + # Combine sends into intervals and create multiple instances if necessary + sends = [] + for (src, dst), addrs in grouped_sends.items(): + intervals = list(make_intervals(src, dst, addrs)) + if channel_policy == ChannelPolicy.One: + num_chans = 1 + channeled_intervals = [ (src_buf, src_off, dst_buf, dst_off, dst_op_type, cnt, 0) for src_buf, src_off, dst_buf, dst_off, dst_op_type, cnt in intervals ] + elif channel_policy == ChannelPolicy.MatchTopology: + # Divide sends onto channels matching the topology (assume bw is ideal concurrency) + # Sends are split to balance channels if necessary + num_chans = algorithm.topology.link(src,dst) + channeled_intervals = [] + + intervals.sort(key=lambda x: x[-1]) + counts = [x[-1] for x in intervals] + total = sum(counts) + targets = [(total//num_chans) + (1 if i < (total%num_chans) else 0) for i in range(num_chans)] + + chan = 0 + while len(intervals) > 0: + if targets[chan] >= counts[-1]: i = -1 - src_buf, src_off, dst_buf, dst_off, cnt = intervals[i] - del intervals[i] - del counts[i] - if cnt > targets[chan]: - rem = cnt - targets[chan] - cnt = targets[chan] - j = bisect.bisect_left(counts, rem) - intervals.insert(j, (src_buf, src_off + cnt, dst_buf, dst_off + cnt, rem)) - counts.insert(j, rem) - - channeled_intervals.append((src_buf, src_off, dst_buf, dst_off, cnt, chan)) - targets[chan] -= cnt - assert targets[chan] >= 0 - if targets[chan] == 0: - chan += 1 - else: - assert False, 'Unhandled channel policy' - - for src_buf, src_off, dst_buf, dst_off, cnt, chan 
in channeled_intervals: - for i in range(instances): - new_src_off = src_off * instances + i * cnt - new_dst_off = dst_off * instances + i * cnt - send = (src, dst, src_buf, new_src_off, dst_buf, new_dst_off, cnt, chan * instances + i) - sends.append(send) - - # Perform dependency tracking and create _Op instances - for src, dst, src_buf, src_off, dst_buf, dst_off, cnt, chan in sends: - read_keys = [(src,src_buf,src_off+i) for i in range(cnt)] - # A send must wait for the previous recv (if any) to finish - send_depends = list(set(d for k in read_keys for d in writers[k])) - - write_keys = [(dst,dst_buf,dst_off+i) for i in range(cnt)] - # A receive must wait for both the previous recv and any previous sends to finish - recv_depends = list(set(d for deps in (readers, writers) for k in write_keys for d in deps[k])) - - send_op = _Op(src, dst, step_idx, True, 's', src_buf, src_off, dst_buf, dst_off, cnt, send_depends) - recv_op = _Op(dst, src, step_idx, False, 'r', src_buf, src_off, dst_buf, dst_off, cnt, recv_depends) - # Record the send and receive as a set of operations that must happen on the same channel - ops_by_channel[chan].extend([send_op, recv_op]) - - # Mark writers and readers to be added for the next step - for k in write_keys: - new_writers[k].append(recv_op) - for k in read_keys: - new_readers[k].append(send_op) - # Writes cut the dependency to both previous writes and reads - for key, deps in new_writers.items(): - if key in new_readers: - gpu, buf, off = key - raise RuntimeError(f'Encountered receive and send on the same buffer index on step {step_idx + 1} (gpu={gpu}, buf={buf}, off={off})') - writers[key] = deps - readers[key] = [] - # Reads get added to any previous reads - for key, deps in new_readers.items(): - readers[key].extend(deps) + else: + i = bisect.bisect_left(counts, targets[chan]) + if i == len(counts) or counts[i] != targets[chan]: + i = -1 + src_buf, src_off, dst_buf, dst_off, dst_op_type, cnt = intervals[i] + del intervals[i] + del counts[i] + if cnt > targets[chan]: + rem = cnt - targets[chan] + cnt = targets[chan] + j = bisect.bisect_left(counts, rem) + intervals.insert(j, (src_buf, src_off + cnt, dst_buf, dst_off + cnt, dst_op_type, rem)) + counts.insert(j, rem) + + channeled_intervals.append((src_buf, src_off, dst_buf, dst_off, dst_op_type, cnt, chan)) + targets[chan] -= cnt + assert targets[chan] >= 0 + if targets[chan] == 0: + chan += 1 + else: + assert False, 'Unhandled channel policy' + + for src_buf, src_off, dst_buf, dst_off, dst_op_type, cnt, chan in channeled_intervals: + for i in range(instances): + new_src_off = src_off * instances + i * cnt + new_dst_off = dst_off * instances + i * cnt + send = (src, dst, src_buf, new_src_off, dst_buf, new_dst_off, dst_op_type, cnt, chan * instances + i) + sends.append(send) + + # Perform dependency tracking and create _Op instances + for src, dst, src_buf, src_off, dst_buf, dst_off, dst_op_type, cnt, chan in sends: + read_keys = [(src,src_buf,src_off+i) for i in range(cnt)] + # A send must wait for the previous recv (if any) to finish + send_depends = list(set(d for k in read_keys for d in writers[k])) + + write_keys = [(dst,dst_buf,dst_off+i) for i in range(cnt)] + # A receive must wait for both the previous recv and any previous sends to finish + recv_depends = list(set(d for deps in (readers, writers) for k in write_keys for d in deps[k])) + + send_op = _Op(src, dst, step_idx, True, 's', src_buf, src_off, dst_buf, dst_off, cnt, send_depends) + recv_op = _Op(dst, src, step_idx, False, dst_op_type, src_buf, 
src_off, dst_buf, dst_off, cnt, recv_depends) + # Record the send and receive as a set of operations that must happen on the same channel + ops_by_channel[chan].extend([send_op, recv_op]) + + # Mark writers and readers to be added for the next step + for k in write_keys: + new_writers[k].append(recv_op) + for k in read_keys: + new_readers[k].append(send_op) + # Writes cut the dependency to both previous writes and reads + for key, deps in new_writers.items(): + if key in new_readers: + gpu, buf, off = key + raise RuntimeError(f'Encountered receive and send on the same buffer index on step {step_idx + 1} (gpu={gpu}, buf={buf}, off={off})') + writers[key] = deps + readers[key] = [] + # Reads get added to any previous reads + for key, deps in new_readers.items(): + readers[key].extend(deps) + # Update initialized sets + for ops in ops_by_channel.values(): + for op in ops: + if not op.is_send: + initialized[op.gpu].add((op.dst_buffer, op.dst_offset)) + step_idx += 1 # Add dependencies for postcopies for rank, gpu in gpus.items(): From 2e05f9003f59e9d5fb9a597af111973b63dbe108 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 7 Sep 2022 13:14:47 -0700 Subject: [PATCH 123/135] Allreduce composition with CLI support (#41) Implements the standard ReduceScatter+Allgather composition. Adds new "compose allreduce" CLI command. --- msccl/__main__.py | 1 + msccl/cli/__init__.py | 1 + msccl/cli/compose.py | 31 +++++++++++++++++++++++++++++++ msccl/composers.py | 26 ++++++++++++++++++++++++++ tests/test_cli.py | 8 ++++++++ 5 files changed, 67 insertions(+) create mode 100644 msccl/cli/compose.py create mode 100644 msccl/composers.py diff --git a/msccl/__main__.py b/msccl/__main__.py index 3551657..5f266c4 100755 --- a/msccl/__main__.py +++ b/msccl/__main__.py @@ -21,6 +21,7 @@ def main(): handlers = [] handlers.append(make_solvers(cmd_parsers)) + handlers.append(make_composers(cmd_parsers)) handlers.append(make_distributors(cmd_parsers)) handlers.append(make_analyses(cmd_parsers)) handlers.append(make_handle_ncclize(cmd_parsers)) diff --git a/msccl/cli/__init__.py b/msccl/cli/__init__.py index 734b440..f07721b 100755 --- a/msccl/cli/__init__.py +++ b/msccl/cli/__init__.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. from .solve import * +from .compose import * from .distribute import * from .analyze import * from .ncclize import * diff --git a/msccl/cli/compose.py b/msccl/cli/compose.py new file mode 100644 index 0000000..870cdb7 --- /dev/null +++ b/msccl/cli/compose.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+
+from msccl.composers import *
+from .common import *
+
+def make_composers(cmd_parsers):
+    handler_funcs = []
+    handler_funcs.append(make_handle_allreduce)
+
+    return make_cmd_category(cmd_parsers, 'compose', 'composer', handler_funcs)
+
+def make_handle_allreduce(cmd_parsers):
+    name = 'allreduce'
+    cmd = cmd_parsers.add_parser(name)
+    read_reducescatter_algorithm = add_input_algorithm(cmd, name="reducescatter-algorithm")
+    read_allgather_algorithm = add_input_algorithm(cmd, name="allgather-algorithm")
+    validate_output_args, output_handler = add_output_algorithm(cmd)
+
+    def handle(args, command):
+        if command != name:
+            return False
+
+        reducescatter_algorithm = read_reducescatter_algorithm(args)
+        allgather_algorithm = read_allgather_algorithm(args)
+        validate_output_args(args)
+        algo = compose_allreduce(reducescatter_algorithm, allgather_algorithm, logging=True)
+        output_handler(args, algo)
+        return True
+
+    return handle
\ No newline at end of file
diff --git a/msccl/composers.py b/msccl/composers.py
new file mode 100644
index 0000000..38b1f62
--- /dev/null
+++ b/msccl/composers.py
@@ -0,0 +1,26 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from msccl.collectives import allreduce
+from msccl.algorithm import *
+from msccl.instance import *
+
+def compose_allreduce(reducescatter_algo, allgather_algo, logging=False):
+    if reducescatter_algo.is_pipelined() or allgather_algo.is_pipelined():
+        raise ValueError('Pipelining is not supported.')
+
+    if reducescatter_algo.instance.chunks != allgather_algo.instance.chunks:
+        raise ValueError(f'ReduceScatter and Allgather must have the same chunks (got {reducescatter_algo.instance.chunks} and {allgather_algo.instance.chunks})')
+
+    if reducescatter_algo.topology.name != allgather_algo.topology.name:
+        # TODO: improve this check to check actual structure, not just name
+        raise ValueError(f'ReduceScatter and Allgather must have the same topology (got {reducescatter_algo.topology.name} and {allgather_algo.topology.name})')
+    topo = reducescatter_algo.topology
+
+    coll = allreduce(topo.num_nodes())
+
+    steps = reducescatter_algo.steps + allgather_algo.steps
+    instance = Instance(len(steps),
+        extra_rounds=reducescatter_algo.instance.extra_rounds+allgather_algo.instance.extra_rounds,
+        chunks=reducescatter_algo.instance.chunks)
+    return Algorithm.make_implementation(coll, topo, instance, steps)
\ No newline at end of file
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 38aa058..8e9e6ea 100755
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -126,3 +126,11 @@ def test_distribute_alltoall_subproblem():
         assert 0 == os.system('msccl distribute alltoall-stitch-subproblem subalgo.json --copies 2 -o stitched.json')
         assert os.path.exists('stitched.json')
         _check_ncclizes('stitched.json')
+
+def test_compose_allreduce():
+    with in_tempdir():
+        assert 0 == os.system('msccl solve instance DGX1 ReduceScatter -s 2 -r 3 -c 2 -o reducescatter.json')
+        assert 0 == os.system('msccl solve instance DGX1 Allgather -s 2 -r 3 -c 2 -o allgather.json')
+        assert 0 == os.system('msccl compose allreduce reducescatter.json allgather.json -o allreduce.json')
+        assert os.path.exists('allreduce.json')
+        _check_ncclizes('allreduce.json')
\ No newline at end of file
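Aside (not part of the patch series): the composition this patch implements rests on the standard identity that a ReduceScatter followed by an Allgather over the same chunking equals an Allreduce. A minimal, self-contained sketch on toy Python lists — no msccl imports, all names illustrative only:

```
# Each of 4 ranks starts with a full vector; rank r holds [r+1, r+1, r+1, r+1].
ranks = 4
data = [[r + 1] * ranks for r in range(ranks)]

# ReduceScatter: rank i ends up with the fully reduced i-th shard.
shards = [sum(data[r][i] for r in range(ranks)) for i in range(ranks)]

# Allgather: every rank collects all reduced shards -> the Allreduce result.
result = [shards[:] for _ in range(ranks)]
assert all(vec == [10, 10, 10, 10] for vec in result)
```

This identity is also why compose_allreduce above can simply concatenate the two algorithms' steps, as long as their chunk counts and topologies match.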
From 251dd26f9ae0ee40c8e2b0241c9f0e154b6516af Mon Sep 17 00:00:00 2001
From: Meghan Cowan
Date: Mon, 26 Sep 2022 17:20:21 -0700
Subject: [PATCH 124/135] Fix - no reference to chunks that doesn't exist (#42)

* Fix AllGather recursive doubling

* No reference to non-existent chunk
---
 examples/mscclang/allgather_recursive_doubling.py | 11 +++++------
 msccl/language/__init__.py                        |  2 ++
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/mscclang/allgather_recursive_doubling.py b/examples/mscclang/allgather_recursive_doubling.py
index a00074c..a6407ea 100755
--- a/examples/mscclang/allgather_recursive_doubling.py
+++ b/examples/mscclang/allgather_recursive_doubling.py
@@ -9,16 +9,15 @@
 # https://web.cels.anl.gov/~thakur/papers/mpi-coll.pdf
 def allgather_recursive_doubling(size, instances, protocol):
     topology = fully_connected(size)
-    collective = AllGather(size, instances, True)
-    with MSCCLProgram("allgather_recursive_doubling", topology, collective, 1, protocol=protocol, threadblock_policy=ThreadblockPolicy.manual):
+    collective = AllGather(size, 1, True)
+    with MSCCLProgram("allgather_recursive_doubling", topology, collective, instances, protocol=protocol, threadblock_policy=ThreadblockPolicy.manual):
         count = 1
         while count < size:
             # Every rank exchanges count chunks with neighbor count away
             for rank in range(size):
-                for i in range(instances):
-                    peer = rank ^ count
-                    index = ((rank // count) * count) * instances + i * count
-                    chunk(rank, Buffer.output, index, size=count).copy(peer, Buffer.output, index, sendtb=peer, recvtb=rank)
+                peer = rank ^ count
+                index = (rank // count) * count
+                chunk(rank, Buffer.output, index, size=count).copy(peer, Buffer.output, index, sendtb=peer, recvtb=rank)
             count *= 2

     XML()
diff --git a/msccl/language/__init__.py b/msccl/language/__init__.py
index 7e3bf95..1473849 100755
--- a/msccl/language/__init__.py
+++ b/msccl/language/__init__.py
@@ -142,6 +142,8 @@ def Print():
     _curr().print_chunk_dag()

 def chunk(rank, buffer, index, size=1):
+    if _curr().buffers[rank][buffer][index] is None:
+        return None
     return _curr().get_ref(rank, buffer, index, size)

 def create_scratch(rank, name):
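Aside (not part of the patch series): the guard added to chunk() means call sites can now test for missing data instead of tripping over a reference to a chunk that was never written. A hedged sketch of the calling pattern it enables; the wrapper function is hypothetical and assumes an active MSCCLProgram context as in the examples:

```
from msccl.language import *  # provides chunk() and Buffer in the examples

def copy_if_present(rank, peer, index):
    # With this patch, chunk() returns None for an empty buffer slot,
    # so a schedule can skip the transfer rather than crash.
    ref = chunk(rank, Buffer.output, index)
    if ref is not None:
        ref.copy(peer, Buffer.output, index)
```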
From 7279a5f23c2f89daeb1c81a76490866110af30c9 Mon Sep 17 00:00:00 2001
From: Meghan Cowan
Date: Thu, 27 Oct 2022 17:27:53 -0700
Subject: [PATCH 125/135] Fix/minmaxbytes (#43)

* Fix AllGather recursive doubling

* Capitalize minBytes
---
 msccl/autosynth/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/msccl/autosynth/__init__.py b/msccl/autosynth/__init__.py
index 57a909d..a362cf6 100755
--- a/msccl/autosynth/__init__.py
+++ b/msccl/autosynth/__init__.py
@@ -81,9 +81,9 @@ def init(machine_type, num_machines, *collectives):
             load_elem.set('path', path)
             minsize, maxsize, proto = params
             if minsize != 0:
-                load_elem.set('minbytes', str(minsize))
+                load_elem.set('minBytes', str(minsize))
             if maxsize != math.inf:
-                load_elem.set('maxbytes', str(maxsize))
+                load_elem.set('maxBytes', str(maxsize))
             load_elem.set('proto', proto)
             any_selected = True
     ET.indent(algos_elem, space=' ')

From 45aeb93f961841f6b8da62c9c56f942b79bc583d Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Mon, 31 Oct 2022 22:31:24 +0000
Subject: [PATCH 126/135] new algorithm for multinode

---
 .../allreduce_a100_multinode_allpairs.py | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100755 examples/mscclang/allreduce_a100_multinode_allpairs.py

diff --git a/examples/mscclang/allreduce_a100_multinode_allpairs.py b/examples/mscclang/allreduce_a100_multinode_allpairs.py
new file mode 100755
index 0000000..7b49a41
--- /dev/null
+++ b/examples/mscclang/allreduce_a100_multinode_allpairs.py
@@ -0,0 +1,71 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from msccl.language import *
+from msccl.topologies import *
+from msccl.language.collectives import AllReduce
+
+def allreduce_allpairs(gpus, instances, protocol):
+    size = gpus
+    chunksperloop = gpus * gpus
+    topology = fully_connected(2*size)
+    collective = AllReduce(2*size, chunksperloop, True)
+    with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol,
+        interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True):
+        # Each rank sends the nth chunk to the nth rank into scratch space
+        for r1 in range(size):
+            for r2 in range(size):
+                if r1 != r2:
+                    index = r2 * size
+                    c = chunk(r1, Buffer.input, index, size=size)
+                    c.copy(r2, 'scratch', sendtb=r2, recvtb=r1)
+
+                    c2 = chunk(r1+size, Buffer.input, index, size=size)
+                    c2.copy(r2+size, 'scratch', sendtb=r2, recvtb=r1)
+
+        # Each rank performs a local reduction on the nth chunk
+        # Utilize 8 threadblocks for this reduction for better parallelism
+        for r in range(size):
+            for index in range(0, size * (size-1)):
+                c = chunk(r, Buffer.input, r*size + (index % size))
+                c.reduce(chunk(r, 'scratch', index), sendtb=(index % size))
+
+                c2 = chunk(r+size, Buffer.input, r*size + (index % size))
+                c2.reduce(chunk(r+size, 'scratch', index), sendtb=(index % size))
+
+
+        for r in range(size):
+            index = r*size
+            c = chunk(r, Buffer.input, index, size)
+            c = c.copy(r+size, 'scratch2', index=0, sendtb=size, recvtb=size+1, ch=r%2)
+
+            c2 = chunk(r+size, Buffer.input, index, size)
+            c2 = c2.copy(r, 'scratch2', index=0, sendtb=size+2, recvtb=size+3, ch=r%2)
+
+            chunk(r, Buffer.input, index, size).reduce(c2, sendtb=size+3, recvtb=size+4, ch=r%2)
+            chunk(r+size, Buffer.input, index, size).reduce(c, sendtb=size+1, recvtb=size+1, ch=r%2)
+
+
+        # Each rank sends the fully reduced nth chunk to all other gpus
+        for r1 in range(size):
+            for r2 in range(size):
+                if r1 != r2:
+                    index = r1 * size
+                    c = chunk(r1, Buffer.input, index, size)
+                    c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1)
+
+                    c2 = chunk(r1+size, Buffer.input, index, size)
+                    c2.copy(r2+size, Buffer.input, index, sendtb=r2, recvtb=r1)
+
+        XML()
+        Check()
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_gpus', type=int, help ='number of gpus')
+parser.add_argument('instances', type=int, help='number of instances')
+parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol')
+
+args = parser.parse_args()
+
+allreduce_allpairs(args.num_gpus, args.instances, args.protocol)
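Aside (not part of the patch series): the all-pairs phases above follow a fixed indexing scheme that is easy to check in isolation. An illustrative sketch for size = 4 — plain Python, no msccl:

```
# Rank r1 sends the chunk block [r2*size, (r2+1)*size) into rank r2's scratch,
# so each rank r2 receives exactly the block it is responsible for reducing.
size = 4
sends = {(r1, r2): list(range(r2 * size, (r2 + 1) * size))
         for r1 in range(size) for r2 in range(size) if r1 != r2}
assert sends[(0, 1)] == [4, 5, 6, 7]
assert sends[(3, 0)] == [0, 1, 2, 3]
```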
From afabc1269ac77e9c5a21b337ae5fc2b3ce6eff0a Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Tue, 1 Nov 2022 22:23:58 +0000
Subject: [PATCH 127/135] pipeline parallelism algorithms

---
 examples/mscclang/pipeline_a100_allpairs.py | 52 +++++++++++++++++++++
 examples/mscclang/pipeline_a100_ring.py     | 49 +++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100755 examples/mscclang/pipeline_a100_allpairs.py
 create mode 100755 examples/mscclang/pipeline_a100_ring.py

diff --git a/examples/mscclang/pipeline_a100_allpairs.py b/examples/mscclang/pipeline_a100_allpairs.py
new file mode 100755
index 0000000..1f5ec30
--- /dev/null
+++ b/examples/mscclang/pipeline_a100_allpairs.py
@@ -0,0 +1,52 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from msccl.language import *
+from msccl.topologies import *
+from msccl.language.collectives import AllReduce
+
+def allreduce_allpairs(gpus, instances, protocol):
+    size = gpus
+    chunksperloop = gpus * gpus
+    topology = fully_connected(2*size)
+    collective = AllReduce(2*size, chunksperloop, True)
+    with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol,
+        interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True):
+
+        # Each rank sends the nth chunk to the nth rank into scratch space
+        for r1 in range(size):
+            for r2 in range(size):
+                if r1 != r2:
+                    index = r2 * size
+                    c = chunk(r1, Buffer.input, index, size=size)
+                    c.copy(r2, 'scratch', sendtb=r2, recvtb=r1)
+
+        # Each rank performs a local reduction on the nth chunk
+        # Utilize 8 threadblocks for this reduction for better parallelism
+        for r in range(size):
+            for index in range(0, size * (size-1)):
+                c = chunk(r, Buffer.input, r*size + (index % size))
+                c.reduce(chunk(r, 'scratch', index), sendtb=(index % size))
+            c = chunk(r, Buffer.input, r*size, size=size)
+            c.copy(r+size, Buffer.input, r*size, ch=r%2)
+
+        # Each rank sends the fully reduced nth chunk to all other gpus
+        for r1 in range(size):
+            for r2 in range(size):
+                if r1 != r2:
+                    index = r1 * size
+                    c = chunk(r1+size, Buffer.input, index, size)
+                    c.copy(r2+size, Buffer.input, index, sendtb=r2, recvtb=r1)
+
+        XML()
+        #Check()
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_gpus', type=int, help ='number of gpus')
+parser.add_argument('instances', type=int, help='number of instances')
+parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol')
+
+args = parser.parse_args()
+
+allreduce_allpairs(args.num_gpus, args.instances, args.protocol)
\ No newline at end of file
diff --git a/examples/mscclang/pipeline_a100_ring.py b/examples/mscclang/pipeline_a100_ring.py
new file mode 100755
index 0000000..7a443f1
--- /dev/null
+++ b/examples/mscclang/pipeline_a100_ring.py
@@ -0,0 +1,49 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from msccl.language import *
+from msccl.topologies import *
+from msccl.language.collectives import AllReduce
+
+# Ring all reduce for A100s
+# Vary channels from [1-8] to divide parts of the ring over multiple channels/tbs.
+# channels=1 is standard ring, all chunks are assigned to the same tb/channel
+# channels=8 devotes 1 tb/channel to handling 1 chunk of the data
+def allreduce_ring(size, instances, channels, protocol):
+    topology = fully_connected(2*size)
+    collective = AllReduce(2*size, size, True)
+    with MSCCLProgram(f"allreduce_ring_{channels}channelsperring", topology, collective, instances,
+        protocol=protocol, threadblock_policy=ThreadblockPolicy.manual):
+        # Reduce ring
+        for step in range(0, size-1):
+            for index in range(0, size):
+                rank = (index + step) % size
+                next_rank = (index + step + 1) % size
+                channel = index%channels
+                c = chunk(next_rank, Buffer.input, index)
+                c.reduce(chunk(rank, Buffer.input, index), ch=channel, recvtb=channel, sendtb=channel)
+        # Propagate ring
+        for index in range(0, size):
+            rank = (index - 1) % size
+            c = chunk(rank, Buffer.input, index)
+            c.copy(rank+size, Buffer.input, index, ch=rank%2)
+        for step in range(-1, size-2):
+            for index in range(0, size):
+                rank = (index + step) % size
+                c = chunk(rank+size, Buffer.input, index)
+                next_rank = (index + step + 1) % size
+                channel = index%channels
+                c = c.copy(next_rank+size, Buffer.input, index, ch=channel, recvtb=channel, sendtb=channel)
+
+        XML()
+# Check()
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_gpus', type=int, help ='number of gpus')
+parser.add_argument('channels', type=int, help='Number of channels to use for 1 instance of the ring [1-8]')
+parser.add_argument('instances', type=int, help='number of instances')
+parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128')
args = parser.parse_args()
+
+allreduce_ring(args.num_gpus, args.instances, args.channels, args.protocol)
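Aside (not part of the patch series): the channels knob in the ring algorithm assigns chunk index i to channel i % channels, spreading segments of the ring over threadblock/channel pairs exactly as the file's header comment describes. Illustrative only:

```
size, channels = 4, 2
by_channel = {}
for index in range(size):
    by_channel.setdefault(index % channels, []).append(index)
assert by_channel == {0: [0, 2], 1: [1, 3]}
# channels=1 keeps the whole ring on one threadblock/channel pair, while
# channels=size gives every chunk its own, trading threadblocks for overlap.
```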
From a8485bf62fb076f5b7cc8c5d66f6811472b409f9 Mon Sep 17 00:00:00 2001
From: Meghan Cowan
Date: Thu, 17 Nov 2022 11:01:45 -0800
Subject: [PATCH 128/135] Update README.md (#44)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b43f8d3..257296c 100755
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ on-going research project. See [this readme](SYNTHESIS.md) for using MSCCL as a
 To install either clone this repo and run "`pip install .`" or run:
 ```
-pip install git+https://github.com/microsoft/msccl.git
+pip install git+https://github.com/microsoft/msccl-tools.git
 ```

 Installing the MSCCL Python package also installs the `msccl` command line tool.
 To enable Bash completion for the
From edb7861ba5cd5ab7bef70157667e5bac340291d0 Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Thu, 20 Apr 2023 22:58:22 +0000
Subject: [PATCH 129/135] ncv4 allreduce

---
 .../mscclang/allreduce_a100_recursive_doubling_halving.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/mscclang/allreduce_a100_recursive_doubling_halving.py b/examples/mscclang/allreduce_a100_recursive_doubling_halving.py
index 67a06eb..757216a 100755
--- a/examples/mscclang/allreduce_a100_recursive_doubling_halving.py
+++ b/examples/mscclang/allreduce_a100_recursive_doubling_halving.py
@@ -11,7 +11,7 @@


 def allreduce(ways, instances, protocol):
-    topology = fully_connected(8)
+    topology = fully_connected(4)
     size = topology.num_nodes() # Number of gpus
     logical_chunk = 8 * ways
     collective = AllReduce(size, logical_chunk, True)
@@ -29,7 +29,7 @@ def recursive_doubling(pairs, count, next_index, lc, sendtb, recvtb):
                 index = current_index[r] + offset + lc*8 + x
                 c1 = chunk(r, Buffer.input, index)
                 c = chunk(next, Buffer.input, index)
-                c.reduce(c1 ch=lc, sendtb=sendtb, recvtb=recvtb)
+                c.reduce(c1, sendtb=sendtb, recvtb=recvtb)


 # Propagates reduced chunks in reverse order
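Aside (not part of the patch series): the recursive doubling structure that this file — and the allgather example fixed earlier in the series — relies on pairs rank r with rank r ^ count, doubling count each round. A small self-check in plain Python:

```
size = 8
count = 1
rounds = []
while count < size:
    rounds.append(sorted({tuple(sorted((r, r ^ count))) for r in range(size)}))
    count *= 2
assert rounds[0] == [(0, 1), (2, 3), (4, 5), (6, 7)]
assert rounds[2] == [(0, 4), (1, 5), (2, 6), (3, 7)]
```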
From aafcc1908c9cd99b51f64fe4f72abba2ac866031 Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Fri, 21 Apr 2023 23:26:05 +0000
Subject: [PATCH 130/135] new algo

---
 examples/mscclang/allreduce_a100_ncv4.py | 47 ++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100755 examples/mscclang/allreduce_a100_ncv4.py

diff --git a/examples/mscclang/allreduce_a100_ncv4.py b/examples/mscclang/allreduce_a100_ncv4.py
new file mode 100755
index 0000000..64cbf1d
--- /dev/null
+++ b/examples/mscclang/allreduce_a100_ncv4.py
@@ -0,0 +1,47 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from msccl.language import *
+from msccl.topologies import *
+from msccl.language.collectives import AllReduce
+
+def allreduce_allpairs(gpus, instances, protocol):
+    size = gpus
+    chunksperloop = 2
+    topology = fully_connected(size)
+    collective = AllReduce(size, chunksperloop, True)
+    with MSCCLProgram("allreduce_ncv4", topology, collective, instances, protocol=protocol,
+        interleaved_replication=False, dependence_nop=True):
+        for chnk in range(chunksperloop):
+            for r in range(size):
+                if ((r % 2) == chnk):
+                    c = chunk(r, Buffer.input, chnk)
+                    c.reduce(chunk(r + 1 - 2 * chnk, Buffer.input, chnk))
+
+            for r in range(size):
+                if ((r % 2) == chnk):
+                    c = chunk(r, Buffer.input, chnk)
+                    c.copy((r+2) % size, 'scratch', chnk)
+
+            for r in range(size):
+                if ((r % 2) == chnk):
+                    c = chunk(r, Buffer.input, chnk)
+                    c.reduce(chunk(r, 'scratch', chnk))
+
+            for r in range(size):
+                if ((r % 2) == chnk):
+                    c = chunk(r, Buffer.input, chnk)
+                    c.copy(r + 1 - 2 * chnk, Buffer.input, chnk)
+
+        XML()
+        Check()
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_gpus', type=int, help ='number of gpus')
+parser.add_argument('instances', type=int, help='number of instances')
+parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol')
+
+args = parser.parse_args()
+
+allreduce_allpairs(args.num_gpus, args.instances, args.protocol)
\ No newline at end of file
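Aside (not part of the patch series): the ncv4 schedule above splits the work by chunk parity — even ranks drive chunk 0 and odd ranks drive chunk 1 symmetrically. A sketch of the derived roles for size = 4, illustrative only:

```
size, chnk = 4, 0
owners = [r for r in range(size) if r % 2 == chnk]    # ranks driving chunk 0
partners = {r: r + 1 - 2 * chnk for r in owners}       # local reduce partner
exchange = {r: (r + 2) % size for r in owners}         # cross-pair scratch peer
assert owners == [0, 2]
assert partners == {0: 1, 2: 3}
assert exchange == {0: 2, 2: 0}
# Phases: reduce the partner's copy, swap partials via scratch, reduce the
# received scratch chunk, then copy the finished chunk back to the partner.
```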
From 23d24abb0f7b1c4e0414d9ecbf1abf98bffc1531 Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Wed, 26 Apr 2023 20:08:59 +0000
Subject: [PATCH 131/135] another ncv4 algorithm

---
 examples/mscclang/allreduce_a100_ncv4_v2.py | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100755 examples/mscclang/allreduce_a100_ncv4_v2.py

diff --git a/examples/mscclang/allreduce_a100_ncv4_v2.py b/examples/mscclang/allreduce_a100_ncv4_v2.py
new file mode 100755
index 0000000..7c0413c
--- /dev/null
+++ b/examples/mscclang/allreduce_a100_ncv4_v2.py
@@ -0,0 +1,41 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from msccl.language import *
+from msccl.topologies import *
+from msccl.language.collectives import AllReduce
+
+def tree_algo(tree, chnk, size):
+    for i in range(size-1):
+        nextNghr = tree[i+1]
+        curNode = tree[i]
+        c = chunk(nextNghr, Buffer.input, chnk)
+        c.reduce(chunk(curNode, Buffer.input, chnk), sendtb=2*chnk, recvtb=2*chnk)
+    for i in range(size-1):
+        curNode = tree[size-1-i]
+        nextNghr = tree[size-1-i-1]
+        c = chunk(curNode, Buffer.input, chnk)
+        c.copy(nextNghr, Buffer.input, chnk, sendtb=2*chnk+1, recvtb=2*chnk+1)
+
+def allreduce_allpairs(gpus, instances, protocol):
+    size = gpus
+    chunksperloop = 2
+    topology = fully_connected(size)
+    collective = AllReduce(size, chunksperloop, True)
+    with MSCCLProgram("allreduce_ncv4", topology, collective, instances, protocol=protocol,
+        interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True):
+        tree_algo([3,2,1,0], 0, size)
+        tree_algo([2,3,0,1], 1, size)
+
+        XML()
+        Check()
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_gpus', type=int, help ='number of gpus')
+parser.add_argument('instances', type=int, help='number of instances')
+parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol')
+
+args = parser.parse_args()
+
+allreduce_allpairs(args.num_gpus, args.instances, args.protocol)
\ No newline at end of file

From 430cc81045b7d95b222039bdad8cc51d3b304b59 Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Wed, 26 Apr 2023 20:29:48 +0000
Subject: [PATCH 132/135] bug fix

---
 examples/mscclang/allreduce_a100_ncv4_v2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/mscclang/allreduce_a100_ncv4_v2.py b/examples/mscclang/allreduce_a100_ncv4_v2.py
index 7c0413c..27da358 100755
--- a/examples/mscclang/allreduce_a100_ncv4_v2.py
+++ b/examples/mscclang/allreduce_a100_ncv4_v2.py
@@ -11,12 +11,12 @@ def tree_algo(tree, chnk, size):
         nextNghr = tree[i+1]
         curNode = tree[i]
         c = chunk(nextNghr, Buffer.input, chnk)
-        c.reduce(chunk(curNode, Buffer.input, chnk), sendtb=2*chnk, recvtb=2*chnk)
+        c.reduce(chunk(curNode, Buffer.input, chnk), sendtb=2*chnk, recvtb=2*chnk, ch=chnk)
     for i in range(size-1):
         curNode = tree[size-1-i]
         nextNghr = tree[size-1-i-1]
         c = chunk(curNode, Buffer.input, chnk)
-        c.copy(nextNghr, Buffer.input, chnk, sendtb=2*chnk+1, recvtb=2*chnk+1)
+        c.copy(nextNghr, Buffer.input, chnk, sendtb=2*chnk+1, recvtb=2*chnk+1, ch=chnk)

 def allreduce_allpairs(gpus, instances, protocol):
     size = gpus
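Aside (not part of the patch series): tree_algo reduces a chunk along the given rank list and then broadcasts the result back along the same path in reverse. The traffic it generates can be enumerated directly — illustrative only:

```
tree = [3, 2, 1, 0]
reduce_edges = [(tree[i], tree[i + 1]) for i in range(len(tree) - 1)]
copy_edges = [(tree[-1 - i], tree[-2 - i]) for i in range(len(tree) - 1)]
assert reduce_edges == [(3, 2), (2, 1), (1, 0)]  # partial sums flow toward rank 0
assert copy_edges == [(0, 1), (1, 2), (2, 3)]    # the finished result flows back out
```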
From 0dc07809b0fabba6a622f5b277be6d0de6d72472 Mon Sep 17 00:00:00 2001
From: pash-msft
Date: Fri, 28 Apr 2023 00:45:06 +0000
Subject: [PATCH 133/135] AllGather algos

---
 examples/mscclang/allgather_allpairs.py  |  35 ++++++
 .../allreduce_a100_pcie_hierarchical.py  | 106 ++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 examples/mscclang/allgather_allpairs.py
 create mode 100644 examples/mscclang/allreduce_a100_pcie_hierarchical.py

diff --git a/examples/mscclang/allgather_allpairs.py b/examples/mscclang/allgather_allpairs.py
new file mode 100644
index 0000000..fdfb202
--- /dev/null
+++ b/examples/mscclang/allgather_allpairs.py
@@ -0,0 +1,35 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from msccl.language import *
+from msccl.topologies import *
+from msccl.language.collectives import AllGather
+
+# Allpairs allgather for A100
+def allgather_allpairs(gpus, instances, protocol):
+    size = gpus
+    topology = fully_connected(gpus)
+    collective = AllGather(size, size, True)
+
+    with MSCCLProgram(f"allgather_allpairs", topology, collective, instances,
+        protocol=protocol, threadblock_policy=ThreadblockPolicy.manual):
+
+        # Each rank sends its nth chunk to all other gpus
+        for r1 in range(gpus):
+            for r2 in range(gpus):
+                if r1 != r2:
+                    index = 0
+                    c = chunk(r1, Buffer.input, index, size)
+                    c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1)
+        XML()
+        Check()
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_gpus', type=int, help ='number of gpus')
+parser.add_argument('instances', type=int, help='number of instances')
+parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple')
+args = parser.parse_args()
+
+allgather_allpairs(args.num_gpus, args.instances, args.protocol)
\ No newline at end of file
diff --git a/examples/mscclang/allreduce_a100_pcie_hierarchical.py b/examples/mscclang/allreduce_a100_pcie_hierarchical.py
new file mode 100644
index 0000000..3ea7460
--- /dev/null
+++ b/examples/mscclang/allreduce_a100_pcie_hierarchical.py
@@ -0,0 +1,106 @@
+import argparse
+from msccl.language import *
+from msccl.topologies import *
+from msccl.language.collectives import AllReduce
+
+def allpairs_reduce_scatter(gpuIds, size, offset):
+    ngpus = len(gpuIds)
+
+    # Each rank sends the nth chunk to the nth rank into scratch space
+    for r1 in range(ngpus):
+        for r2 in range(ngpus):
+            if gpuIds[r1] != gpuIds[r2]:
+                index = offset + r2 * size
+                c = chunk(gpuIds[r1], Buffer.input, index, size=size)
+                c.copy(gpuIds[r2], 'scratch', sendtb=gpuIds[r2], recvtb=gpuIds[r1])
+
+    # Each rank performs a local reduction on the nth chunk
+    # Utilize 8 threadblocks for this reduction for better parallelism
+    for r in range(ngpus):
+        for index in range(0, size * (ngpus-1)):
+            c = chunk(gpuIds[r], Buffer.input, offset + r*size + (index % size))
+            c.reduce(chunk(gpuIds[r], 'scratch', index), sendtb=(index % size))
+
+
+def allpairs_all_gather(gpuIds, size, offset):
+    ngpus = len(gpuIds)
+
+    # Each rank sends its nth chunk to all other gpus
+    for r1 in range(ngpus):
+        for r2 in range(ngpus):
+            if r1 != r2:
+                index = offset + r1 * size
+                c = chunk(gpuIds[r1], Buffer.input, index, size)
+                c.copy(gpuIds[r2], Buffer.input, index, sendtb=gpuIds[r2], recvtb=gpuIds[r1])
+
+# Performs two levels of allReduce
+def hierarchical_allreduce(gpus, instances, protocol):
+    ncols = 2
+    nrows = gpus // ncols
+    chunkperloop = gpus * gpus
+    topology = fully_connected(gpus)
+    collective = AllReduce(gpus, chunkperloop, True)
+
+    with MSCCLProgram("hierarchical_allreduce", topology, collective, instances, protocol=protocol,
+        interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True):
+
+        # A 4 x 3 GPU arrangement: 4 local GPUs, 3 instances, GPU Ids are numbered as such
+        # 0 4 8
+        # 1 5 9
+        # 2 6 10
+        # 3 7 11
+        # Reduce-Scatter on each column first, the assumption being that GPUs in a column have faster connectivity - NVLINK
+        # Each GPU exchanges (nrows - 1) * 1/nrows of the data with other GPUs in the same column
+        # After this step, the first GPU in each column holds the 1st 1/nrows of the data reduced, the 2nd GPU the 2nd 1/nrows, and so on
+        size = chunkperloop // nrows
+        offset = 0
+        for n in range(ncols):
+            gpuIds = []
+            for m in range(nrows): # collect all GPU Ids in a column
+                gpuIds.append( n * nrows + m)
+
+            allpairs_reduce_scatter(gpuIds, size, 0)
+
+        # Reduce-Scatter across rows, the assumption being that GPUs in a row have slower connectivity - PCIe, IP NW
+        # Each GPU exchanges (1 / (nrows * ncols)) * (ncols - 1) of the data with other GPUs in the same row - less data is exchanged
+        # After this step, the first GPU in each row holds the 1st 1/(nrows * ncols) of the data reduced, the 2nd the 2nd 1/(nrows * ncols), and so on
+        offset = size
+        size = chunkperloop // (nrows * ncols)
+        for n in range(nrows):
+            gpuIds = []
+            for m in range(ncols):
+                gpuIds.append(n + m * nrows)
+
+            allpairs_reduce_scatter(gpuIds, size, offset * n)
+
+        # AllGather: the AllGather phase goes in reverse order, first gathering across each row of GPUs
+        # After this step, each GPU in a row holds 1/ncols of the data
+        for n in range(nrows):
+            gpuIds = []
+            for m in range(ncols):
+                gpuIds.append(n + m * nrows)
+
+            allpairs_all_gather(gpuIds, size, offset * n)
+
+        # AllGather: the 2nd AllGather goes across each column of GPUs
+        # After this step, each GPU in the system holds the complete reduced data
+        size = chunkperloop // nrows
+        offset = 0
+        for n in range(ncols):
+            gpuIds = []
+            for m in range(nrows):
+                gpuIds.append( n * nrows + m)
+
+            allpairs_all_gather(gpuIds, size, 0)
+
+        XML()
+        Check()
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_gpus', type=int, help ='number of gpus')
+parser.add_argument('instances', type=int, help='number of instances')
+parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol')
+
+args = parser.parse_args()
+
+hierarchical_allreduce(args.num_gpus, args.instances, args.protocol)
\ No newline at end of file
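Aside (not part of the patch series): the column and row groupings that hierarchical_allreduce builds can be checked on their own. A sketch for gpus = 8 and the file's default ncols = 2, written in plain Python to mirror the loops above:

```
gpus, ncols = 8, 2
nrows = gpus // ncols
columns = [[n * nrows + m for m in range(nrows)] for n in range(ncols)]
rows = [[n + m * nrows for m in range(ncols)] for n in range(nrows)]
assert columns == [[0, 1, 2, 3], [4, 5, 6, 7]]  # fast (NVLink) reduce-scatter groups
assert rows == [[0, 4], [1, 5], [2, 6], [3, 7]]  # slower (PCIe/IP) cross-node groups
```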
From db171e04d57f54756fc2065bc875493ac768accb Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Fri, 28 Apr 2023 04:18:32 +0000
Subject: [PATCH 134/135] new algorithms for allgather

---
 examples/mscclang/allgather_a100_pcie.py | 42 ++++++++++++++++++++
 examples/mscclang/allreduce_a100_ncv4.py | 10 +++---
 2 files changed, 47 insertions(+), 5 deletions(-)
 create mode 100644 examples/mscclang/allgather_a100_pcie.py

diff --git a/examples/mscclang/allgather_a100_pcie.py b/examples/mscclang/allgather_a100_pcie.py
new file mode 100644
index 0000000..06d9ae3
--- /dev/null
+++ b/examples/mscclang/allgather_a100_pcie.py
@@ -0,0 +1,42 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from msccl.language import *
+from msccl.topologies import *
+from msccl.language.collectives import AllGather
+
+# Allpairs allgather for A100
+def allgather_allpairs(gpus, instances, protocol):
+    size = gpus
+    chunksperloop = 1
+    topology = fully_connected(gpus)
+    collective = AllGather(size, chunksperloop, True)
+
+    with MSCCLProgram("allgather_hierarchical", topology, collective, instances, protocol=protocol,
+        interleaved_replication=True, dependence_nop=True):
+        for chnk in range(2):
+            for r in range(size):
+                if ((r % 2) == chnk):
+                    c = chunk(r, Buffer.input, 0)
+                    c.copy(r + 1 - 2 * chnk, Buffer.output, r)
+            for r in range(size):
+                if ((r % 2) == chnk):
+                    c = chunk(r, Buffer.input, 0)
+                    c.copy((r+2) % size, Buffer.output, r)
+            for r in range(size):
+                if ((r % 2) == chnk):
+                    c = chunk(r, Buffer.output, (r+2) % size)
+                    c.copy(r + 1 - 2 * chnk, Buffer.output, (r+2) % size)
+
+        XML()
+        Check()
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_gpus', type=int, help ='number of gpus')
+parser.add_argument('instances', type=int, help='number of instances')
+parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple')
+args = parser.parse_args()
+
+allgather_allpairs(args.num_gpus, args.instances, args.protocol)
\ No newline at end of file
diff --git a/examples/mscclang/allreduce_a100_ncv4.py b/examples/mscclang/allreduce_a100_ncv4.py
index 64cbf1d..c3f8a06 100755
--- a/examples/mscclang/allreduce_a100_ncv4.py
+++ b/examples/mscclang/allreduce_a100_ncv4.py
@@ -12,27 +12,27 @@ def allreduce_allpairs(gpus, instances, protocol):
     topology = fully_connected(size)
     collective = AllReduce(size, chunksperloop, True)
     with MSCCLProgram("allreduce_ncv4", topology, collective, instances, protocol=protocol,
-        interleaved_replication=False, dependence_nop=True):
+        interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True):
         for chnk in range(chunksperloop):
             for r in range(size):
                 if ((r % 2) == chnk):
                     c = chunk(r, Buffer.input, chnk)
-                    c.reduce(chunk(r + 1 - 2 * chnk, Buffer.input, chnk))
+                    c.reduce(chunk(r + 1 - 2 * chnk, Buffer.input, chnk), sendtb=0, recvtb=0, ch=0)

             for r in range(size):
                 if ((r % 2) == chnk):
                     c = chunk(r, Buffer.input, chnk)
-                    c.copy((r+2) % size, 'scratch', chnk)
+                    c.copy((r+2) % size, 'scratch', chnk, sendtb=1, recvtb=1, ch=0)

             for r in range(size):
                 if ((r % 2) == chnk):
                     c = chunk(r, Buffer.input, chnk)
-                    c.reduce(chunk(r, 'scratch', chnk))
+                    c.reduce(chunk(r, 'scratch', chnk), sendtb=1, recvtb=1, ch=0)

             for r in range(size):
                 if ((r % 2) == chnk):
                     c = chunk(r, Buffer.input, chnk)
-                    c.copy(r + 1 - 2 * chnk, Buffer.input, chnk)
+                    c.copy(r + 1 - 2 * chnk, Buffer.input, chnk, sendtb=2, recvtb=2, ch=1)

         XML()
         Check()
From 030a750fc56e03fb863ffe48268672d84d4cdc7b Mon Sep 17 00:00:00 2001
From: Saeed Maleki
Date: Fri, 28 Apr 2023 05:18:18 +0000
Subject: [PATCH 135/135] clean ups

---
 examples/mscclang/allgather_a100_pcie.py | 6 +++---
 examples/mscclang/allgather_allpairs.py  | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/mscclang/allgather_a100_pcie.py b/examples/mscclang/allgather_a100_pcie.py
index 06d9ae3..29e7cad 100644
--- a/examples/mscclang/allgather_a100_pcie.py
+++ b/examples/mscclang/allgather_a100_pcie.py
@@ -6,8 +6,8 @@
 from msccl.topologies import *
 from msccl.language.collectives import AllGather

-# Allpairs allgather for A100
-def allgather_allpairs(gpus, instances, protocol):
+# Hierarchical allgather for A100
+def allgather_hier(gpus, instances, protocol):
     size = gpus
     chunksperloop = 1
     topology = fully_connected(gpus)
@@ -39,4 +39,4 @@ def allgather_allpairs(gpus, instances, protocol):
 parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: Simple')
 args = parser.parse_args()

-allgather_allpairs(args.num_gpus, args.instances, args.protocol)
\ No newline at end of file
+allgather_hier(args.num_gpus, args.instances, args.protocol)
\ No newline at end of file
diff --git a/examples/mscclang/allgather_allpairs.py b/examples/mscclang/allgather_allpairs.py
index fdfb202..fdd926d 100644
--- a/examples/mscclang/allgather_allpairs.py
+++ b/examples/mscclang/allgather_allpairs.py
@@ -10,7 +10,7 @@
 def allgather_allpairs(gpus, instances, protocol):
     size = gpus
     topology = fully_connected(gpus)
-    collective = AllGather(size, size, True)
+    collective = AllGather(size, 1, True)

     with MSCCLProgram(f"allgather_allpairs", topology, collective, instances,
         protocol=protocol, threadblock_policy=ThreadblockPolicy.manual):
@@ -20,7 +20,7 @@ def allgather_allpairs(gpus, instances, protocol):
         for r2 in range(gpus):
             if r1 != r2:
                 index = 0
-                c = chunk(r1, Buffer.input, index, size)
+                c = chunk(r1, Buffer.input, index, 1)
                 c.copy(r2, Buffer.input, index, sendtb=r2, recvtb=r1)
     XML()
     Check()
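Closing aside (not part of the patch series): every example script in these patches follows the same scaffold — parse num_gpus/instances/protocol, declare a topology and a collective, schedule chunk movements inside an MSCCLProgram, then emit the XML plan and verify it. A hedged summary sketch; the program name and the schedule body are placeholders, and a real script must fill in the schedule before Check() can pass:

```
import argparse
from msccl.language import *
from msccl.topologies import *
from msccl.language.collectives import AllGather

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('num_gpus', type=int, help='number of gpus')
    parser.add_argument('instances', type=int, help='number of instances')
    parser.add_argument('--protocol', type=str, default='LL128',
                        choices=['Simple', 'LL', 'LL128'], help='NCCL protocol')
    args = parser.parse_args()

    topology = fully_connected(args.num_gpus)
    collective = AllGather(args.num_gpus, 1, True)
    with MSCCLProgram('example', topology, collective, args.instances,
                      protocol=args.protocol):
        # ... chunk()/copy()/reduce() schedule goes here ...
        XML()    # print the generated algorithm
        Check()  # verify it implements the collective

if __name__ == '__main__':
    main()
```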