diff --git a/docs/perseus/index.md b/docs/perseus/index.md index 8d92a0b2..6958fd71 100644 --- a/docs/perseus/index.md +++ b/docs/perseus/index.md @@ -5,6 +5,9 @@ Currently, we have all the low-level APIs in place, but it's not a turnkey solution yet. This document always reflects the master `HEAD`. +!!! Tip + Stay tuned for the paper release :) + ## Overview @@ -27,7 +30,7 @@ Our core algorithm, implemented as a separate library called [`lowtime`](https:/ Currently, it's a three-step process: -1. **Profile**: Profile the computation time and energy consumption of the forward and backward instructions in *each stage* and *each GPU frequency*. +1. **Profile**: Profile the computation time and energy consumption of the forward and backward instructions in *each stage* and *each GPU frequency*, as well as the P2P blocking power consumption of the GPU. 2. **Optimize**: Use [`lowtime`](https://github.com/ml-energy/lowtime) to generate all Pareto-optimal frequency plans. 3. **Choose and start training**: Among all the frequency plans generated by `lowtime`, choose the one that suits your use case. @@ -62,6 +65,9 @@ That being said, you can obtain this profiling information in however way you wa But as a reference, we have implemented an automatic profiler in Merak. Please refer to the [examples](https://github.com/ml-energy/merak-zeus/tree/main/examples) directory in Merak for profiling instructions. +Finally, we also need to take into account the power consumption of the GPU while it is blocking on P2P communication, i.e., waiting for either the activation or gradient from its neighbor stage. +You can use [our profiling script](https://github.com/ml-energy/zeus/tree/master/examples/perseus/profile_p2p.py) for that. + !!! Tip As you profile the time and energy consumption of an instruction, you will scan down from the highest to the lowest frequency. However, as you lower the GPU's frequency, both time and energy will start to inflate after some point. 
diff --git a/examples/perseus/README.md b/examples/perseus/README.md
new file mode 100644
index 00000000..9d9dca97
--- /dev/null
+++ b/examples/perseus/README.md
@@ -0,0 +1,6 @@
+# Perseus: Energy Scheduling in Large Model Training
+
+Perseus is an energy optimizer for large model training.
+It gives you the complete training time-energy Pareto frontier, on which you can choose the right tradeoff point for your use case.
+
+Please refer to our [Getting Started guide](https://ml.energy/zeus/perseus).
diff --git a/examples/perseus/profile_p2p.py b/examples/perseus/profile_p2p.py
new file mode 100644
index 00000000..273ab5f7
--- /dev/null
+++ b/examples/perseus/profile_p2p.py
@@ -0,0 +1,89 @@
+"""Profile the power consumption of the GPU while waiting on P2P communication."""
+
+import os
+import time
+import multiprocessing as mp
+
+import torch
+import torch.distributed as dist
+from zeus.monitor import ZeusMonitor
+
+
+def main() -> None:
+    """Launch one worker process per rank (0 and 1) and wait for both to finish."""
+    # Rendezvous address and port picked up by `init_method="env://"` in the workers.
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "29500"
+
+    worker0 = mp.Process(target=worker, args=(0,))
+    worker1 = mp.Process(target=worker, args=(1,))
+
+    worker0.start()
+    worker1.start()
+
+    worker0.join()
+    worker1.join()
+
+
+def worker(rank: int) -> None:
+    """Run one side of the P2P blocking power measurement.
+
+    Rank 0 measures the time and energy of its GPU while it blocks on a receive.
+    Rank 1 sleeps for 60 seconds before sending, which keeps rank 0 waiting.
+
+    Args:
+        rank: Rank of this worker, also used as its CUDA device index. Must be 0 or 1.
+
+    Raises:
+        ValueError: If `rank` is neither 0 nor 1.
+    """
+    if rank not in [0, 1]:
+        raise ValueError(f"Invalid rank: {rank}")
+
+    torch.cuda.set_device(rank)
+    dist.init_process_group(backend="nccl", init_method="env://", world_size=2, rank=rank)
+
+    # Allocate large tensor and run some computation to warm up the GPU.
+    tensor = torch.rand(10000, 10000, device="cuda")
+    tensor = tensor @ tensor @ tensor @ tensor @ tensor @ tensor @ tensor @ tensor @ tensor @ tensor
+
+    if rank == 0:
+        monitor = ZeusMonitor(gpu_indices=[rank])
+
+        # Communication warmup
+        for _ in range(5):
+            dist.recv(tensor, src=1 - rank)
+            dist.send(tensor, dst=1 - rank)
+        torch.cuda.synchronize()
+
+        # Measure while the GPU is blocking on P2P communication.
+        # Rank 1 is just sleeping.
+        monitor.begin_window("p2p")
+        dist.recv(tensor, src=1 - rank)
+        measurement = monitor.end_window("p2p")
+
+        torch.cuda.synchronize()
+
+        print(f"Time (s): {measurement.time}")
+        print(f"Energy (J): {measurement.total_energy}")
+        print(f"Power (W): {measurement.total_energy / measurement.time}")
+
+    else:
+        # Communication warmup
+        for _ in range(5):
+            dist.send(tensor, dst=1 - rank)
+            dist.recv(tensor, src=1 - rank)
+        torch.cuda.synchronize()
+
+        print("Sleeping for 60 seconds")
+        time.sleep(60)
+        dist.send(tensor, dst=1 - rank)
+
+        torch.cuda.synchronize()
+
+    # Tear down the process group explicitly instead of leaving it to interpreter exit.
+    dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()