diff --git a/test/distributed/test_multi_threaded_pg.py b/test/distributed/test_multi_threaded_pg.py
index 36334b0d0ffc95..3f73c64b6b01c8 100644
--- a/test/distributed/test_multi_threaded_pg.py
+++ b/test/distributed/test_multi_threaded_pg.py
@@ -220,5 +220,34 @@ def test_gather(self):
             for i in range(self.world_size):
                 self.assertEqual(gather_list[i], torch.ones(3, 3) * i)
 
+
+class TestLargeWorld(MultiThreadedTestCase):
+    """Create, destroy and re-create gloo groups in a 64-rank world."""
+
+    @property
+    def world_size(self):
+        return 64
+
+    def setUp(self):
+        super().setUp()
+        self._spawn_threads()
+
+    def test_gloo_init(self):
+        num_groups = 4
+
+        # create multiple gloo groups with 64 ranks
+        groups = [dist.new_group(backend="gloo") for _ in range(num_groups)]
+
+        # tear down gloo groups
+        for group in groups:
+            dist.destroy_process_group(group)
+        groups.clear()
+        self.assertEqual(len(groups), 0)
+
+        # create multiple gloo groups with 64 ranks again, after the first
+        # batch has been destroyed
+        groups.extend(dist.new_group(backend="gloo") for _ in range(num_groups))
+
+
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
index 9ebb35afccc5bd..a6caf6167d1f30 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
@@ -638,6 +638,28 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) {
   struct addrinfo* rp = nullptr;
   for (rp = result; rp != nullptr; rp = rp->ai_next) {
     auto fd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
     if (fd == -1) {
       continue;
+    }
+
+    // Set SO_REUSEADDR to signal that reuse of the listening port is OK.
+    // This must happen after the fd == -1 check above: calling setsockopt on
+    // an invalid descriptor would fail and throw instead of trying the next
+    // address.
+    int on = 1;
+    rv = setsockopt(
+        fd,
+        SOL_SOCKET,
+        SO_REUSEADDR,
+        reinterpret_cast<const char*>(&on),
+        sizeof(on));
+    if (rv == -1) {
+      // Save errno first; closing the socket may overwrite it.
+      const int err = errno;
+#ifdef _WIN32
+      closesocket(fd);
+#else
+      close(fd);
+#endif
+      logAndThrow("setsockopt: ", strerror(err));
     }