From 191a459cfed8a639de5091c2ac81c1f45f88c6ce Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Mon, 27 Mar 2023 08:54:53 -0700 Subject: [PATCH] Allow ports to be reused in gloo Summary: ProcessGroupGloo and gloo seem to be opening and closing sockets without allowing the port to be reused. We see this issue pop up in larger training jobs as "Address already in use" errors, and we assume the cause is that all the ephemeral ports are exhausted. This diff allows ports to be reused; with it, we see a reduced number of ports in the `TIME_WAIT` state. context: https://fb.workplace.com/groups/319878845696681/permalink/5988899781205532/ another issue: https://fb.workplace.com/groups/319878845696681/permalink/958768178474408/ Differential Revision: D44029927 fbshipit-source-id: 45e7305df8c5fae764a5d93478ac007f604be7dd --- gloo/transport/tcp/device.cc | 10 ++++++++++ gloo/transport/tcp/pair.cc | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/gloo/transport/tcp/device.cc b/gloo/transport/tcp/device.cc index 481d67f7d..064ef2851 100644 --- a/gloo/transport/tcp/device.cc +++ b/gloo/transport/tcp/device.cc @@ -101,6 +101,16 @@ static void lookupAddrForHostname(struct attr& attr) { struct addrinfo* rp; for (rp = result; rp != nullptr; rp = rp->ai_next) { auto fd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); + + // Set SO_REUSEADDR to signal that reuse of the listening port is OK. + printf("in tcp/device.cc"); + int on = 1; + rv = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + if (rv == -1) { + close(fd); + GLOO_ENFORCE_NE(rv, -1); + } + if (fd == -1) { continue; } diff --git a/gloo/transport/tcp/pair.cc b/gloo/transport/tcp/pair.cc index 389d9f2a2..39d01d5c4 100644 --- a/gloo/transport/tcp/pair.cc +++ b/gloo/transport/tcp/pair.cc @@ -162,9 +162,9 @@ void Pair::listen() { signalAndThrowException(GLOO_ERROR_MSG("socket: ", strerror(errno))); } - // Set SO_REUSEADDR to signal that reuse of the listening port is OK. 
+ // Set SO_REUSEPORT to signal that reuse of the listening port is OK. int on = 1; - rv = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + rv = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)); if (rv == -1) { ::close(fd); signalAndThrowException(GLOO_ERROR_MSG("setsockopt: ", strerror(errno)));