Description
I want to parallelize tuning on a SLURM cluster using Dask. Specifically, I use the Hydra SMAC sweeper. The problem is that the workers never connect to the Dask scheduler and simply time out.
Steps/Code to Reproduce
I got the error both with my own code and with the multifidelity MLP example here: https://github.com/automl/hydra-smac-sweeper/blob/main/examples/multifidelity_mlp.py
Expected Results
Ideally, the Dask cluster would distribute each function evaluation run to a newly submitted SLURM job, which would in turn report its result back to the scheduler.
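For context, this is roughly the setup I have in mind: a minimal sketch using dask_jobqueue directly, with placeholder queue/memory/walltime values (the sweeper presumably wires this up internally):

```python
from dask_jobqueue import SLURMCluster
from distributed import Client

# The scheduler runs in the main (sweeper) process; each worker is a separate
# SLURM job submitted via sbatch that then dials back in to the scheduler.
cluster = SLURMCluster(
    queue="normal",        # placeholder partition name
    cores=1,
    memory="4GB",
    walltime="01:00:00",
)
cluster.scale(jobs=4)      # request four worker jobs
client = Client(cluster)   # trial evaluations would then be submitted to these workers
```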
Actual Results
I get this error:
[WARNING][dask_runner.py:135] No workers are available. This could mean workers crashed. Waiting for new workers...
Traceback (most recent call last):
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/hydra/_internal/utils.py", line 219, in run_and_report
return func()
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/hydra/_internal/utils.py", line 466, in
lambda: hydra.multirun(
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 162, in multirun
ret = sweeper.sweep(arguments=task_overrides)
File "/bigwork/nhwpeimt/hydra-smac-sweeper/hydra_plugins/hydra_smac_sweeper/smac_sweeper.py", line 54, in sweep
return self.sweeper.sweep(arguments)
File "/bigwork/nhwpeimt/hydra-smac-sweeper/hydra_plugins/hydra_smac_sweeper/smac_sweeper_backend.py", line 296, in sweep
incumbent = smac.optimize()
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/smac/facade/abstract_facade.py", line 317, in optimize
incumbents = self._optimizer.optimize(data_to_scatter=data_to_scatter)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/smac/main/smbo.py", line 304, in optimize
self._runner.submit_trial(trial_info=trial_info, **dask_data_to_scatter)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/smac/runner/dask_runner.py", line 138, in submit_trial
raise RuntimeError(
RuntimeError: Tried to execute a job, but no worker was ever available. This likely means that a worker crashed or no workers were properly configured.
Dask suggests printing the job script and running it by itself. The script looks correct to me.
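For reference, this is how the generated job script can be printed (a sketch, assuming a dask_jobqueue-style SLURMCluster like the one above):

```python
# Print the sbatch script that dask_jobqueue submits for each worker,
# so it can be inspected and run manually for debugging.
print(cluster.job_script())
```

Running the printed script by itself, however, gives this error: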
Traceback (most recent call last):
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/comm/core.py", line 289, in connect
comm = await asyncio.wait_for(
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/asyncio/tasks.py", line 479, in wait_for
return fut.result()
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/comm/tcp.py", line 444, in connect
convert_stream_closed_error(self, e)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/comm/tcp.py", line 133, in convert_stream_closed_error
raise CommClosedError(f"in {obj}: {exc.class.name}: {exc}") from exc
distributed.comm.core.CommClosedError: in <distributed.comm.tcp.TCPConnector object at 0x2ac245b74ca0>: ConnectionRefusedError: [Errno 111] Connection refused
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/core.py", line 287, in _
await asyncio.wait_for(self.start(), timeout=timeout)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/asyncio/tasks.py", line 479, in wait_for
return fut.result()
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/nanny.py", line 329, in start
msg = await self.scheduler.register_nanny()
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/core.py", line 919, in send_recv_from_rpc
comm = await self.pool.connect(self.addr)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/core.py", line 1089, in connect
comm = await fut
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/comm/core.py", line 315, in connect
raise OSError(
OSError: Timed out trying to connect to tcp://130.75.7.144:35161 after 30 s
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/cli/dask_worker.py", line 495, in
go() # pragma: no cover
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/cli/dask_worker.py", line 491, in go
main()
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/click/core.py", line 1128, in call
return self.main(*args, **kwargs)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/click/core.py", line 1053, in main
rv = self.invoke(ctx)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/click/core.py", line 1395, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/click/core.py", line 754, in invoke
return __callback(*args, **kwargs)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/cli/dask_worker.py", line 477, in main
loop.run_sync(run)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/tornado/ioloop.py", line 530, in run_sync
return future_cell[0].result()
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/cli/dask_worker.py", line 471, in run
await asyncio.gather(*nannies)
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/asyncio/tasks.py", line 688, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/bigwork/nhwpeimt/miniconda3/envs/autorl/lib/python3.9/site-packages/distributed/core.py", line 291, in _
raise TimeoutError(
asyncio.exceptions.TimeoutError: Nanny failed to start in 60 seconds
Since this cluster blocks internet access, I can imagine it also restricts connections between nodes in some way. I couldn't find much information on this in the Dask documentation and haven't been able to solve it so far. Maybe this is something you know about and would want to include in the documentation?
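If the compute nodes can only reach each other over an internal interconnect, one thing that might help (an assumption on my part; the interface name differs per cluster, e.g. ib0 or eth1) is pinning both the workers and the scheduler to that interface:

```python
from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(
    cores=1,
    memory="4GB",
    interface="ib0",                         # network interface the workers bind to (placeholder name)
    scheduler_options={"interface": "ib0"},  # bind the scheduler to the same internal interface
)
```

That way the scheduler address the workers try to reach (tcp://130.75.7.144:35161 above) would be one the compute nodes can actually connect to.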
Versions
'2.0.1'