You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
你好,非常想知道单卡单机训练时该怎么修改代码,自己尝试直接单卡运行,在此处报错
Traceback (most recent call last):
File "train.py", line 429, in <module>
train()
File "train.py", line 292, in train
out, out16, out32, detail8 = net(im)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 885, in forward
inputs, kwargs = self.to_kwargs(inputs, kwargs, self.device_ids[0])
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 992, in to_kwargs
inputs = self._recursive_to(inputs, device_id) if inputs else []
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 986, in _recursive_to
res = to_map(inputs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 977, in to_map
return list(zip(*map(to_map, obj)))
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 955, in to_map
if obj.device == torch.device("cuda", target_gpu):
RuntimeError: Device index must not be negative
将local_rank默认值改为0后又有如下报错:
Traceback (most recent call last):
File "train.py", line 429, in <module>
train()
File "train.py", line 292, in train
out, out16, out32, detail8 = net(im)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/STDC-Seg/models/model_stages.py", line 272, in forward
feat_res2, feat_res4, feat_res8, feat_res16, feat_cp8, feat_cp16 = self.cp(x)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/STDC-Seg/models/model_stages.py", line 141, in forward
avg = self.conv_avg(avg)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/STDC-Seg/models/model_stages.py", line 31, in forward
x = self.bn(x)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/STDC-Seg/modules/bn.py", line 118, in forward
return inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
RuntimeError: Some elements marked as dirty during the forward method were not returned as output. The inputs that are modified inplace must all be outputs of the Function.
RuntimeError: Some elements marked as dirty during the forward method were not returned as output. The inputs that are modified inplace must all be outputs of the Function. @yang-stephen @Lee6384
你好,非常想知道单卡单机训练时该怎么修改代码,自己尝试直接单卡运行,在此处报错
Traceback (most recent call last):
File "train.py", line 429, in <module>
train()
File "train.py", line 292, in train
out, out16, out32, detail8 = net(im)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 885, in forward
inputs, kwargs = self.to_kwargs(inputs, kwargs, self.device_ids[0])
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 992, in to_kwargs
inputs = self._recursive_to(inputs, device_id) if inputs else []
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 986, in _recursive_to
res = to_map(inputs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 977, in to_map
return list(zip(*map(to_map, obj)))
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 955, in to_map
if obj.device == torch.device("cuda", target_gpu):
RuntimeError: Device index must not be negative
将local_rank默认值改为0后又有如下报错:
Traceback (most recent call last):
File "train.py", line 429, in <module>
train()
File "train.py", line 292, in train
out, out16, out32, detail8 = net(im)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/STDC-Seg/models/model_stages.py", line 272, in forward
feat_res2, feat_res4, feat_res8, feat_res16, feat_cp8, feat_cp16 = self.cp(x)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/STDC-Seg/models/model_stages.py", line 141, in forward
avg = self.conv_avg(avg)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/STDC-Seg/models/model_stages.py", line 31, in forward
x = self.bn(x)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/STDC-Seg/modules/bn.py", line 118, in forward
return inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
RuntimeError: Some elements marked as dirty during the forward method were not returned as output. The inputs that are modified inplace must all be outputs of the Function.
不知道该怎么解决,非常想知道解决方法;或者希望能指点一下,要去掉分布式训练该怎么修改 train.py,万分感谢!
The text was updated successfully, but these errors were encountered: