diff --git a/docs/getting_started.md b/docs/getting_started.md
index 3098ea1..9140435 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -271,36 +271,23 @@ Usually it is slow if you do not have high speed networking like InfiniBand.
 
 ### Launch multiple jobs on a single machine
 
 If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,
-you need to specify different ports (29500 by default) for each job to avoid communication conflict.
+you need to specify different ports (29500 by default) for each job to avoid communication conflicts. Otherwise, there will be an error message saying `RuntimeError: Address already in use`.
 
-If you use `dist_train.sh` to launch training jobs, you can set the port in commands.
+If you use `dist_train.sh` to launch training jobs, you can set the ports in the commands with the environment variable `PORT`.
 
 ```shell
 CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
 CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
 ```
 
-If you use launch training jobs with Slurm, you need to modify the config files (usually the 6th line from the bottom in config files) to set different communication ports.
+If you use `slurm_train.sh` to launch training jobs, you can set the ports in the commands with the environment variable `MASTER_PORT`.
 
-In `config1.py`,
-```python
-dist_params = dict(backend='nccl', port=29500)
-```
-
-In `config2.py`,
-```python
-dist_params = dict(backend='nccl', port=29501)
-```
-
-Then you can launch two jobs with `config1.py` ang `config2.py`.
 ```shell
-CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR}
-CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR}
+MASTER_PORT=29500 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE}
+MASTER_PORT=29501 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE}
 ```
 
-Or you could specify port by `---options dist_params.port=29501`
-
 ## Useful tools
 
 We provide lots of useful tools under `tools/` directory.
diff --git a/docs/tutorials/training_tricks.md b/docs/tutorials/training_tricks.md
index 2a56daf..11b3480 100644
--- a/docs/tutorials/training_tricks.md
+++ b/docs/tutorials/training_tricks.md
@@ -25,7 +25,7 @@ model=dict(
     decode_head=dict(
         sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=100000)) )
 ```
-In this way, only pixels with confidence score under 0.7 are used to train. And we keep at least 100000 pixels during training.
+In this way, only pixels with a confidence score under 0.7 are used for training, and we keep at least 100000 pixels during training. If `thresh` is not specified, the pixels with the top `min_kept` losses will be selected instead.
 
 ## Class Balanced Loss
 
 For dataset that is not balanced in classes distribution, you may change the loss weight of each class.
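For reference (not part of this patch), a minimal config sketch of the new loss-based mode documented above: when `thresh` is omitted, the sampler keeps the `min_kept` pixels with the highest per-pixel loss instead of thresholding on confidence.

```python
# Hypothetical config snippet (not from this patch): OHEM without `thresh`.
# With `thresh` unset, the sampler keeps the `min_kept` highest-loss pixels.
model = dict(
    decode_head=dict(
        sampler=dict(type='OHEMPixelSampler', min_kept=100000)))
```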
diff --git a/mmseg/core/seg/sampler/ohem_pixel_sampler.py b/mmseg/core/seg/sampler/ohem_pixel_sampler.py
index 28c14ab..88bb10d 100644
--- a/mmseg/core/seg/sampler/ohem_pixel_sampler.py
+++ b/mmseg/core/seg/sampler/ohem_pixel_sampler.py
@@ -10,22 +10,25 @@ class OHEMPixelSampler(BasePixelSampler):
     """Online Hard Example Mining Sampler for segmentation.
 
     Args:
-        thresh (float): The threshold for hard example selection. Below
-            which, are prediction with low confidence. Default: 0.7.
-        min_kept (int): The minimum number of predictions to keep.
+        context (nn.Module): The context of sampler, subclass of
+            :obj:`BaseDecodeHead`.
+        thresh (float, optional): The threshold for hard example selection.
+            Predictions with confidence below this threshold are treated as
+            hard examples. If not specified, the hard examples will be the
+            pixels with the top ``min_kept`` losses. Default: None.
+        min_kept (int, optional): The minimum number of predictions to keep.
             Default: 100000.
-        ignore_index (int): The ignore index for training. Default: 255.
     """
 
-    def __init__(self, thresh=0.7, min_kept=100000, ignore_index=255):
+    def __init__(self, context, thresh=None, min_kept=100000):
         super(OHEMPixelSampler, self).__init__()
+        self.context = context
         assert min_kept > 1
         self.thresh = thresh
         self.min_kept = min_kept
-        self.ignore_index = ignore_index
 
     def sample(self, seg_logit, seg_label):
-        """
+        """Sample pixels that have high loss or with low prediction confidence.
 
         Args:
             seg_logit (torch.Tensor): segmentation logits, shape (N, C, H, W)
@@ -33,32 +36,41 @@ def sample(self, seg_logit, seg_label):
 
         Returns:
             torch.Tensor: segmentation weight, shape (N, H, W)
-
         """
         with torch.no_grad():
             assert seg_logit.shape[2:] == seg_label.shape[2:]
             assert seg_label.shape[1] == 1
             seg_label = seg_label.squeeze(1).long()
             batch_kept = self.min_kept * seg_label.size(0)
-            seg_prob = F.softmax(seg_logit, dim=1)
-            mask = seg_label.contiguous().view(-1, ) != self.ignore_index
+            valid_mask = seg_label != self.context.ignore_index
+            seg_weight = seg_logit.new_zeros(size=seg_label.size())
+            valid_seg_weight = seg_weight[valid_mask]
+            if self.thresh is not None:
+                seg_prob = F.softmax(seg_logit, dim=1)
 
-            tmp_seg_label = seg_label.clone()
-            tmp_seg_label[tmp_seg_label == self.ignore_index] = 0
-            seg_prob = seg_prob.gather(1, tmp_seg_label.unsqueeze(1))
-            sort_prob, sort_indices = seg_prob.contiguous().view(
-                -1, )[mask].contiguous().sort()
+                tmp_seg_label = seg_label.clone().unsqueeze(1)
+                tmp_seg_label[tmp_seg_label == self.context.ignore_index] = 0
+                seg_prob = seg_prob.gather(1, tmp_seg_label).squeeze(1)
+                sort_prob, sort_indices = seg_prob[valid_mask].sort()
 
-            if sort_prob.numel() > 0:
-                min_threshold = sort_prob[min(batch_kept,
-                                              sort_prob.numel() - 1)]
+                if sort_prob.numel() > 0:
+                    min_threshold = sort_prob[min(batch_kept,
+                                                  sort_prob.numel() - 1)]
+                else:
+                    min_threshold = 0.0
+                threshold = max(min_threshold, self.thresh)
+                valid_seg_weight[seg_prob[valid_mask] < threshold] = 1.
             else:
-                min_threshold = 0.0
-            threshold = max(min_threshold, self.thresh)
+                losses = self.context.loss_decode(
+                    seg_logit,
+                    seg_label,
+                    weight=None,
+                    ignore_index=self.context.ignore_index,
+                    reduction_override='none')
+                # faster than topk according to https://github.com/pytorch/pytorch/issues/22812  # noqa
+                _, sort_indices = losses[valid_mask].sort(descending=True)
+                valid_seg_weight[sort_indices[:batch_kept]] = 1.
 
-            seg_weight = seg_logit.new_ones(size=seg_label.size())
-            seg_weight = seg_weight.view(-1)
-            seg_weight[mask][sort_prob < threshold] = 0.
-            seg_weight = seg_weight.view_as(seg_label)
+            seg_weight[valid_mask] = valid_seg_weight
 
             return seg_weight
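As a sanity check on the new `thresh=None` branch above, here is a self-contained PyTorch sketch of loss-based pixel selection (plain tensors, no mmseg imports; `F.cross_entropy` stands in for `context.loss_decode`, and `ignore_index` masking is omitted for brevity):

```python
import torch
import torch.nn.functional as F

# Sketch of the thresh=None branch: keep the min_kept pixels with the
# highest per-pixel loss and zero-weight the rest.
N, C, H, W = 1, 19, 45, 45
min_kept = 200
seg_logit = torch.randn(N, C, H, W)
seg_label = torch.randint(0, C, size=(N, H, W))

# Per-pixel loss, analogous to loss_decode(..., reduction_override='none').
losses = F.cross_entropy(seg_logit, seg_label, reduction='none')  # (N, H, W)
seg_weight = torch.zeros_like(losses)
flat_weight = seg_weight.view(-1)  # shares storage with seg_weight
# sort is reportedly faster than topk here (pytorch/pytorch#22812)
_, sort_indices = losses.view(-1).sort(descending=True)
flat_weight[sort_indices[:min_kept * N]] = 1.
assert seg_weight.sum() == min_kept * N  # exactly batch_kept pixels kept
```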
diff --git a/mmseg/models/decode_heads/decode_head.py b/mmseg/models/decode_heads/decode_head.py
index 9f55fee..0f58c80 100644
--- a/mmseg/models/decode_heads/decode_head.py
+++ b/mmseg/models/decode_heads/decode_head.py
@@ -73,7 +73,7 @@ def __init__(self,
         self.ignore_index = ignore_index
         self.align_corners = align_corners
         if sampler is not None:
-            self.sampler = build_pixel_sampler(sampler)
+            self.sampler = build_pixel_sampler(sampler, context=self)
         else:
             self.sampler = None
diff --git a/tests/test_sampler.py b/tests/test_sampler.py
index af26b8d..3c79c16 100644
--- a/tests/test_sampler.py
+++ b/tests/test_sampler.py
@@ -2,20 +2,37 @@ import torch
 
 from mmseg.core import OHEMPixelSampler
+from mmseg.models.decode_heads import FCNHead
+
+
+def _context_for_ohem():
+    return FCNHead(in_channels=32, channels=16, num_classes=19)
 
 
 def test_ohem_sampler():
     with pytest.raises(AssertionError):
         # seg_logit and seg_label must be of the same size
-        sampler = OHEMPixelSampler()
+        sampler = OHEMPixelSampler(context=_context_for_ohem())
         seg_logit = torch.randn(1, 19, 45, 45)
         seg_label = torch.randint(0, 19, size=(1, 1, 89, 89))
         sampler.sample(seg_logit, seg_label)
 
-    sampler = OHEMPixelSampler()
+    # test with thresh
+    sampler = OHEMPixelSampler(
+        context=_context_for_ohem(), thresh=0.7, min_kept=200)
+    seg_logit = torch.randn(1, 19, 45, 45)
+    seg_label = torch.randint(0, 19, size=(1, 1, 45, 45))
+    seg_weight = sampler.sample(seg_logit, seg_label)
+    assert seg_weight.shape[0] == seg_logit.shape[0]
+    assert seg_weight.shape[1:] == seg_logit.shape[2:]
+    assert seg_weight.sum() > 200
+
+    # test without thresh
+    sampler = OHEMPixelSampler(context=_context_for_ohem(), min_kept=200)
     seg_logit = torch.randn(1, 19, 45, 45)
     seg_label = torch.randint(0, 19, size=(1, 1, 45, 45))
     seg_weight = sampler.sample(seg_logit, seg_label)
     assert seg_weight.shape[0] == seg_logit.shape[0]
     assert seg_weight.shape[1:] == seg_logit.shape[2:]
+    assert seg_weight.sum() == 200
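End to end, the `context=self` wiring above means a decode head config is all a user touches. A short sketch mirroring the updated test (the 19-class `FCNHead` and tensor shapes are taken from it; the `sampler` keyword is accepted via `BaseDecodeHead`, as shown in the `decode_head.py` hunk):

```python
import torch
from mmseg.models.decode_heads import FCNHead

# The head builds its sampler with context=self, so the sampler can read
# head.ignore_index and call head.loss_decode for loss-based mining.
head = FCNHead(
    in_channels=32,
    channels=16,
    num_classes=19,
    sampler=dict(type='OHEMPixelSampler', min_kept=200))  # thresh omitted

seg_logit = torch.randn(1, 19, 45, 45)
seg_label = torch.randint(0, 19, size=(1, 1, 45, 45))
seg_weight = head.sampler.sample(seg_logit, seg_label)
assert seg_weight.sum() == 200  # exactly min_kept pixels are weighted
```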