From a8ddd2210c2898bd2eb3dd698bfbda1bc2ad6351 Mon Sep 17 00:00:00 2001
From: coincheung <867153576@qq.com>
Date: Fri, 10 Jul 2020 03:40:30 +0000
Subject: [PATCH] more tuning on bisenetv2

---
 README.md                   | 20 ++++++++++++++++++--
 bisenetv2/cityscapes_cv2.py |  5 +++--
 bisenetv2/evaluatev2.py     |  2 +-
 train.py                    |  1 +
 4 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index ecce803..2e92dcb 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,25 @@ BiSeNetV2 is faster and requires less memory, you can try BiSeNetV2 on cityscape
 $ export CUDA_VISIBLE_DEVICES=0,1
 $ python -m torch.distributed.launch --nproc_per_node=2 bisenetv2/train.py --fp16
 ```
-This would train the model and then compute the mIOU on eval set.
+This would train the model and then compute the mIOU on the eval set.
+
+~~I barely achieve mIOU of around 71. Though I can boost the performace by adding more regularizations and pretraining, as this would be beyond the scope of the paper, let's wait for the official implementation and see how they achieved that mIOU of 73.~~
+
+Here are the tips on how I achieved 74.39 mIOU:
+1. larger training scale range: In the paper, the images are first rescaled by a factor in the range (0.75, 2), then 1024x2048 patches are cropped and resized to 512x1024, which is equivalent to first rescaling by a factor in (0.375, 1) and then cropping 512x1024 patches. In my implementation, I first rescale the image by a factor in the range (0.25, 2), and then directly crop 512x1024 patches for training.
+
+2. original inference scale: In the paper, the image is first rescaled to 512x1024 for inference, and the prediction is then rescaled back to the original size of 1024x2048. In my implementation, I run inference directly at the original size of 1024x2048.
+
+3. colorjitter as an extra augmentation.
+
+Note that, like bisenetv1, bisenetv2 also has a relatively large variance across runs. Here are the mIOU scores of 5 training runs on my platform:
+
+| No. | 1 | 2 | 3 | 4 | 5 |
+|:---|:---|:---|:---|:---|:---|
+| mIOU | 74.28 | 72.96 | 73.73 | 74.39 | 73.77 |
+
+You can download the pretrained model with mIOU of 74.39 from this [link](https://drive.google.com/file/d/1r_F-KZg-3s2pPcHRIuHZhZ0DQ0wocudk/view?usp=sharing).
 
-I barely achieve mIOU of around 71. Though I can boost the performace by adding more regularizations and pretraining, as this would be beyond the scope of the paper, let's wait for the official implementation and see how they achieved that mIOU of 73.
 
 
 # BiSeNet
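Tips 1 and 3 in the README hunk above amount to a rescale-then-crop augmentation plus colorjitter. Below is a minimal standalone sketch of that recipe in plain OpenCV, not the repo's own `T.RandomResizedCrop` (the actual change is in the `cityscapes_cv2.py` hunk below); the helper name, the zero image padding, and the ignore label of 255 are assumptions for illustration.

```python
import random

import cv2


def rand_scale_and_crop(im, lb, scales=(0.25, 2.), cropsize=(512, 1024)):
    # rescale image and label by a random factor drawn from `scales`
    scale = random.uniform(*scales)
    im = cv2.resize(im, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
    lb = cv2.resize(lb, None, fx=scale, fy=scale, interpolation=cv2.INTER_NEAREST)

    crop_h, crop_w = cropsize
    # pad if the rescaled image is smaller than the crop
    # (0 for the image, 255 as an assumed ignore label for the annotation)
    pad_h = max(crop_h - im.shape[0], 0)
    pad_w = max(crop_w - im.shape[1], 0)
    if pad_h > 0 or pad_w > 0:
        im = cv2.copyMakeBorder(im, 0, pad_h, 0, pad_w, cv2.BORDER_CONSTANT, value=0)
        lb = cv2.copyMakeBorder(lb, 0, pad_h, 0, pad_w, cv2.BORDER_CONSTANT, value=255)

    # crop a random 512x1024 patch for training
    h, w = im.shape[:2]
    y = random.randint(0, h - crop_h)
    x = random.randint(0, w - crop_w)
    return im[y:y + crop_h, x:x + crop_w], lb[y:y + crop_h, x:x + crop_w]
```

Colorjitter (tip 3) would then be applied on top of the cropped patch, as in the training transform changed below.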
diff --git a/bisenetv2/cityscapes_cv2.py b/bisenetv2/cityscapes_cv2.py
index 04e00de..a9cb3ba 100644
--- a/bisenetv2/cityscapes_cv2.py
+++ b/bisenetv2/cityscapes_cv2.py
@@ -127,7 +127,8 @@ class TransformationTrain(object):
 
     def __init__(self):
        self.trans_func = T.Compose([
-            T.RandomResizedCrop([0.375, 1.], [512, 1024]),
+            # T.RandomResizedCrop([0.375, 1.], [512, 1024]),
+            T.RandomResizedCrop([0.25, 2], [512, 1024]),
             T.RandomHorizontalFlip(),
             T.ColorJitter(
                 brightness=0.4,
@@ -145,7 +146,7 @@ class TransformationVal(object):
 
     def __call__(self, im_lb):
         im, lb = im_lb['im'], im_lb['lb']
-        im = cv2.resize(im, (1024, 512))
+        # im = cv2.resize(im, (1024, 512))
         return dict(im=im, lb=lb)
 
 
diff --git a/bisenetv2/evaluatev2.py b/bisenetv2/evaluatev2.py
index 2f330dd..2d8f875 100644
--- a/bisenetv2/evaluatev2.py
+++ b/bisenetv2/evaluatev2.py
@@ -91,7 +91,7 @@ def evaluate(weight_pth):
     )
 
     ## evaluator
-    eval_model(net, 4)
+    eval_model(net, 2)
 
 
 def parse_args():
diff --git a/train.py b/train.py
index 3798eb6..9f13983 100644
--- a/train.py
+++ b/train.py
@@ -61,6 +61,7 @@ def train():
     n_img_per_gpu = 8
     n_workers = 4
     cropsize = [1024, 1024]
+    # cropsize = [1024, 512]
     ds = CityScapes('./data', cropsize=cropsize, mode='train')
     sampler = torch.utils.data.distributed.DistributedSampler(ds)
     dl = DataLoader(ds,
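Tip 2 (inference at the original scale) is what the `TransformationVal` and `evaluatev2.py` hunks implement: the 512x1024 resize is commented out, and the second argument to `eval_model` is reduced from 4 to 2, presumably so that full-resolution batches still fit in GPU memory. A minimal sketch of full-resolution inference, assuming the network returns its main logits as the first element of a tuple (the exact return signature of the repo's model is an assumption here):

```python
import torch
import torch.nn.functional as F


@torch.no_grad()
def predict_full_res(net, im):
    # im: normalized float tensor of shape (N, 3, 1024, 2048)
    net.eval()
    out = net(im)
    # assumption: the model may return (main_logits, *aux_logits); take the main head
    logits = out[0] if isinstance(out, (tuple, list)) else out
    # upsample in case the logits are not already at input resolution
    if logits.shape[2:] != im.shape[2:]:
        logits = F.interpolate(logits, size=im.shape[2:],
                               mode='bilinear', align_corners=False)
    return logits.argmax(dim=1)  # (N, 1024, 2048) predicted class map
```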