From 2eb94e65a65c6616f3984b02dffa2e56b64f21df Mon Sep 17 00:00:00 2001
From: "Alexander (Max) deGroot"
Date: Fri, 30 Mar 2018 00:03:26 -0700
Subject: [PATCH 1/2] Update README.md (#136)

---
 README.md | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index da7a5f7ae..483f3c6f0 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,5 @@
 # SSD: Single Shot MultiBox Object Detector, in PyTorch
-A [PyTorch](http://pytorch.org/) implementation of [Single Shot MultiBox Detector](http://arxiv.org/abs/1512.02325) from the 2016 paper by Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang, and Alexander C. Berg. The official and original Caffe code can be found [here](https://github.com/weiliu89/caffe/tree/ssd).
-
-***UPDATE:*** We have just added support for MS COCO! Check it out [below](#coco).
-
-## Authors
-
-* [**Max deGroot**](https://github.com/amdegroot)
-* [**Ellis Brown**](http://github.com/ellisbrown)
-
-***Note:*** Unfortunately, this is just a hobby for us and not a full-time job, so we'll do our best to keep things up to date, but no guarantees. That being said, thanks to everyone for your continued help and feedback as it is really appreciated. We will try to address everything as soon as possible.
+A [PyTorch](http://pytorch.org/) implementation of [Single Shot MultiBox Detector](http://arxiv.org/abs/1512.02325) from the 2016 paper by Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, and Alexander C. Berg. The official and original Caffe code can be found [here](https://github.com/weiliu89/caffe/tree/ssd).
@@ -167,6 +158,13 @@ We have accumulated the following to-do list, which we hope to complete in the n
 * [ ] Support for SSD512 training and testing
 * [ ] Support for training on custom datasets
 
+## Authors
+
+* [**Max deGroot**](https://github.com/amdegroot)
+* [**Ellis Brown**](http://github.com/ellisbrown)
+
+***Note:*** Unfortunately, this is just a hobby of ours and not a full-time job, so we'll do our best to keep things up to date, but no guarantees. That being said, thanks to everyone for your continued help and feedback as it is really appreciated. We will try to address everything as soon as possible.
+
 ## References
 - Wei Liu, et al. "SSD: Single Shot MultiBox Detector." [ECCV2016](http://arxiv.org/abs/1512.02325).
 - [Original Implementation (CAFFE)](https://github.com/weiliu89/caffe/tree/ssd)

From 5c55b0310023847a812123e5dba8e62f75c7537b Mon Sep 17 00:00:00 2001
From: Max deGroot
Date: Fri, 30 Mar 2018 00:40:31 -0700
Subject: [PATCH 2/2] Sync develop with master

---
 README.md                     |  26 ++---
 data/__init__.py              |   3 +-
 data/coco.py                  |  24 ++---
 data/coco_labels.txt          |  80 ++++++++++++++
 data/config.py                |   6 ++
 data/voc0712.py               |  19 ++--
 demo/demo.ipynb               |  24 +++--
 demo/live.py                  |  14 +--
 eval.py                       |  54 ++++++----
 layers/functions/prior_box.py |   1 -
 ssd.py                        |  18 ++--
 train.py                      | 196 ++++++++++++++++++++--------------
 12 files changed, 302 insertions(+), 163 deletions(-)
 create mode 100644 data/coco_labels.txt

diff --git a/README.md b/README.md
index 483f3c6f0..0256fac99 100644
--- a/README.md
+++ b/README.md
@@ -24,10 +24,10 @@ A [PyTorch](http://pytorch.org/) implementation of [Single Shot MultiBox Detecto
 - Clone this repository.
   * Note: We currently only support Python 3+.
 - Then download the dataset by following the [instructions](#datasets) below.
-- We now support [Visdom](https://github.com/facebookresearch/visdom) for real-time loss visualization during training! 
-  * To use Visdom in the browser: 
+- We now support [Visdom](https://github.com/facebookresearch/visdom) for real-time loss visualization during training!
+  * To use Visdom in the browser:
 ```Shell
-  # First install Python server and client 
+  # First install Python server and client
   pip install visdom
   # Start the server (probably in a screen or tmux)
   python -m visdom.server
@@ -40,7 +40,7 @@ To make things easy, we provide bash scripts to handle the dataset downloads and
 
 ### COCO
-Microsoft COCO: Common Objects in Context 
+Microsoft COCO: Common Objects in Context
 
 ##### Download COCO 2014
 ```Shell
@@ -83,7 +83,7 @@ python train.py
 * For training, an NVIDIA GPU is strongly recommended for speed.
 * For instructions on Visdom usage/installation, see the Installation section.
 * You can pick up training from a checkpoint by specifying the path as one of the training parameters (again, see `train.py` for options)
-  
+
 ## Evaluation
 To evaluate a trained network:
@@ -107,7 +107,7 @@ You can specify the parameters listed in the `eval.py` file by flagging them or
 | 77.2 % | 77.26 % | 58.12 % | 77.43 % |
 
 ##### FPS
-**GTX 1060:** ~45.45 FPS 
+**GTX 1060:** ~45.45 FPS
 
 ## Demos
 
 ### Use a pre-trained SSD network for detection
 
 #### Download a pre-trained network
 - We are trying to provide PyTorch `state_dicts` (dict of weight tensors) of the latest SSD model definitions trained on different datasets.
-- Currently, we provide the following PyTorch models: 
+- Currently, we provide the following PyTorch models:
     * SSD300 trained on VOC0712 (newest PyTorch weights)
      - https://s3.amazonaws.com/amdegroot-models/ssd300_mAP_77.43_v2.pth
     * SSD300 trained on VOC0712 (original Caffe weights)
      - https://s3.amazonaws.com/amdegroot-models/ssd_300_VOC0712.pth
-- Our goal is to reproduce this table from the [original paper](http://arxiv.org/abs/1512.02325)
+- Our goal is to reproduce this table from the [original paper](http://arxiv.org/abs/1512.02325)

[figure: SSD results on multiple datasets]

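For reference, loading one of the `state_dicts` listed above looks like the following — a minimal sketch that mirrors the loading pattern `eval.py` uses later in this patch (the weights file name is the VOC0712 download above):

```Python
import torch
from ssd import build_ssd

num_classes = 21  # 20 VOC classes + 1 background
net = build_ssd('test', 300, num_classes)  # same call eval.py makes
net.load_state_dict(torch.load('weights/ssd300_mAP_77.43_v2.pth'))
net.eval()
```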
### Try the demo notebook
- Make sure you have [jupyter notebook](http://jupyter.readthedocs.io/en/latest/install.html) installed.
- Two alternatives for installing jupyter notebook:
-    1. If you installed PyTorch with [conda](https://www.continuum.io/downloads) (recommended), then you should already have it. (Just navigate to the ssd.pytorch cloned repo and run): 
-    `jupyter notebook` 
+    1. If you installed PyTorch with [conda](https://www.continuum.io/downloads) (recommended), then you should already have it. (Just navigate to the ssd.pytorch cloned repo and run):
+    `jupyter notebook`
     2. If using [pip](https://pypi.python.org/pypi/pip):
-      
+
 ```Shell
 # make sure pip is upgraded
 pip3 install --upgrade pip
@@ -169,5 +169,5 @@ We have accumulated the following to-do list, which we hope to complete in the n
 - Wei Liu, et al. "SSD: Single Shot MultiBox Detector." [ECCV2016](http://arxiv.org/abs/1512.02325).
 - [Original Implementation (CAFFE)](https://github.com/weiliu89/caffe/tree/ssd)
 - A huge thank you to [Alex Koltun](https://github.com/alexkoltun) and his team at [Webyclip](http://webyclip.com) for their help in finishing the data augmentation portion.
-- A list of other great SSD ports that were sources of inspiration (especially the Chainer repo): 
-  * [Chainer](https://github.com/Hakuyume/chainer-ssd), [Keras](https://github.com/rykov8/ssd_keras), [MXNet](https://github.com/zhreshold/mxnet-ssd), [Tensorflow](https://github.com/balancap/SSD-Tensorflow) 
+- A list of other great SSD ports that were sources of inspiration (especially the Chainer repo):
+  * [Chainer](https://github.com/Hakuyume/chainer-ssd), [Keras](https://github.com/rykov8/ssd_keras), [MXNet](https://github.com/zhreshold/mxnet-ssd), [Tensorflow](https://github.com/balancap/SSD-Tensorflow)
diff --git a/data/__init__.py b/data/__init__.py
index 88111e3d4..a67c1cf9f 100644
--- a/data/__init__.py
+++ b/data/__init__.py
@@ -1,5 +1,6 @@
 from .voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES, VOC_ROOT
-from .coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT
+
+from .coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT, get_label_map
 from .config import *
 import torch
 import cv2
diff --git a/data/coco.py b/data/coco.py
index 7523fe156..765531761 100644
--- a/data/coco.py
+++ b/data/coco.py
@@ -30,6 +30,15 @@
                 'teddy bear', 'hair drier', 'toothbrush')
 
 
+def get_label_map(label_file):
+    label_map = {}
+    labels = open(label_file, 'r')
+    for line in labels:
+        ids = line.split(',')
+        label_map[int(ids[0])] = int(ids[1])
+    return label_map
+
+
 class COCOAnnotationTransform(object):
     """Transforms a COCO annotation into a Tensor of bbox coords and label index
     Initialized with a dictionary lookup of classnames to indexes
@@ -74,8 +83,8 @@ class COCODetection(data.Dataset):
         in the target (bbox) and transforms it.
""" - def __init__(self, root, image_set, transform=None, - target_transform=None): + def __init__(self, root, image_set='trainval35k', transform=None, + target_transform=COCOAnnotationTransform(), dataset_name='MS COCO'): sys.path.append(osp.join(root, COCO_API)) from pycocotools.coco import COCO self.root = osp.join(root, IMAGES, image_set) @@ -84,7 +93,7 @@ def __init__(self, root, image_set, transform=None, self.ids = list(self.coco.imgToAnns.keys()) self.transform = transform self.target_transform = target_transform - self.name = 'MS COCO ' + image_set + self.name = dataset_name def __getitem__(self, index): """ @@ -169,12 +178,3 @@ def __repr__(self): tmp = ' Target Transforms (if any): ' fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) return fmt_str - - -def get_label_map(label_file): - label_map = {} - labels = open(label_file, 'r') - for line in labels: - ids = line.split(',') - label_map[int(ids[0])] = int(ids[1]) - return label_map diff --git a/data/coco_labels.txt b/data/coco_labels.txt new file mode 100644 index 000000000..146dd8daa --- /dev/null +++ b/data/coco_labels.txt @@ -0,0 +1,80 @@ +1,1,person +2,2,bicycle +3,3,car +4,4,motorcycle +5,5,airplane +6,6,bus +7,7,train +8,8,truck +9,9,boat +10,10,traffic light +11,11,fire hydrant +13,12,stop sign +14,13,parking meter +15,14,bench +16,15,bird +17,16,cat +18,17,dog +19,18,horse +20,19,sheep +21,20,cow +22,21,elephant +23,22,bear +24,23,zebra +25,24,giraffe +27,25,backpack +28,26,umbrella +31,27,handbag +32,28,tie +33,29,suitcase +34,30,frisbee +35,31,skis +36,32,snowboard +37,33,sports ball +38,34,kite +39,35,baseball bat +40,36,baseball glove +41,37,skateboard +42,38,surfboard +43,39,tennis racket +44,40,bottle +46,41,wine glass +47,42,cup +48,43,fork +49,44,knife +50,45,spoon +51,46,bowl +52,47,banana +53,48,apple +54,49,sandwich +55,50,orange +56,51,broccoli +57,52,carrot +58,53,hot dog +59,54,pizza +60,55,donut +61,56,cake +62,57,chair +63,58,couch +64,59,potted plant +65,60,bed +67,61,dining table +70,62,toilet +72,63,tv +73,64,laptop +74,65,mouse +75,66,remote +76,67,keyboard +77,68,cell phone +78,69,microwave +79,70,oven +80,71,toaster +81,72,sink +82,73,refrigerator +84,74,book +85,75,clock +86,76,vase +87,77,scissors +88,78,teddy bear +89,79,hair drier +90,80,toothbrush diff --git a/data/config.py b/data/config.py index f164b4f36..8999622cc 100644 --- a/data/config.py +++ b/data/config.py @@ -12,6 +12,9 @@ # SSD300 CONFIGS voc = { + 'num_classes': 21, + 'lr_steps': (80000, 100000, 120000), + 'max_iter': 120000, 'feature_maps': [38, 19, 10, 5, 3, 1], 'min_dim': 300, 'steps': [8, 16, 32, 64, 100, 300], @@ -24,6 +27,9 @@ } coco = { + 'num_classes': 201, + 'lr_steps': (280000, 360000, 400000), + 'max_iter': 400000, 'feature_maps': [38, 19, 10, 5, 3, 1], 'min_dim': 300, 'steps': [8, 16, 32, 64, 100, 300], diff --git a/data/voc0712.py b/data/voc0712.py index 105c75f95..a3e80d037 100644 --- a/data/voc0712.py +++ b/data/voc0712.py @@ -6,13 +6,10 @@ Updated by: Ellis Brown, Max deGroot """ from .config import HOME -import os -import os.path +import os.path as osp import sys import torch import torch.utils.data as data -import torchvision.transforms as transforms -from PIL import Image, ImageDraw, ImageFont import cv2 import numpy as np if sys.version_info[0] == 2: @@ -28,7 +25,7 @@ 'sheep', 'sofa', 'train', 'tvmonitor') # note: if you used our download scripts, this should be right -VOC_ROOT = os.path.join(HOME, "data/VOCdevkit/") +VOC_ROOT = osp.join(HOME, 
"data/VOCdevkit/") class VOCAnnotationTransform(object): @@ -97,19 +94,21 @@ class VOCDetection(data.Dataset): (default: 'VOC2007') """ - def __init__(self, root, image_sets, transform=None, target_transform=None, + def __init__(self, root, + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + transform=None, target_transform=VOCAnnotationTransform(), dataset_name='VOC0712'): self.root = root self.image_set = image_sets self.transform = transform self.target_transform = target_transform self.name = dataset_name - self._annopath = os.path.join('%s', 'Annotations', '%s.xml') - self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg') + self._annopath = osp.join('%s', 'Annotations', '%s.xml') + self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') self.ids = list() for (year, name) in image_sets: - rootpath = os.path.join(self.root, 'VOC' + year) - for line in open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')): + rootpath = osp.join(self.root, 'VOC' + year) + for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): self.ids.append((rootpath, line.strip())) def __getitem__(self, index): diff --git a/demo/demo.ipynb b/demo/demo.ipynb index 2bb802f49..2642b8460 100644 --- a/demo/demo.ipynb +++ b/demo/demo.ipynb @@ -26,16 +26,12 @@ "import torch.nn as nn\n", "import torch.backends.cudnn as cudnn\n", "from torch.autograd import Variable\n", - "import torch.utils.data as data\n", - "import torchvision.transforms as transforms\n", - "from torch.utils.serialization import load_lua\n", "import numpy as np\n", "import cv2\n", "if torch.cuda.is_available():\n", " torch.set_default_tensor_type('torch.cuda.FloatTensor')\n", "\n", - "from ssd import build_ssd\n", - "# from models import build_ssd as build_ssd_v1 # uncomment for older pool6 model" + "from ssd import build_ssd" ] }, { @@ -52,6 +48,7 @@ "cell_type": "code", "execution_count": 2, "metadata": { + "collapsed": false, "scrolled": false }, "outputs": [ @@ -80,7 +77,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -97,9 +96,9 @@ "# image = cv2.imread('./data/example.jpg', cv2.IMREAD_COLOR) # uncomment if dataset not downloaded\n", "%matplotlib inline\n", "from matplotlib import pyplot as plt\n", - "from data import VOCDetection, VOCroot, AnnotationTransform\n", + "from data import VOCDetection, VOC_ROOT, VOCAnnotationTransform\n", "# here we specify year (07 or 12) and dataset ('test', 'val', 'train') \n", - "testset = VOCDetection(VOCroot, [('2007', 'val')], None, AnnotationTransform())\n", + "testset = VOCDetection(VOC_ROOT, [('2007', 'val')], None, VOCAnnotationTransform())\n", "img_id = 60\n", "image = testset.pull_image(img_id)\n", "rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n", @@ -123,7 +122,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -157,6 +158,7 @@ "cell_type": "code", "execution_count": 5, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], @@ -179,7 +181,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { diff --git a/demo/live.py b/demo/live.py index 5ff56e942..900a341c3 100644 --- a/demo/live.py +++ b/demo/live.py @@ -10,7 +10,7 @@ parser.add_argument('--weights', default='weights/ssd_300_VOC0712.pth', type=str, help='Trained state_dict file path') parser.add_argument('--cuda', default=False, type=bool, - 
help='Use cuda to train model')
+                    help='Use cuda in live demo')
 args = parser.parse_args()
 
 COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
@@ -30,10 +30,12 @@ def predict(frame):
             j = 0
             while detections[0, i, j, 0] >= 0.6:
                 pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
-                cv2.rectangle(frame, (int(pt[0]), int(pt[1])), (int(pt[2]),
-                              int(pt[3])), COLORS[i % 3], 2)
-                cv2.putText(frame, labelmap[i - 1], (int(pt[0]), int(pt[1])), FONT,
-                            2, (255, 255, 255), 2, cv2.LINE_AA)
+                cv2.rectangle(frame,
+                              (int(pt[0]), int(pt[1])),
+                              (int(pt[2]), int(pt[3])),
+                              COLORS[i % 3], 2)
+                cv2.putText(frame, labelmap[i - 1], (int(pt[0]), int(pt[1])),
+                            FONT, 2, (255, 255, 255), 2, cv2.LINE_AA)
                 j += 1
     return frame
 
@@ -77,8 +79,8 @@ def predict(frame):
     transform = BaseTransform(net.size, (104/256.0, 117/256.0, 123/256.0))
 
     fps = FPS().start()
-    # stop the timer and display FPS information
     cv2_demo(net.eval(), transform)
+    # stop the timer and display FPS information
    fps.stop()
 
     print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
diff --git a/eval.py b/eval.py
index 4b4f81e4d..217f045f2 100644
--- a/eval.py
+++ b/eval.py
@@ -8,13 +8,11 @@
 import torch
 import torch.nn as nn
 import torch.backends.cudnn as cudnn
-import torchvision.transforms as transforms
 from torch.autograd import Variable
-from data import VOCroot
+from data import VOC_ROOT, VOCAnnotationTransform, VOCDetection, BaseTransform
 from data import VOC_CLASSES as labelmap
 import torch.utils.data as data
 
-from data import AnnotationTransform, VOCDetection, BaseTransform, VOC_CLASSES
 from ssd import build_ssd
 
 import sys
@@ -30,12 +28,16 @@
 else:
     import xml.etree.ElementTree as ET
 
+
 def str2bool(v):
     return v.lower() in ("yes", "true", "t", "1")
 
-parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection')
-parser.add_argument('--trained_model', default='weights/ssd300_mAP_77.43_v2.pth',
-                    type=str, help='Trained state_dict file path to open')
+
+parser = argparse.ArgumentParser(
+    description='Single Shot MultiBox Detector Evaluation')
+parser.add_argument('--trained_model',
+                    default='weights/ssd300_mAP_77.43_v2.pth', type=str,
+                    help='Trained state_dict file path to open')
 parser.add_argument('--save_folder', default='eval/', type=str,
                     help='File path to save results')
 parser.add_argument('--confidence_threshold', default=0.01, type=float,
@@ -44,26 +46,36 @@ def str2bool(v):
                     help='Further restrict the number of predictions to parse')
 parser.add_argument('--cuda', default=True, type=str2bool,
                     help='Use cuda to train model')
-parser.add_argument('--voc_root', default=VOCroot, help='Location of VOC root directory')
-parser.add_argument('-f', default=None, type=str, help="Dummy arg so we can load in Jupyter Notebooks")
+parser.add_argument('--voc_root', default=VOC_ROOT,
+                    help='Location of VOC root directory')
+parser.add_argument('--cleanup', default=True, type=str2bool,
+                    help='Cleanup and remove results files following eval')
+
 args = parser.parse_args()
 
 if not os.path.exists(args.save_folder):
     os.mkdir(args.save_folder)
 
-if args.cuda and torch.cuda.is_available():
-    torch.set_default_tensor_type('torch.cuda.FloatTensor')
+if torch.cuda.is_available():
+    if args.cuda:
+        torch.set_default_tensor_type('torch.cuda.FloatTensor')
+    if not args.cuda:
+        print("WARNING: It looks like you have a CUDA device, but aren't using \
+              CUDA. 
Run with --cuda for optimal eval speed.") + torch.set_default_tensor_type('torch.FloatTensor') else: torch.set_default_tensor_type('torch.FloatTensor') annopath = os.path.join(args.voc_root, 'VOC2007', 'Annotations', '%s.xml') imgpath = os.path.join(args.voc_root, 'VOC2007', 'JPEGImages', '%s.jpg') -imgsetpath = os.path.join(args.voc_root, 'VOC2007', 'ImageSets', 'Main', '{:s}.txt') +imgsetpath = os.path.join(args.voc_root, 'VOC2007', 'ImageSets', + 'Main', '{:s}.txt') YEAR = '2007' -devkit_path = VOCroot + 'VOC' + YEAR +devkit_path = args.voc_root + 'VOC' + YEAR dataset_mean = (104, 117, 123) set_type = 'test' + class Timer(object): """A simple timer.""" def __init__(self): @@ -183,7 +195,7 @@ def voc_ap(rec, prec, use_07_metric=True): """ ap = voc_ap(rec, prec, [use_07_metric]) Compute VOC AP given precision and recall. If use_07_metric is true, uses the - VOC 07 11 point method (default:False). + VOC 07 11 point method (default:True). """ if use_07_metric: # 11 point metric @@ -236,7 +248,7 @@ def voc_eval(detpath, cachedir: Directory for caching the annotations [ovthresh]: Overlap threshold (default = 0.5) [use_07_metric]: Whether to use VOC07's 11 point AP computation - (default False) + (default True) """ # assumes detections are in detpath.format(classname) # assumes annotations are in annopath.format(imagename) @@ -351,7 +363,6 @@ def voc_eval(detpath, def test_net(save_folder, net, cuda, dataset, transform, top_k, im_size=300, thresh=0.05): - """Test a Fast R-CNN network on an image database.""" num_images = len(dataset) # all detections are collected into: # all_boxes[cls][image] = N x 5 array of detections in @@ -387,8 +398,9 @@ def test_net(save_folder, net, cuda, dataset, transform, top_k, boxes[:, 1] *= h boxes[:, 3] *= h scores = dets[:, 0].cpu().numpy() - cls_dets = np.hstack((boxes.cpu().numpy(), scores[:, np.newaxis])) \ - .astype(np.float32, copy=False) + cls_dets = np.hstack((boxes.cpu().numpy(), + scores[:, np.newaxis])).astype(np.float32, + copy=False) all_boxes[j][i] = cls_dets print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, @@ -408,13 +420,15 @@ def evaluate_detections(box_list, output_dir, dataset): if __name__ == '__main__': # load net - num_classes = len(VOC_CLASSES) + 1 # +1 background - net = build_ssd('test', 300, num_classes) # initialize SSD + num_classes = len(labelmap) + 1 # +1 for background + net = build_ssd('test', 300, num_classes) # initialize SSD net.load_state_dict(torch.load(args.trained_model)) net.eval() print('Finished loading model!') # load data - dataset = VOCDetection(args.voc_root, [('2007', set_type)], BaseTransform(300, dataset_mean), AnnotationTransform()) + dataset = VOCDetection(args.voc_root, [('2007', set_type)], + BaseTransform(300, dataset_mean), + VOCAnnotationTransform()) if args.cuda: net = net.cuda() cudnn.benchmark = True diff --git a/layers/functions/prior_box.py b/layers/functions/prior_box.py index 96907ccf8..7848a390d 100644 --- a/layers/functions/prior_box.py +++ b/layers/functions/prior_box.py @@ -10,7 +10,6 @@ class PriorBox(object): """ def __init__(self, cfg): super(PriorBox, self).__init__() - # self.type = cfg.name self.image_size = cfg['min_dim'] # number of priors for feature map location (either 4 or 6) self.num_priors = len(cfg['aspect_ratios']) diff --git a/ssd.py b/ssd.py index 69cd69cf3..80a23d638 100644 --- a/ssd.py +++ b/ssd.py @@ -19,19 +19,20 @@ class SSD(nn.Module): Args: phase: (string) Can be "test" or "train" + size: input image size base: VGG16 layers for input, size of either 300 or 500 extras: 
extra layers that feed to multibox loc and conf layers head: "multibox head" consists of loc and conf conv layers """ - def __init__(self, phase, base, extras, head, num_classes): + def __init__(self, phase, size, base, extras, head, num_classes): super(SSD, self).__init__() self.phase = phase self.num_classes = num_classes - # TODO: implement __call__ in PriorBox - self.priorbox = PriorBox(coco) + self.cfg = (coco, voc)[num_classes == 21] + self.priorbox = PriorBox(self.cfg) self.priors = Variable(self.priorbox.forward(), volatile=True) - self.size = 300 + self.size = size # SSD network self.vgg = nn.ModuleList(base) @@ -98,7 +99,7 @@ def forward(self, x): output = self.detect( loc.view(loc.size(0), -1, 4), # loc preds self.softmax(conf.view(conf.size(0), -1, - self.num_classes)), # conf preds + self.num_classes)), # conf preds self.priors.type(type(x.data)) # default boxes ) else: @@ -196,12 +197,13 @@ def multibox(vgg, extra_layers, cfg, num_classes): def build_ssd(phase, size=300, num_classes=21): if phase != "test" and phase != "train": - print("Error: Phase not recognized") + print("ERROR: Phase: " + phase + " not recognized") return if size != 300: - print("Error: Sorry, only SSD300 is currently supported!") + print("ERROR: You specified size " + repr(size) + ". However, " + + "currently only SSD300 (size=300) is supported!") return base_, extras_, head_ = multibox(vgg(base[str(size)], 3), add_extras(extras[str(size)], 1024), mbox[str(size)], num_classes) - return SSD(phase, base_, extras_, head_, num_classes) + return SSD(phase, size, base_, extras_, head_, num_classes) diff --git a/train.py b/train.py index 199cefe24..427dd9244 100644 --- a/train.py +++ b/train.py @@ -1,8 +1,9 @@ -from data import coco, voc, VOCAnnotationTransform, COCOAnnotationTransform, VOCDetection, COCODetection, detection_collate, VOC_ROOT, COCO_ROOT, VOC_CLASSES, COCO_CLASSES, MEANS +from data import * from utils.augmentations import SSDAugmentation from layers.modules import MultiBoxLoss from ssd import build_ssd import os +import sys import time import torch from torch.autograd import Variable @@ -19,115 +20,136 @@ def str2bool(v): return v.lower() in ("yes", "true", "t", "1") -parser = argparse.ArgumentParser(description='Single Shot MultiBox Detector Training') -parser.add_argument('--dataset', default='COCO', help='VOC or COCO') -parser.add_argument('--image_set', default='trainval35k', help='Specific image set within dataset') -parser.add_argument('--basenet', default='vgg16_reducedfc.pth', help='Pretrained base model') -parser.add_argument('--batch_size', default=32, type=int, help='Batch size for training') -parser.add_argument('--resume', default=None, type=str, help='Resume training from checkpoint') -parser.add_argument('--start_iter', default=0, type=int, help='Begin counting iterations starting from this value (should be used with resume)') -parser.add_argument('--num_workers', default=4, type=int, help='Number of workers used in dataloading') -parser.add_argument('--max_iter', default=400000, type=int, help='Number of training iterations') -parser.add_argument('--cuda', default=True, type=str2bool, help='Use CUDA to train model') -parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, help='initial learning rate') -parser.add_argument('--momentum', default=0.9, type=float, help='Momentum value for optim') -parser.add_argument('--weight_decay', default=5e-4, type=float, help='Weight decay for SGD') -parser.add_argument('--gamma', default=0.1, type=float, help='Gamma update for 
SGD') -parser.add_argument('--log_iters', default=True, type=bool, help='Print the loss at each iteration') -parser.add_argument('--visdom', default=False, type=str2bool, help='Use visdom for loss visualization') -parser.add_argument('--send_images_to_visdom', type=str2bool, default=False, help='Sample a random image from every 10th batch, send it to visdom after augmentations step') -parser.add_argument('--save_folder', default='weights/', help='Directory for saving checkpoint models') -parser.add_argument('--dataset_root', default=COCO_ROOT, help='Dataset root directory path') -parser.add_argument('-f', default=None, type=str, help="Dummy arg so we can load in Jupyter Notebooks") +parser = argparse.ArgumentParser( + description='Single Shot MultiBox Detector Training With Pytorch') +train_set = parser.add_mutually_exclusive_group() +parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'], + type=str, help='VOC or COCO') +parser.add_argument('--dataset_root', default=VOC_ROOT, + help='Dataset root directory path') +parser.add_argument('--basenet', default='vgg16_reducedfc.pth', + help='Pretrained base model') +parser.add_argument('--batch_size', default=32, type=int, + help='Batch size for training') +parser.add_argument('--resume', default=None, type=str, + help='Checkpoint state_dict file to resume training from') +parser.add_argument('--start_iter', default=0, type=int, + help='Resume training at this iter') +parser.add_argument('--num_workers', default=4, type=int, + help='Number of workers used in dataloading') +parser.add_argument('--cuda', default=True, type=str2bool, + help='Use CUDA to train model') +parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, + help='initial learning rate') +parser.add_argument('--momentum', default=0.9, type=float, + help='Momentum value for optim') +parser.add_argument('--weight_decay', default=5e-4, type=float, + help='Weight decay for SGD') +parser.add_argument('--gamma', default=0.1, type=float, + help='Gamma update for SGD') +parser.add_argument('--visdom', default=False, type=str2bool, + help='Use visdom for loss visualization') +parser.add_argument('--save_folder', default='weights/', + help='Directory for saving checkpoint models') args = parser.parse_args() -if args.cuda and torch.cuda.is_available(): - torch.set_default_tensor_type('torch.cuda.FloatTensor') + +if torch.cuda.is_available(): + if args.cuda: + torch.set_default_tensor_type('torch.cuda.FloatTensor') + if not args.cuda: + print("WARNING: It looks like you have a CUDA device, but aren't " + + "using CUDA.\nRun with --cuda for optimal training speed.") + torch.set_default_tensor_type('torch.FloatTensor') else: torch.set_default_tensor_type('torch.FloatTensor') -# CONFIG = (voc, coco)[args.v == 'COCO'] - if not os.path.exists(args.save_folder): os.mkdir(args.save_folder) -SSD_DIM = 300 # only support 300 now -NUM_CLASSES = len(COCO_CLASSES) + 1 -STEP_VALUES = (280000, 360000, 400000) - -if args.visdom: - import visdom - viz = visdom.Visdom() - -ssd_net = build_ssd('train', SSD_DIM, NUM_CLASSES) -net = ssd_net - -if args.cuda: - net = torch.nn.DataParallel(ssd_net) - cudnn.benchmark = True - -if args.resume: - print('Resuming training, loading {}...'.format(args.resume)) - ssd_net.load_weights(args.resume) -else: - vgg_weights = torch.load(args.save_folder + args.basenet) - print('Loading base network...') - ssd_net.vgg.load_state_dict(vgg_weights) - -if args.cuda: - net = net.cuda() - - -def xavier(param): - init.xavier_uniform(param) - - -def 
weights_init(m): - if isinstance(m, nn.Conv2d): - xavier(m.weight.data) - m.bias.data.zero_() +def train(): + if args.dataset == 'COCO': + if args.dataset_root == VOC_ROOT: + if not os.path.exists(COCO_ROOT): + parser.error('Must specify dataset_root if specifying dataset') + print("WARNING: Using default COCO dataset_root because " + + "--dataset_root was not specified.") + args.dataset_root = COCO_ROOT + cfg = coco + dataset = COCODetection(root=args.dataset_root, + transform=SSDAugmentation(cfg['min_dim'], + MEANS)) + elif args.dataset == 'VOC': + if args.dataset_root == COCO_ROOT: + parser.error('Must specify dataset if specifying dataset_root') + cfg = voc + dataset = VOCDetection(root=args.dataset_root, + transform=SSDAugmentation(cfg['min_dim'], + MEANS)) -if not args.resume: - print('Initializing weights...') - # initialize newly added layers' weights with xavier method - ssd_net.extras.apply(weights_init) - ssd_net.loc.apply(weights_init) - ssd_net.conf.apply(weights_init) - -optimizer = optim.SGD(net.parameters(), lr=args.lr, - momentum=args.momentum, weight_decay=args.weight_decay) -criterion = MultiBoxLoss(NUM_CLASSES, 0.5, True, 0, True, 3, 0.5, False, - args.cuda) - + if args.visdom: + import visdom + viz = visdom.Visdom() + + ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes']) + net = ssd_net + + if args.cuda: + net = torch.nn.DataParallel(ssd_net) + cudnn.benchmark = True + + if args.resume: + print('Resuming training, loading {}...'.format(args.resume)) + ssd_net.load_weights(args.resume) + else: + vgg_weights = torch.load(args.save_folder + args.basenet) + print('Loading base network...') + ssd_net.vgg.load_state_dict(vgg_weights) + + if args.cuda: + net = net.cuda() + + if not args.resume: + print('Initializing weights...') + # initialize newly added layers' weights with xavier method + ssd_net.extras.apply(weights_init) + ssd_net.loc.apply(weights_init) + ssd_net.conf.apply(weights_init) + + optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, + weight_decay=args.weight_decay) + criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5, + False, args.cuda) -def train(): net.train() # loss counters loc_loss = 0 conf_loss = 0 epoch = 0 - print('Loading Dataset...') - dataset = COCODetection(args.dataset_root, args.image_set, SSDAugmentation( - SSD_DIM, MEANS), COCOAnnotationTransform()) + print('Loading the dataset...') epoch_size = len(dataset) // args.batch_size - print('Training SSD on', dataset.name) + print('Training SSD on:', dataset.name) + print('Using the specified args:') + print(args) + step_index = 0 if args.visdom: - vis_title = 'SSD.PyTorch on ' + args.image_set + vis_title = 'SSD.PyTorch on ' + dataset.name vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss'] iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend) epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend) + data_loader = data.DataLoader(dataset, args.batch_size, num_workers=args.num_workers, shuffle=True, collate_fn=detection_collate, pin_memory=True) # create batch iterator batch_iterator = iter(data_loader) - for iteration in range(args.start_iter, args.max_iter): - if iteration != 0 and (iteration % epoch_size == 0) and args.visdom: + for iteration in range(args.start_iter, cfg['max_iter']): + if args.visdom and iteration != 0 and (iteration % epoch_size == 0): update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None, 'append', epoch_size) # reset epoch loss counters @@ -135,7 +157,7 @@ def train(): conf_loss 
= 0 epoch += 1 - if iteration in STEP_VALUES: + if iteration in cfg['lr_steps']: step_index += 1 adjust_learning_rate(optimizer, args.gamma, step_index) @@ -169,7 +191,7 @@ def train(): update_vis_plot(iteration, loss_l.data[0], loss_c.data[0], iter_plot, epoch_plot, 'append') - if iteration % 5000 == 0: + if iteration != 0 and iteration % 5000 == 0: print('Saving state, iter:', iteration) torch.save(ssd_net.state_dict(), 'weights/ssd300_COCO_' + repr(iteration) + '.pth') @@ -188,6 +210,16 @@ def adjust_learning_rate(optimizer, gamma, step): param_group['lr'] = lr +def xavier(param): + init.xavier_uniform(param) + + +def weights_init(m): + if isinstance(m, nn.Conv2d): + xavier(m.weight.data) + m.bias.data.zero_() + + def create_vis_plot(_xlabel, _ylabel, _title, _legend): return viz.line( X=torch.zeros((1,)).cpu(),
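The `@@ -188,6 +210,16 @@` hunk above shows only the edges of `adjust_learning_rate`, so for readers following the new `cfg['lr_steps']` logic, here is a minimal sketch of the step decay it performs. The body below is an assumption, not quoted from this diff: it uses the conventional `lr = base_lr * gamma**step` form implied by the visible `param_group['lr'] = lr` context line and the `--gamma` default of 0.1.

```Python
def adjust_learning_rate(optimizer, gamma, step):
    # ASSUMED body (standard step decay): each time train() passes an
    # entry in cfg['lr_steps'], it bumps step_index and calls this.
    # With the VOC config above (lr_steps=(80000, 100000, 120000),
    # gamma=0.1, lr=1e-3), the rate falls 1e-3 -> 1e-4 -> 1e-5.
    lr = args.lr * (gamma ** step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
```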