diff --git a/.gitmodules b/.gitmodules index 22cff10..c149f60 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "vqa/external/skip-thoughts.torch"] path = vqa/external/skip-thoughts.torch url = https://github.com/Cadene/skip-thoughts.torch.git +[submodule "vqa/external/pretrained-models.pytorch"] + path = vqa/external/pretrained-models.pytorch + url = https://github.com/Cadene/pretrained-models.pytorch.git diff --git a/README.md b/README.md index 0c74163..054054e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Visual Question Answering in pytorch -This repo was made by [Remi Cadene](http://remicadene.com) (LIP6) and [Hedi Ben-Younes](https://twitter.com/labegne) (LIP6-Heuritech), two PhD Students working on VQA at [UPMC-LIP6](http://lip6.fr) and their professors [Matthieu Cord](http://webia.lip6.fr/~cord) (LIP6) and [Nicolas Thome](http://webia.lip6.fr/~thomen) (LIP6-CNAM). We developped this code in the frame of a research paper called [MUTAN: Multimodal Tucker Fusion for VQA](https://arxiv.org/abs/1705.06676) which is (as far as we know) the current state-of-the-art on the [VQA-1 dataset](http://visualqa.org). +This repo was made by [Remi Cadene](http://remicadene.com) (LIP6) and [Hedi Ben-Younes](https://twitter.com/labegne) (LIP6-Heuritech), two PhD Students working on VQA at [UPMC-LIP6](http://lip6.fr) and their professors [Matthieu Cord](http://webia.lip6.fr/~cord) (LIP6) and [Nicolas Thome](http://webia.lip6.fr/~thomen) (LIP6-CNAM). We developed this code in the frame of a research paper called [MUTAN: Multimodal Tucker Fusion for VQA](https://arxiv.org/abs/1705.06676) which is (as far as we know) the current state-of-the-art on the [VQA 1.0 dataset](http://visualqa.org). The goal of this repo is two folds: - to make it easier to reproduce our results, @@ -8,6 +8,13 @@ The goal of this repo is two folds: If you have any questions about our code or model, don't hesitate to contact us or to submit any issues. Pull request are welcome! +#### News: + +- Coming soon: pretrained models on VQA2, features of FBResnet152, web app demo +- 18th July 2017: VQA2, VisualGenome, FBResnet152 (for pytorch) added +- 16th July 2017: paper accepted at ICCV2017 +- 30th May 2017: poster accepted at CVPR2017 (VQA Workshop) + #### Summary: * [Introduction](#introduction) @@ -27,7 +34,10 @@ If you have any questions about our code or model, don't hesitate to contact us * [Models](#models) * [Quick examples](#quick-examples) * [Extract features from COCO](#extract-features-from-coco) - * [Train models on VQA](#train-models-on-vqa) + * [Extract features from VisualGenome](#extract-features-from-visualgenome) + * [Train models on VQA 1.0](#train-models-on-vqa-1-0) + * [Train models on VQA 2.0](#train-models-on-vqa-2-0) + * [Train models on VQA + VisualGenome](#train-models-on-vqa-2-0) * [Monitor training](#monitor-training) * [Restart training](#restart-training) * [Evaluate models on VQA](#evaluate-models-on-vqa) @@ -108,7 +118,7 @@ Our code has two external dependencies: Data will be automaticaly downloaded and preprocessed when needed. Links to data are stored in `vqa/datasets/vqa.py` and `vqa/datasets/coco.py`. 
-## Reproducing results +## Reproducing results on VQA 1.0 ### Features @@ -173,7 +183,7 @@ To obtain test and testdev results, you will need to zip your result json file ( | ├── train.py # train & eval models ├── eval_res.py # eval results files with OpenEnded metric -├── extract.pt # extract features from coco with CNNs +├── extract.py # extract features from coco with CNNs └── visu.py # visualize logs and monitor training ``` @@ -189,16 +199,15 @@ You can easly add new options in your custom yaml file if needed. Also, if you w ### Datasets -We currently provide three datasets: +We currently provide four datasets: - [COCOImages](http://mscoco.org/) currently used to extract features, it comes with three datasets: trainset, valset and testset -- COCOFeatures used by any VQA datasets -- [VQA](http://www.visualqa.org/vqa_v1_download.html) comes with four datasets: trainset, valset, testset (including test-std and test-dev) and "trainvalset" (concatenation of trainset and valset) +- [VisualGenomeImages](http://visualgenome.org/) currently used to extract features, it comes with one split: trainset +- [VQA 1.0](http://www.visualqa.org/vqa_v1_download.html) comes with four datasets: trainset, valset, testset (including test-std and test-dev) and "trainvalset" (concatenation of trainset and valset) +- [VQA 2.0](http://www.visualqa.org) same as VQA 1.0, but roughly twice as big (it uses the same images) We plan to add: -- [VisualGenome](http://visualgenome.org/) -- [VQA2](http://www.visualqa.org/) - [CLEVR](http://cs.stanford.edu/people/jcjohns/clevr/) ### Models @@ -245,7 +254,16 @@ CUDA_VISIBLE_DEVICES=0 python extract.py CUDA_VISIBLE_DEVICES=1,2 python extract.py ``` -### Train models on VQA +### Extract features from VisualGenome + +Same as for COCO, but only the train split is available: + +``` +python extract.py --dataset vgenome --dir_data data/vgenome --data_split train +``` + + +### Train models on VQA 1.0 Display help message, selected options and run default. The needed data will be automaticaly downloaded and processed using the options in `options/default.yaml`. ``` python train.py ``` Run a MutanNoAtt model with default options. ``` -python train.py --path_opt options/vqa/mutan_noatt.yaml --dir_logs logs/vqa/mutan_noatt +python train.py --path_opt options/vqa/mutan_noatt.yaml --dir_logs logs/vqa/mutan_noatt_train ``` Run a MutanAtt model on the trainset and evaluate on the valset after each epoch. ``` -python train.py --vqa_trainsplit train --path_opt options/vqa/mutan_att_trainval.yaml ``` Run a MutanAtt model on the trainset and valset (by default) and run throw the testset after each epoch (produce a results file that you can submit to the evaluation server). 
``` -python train.py --vqa_trainsplit trainval --path_opt options/vqa/mutan_att.yaml +python train.py --vqa_trainsplit trainval --path_opt options/vqa/mutan_att_trainval.yaml ``` ### Monitor training @@ -301,6 +319,22 @@ Create a visualization of multiple experiments to compare them or monitor them l python visu.py --dir_logs logs/vqa/mutan_noatt,logs/vqa/mutan_att ``` +### Train models on VQA 2.0 + +See options of [vqa2/mutan_att_trainval](https://github.com/Cadene/vqa.pytorch/blob/master/options/vqa2/mutan_att_trainval.yaml): + +``` +python train.py --path_opt options/vqa2/mutan_att_trainval.yaml +``` + +### Train models on VQA (1.0 or 2.0) + VisualGenome + +See options of [vqa2/mutan_att_trainval_vg](https://github.com/Cadene/vqa.pytorch/blob/master/options/vqa2/mutan_att_trainval_vg.yaml): + +``` +python train.py --path_opt options/vqa2/mutan_att_trainval_vg.yaml +``` + ### Restart training Restart the model from the last checkpoint. @@ -329,13 +363,14 @@ Please cite the arXiv paper if you use Mutan in your work: ``` @article{benyounescadene2017mutan, - title={MUTAN: Multimodal Tucker Fusion for Visual Question Answering}, - author={Hedi Ben-Younes and - R{\'{e}}mi Cad{\`{e}}ne and - Nicolas Thome and - Matthieu Cord}}, - journal={arXiv preprint arXiv:1705.06676}, - year={2017} +author = {Hedi Ben-Younes and +R{\'{e}}mi Cad{\`{e}}ne and +Nicolas Thome and +Matthieu Cord}, +title = {MUTAN: Multimodal Tucker Fusion for Visual Question Answering}, +journal = {ICCV}, +year = {2017}, +url = {http://arxiv.org/abs/1705.06676} } ``` diff --git a/extract.py b/extract.py index 8d8b713..773cea2 100644 --- a/extract.py +++ b/extract.py @@ -8,60 +8,73 @@ import torch.nn as nn import torch.nn.parallel import torch.backends.cudnn as cudnn +from torch.autograd import Variable import torchvision.transforms as transforms import torchvision.datasets as datasets -import torchvision.models as models -import vqa.datasets.coco as coco +import vqa.models.convnets as convnets +import vqa.datasets as datasets from vqa.lib.dataloader import DataLoader -from vqa.models.utils import ResNet from vqa.lib.logger import AvgMeter -model_names = sorted(name for name in models.__dict__ - if name.islower() and name.startswith("resnet") - and callable(models.__dict__[name])) - parser = argparse.ArgumentParser(description='Extract') -parser.add_argument('--dir_data', default='data/coco', metavar='DIR', - help='dir dataset: mscoco or visualgenome') +parser.add_argument('--dataset', default='coco', + choices=['coco', 'vgenome'], + help='dataset type: coco (default) | vgenome') +parser.add_argument('--dir_data', default='data/coco', + help='dir dataset to download or/and load images') parser.add_argument('--data_split', default='train', type=str, help='Options: (default) train | val | test') -parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet152', - choices=model_names, +parser.add_argument('--arch', '-a', default='resnet152', + choices=convnets.model_names, help='model architecture: ' + - ' | '.join(model_names) + - ' (default: resnet152)') -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loading workers (default: 8)') -parser.add_argument('--batch_size', '-b', default=80, type=int, metavar='N', + ' | '.join(convnets.model_names) + + ' (default: fbresnet152)') +parser.add_argument('--workers', default=4, type=int, + help='number of data loading workers (default: 4)') +parser.add_argument('--batch_size', '-b', default=80, type=int, help='mini-batch size (default: 80)') 
parser.add_argument('--mode', default='both', type=str, help='Options: att | noatt | (default) both') +parser.add_argument('--size', default=448, type=int, + help='Image size (448 for noatt := avg pooling to get 224) (default:448)') def main(): + global args args = parser.parse_args() print("=> using pre-trained model '{}'".format(args.arch)) - model = models.__dict__[args.arch](pretrained=True) - model = ResNet(model, False) - model = nn.DataParallel(model).cuda() + model = convnets.factory({'arch':args.arch}, cuda=True, data_parallel=True) - #extract_name = 'arch,{}_layer,{}_resize,{}'.format() - extract_name = 'arch,{}'.format(args.arch) + extract_name = 'arch,{}_size,{}'.format(args.arch, args.size) - #dir_raw = os.path.join(args.dir_data, 'raw') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - dataset = coco.COCOImages(args.data_split, dict(dir=args.dir_data), - transform=transforms.Compose([ - transforms.Scale(448), - transforms.CenterCrop(448), - transforms.ToTensor(), - normalize, - ])) + if args.dataset == 'coco': + if 'coco' not in args.dir_data: + raise ValueError('"coco" string not in dir_data') + dataset = datasets.COCOImages(args.data_split, dict(dir=args.dir_data), + transform=transforms.Compose([ + transforms.Scale(args.size), + transforms.CenterCrop(args.size), + transforms.ToTensor(), + normalize, + ])) + elif args.dataset == 'vgenome': + if args.data_split != 'train': + raise ValueError('train split is required for vgenome') + if 'vgenome' not in args.dir_data: + raise ValueError('"vgenome" string not in dir_data') + dataset = datasets.VisualGenomeImages(args.data_split, dict(dir=args.dir_data), + transform=transforms.Compose([ + transforms.Scale(args.size), + transforms.CenterCrop(args.size), + transforms.ToTensor(), + normalize, + ])) data_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, @@ -79,13 +92,19 @@ def extract(data_loader, model, path_file, mode): path_txt = path_file + '.txt' hdf5_file = h5py.File(path_hdf5, 'w') + # estimate output shapes + output = model(Variable(torch.ones(1, 3, args.size, args.size), + volatile=True)) + nb_images = len(data_loader.dataset) if mode == 'both' or mode == 'att': - shape_att = (nb_images, 2048, 14, 14) + shape_att = (nb_images, output.size(1), output.size(2), output.size(3)) + print('Warning: shape_att={}'.format(shape_att)) hdf5_att = hdf5_file.create_dataset('att', shape_att, dtype='f')#, compression='gzip') if mode == 'both' or mode == 'noatt': - shape_noatt = (nb_images, 2048) + shape_noatt = (nb_images, output.size(1)) + print('Warning: shape_noatt={}'.format(shape_noatt)) hdf5_noatt = hdf5_file.create_dataset('noatt', shape_noatt, dtype='f')#, compression='gzip') @@ -98,7 +117,7 @@ def extract(data_loader, model, path_file, mode): idx = 0 for i, input in enumerate(data_loader): - input_var = torch.autograd.Variable(input['visual'], volatile=True) + input_var = Variable(input['visual'], volatile=True) output_att = model(input_var) nb_regions = output_att.size(2) * output_att.size(3) @@ -111,6 +130,7 @@ def extract(data_loader, model, path_file, mode): hdf5_noatt[idx:idx+batch_size] = output_noatt.data.cpu().numpy() idx += batch_size + torch.cuda.synchronize() batch_time.update(time.time() - end) end = time.time() diff --git a/options/vqa2/default.yaml b/options/vqa2/default.yaml new file mode 100644 index 0000000..258d698 --- /dev/null +++ b/options/vqa2/default.yaml @@ -0,0 +1,40 @@ +logs: + dir_logs: logs/vqa2/default +vqa: + dataset: VQA2 + dir: data/vqa2 
+ trainsplit: train + nans: 2000 + maxlength: 26 + minwcount: 0 + nlp: mcb + pad: right + samplingans: True +coco: + dir: data/coco + arch: fbresnet152 + mode: noatt + size: 448 +model: + arch: MLBNoAtt + seq2vec: + arch: skipthoughts + dir_st: data/skip-thoughts + type: UniSkip + dropout: 0.25 + fixed_emb: False + fusion: + dim_v: 2048 + dim_q: 2400 + dim_h: 1200 + dropout_v: 0.5 + dropout_q: 0.5 + activation_v: tanh + activation_q: tanh + classif: + activation: tanh + dropout: 0.5 +optim: + lr: 0.0001 + batch_size: 512 + epochs: 100 diff --git a/options/vqa2/mlb_att_trainval.yaml b/options/vqa2/mlb_att_trainval.yaml new file mode 100644 index 0000000..861a42b --- /dev/null +++ b/options/vqa2/mlb_att_trainval.yaml @@ -0,0 +1,49 @@ +logs: + dir_logs: logs/vqa2/mlb_att_trainval +vqa: + dataset: VQA2 + dir: data/vqa2 + trainsplit: trainval + nans: 2000 + maxlength: 26 + minwcount: 0 + nlp: mcb + pad: right + samplingans: True +coco: + dir: data/coco + arch: fbresnet152 + mode: att + size: 448 +model: + arch: MLBAtt + dim_v: 2048 + dim_q: 2400 + seq2vec: + arch: skipthoughts + dir_st: data/skip-thoughts + type: BayesianUniSkip + dropout: 0.25 + fixed_emb: False + attention: + nb_glimpses: 4 + dim_h: 1200 + dropout_v: 0.5 + dropout_q: 0.5 + dropout_mm: 0.5 + activation_v: tanh + activation_q: tanh + activation_mm: tanh + fusion: + dim_h: 1200 + dropout_v: 0.5 + dropout_q: 0.5 + activation_v: tanh + activation_q: tanh + classif: + activation: tanh + dropout: 0.5 +optim: + lr: 0.0001 + batch_size: 128 + epochs: 100 diff --git a/options/vqa2/mutan_att_trainval.yaml b/options/vqa2/mutan_att_trainval.yaml new file mode 100644 index 0000000..21a9086 --- /dev/null +++ b/options/vqa2/mutan_att_trainval.yaml @@ -0,0 +1,57 @@ +logs: + dir_logs: logs/vqa2/mutan_att_trainval +vqa: + dataset: VQA2 + dir: data/vqa2 + trainsplit: trainval + nans: 2000 + maxlength: 26 + minwcount: 0 + nlp: mcb + pad: right + samplingans: True +coco: + dir: data/coco + arch: fbresnet152 + mode: att + size: 448 +model: + arch: MutanAtt + dim_v: 2048 + dim_q: 2400 + seq2vec: + arch: skipthoughts + dir_st: data/skip-thoughts + type: BayesianUniSkip + dropout: 0.25 + fixed_emb: False + attention: + nb_glimpses: 2 + dim_hv: 310 + dim_hq: 310 + dim_mm: 510 + R: 5 + dropout_v: 0.5 + dropout_q: 0.5 + dropout_mm: 0.5 + activation_v: tanh + activation_q: tanh + dropout_hv: 0 + dropout_hq: 0 + fusion: + dim_hv: 620 + dim_hq: 310 + dim_mm: 510 + R: 5 + dropout_v: 0.5 + dropout_q: 0.5 + activation_v: tanh + activation_q: tanh + dropout_hv: 0 + dropout_hq: 0 + classif: + dropout: 0.5 +optim: + lr: 0.0001 + batch_size: 128 + epochs: 100 diff --git a/options/vqa2/mutan_att_trainval_vg.yaml b/options/vqa2/mutan_att_trainval_vg.yaml new file mode 100644 index 0000000..103888c --- /dev/null +++ b/options/vqa2/mutan_att_trainval_vg.yaml @@ -0,0 +1,68 @@ +logs: + dir_logs: logs/vqa2/mutan_att_trainval +vqa: + dataset: VQA2 + dir: data/vqa2 + trainsplit: trainval + nans: 2000 + maxlength: 26 + minwcount: 0 + nlp: mcb + pad: right + samplingans: True +coco: + dir: data/coco + arch: fbresnet152torchported + mode: att + size: 448 +vgenome: + trainsplit: train + dir: data/vgenome + arch: fbresnet152 + mode: att + size: 448 + nans: 2000 + maxlength: 26 + minwcount: 0 + nlp: mcb + pad: right +model: + arch: MutanAtt + dim_v: 2048 + dim_q: 2400 + seq2vec: + arch: skipthoughts + dir_st: data/skip-thoughts + type: BayesianUniSkip + dropout: 0.25 + fixed_emb: False + attention: + nb_glimpses: 2 + dim_hv: 310 + dim_hq: 310 + dim_mm: 510 + R: 5 + dropout_v: 
0.5 + dropout_q: 0.5 + dropout_mm: 0.5 + activation_v: tanh + activation_q: tanh + dropout_hv: 0 + dropout_hq: 0 + fusion: + dim_hv: 620 + dim_hq: 310 + dim_mm: 510 + R: 5 + dropout_v: 0.5 + dropout_q: 0.5 + activation_v: tanh + activation_q: tanh + dropout_hv: 0 + dropout_hq: 0 + classif: + dropout: 0.5 +optim: + lr: 0.0001 + batch_size: 128 + epochs: 100 diff --git a/options/vqa2/mutan_noatt_train.yaml b/options/vqa2/mutan_noatt_train.yaml new file mode 100644 index 0000000..0591039 --- /dev/null +++ b/options/vqa2/mutan_noatt_train.yaml @@ -0,0 +1,44 @@ +logs: + dir_logs: logs/vqa2/mutan_noatt_train +vqa: + dataset: VQA2 + dir: data/vqa2 + trainsplit: train + nans: 2000 + maxlength: 26 + minwcount: 0 + nlp: mcb + pad: right + samplingans: True +coco: + dir: data/coco + arch: fbresnet152 + mode: noatt + size: 448 +model: + arch: MutanNoAtt + seq2vec: + arch: skipthoughts + dir_st: data/skip-thoughts + type: BayesianUniSkip + dropout: 0.25 + fixed_emb: False + fusion: + dim_v: 2048 + dim_q: 2400 + dim_hv: 360 + dim_hq: 360 + dim_mm: 360 + R: 10 + dropout_v: 0.5 + dropout_q: 0.5 + activation_v: tanh + activation_q: tanh + dropout_hv: 0 + dropout_hq: 0 + classif: + dropout: 0.5 +optim: + lr: 0.0001 + batch_size: 512 + epochs: 100 diff --git a/train.py b/train.py index 0b81283..7df46b4 100644 --- a/train.py +++ b/train.py @@ -17,10 +17,6 @@ import vqa.datasets as datasets import vqa.models as models -model_names = sorted(name for name in models.__dict__ - if not name.startswith("__") - and callable(models.__dict__[name])) - parser = argparse.ArgumentParser( description='Train/Evaluate models', formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -35,13 +31,11 @@ # data options parser.add_argument('--vqa_trainsplit', type=str, choices=['train','trainval']) # model options -parser.add_argument('--arch', choices=model_names, +parser.add_argument('--arch', choices=models.model_names, help='vqa model architecture: ' + - ' | '.join(model_names)) + ' | '.join(models.model_names)) parser.add_argument('--st_type', help='skipthoughts type') -parser.add_argument('--emb_drop', type=float, - help='embedding dropout') parser.add_argument('--st_dropout', type=float) parser.add_argument('--st_fixed_emb', default=None, type=utils.str2bool, help='backprop on embedding') @@ -78,11 +72,13 @@ def main(): global args, best_acc1 args = parser.parse_args() - # Set options + ######################################################################################### + # Create options + ######################################################################################### + options = { 'vqa' : { - 'trainsplit': args.vqa_trainsplit, - 'dropout': args.emb_drop + 'trainsplit': args.vqa_trainsplit }, 'logs': { 'dir_logs': args.dir_logs @@ -110,8 +106,18 @@ def main(): if args.help_opt: return - # Set datasets - trainset = datasets.factory_VQA(options['vqa']['trainsplit'], options['vqa'], options['coco']) + # Set datasets options + if 'vgenome' not in options: + options['vgenome'] = None + + ######################################################################################### + # Create needed datasets + ######################################################################################### + + trainset = datasets.factory_VQA(options['vqa']['trainsplit'], + options['vqa'], + options['coco'], + options['vgenome']) train_loader = trainset.data_loader(batch_size=options['optim']['batch_size'], num_workers=args.workers, shuffle=True) @@ -120,22 +126,27 @@ def main(): valset = datasets.factory_VQA('val', 
options['vqa'], options['coco']) val_loader = valset.data_loader(batch_size=options['optim']['batch_size'], num_workers=args.workers) + if options['vqa']['trainsplit'] == 'trainval' or args.evaluate: testset = datasets.factory_VQA('test', options['vqa'], options['coco']) test_loader = testset.data_loader(batch_size=options['optim']['batch_size'], num_workers=args.workers) - # Set model, criterion and optimizer - model = getattr(models, options['model']['arch'])( - options['model'], trainset.vocab_words(), trainset.vocab_answers()) - - model = nn.DataParallel(model).cuda() - criterion = criterions.factory_loss(options['vqa'], cuda=True) - #optimizer = torch.optim.Adam([model.module.seq2vec.rnn.gru_cell.parameters()], options['optim']['lr']) - #optimizer = torch.optim.Adam(model.parameters(), options['optim']['lr']) - optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), options['optim']['lr']) + ######################################################################################### + # Create model, criterion and optimizer + ######################################################################################### + + model = models.factory(options['model'], + trainset.vocab_words(), trainset.vocab_answers(), + cuda=True, data_parallel=True) + criterion = criterions.factory(options['vqa'], cuda=True) + optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), + options['optim']['lr']) - # Optionally resume from a checkpoint + ######################################################################################### + # args.resume: resume from a checkpoint OR create logs directory + ######################################################################################### + exp_logger = None if args.resume: args.start_epoch, best_acc1, exp_logger = load_checkpoint(model.module, optimizer, @@ -168,13 +179,16 @@ def main(): exp_logger.info['model_params'] = utils.params_count(model) print('Model has {} parameters'.format(exp_logger.info['model_params'])) - # Begin evaluation and training + ######################################################################################### + # args.evaluate: on valset OR/AND on testset + ######################################################################################### + if args.evaluate: path_logger_json = os.path.join(options['logs']['dir_logs'], 'logger.json') if options['vqa']['trainsplit'] == 'train': acc1, val_results = engine.validate(val_loader, model, criterion, - exp_logger, args.start_epoch, args.print_freq) + exp_logger, args.start_epoch, args.print_freq) # save results and compute OpenEnd accuracy exp_logger.to_json(path_logger_json) save_results(val_results, args.start_epoch, valset.split_name(), @@ -190,6 +204,10 @@ def main(): options['logs']['dir_logs'], options['vqa']['dir']) return + ######################################################################################### + # Begin training on train/val or trainval/test + ######################################################################################### + for epoch in range(args.start_epoch+1, options['optim']['epochs']): #adjust_learning_rate(optimizer, epoch) diff --git a/vqa/datasets/__init__.py b/vqa/datasets/__init__.py index 044d42c..f494f65 100644 --- a/vqa/datasets/__init__.py +++ b/vqa/datasets/__init__.py @@ -1 +1,3 @@ -from .vqa import factory_VQA \ No newline at end of file +from .vqa import factory as factory_VQA +from .coco import COCOImages +from .vgenome import VisualGenomeImages \ No newline at end of 
file diff --git a/vqa/datasets/coco.py b/vqa/datasets/coco.py index 0d3135d..238e295 100644 --- a/vqa/datasets/coco.py +++ b/vqa/datasets/coco.py @@ -88,10 +88,10 @@ def default_transform(size): ]) return transform -def factory_COCO(data_split, opt, transform=None): +def factory(data_split, opt, transform=None): if data_split == 'trainval': - trainset = factory_COCO('train', opt, transform) - valset = factory_COCO('val', opt, transform) + trainset = factory('train', opt, transform) + valset = factory('val', opt, transform) return COCOTrainval(trainset, valset) elif data_split in ['train', 'val', 'test']: if opt['mode'] == 'img': diff --git a/vqa/datasets/features.py b/vqa/datasets/features.py index 5073be8..d6a1c4d 100644 --- a/vqa/datasets/features.py +++ b/vqa/datasets/features.py @@ -12,10 +12,12 @@ def __init__(self, data_split, opt): self.dir_extract = os.path.join(self.opt['dir'], 'extract', 'arch,' + self.opt['arch']) + if 'size' in opt: + self.dir_extract += '_size,' + str(opt['size']) self.path_hdf5 = os.path.join(self.dir_extract, data_split + 'set.hdf5') assert os.path.isfile(self.path_hdf5), \ - 'you must extract the features first with extract.py' + 'File not found in {}, you must extract the features first with extract.py'.format(self.path_hdf5) self.hdf5_file = h5py.File(self.path_hdf5, 'r')#, driver='mpio', comm=MPI.COMM_WORLD) self.dataset_features = self.hdf5_file[self.opt['mode']] self.index_to_name, self.name_to_index = self._load_dicts() diff --git a/vqa/datasets/vgenome.py b/vqa/datasets/vgenome.py new file mode 100644 index 0000000..cb617d2 --- /dev/null +++ b/vqa/datasets/vgenome.py @@ -0,0 +1,111 @@ +import os +import torch +import torch.utils.data as data +import copy + +from .images import ImagesFolder, AbstractImagesDataset, default_loader +from .features import FeaturesDataset +from .vgenome_interim import vgenome_interim +from .vgenome_processed import vgenome_processed +from .coco import default_transform +from .utils import AbstractVQADataset + +def raw(dir_raw): + dir_img = os.path.join(dir_raw, 'images') + os.system('wget http://visualgenome.org/static/data/dataset/image_data.json.zip -P '+dir_raw) + os.system('wget http://visualgenome.org/static/data/dataset/question_answers.json.zip -P '+dir_raw) + os.system('wget https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip -P '+dir_raw) + os.system('wget https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip -P '+dir_raw) + + os.system('unzip '+os.path.join(dir_raw, 'image_data.json.zip')+' -d '+dir_raw) + os.system('unzip '+os.path.join(dir_raw, 'question_answers.json.zip')+' -d '+dir_raw) + os.system('unzip '+os.path.join(dir_raw, 'images.zip')+' -d '+dir_raw) + os.system('unzip '+os.path.join(dir_raw, 'images2.zip')+' -d '+dir_raw) + + os.system('mv '+os.path.join(dir_raw, 'VG_100K')+' '+dir_img) + + #os.system('mv '+os.path.join(dir_raw, 'VG_100K_2', '*.jpg')+' '+dir_img) + os.system('find '+os.path.join(dir_raw, 'VG_100K_2')+' -type f -name \'*\' -exec mv {} '+dir_img+' \\;') + os.system('rm -rf '+os.path.join(dir_raw, 'VG_100K_2')) + + # remove images with 0 octet in a ugly but efficient way :') + #print('for f in $(ls -lh '+dir_img+' | grep " 0 " | cut -s -f14 --delimiter=" "); do rm '+dir_img+'/${f}; done;') + os.system('for f in $(ls -lh '+dir_img+' | grep " 0 " | cut -s -f14 --delimiter=" "); do echo '+dir_img+'/${f}; done;') + os.system('for f in $(ls -lh '+dir_img+' | grep " 0 " | cut -s -f14 --delimiter=" "); do rm '+dir_img+'/${f}; done;') + + +class VisualGenome(AbstractVQADataset): + 
+ def __init__(self, data_split, opt, dataset_img=None): + super(VisualGenome, self).__init__(data_split, opt, dataset_img) + + def __getitem__(self, index): + item_qa = self.dataset[index] + item = {} + if self.dataset_img is not None: + item_img = self.dataset_img.get_by_name(item_qa['image_name']) + item['visual'] = item_img['visual'] + # DEBUG + #item['visual_debug'] = item_qa['image_name'] + item['question'] = torch.LongTensor(item_qa['question_wids']) + # DEBUG + #item['question_debug'] = item_qa['question'] + item['question_id'] = item_qa['question_id'] + item['answer'] = item_qa['answer_aid'] + # DEBUG + #item['answer_debug'] = item_qa['answer'] + return item + + def _raw(self): + raw(self.dir_raw) + + def _interim(self): + vgenome_interim(self.opt) + + def _processed(self): + vgenome_processed(self.opt) + + def __len__(self): + return len(self.dataset) + + +class VisualGenomeImages(AbstractImagesDataset): + + def __init__(self, data_split, opt, transform=None, loader=default_loader): + super(VisualGenomeImages, self).__init__(data_split, opt, transform, loader) + self.dir_img = os.path.join(self.dir_raw, 'images') + self.dataset = ImagesFolder(self.dir_img, transform=self.transform, loader=self.loader) + self.name_to_index = self._load_name_to_index() + + def _raw(self): + raw(self.dir_raw) + + def _load_name_to_index(self): + self.name_to_index = {name:index for index, name in enumerate(self.dataset.imgs)} + return self.name_to_index + + def __getitem__(self, index): + item = self.dataset[index] + return item + + def __len__(self): + return len(self.dataset) + + +def factory(opt, vqa=False, transform=None): + + if vqa: + dataset_img = factory(opt, vqa=False, transform=transform) + return VisualGenome('train', opt, dataset_img) + + if opt['mode'] == 'img': + if transform is None: + transform = default_transform(opt['size']) + + elif opt['mode'] in ['noatt', 'att']: + return FeaturesDataset('train', opt) + + else: + raise ValueError + + diff --git a/vqa/datasets/vgenome_interim.py b/vqa/datasets/vgenome_interim.py new file mode 100644 index 0000000..8f77dbc --- /dev/null +++ b/vqa/datasets/vgenome_interim.py @@ -0,0 +1,46 @@ +import json +import os +import argparse + +# def get_image_path(subtype='train2014', image_id='1', format='%s/COCO_%s_%012d.jpg'): +# return format%(subtype, subtype, image_id) + +def interim(questions_annotations): + data = [] + for i in range(len(questions_annotations)): + qa_img = questions_annotations[i] + qa_img_id = qa_img['id'] + for j in range(len(qa_img['qas'])): + qa = qa_img['qas'][j] + row = {} + row['question_id'] = qa['qa_id'] + row['image_id'] = qa_img_id + row['image_name'] = str(qa_img_id) + '.jpg' + row['question'] = qa['question'] + row['answer'] = qa['answer'] + data.append(row) + return data + +def vgenome_interim(params): + ''' + Put the VisualGenomme VQA data into single json file in data/interim + or train, val, trainval : [[question_id, image_id, question, answer] ... 
] + ''' + path_qa = os.path.join(params['dir'], 'interim', 'questions_annotations.json') + os.system('mkdir -p ' + os.path.join(params['dir'], 'interim')) + + print('Loading annotations and questions...') + questions_annotations = json.load(open(os.path.join(params['dir'], 'raw', 'question_answers.json'), 'r')) + + data = interim(questions_annotations) + print('Questions number %d'%len(data)) + print('Write', path_qa) + json.dump(data, open(path_qa, 'w')) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('--dir_vg', default='data/visualgenome', type=str, help='Path to visual genome data directory') + args = parser.parse_args() + params = vars(args) + vgenome_interim(params) diff --git a/vqa/datasets/vgenome_processed.py b/vqa/datasets/vgenome_processed.py new file mode 100644 index 0000000..1225e2e --- /dev/null +++ b/vqa/datasets/vgenome_processed.py @@ -0,0 +1,171 @@ +""" +Preprocess an interim json data files +into one preprocess hdf5/json data files. +Caption: Use nltk, or mcb, or split function to get tokens. +""" +from random import shuffle, seed +import sys +import os.path +import argparse +import numpy as np +import scipy.io +import pdb +import h5py +from nltk.tokenize import word_tokenize +import json +import csv +import re +import math +import pickle + +from .vqa_processed import get_top_answers, remove_examples, tokenize, tokenize_mcb, \ + preprocess_questions, remove_long_tail_train, \ + encode_question, encode_answer + +def preprocess_answers(examples, nlp='nltk'): + print('Example of modified answers after preprocessing:') + for i, ex in enumerate(examples): + s = ex['answer'] + if nlp == 'nltk': + ex['answer'] = " ".join(word_tokenize(str(s).lower())) + elif nlp == 'mcb': + ex['answer'] = " ".join(tokenize_mcb(s)) + else: + ex['answer'] = " ".join(tokenize(s)) + if i < 10: print(s, 'became', "->"+ex['answer']+"<-") + if i>0 and i % 1000 == 0: + sys.stdout.write("processing %d/%d (%.2f%% done) \r" % (i, len(examples), i*100.0/len(examples)) ) + sys.stdout.flush() + return examples + +def build_csv(path, examples, split='train', delimiter_col='~', delimiter_number='|'): + with open(path, 'wb') as f: + writer = csv.writer(f, delimiter=delimiter_col) + for ex in examples: + import ipdb; ipdb.set_trace() + row = [] + row.append(ex['question_id']) + row.append(ex['question']) + row.append(delimiter_number.join(ex['question_words_UNK'])) + row.append(delimiter_number.join(ex['question_wids'])) + + row.append(ex['image_id']) + + if split in ['train','val','trainval']: + row.append(ex['answer_aid']) + row.append(ex['answer']) + writer.writerow(row) + +def vgenome_processed(params): + + ##################################################### + ## Read input files + ##################################################### + + path_train = os.path.join(params['dir'], 'interim', 'questions_annotations.json') + + # An example is a tuple (question, image, answer) + # /!\ test and test-dev have no answer + trainset = json.load(open(path_train, 'r')) + + ##################################################### + ## Preprocess examples (questions and answers) + ##################################################### + + trainset = preprocess_answers(trainset, params['nlp']) + + top_answers = get_top_answers(trainset, params['nans']) + aid_to_ans = {i+1:w for i,w in enumerate(top_answers)} + ans_to_aid = {w:i+1 for i,w in enumerate(top_answers)} + + # Remove examples if answer is not in top answers + #trainset = remove_examples(trainset, ans_to_aid) + + # Add 
'question_words' to the initial tuple + trainset = preprocess_questions(trainset, params['nlp']) + + # Also process top_words which contains a UNK char + trainset, top_words = remove_long_tail_train(trainset, params['minwcount']) + wid_to_word = {i+1:w for i,w in enumerate(top_words)} + word_to_wid = {w:i+1 for i,w in enumerate(top_words)} + + #examples_test = remove_long_tail_test(examples_test, word_to_wid) + + trainset = encode_question(trainset, word_to_wid, params['maxlength'], params['pad']) + + trainset = encode_answer(trainset, ans_to_aid) + + ##################################################### + ## Write output files + ##################################################### + + # Paths to output files + # Ex: data/vqa/preprocess/nans,3000_maxlength,15_..._trainsplit,train_testsplit,val/id_to_word.json + subdirname = 'nans,'+str(params['nans']) + for param in ['maxlength', 'minwcount', 'nlp', 'pad', 'trainsplit']: + subdirname += '_' + param + ',' + str(params[param]) + os.system('mkdir -p ' + os.path.join(params['dir'], 'processed', subdirname)) + + path_wid_to_word = os.path.join(params['dir'], 'processed', subdirname, 'wid_to_word.pickle') + path_word_to_wid = os.path.join(params['dir'], 'processed', subdirname, 'word_to_wid.pickle') + path_aid_to_ans = os.path.join(params['dir'], 'processed', subdirname, 'aid_to_ans.pickle') + path_ans_to_aid = os.path.join(params['dir'], 'processed', subdirname, 'ans_to_aid.pickle') + #path_csv_train = os.path.join(params['dir'], 'processed', subdirname, 'train.csv') + path_trainset = os.path.join(params['dir'], 'processed', subdirname, 'trainset.pickle') + + print('Write wid_to_word to', path_wid_to_word) + with open(path_wid_to_word, 'wb') as handle: + pickle.dump(wid_to_word, handle) + + print('Write word_to_wid to', path_word_to_wid) + with open(path_word_to_wid, 'wb') as handle: + pickle.dump(word_to_wid, handle) + + print('Write aid_to_ans to', path_aid_to_ans) + with open(path_aid_to_ans, 'wb') as handle: + pickle.dump(aid_to_ans, handle) + + print('Write ans_to_aid to', path_ans_to_aid) + with open(path_ans_to_aid, 'wb') as handle: + pickle.dump(ans_to_aid, handle) + + print('Write trainset to', path_trainset) + with open(path_trainset, 'wb') as handle: + pickle.dump(trainset, handle) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--dir_vg', + default='data/visualgenome', + type=str, + help='Root directory containing raw, interim and processed directories' + ) + parser.add_argument('--nans', + default=10000, + type=int, + help='Number of top answers for the final classifications' + ) + parser.add_argument('--maxlength', + default=26, + type=int, + help='Max number of words in a caption. 
Captions longer get clipped' + ) + parser.add_argument('--minwcount', + default=0, + type=int, + help='Words that occur less than that are removed from vocab' + ) + parser.add_argument('--nlp', + default='mcb', + type=str, + help='Token method ; Options: nltk | mcb | naive' + ) + parser.add_argument('--pad', + default='left', + type=str, + help='Padding ; Options: right (finish by zeros) | left (begin by zeros)' + ) + args = parser.parse_args() + params = vars(args) + vgenome_processed(params) \ No newline at end of file diff --git a/vqa/datasets/vqa.py b/vqa/datasets/vqa.py index 03239c6..268d8d6 100644 --- a/vqa/datasets/vqa.py +++ b/vqa/datasets/vqa.py @@ -9,8 +9,11 @@ from ..lib.dataloader import DataLoader from .utils import AbstractVQADataset from .vqa_interim import vqa_interim +from .vqa2_interim import vqa_interim as vqa2_interim from .vqa_processed import vqa_processed -from .coco import factory_COCO +from . import coco +from . import vgenome + class AbstractVQA(AbstractVQADataset): @@ -139,14 +142,126 @@ def _processed(self): vqa_processed(self.opt) -def factory_VQA(data_split, opt, opt_coco=None): +class VQA2(AbstractVQA): + + def __init__(self, data_split, opt, dataset_img=None): + super(VQA2, self).__init__(data_split, opt, dataset_img) + + def _raw(self): + dir_zip = os.path.join(self.dir_raw, 'zip') + dir_ann = os.path.join(self.dir_raw, 'annotations') + os.system('mkdir -p '+dir_zip) + os.system('mkdir -p '+dir_ann) + os.system('wget http://visualqa.org/data/mscoco/vqa/v2_Questions_Train_mscoco.zip -P '+dir_zip) + os.system('wget http://visualqa.org/data/mscoco/vqa/v2_Questions_Val_mscoco.zip -P '+dir_zip) + os.system('wget http://visualqa.org/data/mscoco/vqa/v2_Questions_Test_mscoco.zip -P '+dir_zip) + os.system('wget http://visualqa.org/data/mscoco/vqa/v2_Annotations_Train_mscoco.zip -P '+dir_zip) + os.system('wget http://visualqa.org/data/mscoco/vqa/v2_Annotations_Val_mscoco.zip -P '+dir_zip) + os.system('unzip '+os.path.join(dir_zip, 'v2_Questions_Train_mscoco.zip')+' -d '+dir_ann) + os.system('unzip '+os.path.join(dir_zip, 'v2_Questions_Val_mscoco.zip')+' -d '+dir_ann) + os.system('unzip '+os.path.join(dir_zip, 'v2_Questions_Test_mscoco.zip')+' -d '+dir_ann) + os.system('unzip '+os.path.join(dir_zip, 'v2_Annotations_Train_mscoco.zip')+' -d '+dir_ann) + os.system('unzip '+os.path.join(dir_zip, 'v2_Annotations_Val_mscoco.zip')+' -d '+dir_ann) + os.system('mv '+os.path.join(dir_ann, 'v2_mscoco_train2014_annotations.json')+' ' + +os.path.join(dir_ann, 'mscoco_train2014_annotations.json')) + os.system('mv '+os.path.join(dir_ann, 'v2_mscoco_val2014_annotations.json')+' ' + +os.path.join(dir_ann, 'mscoco_val2014_annotations.json')) + os.system('mv '+os.path.join(dir_ann, 'v2_OpenEnded_mscoco_train2014_questions.json')+' ' + +os.path.join(dir_ann, 'OpenEnded_mscoco_train2014_questions.json')) + os.system('mv '+os.path.join(dir_ann, 'v2_OpenEnded_mscoco_val2014_questions.json')+' ' + +os.path.join(dir_ann, 'OpenEnded_mscoco_val2014_questions.json')) + os.system('mv '+os.path.join(dir_ann, 'v2_OpenEnded_mscoco_test2015_questions.json')+' ' + +os.path.join(dir_ann, 'OpenEnded_mscoco_test2015_questions.json')) + os.system('mv '+os.path.join(dir_ann, 'v2_OpenEnded_mscoco_test-dev2015_questions.json')+' ' + +os.path.join(dir_ann, 'OpenEnded_mscoco_test-dev2015_questions.json')) + + def _interim(self): + vqa2_interim(self.opt['dir']) + + def _processed(self): + vqa_processed(self.opt) + + +class VQAVisualGenome(data.Dataset): + + def __init__(self, dataset_vqa, dataset_vgenome): + 
self.dataset_vqa = dataset_vqa + self.dataset_vgenome = dataset_vgenome + self._filter_dataset_vgenome() + def _filter_dataset_vgenome(self): + print('-> Filtering dataset vgenome') + data_vg = self.dataset_vgenome.dataset + ans_to_aid = self.dataset_vqa.ans_to_aid + word_to_wid = self.dataset_vqa.word_to_wid + data_vg_new = [] + not_in = 0 + for i in range(len(data_vg)): + if data_vg[i]['answer'] not in ans_to_aid: + not_in += 1 + else: + data_vg[i]['answer_aid'] = ans_to_aid[data_vg[i]['answer']] + for j in range(data_vg[i]['seq_length']): + word = data_vg[i]['question_words_UNK'][j] + if word in word_to_wid: + wid = word_to_wid[word] + else: + wid = word_to_wid['UNK'] + data_vg[i]['question_wids'][j] = wid + data_vg_new.append(data_vg[i]) + print('-> {} / {} items removed'.format(not_in, len(data_vg))) + self.dataset_vgenome.dataset = data_vg_new + print('-> {} items left in visual genome'.format(len(self.dataset_vgenome))) + print('-> {} items total in vqa+vg'.format(len(self))) + + + def __getitem__(self, index): + if index < len(self.dataset_vqa): + item = self.dataset_vqa[index] + #print('vqa') + else: + item = self.dataset_vgenome[index - len(self.dataset_vqa)] + #print('vg') + #import ipdb; ipdb.set_trace() + return item + + def __len__(self): + return len(self.dataset_vqa) + len(self.dataset_vgenome) + + def num_classes(self): + return self.dataset_vqa.num_classes() + + def vocab_words(self): + return self.dataset_vqa.vocab_words() + + def vocab_answers(self): + return self.dataset_vqa.vocab_answers() + + def data_loader(self, batch_size=10, num_workers=4, shuffle=False): + return DataLoader(self, + batch_size=batch_size, shuffle=shuffle, + num_workers=num_workers, pin_memory=True) + + def split_name(self, testdev=False): + return self.dataset_vqa.split_name(testdev=testdev) + + +def factory(data_split, opt, opt_coco=None, opt_vgenome=None): dataset_img = None + if opt_coco is not None: - dataset_img = factory_COCO(data_split, opt_coco) - if opt['dataset'] == 'VQA': + dataset_img = coco.factory(data_split, opt_coco) + + if opt['dataset'] == 'VQA' and '2' not in opt['dir']: # sanity check dataset_vqa = VQA(data_split, opt, dataset_img) + elif opt['dataset'] == 'VQA2' and '2' in opt['dir']: # sanity check + dataset_vqa = VQA2(data_split, opt, dataset_img) else: raise ValueError - return dataset_vqa + + if opt_vgenome is not None: + dataset_vgenome = vgenome.factory(opt_vgenome, vqa=True) + return VQAVisualGenome(dataset_vqa, dataset_vgenome) + else: + return dataset_vqa diff --git a/vqa/datasets/vqa2_interim.py b/vqa/datasets/vqa2_interim.py new file mode 100644 index 0000000..d8f40ee --- /dev/null +++ b/vqa/datasets/vqa2_interim.py @@ -0,0 +1,90 @@ +import json +import os +import argparse +from collections import Counter + +def get_subtype(split='train'): + if split in ['train', 'val']: + return split + '2014' + else: + return 'test2015' + +def get_image_name_old(subtype='train2014', image_id='1', format='%s/COCO_%s_%012d.jpg'): + return format%(subtype, subtype, image_id) + +def get_image_name(subtype='train2014', image_id='1', format='COCO_%s_%012d.jpg'): + return format%(subtype, image_id) + +def interim(questions, split='train', annotations=[]): + print('Interim', split) + data = [] + for i in range(len(questions)): + row = {} + row['question_id'] = questions[i]['question_id'] + row['image_name'] = get_image_name(get_subtype(split), questions[i]['image_id']) + row['question'] = questions[i]['question'] + #row['MC_answer'] = questions[i]['multiple_choices'] + if split in 
['train', 'val', 'trainval']: + row['answer'] = annotations[i]['multiple_choice_answer'] + answers = [] + for ans in annotations[i]['answers']: + answers.append(ans['answer']) + row['answers_occurence'] = Counter(answers).most_common() + data.append(row) + return data + +def vqa_interim(dir_vqa): + ''' + Put the VQA data into single json file in data/interim + or train, val, trainval : [[question_id, image_name, question, MC_answer, answer] ... ] + or test, test-dev : [[question_id, image_name, question, MC_answer] ... ] + ''' + + path_train_qa = os.path.join(dir_vqa, 'interim', 'train_questions_annotations.json') + path_val_qa = os.path.join(dir_vqa, 'interim', 'val_questions_annotations.json') + path_trainval_qa = os.path.join(dir_vqa, 'interim', 'trainval_questions_annotations.json') + path_test_q = os.path.join(dir_vqa, 'interim', 'test_questions.json') + path_testdev_q = os.path.join(dir_vqa, 'interim', 'testdev_questions.json') + + os.system('mkdir -p ' + os.path.join(dir_vqa, 'interim')) + + print('Loading annotations and questions...') + annotations_train = json.load(open(os.path.join(dir_vqa, 'raw', 'annotations', 'mscoco_train2014_annotations.json'), 'r')) + annotations_val = json.load(open(os.path.join(dir_vqa, 'raw', 'annotations', 'mscoco_val2014_annotations.json'), 'r')) + questions_train = json.load(open(os.path.join(dir_vqa, 'raw', 'annotations', 'OpenEnded_mscoco_train2014_questions.json'), 'r')) + questions_val = json.load(open(os.path.join(dir_vqa, 'raw', 'annotations', 'OpenEnded_mscoco_val2014_questions.json'), 'r')) + questions_test = json.load(open(os.path.join(dir_vqa, 'raw', 'annotations', 'OpenEnded_mscoco_test2015_questions.json'), 'r')) + questions_testdev = json.load(open(os.path.join(dir_vqa, 'raw', 'annotations', 'OpenEnded_mscoco_test-dev2015_questions.json'), 'r')) + + data_train = interim(questions_train['questions'], 'train', annotations_train['annotations']) + print('Train size %d'%len(data_train)) + print('Write', path_train_qa) + json.dump(data_train, open(path_train_qa, 'w')) + + data_val = interim(questions_val['questions'], 'val', annotations_val['annotations']) + print('Val size %d'%len(data_val)) + print('Write', path_val_qa) + json.dump(data_val, open(path_val_qa, 'w')) + + print('Concat. 
train and val') + data_trainval = data_train + data_val + print('Trainval size %d'%len(data_trainval)) + print('Write', path_trainval_qa) + json.dump(data_trainval, open(path_trainval_qa, 'w')) + + data_testdev = interim(questions_testdev['questions'], 'testdev') + print('Testdev size %d'%len(data_testdev)) + print('Write', path_testdev_q) + json.dump(data_testdev, open(path_testdev_q, 'w')) + + data_test = interim(questions_test['questions'], 'test') + print('Test size %d'%len(data_test)) + print('Write', path_test_q) + json.dump(data_test, open(path_test_q, 'w')) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('--dir_vqa', default='data/vqa', type=str, help='Path to vqa data directory') + args = parser.parse_args() + vqa_interim(args.dir_vqa) diff --git a/vqa/external/pretrained-models.pytorch b/vqa/external/pretrained-models.pytorch new file mode 160000 index 0000000..fd347f4 --- /dev/null +++ b/vqa/external/pretrained-models.pytorch @@ -0,0 +1 @@ +Subproject commit fd347f4ac3063a4ce3831345e48977594dffe28a diff --git a/vqa/lib/criterions.py b/vqa/lib/criterions.py index 0667699..5b23f6b 100644 --- a/vqa/lib/criterions.py +++ b/vqa/lib/criterions.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn -def factory_loss(opt, cuda=True): +def factory(opt, cuda=True): criterion = nn.CrossEntropyLoss() if cuda: criterion = criterion.cuda() diff --git a/vqa/lib/engine.py b/vqa/lib/engine.py index 0f846ee..df3ebd6 100644 --- a/vqa/lib/engine.py +++ b/vqa/lib/engine.py @@ -97,7 +97,7 @@ def validate(loader, model, criterion, logger, epoch=0, print_freq=10): end = time.time() if i % print_freq == 0: - print('Test: [{0}/{1}]\t' + print('Val: [{0}/{1}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Acc@1 {acc1.val:.3f} ({acc1.avg:.3f})\t' diff --git a/vqa/models/__init__.py b/vqa/models/__init__.py index c5ffdac..d7c8c72 100644 --- a/vqa/models/__init__.py +++ b/vqa/models/__init__.py @@ -1,2 +1,4 @@ from .noatt import MLBNoAtt, MutanNoAtt from .att import MLBAtt, MutanAtt +from .utils import factory +from .utils import model_names \ No newline at end of file diff --git a/vqa/models/convnets.py b/vqa/models/convnets.py new file mode 100644 index 0000000..aa19d4b --- /dev/null +++ b/vqa/models/convnets.py @@ -0,0 +1,89 @@ +import copy +import torch +import torch.nn as nn +import torchvision.models as pytorch_models + +import sys +sys.path.append('vqa/external/pretrained-models.pytorch') +import pretrainedmodels as torch7_models + +pytorch_resnet_names = sorted(name for name in pytorch_models.__dict__ + if name.islower() + and name.startswith("resnet") + and callable(pytorch_models.__dict__[name])) + +torch7_resnet_names = sorted(name for name in torch7_models.__dict__ + if name.islower() + and callable(torch7_models.__dict__[name])) + +model_names = pytorch_resnet_names + torch7_resnet_names + +def factory(opt, cuda=True, data_parallel=True): + opt = copy.copy(opt) + + # forward_* will be better handle in futur release + def forward_resnet(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + if 'pooling' in opt and opt['pooling']: + x = self.avgpool(x) + div = x.size(3) + x.size(2) + x = x.sum(3) + x = x.sum(2) + x = x.view(x.size(0), -1) + x = x.div(div) + + return x + + def forward_resnext(self, x): + x = self.features(x) + + if 'pooling' in opt and opt['pooling']: + x = 
self.avgpool(x) + div = x.size(3) + x.size(2) + x = x.sum(3) + x = x.sum(2) + x = x.view(x.size(0), -1) + x = x.div(div) + + return x + + if opt['arch'] in pytorch_resnet_names: + model = pytorch_models.__dict__[opt['arch']](pretrained=True) + + convnet = model # ugly hack in case of DataParallel wrapping + model.forward = lambda x: forward_resnet(convnet, x) + + elif opt['arch'] == 'fbresnet152': + model = torch7_models.__dict__[opt['arch']](num_classes=1000, + pretrained='imagenet') + + convnet = model # ugly hack in case of DataParallel wrapping + model.forward = lambda x: forward_resnet(convnet, x) + + elif opt['arch'] in torch7_resnet_names: + model = torch7_models.__dict__[opt['arch']](num_classes=1000, + pretrained='imagenet') + + convnet = model # ugly hack in case of DataParallel wrapping + model.forward = lambda x: forward_resnext(convnet, x) + + else: + raise ValueError + + if data_parallel: + model = nn.DataParallel(model).cuda() + if not cuda: + raise ValueError + + if cuda: + model.cuda() + + return model \ No newline at end of file diff --git a/vqa/models/utils.py b/vqa/models/utils.py index 0f6f2f5..d13a250 100644 --- a/vqa/models/utils.py +++ b/vqa/models/utils.py @@ -1,4 +1,5 @@ import sys +import copy import torch import torch.nn as nn import torchvision.models as models @@ -6,43 +7,23 @@ from .noatt import MLBNoAtt, MutanNoAtt from .att import MLBAtt, MutanAtt -class ResNet(nn.Module): +model_names = sorted(name for name in sys.modules[__name__].__dict__ + if not name.startswith("__"))# and 'Att' in name) - def __init__(self, resnet, pooling, fix_until=None): - # pooling: boolean - # fix_until: None or layer name (included) - super(ResNet, self).__init__() - self.resnet = resnet - self.pooling = pooling - if fix_until is not None: - self.fixable_layers = [ - 'conv1', 'bn1', 'relu', 'maxpool', - 'layer1', 'layer2', 'layer3', 'layer4'] - if fix_until in self.fixable_layers: - self.fix_until = fix_until - self._fix_layers(fix_until) - else: - raise ValueError +def factory(opt, vocab_words, vocab_answers, cuda=True, data_parallel=True): + opt = copy.copy(opt) - def _fix_layers(self, fix_until): - for layer in self.fixable_layers: - print('Warning models/utils.py: Fix cnn layer '+layer) - for p in getattr(self.resnet, layer).parameters(): - p.requires_grad = False - if layer == self.fix_until: - break + if opt['arch'] in model_names: + model = getattr(sys.modules[__name__], opt['arch'])(opt, vocab_words, vocab_answers) + else: + raise ValueError - def forward(self, x): - x = self.resnet.conv1(x) - x = self.resnet.bn1(x) - x = self.resnet.relu(x) - x = self.resnet.maxpool(x) - x = self.resnet.layer1(x) - x = self.resnet.layer2(x) - x = self.resnet.layer3(x) - x = self.resnet.layer4(x) - if self.pooling: - x = self.resnet.avgpool(x) - x = x.view(x.size(0), -1) - # x = self.fc(x) - return x + if data_parallel: + model = nn.DataParallel(model).cuda() + if not cuda: + raise ValueError + + if cuda: + model.cuda() + + return model \ No newline at end of file
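
Below is a minimal usage sketch, not part of the patch, showing how the new factory entry point introduced in `vqa/models/convnets.py` ties into the feature-shape probe added to `extract.py`. It assumes the `pretrained-models.pytorch` submodule has been initialized and that at least one GPU is available, and it follows only the signatures that appear in this diff:

```python
import torch
from torch.autograd import Variable  # pre-0.4 Variable API, as used throughout this patch

import vqa.models.convnets as convnets

# Build the image CNN the same way the new extract.py does.
# 'fbresnet152' is provided by the pretrained-models.pytorch submodule added above.
cnn = convnets.factory({'arch': 'fbresnet152'}, cuda=True, data_parallel=True)

# Probe the output shape once, as extract.py now does to size its hdf5 datasets.
probe = cnn(Variable(torch.ones(1, 3, 448, 448), volatile=True))
print(probe.size())  # expected (1, 2048, 14, 14) for 'att' feature maps at size 448
```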