-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrain_pretrain.py
161 lines (127 loc) · 6.07 KB
/
train_pretrain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from utils.logger import setup_logger
from datasets import make_dataloader_vl_img_txt_pre as make_dataloader
from model.make_model_vl import make_model
from solver import make_optimizer_triplet as make_optimizer
# from solver import make_optimizer_uvt as make_optimizer
from solver.scheduler_factory import create_scheduler
# from loss import make_loss_triplet as make_loss
from loss import make_loss_entropy as make_loss
from processor import do_train_cuhk_img_pre as do_train
from processor import do_inference_cuhk_img_txt_pre as do_inference
import random
import torch
import numpy as np
import os
import argparse
import subprocess
# from timm.scheduler import create_scheduler
from config import cfg
from model.backbones.tokenization_bert import BertTokenizer
from loss.cmps_loss import Loss
import torch.distributed as dist
def set_seed(seed):
    """Seed every random number generator used by the training script.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), then configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select a different convolution algorithm on
    # each run, which works against seeding; keep it off for reproducibility.
    torch.backends.cudnn.benchmark = False
def is_dist_avail_and_initialized():
    """Return True only if torch.distributed is available and initialized."""
    # Short-circuit: is_initialized() is only consulted when the distributed
    # package is available.
    return bool(dist.is_available() and dist.is_initialized())
def get_rank():
    """Return this process's distributed rank, or 0 when not running distributed."""
    return dist.get_rank() if is_dist_avail_and_initialized() else 0
def is_main_process():
    """Return True when the current process has rank 0."""
    rank = get_rank()
    return rank == 0
def resume_checkpoint(cfg, model, optimizer):
    """Restore model and optimizer state from ``<cfg.OUTPUT_DIR>/checkpoint.pth``.

    Args:
        cfg: Config object; only ``cfg.OUTPUT_DIR`` is read.
        model: Module whose state tensors are overwritten in place.
        optimizer: Optimizer whose state is replaced by the saved one.

    Returns:
        Tuple ``(model, start_epoch, optimizer)``. When the checkpoint file
        does not exist the inputs are returned untouched with
        ``start_epoch == 0``.

    Raises:
        KeyError: If the checkpoint lacks ``'state_dict'``, ``'epoch'`` or
            ``'optimizer'``, or holds a parameter name the model does not have.
    """
    checkpoint_path = os.path.join(cfg.OUTPUT_DIR, 'checkpoint.pth')
    if not os.path.exists(checkpoint_path):
        return model, 0, optimizer

    # Deserialize onto the CPU; copy_ below transfers each tensor to the
    # device of the matching model tensor.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    # state_dict() builds a fresh mapping on every call, so build it once
    # instead of once per checkpoint key.
    model_state = model.state_dict()
    for name, saved_tensor in checkpoint['state_dict'].items():
        # NOTE(review): replace() drops every 'module.' occurrence, not only a
        # leading wrapper prefix -- confirm no submodule name ends in 'module'.
        model_state[name.replace('module.', '')].copy_(saved_tensor)

    start_epoch = checkpoint['epoch']
    optimizer.load_state_dict(checkpoint['optimizer'])

    # Move optimizer state tensors to the GPU. Guarded so that resuming on a
    # CPU-only host leaves the state on the CPU instead of raising.
    if torch.cuda.is_available():
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.cuda()
    return model, start_epoch, optimizer
def str2bool(value):
    """Parse a command-line boolean such as ``True``/``False``/``1``/``0``.

    Used instead of ``type=bool``, which turns every non-empty string --
    including ``"False"`` -- into ``True``.

    Args:
        value: String from the command line (a bool is returned unchanged).

    Returns:
        The parsed boolean. The empty string parses as ``False``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))


if __name__ == '__main__':
    # ---- Command line -----------------------------------------------------
    parser = argparse.ArgumentParser(description="ReID Baseline Training")
    parser.add_argument("--config_file", default="configs/CUHK_PEDES/vit_base_uvt_bert.yml", help="path to config file", type=str)
    parser.add_argument("--gpu_id", default="0", type=str)
    parser.add_argument("--dataset", default="pretrain_cuhk", type=str)
    parser.add_argument("--pretrain_choice", default="imagenet", type=str)
    parser.add_argument("--loss_type", default="softmax", type=str, help='softmax, softmax_triplet')
    parser.add_argument("--model_name", default="transformer_uvt_img_txt_pretrain", type=str)
    parser.add_argument("--transformer_type", default="vit_base_patch16_224_uvt_img_txt", type=str)
    parser.add_argument("--batch_size", default=48, type=int, help='28, 8gpu: 512, 24gpu: 1344, 32gpu: 1792, 48gpu: 2688')
    parser.add_argument("--data_dir", default="data_root", type=str)
    parser.add_argument("--logs_dir", default="logs/20220402_img_txt_pretrain", type=str)
    parser.add_argument("opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER)
    parser.add_argument("--local_rank", default=0, type=int)
    # str2bool rather than bool, so that "--resume False" really disables resuming.
    parser.add_argument("--test_mode", default=False, type=str2bool)
    parser.add_argument("--resume", default=True, type=str2bool)
    args = parser.parse_args()

    # ---- Configuration ----------------------------------------------------
    # Applied in order: YAML file, trailing `opts`, then the dedicated flags
    # below, which therefore take precedence over both.
    if args.config_file != "":
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.MODEL.DEVICE_ID = args.gpu_id
    cfg.MODEL.PRETRAIN_CHOICE = args.pretrain_choice
    cfg.MODEL.TRANSFORMER_TYPE = args.transformer_type
    cfg.MODEL.NAME = args.model_name
    cfg.DATALOADER.SAMPLER = args.loss_type
    cfg.OUTPUT_DIR = args.logs_dir
    cfg.DATASETS.NAMES = args.dataset
    cfg.DATASETS.ROOT_DIR = args.data_dir
    cfg.SOLVER.IMS_PER_BATCH = args.batch_size
    cfg.freeze()

    set_seed(cfg.SOLVER.SEED)

    if cfg.MODEL.DIST_TRAIN:
        torch.cuda.set_device(args.local_rank)

    # ---- Logging ----------------------------------------------------------
    output_dir = cfg.OUTPUT_DIR
    logger = setup_logger("transreid", output_dir, if_train=True)
    logger.info("Saving model in the path :{}".format(cfg.OUTPUT_DIR))
    logger.info(args)

    if args.config_file != "":
        logger.info("Loaded configuration file {}".format(args.config_file))
        with open(args.config_file, 'r') as cf:
            config_str = "\n" + cf.read()
            logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    if cfg.MODEL.DIST_TRAIN:
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    # NOTE(review): in the distributed path this is assigned after
    # torch.cuda.set_device() and init_process_group(); confirm the variable
    # is still honoured at this point.
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.MODEL.DEVICE_ID

    # ---- Data, tokenizer, model --------------------------------------------
    train_loader, test_loader, num_classes = make_dataloader(cfg)
    tokenizer = BertTokenizer.from_pretrained('./bert_tool/uncased_L-12_H-768_A-12')
    model = make_model(cfg, num_class=num_classes)

    if args.test_mode:
        # NOTE(review): nesting reconstructed from a source without
        # indentation; evaluation is assumed to run on the main process only.
        if is_main_process():
            # Load onto the CPU (as resume_checkpoint does); copy_ transfers
            # each tensor to the device of the matching model tensor.
            param_dict = torch.load(os.path.join(cfg.OUTPUT_DIR, 'checkpoint.pth'), map_location="cpu")['state_dict']
            # Build the state mapping once rather than once per key.
            model_state = model.state_dict()
            for i in param_dict:
                model_state[i.replace('module.', '')].copy_(param_dict[i])
            print('Test:')
            R1, R5, R10, mAP = do_inference(cfg, model, test_loader, logger, tokenizer)
    else:
        loss_func = make_loss(cfg, num_classes=num_classes)
        loss_cmpm = Loss(num_classes=num_classes, feature_size=768, resume=False, epsilon=1e-8).cuda()
        optimizer = make_optimizer(cfg, model)
        scheduler = create_scheduler(cfg, optimizer)

        start_epoch = 0
        if args.resume:
            # Returns start_epoch == 0 when no checkpoint file exists.
            model, start_epoch, optimizer = resume_checkpoint(cfg, model, optimizer)
            print('resume checkpoint successful. start_epoch: ', start_epoch)

        do_train(cfg, model, train_loader, test_loader, optimizer, scheduler, loss_func, loss_cmpm, tokenizer, args.local_rank, start_epoch, logger)