main_original.py
import numpy as np
import argparse
import torch
from copy import deepcopy
from option_critic import OptionCriticFeatures, OptionCriticConv, deoc_entropy
from option_critic import critic_loss as critic_loss_fn
from option_critic import actor_loss as actor_loss_fn
from experience_replay import ReplayBuffer
from utils import make_env, to_tensor
from logger import Logger
import time
parser = argparse.ArgumentParser(description="Option Critic PyTorch")
parser.add_argument('--env', default='CartPole-v0', help='Environment to run')
parser.add_argument('--optimal-eps', type=float, default=0.05, help='Epsilon when playing optimally')
parser.add_argument('--frame-skip', default=4, type=int, help='Process every n-th frame')
parser.add_argument('--learning-rate', type=float, default=.0005, help='Learning rate')
parser.add_argument('--gamma', type=float, default=.99, help='Discount rate')
parser.add_argument('--epsilon-start', type=float, default=1.0, help=('Starting value for epsilon.'))
parser.add_argument('--epsilon-min', type=float, default=.1, help='Minimum epsilon.')
parser.add_argument('--epsilon-decay', type=float, default=20000, help=('Number of steps to minimum epsilon.'))
parser.add_argument('--max-history', type=int, default=10000, help=('Maximum number of steps stored in replay'))
parser.add_argument('--batch-size', type=int, default=32, help='Batch size.')
parser.add_argument('--freeze-interval', type=int, default=200, help=('Interval between target freezes.'))
parser.add_argument('--update-frequency', type=int, default=4, help=('Number of actions before each SGD update.'))
parser.add_argument('--termination-reg', type=float, default=0.01, help=('Regularization to decrease termination prob.'))
parser.add_argument('--entropy-reg', type=float, default=0.01, help=('Regularization to increase policy entropy.'))
parser.add_argument('--num-options', type=int, default=2, help=('Number of options to create.'))
parser.add_argument('--temp', type=float, default=1, help='Softmax temperature for the action distribution.')
parser.add_argument('--max_steps_ep', type=int, default=18000, help='Maximum number of steps per episode.')
parser.add_argument('--max_steps_total', type=int, default=int(4e6), help='Maximum number of steps to take.') # about 4 million
parser.add_argument('--cuda', type=bool, default=True, help='Enable CUDA training (recommended if possible).')
parser.add_argument('--seed', type=int, default=0, help='Random seed for numpy, torch, random.')
parser.add_argument('--logdir', type=str, default='runs', help='Directory for logging statistics')
parser.add_argument('--exp', type=str, default=None, help='optional experiment name')
parser.add_argument('--switch-goal', type=bool, default=False, help='switch goal after 2k eps')
# diversity-enriched arguments
parser.add_argument('--diversity_learning', type=bool, default=False, help='Whether to use diversity enriched learning')
parser.add_argument('--diversity_termination', type=bool, default=False, help='Whether to use diversity enriched termination')
parser.add_argument('--diversity_tradeoff', type=float, default=0.0001, help='Tradeoff between diversity and reward')
parser.add_argument('--deoc_entropy_samples', type=int, default=6, help='Number of samples to estimate entropy')
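
# Example invocations, as a sketch only (assumes this file is saved as main_original.py
# and is run from the repository root; adjust names and values to your setup).
# Note: with argparse's type=bool, any non-empty value (including the string 'False')
# parses as True, so omit a boolean flag entirely to keep its default.
#
#   python main_original.py --env CartPole-v0 --num-options 2 --seed 0
#   python main_original.py --env CartPole-v0 --diversity_learning True --diversity_tradeoff 0.0001
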
def run(args):
    env, is_atari = make_env(args.env)
    option_critic = OptionCriticConv if is_atari else OptionCriticFeatures
    device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')

    option_critic = option_critic(
        in_features=env.observation_space.shape[0],
        num_actions=env.action_space.n,
        num_options=args.num_options,
        temperature=args.temp,
        eps_start=args.epsilon_start,
        eps_min=args.epsilon_min,
        eps_decay=args.epsilon_decay,
        eps_test=args.optimal_eps,
        device=device
    )
    # Create a prime network for more stable Q values
    option_critic_prime = deepcopy(option_critic)

    optim = torch.optim.RMSprop(option_critic.parameters(), lr=args.learning_rate)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    buffer = ReplayBuffer(capacity=args.max_history, seed=args.seed)
    logger = Logger(logdir=args.logdir, run_name=f"{OptionCriticFeatures.__name__}-{args.env}-{args.exp}-{time.ctime()}")

    sum_entropy = 0
    steps = 0
    if args.switch_goal: print(f"Current goal {env.goal}")

    while steps < args.max_steps_total:
        rewards = 0
        option_lengths = {opt: [] for opt in range(args.num_options)}

        obs = env.reset()
        state = option_critic.get_state(to_tensor(obs))
        greedy_option = option_critic.greedy_option(state)
        current_option = 0

        # Goal switching experiment: run for 1k episodes in fourrooms, switch goals and run for
        # another 1k episodes (2k total). In option-critic, if the options have some meaning, only
        # the policy-over-options should be fine-tuned (this is what we would hope).
        if args.switch_goal and logger.n_eps == 1000:
            torch.save({'model_params': option_critic.state_dict(),
                        'goal_state': env.goal},
                       f'models/option_critic_seed={args.seed}_1k')
            env.switch_goal()
            print(f"New goal {env.goal}")

        if args.switch_goal and logger.n_eps > 2000:
            torch.save({'model_params': option_critic.state_dict(),
                        'goal_state': env.goal},
                       f'models/option_critic_seed={args.seed}_2k')
            break

        done = False
        ep_steps = 0
        option_termination = True
        curr_op_len = 0

        while not done and ep_steps < args.max_steps_ep:
            epsilon = option_critic.epsilon

            # Epsilon-greedy choice over options whenever the current option terminates
            if option_termination:
                option_lengths[current_option].append(curr_op_len)
                current_option = np.random.choice(args.num_options) if np.random.rand() < epsilon else greedy_option
                curr_op_len = 0

            action, logp, entropy = option_critic.get_action(state, current_option)

            next_obs, reward, done, _ = env.step(action)

            if args.diversity_learning:
                # Diversity-enriched reward: mix the environment reward with an entropy
                # bonus, weighted by diversity_tradeoff
                entropy_loss = deoc_entropy(option_critic, state, option_critic.options_W, args)
                sum_entropy += entropy_loss
                pseudo_reward = (1 - args.diversity_tradeoff) * reward + args.diversity_tradeoff * entropy_loss
                reward = pseudo_reward

            buffer.push(obs, current_option, reward, next_obs, done)
            rewards += reward

            actor_loss, critic_loss = None, None
            if len(buffer) > args.batch_size:
                actor_loss = actor_loss_fn(obs, current_option, logp, entropy,
                                           reward, done, next_obs, option_critic, option_critic_prime,
                                           args, sum_entropy / steps)
                loss = actor_loss

                if steps % args.update_frequency == 0:
                    data_batch = buffer.sample(args.batch_size)
                    critic_loss = critic_loss_fn(option_critic, option_critic_prime, data_batch, args)
                    loss += critic_loss

                optim.zero_grad()
                loss.backward()
                optim.step()

                if steps % args.freeze_interval == 0:
                    option_critic_prime.load_state_dict(option_critic.state_dict())

            state = option_critic.get_state(to_tensor(next_obs))
            option_termination, greedy_option = option_critic.predict_option_termination(state, current_option)

            # update global steps etc
            steps += 1
            ep_steps += 1
            curr_op_len += 1
            obs = next_obs

            logger.log_data(steps, actor_loss, critic_loss, entropy.item(), epsilon)

        logger.log_episode(steps, rewards, option_lengths, ep_steps, epsilon)

if __name__ == "__main__":
    args = parser.parse_args()
    run(args)