"""
ac-independent-SUMO.py
Description:
Implementation of actor critic adapted for multi-agent environments. This implementation is origianlly based on
the Clean-RL version https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/sac_atari.py but uses Cross-Entropy for the actor's
loss computation. The critic utilizes the same Q-Network structure as as the other MARL methods in this repository and the
actor utilizes the same structure as the critic but with the addition of a layer that utilizes the softmax activation function
Usage:
python ac-indepndent-SUMO.py -c experiments/sumo-2x2-ac-independent.config
References:
- https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/sac_atari.py
"""
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import numpy as np
# # TODO: fix conda environment to include the version of gym that has Monitor module
# from gym.wrappers import TimeLimit#, Monitor
from datetime import datetime
import random
import os
import csv
from pettingzoo.butterfly import pistonball_v6
from pettingzoo.mpe import simple_spread_v3
# SUMO dependencies
import sumo_rl
import sys
from sumo_utils.sumo_custom.sumo_custom_observation import CustomObservationFunction
from sumo_utils.sumo_custom.sumo_custom_reward import CreateSumoReward
from sumo_utils.sumo_custom.calculate_speed_control import CalculateSpeedError
# Config Parser
from marl_utils.MARLConfigParser import MARLConfigParser
from rl_core.actor_critic import QNetwork, Actor, one_hot_q_values
from marl_utils.linear_schedule import LinearSchedule
from marl_utils.replay_buffer import ReplayBuffer
if __name__ == "__main__":
# Get config parameters
parser = MARLConfigParser()
args = parser.parse_args()
# The SUMO environment is slightly different from the default PettingZoo envs, so set a flag to indicate if the SUMO env is being used
using_sumo = False
if args.gym_id == 'sumo':
using_sumo = True
# Make sure SUMO env variable is set
if 'SUMO_HOME' in os.environ:
tools = os.path.join(os.environ['SUMO_HOME'], 'tools')
sys.path.append(tools)
else:
sys.exit("Please declare the environment variable 'SUMO_HOME'")
if not args.seed:
args.seed = int(datetime.now().timestamp())
# TRY NOT TO MODIFY: setup the environment
if args.gpu_id is not None:
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
experiment_time = str(datetime.now()).split('.')[0].replace(':','-')
experiment_name = "{}__N{}__exp{}__seed{}__{}".format(args.gym_id, args.N, args.exp_name, args.seed, experiment_time)
writer = SummaryWriter(f"runs/{experiment_name}")
writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % (
'\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))
if args.prod_mode:
import wandb
wandb.init(project=args.wandb_project_name, entity=args.wandb_entity, sync_tensorboard=True, config=vars(args), name=experiment_name, monitor_gym=True, save_code=True)
writer = SummaryWriter(f"/tmp/{experiment_name}")
# Specify directories for logging
nn_dir = f"nn/{experiment_name}"
csv_dir = f"csv/{experiment_name}"
os.makedirs(f"{nn_dir}/critic_networks")
os.makedirs(f"{nn_dir}/actor_networks")
os.makedirs(csv_dir)
# TRY NOT TO MODIFY: seeding
device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
# Define an additional output file for the sumo-specific data
if using_sumo:
sumo_csv = "{}/_SUMO_alpha{}_gamma{}_{}".format(csv_dir, args.learning_rate, args.gamma, experiment_time)
print("\n=================== Environment Information ===================")
# Instantiate the environment
if using_sumo:
# Sumo must be created using the sumo-rl module
# Note we have to use the parallel env here to conform to this implementation of actor-critic
sumo_reward_function = CreateSumoReward(args=args)
env = sumo_rl.parallel_env(net_file=args.net,
route_file=args.route,
use_gui=args.sumo_gui,
max_green=args.max_green,
min_green=args.min_green,
num_seconds=args.sumo_seconds,
add_system_info=True, # Default is True
add_per_agent_info=True, # Default is True
reward_fn=sumo_reward_function,
observation_class=CustomObservationFunction,
sumo_warnings=False)
else:
print(" > ENV ARGS: {}".format(args.env_args))
exec(f" > env = {args.gym_id}.parallel_env({args.env_args})")
agents = env.possible_agents
print(" > agents:\n {}".format(agents))
num_agents = len(env.possible_agents)
print(" > num_agents:\n {}".format(num_agents))
# TODO: these dictionaries are deprecated, use action_space & observation_space functions instead
action_spaces = env.action_spaces
observation_spaces = env.observation_spaces
# CSV files to save episode metrics during training
with open(f"{csv_dir}/critic_loss.csv", "w", newline="") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=agents+['system_loss', 'global_step'])
csv_writer.writeheader()
with open(f"{csv_dir}/actor_loss.csv", "w", newline="") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=agents+['system_actor_loss', 'global_step'])
csv_writer.writeheader()
# system_episode_reward: the cumulative reward of all agents during the episode
# global_step: the global step in training
with open(f"{csv_dir}/episode_reward.csv", "w", newline="") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=agents+['system_episode_reward', 'global_step'])
csv_writer.writeheader()
# system_episode_max_speed: Maximum speed observed by all agents during an episode
# system_episode_min_max_speed: The lowest of all maximum speeds observed by all agents during an episode
# i.e. if four agents observed max speeds of [6.6, 7.0, 10.0, 12.0] during the episode,
# system_episode_min_max_speed would return 6.6 and system_episode_max_speed would return 12.0
# system_accumulated_stopped: Accumulated number of stopped cars observed by all agents during the episode
with open(f"{csv_dir}/episode_max_speeds.csv", "w", newline="") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=agents+['system_episode_max_speed',
'system_episode_min_max_speed',
'system_accumulated_stopped',
'global_step'])
csv_writer.writeheader()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.reset(seed=args.seed)
for agent in agents:
action_spaces[agent].seed(args.seed)
observation_spaces[agent].seed(args.seed)
# respect the default timelimit
# assert isinstance(env.action_space, Discrete), "only discrete action space is supported"
# TODO: Monitor was not working
# if args.capture_video:
# env = Monitor(env, f'videos/{experiment_name}')
# TRY NOT TO MODIFY: start the game
print(f" > Initializing environment")
obses, _ = env.reset()
# Initialize data structures for training
# NOTE: When using parameter sharing, we only need one network & optimizer but when not using parameter sharing,
# each agent gets its own
# Dictionary for storing replay buffers (maps agent to a replay buffer)
rb = {agent: ReplayBuffer(args.buffer_size) for agent in agents}
print(" > Initializing neural networks")
if args.parameter_sharing_model:
# Parameter sharing is being used
print(f" > Parameter sharing enabled")
eg_agent = agents[0]
onehot_keys = {agent: i for i, agent in enumerate(agents)}
if args.global_obs:
print(f" > Global observations enabled")
# Define the observation space dimensions (depending on whether or not global observations are being used)
observation_space_shape = tuple((shape+1) * (num_agents) for shape in observation_spaces[eg_agent].shape)
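# e.g. with 4 agents and a per-agent observation of shape (10,), the shared network input is
# ((10 + 1) * 4,) = (44,): the 4 concatenated observations (40 values) plus a 4-dim one-hot agent ID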
global_obs = np.hstack(list(obses.values()))
for agent in agents:
onehot = np.zeros(num_agents)
onehot[onehot_keys[agent]] = 1.0
obses[agent] = np.hstack([onehot, global_obs])
else:
print(f" > Global observations NOT enabled")
# Convert (X,) shape from tuple to int so it can be modified
observation_space_shape = np.array(observation_spaces[eg_agent].shape).prod() + num_agents
observation_space_shape = tuple(np.array([observation_space_shape]))
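# e.g. a (10,) per-agent observation with 4 agents becomes a (14,) network input:
# the agent's own observation prefixed with its 4-dim one-hot agent ID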
for agent in agents:
onehot = np.zeros(num_agents)
onehot[onehot_keys[agent]] = 1.0
obses[agent] = np.hstack([onehot, obses[agent]])
# Single q-network (i.e. "critic") for training
q_network = QNetwork(observation_space_shape, action_spaces[eg_agent].n).to(device)
# The target q-network
target_network = QNetwork(observation_space_shape, action_spaces[eg_agent].n).to(device)
target_network.load_state_dict(q_network.state_dict())
# Single actor network for training
actor_network = Actor(observation_space_shape, action_spaces[eg_agent].n).to(device)
# Optimizer for the critic
optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate)
# Optimizer for the actor
actor_optimizer = optim.Adam(list(actor_network.parameters()), lr=args.learning_rate)
print(f" > Observation space shape: {observation_space_shape}".format(observation_space_shape))
print(f" > Actionspace shape: {action_spaces[agent].n}")
print(f" > Q-network structure: { q_network}")
else:
print(f" > Parameter sharing NOT enabled")
# Dictionary for storing q-networks (maps agent to a q-network), these are the "critics"
q_network = {}
# Dictionary for storing target networks (maps agent to a network)
target_network = {}
# Dictionary for storing actor networks (maps agents to a network)
actor_network = {}
# Dictionary for storing optimizer for each agent's network
optimizer = {}
# Dictionary for storing the optimizers used to train the actor networks
actor_optimizer = {}
for agent in agents:
observation_space_shape = tuple(shape * num_agents for shape in observation_spaces[agent].shape) if args.global_obs else observation_spaces[agent].shape
q_network[agent] = QNetwork(observation_space_shape, action_spaces[agent].n).to(device)
target_network[agent] = QNetwork(observation_space_shape, action_spaces[agent].n).to(device)
# Initialize the target network to be the same as the critic network
target_network[agent].load_state_dict(q_network[agent].state_dict())
actor_network[agent] = Actor(observation_space_shape, action_spaces[agent].n).to(device)
# Each agent gets its own optimizer (all agents share the same optimizer type and learning rate)
optimizer[agent] = optim.Adam(q_network[agent].parameters(), lr=args.learning_rate)
actor_optimizer[agent] = optim.Adam(list(actor_network[agent].parameters()), lr=args.learning_rate)
print(f" > Agent: {agent}".format(agent))
print(f" > Observation space shape: {observation_space_shape}".format(observation_space_shape))
print(f" > Action space shape: {action_spaces[agent].n}")
# network of last agent
print(f" > Q-network structure: { q_network[agent]}")
# Global states
if args.global_obs:
print(f" > Global observations enabled")
global_obs = np.hstack(list(obses.values()))
obses = {agent: global_obs for agent in agents}
else:
print(f" > Global observations NOT enabled")
loss_fn = nn.MSELoss() # TODO: should the loss function be configurable?
actor_loss_fn = nn.CrossEntropyLoss()
print(" > Device: ",device.__repr__())
if args.render:
env.render() # TODO: verify that the sumo env supports render
# Dictionary that maps the each agent to its cumulative reward each episode
episode_rewards = {agent: 0 for agent in agents}
# Dictionary that maps each agent to the maximum speed observed at each step of the agent's episode
episode_max_speeds = {agent: [] for agent in agents}
# Dictionary that maps each agent to the avg speed reward obtained by each agent
episode_avg_speed_rewards = {agent: 0 for agent in agents}
# Dictionary that maps each agent to the accumulated number of stopped cars it observes during episode
episode_accumulated_stopped = {agent: 0 for agent in agents}
# Dictionary that maps each agent to the action it selected
actions = {agent: None for agent in agents}
# Dictionary that maps each agent to the loss values for its critic network
losses = {agent: None for agent in agents}
# Dictionary that maps each agent to the loss values for its actor network
actor_losses = {agent: None for agent in agents}
lir_1 = 0
uir_1 = 0
var_1 = 0
cnt = 0
for global_step in range(args.total_timesteps):
# ALGO LOGIC: put action logic here
# Inflate/Deflate the total number of timesteps based on the exploration fraction
epsilon = LinearSchedule(args.start_e, args.end_e, args.exploration_fraction*args.total_timesteps, global_step)
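# A sketch of the expected linear decay (the actual behavior is defined in marl_utils.linear_schedule):
#   slope = (args.end_e - args.start_e) / (args.exploration_fraction * args.total_timesteps)
#   epsilon = max(args.end_e, args.start_e + slope * global_step)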
# Set the action for each agent
for agent in agents:
if random.random() < epsilon:
actions[agent] = action_spaces[agent].sample()
else:
# The actor chooses the action
if args.parameter_sharing_model:
action, _, _ = actor_network.get_action(obses[agent])
else:
action, _, _ = actor_network[agent].get_action(obses[agent])
actions[agent] = action.detach().cpu().numpy()
# Letting critic pick the actions
# logits = q_network[agent].forward(obses[agent].reshape((1,)+obses[agent].shape)) # Used in SUMO but not in simple_spread
# actions[agent] = torch.argmax(logits, dim=1).tolist()[0]
# TRY NOT TO MODIFY: execute the game and log data.
next_obses, rewards, dones, _, _ = env.step(actions)
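# NOTE: under the PettingZoo parallel API, step() returns per-agent dicts of
# (observations, rewards, terminations, truncations, infos); truncations and infos are discarded here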
if args.parameter_sharing_model:
# When using parameter sharing, add one hot encoding for either global observations or independent observations
if args.global_obs:
global_next_obs = np.hstack(list(next_obses.values()))
for agent in agents:
onehot = np.zeros(num_agents)
onehot[onehot_keys[agent]] = 1.0
next_obses[agent] = np.hstack([onehot, global_next_obs])
else:
for agent in agents:
onehot = np.zeros(num_agents)
onehot[onehot_keys[agent]] = 1.0
next_obses[agent] = np.hstack([onehot, next_obses[agent]])
else:
# Global states
if args.global_obs:
global_obs = np.hstack(list(next_obses.values()))
next_obses = {agent: global_obs for agent in agents}
if args.render:
env.render()
# Extract performance about how we're doing so far
# Accumulated min reward received by any agent this step
lir_1 += min(rewards.values())
# Accumulated max reward received by any agent this step
uir_1 += max(rewards.values())
# Accumulated variance of rewards received by all agents this step
var_1 += np.var(list(rewards.values()))
cnt += 1
for agent in agents:
episode_rewards[agent] += rewards[agent]
# TODO: need to modify this for global observations
# max speed is the last element of the custom observation array
episode_max_speeds[agent].append(next_obses[agent][-1])
# Avg speed reward has been added to observation (as the second to last element)
agent_avg_speed = next_obses[agent][-2]
# The env wrapper needs to be unwrapped in order to properly access the info dict
info = env.unwrapped.env._compute_info()
# Total number of cars stopped at this agent
agent_cars_stopped = info[f'{agent}_stopped']
if ((agent_cars_stopped == 0.0) and (agent_avg_speed == 0.0)):
# No cars and no average speed means there are no cars present in the intersection
avg_speed_reward = 0.0
else:
# Compute this metric only if there were cars present in the intersection
# This conforms to the way the avg speed rewards are calculated
avg_speed_reward = CalculateSpeedError(speed=agent_avg_speed,
speed_limit=args.sumo_average_speed_limit,
lower_speed_limit=args.sumo_average_speed_limit)
episode_avg_speed_rewards[agent] += avg_speed_reward
episode_accumulated_stopped[agent] += agent_cars_stopped
rb[agent].put((obses[agent], actions[agent], rewards[agent], next_obses[agent], dones[agent]))
# ALGO LOGIC: critic training
if (global_step > args.learning_starts) and (global_step % args.train_frequency == 0):
if args.parameter_sharing_model:
# Randomly sample an agent to use to calculate the estimated state-action values
# NOTE: Observations pulled from the replay buffer have one-hot encoding applied to them already
agent = random.choice(agents)
s_obses, s_actions, s_rewards, s_next_obses, s_dones = rb[agent].sample(args.batch_size)
with torch.no_grad():
target = torch.max(target_network.forward(s_next_obses), dim=1)[0]
td_target = torch.Tensor(s_rewards).to(device) + args.gamma * target * (1 - torch.Tensor(s_dones).to(device))
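# 1-step TD target: y = r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed on terminal transitions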
q_values = q_network.forward(s_obses)
# Get the Q(s,a) value of the action actually taken for each observation in the batch
old_val = q_values.gather(1, torch.LongTensor(s_actions).view(-1,1).to(device)).squeeze()
loss = loss_fn(td_target, old_val)
losses[agent] = loss.item()
# optimize the model
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(list(q_network.parameters()), args.max_grad_norm)
optimizer.step()
# Actor training
a, log_pi, action_probs = actor_network.get_action(s_obses)
# Compute the loss for this agent's actor
# NOTE: Actor uses cross-entropy loss function where
# input is the policy dist and the target is the value function with one-hot encoding applied
# Q-values from "critic" encoded so that the highest state-action value maps to a probability of 1
q_values_one_hot = one_hot_q_values(q_values)
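# e.g. batch Q-values of [0.2, 1.5, 0.7] become the one-hot target distribution [0., 1., 0.]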
actor_loss = actor_loss_fn(action_probs, q_values_one_hot.to(device))
actor_losses[agent] = actor_loss.item()
actor_optimizer.zero_grad()
actor_loss.backward()
nn.utils.clip_grad_norm_(list(actor_network.parameters()), args.max_grad_norm)
actor_optimizer.step()
# update the target network
if global_step % args.target_network_frequency == 0:
# TODO: could also perform a "soft update" of the target network (see
# https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html#:~:text=%23%20Soft%20update%20of%20the%20target%20network%27s%20weights)
target_network.load_state_dict(q_network.state_dict())
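# A soft update would blend the weights instead of copying them, e.g. (a sketch only; it assumes a
# hypothetical smoothing coefficient args.tau that the config parser may not define):
#   for target_param, param in zip(target_network.parameters(), q_network.parameters()):
#       target_param.data.copy_(args.tau * param.data + (1.0 - args.tau) * target_param.data)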
if global_step % args.nn_save_freq == 0:
# The shared critic and actor networks only need to be saved once per snapshot
torch.save(q_network.state_dict(), f"{nn_dir}/critic_networks/{global_step}.pt")
torch.save(actor_network.state_dict(), f"{nn_dir}/actor_networks/{global_step}.pt")
else:
# Update the networks for each agent
for agent in agents:
s_obses, s_actions, s_rewards, s_next_obses, s_dones = rb[agent].sample(args.batch_size)
with torch.no_grad():
target_max = torch.max(target_network[agent].forward(s_next_obses), dim=1)[0]
td_target = torch.Tensor(s_rewards).to(device) + args.gamma * target_max * (1 - torch.Tensor(s_dones).to(device))
q_values = q_network[agent].forward(s_obses)
# Get the Q(s,a) value of the action actually taken for each observation in the batch
old_val = q_values.gather(1, torch.LongTensor(s_actions).view(-1,1).to(device)).squeeze()
# Compute loss for agent's critic
loss = loss_fn(td_target, old_val)
losses[agent] = loss.item()
# optimize the model for the critic
optimizer[agent].zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(list(q_network[agent].parameters()), args.max_grad_norm)
optimizer[agent].step()
# Actor training
a, log_pi, action_probs = actor_network[agent].get_action(s_obses)
# Compute the loss for this agent's actor
# NOTE: Actor uses cross-entropy loss function where
# input is the policy dist and the target is the value function with one-hot encoding applied
# Q-values from "critic" encoded so that the highest state-action value maps to a probability of 1
q_values_one_hot = one_hot_q_values(q_values)
actor_loss = actor_loss_fn(action_probs, q_values_one_hot.to(device))
actor_losses[agent] = actor_loss.item()
actor_optimizer[agent].zero_grad()
actor_loss.backward()
nn.utils.clip_grad_norm_(list(actor_network[agent].parameters()), args.max_grad_norm)
actor_optimizer[agent].step()
# update the target network
if global_step % args.target_network_frequency == 0:
target_network[agent].load_state_dict(q_network[agent].state_dict())
# Save loss values occasionally
if global_step % 100 == 0:
writer.add_scalar("losses/td_loss/" + agent, loss, global_step)
writer.add_scalar("losses/actor_loss/" + agent, actor_loss, global_step)
# Save a snapshot of the actor and critic networks at this iteration of training
if global_step % args.nn_save_freq == 0:
for a in agents:
torch.save(q_network[a].state_dict(), f"{nn_dir}/critic_networks/{global_step}-{a}.pt")
torch.save(actor_network[a].state_dict(), f"{nn_dir}/actor_networks/{global_step}-{a}.pt")
# Save loss values occasionally
# NOTE: When using parameter sharing, loss is not calculated for all agents each update step. This means that when loss is logged,
# only one agent has an up-to-date value. In other words, if there are 4 agents, the row in the loss csv file for training step X
# will have values for all 4 agents but only one of those values will be current (the other 3 will be stale by some number of
# training steps). This should not matter in practice, as the log file will still show the same trends for each agent and for the
# system overall
if (global_step > args.learning_starts) and (global_step % args.train_frequency == 0):
if (global_step % 100 == 0):
# Log the data to TensorBoard
system_loss = sum(list(losses.values()))
writer.add_scalar("losses/system_td_loss/", system_loss, global_step)
system_actor_loss = sum(list(actor_losses.values()))
writer.add_scalar("losses/system_actor_loss/", system_actor_loss, global_step)
# Log data to CSV
with open(f"{csv_dir}/critic_loss.csv", "a", newline="") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=agents+['system_loss', 'global_step'])
csv_writer.writerow({**losses, **{'system_loss': system_loss, 'global_step': global_step}})
with open(f"{csv_dir}/actor_loss.csv", "a", newline="") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=agents+['system_actor_loss', 'global_step'])
csv_writer.writerow({**actor_losses, **{'system_actor_loss': system_actor_loss, 'global_step': global_step}})
# TRY NOT TO MODIFY: CRUCIAL step easy to overlook
obses = next_obses
# If all agents are done, log the results and reset the environment to continue training
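# (np.prod over the done flags is truthy only when every agent reports done; the max_cycles check
# forces an episode reset even if the environment never signals termination)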
if np.prod(list(dones.values())) or (global_step % args.max_cycles == args.max_cycles-1):
# Accumulated reward of all agents
system_episode_reward = sum(list(episode_rewards.values()))
# Accumulated avg speed reward of all agents
system_episode_avg_speed_reward = sum(list(episode_avg_speed_rewards.values()))
# Accumulated number of cars stopped at each step for all agents
system_accumulated_stopped = sum(list(episode_accumulated_stopped.values()))
# Calculate the maximum of all max speeds observed from each agent during the episode
agent_max_speeds = {agent:0 for agent in agents}
for agent in agents:
agent_max_speeds[agent] = max(episode_max_speeds[agent])
system_episode_max_speed = max(list(agent_max_speeds.values()))
system_episode_min_max_speed = min(list(agent_max_speeds.values()))
# The wrapper class needs to be unwrapped in order to properly access info
info = env.unwrapped.env._compute_info()
# Total number of cars stopped at end of episode
agents_total_stopped = info['agents_total_stopped']
print(f"\n > global_step={global_step}")
print(f" > agent_max_speeds {agent_max_speeds}")
print(f" > system_episode_max_speed {system_episode_max_speed}")
print(f" > system_episode_min_max_speed {system_episode_min_max_speed}")
print(f" > system total avg speed reward: {system_episode_avg_speed_reward}")
# TRY NOT TO MODIFY: record rewards for plotting purposes
print(f" > system reward: {system_episode_reward} using definition: {args.sumo_reward}")
print(f" > total cars stopped at end of episode: {agents_total_stopped}")
diff_1 = uir_1-lir_1
# var_1 = var_1/(cnt-1e-7)
lir_2 = min(episode_rewards.values())
uir_2 = max(episode_rewards.values())
diff_2 = uir_2-lir_2
var_2 = np.var(list(episode_rewards.values()))
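# diff_1/var_1 summarize the per-step spread of rewards accumulated over the episode, while
# diff_2/var_2 measure the spread of the agents' total episode rewards at episode end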
print(f" > system_episode_diff_1={diff_1}")
print(f" > uir1={uir_1}")
print(f" > lir1={lir_1}")
print(f" > system_variance1={var_1}")
print(f" > system_episode_diff_2={diff_2}")
print(f" > uir2={uir_2}")
print(f" > lir2={lir_2}")
print(f" > system_variance2={var_2}")
# Logging should only be done after we've started training, up until then, the agents are just getting experience
if global_step > args.learning_starts:
for agent in agents:
writer.add_scalar("charts/episode_reward/" + agent, episode_rewards[agent], global_step)
writer.add_scalar("charts/episode_reward/uir_1", uir_1, global_step)
writer.add_scalar("charts/episode_reward/lir_1", lir_1, global_step)
writer.add_scalar("charts/episode_reward/diff_1", diff_1, global_step)
writer.add_scalar("charts/episode_reward/var_1", var_1, global_step)
writer.add_scalar("charts/episode_reward/uir_2", uir_2, global_step)
writer.add_scalar("charts/episode_reward/lir_2", lir_2, global_step)
writer.add_scalar("charts/episode_reward/diff_2", diff_2, global_step)
writer.add_scalar("charts/episode_reward/var_2", var_2, global_step)
writer.add_scalar("charts/epsilon/", epsilon, global_step)
writer.add_scalar("charts/system_episode_reward/", system_episode_reward, global_step)
with open(f"{csv_dir}/episode_reward.csv", "a", newline="") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=agents+['system_episode_reward', 'global_step'])
csv_writer.writerow({**episode_rewards, **{'system_episode_reward': system_episode_reward, 'global_step': global_step}})
with open(f"{csv_dir}/episode_max_speeds.csv", "a", newline="") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=agents+['system_episode_max_speed', 'system_episode_min_max_speed', 'system_accumulated_stopped', 'global_step'])
csv_writer.writerow({**agent_max_speeds, **{'system_episode_max_speed': system_episode_max_speed,
'system_episode_min_max_speed': system_episode_min_max_speed,
'system_accumulated_stopped' : system_accumulated_stopped,
'global_step': global_step}})
# If we're using the SUMO env, also save some data specific to that environment
if using_sumo:
env.unwrapped.save_csv(sumo_csv, global_step)
# Reset the env to continue training
obses, _ = env.reset()
lir_1 = 0
uir_1 = 0
var_1 = 0
cnt = 0
if args.parameter_sharing_model:
# Add one hot encoding for either global observations or independent observations once the environment has been reset
if args.global_obs:
global_obs = np.hstack(list(obses.values()))
for agent in agents:
onehot = np.zeros(num_agents)
onehot[onehot_keys[agent]] = 1.0
obses[agent] = np.hstack([onehot, global_obs])
else:
for agent in agents:
onehot = np.zeros(num_agents)
onehot[onehot_keys[agent]] = 1.0
obses[agent] = np.hstack([onehot, obses[agent]])
else:
# Global states
if args.global_obs:
global_obs = np.hstack(list(obses.values()))
obses = {agent: global_obs for agent in agents}
if args.render:
env.render()
# Reset dictionaries for next episode
episode_rewards = {agent: 0 for agent in agents}
episode_avg_speed_rewards = {agent: 0 for agent in agents}
episode_accumulated_stopped = {agent: 0 for agent in agents}
episode_max_speeds = {agent: [0] for agent in agents}
actions = {agent: None for agent in agents}
env.close()
writer.close()