Example runs
Operate the robot with the arrow keys without reinforcement learning.
import gym
import gym.spaces
import gym_foodhunting
import pybullet as p


def getAction():
    # Map PyBullet keyboard events to the discrete actions:
    # UP arrow -> 0, LEFT arrow -> 1, RIGHT arrow -> 2, no key pressed -> 0.
    keys = p.getKeyboardEvents()
    if p.B3G_UP_ARROW in keys and keys[p.B3G_UP_ARROW] & p.KEY_IS_DOWN:
        return 0
    elif p.B3G_LEFT_ARROW in keys and keys[p.B3G_LEFT_ARROW] & p.KEY_IS_DOWN:
        return 1
    elif p.B3G_RIGHT_ARROW in keys and keys[p.B3G_RIGHT_ARROW] & p.KEY_IS_DOWN:
        return 2
    else:
        return 0


def main():
    env = gym.make('FoodHuntingDiscreteGUI-v0')
    # env = gym.make('FoodHuntingDiscrete-v0')
    print(env.observation_space, env.action_space)
    obs = env.reset()
    while True:  # runs until interrupted, e.g. with Ctrl-C
        action = getAction()
        # action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        # print(action, obs, reward, done, info)
        if done:
            obs = env.reset()
    env.close()


if __name__ == '__main__':
    main()
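The commented-out lines hint at a headless variant: FoodHuntingDiscrete-v0 without the GUI, driven by random actions from env.action_space.sample(). Below is a minimal sketch along those lines that runs a few episodes and prints each episode's total reward (the function name and episode count are arbitrary choices, not part of the repository):

import gym
import gym_foodhunting


def random_rollout(n_episodes=3):
    # Headless environment with a random policy; a quick sanity check before training anything.
    env = gym.make('FoodHuntingDiscrete-v0')
    for episode in range(n_episodes):
        obs = env.reset()
        done, total_reward = False, 0.0
        while not done:
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            total_reward += reward
        print(episode, total_reward)
    env.close()


if __name__ == '__main__':
    random_rollout()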
PPO is used as the RL algorithm and a CNN as the neural network model.
import argparse
import gym
import gym_foodhunting
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2


def learn(env_name, save_file, total_timesteps):
    # Train a PPO2 agent with a CNN policy and save the model.
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO2(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=total_timesteps)
    model.save(save_file)
    del model
    env.close()


def play(env_name, load_file, total_timesteps):
    # Load a trained model and run it in the environment.
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO2.load(load_file, verbose=1)
    obs = env.reset()
    for i in range(total_timesteps):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render() # dummy
        if done:
            print(info)
    del model
    env.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--play', action='store_true', help='play or learn.')
    parser.add_argument('--env_name', type=str, default='FoodHuntingDiscreteGUI-v0', help='environment name.')
    parser.add_argument('--filename', type=str, default='saved_model', help='filename to save/load model.')
    parser.add_argument('--total_timesteps', type=int, default=10000, help='total timesteps.')
    args = parser.parse_args()
    if args.play:
        play(args.env_name, args.filename, args.total_timesteps)
    else:
        learn(args.env_name, args.filename, args.total_timesteps)
Learning:
# Learn
python examples/example_rl.py --env_name="FoodHuntingDiscrete-v0" --total_timesteps=10000 --filename="saved_model"
Evaluation:
# Play without GUI
python examples/example_rl.py --env_name="FoodHuntingDiscrete-v0" --total_timesteps=10000 --filename="saved_model" --play
# Play with GUI
python examples/example_rl.py --env_name="FoodHuntingDiscreteGUI-v0" --total_timesteps=10000 --filename="saved_model" --play
Multiple environments are executed in parallel to reduce computation time (the n_cpu parameter). Training is monitored, and the model file is saved whenever the best mean episode reward so far is updated. Learning is terminated once the mean episode reward exceeds the specified threshold (the reward_threshold parameter).
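The actual implementation is agents/ppo_agent.py (referenced below); the following is only a rough sketch of that mechanism built from standard stable_baselines pieces. It assumes a Monitor wrapper writing episode rewards to a local monitor_log directory; the directory name, constants, and best_model.pkl filename are placeholders, not the names used by ppo_agent.py:

import os
import gym
import gym_foodhunting
import numpy as np
from stable_baselines.bench import Monitor
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines import PPO2

LOG_DIR = 'monitor_log'    # placeholder directory for Monitor files
REWARD_THRESHOLD = 3.0     # plays the role of --reward_threshold
N_CPU = 8                  # plays the role of --n_cpu
best_mean_reward = -np.inf


def make_env(rank):
    def _init():
        env = gym.make('FoodHuntingHSRDiscrete-v1')
        # Monitor records per-episode rewards so the callback can read them.
        return Monitor(env, os.path.join(LOG_DIR, str(rank)))
    return _init


def callback(_locals, _globals):
    # Called periodically by model.learn(); returning False stops training.
    global best_mean_reward
    x, y = ts2xy(load_results(LOG_DIR), 'timesteps')
    if len(y) > 0:
        mean_reward = np.mean(y[-100:])
        if mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            _locals['self'].save('best_model.pkl')  # save on a new best mean reward
        if mean_reward >= REWARD_THRESHOLD:
            return False  # reward threshold reached: terminate learning
    return True


if __name__ == '__main__':
    os.makedirs(LOG_DIR, exist_ok=True)
    # SubprocVecEnv runs N_CPU environments in parallel processes.
    env = SubprocVecEnv([make_env(i) for i in range(N_CPU)])
    model = PPO2(CnnPolicy, env, verbose=1, tensorboard_log='tblog')
    model.learn(total_timesteps=500000, callback=callback)
    env.close()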
Source code:
- PyLIS/gym-foodhunting/agents/ppo_agent.py
cd gym-foodhunting
# Run this to enable SubprocVecEnv on Mac OS X.
# export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
# See https://github.com/rtomayko/shotgun/issues/69#issuecomment-338401331
# See available options.
python agents/ppo_agent.py --help
# Learn
# This may take a few hours.
time python agents/ppo_agent.py --env_name="FoodHuntingHSRDiscrete-v1" --total_timesteps=500000 --n_cpu=8 --reward_threshold=3.0 --tensorboard_log="tblog"
# Monitor
tensorboard --logdir tblog
# Open web browser and access http://localhost:6006/
# Play with GUI
# This will open PyBullet window.
time python agents/ppo_agent.py --env_name="FoodHuntingHSRDiscrete-v1" --load_file="FoodHuntingHSR-v1_best.pkl" --total_timesteps=500000 --n_cpu=8 --play