From 7c321f4b1b6eb700206caf0ee78bbfb47a84eb51 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Thu, 20 Oct 2022 16:29:58 +0100 Subject: [PATCH 1/2] Feature: Merged Simmer+Saute/BugFix: SAC logging --- README.md | 86 ++- SAUTE/.gitignore | 140 ---- SAUTE/README.md | 114 --- SAUTE/common/__init__.py | 0 SAUTE/common/argument_parser.py | 18 - SAUTE/common/base_runner.py | 327 --------- SAUTE/common/utils.py | 28 - SAUTE/envs/__init__.py | 0 SAUTE/envs/mountain_car/__init__.py | 16 - SAUTE/envs/mountain_car/mountain_car.py | 79 --- SAUTE/envs/mountain_car/test.py | 19 - SAUTE/envs/pendula/__init__.py | 29 - SAUTE/envs/pendula/double_pendulum.py | 59 -- SAUTE/envs/pendula/single_pendulum.py | 216 ------ SAUTE/envs/pendula/test.py | 27 - SAUTE/envs/reacher/__init__.py | 16 - SAUTE/envs/reacher/reacher.py | 96 --- SAUTE/envs/reacher/test.py | 21 - SAUTE/envs/safety_gym/augmented_sg_envs.py | 136 ---- SAUTE/envs/utils.py | 13 - SAUTE/envs/wrappers/safe_env.py | 22 - SAUTE/envs/wrappers/saute_env.py | 115 --- SAUTE/exps/__init__.py | 0 .../tf_trpo_double_pendulum_v1.py | 94 --- .../tf_trpo_double_pendulum_v2.py | 110 --- .../tf_trpo_double_pendulum_v3.py | 95 --- .../tf_trpo_double_pendulum_v4.py | 67 -- .../tf_trpo_double_pendulum_v5.py | 60 -- SAUTE/exps/mountain_car/tf_sac_v1.py | 76 -- SAUTE/exps/reacher/tf_trpo_reacher_v1.py | 77 -- .../exps/safety_gym/tf_trpo_safety_gym_v1.py | 77 -- .../single_pendulum/tf_ppo_pendulum_v1.py | 76 -- .../single_pendulum/tf_sac_pendulum_v1.py | 77 -- .../single_pendulum/tf_trpo_pendulum_v1.py | 76 -- .../tf_trpo_pendulum_v2_abblation.py | 74 -- SAUTE/main.py | 79 --- SAUTE/pip_repos.txt | 71 -- SAUTE/sauterl.yml | 25 - SAUTE/tf_algos/__init__.py | 0 SAUTE/tf_algos/common/__init__.py | 0 SAUTE/tf_algos/common/runner.py | 229 ------ SAUTE/tf_algos/common/utils.py | 9 - .../safety_starter_agents/__init__.py | 0 .../tf_algos/safety_starter_agents/agents.py | 267 ------- .../safety_starter_agents/run_agents.py | 650 ----------------- .../safety_starter_agents/sac_utils.py | 160 ----- .../safety_starter_agents/test_agents.py | 84 --- .../tf_algos/safety_starter_agents/tf_cpo.py | 14 - .../tf_algos/safety_starter_agents/tf_ppo.py | 51 -- .../tf_algos/safety_starter_agents/tf_sac.py | 661 ------------------ .../tf_algos/safety_starter_agents/tf_trpo.py | 72 -- SIMMER/.gitignore | 162 +++++ SIMMER/README.md | 23 +- .../single_pendulum/key_observation_cfg.py | 37 + SIMMER/main.py | 3 + .../tf_algos/safety_starter_agents/tf_sac.py | 36 +- 56 files changed, 303 insertions(+), 4866 deletions(-) delete mode 100644 SAUTE/.gitignore delete mode 100644 SAUTE/README.md delete mode 100644 SAUTE/common/__init__.py delete mode 100644 SAUTE/common/argument_parser.py delete mode 100644 SAUTE/common/base_runner.py delete mode 100644 SAUTE/common/utils.py delete mode 100644 SAUTE/envs/__init__.py delete mode 100644 SAUTE/envs/mountain_car/__init__.py delete mode 100644 SAUTE/envs/mountain_car/mountain_car.py delete mode 100644 SAUTE/envs/mountain_car/test.py delete mode 100644 SAUTE/envs/pendula/__init__.py delete mode 100644 SAUTE/envs/pendula/double_pendulum.py delete mode 100644 SAUTE/envs/pendula/single_pendulum.py delete mode 100644 SAUTE/envs/pendula/test.py delete mode 100644 SAUTE/envs/reacher/__init__.py delete mode 100644 SAUTE/envs/reacher/reacher.py delete mode 100644 SAUTE/envs/reacher/test.py delete mode 100644 SAUTE/envs/safety_gym/augmented_sg_envs.py delete mode 100644 SAUTE/envs/utils.py delete mode 100644 SAUTE/envs/wrappers/safe_env.py delete mode 100644 
SAUTE/envs/wrappers/saute_env.py delete mode 100644 SAUTE/exps/__init__.py delete mode 100644 SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v1.py delete mode 100644 SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v2.py delete mode 100644 SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v3.py delete mode 100644 SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v4.py delete mode 100644 SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v5.py delete mode 100644 SAUTE/exps/mountain_car/tf_sac_v1.py delete mode 100644 SAUTE/exps/reacher/tf_trpo_reacher_v1.py delete mode 100644 SAUTE/exps/safety_gym/tf_trpo_safety_gym_v1.py delete mode 100644 SAUTE/exps/single_pendulum/tf_ppo_pendulum_v1.py delete mode 100644 SAUTE/exps/single_pendulum/tf_sac_pendulum_v1.py delete mode 100644 SAUTE/exps/single_pendulum/tf_trpo_pendulum_v1.py delete mode 100644 SAUTE/exps/single_pendulum/tf_trpo_pendulum_v2_abblation.py delete mode 100644 SAUTE/main.py delete mode 100644 SAUTE/pip_repos.txt delete mode 100644 SAUTE/sauterl.yml delete mode 100644 SAUTE/tf_algos/__init__.py delete mode 100644 SAUTE/tf_algos/common/__init__.py delete mode 100644 SAUTE/tf_algos/common/runner.py delete mode 100644 SAUTE/tf_algos/common/utils.py delete mode 100644 SAUTE/tf_algos/safety_starter_agents/__init__.py delete mode 100644 SAUTE/tf_algos/safety_starter_agents/agents.py delete mode 100644 SAUTE/tf_algos/safety_starter_agents/run_agents.py delete mode 100644 SAUTE/tf_algos/safety_starter_agents/sac_utils.py delete mode 100644 SAUTE/tf_algos/safety_starter_agents/test_agents.py delete mode 100644 SAUTE/tf_algos/safety_starter_agents/tf_cpo.py delete mode 100644 SAUTE/tf_algos/safety_starter_agents/tf_ppo.py delete mode 100644 SAUTE/tf_algos/safety_starter_agents/tf_sac.py delete mode 100644 SAUTE/tf_algos/safety_starter_agents/tf_trpo.py create mode 100644 SIMMER/.gitignore create mode 100644 SIMMER/exps/single_pendulum/key_observation_cfg.py diff --git a/README.md b/README.md index c2f7ff2b..eaa58b4c 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,15 @@ This directory contains official implementations for Bayesian optimisation & Reinforcement Learning works developed by Huawei, Noah's Ark Lab. -- [HEBO: Heteroscedastic Evolutionary Bayesian Optimisation](./HEBO) -- [T-LBO](./T-LBO) -- [BOiLS: Bayesian Optimisation for Logic Synthesis](./BOiLS) -- [Bayesian Optimisation with Compositional Optimisers](./CompBO) -- [Sauté RL: Almost Surely Safe RL Using State Augmentation](./SAUTE) -- [SIMMER - Enhancing Safe Exploration Using Safety State Augmentation](./SIMMER) +- Bayesian Optimisation Research + - [HEBO: Heteroscedastic Evolutionary Bayesian Optimisation](./HEBO) + - [T-LBO](./T-LBO) + - [BOiLS: Bayesian Optimisation for Logic Synthesis](./BOiLS) + - [Bayesian Optimisation with Compositional Optimisers](./CompBO) + - [AntBO: Antibody Design with Combinatorial Bayesian Optimisation](./AntBO) +- Reinforcement Learning Research + - [Sauté RL and Simmer RL: Safe Reinforcement Learning Using Safety State Augmentation](./SIMMER) + - [Model-Based Offline Reinforcement Learning with Pessimism-Modulated Dynamics Belief](./PMDB) Further instructions are provided in the README files associated to each project. @@ -89,29 +92,37 @@ comprising synthetic optimisation tasks as well as tasks from Bayesmark. Given t function maximisation subroutine, we posit that the adoption of compositional optimisers has the potential to yield performance improvements across all domains in which Bayesian optimisation is currently being applied.
-# Reinforcement Learning Research +## [AntBO: Antibody Design with Combinatorial Bayesian Optimisation](./AntBO) -## [Sauté RL: Almost Surely Safe RL Using State Augmentation](./SAUTE/) +![AntBO overview](./AntBO/figures/AntBO_illustrationPNG.PNG?raw=true) -### Sautéing a safe environment +Codebase associated to: [AntBO: Towards Real-World Automated Antibody Design with Combinatorial Bayesian Optimisation](https://arxiv.org/abs/2201.12570). -Safety state augmentation (sautéing) is done in a straightforward manner. Assume a safe environment is defined in -a class `MySafeEnv`. The sautéed environment is defined using a decorator `saute_env`, which contains all the -required definitions. Custom and overloaded functions can be defined in the class body. +##### Abstract -```python -from envs.common.saute_env import saute_env +Antibodies are canonically Y-shaped multimeric proteins capable of highly specific molecular recognition. The CDRH3 +region located at the tip of variable chains of an antibody dominates antigen-binding specificity. Therefore, it is a +priority to design optimal antigen-specific CDRH3 regions to develop therapeutic antibodies to combat harmful pathogens. +However, the combinatorial nature of CDRH3 sequence space makes it impossible to search for an optimal binding sequence +exhaustively and efficiently, especially not experimentally. Here, we present AntBO: a Combinatorial Bayesian +Optimisation framework enabling efficient in silico design of the CDRH3 region. Ideally, antibodies should bind to +their target antigen and be free from any harmful outcomes. Therefore, we introduce the CDRH3 trust region that +restricts the search to sequences with feasible developability scores. To benchmark AntBO, we use the Absolut! software +suite as a black-box oracle because it can score the target specificity and affinity of designed antibodies in silico +in an unconstrained fashion. The results across 188 antigens demonstrate the benefit of AntBO in designing CDRH3 regions +with diverse biophysical properties. In under 200 protein designs, AntBO can suggest antibody sequences that outperform +the best binding sequence drawn from 6.9 million experimentally obtained CDRH3s and a commonly used genetic algorithm +baseline. Additionally, AntBO finds very-high affinity CDRH3 sequences in only 38 protein designs whilst requiring no +domain knowledge. We conclude AntBO brings automated antibody design methods closer to what is practically viable for +in vitro experimentation. +# Reinforcement Learning Research -@saute_env -class MySautedEnv(MySafeEnv): - """New sauteed class.""" -``` +## [Sauté RL and Simmer RL: Safe Reinforcement Learning Using Safety State Augmentation](./SIMMER) -Codebase associated to: [Sauté RL: Almost Surely Safe RL Using State Augmentation](https://arxiv.org/pdf/2202.06558.pdf). -. +Codebase associated to: [Sauté RL: Almost Surely Safe RL Using State Augmentation](https://arxiv.org/pdf/2202.06558.pdf) and [Enhancing Safe Exploration Using Safety State Augmentation](https://arxiv.org/pdf/2206.02675.pdf). -##### Abstract +##### Abstract for Sauté RL: Almost Surely Safe RL Using State Augmentation (ICML 2022) Satisfying safety constraints almost surely (or with probability one) can be critical for deployment of Reinforcement Learning (RL) in real-life applications. For example, plane landing and take-off should ideally occur with probability @@ -123,12 +134,9 @@ approach has a plug-and-play nature, i.e., any RL algorithm can be "sauteed". 
Additionally, state augmentation allows for policy generalization across safety constraints. We finally show that Saute RL algorithms can outperform their state-of-the-art counterparts when constraint satisfaction is of high importance. -## [SIMMER](./SIMMER) - -Codebase associated to: [Enhancing Safe Exploration Using Safety State Augmentation](https://arxiv.org/pdf/2206.02675.pdf). -##### Abstract +##### Abstract for Effects of Safety State Augmentation on Safe Exploration (NeurIPS 2022) Safe exploration is a challenging and important problem in model-free reinforcement learning (RL). Often the safety cost is sparse and unknown, which unavoidably leads to constraint violations -- a phenomenon ideally to be avoided in safety-critical applications. We tackle this problem by augmenting the state-space with a safety state, which is @@ -140,9 +148,33 @@ Safe exploration is a challenging and important problem in model-free reinforcem that simmering a safe algorithm can improve safety during training for both settings. We further show that Simmer can stabilize training and improve the performance of safe RL with average constraints. + + +## [Model-Based Offline Reinforcement Learning with Pessimism-Modulated Dynamics Belief](./PMDB) + +Codebase associated to: [Model-Based Offline Reinforcement Learning with Pessimism-Modulated Dynamics Belief](https://nips.cc/Conferences/2022/Schedule?showEvent=54842) accepted +at the **NeurIPS 2022** conference. + +##### Abstract +Model-based offline reinforcement learning (RL) aims to find a highly rewarding policy by leveraging a previously +collected static dataset and a dynamics model. While learned through reuse of the static dataset, the dynamics model's +generalization ability hopefully promotes policy learning if properly utilized. To that end, several works propose to +quantify the uncertainty of predicted dynamics, and explicitly apply it to penalize reward. However, as the dynamics and +the reward are intrinsically different factors in the context of an MDP, characterizing the impact of dynamics uncertainty +through a reward penalty may incur an unexpected tradeoff between model utilization and risk avoidance. In this work, we +instead maintain a belief distribution over dynamics, and evaluate/optimize the policy through biased sampling from the +belief. The sampling procedure, biased towards pessimism, is derived based on an alternating Markov game formulation +of offline RL. We formally show that the biased sampling naturally induces an updated dynamics belief with a +policy-dependent reweighting factor, termed *Pessimism-Modulated Dynamics Belief*. To improve the policy, we devise an +iterative regularized policy optimization algorithm for the game, with a guarantee of monotonic improvement under certain +conditions. To make it practical, we further devise an offline RL algorithm to approximately find the solution. Empirical +results show that the proposed approach achieves state-of-the-art performance on a wide range of benchmark tasks. + ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -### Codebase Contributors - Alexander I Cowen-Rivers, Antoine Grosnit, Alexandre Max Maravel, Aivar Sootla, Taher Jafferjee, Ryan Rhys Griffiths, Wenlong Lyu, Zhi Wang. +## Codebase Contributors + + Current contributors: Antoine Grosnit, Alexandre Max Maravel, Taher Jafferjee, Wenlong Lyu, Kaiyang Guo. + Alumni contributors: Alexander I. Cowen-Rivers, Aivar Sootla, Ryan Rhys Griffiths, Zhi Wang.
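The Sauté and Simmer abstracts above rest on the same mechanism: the remaining safety budget is tracked as a normalised safety state, the observation is augmented with it, and the objective is reshaped once the budget is spent. Below is a minimal sketch of that bookkeeping, distilled from the `saute_env` wrapper removed later in this patch; the function names are illustrative, and the reward reshaping is an assumption based on the abstract (the actual wrapper exposes `use_reward_shaping` and `use_state_augmentation` flags for ablations).

```python
import numpy as np

def augment_state(state: np.ndarray, safety_state: float) -> np.ndarray:
    """Append the normalised remaining safety budget z to the observation."""
    return np.hstack([state, safety_state])

def safety_step(safety_state: float, cost: float,
                safety_budget: float, saute_discount_factor: float) -> float:
    """Update the normalised safety state: z' = (z - c / d) / gamma."""
    safety_state -= cost / safety_budget
    return safety_state / saute_discount_factor

def reshape_reward(reward: float, safety_state: float, unsafe_reward: float) -> float:
    """Assumed shaping: once the budget is spent (z <= 0), return the unsafe reward."""
    return reward if safety_state > 0.0 else unsafe_reward
```

At each environment step the wrapper computes the safety cost, updates the safety state, reshapes the reward, and returns the augmented observation, so any off-the-shelf RL algorithm can be trained on the resulting MDP unchanged.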
diff --git a/SAUTE/.gitignore b/SAUTE/.gitignore deleted file mode 100644 index e57ef03f..00000000 --- a/SAUTE/.gitignore +++ /dev/null @@ -1,140 +0,0 @@ -# project specific -logs/ -plots/ - -*.png -*.npz - -# vscode stuff -.vscode -# pycharms stuff - -.idea/* -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -# env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - diff --git a/SAUTE/README.md b/SAUTE/README.md deleted file mode 100644 index c9570dc5..00000000 --- a/SAUTE/README.md +++ /dev/null @@ -1,114 +0,0 @@ -# Almost Surely Safe RL Using State Augmentation - Sauté RL - -## About - -Satisfying safety constraints almost surely (or with probability one) can be critical for deployment of Reinforcement Learning (RL) in real-life applications. For example, plane landing and take-off should ideally occur with probability one. We address the problem by introducing Safety Augmented (Sauté) Markov Decision Processes (MDPs), where the safety constraints are eliminated by augmenting them into the state-space and reshaping the objective. We show that Sauté MDP satisfies the Bellman equation and moves us closer to solving Safe RL with constraints satisfied almost surely. We argue that Sauté MDP allows to view Safe RL problem from a different perspective enabling new features. For instance, our approach has a plug-and-play nature, i.e., any RL algorithm can be "sautéed". Additionally, state augmentation allows for policy generalization across safety constraints. In our experiments, we show that Sauté RL algorithms outperforms their state-of-the-art counterparts when constraint satisfaction is of high importance. - -## Installation - -The following installation commands were tested in Ubuntu 18.04. - -Create a conda environment - -```console -conda config --append channels conda-forge -conda env create -f sauterl.yml -conda activate sauterl -``` - -Our implementation is based on the Open AI safety starter agents. 
To install the Open AI libraries run the following commands: - -```console -mkdir safe-rl -cd safe-rl -git clone https://github.com/openai/safety-starter-agents.git && cd safety-starter-agents && pip install -e . && cd .. -git clone https://github.com/openai/safety-gym.git && cd safety-gym && pip install -e . && cd .. -cd .. -``` - -Install the remaining libraries - -```console -pip install -r pip_repos.txt -``` - -## Sautéing an environment -Using our approach in practice is straightforward and requires just three steps: - -1. Creating a safe environment -2. Sautéing the safe environment -3. Running a standard Reinforcement Learning algorithm - -### Creating a safe environment - -In order to create a custom safe environment we need to define the safety cost and to inherit the rest of the definitions from the standard gym environment `MyEnv` -and the provided class `SafeEnv`. - - -```python -from envs.common.safe_env import SafeEnv - -class MySafeEnv(SafeEnv, MyEnv): - """New safety class""" - def _safety_cost_fn(self, state:np.ndarray, action:np.ndarray, next_state:np.ndarray) -> np.ndarray: - """Define the safety cost here." -``` - -The class `SafeEnv` contains the changes to the `step` method, which incorporates the safety constraints. Note that we assume that there is a method `MyEnv._get_obs()`. - -### Sautéing a safe environment -Safety state augmentation (sautéing) is done in a straightforward manner. Assume a safe environment is defined in a class `MySafeEnv`. The sautéed environment is defined using a decorator `saute_env`, which contains all the required definitions. Custom and overloaded functions can be defined in the class body. - -```python -from envs.common.saute_env import saute_env - -@saute_env -class MySautedEnv(MySafeEnv): - """New sauteed class.""" -``` - -## Running - -We release a few tested safe environments, which can be evaluated using main.py. The file takes two arguments: the experiment identifier and the number of experiments for a particular algorithm to run in parallel. For instance, - -```console -python main.py --experiment 11 --num-exps 5 -``` - - -Our experiments: - -ID | Environment | Algorithms | Type of Experiment ---- | --- | --- | --- -10 | Pendulum swing-up | SAC, Langrangian SAC, Saute SAC | Performance -11 | Pendulum swing-up | PPO, Langrangian PPO, Saute PPO | Performance -12 | Pendulum swing-up | TRPO, Langrangian TRPO, Saute TRPO, CPO | Performance -13 | Pendulum swing-up | Saute TRPO | Ablation -20 | Double Pendulum | TRPO, Saute TRPO, CPO| Performance -21 | Double Pendulum | Lagrangian TRPO | Performance -22 | Double Pendulum | Saute TRPO | Naive generalization across safety budgets -23 | Double Pendulum | Saute TRPO | Smart generalization across safety budgets -24 | Double Pendulum | Saute TRPO | Ablation over unsafe reward value -30 | Reacher | TRPO, Langrangian TRPO, Saute TRPO, CPO | Performance -40 | Safety gym | TRPO, Langrangian TRPO, Saute TRPO, CPO | Performance - - -## Output - -By default the output is saved to `./logs/` directory (the directory can be modified in the method `set_all_overrides` in the `BaseRunner` class). - -By default no checkpoints are saved, but the results are tracked in the tensorboard. - - -## Citation - -If you find our code useful please cite our paper! 
- -``` -@article{sootla2022saute, - title={SAUT\'E RL: Almost Surely Safe Reinforcement Learning Using State Augmentation}, - author={Sootla, Aivar and Cowen-Rivers, Alexander I and Jafferjee, Taher and Wang, Ziyan and Mguni, David and Wang, Jun and Bou-Ammar, Haitham}, - journal={arXiv preprint arXiv:2202.06558}, - year={2022} -} -``` diff --git a/SAUTE/common/__init__.py b/SAUTE/common/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/SAUTE/common/argument_parser.py b/SAUTE/common/argument_parser.py deleted file mode 100644 index 3e6821d6..00000000 --- a/SAUTE/common/argument_parser.py +++ /dev/null @@ -1,18 +0,0 @@ -import argparse - -class GeneralArgumentParser(object): - """"Argument parser for the main running file main.py.""" - def __init__(self): - self.parser = argparse.ArgumentParser(description='General argument parser') - self.general_parameters() - - def parse_args(self): - return self.parser.parse_args() - - def general_parameters(self): - self.parser.add_argument('--experiment', type=int, default=0, - help='Experiment number') - self.parser.add_argument('--num-exps', type=int, default=1, - help='number of simultaneous experiments to run in the parameter sweep (default 1)') - self.parser.add_argument('--smoketest', default=False, action='store_true', - help='Perform a smoke test (default False)') \ No newline at end of file diff --git a/SAUTE/common/base_runner.py b/SAUTE/common/base_runner.py deleted file mode 100644 index 7a170876..00000000 --- a/SAUTE/common/base_runner.py +++ /dev/null @@ -1,327 +0,0 @@ -import gym -import os -from tensorboardX import SummaryWriter - -from typing import Tuple, Dict, List, Callable -import itertools -# environments -from envs.mountain_car.mountain_car import mcar_cfg -from envs.pendula.single_pendulum import pendulum_cfg -from envs.pendula.double_pendulum import double_pendulum_cfg -from envs.reacher.reacher import reacher_cfg - - -#utils -from common.utils import set_overrides, create_path - -import logging -import json - -def is_safety_gym_env(env_name): - """ - Checks if the environment is a safety gym environment. - :returns: True if the environment is safety gym environment - """ - return ('Point' in env_name or 'Car' in env_name or 'Doggo' in env_name) and \ - ('Goal' in env_name) and \ - ('Static' in env_name or 'Dynamic' in env_name) - -class BaseRunner: - """Base class for learning the polices.""" - def __init__( - self, - experiment_name:str, - agent_name:str, - task_name:str, - param_sweep_lists:List[List], - agent_cfg_overrides:Dict, - env_cfg_overrides:Dict - ): - self.experiment_name = experiment_name - self.agent_name = agent_name - self.task_name = task_name - self.param_sweep_lists = param_sweep_lists - self.all_overrides = itertools.product(*param_sweep_lists) - self.agent_cfg_overrides = agent_cfg_overrides - self.env_cfg_overrides = env_cfg_overrides - - @staticmethod - def create_env( - agent_cfg:Dict, - env_cfg_override:Dict - ) -> Tuple[Callable, Callable, Dict]: - """ - Script for creating environments specified in cofiguration files. 
- :param agent_cfg: dictionary with the agent config files - :param env_cfg_override: dictionary with ovverides for the environment config files - """ - if is_safety_gym_env(agent_cfg['env_name']): - if 'Static' in agent_cfg['env_name']: - from envs.safety_gym.augmented_sg_envs import static_engine_cfg - engine_cfg = static_engine_cfg - elif 'Dynamic' in agent_cfg['env_name']: - from envs.safety_gym.augmented_sg_envs import dynamic_engine_cfg - engine_cfg = dynamic_engine_cfg - if 'Point' in agent_cfg['env_name']: - engine_cfg['robot_base'] = 'xmls/point.xml' - elif 'Car' in agent_cfg['env_name']: - engine_cfg['robot_base'] = 'xmls/car.xml' - elif 'Doggo' in agent_cfg['env_name']: - engine_cfg['robot_base'] = 'xmls/doggo.xml' - if 'Goal' in agent_cfg['env_name']: - engine_cfg['task'] = 'goal' - if 'Sauted' in agent_cfg['env_name']: - from envs.safety_gym.augmented_sg_envs import AugmentedSafeEngine, saute_env_cfg - env_cfg = saute_env_cfg - engine_cfg['num_steps'] = env_cfg['max_ep_len'] - train_env_fn = lambda: AugmentedSafeEngine( - safety_budget=agent_cfg['safety_budget'], - saute_discount_factor=env_cfg['saute_discount_factor'], - max_ep_len=agent_cfg['max_ep_len'], - mode="train", - unsafe_reward=env_cfg['unsafe_reward'], - min_rel_budget=env_cfg['min_rel_budget'], - max_rel_budget=env_cfg['max_rel_budget'], - test_rel_budget=env_cfg['test_rel_budget'], - use_reward_shaping=env_cfg['use_reward_shaping'], - use_state_augmentation=env_cfg['use_state_augmentation'], - engine_cfg=engine_cfg) - env_cfg['mode'] = "test" - test_env_fn = lambda: AugmentedSafeEngine( - safety_budget=agent_cfg['safety_budget'], - saute_discount_factor=env_cfg['saute_discount_factor'], - max_ep_len=agent_cfg['max_ep_len'], - mode="train", - unsafe_reward=env_cfg['unsafe_reward'], - min_rel_budget=env_cfg['min_rel_budget'], - max_rel_budget=env_cfg['max_rel_budget'], - test_rel_budget=env_cfg['test_rel_budget'], - use_reward_shaping=env_cfg['use_reward_shaping'], - use_state_augmentation=env_cfg['use_state_augmentation'], - engine_cfg=engine_cfg) - else: - from envs.safety_gym.augmented_sg_envs import BaselineEngine, baseline_env_cfg - env_cfg = baseline_env_cfg - engine_cfg['num_steps']=env_cfg['max_ep_len'] - train_env_fn = lambda: BaselineEngine( - max_ep_len=env_cfg['max_ep_len'], - mode="train", - engine_cfg=engine_cfg) - test_env_fn = lambda: BaselineEngine( - max_ep_len=env_cfg['max_ep_len'], - mode="train", - engine_cfg=engine_cfg) - agent_cfg['max_ep_len'] = env_cfg['max_ep_len'] - elif agent_cfg['env_name'] == 'MountainCar': - env_cfg = set_overrides(mcar_cfg, env_cfg_override) - agent_cfg['max_ep_len'] = env_cfg['max_ep_len'] - train_env_fn = lambda : gym.make( - "SafeMountainCar-v0", - mode="train", - ) - test_env_fn = lambda : gym.make( - "SafeMountainCar-v0", - mode="test", - ) - elif agent_cfg['env_name'] == 'SautedMountainCar': - env_cfg = set_overrides(mcar_cfg, env_cfg_override) - agent_cfg['max_ep_len'] = env_cfg['max_ep_len'] - train_env_fn = lambda : gym.make("SautedMountainCar-v0", - safety_budget=agent_cfg['safety_budget'], - saute_discount_factor=env_cfg['saute_discount_factor'], - max_ep_len=agent_cfg['max_ep_len'], - mode="train", - unsafe_reward=env_cfg['unsafe_reward'], - min_rel_budget=env_cfg['min_rel_budget'], - max_rel_budget=env_cfg['max_rel_budget'], - test_rel_budget=env_cfg['test_rel_budget'], - use_reward_shaping=env_cfg['use_reward_shaping'], - use_state_augmentation=env_cfg['use_state_augmentation'], - ) - test_env_fn = lambda : gym.make("SautedMountainCar-v0", - 
safety_budget=agent_cfg['safety_budget'], - saute_discount_factor=env_cfg['saute_discount_factor'], - max_ep_len=agent_cfg['max_ep_len'], - mode="test", - unsafe_reward=env_cfg['unsafe_reward'], - min_rel_budget=env_cfg['min_rel_budget'], - max_rel_budget=env_cfg['max_rel_budget'], - test_rel_budget=env_cfg['test_rel_budget'], - use_reward_shaping=env_cfg['use_reward_shaping'], - use_state_augmentation=env_cfg['use_state_augmentation'], - ) - elif agent_cfg['env_name'] == 'Pendulum': - env_cfg = set_overrides(pendulum_cfg, env_cfg_override) - agent_cfg['max_ep_len'] = env_cfg['max_ep_len'] - train_env_fn = lambda : gym.make( - "SafePendulum-v0", - mode="train", - ) - test_env_fn = lambda : gym.make( - "SafePendulum-v0", - mode="test", - ) - elif agent_cfg['env_name'] == 'SautedPendulum': - env_cfg = set_overrides(pendulum_cfg, env_cfg_override) - agent_cfg['max_ep_len'] = env_cfg['max_ep_len'] - train_env_fn = lambda : gym.make("SautedPendulum-v0", - safety_budget=agent_cfg['safety_budget'], - saute_discount_factor=env_cfg['saute_discount_factor'], - max_ep_len=agent_cfg['max_ep_len'], - mode="train", - unsafe_reward=env_cfg['unsafe_reward'], - min_rel_budget=env_cfg['min_rel_budget'], - max_rel_budget=env_cfg['max_rel_budget'], - test_rel_budget=env_cfg['test_rel_budget'], - use_reward_shaping=env_cfg['use_reward_shaping'], - use_state_augmentation=env_cfg['use_state_augmentation'], - ) - test_env_fn = lambda : gym.make("SautedPendulum-v0", - safety_budget=agent_cfg['safety_budget'], - saute_discount_factor=env_cfg['saute_discount_factor'], - max_ep_len=agent_cfg['max_ep_len'], - mode="test", - unsafe_reward=env_cfg['unsafe_reward'], - min_rel_budget=env_cfg['min_rel_budget'], - max_rel_budget=env_cfg['max_rel_budget'], - test_rel_budget=env_cfg['test_rel_budget'], - use_reward_shaping=env_cfg['use_reward_shaping'], - use_state_augmentation=env_cfg['use_state_augmentation'], - ) - elif agent_cfg['env_name'] == 'DoublePendulum': - env_cfg = set_overrides(double_pendulum_cfg, env_cfg_override) - agent_cfg['max_ep_len'] = env_cfg['max_ep_len'] - train_env_fn = lambda : gym.make("SafeDoublePendulum-v0", mode="train") - test_env_fn = lambda : gym.make("SafeDoublePendulum-v0", mode="test") - elif agent_cfg['env_name'] == 'SautedDoublePendulum': - env_cfg = set_overrides(double_pendulum_cfg, env_cfg_override) - agent_cfg['max_ep_len'] = env_cfg['max_ep_len'] - train_env_fn = lambda : gym.make("SautedDoublePendulum-v0", - safety_budget=agent_cfg['safety_budget'], - mode="train", - saute_discount_factor=env_cfg['saute_discount_factor'], - unsafe_reward=env_cfg['unsafe_reward'], - max_ep_len=agent_cfg['max_ep_len'], - min_rel_budget=env_cfg['min_rel_budget'], - max_rel_budget=env_cfg['max_rel_budget'], - test_rel_budget=env_cfg['test_rel_budget'], - use_reward_shaping=env_cfg['use_reward_shaping'], - use_state_augmentation=env_cfg['use_state_augmentation'] - ) - test_env_fn = lambda : gym.make("SautedDoublePendulum-v0", - safety_budget=agent_cfg['safety_budget'], - saute_discount_factor=env_cfg['saute_discount_factor'], - mode="test", - max_ep_len=agent_cfg['max_ep_len'], - unsafe_reward=env_cfg['unsafe_reward'], - min_rel_budget=env_cfg['min_rel_budget'], - max_rel_budget=env_cfg['max_rel_budget'], - test_rel_budget=env_cfg['test_rel_budget'], - use_reward_shaping=env_cfg['use_reward_shaping'], - use_state_augmentation=env_cfg['use_state_augmentation'] - ) - elif agent_cfg['env_name'] == 'Reacher': - env_cfg = set_overrides(reacher_cfg, env_cfg_override) - agent_cfg['max_ep_len'] = 
env_cfg['max_ep_len'] - train_env_fn = lambda : gym.make( - "SafeReacher-v0", - mode="train", - ) - test_env_fn = lambda : gym.make( - "SafeReacher-v0", - mode="test", - ) - elif agent_cfg['env_name'] == 'SautedReacher': - env_cfg = set_overrides(reacher_cfg, env_cfg_override) - agent_cfg['max_ep_len'] = env_cfg['max_ep_len'] - train_env_fn = lambda : gym.make("SautedReacher-v0", - safety_budget=agent_cfg['safety_budget'], - saute_discount_factor=env_cfg['saute_discount_factor'], - max_ep_len=agent_cfg['max_ep_len'], - unsafe_reward=env_cfg['unsafe_reward'], - mode="train", - min_rel_budget=env_cfg['min_rel_budget'], - max_rel_budget=env_cfg['max_rel_budget'], - test_rel_budget=env_cfg['test_rel_budget'], - use_reward_shaping=env_cfg['use_reward_shaping'], - use_state_augmentation=env_cfg['use_state_augmentation'] - ) - test_env_fn = lambda : gym.make("SautedReacher-v0", - safety_budget=agent_cfg['safety_budget'], - saute_discount_factor=env_cfg['saute_discount_factor'], - max_ep_len=agent_cfg['max_ep_len'], - unsafe_reward=env_cfg['unsafe_reward'], - mode="train", - min_rel_budget=env_cfg['min_rel_budget'], - max_rel_budget=env_cfg['max_rel_budget'], - test_rel_budget=env_cfg['test_rel_budget'], - use_reward_shaping=env_cfg['use_reward_shaping'], - use_state_augmentation=env_cfg['use_state_augmentation'] - ) - else: - raise NotImplementedError(f"Env {agent_cfg['env_name']} is not implemented") - return train_env_fn, test_env_fn, agent_cfg, env_cfg - - def set_all_overrides(self): - """Creating the configrations for all experiments including paths.""" - all_agent_cfg_overrides, all_env_cfg_overrides = {}, {} - experiment_paths = [] - for count, overrides in enumerate(self.all_overrides): - cur_agent_overrides = {} - cur_env_overrides = {} - cur_params = {} - for override in overrides: - if override[0] == 'agent_cfg_overrides': - cur_agent_overrides[override[1]] = override[2] - if override[0] == 'env_cfg_overrides': - cur_env_overrides[override[1]] = override[2] - cur_params[override[1]] = override[2] - all_agent_cfg_overrides[count] = set_overrides(self.agent_cfg_overrides, cur_agent_overrides) - all_env_cfg_overrides[count] = set_overrides(self.env_cfg_overrides, cur_env_overrides) - experiment_paths.append( - create_path( - experiment_name=self.experiment_name, - agent_name=self.agent_name, - task_name=self.task_name, - params=cur_params) - ) - return all_agent_cfg_overrides, all_env_cfg_overrides, experiment_paths - - def setup_log(self, exp_dir:str, agent_cfg:Dict, env_cfg:Dict) -> Tuple[SummaryWriter, Callable, Callable]: - """ - Setting the log for the experiment. 
- :param exp_dir: string specifying the directory to save experiment data - :param agent_cfg: dictionary with the agent config files - :param env_cfg: dictionary with the environment config files - """ - if agent_cfg['log']: - train_dir = os.path.join(exp_dir, "train") - if not os.path.isdir(train_dir): - os.makedirs(train_dir) - test_dir = os.path.join(exp_dir, 'test') - if not os.path.isdir(test_dir): - os.makedirs(test_dir) - writer = SummaryWriter(log_dir=train_dir) - if agent_cfg['log_updates']: - logging.basicConfig(level=logging.INFO, - format='%(message)s', - filename=train_dir + '/logs.txt', - filemode='w') - console = logging.StreamHandler() - console.setLevel(logging.INFO) - log = logging.getLogger() - log.addHandler(console) - with open(os.path.join(train_dir, "configurations.json"), 'w') as json_file: - json.dump(agent_cfg, json_file, sort_keys=False, indent=4) - json_file.write(',\n') - json.dump(env_cfg, json_file, sort_keys=False, indent=4) - else: - writer = None - train_dir = None - test_dir = None - if self.agent_cfg['log_updates']: - logging.basicConfig(format='%(message)s', level=logging.INFO) - logging.info(agent_cfg) - logging.info(env_cfg) - return writer, train_dir, test_dir - diff --git a/SAUTE/common/utils.py b/SAUTE/common/utils.py deleted file mode 100644 index 362b60dd..00000000 --- a/SAUTE/common/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -import copy -import os -from typing import Dict - - -def set_overrides(cfg:Dict, overrides:Dict): - """ - Overriding the specified entries in the dictionary. - :param overrides: override entries - :param cfg: main dictionary - """ - new_cfg = copy.deepcopy(cfg) - if overrides: - for key in overrides.keys(): - new_cfg[key] = overrides[key] - return new_cfg - -def create_path(experiment_name:str, agent_name:str, task_name:str, params:Dict): - """Create a path for saving the experiments.""" - exp_dir = os.path.join('logs', - experiment_name, - task_name, - agent_name) - for key in params.keys(): - exp_dir = os.path.join(exp_dir, key + '_' + str(params[key])) - if not os.path.isdir(exp_dir): - os.makedirs(exp_dir) - return exp_dir \ No newline at end of file diff --git a/SAUTE/envs/__init__.py b/SAUTE/envs/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/SAUTE/envs/mountain_car/__init__.py b/SAUTE/envs/mountain_car/__init__.py deleted file mode 100644 index 81898f6c..00000000 --- a/SAUTE/envs/mountain_car/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from envs.mountain_car.mountain_car import SafeMountainCarEnv, SautedMountainCarEnv, mcar_cfg -from gym.envs import register - -print('LOADING SAFE ENVIROMENTS') - -register( - id='SafeMountainCar-v0', - entry_point='envs.mountain_car:SafeMountainCarEnv', - max_episode_steps=mcar_cfg['max_ep_len'], -) - -register( - id='SautedMountainCar-v0', - entry_point='envs.mountain_car:SautedMountainCarEnv', - max_episode_steps=mcar_cfg['max_ep_len'], -) diff --git a/SAUTE/envs/mountain_car/mountain_car.py b/SAUTE/envs/mountain_car/mountain_car.py deleted file mode 100644 index fc3948e8..00000000 --- a/SAUTE/envs/mountain_car/mountain_car.py +++ /dev/null @@ -1,79 +0,0 @@ -import numpy as np -import math -from gym.envs.classic_control.continuous_mountain_car import Continuous_MountainCarEnv -from envs.wrappers.saute_env import saute_env -from envs.wrappers.safe_env import SafeEnv - -mcar_cfg = dict( - action_dim=1, - action_range=[-1, 1], - unsafe_reward=0., - saute_discount_factor=1.0, - max_ep_len=999, - min_rel_budget=1.0, - max_rel_budget=1.0, - test_rel_budget=1.0, - 
use_reward_shaping=True, - use_state_augmentation=True -) - -class OurMountainCarEnv(Continuous_MountainCarEnv): - def step(self, action): - position = prev_position = self.state[0] - velocity = self.state[1] - force = min(max(action[0], self.min_action), self.max_action) - - velocity += force * self.power - 0.0025 * math.cos(3 * position) - if velocity > self.max_speed: - velocity = self.max_speed - if velocity < -self.max_speed: - velocity = -self.max_speed - position += velocity - if position > self.max_position: - position = self.max_position - if position < self.min_position: - position = self.min_position - if position == self.min_position and velocity < 0: - velocity = 0 - - # Convert a possible numpy bool to a Python bool. - done = bool(position >= self.goal_position and velocity >= self.goal_velocity) - - reward = position - prev_position #+ 0.1 * (velocity - self.goal_velocity) - if done: - reward += 10.0 - # reward -= math.pow(action[0], 2) * 0.1 # remove penalty on action - - self.state = np.array([position, velocity], dtype=np.float32) - return self.state, reward, done, {} - -class SafeMountainCarEnv(SafeEnv, OurMountainCarEnv): - """Safe Mountain Car Environment.""" - def __init__(self, mode:int="train", **kwargs): - self._mode = mode - super().__init__(**kwargs) - - def _get_obs(self): - return self.state - - def reset(self): - if self._mode == "train": - # making our lives easier with random starts - self.state = np.array([ - self.np_random.uniform(low=-0.6, high=0.4), - self.np_random.uniform(low=-self.max_speed, high=self.max_speed) - ]) - elif self._mode == "test": - self.state = np.array([ - self.np_random.uniform(low=-0.6, high=-0.4), - 0 - ]) - return np.array(self.state) - - def _safety_cost_fn(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray) -> np.ndarray: - """Computes a fuel cost on the mountain car""" - return np.linalg.norm(actions) - -@saute_env -class SautedMountainCarEnv(SafeMountainCarEnv): - """Sauted safe mountain car.""" diff --git a/SAUTE/envs/mountain_car/test.py b/SAUTE/envs/mountain_car/test.py deleted file mode 100644 index a3ca1592..00000000 --- a/SAUTE/envs/mountain_car/test.py +++ /dev/null @@ -1,19 +0,0 @@ -import sys -sys.path.append(".") -import envs.mountain_car -import gym - -if __name__ == "__main__": - env = gym.make('SautedMountainCar-v0') - env.reset() - states, actions, next_states, rewards, dones, infos = [env.reset()], [], [], [], [], [] - for _ in range(300000): - a = env.action_space.sample() - s, r, d, i = env.step(a) - states.append(s) - actions.append(a) - next_states.append(s) - rewards.append(r) - dones.append(d) - infos.append(i) - print("dones") \ No newline at end of file diff --git a/SAUTE/envs/pendula/__init__.py b/SAUTE/envs/pendula/__init__.py deleted file mode 100644 index 97c6b3e3..00000000 --- a/SAUTE/envs/pendula/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -from envs.pendula.single_pendulum import pendulum_cfg, SafePendulumEnv, SautedPendulumEnv -from envs.pendula.double_pendulum import double_pendulum_cfg, SafeDoublePendulumEnv, SautedDoublePendulumEnv -from gym.envs import register - -print('LOADING SAFE ENVIROMENTS') - -register( - id='SafePendulum-v0', - entry_point='envs.pendula:SafePendulumEnv', - max_episode_steps=pendulum_cfg['max_ep_len'] -) - -register( - id='SautedPendulum-v0', - entry_point='envs.pendula:SautedPendulumEnv', - max_episode_steps=pendulum_cfg['max_ep_len'] -) - -register( - id='SafeDoublePendulum-v0', - entry_point='envs.pendula:SafeDoublePendulumEnv', - 
max_episode_steps=double_pendulum_cfg['max_ep_len'] -) - -register( - id='SautedDoublePendulum-v0', - entry_point='envs.pendula:SautedDoublePendulumEnv', - max_episode_steps=double_pendulum_cfg['max_ep_len'] -) \ No newline at end of file diff --git a/SAUTE/envs/pendula/double_pendulum.py b/SAUTE/envs/pendula/double_pendulum.py deleted file mode 100644 index b339900c..00000000 --- a/SAUTE/envs/pendula/double_pendulum.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np -from gym.envs.mujoco import InvertedDoublePendulumEnv -from envs.wrappers.saute_env import saute_env -from envs.wrappers.safe_env import SafeEnv - -from typing import Dict, Tuple - -double_pendulum_cfg = dict( - action_dim=1, - action_range=[ - -1, - 1], - unsafe_reward=-200., - saute_discount_factor=1.0, - max_ep_len=200, - min_rel_budget=1.0, - max_rel_budget=1.0, - test_rel_budget=1.0, - use_reward_shaping=True, - use_state_augmentation=True - -) - -class DoublePendulumEnv(InvertedDoublePendulumEnv): - """Custom double pendulum.""" - def __init__(self, mode="train"): - assert mode == "train" or mode == "test" or mode == "deterministic", "mode can be deterministic, test or train" - self._mode = mode - super().__init__() - - def step(self, action:np.ndarray) -> Tuple[np.ndarray, float, bool, Dict]: - next_state, reward, done, info = super().step(action) - reward /= 10. # adjusting the reward to match the cost - return next_state, reward, done, info - -class SafeDoublePendulumEnv(SafeEnv, DoublePendulumEnv): - """Safe double pendulum.""" - def __init__(self, **kwargs): - self.unsafe_min = np.pi * (-25. / 180.) - self.unsafe_max = np.pi * (75. / 180.) - self.unsafe_middle = 0.5 * (self.unsafe_max + self.unsafe_min) - self.max_distance = 0.5 * (self.unsafe_max - self.unsafe_min) - super().__init__(**kwargs) - - def _safety_cost_fn(self, state: np.ndarray, action: np.ndarray, next_state: np.ndarray) -> np.ndarray: - """Computes a linear safety cost between the current position - (if its near the unsafe area, aka in the hazard region) - and the centre of the unsafe region.""" - assert type(state) is np.ndarray and type(next_state) is np.ndarray and type(action) is np.ndarray, "Arguments must be np.ndarray" - thetas = np.arctan2(state[..., 1], state[..., 3]) - dist_to_center = np.abs(self.unsafe_middle - thetas) - unsafe_mask = np.float64(((self.unsafe_min) <= thetas) & (thetas <= (self.unsafe_max))) - costs = ((self.max_distance - dist_to_center) / (self.max_distance)) * unsafe_mask - return costs - -@saute_env -class SautedDoublePendulumEnv(SafeDoublePendulumEnv): - """Sauted safe double pendulum.""" - diff --git a/SAUTE/envs/pendula/single_pendulum.py b/SAUTE/envs/pendula/single_pendulum.py deleted file mode 100644 index ff8d73e5..00000000 --- a/SAUTE/envs/pendula/single_pendulum.py +++ /dev/null @@ -1,216 +0,0 @@ -import gym -from gym import spaces -from gym.utils import seeding -import numpy as np -from typing import Callable, List, Dict, Tuple -import torch -from os import path -from envs.utils import angle_normalize, Array -from envs.wrappers.saute_env import saute_env -from envs.wrappers.safe_env import SafeEnv - -class PendulumSwingUpParams: - """Params for the system dynamics""" - g = 10. - m = 1. - l = 1. - dt = .05 - max_speed = 8. - max_torque = 2. - - theta_penalty = 1. - theta_dot_penalty = .1 - action_penalty = 0.001 #.001 - reward_offset = np.ceil(theta_penalty * np.pi ** 2 + theta_dot_penalty * max_speed ** 2 + action_penalty * max_torque ** 2) - # reward_bias = reward_offset - - unsafe_min = np.pi * (20. 
/ 180) - unsafe_max = np.pi * (30. / 180) - hazard_area_size = np.pi * (1. / 4) - n_constraints = 1 - - def __str__(self): - _dyn_params = {'g': self.g, 'm': self.m, 'l':self.l, 'dt': self.dt} - _state_lims = { 'max_speed': self.max_speed, 'max_torque': self.max_torque} - _reward_params = {'theta_penalty': self.theta_penalty, 'theta_dot_penalty': self.theta_dot_penalty, 'action_penalty': self.action_penalty} - _safety_params = {'unsafe_min': self.unsafe_min, 'unsafe_max': self.unsafe_max, 'hazard_area_size':self.hazard_area_size, 'n_constraints': self.n_constraints} - return {"Dynamics parameters" : _dyn_params, "State Limits": _state_lims, "Reward Parameters": _reward_params, 'Safety Parameters': _safety_params}.__str__() - - -pendulum_cfg = { - 'action_dim' : 1, # are used - 'action_range': [-1, 1], # are used - 'unsafe_reward': 0., - 'saute_discount_factor':1.0, - 'max_ep_len':200, - 'min_rel_budget':1.0, - 'max_rel_budget':1.0, - 'test_rel_budget':1.0, - 'use_reward_shaping': True, - 'use_state_augmentation':True -} - -class PendulumEnv(gym.Env): - metadata = { - 'render.modes' : ['human', 'rgb_array'], - 'video.frames_per_second' : 30 - } - - # Used for labels when plotting. - obs_labels = [ - r'$\cos(\theta)$', - r'$\sin(\theta)$', - r'$\partial \theta$', - ] - - def __init__( - self, - params:Callable=None, - mode:str="train"): - self.viewer = None - if params is None: - params = PendulumSwingUpParams() - self.params = params - self.obs_high = np.array([1., 1., self.params.max_speed], dtype=np.float32) - self.observation_space = spaces.Box(low=-self.obs_high, high=self.obs_high) - action_high = np.float32(self.params.max_torque) - self.action_space = spaces.Box(low=-action_high, high=action_high, shape=(1,)) - - assert mode == "train" or mode == "test" or mode == "deterministic", "mode can be determinstic, test or train" - self._mode = mode - self.seed() - - def seed(self, seed:int=None) -> List[int]: - self.np_random, seed = seeding.np_random(seed) - return [seed] - - def do_simulation(self, u:np.ndarray): - """One step simulation of dynamics on the single pendulum""" - th, thdot = self.state # th := theta - dt = self.params.dt - u = self.params.max_torque * u - u = np.clip(u.squeeze(), -self.params.max_torque, self.params.max_torque) - self.last_u = u # for rendering - - newthdot = thdot + (-3 * self.params.g / (2 * self.params.l) * np.sin(th + np.pi) + 3. 
/ (self.params.m * self.params.l ** 2) * u) * dt - newth = th + newthdot * dt - newthdot = np.clip(newthdot, -self.params.max_speed, self.params.max_speed) # pylint: disable=E1111 - - self.state = np.array([newth, newthdot]) - - def step(self, action:np.ndarray) -> Tuple[np.ndarray, float, bool, Dict]: - obs = self._get_obs() - self.do_simulation(action) # bug fix do simulations with numpy actions not torch - next_obs = self._get_obs() - reward = self._reward_fn(obs, action, next_obs, is_tensor=False) - done = self._termination_fn(obs, action, next_obs, is_tensor=False) - info = dict() - return next_obs, reward, done, info - - def reset(self) -> np.ndarray: - if self._mode == "train": - high = np.array([np.pi, 1], dtype=np.float32) - self.state = self.np_random.uniform(low=-high, high=high) - elif self._mode == "test": - high = np.array([0.2, 0.1], dtype=np.float32) - low = np.array([-0.2, -0.1], dtype=np.float32) - self.state = np.array([np.pi, 0], dtype=np.float32) + self.np_random.uniform(low=low, high=high) - elif self._mode == "deterministic": - self.state = np.array([np.pi, 0], dtype=np.float32) - else: - raise NotImplementedError - self.last_u = None - return self._get_obs() - - def _get_obs(self) -> np.ndarray: - theta, thetadot = self.state - return np.array([np.cos(theta), np.sin(theta), thetadot]) - - def _reward_fn(self, states: Array, actions: Array, next_states: Array, is_tensor:bool=True) -> Array: - """Compute rewards in batch if needed - Mostly copied from openAI gym Pendulum-v0 and ported into torch. - https://github.com/openai/gym/blob/master/gym/envs/classic_control/pendulum.py """ - - actions = self.params.max_torque * actions - cos_th, sin_th, thdot = states[..., 0], states[..., 1], states[..., 2] - if is_tensor: - assert type(states) is torch.Tensor and type(next_states) is torch.Tensor and type(actions) is torch.Tensor, "Arguments must be torch.Tensor" - th = torch.atan2(sin_th, cos_th) - th_norm = angle_normalize(th, is_tensor=True) - action_squared = actions.clamp(-self.params.max_torque, self.params.max_torque) - costs = self.params.theta_penalty * th_norm ** 2 + self.params.theta_dot_penalty * thdot ** 2 + self.params.action_penalty * action_squared.squeeze() ** 2 - reward = (-costs + self.params.reward_offset ) / self.params.reward_offset - return reward.view(-1, 1) - else: - assert type(states) is np.ndarray and type(next_states) is np.ndarray and type(actions) is np.ndarray, "Arguments must be np.ndarray" - th = np.arctan2(sin_th, cos_th) - th_norm = angle_normalize(th, is_tensor=False) - action_squared = np.clip(actions, -self.params.max_torque, self.params.max_torque) - costs = self.params.theta_penalty * th_norm ** 2 + self.params.theta_dot_penalty * thdot ** 2 + self.params.action_penalty * action_squared.squeeze() ** 2 - reward = (-costs + self.params.reward_offset ) / self.params.reward_offset - return reward - - def reward_fn(self, states: Array, actions: Array, next_states: Array) -> Array: - """Compute rewards in batch if needed""" - return self._reward_fn(states, actions, next_states, is_tensor=True) - - def _termination_fn(self, states:Array, actions:Array, next_states: Array, is_tensor:bool=True) -> np.ndarray: - """Returns done""" - if is_tensor: - return torch.zeros(1,).cuda() - else: - return False - - def render(self, mode='human'): - - if self.viewer is None: - from gym.envs.classic_control import rendering - self.viewer = rendering.Viewer(500,500) - self.viewer.set_bounds(-2.2,2.2,-2.2,2.2) - rod = rendering.make_capsule(1, .2) - 
rod.set_color(.8, .3, .3) - self.pole_transform = rendering.Transform() - rod.add_attr(self.pole_transform) - self.viewer.add_geom(rod) - axle = rendering.make_circle(.05) - axle.set_color(0,0,0) - self.viewer.add_geom(axle) - fname = path.join(path.dirname(__file__), "assets/clockwise.png") - self.img = rendering.Image(fname, 1., 1.) - self.imgtrans = rendering.Transform() - self.img.add_attr(self.imgtrans) - - self.viewer.add_onetime(self.img) - self.pole_transform.set_rotation(self.state[0] + np.pi/2) - if self.last_u: - self.imgtrans.scale = (-self.last_u/2, np.abs(self.last_u)/2) - - return self.viewer.render(return_rgb_array = mode=='rgb_array') - - def close(self): - if self.viewer: self.viewer.close() - - -class SafePendulumEnv(SafeEnv, PendulumEnv): - """Safe Pendulum environment.""" - def _is_near_unsafe_area_batch(self, thetas): - return ((self.params.unsafe_min - self.params.hazard_area_size) <= thetas) & (thetas <= (self.params.unsafe_max + self.params.hazard_area_size)) - - def _safety_cost_fn(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray) -> np.ndarray: - """Computes a linear safety cost between the current position - (if its near the unsafe area, aka in the hazard region) - and the centre of the unsafe region""" - unsafe_angle_middle = 0.5 * (self.params.unsafe_max + self.params.unsafe_min) # 25 = (20 + 30) /2 - max_distance = self.params.hazard_area_size + (unsafe_angle_middle - self.params.unsafe_min) * 1.0 # 50 = 45 + (25 - 20) - assert type(states) is np.ndarray and type(next_states) is np.ndarray and type(actions) is np.ndarray, "Arguments must be np.ndarray" - thetas = np.arctan2(states[..., 1], states[..., 0]) - dist_to_center = np.abs(unsafe_angle_middle - thetas) # |25 - theta| - unsafe_mask = np.float64(self._is_near_unsafe_area_batch(thetas)) # 20-45 = -25 <= theta <= 75 = 30+45 - costs = ((max_distance - dist_to_center) / (max_distance)) * unsafe_mask - return costs - - -@saute_env -class SautedPendulumEnv(SafePendulumEnv): - """Sauted safe pendulum.""" - diff --git a/SAUTE/envs/pendula/test.py b/SAUTE/envs/pendula/test.py deleted file mode 100644 index 8e6d9590..00000000 --- a/SAUTE/envs/pendula/test.py +++ /dev/null @@ -1,27 +0,0 @@ -import sys -sys.path.append(".") -import envs.pendula -import gym - - -if __name__ == "__main__": - - env = gym.make('SautedDoublePendulum-v0',safety_budget=1.,saute_discount_factor=0.1,mode="deterministic") - env.reset() - env2 = gym.make('SafeDoublePendulum-v0', mode="deterministic") - env2.reset() - print(env.wrap._mode) - states, actions, next_states, rewards, dones, infos = [env.reset()], [], [], [], [], [] - for _ in range(3000): - a = env.action_space.sample() - s, r, d, i = env.step(a) - s2, r2, d2, i2 = env2.step(a) - print(s, s2) - print(r, r2) - states.append(s) - actions.append(a) - next_states.append(s) - rewards.append(r) - dones.append(d) - infos.append(i) - print("dones") \ No newline at end of file diff --git a/SAUTE/envs/reacher/__init__.py b/SAUTE/envs/reacher/__init__.py deleted file mode 100644 index f9d3061b..00000000 --- a/SAUTE/envs/reacher/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from envs.reacher.reacher import SafeReacherEnv, SautedReacherEnv, reacher_cfg -from gym.envs import register - -print('LOADING SAFE ENVIROMENTS') - -register( - id='SafeReacher-v0', - entry_point='envs.reacher:SafeReacherEnv', - max_episode_steps=reacher_cfg['max_ep_len'], -) - -register( - id='SautedReacher-v0', - entry_point='envs.reacher:SautedReacherEnv', - 
max_episode_steps=reacher_cfg['max_ep_len'], -) diff --git a/SAUTE/envs/reacher/reacher.py b/SAUTE/envs/reacher/reacher.py deleted file mode 100644 index 0e1ddfbf..00000000 --- a/SAUTE/envs/reacher/reacher.py +++ /dev/null @@ -1,96 +0,0 @@ -from gym import spaces -from gym.utils import seeding -from typing import Tuple, Dict, List - -from gym.envs.mujoco.reacher import ReacherEnv -import numpy as np -from envs.wrappers.saute_env import saute_env -from envs.wrappers.safe_env import SafeEnv - -reacher_cfg = { - 'action_dim': 1, - 'action_range': [-2, 2], - 'unsafe_reward': -3.75, - 'saute_discount_factor':1.0, - 'max_ep_len': 50, - 'min_rel_budget':1.0, - 'max_rel_budget':1.0, - 'test_rel_budget':1.0, - 'use_reward_shaping':True, - 'use_state_augmentation':True -} - -class CustomReacherEnv(ReacherEnv): - """Custom reacher.""" - def __init__( - self, - mode: str = "train" - ): - self.observation_space_high = np.array( - [np.pi, np.pi, np.pi, np.pi, 1., 1., 1., 1., 1., 1., 1.], dtype=np.float32) # TODO: figure out - self.observation_space = spaces.Box(low=-self.observation_space_high, high=self.observation_space_high) - self.target_position = np.array([0, 0, 0]) - assert mode == "train" or mode == "test" or mode == "deterministic", "mode can be deterministic, test or train" - self._mode = mode - self.seed() - super(CustomReacherEnv, self).__init__() - - def seed(self, seed:int=None) -> List[int]: - self.np_random, seed = seeding.np_random(seed) - return [seed] - - def step(self, action:np.ndarray) -> Tuple[np.ndarray, float, bool, Dict]: - vec = self.get_body_com("fingertip") - self.target_position - reward_dist = -np.linalg.norm(vec) - reward_ctrl = -np.square(action).sum() - reward = reward_dist + reward_ctrl - self.do_simulation(action, self.frame_skip) - next_state = self._get_obs() # next state - done = False - return next_state, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl) - - def reset(self) -> np.ndarray: - qpos = ( - self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) - + self.init_qpos - ) - self.goal = np.array([1.0, 1.0]) - self.target_position = np.concatenate([self.goal, [.01]]) - - qpos[-2:] = self.goal - qvel = self.init_qvel + self.np_random.uniform( - low=-0.005, high=0.005, size=self.model.nv - ) - qvel[-2:] = 0 - self.set_state(qpos, qvel) - return self._get_obs() - - def _get_obs(self) -> np.ndarray: - theta = self.sim.data.qpos.flat[:2] - return np.concatenate( - [ - np.cos(theta), - np.sin(theta), - self.sim.data.qpos.flat[2:], - self.sim.data.qvel.flat[:2], - self.get_body_com("fingertip") - self.target_position, - ] - ) - -class SafeReacherEnv(SafeEnv, CustomReacherEnv): - safety_center = np.array([[.50, .50, 0.0]]) - - def _safety_cost_fn(self, state:np.ndarray, action:np.ndarray, next_state:np.ndarray) -> np.ndarray: - """Computes the safety cost.""" - safety_vec = self.get_body_com("fingertip") - self.safety_center - dist = np.linalg.norm(safety_vec) - if dist<0.5: - #Linearly increasse from 0 to 100 based on distance - return (1.0 - dist * 2.) 
* 100.0 - else: - return 0 - -@saute_env -class SautedReacherEnv(SafeReacherEnv): - """Sauted safe reacher.""" - diff --git a/SAUTE/envs/reacher/test.py b/SAUTE/envs/reacher/test.py deleted file mode 100644 index 9241c6d0..00000000 --- a/SAUTE/envs/reacher/test.py +++ /dev/null @@ -1,21 +0,0 @@ -import sys -sys.path.append(".") -import envs.reacher -import gym - - -if __name__ == "__main__": - - env = gym.make('SautedReacher-v0') - env.reset() - states, actions, next_states, rewards, dones, infos = [env.reset()], [], [], [], [], [] - for _ in range(300000): - a = env.action_space.sample() - s, r, d, i = env.step(a) - states.append(s) - actions.append(a) - next_states.append(s) - rewards.append(r) - dones.append(d) - infos.append(i) - print("dones") \ No newline at end of file diff --git a/SAUTE/envs/safety_gym/augmented_sg_envs.py b/SAUTE/envs/safety_gym/augmented_sg_envs.py deleted file mode 100644 index 23a136b0..00000000 --- a/SAUTE/envs/safety_gym/augmented_sg_envs.py +++ /dev/null @@ -1,136 +0,0 @@ -import numpy as np -from typing import Tuple, Dict, List -from safety_gym.envs.engine import Engine -from gym.utils import seeding -from envs.wrappers.saute_env import saute_env - -baseline_env_cfg = dict( - action_dim=2, - action_range=[-1, 1], - max_ep_len=200, # check? - mode="train" -) - -saute_env_cfg = dict( - action_dim=2, - action_range=[-1, 1], - saute_discount_factor=1.0, - safety_budget=15, - unsafe_reward=-1.0, - max_ep_len=200, # check? - min_rel_budget=1., - max_rel_budget=1., - test_rel_budget=1., - mode="train", - use_reward_shaping=True, - use_state_augmentation=True -) - -static_engine_cfg=dict( - placements_extents=[-1.5, -1.5, 1.5, 1.5], - goal_size=0.3, - goal_keepout=0.305, - goal_locations=[(1.1, 1.1)], - observe_goal_lidar=True, - observe_hazards=True, - constrain_hazards=True, - lidar_max_dist=3, - lidar_num_bins=16, - hazards_num=1, - hazards_size=0.7, - hazards_keepout=0.705, - hazards_locations=[(0, 0)] -) - -dynamic_engine_cfg = dict( - placements_extents=[-1.5, -1.5, 1.5, 1.5], - goal_size=0.3, - goal_keepout=0.305, - observe_goal_lidar=True, - observe_hazards=True, - constrain_hazards=True, - lidar_max_dist=3, - lidar_num_bins=16, - hazards_num=3, - hazards_size=0.3, - hazards_keepout=0.305 -) - - -class BaselineEngine(Engine): - """ - Base class for the safety gym environments - """ - def __init__( - self, - max_ep_len:int=200, - mode:str="train", - engine_cfg:Dict=None, - ): - super(BaselineEngine, self).__init__(engine_cfg) - assert mode == "train" or mode == "test" or mode == "deterministic", "mode can be deterministic, test or train" - assert max_ep_len > 0 - self.max_episode_steps = max_ep_len - self._mode = mode - - def seed(self, seed:int=None) -> List[int]: - super(BaselineEngine, self).seed(seed) - self.np_random, seed = seeding.np_random(self._seed) - return [seed] - - def step(self, action:np.ndarray) -> Tuple[np.ndarray, int, bool, Dict]: - obs, reward, done, info = super(BaselineEngine, self).step(action) - info['pos_com'] = self.world.robot_com() # saving position of the robot to plot - return obs, reward, done, info - - -@saute_env -class AugmentedSafeEngine(BaselineEngine): - """Sauted pendulum using a wrapper""" - - -if __name__ == '__main__': - envs = [ - 'StaticPointGoalEnv-v0', 'StaticCarGoalEnv-v0', - 'DynamicPointGoalEnv-v0', 'DynamicCarGoalEnv-v0', - 'DynamicPointDoggoEnv-v0', 'DynamicDoggoGoalEnv-v0', - 'SautedStaticPointGoalEnv-v0', 'SautedDynamicPointGoalEnv-v0', - 'SautedStaticCarGoalEnv-v0', 'SautedDynamicCarGoalEnv-v0', - 
'SautedStaticDoggoGoalEnv-v0', 'SautedDynamicDoggoGoalEnv-v0' - ] - def is_safety_gym_env(env_name): - return ('Point' in env_name or 'Car' in env_name or 'Doggo' in env_name) and \ - ('Goal' in env_name) and \ - ('Static' in env_name or 'Dynamic' in env_name) - for env_name in envs: - # env_name = envs[idx] - # print(env_name) - if is_safety_gym_env(env_name): - if 'Static' in env_name: - engine_cfg = static_engine_cfg - elif 'Dynamic' in env_name: - engine_cfg = dynamic_engine_cfg - if 'Point' in env_name: - engine_cfg['robot_base'] = 'xmls/point.xml' - elif 'Car' in env_name: - engine_cfg['robot_base'] = 'xmls/car.xml' - elif 'Doggo' in env_name: - engine_cfg['robot_base'] = 'xmls/doggo.xml' - if 'Goal' in env_name: - engine_cfg['task'] = 'goal' - - if 'Sauted' in env_name: - env = AugmentedSafeEngine(saute_env_cfg, engine_cfg) - else: - env = BaselineEngine(baseline_env_cfg, engine_cfg) - print(env_name, env, env.config['robot_base'], env.config['task']) - d = False - min_reward = 0 - max_reward = 0 - rewards = [] - for _ in range(1): - obs = env.reset() - while not d: - o, r, d, i = env.step(env.action_space.sample()) - rewards.append(r) - print(np.mean(rewards), np.std(rewards), min(rewards), max(rewards)) diff --git a/SAUTE/envs/utils.py b/SAUTE/envs/utils.py deleted file mode 100644 index b2578223..00000000 --- a/SAUTE/envs/utils.py +++ /dev/null @@ -1,13 +0,0 @@ -import torch -import numpy as np -from typing import Union -Array = Union[torch.Tensor, np.ndarray] - -def angle_normalize(theta:Array, is_tensor:bool=True) -> Array: - """Normalizes an angle theta to be between -pi and pi.""" - if is_tensor: - torch_pi = torch.Tensor(np.asarray(np.pi)) - return ((theta + torch_pi) % (2 * torch_pi)) - torch_pi - else: - return (((theta+np.pi) % (2*np.pi)) - np.pi) - diff --git a/SAUTE/envs/wrappers/safe_env.py b/SAUTE/envs/wrappers/safe_env.py deleted file mode 100644 index 22767234..00000000 --- a/SAUTE/envs/wrappers/safe_env.py +++ /dev/null @@ -1,22 +0,0 @@ -from gym import Env -import numpy as np - - -class SafeEnv(Env): - """Safe environment wrapper.""" - def step(self, action:np.ndarray) -> np.ndarray: - state = self._get_state() - next_state, reward, done, info = super().step(action) - info['cost'] = self._safety_cost_fn(state, action, next_state) - return next_state, reward, done, info - - def _get_state(self): - """Returns current state. Uses _get_obs() method if it is implemented.""" - if hasattr(self, "_get_obs"): - return self._get_obs() - else: - raise NotImplementedError("Please implement _get_obs method returning the current state") - - def _safety_cost_fn(self, state: np.ndarray, action: np.ndarray, next_state: np.ndarray) -> np.ndarray: - """Returns current safety cost.""" - raise NotImplementedError("Please implement _safety_cost_fn method returning the current safety cost") \ No newline at end of file diff --git a/SAUTE/envs/wrappers/saute_env.py b/SAUTE/envs/wrappers/saute_env.py deleted file mode 100644 index 7022083c..00000000 --- a/SAUTE/envs/wrappers/saute_env.py +++ /dev/null @@ -1,115 +0,0 @@ -import numpy as np -import torch -from gym import Env -from gym import spaces -from envs.utils import Array - -def saute_env(cls): - """ Class decorator for sauteing an environment. 
""" - class SauteEnv(Env): - def __init__( - self, - safety_budget:float=1.0, - saute_discount_factor:float=0.99, - max_ep_len:int=200, - min_rel_budget:float=1., # minimum relative (with respect to safety_budget) budget - max_rel_budget:float=1., # maximum relative (with respect to safety_budget) budget - test_rel_budget:float=1., # test relative budget - unsafe_reward:float=0, - use_reward_shaping:bool=True, # ablation - use_state_augmentation:bool=True, # ablation - **kwargs - ): - assert safety_budget > 0, "Please specify a positive safety budget" - assert saute_discount_factor > 0 and saute_discount_factor <= 1, "Please specify a discount factor in (0, 1]" - assert min_rel_budget <= max_rel_budget, "Minimum relative budget should be smaller or equal to maximum relative budget" - assert max_ep_len > 0 - - self.wrap = cls(**kwargs) - self.use_reward_shaping = use_reward_shaping - self.use_state_augmentation = use_state_augmentation - self.max_ep_len = max_ep_len - self.min_rel_budget = min_rel_budget - self.max_rel_budget = max_rel_budget - self.test_rel_budget = test_rel_budget - - if saute_discount_factor < 1: - safety_budget = safety_budget * (1 - saute_discount_factor ** self.max_ep_len) / (1 - saute_discount_factor) / self.max_ep_len - self._safety_budget = np.float32(safety_budget) - - self._safety_state = 1. - self._saute_discount_factor = saute_discount_factor - self._unsafe_reward = unsafe_reward - - self.action_space = self.wrap.action_space - self.obs_high = self.wrap.observation_space.high - self.obs_low = self.wrap.observation_space.low - if self.use_state_augmentation: - self.obs_high = np.array(np.hstack([self.obs_high, np.inf]), dtype=np.float32) - self.obs_low = np.array(np.hstack([self.obs_low, -np.inf]), dtype=np.float32) - self.observation_space = spaces.Box(high=self.obs_high, low=self.obs_low) - - @property - def safety_budget(self): - return self._safety_budget - - @property - def saute_discount_factor(self): - return self._saute_discount_factor - - @property - def unsafe_reward(self): - return self._unsafe_reward - - def reset(self) -> np.ndarray: - """Resets the environment.""" - state = self.wrap.reset() - if self.wrap._mode == "train": - self._safety_state = self.wrap.np_random.uniform(low=self.min_rel_budget, high=self.max_rel_budget) - elif self.wrap._mode == "test" or self.wrap._mode == "deterministic": - self._safety_state = self.test_rel_budget - else: - raise NotImplementedError("this error should not exist!") - augmented_state = self._augment_state(state, self._safety_state) - return augmented_state - - def _augment_state(self, state:np.ndarray, safety_state:np.ndarray): - """Augmenting the state with the safety state, if needed""" - augmented_state = np.hstack([state, safety_state]) if self.use_state_augmentation else state - return augmented_state - - def safety_step(self, cost:np.ndarray) -> np.ndarray: - """ Update the normalized safety state z' = (z - l / d) / gamma. """ - self._safety_state -= cost / self.safety_budget - self._safety_state /= self.saute_discount_factor - return self._safety_state - - def step(self, action): - """ Step through the environment. 
""" - next_obs, reward, done, info = self.wrap.step(action) - next_safety_state = self.safety_step(info['cost']) - info['true_reward'] = reward - info['next_safety_state'] = next_safety_state - reward = self.reshape_reward(reward, next_safety_state) - augmented_state = self._augment_state(next_obs, next_safety_state) - return augmented_state, reward, done, info - - def reshape_reward(self, reward:Array, next_safety_state:Array): - """ Reshaping the reward. """ - if self.use_reward_shaping: - reward = reward * (next_safety_state > 0) + self.unsafe_reward * (next_safety_state <= 0) - return reward - - def reward_fn(self, states: torch.Tensor, actions: torch.Tensor, next_states: torch.Tensor) -> torch.Tensor: - """ Compute rewards in a batch. """ - reward = self.wrap._reward_fn(states, actions, next_states, is_tensor=True) - if self.use_state_augmentation: - # shape reward for model-based predictions - reward = self.reshape_reward(reward, next_states[:, -1].view(-1, 1)) - return reward - - return SauteEnv - - -if __name__ == "__main__": - pass \ No newline at end of file diff --git a/SAUTE/exps/__init__.py b/SAUTE/exps/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v1.py b/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v1.py deleted file mode 100644 index 58e463fb..00000000 --- a/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v1.py +++ /dev/null @@ -1,94 +0,0 @@ - -from tf_algos.common.runner import TFRunner - -def run_tf_trpo_double_pendulum_v1( - experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - """ - Running Lagrangian TRPO, CPO, Vanilla TRPO, Saute TRPO on the double pendulum environment. - """ - if experiment_name is None: - experiment_name = 'performance' - task_name = 'DoublePendulum' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 0.99, # a necessary override - safety_discount_factor = 0.99, - checkpoint_frequency = 0, - epochs=300, - ) - - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - for agent_name in ['SauteTRPO', 'CPO', 'VanillaTRPO']: # Lagrangian TRPO is not run in this file - safety_discount_factors = [ - ['agent_cfg_overrides', 'safety_discount_factor', 0.99], - ] - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 40.0], - # ['agent_cfg_overrides', 'safety_budget', 20.0], - # ['agent_cfg_overrides', 'safety_budget', 60.0], - # ['agent_cfg_overrides', 'safety_budget', 80.0], - ] - - env_cfg_overrides = {} - param_list = [] - if agent_name == 'VanillaTRPO': - param_list = [seeds] - if agent_name == 'LagrangianTRPO' or agent_name == 'SauteLagrangianTRPO': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'CPO': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'SauteTRPO': - max_rel_budgets = [ - ['env_cfg_overrides', 'max_rel_budget', 1.0], - ] - min_rel_budgets = [ - ['env_cfg_overrides', 'min_rel_budget', 1.0], - ] - param_list = [safety_budgets, max_rel_budgets, min_rel_budgets, seeds] - if agent_name == 'SauteTRPO_allbudgets': - env_cfg_overrides = dict( - test_rel_budget=1.0, - ) - - max_rel_budgets = [ - ['env_cfg_overrides', 'max_rel_budget', 2.0], - ] - min_rel_budgets = [ - ['env_cfg_overrides', 'min_rel_budget', 0.1], - ] - 
safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 50.0], - ] - param_list = [safety_budgets, max_rel_budgets, min_rel_budgets, seeds] # seeds are the last - agent_name = 'SauteTRPO' # NB! - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - data_filename="test_results.csv", - num_exps=num_exps - ) - print("done") - \ No newline at end of file diff --git a/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v2.py b/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v2.py deleted file mode 100644 index bc858f5f..00000000 --- a/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v2.py +++ /dev/null @@ -1,110 +0,0 @@ - -from tf_algos.common.runner import TFRunner - -def run_tf_trpo_double_pendulum_v2( - experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - """ - Tuning Lagrangian TRPO. - """ - if experiment_name is None: - experiment_name = 'performance' - task_name = 'DoublePendulum' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 0.99, # a necessary override - safety_discount_factor = 0.99, - checkpoint_frequency = 0, - epochs=300, - ) - - safety_budgets = [ - # ['agent_cfg_overrides', 'safety_budget', 20.0], - ['agent_cfg_overrides', 'safety_budget', 40.0], - # ['agent_cfg_overrides', 'safety_budget', 60.0], - # ['agent_cfg_overrides', 'safety_budget', 80.0], - ] - penalty_lrs = [ - # ['agent_cfg_overrides', 'penalty_lr', 1e-3], - # ['agent_cfg_overrides', 'penalty_lr', 1e-2], - ['agent_cfg_overrides', 'penalty_lr', 5e-2], - ] - value_fn_lrs = [ - ['agent_cfg_overrides', 'value_fn_lr', 5e-3], - # ['agent_cfg_overrides', 'value_fn_lr', 1e-3], - # ['agent_cfg_overrides', 'value_fn_lr', 5e-4], - ] - backtrack_iterss = [ - ['agent_cfg_overrides', 'backtrack_iters', 20], - # ['agent_cfg_overrides', 'backtrack_iters', 15], - # ['agent_cfg_overrides', 'backtrack_iters', 10], - ] - steps_per_epochs = [ - # ['agent_cfg_overrides', 'steps_per_epoch', 20000], - # ['agent_cfg_overrides', 'steps_per_epoch', 10000], - ['agent_cfg_overrides', 'steps_per_epoch', 4000] - ] - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - - for agent_name in ['LagrangianTRPO']: - safety_discount_factors = [ - ['agent_cfg_overrides', 'safety_discount_factor', 0.99], - ] - env_cfg_overrides = {} - param_list = [] - if agent_name == 'VanillaTRPO': - param_list = [seeds] - if agent_name == 'LagrangianTRPO': - param_list = [safety_budgets, safety_discount_factors, steps_per_epochs, backtrack_iterss, penalty_lrs, value_fn_lrs, seeds] - if agent_name == 'CPO': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'SauteTRPO': - max_rel_budgets = [ - ['env_cfg_overrides', 'max_rel_budget', 2.0], - ] - min_rel_budgets = [ - ['env_cfg_overrides', 'min_rel_budget', 0.1], - ] - param_list = [safety_budgets, max_rel_budgets, min_rel_budgets, seeds] - if agent_name == 'SauteTRPO_allbudgets': - env_cfg_overrides = dict( - test_rel_budget=1.0, - ) - max_rel_budgets = [ - ['env_cfg_overrides', 
'max_rel_budget', 2.0], - ] - min_rel_budgets = [ - ['env_cfg_overrides', 'min_rel_budget', 0.1], - ] - param_list = [safety_budgets, max_rel_budgets, min_rel_budgets, seeds] - agent_name = 'SauteTRPO' # NB! - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, # seeds are the last ## policy_lrs, penalty_lrs, - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - data_filename="test_results.csv", - num_exps=num_exps - ) - print("done") - \ No newline at end of file diff --git a/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v3.py b/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v3.py deleted file mode 100644 index e34de525..00000000 --- a/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v3.py +++ /dev/null @@ -1,95 +0,0 @@ - -from tf_algos.common.runner import TFRunner - -def run_tf_trpo_double_pendulum_v3( - experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - """Smart generalization to different safety budgtes for the double pendulum Saute TRPO (testing only).""" - if experiment_name is None: - experiment_name = 'smart_generalization' - task_name = 'DoublePendulum' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 0.99, # a necessary override - safety_discount_factor = 0.99, - checkpoint_frequency = 0, - epochs=300, - ) - - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - test_safety_budgets = [20.0, 40.0, 60.0, 80.0] - - max_rel_budgets = [ - ['env_cfg_overrides', 'max_rel_budget', 2.0], - ] - min_rel_budgets = [ - ['env_cfg_overrides', 'min_rel_budget', 0.1], - ] - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 50.0], - ] - ## training - - param_list = [] - env_cfg_overrides = dict() - param_list = [safety_budgets, max_rel_budgets, min_rel_budgets, seeds] # seeds are the last - agent_name = 'SauteTRPO' # NB! - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - data_filename="test_results.csv", - num_exps=num_exps - ) - - ## testing - for test_safety_budget in test_safety_budgets: - param_list = [] - env_cfg_overrides = dict( - test_rel_budget=test_safety_budget / safety_budgets[0][2] - ) - param_list = [safety_budgets, max_rel_budgets, min_rel_budgets, seeds] # seeds are the last - agent_name = 'SauteTRPO' # NB! 
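# Each sweep dimension above is a list of [config_name, key, value] overrides,
# e.g. ['agent_cfg_overrides', 'penalty_lr', 5e-2]; param_sweep_lists collects
# these dimensions (seeds listed last), and the runner appears to launch one run
# per combination of entries, applying each triple to the named config dict.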
- if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 1000000 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=False, - test=True, - evaluate_last_only=True, # computing only the results only for the last epoch - data_filename=f"{env_cfg_overrides['test_rel_budget']}_test_results.csv", - num_exps=num_exps - ) - print("done") - \ No newline at end of file diff --git a/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v4.py b/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v4.py deleted file mode 100644 index 2f6cf628..00000000 --- a/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v4.py +++ /dev/null @@ -1,67 +0,0 @@ - -from tf_algos.common.runner import TFRunner - -def run_tf_trpo_double_pendulum_v4( - experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - """"Ablation over unsafe reward value for the double pendulum Saute TRPO.""" - if experiment_name is None: - experiment_name = 'ablation/unsafe_val' - task_name = 'DoublePendulum' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 0.99, # a necessary override - safety_discount_factor = 0.99, - checkpoint_frequency = 0, - epochs=300, - ) - - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 40.0], - ] - unsafe_rewards = [ - ['env_cfg_overrides', 'unsafe_reward', -0.0], - ['env_cfg_overrides', 'unsafe_reward', -10.0], - ['env_cfg_overrides', 'unsafe_reward', -100.0], - ['env_cfg_overrides', 'unsafe_reward', -1000.0], - ['env_cfg_overrides', 'unsafe_reward', -10000.0], - ['env_cfg_overrides', 'unsafe_reward', -100000.0] - ] - for agent_name in ['SauteTRPO']: - env_cfg_overrides = {} - param_list = [] - if agent_name == 'SauteTRPO': - param_list = [safety_budgets, unsafe_rewards, seeds] - else: - raise NotImplementedError - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - data_filename="test_results.csv", - num_exps=num_exps - ) - print("done") - \ No newline at end of file diff --git a/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v5.py b/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v5.py deleted file mode 100644 index f9d2f8d9..00000000 --- a/SAUTE/exps/double_pendulum/tf_trpo_double_pendulum_v5.py +++ /dev/null @@ -1,60 +0,0 @@ - -from tf_algos.common.runner import TFRunner - -def run_tf_trpo_double_pendulum_v5( - experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - """Naive generalization to different safety budgtes for the double pendulum Saute TRPO (testing only).""" - if experiment_name is None: - experiment_name = 'performance' - task_name = 'DoublePendulum' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 
0.99, # a necessary override - safety_discount_factor = 1.0, - checkpoint_frequency = 0, - epochs=600, - ) - env_cfg_overrides = dict() - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - test_safety_budgets = [20.0] - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 60.0], - ] - - for test_safety_budget in test_safety_budgets: - param_list = [] - env_cfg_overrides['test_rel_budget'] = test_safety_budget / safety_budgets[0][2] - param_list = [safety_budgets, seeds] # seeds are the last ## policy_lrs, penalty_lrs, - agent_name = 'SauteTRPO' # NB! - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=False, # no training is needed, this flag should be false - test=True, - evaluate_last_only=True, # computing only the results only for the last epoch - data_filename=f"{test_safety_budget}_test_results.csv", - num_exps=num_exps - ) - print("done") - \ No newline at end of file diff --git a/SAUTE/exps/mountain_car/tf_sac_v1.py b/SAUTE/exps/mountain_car/tf_sac_v1.py deleted file mode 100644 index cd57de9b..00000000 --- a/SAUTE/exps/mountain_car/tf_sac_v1.py +++ /dev/null @@ -1,76 +0,0 @@ -from tf_algos.common.runner import TFRunner -""" -Runs SAC algorithms for safety: -VanillaSAC - Vanilla SAC, -LagrangianSAC - SAC with a lagrangian constraint, -SauteSAC - Saute SAC, -WorstCaseSAC - distributional SAC with a c-var constraint https://www.st.ewi.tudelft.nl/mtjspaan/pub/Yang21aaai.pdf . 
-""" -def run_tf_sac_v1( - experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - if experiment_name is None: - experiment_name = 'test2/example' - task_name = 'MountainCar' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 0.99, # a necessary override - steps_per_epoch = 999, - epochs=1000, - checkpoint_frequency = 0, - penalty_lr=5e-2, - n_test_episodes=100, - ) - - # parameter sweep - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 50.0], - ] - - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - - # 'SauteLagrangianSAC', 'VanillaSAC', 'LagrangianSAC', - for agent_name in [ 'SauteSAC']: - safety_discount_factors = [ - ['agent_cfg_overrides', 'safety_discount_factor', 0.99], - ] - env_cfg_overrides = {} - if agent_name == 'VanillaSAC': - param_list = [seeds] - if agent_name == 'WorstCaseSAC': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'LagrangianSAC' or agent_name == 'SauteLagrangianSAC': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'SauteSAC': - param_list = [safety_budgets, seeds] - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 1000000 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, # seeds are the last - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - # evaluate_last_only=True, - data_filename="test_results.csv", - num_exps=num_exps - ) - print("done") - \ No newline at end of file diff --git a/SAUTE/exps/reacher/tf_trpo_reacher_v1.py b/SAUTE/exps/reacher/tf_trpo_reacher_v1.py deleted file mode 100644 index 374b8c18..00000000 --- a/SAUTE/exps/reacher/tf_trpo_reacher_v1.py +++ /dev/null @@ -1,77 +0,0 @@ - -from tf_algos.common.runner import TFRunner - -def run_tf_trpo_reacher_v1( - experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - """Running experiments for reacher environment.""" - if experiment_name is None: - experiment_name = 'performance' # - task_name = 'Reacher' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 0.99, # a necessary override - checkpoint_frequency = 0, - epochs=200, - ) - - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 10.0], - ] - penalty_lrs = [ - ['agent_cfg_overrides', 'penalty_lr', 3e-2], - ] - value_fn_lrs = [ - ['agent_cfg_overrides', 'value_fn_lr', 1e-2], - ] - backtrack_iterss = [ - ['agent_cfg_overrides', 'backtrack_iters', 15], - ] - steps_per_epochs = [ - ['agent_cfg_overrides', 'steps_per_epoch', 4000] - ] - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - - for agent_name in ['LagrangianTRPO', 'SauteTRPO', 'VanillaTRPO', 'CPO']: - safety_discount_factors = [ - ['agent_cfg_overrides', 'safety_discount_factor', 0.99], - ] - env_cfg_overrides = {} - param_list = [] - if agent_name == 'VanillaTRPO': - param_list = [seeds] - if agent_name == 'LagrangianTRPO': - param_list = 
[safety_budgets, safety_discount_factors, value_fn_lrs, penalty_lrs, backtrack_iterss, steps_per_epochs, seeds] - if agent_name == 'CPO': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'SauteTRPO': - param_list = [safety_budgets, seeds] - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, # seeds are the last - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - data_filename="test_results.csv", - num_exps=num_exps - ) - print("done") diff --git a/SAUTE/exps/safety_gym/tf_trpo_safety_gym_v1.py b/SAUTE/exps/safety_gym/tf_trpo_safety_gym_v1.py deleted file mode 100644 index 1ec6f15d..00000000 --- a/SAUTE/exps/safety_gym/tf_trpo_safety_gym_v1.py +++ /dev/null @@ -1,77 +0,0 @@ -from tf_algos.common.runner import TFRunner - - -def run_tf_trpo_safety_gym_v1( - experiment_name: str = None, - num_exps: int = 1, - smoketest: bool = True -): - """Running experiments for safety gym environments.""" - if experiment_name is None: - experiment_name = 'performance' - task_name = 'StaticPointGoal' # 'StaticCarGoal' - # big overrides - agent_cfg_overrides = dict( - env_name=task_name, # a necessary override - discount_factor=0.99, # a necessary override - safety_discount_factor=0.99, - checkpoint_frequency=0, - epochs=500, - ) - - env_cfg_overrides = dict( - ) - # parameter sweep - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 10.0], - ] - penalty_lrs = [ - ['agent_cfg_overrides', 'penalty_lr', 3e-2], - ] - value_fn_lrs = [ - ['agent_cfg_overrides', 'value_fn_lr', 5e-3], - ] - steps_per_epochs = [ - ['agent_cfg_overrides', 'steps_per_epoch', 10000], - ] - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - for agent_name in ['VanillaTRPO', 'LagrangianTRPO', 'SauteTRPO', "CPO"]: - safety_discount_factors = [ - ['agent_cfg_overrides', 'safety_discount_factor', 0.99], - ] - env_cfg_overrides = {} - param_list = [] - if agent_name == 'VanillaTRPO': - param_list = [seeds] - if agent_name == 'LagrangianTRPO': - param_list = [safety_budgets, safety_discount_factors, penalty_lrs, value_fn_lrs, seeds] - if agent_name == 'CPO': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'SauteTRPO': - param_list = [safety_budgets, steps_per_epochs, seeds] - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, # seeds are the last - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - data_filename="test_results.csv", - num_exps=num_exps - ) - print("done") diff --git a/SAUTE/exps/single_pendulum/tf_ppo_pendulum_v1.py b/SAUTE/exps/single_pendulum/tf_ppo_pendulum_v1.py deleted file mode 100644 index 6fc72683..00000000 --- a/SAUTE/exps/single_pendulum/tf_ppo_pendulum_v1.py +++ /dev/null @@ -1,76 +0,0 @@ - -from tf_algos.common.runner import TFRunner - -def run_tf_ppo_pendulum_v1( - 
experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - """ - Runs PPO algorithms for safety: - VanillaPPO - Vanilla PPO, - LagrangianPPO - PPO with a lagrangian constraint, - SautePPO - Saute PPO, - SauteLangrangianPPO - Lagrangian PPO with safety state augmentation. - """ - - if experiment_name is None: - experiment_name = 'plug_n_play' - task_name = 'Pendulum' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 0.99, # a necessary override - checkpoint_frequency = 0, - n_test_episodes=100, - penalty_lr=5e-2, - epochs=200 - ) - - env_cfg_overrides = {} - - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 30.0], - ] - - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - - for agent_name in ['VanillaPPO', 'SautePPO', 'LagrangianPPO']: - safety_discount_factors = [ - ['agent_cfg_overrides', 'safety_discount_factor', 0.99], - ] - env_cfg_overrides = {} - param_list = [] - if agent_name == 'VanillaPPO': - param_list = [seeds] - if agent_name == 'LagrangianPPO': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'SautePPO': - param_list = [safety_budgets, seeds] - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, # seeds are the last ## policy_lrs, penalty_lrs, - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - data_filename="test_results.csv", - num_exps=num_exps - ) - print("done") - \ No newline at end of file diff --git a/SAUTE/exps/single_pendulum/tf_sac_pendulum_v1.py b/SAUTE/exps/single_pendulum/tf_sac_pendulum_v1.py deleted file mode 100644 index d722c6eb..00000000 --- a/SAUTE/exps/single_pendulum/tf_sac_pendulum_v1.py +++ /dev/null @@ -1,77 +0,0 @@ -from tf_algos.common.runner import TFRunner -""" -Runs SAC algorithms for safety: -VanillaSAC - Vanilla SAC, -LagrangianSAC - SAC with a lagrangian constraint, -SauteSAC - Saute SAC, -WorstCaseSAC - distributional SAC with a c-var constraint https://www.st.ewi.tudelft.nl/mtjspaan/pub/Yang21aaai.pdf . 
-""" -def run_tf_sac_pendulum_v1( - experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - if experiment_name is None: - experiment_name = 'plug_n_play' - task_name = 'Pendulum' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 0.99, # a necessary override - steps_per_epoch = 200, - epochs=200, - checkpoint_frequency = 0, - penalty_lr=5e-2, - n_test_episodes=100, - ) - - # parameter sweep - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 30.0], - ] - cls = [ - ['agent_cfg_overrides', 'cl', 0.3], - ] - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - - for agent_name in [ 'VanillaSAC', 'LagrangianSAC','SauteSAC']: - safety_discount_factors = [ - ['agent_cfg_overrides', 'safety_discount_factor', 0.99], - ] - env_cfg_overrides = {} - if agent_name == 'VanillaSAC': - param_list = [seeds] - if agent_name == 'WorstCaseSAC': - param_list = [safety_budgets, safety_discount_factors, cls, seeds] - if agent_name == 'LagrangianSAC': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'SauteSAC': - param_list = [safety_budgets, seeds] - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, # seeds are the last - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - data_filename="test_results.csv", - num_exps=num_exps - ) - - print("done") - \ No newline at end of file diff --git a/SAUTE/exps/single_pendulum/tf_trpo_pendulum_v1.py b/SAUTE/exps/single_pendulum/tf_trpo_pendulum_v1.py deleted file mode 100644 index 4616d51c..00000000 --- a/SAUTE/exps/single_pendulum/tf_trpo_pendulum_v1.py +++ /dev/null @@ -1,76 +0,0 @@ - -from tf_algos.common.runner import TFRunner - -def run_tf_trpo_pendulum_v1( - experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - """ - Runs TRPO algorithms for safety: - VanillaTRPO - Vanilla TRPO, - LagrangianTRPO - TRPO with a Lagrangian constraint, - SauteTRPO - Saute TRPO, - CPO - CPO, - SauteLangrangianTRPO - Lagrangian TRPO with safety state augmentation. 
- """ - if experiment_name is None: - experiment_name = 'plug_n_play' - task_name = 'Pendulum' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 0.99, # a necessary override - checkpoint_frequency = 0, - n_test_episodes=100, - penalty_lr=5e-2, - epochs=200, - ) - - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 30.0], - ] - - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - - for agent_name in [ 'CPO','SauteTRPO', 'VanillaTRPO', 'LagrangianTRPO']: - safety_discount_factors = [ - ['agent_cfg_overrides', 'safety_discount_factor', 0.99], - ] - env_cfg_overrides = {} - param_list = [] - if agent_name == 'VanillaTRPO': - param_list = [seeds] - if agent_name == 'LagrangianTRPO': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'CPO': - param_list = [safety_budgets, safety_discount_factors, seeds] - if agent_name == 'SauteTRPO': - param_list = [safety_budgets, seeds] - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, # seeds are the last - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - data_filename="test_results.csv", - num_exps=num_exps - ) - print("done") - \ No newline at end of file diff --git a/SAUTE/exps/single_pendulum/tf_trpo_pendulum_v2_abblation.py b/SAUTE/exps/single_pendulum/tf_trpo_pendulum_v2_abblation.py deleted file mode 100644 index 5f16f271..00000000 --- a/SAUTE/exps/single_pendulum/tf_trpo_pendulum_v2_abblation.py +++ /dev/null @@ -1,74 +0,0 @@ - -from tf_algos.common.runner import TFRunner - -def run_tf_trpo_pendulum_v2_abblation( - experiment_name:str=None, - num_exps:int=1, - smoketest:bool=True - ): - """" - Ablation study on Saute TRPO. 
- """ - if experiment_name is None: - experiment_name = 'ablation' - task_name = 'Pendulum' - # big overrides - agent_cfg_overrides = dict( - env_name = task_name, # a necessary override - discount_factor = 0.99, # a necessary override - safety_discount_factor=0.99, - checkpoint_frequency = 0, - n_test_episodes=100, - penalty_lr=5e-2, - epochs=200, - ) - - safety_budgets = [ - ['agent_cfg_overrides', 'safety_budget', 40.0], - ] - - seeds = [ - ['agent_cfg_overrides', 'seed', 42], - ['agent_cfg_overrides', 'seed', 4242], - ['agent_cfg_overrides', 'seed', 424242], - ['agent_cfg_overrides', 'seed', 42424242], - ['agent_cfg_overrides', 'seed', 4242424242], - ] - - for agent_name in ['SauteTRPO']: - env_cfg_overrides = {} - param_list = [] - - if agent_name == 'SauteTRPO': - use_reward_shapings = [ - ['env_cfg_overrides', 'use_reward_shaping', False], - ['env_cfg_overrides', 'use_reward_shaping', True] - ] - use_state_augmentations = [ - ['env_cfg_overrides', 'use_state_augmentation', True], - ['env_cfg_overrides', 'use_state_augmentation', False] - ] - - param_list = [use_reward_shapings, use_state_augmentations, safety_budgets, seeds] - - if smoketest: - agent_cfg_overrides['epochs'] = 2 - agent_cfg_overrides['checkpoint_frequency'] = 0 - experiment_name = 'test' - param_list = [[seeds[0]]] - runner = TFRunner( - experiment_name, - agent_name, - task_name, - param_sweep_lists=param_list, # seeds are the last ## policy_lrs, penalty_lrs, - agent_cfg_overrides=agent_cfg_overrides, - env_cfg_overrides=env_cfg_overrides, - ) - runner.run_experiment( - train=True, - test=False, - data_filename="test_results.csv", - num_exps=num_exps - ) - print("done") - \ No newline at end of file diff --git a/SAUTE/main.py b/SAUTE/main.py deleted file mode 100644 index 9e718ab0..00000000 --- a/SAUTE/main.py +++ /dev/null @@ -1,79 +0,0 @@ -from common.argument_parser import GeneralArgumentParser - - -if __name__ == "__main__": - - exp_parser = GeneralArgumentParser() - args = exp_parser.parse_args() - - current_experiment = args.experiment -### Mountain Car - if current_experiment == 0: - from exps.mountain_car.tf_sac_v1 import run_tf_sac_v1 - run_tf_sac_v1(num_exps=args.num_exps, smoketest=args.smoketest) -### Single Pendulum - elif current_experiment == 10: - from exps.single_pendulum.tf_sac_pendulum_v1 import run_tf_sac_pendulum_v1 - run_tf_sac_pendulum_v1(num_exps=args.num_exps, smoketest=args.smoketest) - elif current_experiment == 11: - from exps.single_pendulum.tf_ppo_pendulum_v1 import run_tf_ppo_pendulum_v1 - run_tf_ppo_pendulum_v1(num_exps=args.num_exps, smoketest=args.smoketest) - elif current_experiment == 12: - from exps.single_pendulum.tf_trpo_pendulum_v1 import run_tf_trpo_pendulum_v1 - run_tf_trpo_pendulum_v1(num_exps=args.num_exps, smoketest=args.smoketest) - elif current_experiment == 13: - from exps.single_pendulum.tf_trpo_pendulum_v2_abblation import run_tf_trpo_pendulum_v2_abblation - run_tf_trpo_pendulum_v2_abblation(num_exps=args.num_exps, smoketest=args.smoketest) -### Double Pendulum - elif current_experiment == 20: - from exps.double_pendulum.tf_trpo_double_pendulum_v1 import run_tf_trpo_double_pendulum_v1 - run_tf_trpo_double_pendulum_v1(num_exps=args.num_exps, smoketest=args.smoketest) - elif current_experiment == 21: - from exps.double_pendulum.tf_trpo_double_pendulum_v2 import run_tf_trpo_double_pendulum_v2 - run_tf_trpo_double_pendulum_v2(num_exps=args.num_exps, smoketest=args.smoketest) - elif current_experiment == 22: - from exps.double_pendulum.tf_trpo_double_pendulum_v5 import 
run_tf_trpo_double_pendulum_v5 - run_tf_trpo_double_pendulum_v5(num_exps=args.num_exps, smoketest=args.smoketest) - elif current_experiment == 23: - from exps.double_pendulum.tf_trpo_double_pendulum_v3 import run_tf_trpo_double_pendulum_v3 - run_tf_trpo_double_pendulum_v3(num_exps=args.num_exps, smoketest=args.smoketest) - elif current_experiment == 24: - from exps.double_pendulum.tf_trpo_double_pendulum_v4 import run_tf_trpo_double_pendulum_v4 - run_tf_trpo_double_pendulum_v4(num_exps=args.num_exps, smoketest=args.smoketest) -### Reacher - elif current_experiment == 30: - from exps.reacher.tf_trpo_reacher_v1 import run_tf_trpo_reacher_v1 - run_tf_trpo_reacher_v1(num_exps=args.num_exps, smoketest=args.smoketest) -### Safety Gym - elif current_experiment == 40: - from exps.safety_gym.tf_trpo_safety_gym_v1 import run_tf_trpo_safety_gym_v1 - run_tf_trpo_safety_gym_v1(num_exps=args.num_exps, smoketest=args.smoketest) -### testing for minor bugs - elif current_experiment == -1: # experminetal feature - from exps.single_pendulum.tf_sac_pendulum_v1 import run_tf_sac_pendulum_v1 - from exps.single_pendulum.tf_ppo_pendulum_v1 import run_tf_ppo_pendulum_v1 - from exps.single_pendulum.tf_trpo_pendulum_v1 import run_tf_trpo_pendulum_v1 - from exps.double_pendulum.tf_trpo_double_pendulum_v1 import run_tf_trpo_double_pendulum_v1 - from exps.double_pendulum.tf_trpo_double_pendulum_v2 import run_tf_trpo_double_pendulum_v2 - from exps.double_pendulum.tf_trpo_double_pendulum_v3 import run_tf_trpo_double_pendulum_v3 - from exps.double_pendulum.tf_trpo_double_pendulum_v4 import run_tf_trpo_double_pendulum_v4 - from exps.reacher.tf_trpo_reacher_v1 import run_tf_trpo_reacher_v1 - from exps.safety_gym.tf_trpo_safety_gym_v1 import run_tf_trpo_safety_gym_v1 - args.smoketest = -1 - ## single pendulum - run_tf_sac_pendulum_v1(num_exps=args.num_exps, smoketest=args.smoketest) - run_tf_ppo_pendulum_v1(num_exps=args.num_exps, smoketest=args.smoketest) - run_tf_trpo_pendulum_v1(num_exps=args.num_exps, smoketest=args.smoketest) - ## double pendulum - run_tf_trpo_double_pendulum_v1(num_exps=args.num_exps, smoketest=args.smoketest) - run_tf_trpo_double_pendulum_v2(num_exps=args.num_exps, smoketest=args.smoketest) - run_tf_trpo_double_pendulum_v3(num_exps=args.num_exps, smoketest=args.smoketest) - run_tf_trpo_double_pendulum_v4(num_exps=args.num_exps, smoketest=args.smoketest) - ## reacher - run_tf_trpo_reacher_v1(num_exps=args.num_exps, smoketest=args.smoketest) - ## safety gym - run_tf_trpo_safety_gym_v1(num_exps=args.num_exps, smoketest=args.smoketest) - else: - raise NotImplementedError - - \ No newline at end of file diff --git a/SAUTE/pip_repos.txt b/SAUTE/pip_repos.txt deleted file mode 100644 index 9b497f5f..00000000 --- a/SAUTE/pip_repos.txt +++ /dev/null @@ -1,71 +0,0 @@ -absl-py==0.14.1 -astor==0.8.1 -astunparse==1.6.3 -atari-py==0.2.9 -cached-property==1.5.2 -cachetools==4.2.4 -certifi==2021.5.30 -cffi==1.14.6 -charset-normalizer==2.0.6 -clang==5.0 -cloudpickle==1.2.2 -cycler==0.10.0 -Cython==0.29.24 -fasteners==0.16.3 -flatbuffers==1.12 -future==0.18.2 -gast==0.2.2 -glfw==2.3.0 -google-auth==1.35.0 -google-auth-oauthlib==0.4.6 -google-pasta==0.2.0 -grpcio==1.41.0 -gym==0.15.7 -h5py==3.1.0 -idna==3.2 -imageio==2.9.0 -importlib-metadata==4.8.1 -joblib==0.14.0 -keras==2.6.0 -Keras-Applications==1.0.8 -Keras-Preprocessing==1.1.2 -kiwisolver==1.3.2 -Markdown==3.3.4 -matplotlib==3.4.3 -mpi4py==3.0.2 -mujoco-py==2.0.2.7 -numpy==1.21.2 -oauthlib==3.1.1 -opencv-python==4.5.3.56 -opt-einsum==3.3.0 -pandas==1.3.3 
-Pillow==8.3.2 -protobuf==3.18.1 -psutil==5.8.0 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pycparser==2.20 -pyglet==1.5.0 -pyparsing==2.4.7 -python-dateutil==2.8.2 -pytz==2021.3 -requests==2.26.0 -requests-oauthlib==1.3.0 -rsa==4.7.2 -scipy==1.7.1 -seaborn==0.11.2 -six==1.15.0 -tensorboard==1.15.0 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.0 -tensorboardX==2.5 -tensorflow==1.15.0 -tensorflow-estimator==1.15.1 -termcolor==1.1.0 -torch==1.9.1 -typing-extensions==3.7.4.3 -urllib3==1.26.7 -Werkzeug==2.0.2 -wrapt==1.12.1 -xmltodict==0.12.0 -zipp==3.6.0 diff --git a/SAUTE/sauterl.yml b/SAUTE/sauterl.yml deleted file mode 100644 index 34a97631..00000000 --- a/SAUTE/sauterl.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: sauterl -channels: - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - ca-certificates=2021.9.30=h06a4308_1 - - certifi=2021.5.30=py37h06a4308_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - ncurses=6.2=he6710b0_1 - - openssl=1.1.1l=h7f8727e_0 - - pip=21.0.1=py37h06a4308_0 - - python=3.7.11=h12debd9_0 - - readline=8.1=h27cfd23_0 - - setuptools=58.0.4=py37h06a4308_0 - - sqlite=3.36.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - wheel=0.37.0=pyhd3eb1b0_1 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7b6447c_3 - diff --git a/SAUTE/tf_algos/__init__.py b/SAUTE/tf_algos/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/SAUTE/tf_algos/common/__init__.py b/SAUTE/tf_algos/common/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/SAUTE/tf_algos/common/runner.py b/SAUTE/tf_algos/common/runner.py deleted file mode 100644 index 97fc93f2..00000000 --- a/SAUTE/tf_algos/common/runner.py +++ /dev/null @@ -1,229 +0,0 @@ -from typing import Dict -import numpy as np - -import multiprocessing as mp -from common.base_runner import BaseRunner -from common.utils import set_overrides -import os -from tf_algos.safety_starter_agents.test_agents import evaluate_run - -class TFRunner(BaseRunner): - """Main runner class for tf-based algorithms.""" - def setup_algo( - self, - agent_cfg_overrides:Dict, - env_cfg_overrides:Dict, - exp_dir:str='' - ): - """ - Sets up the algorithm for training and testing - - :param agent_cfg_overrides: dictionary with ovverides for the agent config files, - :param env_cfg_overrides: dictionary with ovverides for the environment config files, - :param exp_dir: directory for logging the experiment, - - :returns: two functions for training and testing the algorithms. 
- """ - from tf_algos.safety_starter_agents.tf_ppo import ppo, saute_ppo, saute_ppo_lagrangian, ppo_lagrangian - from tf_algos.safety_starter_agents.tf_trpo import trpo, saute_trpo, saute_trpo_lagrangian, trpo_lagrangian, trpo_cvar - from tf_algos.safety_starter_agents.tf_cpo import cpo - from tf_algos.safety_starter_agents.run_agents import polopt_cfg - - from tf_algos.safety_starter_agents.sac_utils import mlp_actor, mlp_critic - from tf_algos.safety_starter_agents.tf_sac import vanilla_sac, saute_sac, saute_lagrangian_sac, lagrangian_sac, wc_sac, sac_cfg - if 'PPO' in self.agent_name: - if 'Saute' in self.agent_name: - agent_cfg_overrides['env_name'] = 'Sauted' + agent_cfg_overrides['env_name'] - agent_cfg = set_overrides(polopt_cfg, agent_cfg_overrides) - train_env_fn, test_env_fn, agent_cfg, env_cfg = self.create_env(agent_cfg, env_cfg_overrides) - self.writer, self.train_dir, self.test_dir = self.setup_log(exp_dir=exp_dir, agent_cfg=agent_cfg, env_cfg=env_cfg) - agent_cfg['logger_kwargs'] = dict(output_dir=self.train_dir, output_fname='logs.txt') - if self.agent_name == 'VanillaPPO': - train_algo = lambda : ppo( - train_env_fn=train_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'LagrangianPPO': - train_algo = lambda : ppo_lagrangian( - train_env_fn=train_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'SautePPO': - train_algo = lambda : saute_ppo( - train_env_fn=train_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'SauteLagrangianPPO': - train_algo = lambda : saute_ppo_lagrangian( - train_env_fn=train_env_fn, - writer=self.writer, - **agent_cfg - ) - else: - raise NotImplementedError(f"Agent {self.agent_name} is not implemented") - elif 'TRPO' in self.agent_name: - if 'Saute' in self.agent_name:# == 'SauteTRPO' or self.agent_name == 'SauteLagrangianTRPO': - agent_cfg_overrides['env_name'] = 'Sauted' + agent_cfg_overrides['env_name'] - agent_cfg = set_overrides(polopt_cfg, agent_cfg_overrides) - train_env_fn, test_env_fn, agent_cfg, env_cfg = self.create_env(agent_cfg, env_cfg_overrides) - self.writer, self.train_dir, self.test_dir = self.setup_log(exp_dir=exp_dir, agent_cfg=agent_cfg, env_cfg=env_cfg) - agent_cfg['logger_kwargs'] = dict(output_dir=self.train_dir, output_fname='logs.txt') - if self.agent_name == 'VanillaTRPO': - train_algo = lambda : trpo( - train_env_fn=train_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'LagrangianTRPO': - train_algo = lambda : trpo_lagrangian( - train_env_fn=train_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'CVaR_Lagranian_TRPO': - train_algo = lambda: trpo_cvar( - train_env_fn=train_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'SauteTRPO': - train_algo = lambda : saute_trpo( - train_env_fn=train_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'SauteLagrangianTRPO': - train_algo = lambda : saute_trpo_lagrangian( - train_env_fn=train_env_fn, - writer=self.writer, - **agent_cfg - ) - else: - raise NotImplementedError(f"Agent {self.agent_name} is not implemented") - elif self.agent_name == 'CPO': - agent_cfg = set_overrides(polopt_cfg, agent_cfg_overrides) - train_env_fn, test_env_fn, agent_cfg, env_cfg = self.create_env(agent_cfg, env_cfg_overrides) - self.writer, self.train_dir, self.test_dir = self.setup_log(exp_dir=exp_dir, agent_cfg=agent_cfg, env_cfg=env_cfg) - agent_cfg['logger_kwargs'] = dict(output_dir=self.train_dir, output_fname='logs.txt') - 
train_algo = lambda : cpo( - train_env_fn=train_env_fn, - writer=self.writer, - **agent_cfg - ) - elif 'SAC' in self.agent_name: - if 'Saute' in self.agent_name: - agent_cfg_overrides['env_name'] = 'Sauted' + agent_cfg_overrides['env_name'] - agent_cfg = set_overrides(sac_cfg, agent_cfg_overrides) - train_env_fn, test_env_fn, agent_cfg, env_cfg = self.create_env(agent_cfg, env_cfg_overrides) - self.writer, self.train_dir, self.test_dir = self.setup_log(exp_dir=exp_dir, agent_cfg=agent_cfg, env_cfg=env_cfg) - agent_cfg['logger_kwargs'] = dict(output_dir=self.train_dir, output_fname='logs.txt') - if self.agent_name == 'VanillaSAC': - train_algo = lambda: vanilla_sac( - train_env_fn=train_env_fn, - test_env_fn=test_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'LagrangianSAC': - train_algo = lambda: lagrangian_sac( - train_env_fn=train_env_fn, - test_env_fn=test_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'SauteSAC': - train_algo = lambda: saute_sac( - train_env_fn=train_env_fn, - test_env_fn=test_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'SauteLagrangianSAC': - train_algo = lambda: saute_lagrangian_sac( - train_env_fn=train_env_fn, - test_env_fn=test_env_fn, - writer=self.writer, - **agent_cfg - ) - elif self.agent_name == 'WorstCaseSAC': - train_algo = lambda: wc_sac( - train_env_fn=train_env_fn, - test_env_fn=test_env_fn, - writer=self.writer, - **agent_cfg - ) - else: - raise NotImplementedError(f"Agent {self.agent_name} is not implemented") - else: - raise NotImplementedError(f"Agent {self.agent_name} is not implemented") - test_algo = lambda last_only: evaluate_run(self.train_dir, env_fn=test_env_fn, evaluations=agent_cfg['n_test_episodes'], last_only=last_only) - return train_algo, test_algo - - def run_experiment( - self, - train:bool=False, - test:bool=False, - evaluate_last_only:bool=False, - data_filename:str="test_results.csv", - num_exps:int=1 - ): - """ - Main run file for the algorithm training and testing - - :param train: if true train policy, - :param test: if true test an already trained policy, - :param evaluate_last_only: evaluates only the last iteration, - :param data_filename: csv file containing the experimental data, - :param num_exps: number of experiments to run simulatenously. - """ - agent_overrides, env_overrides, experiment_paths = self.set_all_overrides() - - if train: - def _train_exps(count): - train_algo, _ = self.setup_algo( - agent_cfg_overrides=agent_overrides[count], - env_cfg_overrides=env_overrides[count], - exp_dir=experiment_paths[count] - ) - train_algo() - self._parallel_run(_train_exps, n_threads=num_exps, n_exps=len(experiment_paths)) - if test: - def _test_exps(count): - _, test_algo = self.setup_algo( - agent_cfg_overrides=agent_overrides[count], - env_cfg_overrides=env_overrides[count], - exp_dir=experiment_paths[count] - ) - df = test_algo(evaluate_last_only) - df.to_csv(os.path.join(experiment_paths[count], "test", data_filename)) - - self._parallel_run(_test_exps, n_threads=num_exps, n_exps=len(experiment_paths)) - - @staticmethod - def _parallel_run(func, n_threads, n_exps): - """ - Script for a parallel run of experiments, - :param func: a function to run, - :param n_threads: number of experiments to run simulatenously, - :param n_exps: total number of experiments to run. 
- """ - if n_threads > 1: - n_loops = int(np.ceil(n_exps / n_threads)) - for loop_idx in range(n_loops): - cur_range = np.arange(loop_idx * n_threads, min((loop_idx + 1) * n_threads, n_exps)) - processes = [] - print("-----------------------------------") - print(f"-------Starting Loop {loop_idx+1} / {n_loops}-------") - print("-----------------------------------") - for count in cur_range: - p = mp.Process(target=func, args=(count,)) - p.start() - processes.append(p) - for p in processes: - p.join() - else: - for count in range(n_exps): - func(count) \ No newline at end of file diff --git a/SAUTE/tf_algos/common/utils.py b/SAUTE/tf_algos/common/utils.py deleted file mode 100644 index 829c644f..00000000 --- a/SAUTE/tf_algos/common/utils.py +++ /dev/null @@ -1,9 +0,0 @@ -import random -import numpy as np -import tensorflow as tf - -def set_random_seed(seed:int): - """Set random seed.""" - random.seed(seed) - np.random.seed(0) - tf.random.set_seed(seed) \ No newline at end of file diff --git a/SAUTE/tf_algos/safety_starter_agents/__init__.py b/SAUTE/tf_algos/safety_starter_agents/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/SAUTE/tf_algos/safety_starter_agents/agents.py b/SAUTE/tf_algos/safety_starter_agents/agents.py deleted file mode 100644 index 92c878e4..00000000 --- a/SAUTE/tf_algos/safety_starter_agents/agents.py +++ /dev/null @@ -1,267 +0,0 @@ -""" -Agents copied from safety starter agents and modified. -""" -from safe_rl.pg.agents import Agent -from safe_rl.utils.mpi_tools import mpi_avg -import numpy as np -from safe_rl.pg.utils import EPS -import safe_rl.pg.trust_region as tro - -class PPOAgent(Agent): - - def __init__(self, clip_ratio=0.2, - pi_lr=3e-4, - pi_iters=80, - kl_margin=1.2, - **kwargs): - super().__init__(**kwargs) - self.clip_ratio = clip_ratio - self.pi_lr = pi_lr - self.pi_iters = pi_iters - self.kl_margin = kl_margin - self.params.update(dict( - clipped_adv=True, - first_order=True, - constrained=False - )) - - def update_pi(self, inputs): - - # Things we need from training package - train_pi = self.training_package['train_pi'] - d_kl = self.training_package['d_kl'] - target_kl = self.training_package['target_kl'] - - # Run the update - for i in range(self.pi_iters): - _, kl = self.sess.run([train_pi, d_kl], feed_dict=inputs) - kl = mpi_avg(kl) - if kl > self.kl_margin * target_kl: - self.logger.log('Early stopping at step %d due to reaching max kl.'%i) - break - self.logger.store(StopIter=i) - - def log(self): - self.logger.log_tabular('StopIter', average_only=True) - -class TrustRegionAgent(Agent): - - def __init__(self, damping_coeff=0.1, - backtrack_coeff=0.8, - backtrack_iters=10, - **kwargs): - super().__init__(**kwargs) - self.damping_coeff = damping_coeff - self.backtrack_coeff = backtrack_coeff - self.backtrack_iters = backtrack_iters - self.params.update(dict( - trust_region=True - )) - -class TRPOAgent(TrustRegionAgent): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.params.update(dict( - constrained=False - )) - - def update_pi(self, inputs): - - flat_g = self.training_package['flat_g'] - v_ph = self.training_package['v_ph'] - hvp = self.training_package['hvp'] - get_pi_params = self.training_package['get_pi_params'] - set_pi_params = self.training_package['set_pi_params'] - pi_loss = self.training_package['pi_loss'] - d_kl = self.training_package['d_kl'] - target_kl = self.training_package['target_kl'] - - Hx = lambda x : mpi_avg(self.sess.run(hvp, feed_dict={**inputs, v_ph: x})) - g, pi_l_old = 
self.sess.run([flat_g, pi_loss], feed_dict=inputs) - g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) - - # Core calculations for TRPO or NPG - x = tro.cg(Hx, g) - alpha = np.sqrt(2*target_kl/(np.dot(x, Hx(x))+EPS)) - old_params = self.sess.run(get_pi_params) - - # Save lagrange multiplier - self.logger.store(Alpha=alpha) - - def set_and_eval(step): - self.sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) - return mpi_avg(self.sess.run([d_kl, pi_loss], feed_dict=inputs)) - - # TRPO augments NPG with backtracking line search, hard kl constraint - for j in range(self.backtrack_iters): - kl, pi_l_new = set_and_eval(step=self.backtrack_coeff**j) - if kl <= target_kl and pi_l_new <= pi_l_old: - self.logger.log('Accepting new params at step %d of line search.'%j) - self.logger.store(BacktrackIters=j) - break - - if j==self.backtrack_iters-1: - self.logger.log('Line search failed! Keeping old params.') - self.logger.store(BacktrackIters=j) - kl, pi_l_new = set_and_eval(step=0.) - - def log(self): - self.logger.log_tabular('Alpha', average_only=True) - self.logger.log_tabular('BacktrackIters', average_only=True) - -class CPOAgent(TrustRegionAgent): - - def __init__(self, learn_margin=False, **kwargs): - super().__init__(**kwargs) - self.learn_margin = learn_margin - self.params.update(dict( - constrained=True, - save_penalty=True - )) - self.margin = 0 - self.margin_lr = 0.05 - - - def update_pi(self, inputs): - - flat_g = self.training_package['flat_g'] - flat_b = self.training_package['flat_b'] - v_ph = self.training_package['v_ph'] - hvp = self.training_package['hvp'] - get_pi_params = self.training_package['get_pi_params'] - set_pi_params = self.training_package['set_pi_params'] - pi_loss = self.training_package['pi_loss'] - surr_cost = self.training_package['surr_cost'] - d_kl = self.training_package['d_kl'] - target_kl = self.training_package['target_kl'] - cost_lim = self.training_package['cost_lim'] - - Hx = lambda x : mpi_avg(self.sess.run(hvp, feed_dict={**inputs, v_ph: x})) - outs = self.sess.run([flat_g, flat_b, pi_loss, surr_cost], feed_dict=inputs) - outs = [mpi_avg(out) for out in outs] - g, b, pi_l_old, surr_cost_old = outs - - - # Need old params, old policy cost gap (epcost - limit), - # and surr_cost rescale factor (equal to average eplen). - old_params = self.sess.run(get_pi_params) - c = self.logger.get_stats('EpCost')[0] - cost_lim - rescale = self.logger.get_stats('EpLen')[0] - - # Consider the right margin - if self.learn_margin: - self.margin += self.margin_lr * c - self.margin = max(0, self.margin) - - # The margin should be the same across processes anyhow, but let's - # mpi_avg it just to be 100% sure there's no drift. :) - self.margin = mpi_avg(self.margin) - - # Adapt threshold with margin. - c += self.margin - - # c + rescale * b^T (theta - theta_k) <= 0, equiv c/rescale + b^T(...) - c /= (rescale + EPS) - - # Core calculations for CPO - v = tro.cg(Hx, g) - approx_g = Hx(v) - q = np.dot(v, approx_g) - - # Determine optim_case (switch condition for calculation, - # based on geometry of constrained optimization problem) - if np.dot(b,b) <= 1e-8 and c < 0: - # feasible and cost grad is zero---shortcut to pure TRPO update! - w, r, s, A, B = 0, 0, 0, 0, 0 - optim_case = 4 - else: - # cost grad is nonzero: CPO update! - w = tro.cg(Hx, b) - r = np.dot(w, approx_g) # b^T H^{-1} g - s = np.dot(w, Hx(w)) # b^T H^{-1} b - A = q - r**2 / s # should be always positive (Cauchy-Shwarz) - B = 2*target_kl - c**2 / s # does safety boundary intersect trust region? 
(positive = yes) - - if c < 0 and B < 0: - # point in trust region is feasible and safety boundary doesn't intersect - # ==> entire trust region is feasible - optim_case = 3 - elif c < 0 and B >= 0: - # x = 0 is feasible and safety boundary intersects - # ==> most of trust region is feasible - optim_case = 2 - elif c >= 0 and B >= 0: - # x = 0 is infeasible and safety boundary intersects - # ==> part of trust region is feasible, recovery possible - optim_case = 1 - self.logger.log('Alert! Attempting feasible recovery!', 'yellow') - else: - # x = 0 infeasible, and safety halfspace is outside trust region - # ==> whole trust region is infeasible, try to fail gracefully - optim_case = 0 - self.logger.log('Alert! Attempting infeasible recovery!', 'red') - - if optim_case in [3,4]: - lam = np.sqrt(q / (2*target_kl)) - nu = 0 - elif optim_case in [1,2]: - LA, LB = [0, r /c], [r/c, np.inf] - LA, LB = (LA, LB) if c < 0 else (LB, LA) - proj = lambda x, L : max(L[0], min(L[1], x)) - lam_a = proj(np.sqrt(A/B), LA) - lam_b = proj(np.sqrt(q/(2*target_kl)), LB) - f_a = lambda lam : -0.5 * (A / (lam+EPS) + B * lam) - r*c/(s+EPS) - f_b = lambda lam : -0.5 * (q / (lam+EPS) + 2 * target_kl * lam) - lam = lam_a if f_a(lam_a) >= f_b(lam_b) else lam_b - nu = max(0, lam * c - r) / (s + EPS) - else: - lam = 0 - nu = np.sqrt(2 * target_kl / (s+EPS)) - - # normal step if optim_case > 0, but for optim_case =0, - # perform infeasible recovery: step to purely decrease cost - x = (1./(lam+EPS)) * (v + nu * w) if optim_case > 0 else nu * w - - # save intermediates for diagnostic purposes - self.logger.store(Optim_A=A, Optim_B=B, Optim_c=c, - Optim_q=q, Optim_r=r, Optim_s=s, - Optim_Lam=lam, Optim_Nu=nu, - Penalty=nu, DeltaPenalty=0, - Margin=self.margin, - OptimCase=optim_case) - - def set_and_eval(step): - self.sess.run(set_pi_params, feed_dict={v_ph: old_params - step * x}) - return mpi_avg(self.sess.run([d_kl, pi_loss, surr_cost], feed_dict=inputs)) - - # CPO uses backtracking linesearch to enforce constraints - self.logger.log('surr_cost_old %.3f'%surr_cost_old, 'blue') - for j in range(self.backtrack_iters): - kl, pi_l_new, surr_cost_new = set_and_eval(step=self.backtrack_coeff**j) - self.logger.log('%d \tkl %.3f \tsurr_cost_new %.3f'%(j, kl, surr_cost_new), 'blue') - if (kl <= target_kl and - (pi_l_new <= pi_l_old if optim_case > 1 else True) and - surr_cost_new - surr_cost_old <= max(-c,0)): - self.logger.log('Accepting new params at step %d of line search.'%j) - self.logger.store(BacktrackIters=j) - break - - if j==self.backtrack_iters-1: - self.logger.log('Line search failed! Keeping old params.') - self.logger.store(BacktrackIters=j) - kl, pi_l_new, surr_cost_new = set_and_eval(step=0.) 
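For orientation, the trust-region machinery deleted above boils down to solving H x = g for a natural-gradient direction, scaling it so a quadratic KL estimate hits target_kl, and backtracking from there. The following is a minimal, self-contained NumPy sketch of that computation; H, g, target_kl and the backtracking schedule are made-up illustrative values, and np.linalg.solve stands in for the conjugate-gradient routine tro.cg used in the original code. It is a sketch of the idea, not code from the repository.

import numpy as np

EPS = 1e-8
H = np.array([[2.0, 0.3],
              [0.3, 1.0]])        # toy Fisher / Hessian of the KL (assumed)
g = np.array([1.0, -0.5])         # toy gradient of the policy loss (assumed)
target_kl = 0.01

x = np.linalg.solve(H, g)                           # x = H^{-1} g (tro.cg in the original)
alpha = np.sqrt(2 * target_kl / (x @ H @ x + EPS))  # largest step inside the KL trust region

# Backtracking line search, mirroring the backtrack_coeff**j loop above:
# shrink the step until the (second-order) KL estimate satisfies the constraint.
for j in range(10):
    delta = -alpha * (0.8 ** j) * x      # proposed change to the flat parameters
    kl_approx = 0.5 * delta @ H @ delta  # quadratic KL estimate
    if kl_approx <= target_kl:
        break

In the real agents the accepted step must also not worsen the surrogate loss (and, for CPO, the surrogate cost), which is what the set_and_eval calls above check.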
- - - def log(self): - self.logger.log_tabular('Optim_A', average_only=True) - self.logger.log_tabular('Optim_B', average_only=True) - self.logger.log_tabular('Optim_c', average_only=True) - self.logger.log_tabular('Optim_q', average_only=True) - self.logger.log_tabular('Optim_r', average_only=True) - self.logger.log_tabular('Optim_s', average_only=True) - self.logger.log_tabular('Optim_Lam', average_only=True) - self.logger.log_tabular('Optim_Nu', average_only=True) - self.logger.log_tabular('OptimCase', average_only=True) - self.logger.log_tabular('Margin', average_only=True) - self.logger.log_tabular('BacktrackIters', average_only=True) \ No newline at end of file diff --git a/SAUTE/tf_algos/safety_starter_agents/run_agents.py b/SAUTE/tf_algos/safety_starter_agents/run_agents.py deleted file mode 100644 index 5af2b2c9..00000000 --- a/SAUTE/tf_algos/safety_starter_agents/run_agents.py +++ /dev/null @@ -1,650 +0,0 @@ -""" -Main run file copied from safety starter agents with the following modifications: -- added a capability for Sauteing Vanilla and Lagrangian methods -- added a capability to use CVaR constraints -""" -import numpy as np -from tensorboardX.writer import SummaryWriter -import tensorflow as tf -import pandas as pd -import time -import safe_rl.pg.trust_region as tro -from tf_algos.safety_starter_agents.agents import PPOAgent -from safe_rl.pg.buffer import CPOBuffer -from safe_rl.pg.network import count_vars, \ - get_vars, \ - mlp_actor_critic,\ - placeholders, \ - placeholders_from_spaces -from safe_rl.pg.utils import values_as_sorted_list -from safe_rl.utils.logx import EpochLogger -from safe_rl.utils.mpi_tf import MpiAdamOptimizer, sync_all_params -from safe_rl.utils.mpi_tools import * -from collections import deque - -polopt_cfg = dict( - # from our torch runners - log=True, - log_updates=False, - seed=0, - render=False, - saute_constraints=False, - saute_lagrangian=False, - # Experience collection: - steps_per_epoch=4000, - epochs=200, - # max_ep_len=0, # removed as it should be taken from env_cfg - # Discount factors: - discount_factor=0.99, - lam=0.97, - safety_discount_factor=0.99, - safety_lam=0.97, - safety_budget=1.0, - # Policy learning: - ent_reg=0., - # Cost constraints / penalties: - penalty_init=1., - penalty_lr=5e-2, - # KL divergence: - target_kl=0.01, - # Value learning: - value_fn_lr=1e-3, - gradient_steps=80, - # Logging: - checkpoint_frequency=1, - n_test_episodes=100, - n_train_episodes=100, - backtrack_iters=10 - ) - -class CVaREpochLogger(EpochLogger): - def __init__(self, risk=0.9, **kwargs): - super(CVaREpochLogger, self).__init__(**kwargs) - self.risk = risk - - def get_stats_cvar(self, key): - """ - Lets an algorithm ask the logger for CVaR at risk=alpha. 
- """ - v = self.epoch_dict[key] - # Each episode cost in a list, for each process - vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape) > 0 else v - # Assuming only one process, so all the data of cum costs in one list, calculate CVaR - n = len(vals) - # Calculate k for removed smallest k values from list - k = n-int(n*self.risk) - # Sort with k smalles values first then remove these and keep n-k largest costs - vals = np.partition(vals, k)[k:] - return mpi_statistics_scalar(vals) - -def run_polopt_agent(train_env_fn, - env_name:str='', - log:bool=True, - log_updates:bool=False, - agent=PPOAgent(), - actor_critic=mlp_actor_critic, - ac_kwargs=dict(), - seed=0, - render=False, - saute_constraints=False, - saute_lagrangian=False, - # Experience collection: - steps_per_epoch=4000, - epochs=50, - max_ep_len=200, - # Discount factors: - discount_factor=0.99, - lam=0.97, - safety_discount_factor=0.99, - safety_lam=0.97, - # Policy learning: - ent_reg=0., - # Cost constraints / penalties: - safety_budget=25, - penalty_init=1., - penalty_lr=5e-2, - # KL divergence: - target_kl=0.01, - # Value learning: - value_fn_lr=1e-3, - gradient_steps=80, - # TB logging - writer:SummaryWriter=None, - # Logging: - logger=None, - logger_kwargs=dict(), - checkpoint_frequency=1, - CVaR=False, - risk=0.9, - n_test_episodes=10, - n_train_episodes=10, - backtrack_iters=10 - ): - #=========================================================================# - # Prepare logger, seed, and environment in this process # - #=========================================================================# - if CVaR: - # MUST HAVE --cpu set to 1 or number of cpus set to 1 otherwise it won't work! - logger = CVaREpochLogger(risk=risk, **logger_kwargs) - - else: - logger = EpochLogger(**logger_kwargs) if logger is None else logger - logger.save_config(locals()) - - seed += 10000 * proc_id() - tf.set_random_seed(seed) - np.random.seed(seed) - - env = train_env_fn() - observation_space = env.observation_space - - agent.set_logger(logger) - - #=========================================================================# - # Create computation graph for actor and critic (not training routine) # - #=========================================================================# - - # Share information about action space with policy architecture - ac_kwargs['action_space'] = env.action_space - - # Inputs to computation graph from environment spaces - x_ph, a_ph = placeholders_from_spaces(observation_space, env.action_space) - - # Inputs to computation graph for batch data - adv_ph, cadv_ph, ret_ph, cret_ph, logp_old_ph = placeholders(*(None for _ in range(5))) - - # Inputs to computation graph for special purposes - surr_cost_rescale_ph = tf.placeholder(tf.float32, shape=()) - cur_cost_ph = tf.placeholder(tf.float32, shape=()) - - # Outputs from actor critic - ac_outs = actor_critic(x_ph, a_ph, **ac_kwargs) - pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent, v, vc = ac_outs - - # Organize placeholders for zipping with data from buffer on updates - buf_phs = [x_ph, a_ph, adv_ph, cadv_ph, ret_ph, cret_ph, logp_old_ph] - buf_phs += values_as_sorted_list(pi_info_phs) - - # Organize symbols we have to compute at each step of acting in env - get_action_ops = dict(pi=pi, - v=v, - logp_pi=logp_pi, - pi_info=pi_info) - - # If agent is reward penalized, it doesn't use a separate value function - # for costs and we don't need to include it in get_action_ops; otherwise we do. 
- if not(agent.reward_penalized): - get_action_ops['vc'] = vc - - # Count variables - var_counts = tuple(count_vars(scope) for scope in ['pi', 'vf', 'vc']) - logger.log('\nNumber of parameters: \t pi: %d, \t v: %d, \t vc: %d\n'%var_counts) - # print('\nNumber of parameters: \t pi: %d, \t v: %d, \t vc: %d\n'%var_counts) - - # Make a sample estimate for entropy to use as sanity check - approx_ent = tf.reduce_mean(-logp) - - - #=========================================================================# - # Create replay buffer # - #=========================================================================# - - # Obs/act shapes - obs_shape = observation_space.shape - act_shape = env.action_space.shape - - # Experience buffer - local_steps_per_epoch = int(steps_per_epoch / num_procs()) - pi_info_shapes = {k: v.shape.as_list()[1:] for k,v in pi_info_phs.items()} - buf = CPOBuffer(local_steps_per_epoch, - obs_shape, - act_shape, - pi_info_shapes, - discount_factor, - lam, - safety_discount_factor, - safety_lam) - - - #=========================================================================# - # Create computation graph for penalty learning, if applicable # - #=========================================================================# - - if agent.use_penalty: - with tf.variable_scope('penalty'): - # param_init = np.log(penalty_init) - param_init = np.log(max(np.exp(penalty_init)-1, 1e-8)) - penalty_param = tf.get_variable('penalty_param', - initializer=float(param_init), - trainable=agent.learn_penalty, - dtype=tf.float32) - # penalty = tf.exp(penalty_param) - penalty = tf.nn.softplus(penalty_param) - - if agent.learn_penalty: - if agent.penalty_param_loss: - penalty_loss = -penalty_param * (cur_cost_ph - safety_budget) - else: - penalty_loss = -penalty * (cur_cost_ph - safety_budget) - train_penalty = MpiAdamOptimizer(learning_rate=penalty_lr).minimize(penalty_loss) - - - #=========================================================================# - # Create computation graph for policy learning # - #=========================================================================# - - # Likelihood ratio - ratio = tf.exp(logp - logp_old_ph) - - # Surrogate advantage / clipped surrogate advantage - if agent.clipped_adv: - min_adv = tf.where(adv_ph>0, - (1+agent.clip_ratio)*adv_ph, - (1-agent.clip_ratio)*adv_ph - ) - surr_adv = tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) - else: - surr_adv = tf.reduce_mean(ratio * adv_ph) - - # Surrogate cost - surr_cost = tf.reduce_mean(ratio * cadv_ph) - - # Create policy objective function, including entropy regularization - pi_objective = surr_adv + ent_reg * ent - - # Possibly include surr_cost in pi_objective - if agent.objective_penalized: - pi_objective -= penalty * surr_cost - pi_objective /= (1 + penalty) - - # Loss function for pi is negative of pi_objective - pi_loss = -pi_objective - - # Optimizer-specific symbols - if agent.trust_region: - - # Symbols needed for CG solver for any trust region method - pi_params = get_vars('pi') - flat_g = tro.flat_grad(pi_loss, pi_params) - v_ph, hvp = tro.hessian_vector_product(d_kl, pi_params) - if agent.damping_coeff > 0: - hvp += agent.damping_coeff * v_ph - - # Symbols needed for CG solver for CPO only - flat_b = tro.flat_grad(surr_cost, pi_params) - - # Symbols for getting and setting params - get_pi_params = tro.flat_concat(pi_params) - set_pi_params = tro.assign_params_from_flat(v_ph, pi_params) - - training_package = dict(flat_g=flat_g, - flat_b=flat_b, - v_ph=v_ph, - hvp=hvp, - get_pi_params=get_pi_params, - 
set_pi_params=set_pi_params) - - elif agent.first_order: - - # Optimizer for first-order policy optimization - train_pi = MpiAdamOptimizer(learning_rate=agent.pi_lr).minimize(pi_loss) - - # Prepare training package for agent - training_package = dict(train_pi=train_pi) - - else: - raise NotImplementedError - - # Provide training package to agent - training_package.update(dict(pi_loss=pi_loss, - surr_cost=surr_cost, - d_kl=d_kl, - target_kl=target_kl, - cost_lim=safety_budget)) - agent.prepare_update(training_package) - - #=========================================================================# - # Create computation graph for value learning # - #=========================================================================# - - # Value losses - v_loss = tf.reduce_mean((ret_ph - v)**2) - vc_loss = tf.reduce_mean((cret_ph - vc)**2) - - # If agent uses penalty directly in reward function, don't train a separate - # value function for predicting cost returns. (Only use one vf for r - p*c.) - if agent.reward_penalized: - total_value_loss = v_loss - else: - total_value_loss = v_loss + vc_loss - - # Optimizer for value learning - train_vf = MpiAdamOptimizer(learning_rate=value_fn_lr).minimize(total_value_loss) - - - #=========================================================================# - # Create session, sync across procs, and set up saver # - #=========================================================================# - - sess = tf.Session() - sess.run(tf.global_variables_initializer()) - - # Sync params across processes - sess.run(sync_all_params()) - - # Setup model saving - logger.setup_tf_saver( - sess, - inputs={'x': x_ph}, - outputs={'pi': pi, 'v': v, 'vc': vc} - ) - - - #=========================================================================# - # Provide session to agent # - #=========================================================================# - agent.prepare_session(sess) - - - #=========================================================================# - # Create function for running update (called at end of each epoch) # - #=========================================================================# - - def update(epoch:int): - if CVaR: - cur_cost = logger.get_stats_cvar('EpCost')[0] - else: - cur_cost = logger.get_stats('EpCost')[0] - c = cur_cost - safety_budget - if c > 0 and agent.cares_about_cost: - logger.log('Warning! 
Safety constraint is already violated.', 'red') - - #=====================================================================# - # Prepare feed dict # - #=====================================================================# - - inputs = {k:v for k,v in zip(buf_phs, buf.get())} - inputs[surr_cost_rescale_ph] = logger.get_stats('EpLen')[0] - inputs[cur_cost_ph] = cur_cost - - #=====================================================================# - # Make some measurements before updating # - #=====================================================================# - - measures = dict(LossPi=pi_loss, - SurrCost=surr_cost, - SurrAdv=surr_adv, - LossV=v_loss, - Entropy=ent) - if not(agent.reward_penalized): - measures['LossVC'] = vc_loss - # if writer is not None: - # writer.add_scalar('opt_info/LossVC',vc_loss, epoch) - if agent.use_penalty: - measures['Penalty'] = penalty - # if writer is not None: - # writer.add_scalar('opt_info/Penalty',penalty, epoch) - - pre_update_measures = sess.run(measures, feed_dict=inputs) - logger.store(**pre_update_measures) - # if writer is not None: - # writer.add_scalar('opt_info/LossPi',pi_loss, epoch) - # writer.add_scalar('opt_info/SurrCost',surr_cost, epoch) - # writer.add_scalar('opt_info/LossV',v_loss, epoch) - # writer.add_scalar('opt_info/Entropy',ent, epoch) - #=====================================================================# - # Update penalty if learning penalty # - #=====================================================================# - if agent.learn_penalty: - sess.run(train_penalty, feed_dict={cur_cost_ph: cur_cost}) - - #=====================================================================# - # Update policy # - #=====================================================================# - agent.update_pi(inputs) - - #=====================================================================# - # Update value function # - #=====================================================================# - for _ in range(gradient_steps): - sess.run(train_vf, feed_dict=inputs) - - #=====================================================================# - # Make some measurements after updating # - #=====================================================================# - - del measures['Entropy'] - measures['KL'] = d_kl - - post_update_measures = sess.run(measures, feed_dict=inputs) - deltas = dict() - for k in post_update_measures: - if k in pre_update_measures: - deltas['Delta'+k] = post_update_measures[k] - pre_update_measures[k] - logger.store(KL=post_update_measures['KL'], **deltas) - - - - - #=========================================================================# - # Run main environment interaction loop # - #=========================================================================# - - start_time = time.time() - o, r, d, c, ep_ret, ep_cost, ep_len = env.reset(), 0, False, 0, 0, 0, 0 - cur_penalty = 0 - cum_cost = 0 - if saute_constraints: - true_ep_ret = 0 - training_rewards = deque([0], maxlen=n_train_episodes) - training_costs = deque([0], maxlen=n_train_episodes) - df = pd.DataFrame() - for epoch in range(epochs): - - if agent.use_penalty: - cur_penalty = sess.run(penalty) - - for t in range(local_steps_per_epoch): - - # Possibly render - if render and proc_id()==0 and t < 1000: - env.render() - - # Get outputs from policy - get_action_outs = sess.run(get_action_ops, - feed_dict={x_ph: o[np.newaxis]}) - a = get_action_outs['pi'] - v_t = get_action_outs['v'] - vc_t = get_action_outs.get('vc', 0) # Agent may not use cost value func - logp_t = 
get_action_outs['logp_pi'] - pi_info_t = get_action_outs['pi_info'] - - # Step in environment - o2, r, d, info = env.step(a) - - # if saute_constraints and (objective_penalized and penalty_param_loss and ): - - # Include penalty on cost - c = info.get('cost', 0) - - # Track cumulative cost over training - cum_cost += c - if saute_lagrangian: - r = info['true_reward'] - # save and log - if agent.reward_penalized: - r_total = r - cur_penalty * c - r_total = r_total / (1 + cur_penalty) - buf.store(o, a, r_total, v_t, 0, 0, logp_t, pi_info_t) - else: - buf.store(o, a, r, v_t, c, vc_t, logp_t, pi_info_t) - logger.store(VVals=v_t, CostVVals=vc_t) - - o = o2 - ep_ret += r - if saute_constraints: - true_ep_ret += info['true_reward'] - ep_cost += c - ep_len += 1 - - terminal = d or (ep_len == max_ep_len) - if terminal or (t==local_steps_per_epoch-1): - - # If trajectory didn't reach terminal state, bootstrap value target(s) - if d and not(ep_len == max_ep_len): - # Note: we do not count env time out as true terminal state - last_val, last_cval = 0, 0 - else: - feed_dict={x_ph: o[np.newaxis]} - if agent.reward_penalized: - last_val = sess.run(v, feed_dict=feed_dict) - last_cval = 0 - else: - last_val, last_cval = sess.run([v, vc], feed_dict=feed_dict) - buf.finish_path(last_val, last_cval) - - # Only save EpRet / EpLen if trajectory finished - if terminal: - if saute_constraints: - logger.store(EpRet=true_ep_ret, EpLen=ep_len, EpCost=ep_cost) - training_rewards.extend([true_ep_ret]) - else: - logger.store(EpRet=ep_ret, EpLen=ep_len, EpCost=ep_cost) - training_rewards.extend([ep_ret]) - training_costs.extend([ep_cost]) - - else: - print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) - - # Reset environment - o, r, d, c, ep_ret, ep_len, ep_cost = env.reset(), 0, False, 0, 0, 0, 0 - if saute_constraints: - true_ep_ret = 0 - #=====================================================================# - # Cumulative cost calculations # - #=====================================================================# - cumulative_cost = mpi_sum(cum_cost) - cost_rate = cumulative_cost / ((epoch+1)*steps_per_epoch) * max_ep_len - # Save model - if (checkpoint_frequency and (epoch % checkpoint_frequency == 0)) or (epoch == epochs-1): - logger.save_state({'env': env}, epoch) - df = df.append(pd.DataFrame({ - "episode_return": training_rewards, - "episode_cost": training_costs, - "accumulated_cost": cumulative_cost, - "cost_rate": cost_rate, - "epoch": epoch, - "run": np.arange(len(training_rewards)) - })) - df.to_csv(os.path.join(logger.output_dir, "train_results.csv")) - - #=====================================================================# - # Run RL update # - #=====================================================================# - update(epoch=epoch) - - # cumulative_cost = mpi_sum(cum_cost) - # cost_rate = cumulative_cost / ((epoch+1)*steps_per_epoch) - - #=====================================================================# - # Log performance and stats # - #=====================================================================# - - logger.log_tabular('Epoch', epoch) - - # Performance stats - logger.log_tabular('EpRet', with_min_and_max=True) - logger.log_tabular('EpCost', with_min_and_max=True) - logger.log_tabular('EpLen', average_only=True) - logger.log_tabular('CumulativeCost', cumulative_cost) - logger.log_tabular('CostRate', cost_rate) - - # Value function values - logger.log_tabular('VVals', with_min_and_max=True) - logger.log_tabular('CostVVals', with_min_and_max=True) - - # Pi loss and 
change - logger.log_tabular('LossPi', average_only=True) - logger.log_tabular('DeltaLossPi', average_only=True) - - # Surr adv and change - logger.log_tabular('SurrAdv', average_only=True) - logger.log_tabular('DeltaSurrAdv', average_only=True) - - # Surr cost and change - logger.log_tabular('SurrCost', average_only=True) - logger.log_tabular('DeltaSurrCost', average_only=True) - - # V loss and change - logger.log_tabular('LossV', average_only=True) - logger.log_tabular('DeltaLossV', average_only=True) - true_objective = logger.log_current_row['AverageVVals'] - - if writer is not None: - # optimization infos - writer.add_scalar('train_info/LossPi', logger.log_current_row['LossPi'], epoch) - writer.add_scalar('train_info/DeltaLossPi', logger.log_current_row['DeltaLossPi'], epoch) - writer.add_scalar('train_info/SurrAdv', logger.log_current_row['SurrAdv'], epoch) - writer.add_scalar('train_info/DeltaSurrAdv', logger.log_current_row['DeltaSurrAdv'], epoch) - writer.add_scalar('train_info/std_V_values', logger.log_current_row['StdVVals'], epoch) - writer.add_scalar('train_info/mean_V_values', logger.log_current_row['AverageVVals'], epoch) - writer.add_scalar('train_info/std_CostVVals', logger.log_current_row['StdCostVVals'], epoch) - writer.add_scalar('train_info/mean_CostVVals', logger.log_current_row['AverageCostVVals'], epoch) - writer.add_scalar('train_info/mean_SurrCost', logger.log_current_row['SurrCost'], epoch) - writer.add_scalar('train_info/mean_LossV', logger.log_current_row['LossV'], epoch) - writer.add_scalar('train_info/mean_DeltaSurrCost', logger.log_current_row['DeltaSurrCost'], epoch) - writer.add_scalar('train_info/mean_DeltaLossV', logger.log_current_row['DeltaLossV'], epoch) - # episode return - writer.add_scalar('train_return/StdEpRet', logger.log_current_row['StdEpRet'], epoch) - writer.add_scalar('train_return/AverageEpRet', logger.log_current_row['AverageEpRet'], epoch) - writer.add_scalar('train_return/MaxEpRet', logger.log_current_row['MaxEpRet'], epoch) - writer.add_scalar('train_return/MinEpRet', logger.log_current_row['MinEpRet'], epoch) - # episode cost - writer.add_scalar('train_cost/StdEpCost', logger.log_current_row['StdEpCost'], epoch) - writer.add_scalar('train_cost/AverageEpCost', logger.log_current_row['AverageEpCost'], epoch) - writer.add_scalar('train_cost/MaxEpCost', logger.log_current_row['MaxEpCost'], epoch) - writer.add_scalar('train_cost/MinEpCost', logger.log_current_row['MinEpCost'], epoch) - # accumulative cost - writer.add_scalar('train_acc_cost/CumulativeCost', logger.log_current_row['CumulativeCost'], epoch) - writer.add_scalar('train_acc_cost/CostRate', logger.log_current_row['CostRate'], epoch) - - # Vc loss and change, if applicable (reward_penalized agents don't use vc) - if not(agent.reward_penalized): - logger.log_tabular('LossVC', average_only=True) - logger.log_tabular('DeltaLossVC', average_only=True) - if writer: - writer.add_scalar('train_info/mean_LossVC', logger.log_current_row['LossVC'], epoch) - writer.add_scalar('train_info/mean_DeltaLossVC', logger.log_current_row['DeltaLossVC'], epoch) - - if agent.use_penalty or agent.save_penalty: - logger.log_tabular('Penalty', average_only=True) - logger.log_tabular('DeltaPenalty', average_only=True) - true_objective += logger.log_current_row['Penalty'] * (safety_budget - logger.log_current_row['AverageCostVVals']) - if writer: - writer.add_scalar('train_info/Penalty', logger.log_current_row['Penalty'], epoch) - writer.add_scalar('train_info/DeltaPenalty', 
logger.log_current_row['DeltaPenalty'], epoch) - writer.add_scalar('train_info/True_objective', true_objective, epoch) - else: - logger.log_tabular('Penalty', 0) - logger.log_tabular('DeltaPenalty', 0) - if writer: - writer.add_scalar('train_info/Penalty', 0, epoch) - writer.add_scalar('train_info/DeltaPenalty', 0, epoch) - writer.add_scalar('train_info/True_objective', true_objective, epoch) - - # Anything from the agent? - agent.log() - - # Policy stats - logger.log_tabular('Entropy', average_only=True) - logger.log_tabular('KL', average_only=True) - - # Time and steps elapsed - logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) - logger.log_tabular('Time', time.time()-start_time) - - # Show results! - logger.dump_tabular() - - sess.close() - tf.reset_default_graph() diff --git a/SAUTE/tf_algos/safety_starter_agents/sac_utils.py b/SAUTE/tf_algos/safety_starter_agents/sac_utils.py deleted file mode 100644 index 58541b4f..00000000 --- a/SAUTE/tf_algos/safety_starter_agents/sac_utils.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python -""" -Copied from safety starter agents. -""" -import numpy as np -import tensorflow as tf - -EPS = 1e-8 - -def placeholder(dim=None): - return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) - -def placeholders(*args): - return [placeholder(dim) for dim in args] - -def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): - for h in hidden_sizes[:-1]: - x = tf.layers.dense(x, units=h, activation=activation) - return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) - -def get_vars(scope): - return [x for x in tf.global_variables() if scope in x.name] - -def count_vars(scope): - v = get_vars(scope) - return sum([np.prod(var.shape.as_list()) for var in v]) - -def gaussian_likelihood(x, mu, log_std): - pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) - return tf.reduce_sum(pre_sum, axis=1) - -def get_target_update(main_name, target_name, polyak): - ''' Get a tensorflow op to update target variables based on main variables ''' - main_vars = {x.name: x for x in get_vars(main_name)} - targ_vars = {x.name: x for x in get_vars(target_name)} - assign_ops = [] - for v_targ in targ_vars: - assert v_targ.startswith(target_name), f'bad var name {v_targ} for {target_name}' - v_main = v_targ.replace(target_name, main_name, 1) - assert v_main in main_vars, f'missing var name {v_main}' - assign_op = tf.assign(targ_vars[v_targ], polyak*targ_vars[v_targ] + (1-polyak)*main_vars[v_main]) - assign_ops.append(assign_op) - return tf.group(assign_ops) - - -""" -Policies -""" - -LOG_STD_MAX = 2 -LOG_STD_MIN = -20 - -def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): - act_dim = a.shape.as_list()[-1] - net = mlp(x, list(hidden_sizes), activation, activation) - mu = tf.layers.dense(net, act_dim, activation=output_activation) - log_std = tf.layers.dense(net, act_dim, activation=None) - log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX) - - std = tf.exp(log_std) - pi = mu + tf.random_normal(tf.shape(mu)) * std - logp_pi = gaussian_likelihood(pi, mu, log_std) - return mu, pi, logp_pi - -def apply_squashing_func(mu, pi, logp_pi): - # Adjustment to log prob - logp_pi -= tf.reduce_sum(2*(np.log(2) - pi - tf.nn.softplus(-2*pi)), axis=1) - - # Squash those unbounded actions! 
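    # (Descriptive note on the logp_pi adjustment above: 2*(log(2) - u - softplus(-2*u))
    # is a numerically stable rewriting of log(1 - tanh(u)**2), i.e. the change-of-variables
    # correction, summed over action dimensions, for pushing a Gaussian sample through the
    # tanh squash applied just below.)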
- mu = tf.tanh(mu) - pi = tf.tanh(pi) - return mu, pi, logp_pi - - -""" -Actors and Critics -""" -def mlp_actor(x, a, name='pi', hidden_sizes=(256,256), activation=tf.nn.relu, - output_activation=None, policy=mlp_gaussian_policy, action_space=None): - # policy - with tf.variable_scope(name): - mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) - mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) - - # make sure actions are in correct range - action_scale = action_space.high[0] - mu *= action_scale - pi *= action_scale - - return mu, pi, logp_pi - - -def mlp_critic(x, a, pi, name, hidden_sizes=(256,256), activation=tf.nn.relu, - output_activation=None, policy=mlp_gaussian_policy, action_space=None): - - fn_mlp = lambda x : tf.squeeze(mlp(x=x, - hidden_sizes=list(hidden_sizes)+[1], - activation=activation, - output_activation=None), - axis=1) - with tf.variable_scope(name): - critic = fn_mlp(tf.concat([x,a], axis=-1)) - - with tf.variable_scope(name, reuse=True): - critic_pi = fn_mlp(tf.concat([x,pi], axis=-1)) - - return critic, critic_pi - -def mlp_var(x, a, pi, name, hidden_sizes=(64,64), activation=tf.nn.relu, - output_activation=None, policy=mlp_gaussian_policy, action_space=None): - - fn_mlp = lambda x : tf.squeeze(mlp(x=x, - hidden_sizes=list(hidden_sizes)+[1], - activation=activation, - output_activation=None), - axis=1) - - with tf.variable_scope(name): - var = fn_mlp(tf.concat([x,a], axis=-1)) - var = tf.nn.softplus(var) - - with tf.variable_scope(name, reuse=True): - var_pi = fn_mlp(tf.concat([x,pi], axis=-1)) - var_pi = tf.nn.softplus(var_pi) - - return var, var_pi - -class ReplayBuffer: - """ - A simple FIFO experience replay buffer for SAC agents. - """ - - def __init__(self, obs_dim, act_dim, size): - self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) - self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) - self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) - self.rews_buf = np.zeros(size, dtype=np.float32) - self.costs_buf = np.zeros(size, dtype=np.float32) - self.done_buf = np.zeros(size, dtype=np.float32) - self.ptr, self.size, self.max_size = 0, 0, size - - def store(self, obs, act, rew, next_obs, done, cost): - self.obs1_buf[self.ptr] = obs - self.obs2_buf[self.ptr] = next_obs - self.acts_buf[self.ptr] = act - self.rews_buf[self.ptr] = rew - self.costs_buf[self.ptr] = cost - self.done_buf[self.ptr] = done - self.ptr = (self.ptr+1) % self.max_size - self.size = min(self.size+1, self.max_size) - - def sample_batch(self, batch_size=32): - idxs = np.random.randint(0, self.size, size=batch_size) - return dict(obs1=self.obs1_buf[idxs], - obs2=self.obs2_buf[idxs], - acts=self.acts_buf[idxs], - rews=self.rews_buf[idxs], - costs=self.costs_buf[idxs], - done=self.done_buf[idxs]) - diff --git a/SAUTE/tf_algos/safety_starter_agents/test_agents.py b/SAUTE/tf_algos/safety_starter_agents/test_agents.py deleted file mode 100644 index 6bce7487..00000000 --- a/SAUTE/tf_algos/safety_starter_agents/test_agents.py +++ /dev/null @@ -1,84 +0,0 @@ -import numpy as np -from safe_rl.utils.load_utils import load_policy -import os -import re -import gym -import pandas as pd -from typing import Tuple, Callable -""" -Scripts for evaluating agents. Copied from safety starter agents with minor modifications. 
-""" - -def evaluate_epoch( - fpath:str, - env_fn:gym.Env, - epoch:int, - evaluations:int -) -> pd.DataFrame: - """ - Computes policy evaluations - :param fpath: path with a saved policy, - :param epoch: epoch number for evaluation, - :param evaluations: number of evaluation episodes. - """ - # Lists to capture all measurements of performance - episode_returns = [] - episode_costs = [] - - # Load environment, policy, and session - saved_environment, policy, _ = load_policy(fpath=fpath, itr=epoch, deterministic=True) - if env_fn is None: - environment = saved_environment - else: - environment = env_fn() - # Run evaluation episodes - for _ in range(evaluations): - episode_return, episode_cost = run_policy(environment, policy) - episode_returns.append(episode_return); episode_costs.append(episode_cost) - - # Present performance metrics - df = pd.DataFrame({'episode_return':episode_returns, 'episode_cost':episode_costs}) - df['epoch'] = epoch - - return df - -# Runs a single episode and records return/cost -def run_policy( - environment, - policy -) -> Tuple[np.ndarray, np.ndarray]: - observation = environment.reset(); done = False - episode_return, episode_cost = 0, 0 - while not done: - action = policy(observation) - action = np.clip( - action, - environment.action_space.low, - environment.action_space.high) - observation, reward, done, info = environment.step(action) - episode_return += reward - episode_cost += info.get('cost', 0) - - return episode_return, episode_cost - -def evaluate_run( - path_dir:str, - env_fn:Callable, - evaluations: int=100, - last_only:bool=False # evaluate only the last policy -) -> pd.DataFrame: - # Figure out how many epochs there are to evaluate - run_contents = os.listdir(path_dir) - all_epochs = [] - for epoch in run_contents: - if epoch.startswith('vars'): - all_epochs.append(int(re.findall(r'\d+', epoch)[0])) - - df = pd.DataFrame() - if last_only: - all_epochs = [max(all_epochs)] - for idx, epoch in enumerate(all_epochs): - print(idx/max(all_epochs)) - df = pd.concat([df, evaluate_epoch(fpath=path_dir, env_fn=env_fn, epoch=epoch, evaluations=evaluations)], ignore_index=True) - - return df diff --git a/SAUTE/tf_algos/safety_starter_agents/tf_cpo.py b/SAUTE/tf_algos/safety_starter_agents/tf_cpo.py deleted file mode 100644 index c0c91cb5..00000000 --- a/SAUTE/tf_algos/safety_starter_agents/tf_cpo.py +++ /dev/null @@ -1,14 +0,0 @@ -from safe_rl.pg.agents import CPOAgent -from tf_algos.safety_starter_agents.run_agents import run_polopt_agent - - -def cpo(**kwargs): - """Set up to run CPO.""" - cpo_kwargs = dict( - reward_penalized=False, # Irrelevant in CPO - objective_penalized=False, # Irrelevant in CPO - learn_penalty=False, # Irrelevant in CPO - penalty_param_loss=False # Irrelevant in CPO - ) - agent = CPOAgent(**cpo_kwargs) - run_polopt_agent(agent=agent, **kwargs) \ No newline at end of file diff --git a/SAUTE/tf_algos/safety_starter_agents/tf_ppo.py b/SAUTE/tf_algos/safety_starter_agents/tf_ppo.py deleted file mode 100644 index 14f592ce..00000000 --- a/SAUTE/tf_algos/safety_starter_agents/tf_ppo.py +++ /dev/null @@ -1,51 +0,0 @@ -from tf_algos.safety_starter_agents.agents import PPOAgent -from tf_algos.safety_starter_agents.run_agents import run_polopt_agent - -# added env_name parameter for easy of overriding -def ppo(**kwargs): - """Set up to run Vanilla PPO.""" - ppo_kwargs = dict( - reward_penalized=False, - objective_penalized=False, - learn_penalty=False, - penalty_param_loss=False # Irrelevant in unconstrained - ) - agent = PPOAgent(**ppo_kwargs) 
- run_polopt_agent(agent=agent, **kwargs) - -def saute_ppo(**kwargs): - """Set up to run Saute PPO.""" - ppo_kwargs = dict( - reward_penalized=False, - objective_penalized=False, - learn_penalty=False, - penalty_param_loss=False # Irrelevant in unconstrained - ) - agent = PPOAgent(**ppo_kwargs) - kwargs['saute_constraints'] = True - run_polopt_agent(agent=agent, **kwargs) - -def ppo_lagrangian(**kwargs): - """Set up to run PPO Lagrangian.""" - # Objective-penalized form of Lagrangian PPO. - ppo_kwargs = dict( - reward_penalized=False, - objective_penalized=True, - learn_penalty=True, - penalty_param_loss=True - ) - agent = PPOAgent(**ppo_kwargs) - run_polopt_agent(agent=agent, **kwargs) - -def saute_ppo_lagrangian(**kwargs): - """Set up to run Saute PPO Lagrangian.""" - # Objective-penalized form of Lagrangian PPO. - ppo_kwargs = dict( - reward_penalized=False, - objective_penalized=True, - learn_penalty=True, - penalty_param_loss=True - ) - agent = PPOAgent(**ppo_kwargs) - kwargs['saute_lagrangian'] = True - run_polopt_agent(agent=agent, **kwargs) diff --git a/SAUTE/tf_algos/safety_starter_agents/tf_sac.py b/SAUTE/tf_algos/safety_starter_agents/tf_sac.py deleted file mode 100644 index 72990fe6..00000000 --- a/SAUTE/tf_algos/safety_starter_agents/tf_sac.py +++ /dev/null @@ -1,661 +0,0 @@ -""" -SAC Agent and the run file. Copied from safety starter agents with minor modifications -""" -from typing import Callable -import numpy as np -import tensorflow as tf -import time -from safe_rl.utils.logx import EpochLogger -from safe_rl.utils.mpi_tf import sync_all_params, MpiAdamOptimizer -from safe_rl.utils.mpi_tools import mpi_fork, mpi_sum, proc_id, num_procs -from .sac_utils import mlp_actor, mlp_var, mlp_critic, placeholders, ReplayBuffer, count_vars, get_vars, get_target_update -from scipy.stats import norm -from collections import deque -import pandas as pd -import os - - -sac_cfg = dict( - log=True, - log_updates=True, - seed=42, - steps_per_epoch=1000, - epochs=1000, - replay_size=int(1e6), - discount_factor=0.99, - safety_discount_factor=0.99, - safety_budget=1.0, - tau=0.005, - lr=5e-4, - alpha_lr=5e-4, - penalty_lr=5e-2, - batch_size=1024, - local_start_steps=int(1e3), - # max_ep_len=0, # removed as it should be taken from env_cfg - checkpoint_frequency=10, - n_test_episodes=100, - n_train_episodes=100, - local_update_after=int(1e3), - train_frequency=1, - render=False, - fixed_entropy_bonus=None, - target_entropy=-1.0, - initial_alpha=0.0, - fixed_cost_penalty=None, - cost_constraint=None, - use_mean_constraints=False, - reward_scale=1, - use_cvar_constraints=False, - damp_scale=0.0, -) - -def vanilla_sac( - **kwargs -): - """Run Vanilla SAC.""" - - kwargs['fixed_cost_penalty'] = None - kwargs['cost_constraint'] = None - kwargs['safety_budget'] = None - kwargs['saute_constraints'] = False - kwargs['use_cvar_constraints'] = False - run_sac_algo(**kwargs) - - -def lagrangian_sac( - **kwargs -): - """Run SAC Lagrangian.""" - assert kwargs['safety_budget'] is not None - kwargs['fixed_cost_penalty'] = None - kwargs['cost_constraint'] = None - kwargs['saute_constraints'] = False - kwargs['use_cvar_constraints'] = False - kwargs['use_mean_constraints'] = True - run_sac_algo(**kwargs) - - -def saute_sac( - **kwargs -): - """Run Saute SAC.""" - assert kwargs['safety_budget'] is not None - kwargs['fixed_cost_penalty'] = None - kwargs['cost_constraint'] = None - kwargs['saute_constraints'] = True - kwargs['use_cvar_constraints'] = False - kwargs['use_mean_constraints'] = False - 
run_sac_algo(**kwargs) - -def wc_sac( - **kwargs -): - """Run Worst Case SAC from https://github.com/AlgTUDelft/WCSAC, which is based on safety starter agents https://github.com/openai/safety-starter-agents. """ - assert kwargs['safety_budget'] is not None and kwargs['safety_budget'] > 0 - kwargs['fixed_cost_penalty'] = None - kwargs['cost_constraint'] = None - kwargs['saute_constraints'] = False - kwargs['use_cvar_constraints'] = True - kwargs['use_mean_constraints'] = False - raise NotImplementedError("Due to licencing issues we cannot release this part, but modifications from https://github.com/AlgTUDelft/WCSAC can be easily adapted.") - -def saute_lagrangian_sac( - **kwargs -): - """Set up to run Saute SAC Lagrangian.""" - - assert kwargs['safety_budget'] is not None - kwargs['fixed_cost_penalty'] = None - kwargs['cost_constraint'] = None - kwargs['saute_constraints'] = True - kwargs['use_cvar_constraints'] = False - kwargs['use_mean_constraints'] = True - run_sac_algo(**kwargs) - -def run_sac_algo( - env_name:str="", - log:bool=True, - log_updates:bool=True, - train_env_fn:Callable=None, - test_env_fn:Callable=None, - actor_fn=mlp_actor, - critic_fn=mlp_critic, - var_fn=mlp_var, - ac_kwargs=dict(), - seed=42, - steps_per_epoch=1000, - epochs=100, - replay_size=int(1e6), - discount_factor=0.99, - safety_discount_factor=0.99, - tau=0.005, - lr=1e-4, - alpha_lr=1e-4, - penalty_lr=1e-5, - batch_size=1024, - local_start_steps=int(1e3), - max_ep_len=0, - logger_kwargs=dict(), - checkpoint_frequency=10, - n_test_episodes=10, - n_train_episodes=10, - local_update_after=int(1e3), - train_frequency=1, - render=False, - fixed_entropy_bonus=None, - target_entropy=-1.0, - initial_alpha=0.0, - fixed_cost_penalty=None, - cost_constraint=None, - saute_constraints:bool=False, - use_mean_constraints:bool=False, - safety_budget=None, - reward_scale=1, - writer=None, - use_cvar_constraints=False, # removed from repo - damp_scale=0.0, # removed from repo - ): - """ - - Args: - env_fn : A function which creates a copy of the environment. - The environment must satisfy the OpenAI Gym API. - - actor_fn: A function which takes in placeholder symbols - for state, ``x_ph``, and action, ``a_ph``, and returns the actor - outputs from the agent's Tensorflow computation graph: - - =========== ================ ====================================== - Symbol Shape Description - =========== ================ ====================================== - ``mu`` (batch, act_dim) | Computes mean actions from policy - | given states. - ``pi`` (batch, act_dim) | Samples actions from policy given - | states. - ``logp_pi`` (batch,) | Gives log probability, according to - | the policy, of the action sampled by - | ``pi``. Critical: must be differentiable - | with respect to policy parameters all - | the way through action sampling. - =========== ================ ====================================== - - critic_fn: A function which takes in placeholder symbols - for state, ``x_ph``, action, ``a_ph``, and policy ``pi``, - and returns the critic outputs from the agent's Tensorflow computation graph: - - =========== ================ ====================================== - Symbol Shape Description - =========== ================ ====================================== - ``critic`` (batch,) | Gives one estimate of Q* for - | states in ``x_ph`` and actions in - | ``a_ph``. - ``critic_pi`` (batch,) | Gives another estimate of Q* for - | states in ``x_ph`` and actions in - | ``a_ph``. 
- =========== ================ ====================================== - - ac_kwargs (dict): Any kwargs appropriate for the actor_fn / critic_fn - function you provided to SAC. - - seed (int): Seed for random number generators. - - steps_per_epoch (int): Number of steps of interaction (state-action pairs) - for the agent and the environment in each epoch. - - epochs (int): Number of epochs to run and train agent. - - replay_size (int): Maximum length of replay buffer. - - gamma (float): Discount factor. (Always between 0 and 1.) - - tau (float): Interpolation factor in tau averaging for target - networks. Target networks are updated towards main networks - according to: - - .. math:: \\theta_{\\text{targ}} \\leftarrow - \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - - where :math:`\\rho` is polyak=1-tau. (Always between 0 and 1, usually - close to 1.) - - lr (float): Learning rate (used for both policy and value learning). - - batch_size (int): Minibatch size for SGD. - - local_start_steps (int): Number of steps for uniform-random action selection, - before running real policy. Helps exploration. - - max_ep_len (int): Maximum length of trajectory / episode / rollout. - - logger_kwargs (dict): Keyword args for EpochLogger. - - checkpoint_frequency (int): How often (in terms of gap between epochs) to save - the current policy and value function. - - fixed_entropy_bonus (float or None): Fixed bonus to reward for entropy. - Units are (points of discounted sum of future reward) / (nats of policy entropy). - If None, use ``entropy_constraint`` to set bonus value instead. - - entropy_constraint (float): If ``fixed_entropy_bonus`` is None, - Adjust entropy bonus to maintain at least this much entropy. - Actual constraint value is multiplied by the dimensions of the action space. - Units are (nats of policy entropy) / (action dimenson). - - fixed_cost_penalty (float or None): Fixed penalty to reward for cost. - Units are (points of discounted sum of future reward) / (points of discounted sum of future costs). - If None, use ``cost_constraint`` to set penalty value instead. - - cost_constraint (float or None): If ``fixed_cost_penalty`` is None, - Adjust cost penalty to maintain at most this much cost. - Units are (points of discounted sum of future costs). - Note: to get an approximate cost_constraint from a cost_lim (undiscounted sum of costs), - multiply cost_lim by (1 - gamma ** episode_len) / (1 - gamma). - If None, use cost_lim to calculate constraint. - - cost_lim (float or None): If ``cost_constraint`` is None, - calculate an approximate constraint cost from this cost limit. - Units are (expectation of undiscounted sum of costs in a single episode). - If None, cost_lim is not used, and if no cost constraints are used, do naive optimization. - """ - assert 0 <= discount_factor <= 1 - assert 0 <= safety_discount_factor <= 1 - use_costs = fixed_cost_penalty or cost_constraint or use_mean_constraints - polyak = 1 - tau - logger = EpochLogger(**logger_kwargs) - logger.save_config(locals()) - - # Env instantiation - env, test_env = train_env_fn(), test_env_fn() - observation_space = env.observation_space - obs_dim = observation_space.shape[0] - act_dim = env.action_space.shape[0] - - # Setting seeds - tf.set_random_seed(seed) - np.random.seed(seed) - env.seed(seed) - test_env.seed(seed) - - # Action limit for clamping: critically, assumes all dimensions share the same bound! 
- act_limit = env.action_space.high[0] - - # Share information about action space with policy architecture - ac_kwargs['action_space'] = env.action_space - - # Inputs to computation graph - x_ph, a_ph, x2_ph, r_ph, d_ph, c_ph = placeholders(obs_dim, act_dim, obs_dim, None, None, None) - - # Main outputs from computation graph - with tf.variable_scope('main'): - mu, pi, logp_pi = actor_fn(x_ph, a_ph, **ac_kwargs) - qr1, qr1_pi = critic_fn(x_ph, a_ph, pi, name='qr1', **ac_kwargs) - qr2, qr2_pi = critic_fn(x_ph, a_ph, pi, name='qr2', **ac_kwargs) - qc, qc_pi = critic_fn(x_ph, a_ph, pi, name='qc', **ac_kwargs) - - with tf.variable_scope('main', reuse=True): - # Additional policy output from a different observation placeholder - # This lets us do separate optimization updates (actor, critics, etc) - # in a single tensorflow op. - _, pi2, logp_pi2 = actor_fn(x2_ph, a_ph, **ac_kwargs) - - # Target value network - with tf.variable_scope('target'): - _, qr1_pi_targ = critic_fn(x2_ph, a_ph, pi2, name='qr1', **ac_kwargs) - _, qr2_pi_targ = critic_fn(x2_ph, a_ph, pi2, name='qr2', **ac_kwargs) - _, qc_pi_targ = critic_fn(x2_ph, a_ph, pi2, name='qc', **ac_kwargs) - - - # Entropy bonus - if fixed_entropy_bonus is None: - with tf.variable_scope('entreg'): - soft_alpha = tf.get_variable('soft_alpha', - initializer=initial_alpha, - trainable=True, - dtype=tf.float32) - alpha = tf.nn.softplus(soft_alpha) - else: - alpha = tf.constant(fixed_entropy_bonus) - # log_alpha = tf.log(alpha) # original - log_alpha = tf.log(tf.clip_by_value(alpha, 1e-8, 1e8)) # clipping - - # Cost penalty - if use_costs: - if fixed_cost_penalty is None: - with tf.variable_scope('costpen'): - soft_beta = tf.get_variable('soft_beta', - initializer=0.0, - trainable=True, - dtype=tf.float32) - beta = tf.nn.softplus(soft_beta) - else: - beta = tf.constant(fixed_cost_penalty) - # log_beta = tf.log(beta) # original - log_beta = tf.log(tf.clip_by_value(beta, 1e-8, 1e8)) # clipping beta - else: - beta = 0.0 # costs do not contribute to policy optimization - print('Not using costs') - - # Experience buffer - replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) - - # Count variables - if proc_id()==0: - var_counts = tuple(count_vars(scope) for scope in - ['main/pi', 'main/qr1', 'main/qr2', 'main/qc', 'main']) - print(('\nNumber of parameters: \t pi: %d, \t qr1: %d, \t qr2: %d, \t qc: %d, \t total: %d\n')%var_counts) - - cost_normalizer = 1 - if use_costs: - if cost_constraint is None: - # Convert assuming equal cost accumulated each step - # Note this isn't the case, since the early in episode doesn't usually have cost, - # but since our algorithm optimizes the discounted infinite horizon from each entry - # in the replay buffer, we should be approximately correct here. - # It's worth checking empirical total undiscounted costs to see if they match. 
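            # Worked example of the normalisation computed below (illustrative numbers
            # only, not values from the repository): with safety_budget = 25,
            # safety_discount_factor = 0.99 and max_ep_len = 200, cost_normalizer becomes
            #   25 * (1 - 0.99**200) / (1 - 0.99) / 200  ≈  25 * 0.433  ≈  10.8,
            # so per-step costs are divided by roughly 10.8 and the normalised
            # cost_constraint enforced on qc is 1.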
- if np.abs(safety_budget) >= 1e-6: - cost_constraint = 1 - cost_normalizer = np.abs(safety_budget) - if 0 <= safety_discount_factor < 1.0: - cost_normalizer *= (1 - safety_discount_factor ** max_ep_len) / (1 - safety_discount_factor) / max_ep_len - else: - cost_normalizer = 1 - cost_constraint = safety_budget - if 0 <= safety_discount_factor < 1.0: - cost_constraint *= (1 - safety_discount_factor ** max_ep_len) / (1 - safety_discount_factor) / max_ep_len - print('using cost constraint', cost_constraint) - # Min Double-Q: - min_q_pi = tf.minimum(qr1_pi, qr2_pi) - min_q_pi_targ = tf.minimum(qr1_pi_targ, qr2_pi_targ) - - # Targets for Q and V regression with normalized costs - q_backup = tf.stop_gradient(r_ph + discount_factor*(1-d_ph)*(min_q_pi_targ - alpha * logp_pi2)) - qc_backup = tf.stop_gradient(c_ph / cost_normalizer + safety_discount_factor*(1-d_ph)*qc_pi_targ) - damp = 0 - if use_costs: - violation = tf.reduce_mean(cost_constraint - qc) # normalized costs - - - # Soft actor-critic losses - qr1_loss = 0.5 * tf.reduce_mean((q_backup - qr1)**2) - qr2_loss = 0.5 * tf.reduce_mean((q_backup - qr2)**2) - qc_loss = 0.5 * tf.reduce_mean((qc_backup - qc)**2) - q_loss = qr1_loss + qr2_loss + qc_loss - pi_loss = tf.reduce_mean(alpha * logp_pi - min_q_pi + (beta - damp) * qc_pi) - - # Loss for alpha - target_entropy *= act_dim - pi_entropy = -tf.reduce_mean(logp_pi) - # alpha_loss = - soft_alpha * (entropy_constraint - pi_entropy) - alpha_loss = - alpha * (target_entropy - pi_entropy) - print('using entropy constraint', target_entropy) - - # Loss for beta - if use_costs: - beta_loss = beta * (cost_constraint - qc) # normalized costs - - # Policy train op - # (has to be separate from value train op, because qr1_pi appears in pi_loss) - train_pi_op = MpiAdamOptimizer(learning_rate=lr).minimize(pi_loss, var_list=get_vars('main/pi'), name='train_pi') - - # Value train op - with tf.control_dependencies([train_pi_op]): - train_q_op = MpiAdamOptimizer(learning_rate=lr).minimize(q_loss, var_list=get_vars('main/q'), name='train_q') - - if fixed_entropy_bonus is None: - entreg_optimizer = MpiAdamOptimizer(learning_rate=alpha_lr) - with tf.control_dependencies([train_q_op]): - train_entreg_op = entreg_optimizer.minimize(alpha_loss, var_list=get_vars('entreg')) - - if use_costs and fixed_cost_penalty is None: - costpen_optimizer = MpiAdamOptimizer(learning_rate=penalty_lr) - with tf.control_dependencies([train_entreg_op]): - train_costpen_op = costpen_optimizer.minimize(beta_loss, var_list=get_vars('costpen')) - - # Polyak averaging for target variables - target_update = get_target_update('main', 'target', polyak) - - # Single monolithic update with explicit control dependencies - with tf.control_dependencies([train_pi_op]): - with tf.control_dependencies([train_q_op]): - grouped_update = tf.group([target_update]) - - if fixed_entropy_bonus is None: - grouped_update = tf.group([grouped_update, train_entreg_op]) - if use_costs and fixed_cost_penalty is None: - grouped_update = tf.group([grouped_update, train_costpen_op]) - - # Initializing targets to match main variables - # As a shortcut, use our exponential moving average update w/ coefficient zero - target_init = get_target_update('main', 'target', 0.0) - - sess = tf.Session() - sess.run(tf.global_variables_initializer()) - sess.run(target_init) - # Sync params across processes - sess.run(sync_all_params()) - - # Setup model saving - logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, - outputs={'mu': mu, 'pi': pi, 'qr1': qr1, 'qr2': qr2, 'qc': 
qc}) - - def get_action(o, deterministic=False): - act_op = mu if deterministic else pi - return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0] - - def test_agent(n=10): - for j in range(n): - o, r, d, ep_ret, ep_cost, ep_len, ep_goals, = test_env.reset(), 0, False, 0, 0, 0, 0 - if saute_constraints: - true_ep_ret = 0 - while not(d or (ep_len == max_ep_len)): - # Take deterministic actions at test time - o, r, d, info = test_env.step(get_action(o, True)) - if render and proc_id() == 0 and j == 0: - test_env.render() - ep_ret += r - ep_cost += info.get('cost', 0) - ep_len += 1 - ep_goals += 1 if info.get('goal_met', False) else 0 - if saute_constraints: - true_ep_ret += info['true_reward'] - if saute_constraints: - logger.store(TestEpRet=true_ep_ret, TestEpCost=ep_cost, TestEpLen=ep_len, TestEpGoals=ep_goals) - else: - logger.store(TestEpRet=ep_ret, TestEpCost=ep_cost, TestEpLen=ep_len, TestEpGoals=ep_goals) - - start_time = time.time() - o, r, d, ep_ret, ep_cost, ep_len, ep_goals = env.reset(), 0, False, 0, 0, 0, 0 - if saute_constraints: - true_ep_ret = 0 - total_steps = steps_per_epoch * epochs - - # variables to measure in an update - vars_to_get = dict(LossPi=pi_loss, LossQR1=qr1_loss, LossQR2=qr2_loss, LossQC=qc_loss, - QR1Vals=qr1, QR2Vals=qr2, QCVals=qc, LogPi=logp_pi, PiEntropy=pi_entropy, - Alpha=alpha, LogAlpha=log_alpha, LossAlpha=alpha_loss) - if use_costs: - vars_to_get.update(dict(Beta=beta, LogBeta=log_beta, LossBeta=beta_loss, Violation=violation)) - - print('starting training', proc_id()) - - # Main loop: collect experience in env and update/log each epoch - local_steps = 0 - cum_cost = 0 - local_steps_per_epoch = steps_per_epoch // num_procs() - local_batch_size = batch_size // num_procs() - epoch_start_time = time.time() - training_rewards = deque([0], maxlen=n_train_episodes) - training_costs = deque([0], maxlen=n_train_episodes) - df = pd.DataFrame() - assert max_ep_len <= local_steps_per_epoch, "Episode length should be smaller or equal to local steps per epoch" - - for t in range(total_steps // num_procs()): - """ - Until local_start_steps have elapsed, randomly sample actions - from a uniform distribution for better exploration. Afterwards, - use the learned policy. - """ - if t > local_start_steps: - a = get_action(o) - else: - a = env.action_space.sample() - - # Step the env - o2, r, d, info = env.step(a) - r *= reward_scale # yee-haw - c = info.get('cost', 0) - - # Track cumulative cost over training - cum_cost += c - - ep_ret += r - ep_cost += c - ep_len += 1 - ep_goals += 1 if info.get('goal_met', False) else 0 - local_steps += 1 - if saute_constraints: - true_ep_ret += info['true_reward'] - # Ignore the "done" signal if it comes from hitting the time - # horizon (that is, when it's an artificial terminal signal - # that isn't based on the agent's state) - d = False if ep_len==max_ep_len else d - - # Store experience to replay buffer - if saute_constraints and use_mean_constraints: - replay_buffer.store(o, a, info['true_reward'], o2, d, c) - else: - replay_buffer.store(o, a, r, o2, d, c) - # Super critical, easy to overlook step: make sure to update - # most recent observation! 
- o = o2 - - if d or (ep_len == max_ep_len): - if saute_constraints: - logger.store(EpRet=true_ep_ret, EpCost=ep_cost, EpLen=ep_len, EpGoals=ep_goals) - training_rewards.extend([true_ep_ret]) - true_ep_ret = 0 - else: - logger.store(EpRet=ep_ret, EpCost=ep_cost, EpLen=ep_len, EpGoals=ep_goals) - training_rewards.extend([ep_ret]) - training_costs.extend([ep_cost]) - o, r, d, ep_ret, ep_cost, ep_len, ep_goals = env.reset(), 0, False, 0, 0, 0, 0 - - if t > 0 and t % train_frequency == 0: - for j in range(train_frequency): - batch = replay_buffer.sample_batch(local_batch_size) - feed_dict = {x_ph: batch['obs1'], - x2_ph: batch['obs2'], - a_ph: batch['acts'], - r_ph: batch['rews'], - c_ph: batch['costs'], - d_ph: batch['done'], - } - if t < local_update_after: - logger.store(**sess.run(vars_to_get, feed_dict)) - else: - values, _ = sess.run([vars_to_get, grouped_update], feed_dict) - logger.store(**values) - - - # End of epoch wrap-up - if t > 0 and t % local_steps_per_epoch == 0: - epoch = t // local_steps_per_epoch - - #=====================================================================# - # Cumulative cost calculations # - #=====================================================================# - - cumulative_cost = mpi_sum(cum_cost) - cost_rate = cumulative_cost / ((epoch+1)*steps_per_epoch) * max_ep_len # cost rate per episode - - # Save model - if (checkpoint_frequency and (epoch % checkpoint_frequency == 0)) or (epoch == epochs-1): - logger.save_state({'env': env}, epoch) - df = df.append(pd.DataFrame({ - "episode_return": training_rewards, - "episode_cost": training_costs, - "accumulated_cost": cumulative_cost, - "cost_rate": cost_rate, - "epoch": epoch, - "run": np.arange(len(training_rewards)) - })) - df.to_csv(os.path.join(logger.output_dir, "train_results.csv")) - - # Test the performance of the deterministic version of the agent. 
- test_start_time = time.time() - # test_agent(n=10) # minimal testing during training - logger.store(TestTime=time.time() - test_start_time) - - logger.store(EpochTime=time.time() - epoch_start_time) - epoch_start_time = time.time() - - # Log info about epoch - logger.log_tabular('Epoch', epoch) - logger.log_tabular('EpRet', with_min_and_max=True) - logger.log_tabular('EpCost', with_min_and_max=True) - logger.log_tabular('EpLen', average_only=True) - logger.log_tabular('EpGoals', average_only=True) - logger.log_tabular('TotalEnvInteracts', mpi_sum(local_steps)) - logger.log_tabular('QR1Vals', with_min_and_max=True) - logger.log_tabular('QR2Vals', with_min_and_max=True) - logger.log_tabular('QCVals', with_min_and_max=True) - logger.log_tabular('LogPi', with_min_and_max=True) - logger.log_tabular('LossPi', average_only=True) - logger.log_tabular('LossQR1', average_only=True) - logger.log_tabular('LossQR2', average_only=True) - logger.log_tabular('LossQC', average_only=True) - logger.log_tabular('LossAlpha', average_only=True) - logger.log_tabular('LogAlpha', average_only=True) - logger.log_tabular('Alpha', average_only=True) - logger.log_tabular('CostRate', cost_rate) - logger.log_tabular('CumulativeCost', cumulative_cost) - - if use_costs: - logger.log_tabular('LossBeta', average_only=True) - logger.log_tabular('LogBeta', average_only=True) - logger.log_tabular('Beta', average_only=True) - logger.log_tabular('Violation', average_only=True) - - - logger.log_tabular('PiEntropy', average_only=True) - # logger.log_tabular('TestTime', average_only=True) - logger.log_tabular('EpochTime', average_only=True) - logger.log_tabular('TotalTime', time.time()-start_time) - if writer is not None: - # optimization infos - writer.add_scalar('train_info/LossPi', logger.log_current_row['LossPi'], epoch) - writer.add_scalar('train_info/LossQC', logger.log_current_row['LossQC'], epoch) - writer.add_scalar('train_info/LossQR1', logger.log_current_row['LossQR1'], epoch) - writer.add_scalar('train_info/LossQR2', logger.log_current_row['LossQR2'], epoch) - writer.add_scalar('train_info/LossAlpha', logger.log_current_row['LossAlpha'], epoch) - writer.add_scalar('train_info/Alpha', logger.log_current_row['Alpha'], epoch) - writer.add_scalar('train_info/std_QR1Vals', logger.log_current_row['StdQR1Vals'], epoch) - writer.add_scalar('train_info/mean_QR1Vals', logger.log_current_row['AverageQR1Vals'], epoch) - writer.add_scalar('train_info/std_QR2Vals', logger.log_current_row['StdQR2Vals'], epoch) - writer.add_scalar('train_info/mean_QR2Vals', logger.log_current_row['AverageQR2Vals'], epoch) - if use_costs: - writer.add_scalar('train_info/std_QCVals', logger.log_current_row['StdQCVals'], epoch) - writer.add_scalar('train_info/mean_QCVals', logger.log_current_row['AverageQCVals'], epoch) - writer.add_scalar('train_info/LossBeta', logger.log_current_row['LossBeta'], epoch) - writer.add_scalar('train_info/Beta', logger.log_current_row['Beta'], epoch) - - # training costs - # episode return - writer.add_scalar('train_return/StdEpRet', logger.log_current_row['StdEpRet'], epoch) - writer.add_scalar('train_return/AverageEpRet', logger.log_current_row['AverageEpRet'], epoch) - writer.add_scalar('train_return/MaxEpRet', logger.log_current_row['MaxEpRet'], epoch) - writer.add_scalar('train_return/MinEpRet', logger.log_current_row['MinEpRet'], epoch) - # accumulative cost - writer.add_scalar('train_acc_cost/CumulativeCost', logger.log_current_row['CumulativeCost'], epoch) - writer.add_scalar('train_acc_cost/CostRate', 
logger.log_current_row['CostRate'], epoch) - # episode cost - if use_costs: - writer.add_scalar('train_cost/violation', logger.log_current_row['Violation'], epoch) - writer.add_scalar('train_cost/StdEpCost', logger.log_current_row['StdEpCost'], epoch) - writer.add_scalar('train_cost/AverageEpCost', logger.log_current_row['AverageEpCost'], epoch) - writer.add_scalar('train_cost/MaxEpCost', logger.log_current_row['MaxEpCost'], epoch) - writer.add_scalar('train_cost/MinEpCost', logger.log_current_row['MinEpCost'], epoch) - logger.dump_tabular() - sess.close() - tf.reset_default_graph() - diff --git a/SAUTE/tf_algos/safety_starter_agents/tf_trpo.py b/SAUTE/tf_algos/safety_starter_agents/tf_trpo.py deleted file mode 100644 index 77ded769..00000000 --- a/SAUTE/tf_algos/safety_starter_agents/tf_trpo.py +++ /dev/null @@ -1,72 +0,0 @@ -from safe_rl.pg.agents import TRPOAgent -from tf_algos.safety_starter_agents.run_agents import run_polopt_agent - - -def trpo(**kwargs): - """Run Vanilla TRPO Lagrangian.""" - trpo_kwargs = dict( - reward_penalized=False, - objective_penalized=False, - learn_penalty=False, - penalty_param_loss=False # Irrelevant in unconstrained - ) - agent = TRPOAgent(**trpo_kwargs) - run_polopt_agent(agent=agent, **kwargs) - - -def saute_trpo(**kwargs): - """Run Saute TRPO.""" - trpo_kwargs = dict( - reward_penalized=False, - objective_penalized=False, - learn_penalty=False, - penalty_param_loss=False # Irrelevant in unconstrained - ) - agent = TRPOAgent(**trpo_kwargs) - kwargs['saute_constraints'] = True - run_polopt_agent(agent=agent, **kwargs) - - -def trpo_lagrangian(**kwargs): - """Run TRPO Lagrangian.""" - # Objective-penalized form of Lagrangian TRPO. - trpo_kwargs = dict( - reward_penalized=False, - objective_penalized=True, - learn_penalty=True, - penalty_param_loss=True, - backtrack_iters=kwargs['backtrack_iters'] - ) - agent = TRPOAgent(**trpo_kwargs) - run_polopt_agent(agent=agent, **kwargs) - - -def trpo_cvar(**kwargs): - """ - Set up to run TRPO Lagrangian - """ - # Objective-penalized form of Lagrangian TRPO. - trpo_kwargs = dict( - reward_penalized=False, - objective_penalized=True, - learn_penalty=True, - penalty_param_loss=True, - backtrack_iters=kwargs['backtrack_iters'] - ) - agent = TRPOAgent(**trpo_kwargs) - kwargs['CVaR'] = True - run_polopt_agent(agent=agent, **kwargs) - - -def saute_trpo_lagrangian(**kwargs): - """Run Saute TRPO Lagrangian""" - # Objective-penalized form of Lagrangian PPO. - trpo_kwargs = dict( - reward_penalized=False, - objective_penalized=True, - learn_penalty=True, - penalty_param_loss=True - ) - agent = TRPOAgent(**trpo_kwargs) - kwargs['saute_lagrangian'] = True - run_polopt_agent(agent=agent, **kwargs) \ No newline at end of file diff --git a/SIMMER/.gitignore b/SIMMER/.gitignore new file mode 100644 index 00000000..be1e1cc0 --- /dev/null +++ b/SIMMER/.gitignore @@ -0,0 +1,162 @@ +logs/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/SIMMER/README.md b/SIMMER/README.md index ad073c65..87b59350 100644 --- a/SIMMER/README.md +++ b/SIMMER/README.md @@ -1,7 +1,20 @@ -# Enhancing Safe Exploration Using Safety State Augmentation +# Saut\'e and Simmer {RL}: Safe Reinforcement Learning Using Safety State Augmentation -## About +### Sauté RL: Almost Surely Safe RL Using State Augmentation +Satisfying safety constraints almost surely (or with probability one) can be critical for deployment of Reinforcement +Learning (RL) in real-life applications. For example, plane landing and take-off should ideally occur with probability +one. 
We address the problem by introducing Safety Augmented (Saute) Markov Decision Processes (MDPs), where the safety
+constraints are eliminated by augmenting them into the state-space and reshaping the objective. We show that the Saute MDP
+satisfies the Bellman equation and moves us closer to solving Safe RL with constraints satisfied almost surely. We argue
+that the Saute MDP allows us to view the Safe RL problem from a different perspective, enabling new features. For instance, our
+approach has a plug-and-play nature, i.e., any RL algorithm can be "sauteed". Additionally, state augmentation allows
+for policy generalization across safety constraints. Finally, we show that Saute RL algorithms can outperform their
+state-of-the-art counterparts when constraint satisfaction is of high importance.
+
+
+
+### Effects of Safety State Augmentation on Safe Exploration
 Safe exploration is a challenging and important problem in model-free reinforcement learning (RL). Often the safety
 cost is sparse and unknown, which unavoidably leads to constraint violations -- a phenomenon ideally to be avoided in
 safety-critical applications. We tackle this problem by augmenting the state-space with a safety state, which is
@@ -124,7 +137,7 @@ If you find our code useful please cite the following papers
 }
 
 @article{sootla2022simmer,
-  title = {Enhancing Safe Exploration Using Safety State Augmentation},
+  title = {Effects of Safety State Augmentation on Safe Exploration},
   author = {Sootla, Aivar and Cowen-Rivers, Alexander I. and Wang, Jun and Bou-Ammar, Haitham},
   journal={arXiv preprint arXiv:2206.02675},
   year={2022}
@@ -135,8 +148,8 @@ or
 ```
 @misc{sootla_saute_2022_git,
-  title={Saut\'e {RL}: Almost Surely Safe Reinforcement Learning Using State Augmentation},
-  url = {https://github.com/huawei-noah/HEBO/tree/master/SAUTE},
+  title={Saut\'e and Simmer {RL}: Safe Reinforcement Learning Using Safety State Augmentation},
+  url = {https://github.com/huawei-noah/HEBO/tree/master/SIMMER},
   year = {2022},
   author = {Sootla, Aivar and Cowen-Rivers, Alexander I.
and Jafferjee, Taher and Wang, Ziyan}, } diff --git a/SIMMER/exps/single_pendulum/key_observation_cfg.py b/SIMMER/exps/single_pendulum/key_observation_cfg.py new file mode 100644 index 00000000..df169cb3 --- /dev/null +++ b/SIMMER/exps/single_pendulum/key_observation_cfg.py @@ -0,0 +1,37 @@ +#cfg +cfg = dict( + experiment_name='plug_n_play', + agents=['SauteSAC'], + agent_cfg_overrides=dict( + env_name="Pendulum", # a necessary override + discount_factor=0.99, # a necessary override + steps_per_epoch=200, + epochs=200, + checkpoint_frequency=0, + penalty_lr=5e-2, + n_test_episodes=100, + ), + env_cfg_overrides=dict(), + param_sweep_list=[], + safety_budgets=[ + ['agent_cfg_overrides', 'safety_budget', 10.0], + ['agent_cfg_overrides', 'safety_budget', 20.0], + ['agent_cfg_overrides', 'safety_budget', 30.0], + ['agent_cfg_overrides', 'safety_budget', 40.0], + ['agent_cfg_overrides', 'safety_budget', 50.0], + ], + safety_discount_factors=[ + ['agent_cfg_overrides', 'safety_discount_factor', 0.99], + ], + seeds=[ + ['agent_cfg_overrides', 'seed', 42], + ['agent_cfg_overrides', 'seed', 4242], + ['agent_cfg_overrides', 'seed', 424242], + ['agent_cfg_overrides', 'seed', 42424242], + ['agent_cfg_overrides', 'seed', 4242424242], + ], + train=True, + test=False, + data_filename="test_results.csv", + num_exps=5, +) \ No newline at end of file diff --git a/SIMMER/main.py b/SIMMER/main.py index fee98222..2e610630 100644 --- a/SIMMER/main.py +++ b/SIMMER/main.py @@ -28,6 +28,9 @@ elif current_experiment == 15: from exps.single_pendulum.q_simmer_cfg import cfg run_simmer(**cfg, smoketest=args.smoketest) + elif current_experiment == 16: + from exps.single_pendulum.key_observation_cfg import cfg + run_saute(**cfg, smoketest=args.smoketest) ### Double Pendulum elif current_experiment == 20: from exps.double_pendulum.trpo_cfg import cfg diff --git a/SIMMER/tf_algos/safety_starter_agents/tf_sac.py b/SIMMER/tf_algos/safety_starter_agents/tf_sac.py index cd89757b..c49c5238 100644 --- a/SIMMER/tf_algos/safety_starter_agents/tf_sac.py +++ b/SIMMER/tf_algos/safety_starter_agents/tf_sac.py @@ -490,11 +490,12 @@ def test_agent(n=10): local_steps_per_epoch = steps_per_epoch // num_procs() local_batch_size = batch_size // num_procs() epoch_start_time = time.time() - training_rewards = deque([0], maxlen=n_train_episodes) - training_costs = deque([0], maxlen=n_train_episodes) df = pd.DataFrame() assert max_ep_len <= local_steps_per_epoch, "Episode length should be smaller or equal to local steps per epoch" + training_rewards = deque([0], maxlen= steps_per_epoch // max_ep_len) + training_costs = deque([0], maxlen= steps_per_epoch // max_ep_len) + for t in range(total_steps // num_procs()): """ Until local_start_steps have elapsed, randomly sample actions @@ -540,7 +541,7 @@ def test_agent(n=10): else: logger.store(EpRet=ep_ret, EpCost=ep_cost, EpLen=ep_len, EpGoals=ep_goals) training_rewards.extend([ep_ret]) - training_costs.extend([ep_cost]) + training_costs.extend([ep_cost]) o, r, d, ep_ret, ep_cost, ep_len, ep_goals = env.reset(), 0, False, 0, 0, 0, 0 if t > 0 and t % train_frequency == 0: @@ -574,15 +575,26 @@ def test_agent(n=10): # Save model if (checkpoint_frequency and (epoch % checkpoint_frequency == 0)) or (epoch == epochs-1): logger.save_state({'env': env}, epoch) - df = df.append(pd.DataFrame({ - "episode_return": training_rewards, - "episode_cost": training_costs, - "accumulated_cost": cumulative_cost, - "cost_rate": cost_rate, - "epoch": epoch, - "run": np.arange(len(training_rewards)) - })) - 
df.to_csv(os.path.join(logger.output_dir, "train_results.csv")) + if len(training_rewards) == 1: + cur_df = pd.DataFrame({ + "episode_return": training_rewards[0], + "episode_cost": training_costs[0], + "accumulated_cost": cumulative_cost, + "cost_rate": cost_rate, + "epoch": epoch, + "run": 0 + }, index=[epoch]) + else: + cur_df = pd.DataFrame({ + "episode_return": training_rewards, + "episode_cost": training_costs, + "accumulated_cost": cumulative_cost, + "cost_rate": cost_rate, + "epoch": epoch, + "run": np.arange(len(training_rewards)) + }) + df = df.append(cur_df) + df.to_csv(os.path.join(logger.output_dir, "train_results.csv")) # Test the performance of the deterministic version of the agent. test_start_time = time.time() From c2f629d87079c98c4ee854d9d358ba73a230abf4 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Thu, 20 Oct 2022 16:35:09 +0100 Subject: [PATCH 2/2] minor --- README.md | 2 +- SIMMER/.gitignore | 3 ++- SIMMER/exps/single_pendulum/key_observation_cfg.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index eaa58b4c..ed013a6c 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Huawei, Noah's Ark Lab. - [Bayesian Optimisation with Compositional Optimisers](./CompBO) - [AntBO: Antibody Design with Combinatorial Bayesian Optimisation](./AntBO) - Reinforcement Learning Research - - [Sauté RL: and Simmer RL: Safe Reinforcement Learning Using Safety State Augmentation ](./SIMMER) + - [Sauté RL and Simmer RL: Safe Reinforcement Learning Using Safety State Augmentation ](./SIMMER) - [Model-Based Offline Reinforcement Learning with Pessimism-Modulated Dynamics Belief](./PMDB) Further instructions are provided in the README files associated to each project. diff --git a/SIMMER/.gitignore b/SIMMER/.gitignore index be1e1cc0..9d6c79b8 100644 --- a/SIMMER/.gitignore +++ b/SIMMER/.gitignore @@ -159,4 +159,5 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ \ No newline at end of file +#.idea/ + diff --git a/SIMMER/exps/single_pendulum/key_observation_cfg.py b/SIMMER/exps/single_pendulum/key_observation_cfg.py index df169cb3..ba28b388 100644 --- a/SIMMER/exps/single_pendulum/key_observation_cfg.py +++ b/SIMMER/exps/single_pendulum/key_observation_cfg.py @@ -34,4 +34,5 @@ test=False, data_filename="test_results.csv", num_exps=5, -) \ No newline at end of file +) +
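As a reading aid for this patch, the sketch below illustrates the safety-state augmentation that the SIMMER README additions describe: the remaining safety budget is tracked as an extra state variable, appended to the observation, and the objective is reshaped once the budget is spent. It is a minimal sketch, not the repository's implementation; the wrapper name, the normalisation of the safety state, and the `unsafe_reward` value are illustrative assumptions, and it presumes a Box observation space and the pre-0.26 `gym` step API used elsewhere in this patch. The only conventions it shares with the runner code above are the `info['cost']` and `info['true_reward']` keys.

```python
import gym
import numpy as np


class SauteWrapperSketch(gym.Wrapper):
    """Illustrative only: append a normalised remaining safety budget to the observation."""

    def __init__(self, env, safety_budget=10.0, safety_discount=0.99, unsafe_reward=-1.0):
        super().__init__(env)
        self.safety_budget = safety_budget      # cf. the safety_budget overrides in key_observation_cfg.py
        self.safety_discount = safety_discount  # cf. the safety_discount_factor override
        self.unsafe_reward = unsafe_reward      # assumed reward once the budget is exhausted
        low = np.append(self.env.observation_space.low, -np.inf).astype(np.float32)
        high = np.append(self.env.observation_space.high, np.inf).astype(np.float32)
        self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)
        self.z = 1.0

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        self.z = 1.0  # remaining budget, normalised to 1 at the start of every episode
        return np.append(obs, self.z).astype(np.float32)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        cost = float(info.get('cost', 0.0))
        # One possible update of the safety state; the exact normalisation is an assumption.
        self.z = (self.z - cost / self.safety_budget) / self.safety_discount
        info['true_reward'] = reward  # the key read by the sauteed runners when logging true returns
        if self.z <= 0.0:
            reward = self.unsafe_reward  # reshaped objective: only the unsafe reward is observed
        return np.append(obs, self.z).astype(np.float32), reward, done, info
```

Because the budget is part of the state, an off-the-shelf agent (SAC, TRPO, PPO) can be "sauteed" without algorithmic changes, and a single policy can be evaluated across the different `safety_budget` values swept in `key_observation_cfg.py`.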
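The SAC logging fix in `SIMMER/tf_algos/safety_starter_agents/tf_sac.py` resizes the reward and cost deques to `steps_per_epoch // max_ep_len` (the number of full-length episodes that fit in one epoch) and branches on `len(training_rewards) == 1` before building the per-epoch DataFrame. Below is a standalone illustration of the pandas behaviour that appears to motivate the branch; the numbers are invented and only the column names come from the patch.

```python
import numpy as np
import pandas as pd

# A dict of scalars alone raises "If using all scalar values, you must pass an index",
# which is presumably why the single-episode branch passes index=[epoch] explicitly.
single_episode = pd.DataFrame(
    {"episode_return": -130.5, "episode_cost": 12.0, "accumulated_cost": 12.0,
     "cost_rate": 12.0, "epoch": 0, "run": 0},
    index=[0],
)

# With at least one list-like column the index is inferred and the scalar columns broadcast,
# so the multi-episode branch can pass the deques and np.arange(...) directly.
rewards, costs = [-130.5, -95.2], [12.0, 8.0]
multi_episode = pd.DataFrame({
    "episode_return": rewards,
    "episode_cost": costs,
    "accumulated_cost": 20.0,
    "cost_rate": 10.0,
    "epoch": 1,
    "run": np.arange(len(rewards)),
})

print(single_episode)
print(multi_episode)
```

Note that `DataFrame.append`, still used in the patch to accumulate `cur_df` into `df`, is deprecated in pandas 1.4+ in favour of `pd.concat`; the sketch above therefore only demonstrates the dict-construction step.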