generate_experience_fqi.py
import os

import numpy as np
import pandas as pd

from erl_config import build_env
from trade_simulator import TradeSimulator


# Baseline behaviour policies used to generate off-policy experience.
# Action encoding (inferred from the policies below): 0 = short, 1 = flat, 2 = long.
def random_policy(state):
    return np.random.choice(3, size=state.shape[0])


def long_only_policy(state):
    return 2


def short_only_policy(state):
    return np.zeros(shape=(state.shape[0],))


def flat_only_policy(state):
    return 1


policies = {
    'random_policy': random_policy,
    'long_only_policy': long_only_policy,
    'short_only_policy': short_only_policy,
    'flat_only_policy': flat_only_policy,
}


def generate_episode(pi, env, max_steps):
    """Roll out a single episode with policy `pi` and return its transitions."""
    states = []
    actions = []
    rewards = []
    next_states = []
    absorbing_states = []
    s, _ = env.reset()
    for step in range(max_steps):
        states.append(s[0].numpy())
        a = pi(s)
        s, r, done, truncated, info = env.step(a)
        actions.append(a)
        rewards.append(r.numpy()[0])
        next_states.append(s[0].numpy())
        if done:
            absorbing_states.append(True)
            break
        else:
            absorbing_states.append(False)
    return states, actions, rewards, next_states, absorbing_states


def generate_experience(days_to_sample, policy, max_steps=360, episodes=1000, save=True, testing=False,
                        data_dir='./data/'):
    """Collect (state, action, reward, next_state, absorbing) transitions for FQI and optionally save them."""
    env_args = {
        "env_name": "TradeSimulator-v0",
        "num_envs": episodes,
        "max_step": max_steps,
        "state_dim": 8 + 2,  # factor_dim + (position, holding)
        "action_dim": 3,  # long, 0, short
        "if_discrete": True,
        "max_position": 1,
        "slippage": 7e-7,
        "num_sims": 1,
        "step_gap": 2,
        "env_class": TradeSimulator,
        'days': days_to_sample,
    }
    pi = policies[policy]
    env = build_env(TradeSimulator, env_args, -1)
    states = []
    actions = []
    rewards = []
    absorbing_state = []
    next_states = []
    for episode in range(episodes):
        # print("Episode: " + str(episode) + "; policy: " + policy)
        s, _ = env.reset()
        for step in range(max_steps):
            states.append(s[0].numpy())
            a = pi(s)
            s, r, done, truncated, info = env.step(a)
            actions.append(a)
            rewards.append(r.numpy()[0])
            next_states.append(s[0].numpy())
            if done:
                absorbing_state.append(True)
                break
            else:
                absorbing_state.append(False)
    df = pd.DataFrame({'state': states, 'action': actions, 'reward': rewards, 'next_state': next_states,
                       'absorbing_state': absorbing_state})
    print(f"Generated dataset days: {days_to_sample} ; policy: {policy}")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    if save:
        if testing:
            df.to_json(f'{data_dir}{policy}_testing_{days_to_sample}.json')
        else:
            df.to_json(f'{data_dir}{policy}_{days_to_sample}.json')
    return df
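
A minimal usage sketch, not part of the original file: it assumes `TradeSimulator` and its market data are available locally, and the day index, episode count, and step count below are placeholder values rather than settings taken from the repository. The expected type of `days_to_sample` (a single index vs. a list) depends on how `TradeSimulator` interprets its `days` argument.

# Hypothetical driver script: generate one experience dataset per baseline policy.
from generate_experience_fqi import generate_experience, policies

for policy_name in policies:
    # days_to_sample=7 is a placeholder; adjust to the days supported by TradeSimulator.
    df = generate_experience(days_to_sample=7, policy=policy_name,
                             max_steps=360, episodes=100,
                             save=True, data_dir='./data/')
    print(policy_name, len(df), "transitions collected")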