IanSullivan.py
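"""Kuhn poker solved with vanilla counterfactual regret minimization (CFR).

Two players each ante 1 chip and are dealt one card from {0, 1, 2}
(Jack, Queen, King). A player may pass ('p') or bet ('b') 1 chip; the
game ends on 'pp' (showdown for the antes), 'bb' (showdown for antes
plus bets), or 'bp' (the bettor wins the ante). Each training iteration
traverses the full game tree, accumulates per-information-set regrets,
and updates strategies by regret matching; the average strategy
converges to a Nash equilibrium.
"""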
import sys
import time
from random import shuffle

import numpy as np


class Kuhn:
    def __init__(self):
        self.nodeMap = {}                # information-set key -> Node
        self.deck = np.array([0, 1, 2])  # 0 = Jack, 1 = Queen, 2 = King
        self.n_actions = 2               # pass or bet
    def train(self, n_iterations=50000):
        expected_game_value = 0
        for _ in range(n_iterations):
            shuffle(self.deck)
            expected_game_value += self.cfr('', 1, 1)
            for _, v in self.nodeMap.items():
                v.update_strategy()
        expected_game_value /= n_iterations
        display_results(expected_game_value, self.nodeMap)
    def cfr(self, history, pr_1, pr_2):
        n = len(history)
        is_player_1 = n % 2 == 0
        player_card = self.deck[0] if is_player_1 else self.deck[1]

        if self.is_terminal(history):
            opponent_card = self.deck[1] if is_player_1 else self.deck[0]
            return self.get_reward(history, player_card, opponent_card)

        node = self.get_node(player_card, history)
        strategy = node.strategy

        # Counterfactual utility per action (negated: the recursive value
        # is from the opponent's perspective in this zero-sum game).
        action_utils = np.zeros(self.n_actions)
        for act in range(self.n_actions):
            next_history = history + node.action_dict[act]
            if is_player_1:
                action_utils[act] = -1 * self.cfr(next_history, pr_1 * strategy[act], pr_2)
            else:
                action_utils[act] = -1 * self.cfr(next_history, pr_1, pr_2 * strategy[act])

        # Utility of the information set and per-action regrets.
        util = sum(action_utils * strategy)
        regrets = action_utils - util
        if is_player_1:
            node.reach_pr += pr_1
            node.regret_sum += pr_2 * regrets
        else:
            node.reach_pr += pr_2
            node.regret_sum += pr_1 * regrets
        return util
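    # The branches above implement the standard CFR update: for player i at
    # information set I with node value v(I) = sum_a strategy[a] * v(I, a),
    # the regret sum accumulates pi_{-i} * (v(I, a) - v(I)), where pi_{-i}
    # is the opponent's reach probability (pr_2 for player 1, pr_1 for
    # player 2); reach_pr accumulates the player's own reach probability,
    # which later weights the strategy average.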
    @staticmethod
    def is_terminal(history):
        return history[-2:] in ('pp', 'bb', 'bp')
    @staticmethod
    def get_reward(history, player_card, opponent_card):
        terminal_pass = history[-1] == 'p'
        double_bet = history[-2:] == 'bb'
        if terminal_pass:
            if history[-2:] == 'pp':
                # Both players passed: showdown for the antes.
                return 1 if player_card > opponent_card else -1
            # Opponent folded to a bet: win the ante.
            return 1
        elif double_bet:
            # Bet and call: showdown for the ante plus the bet.
            return 2 if player_card > opponent_card else -2
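    # Worked examples: after 'bp' the terminal node is evaluated from
    # player 1's perspective, and player 2 folded to the bet, so the reward
    # is +1; after 'pbp' the same rule awards +1 to player 2. After 'bb' or
    # 'pbb' the showdown pays +/-2 because both the ante and the bet change
    # hands.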
    def get_node(self, card, history):
        key = str(card) + " " + history
        if key not in self.nodeMap:
            action_dict = {0: 'p', 1: 'b'}
            info_set = Node(key, action_dict)
            self.nodeMap[key] = info_set
            return info_set
        return self.nodeMap[key]
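# Example key layout: holding card 2 (the King) after an opening bet, the key
# is "2 b". Kuhn poker has 12 such information sets: 6 for player 1 (3 cards
# x histories '' and 'pb') and 6 for player 2 (3 cards x histories 'p', 'b').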
class Node:
    def __init__(self, key, action_dict, n_actions=2):
        self.key = key
        self.n_actions = n_actions
        self.regret_sum = np.zeros(self.n_actions)
        self.strategy_sum = np.zeros(self.n_actions)
        self.action_dict = action_dict
        self.strategy = np.repeat(1 / self.n_actions, self.n_actions)
        self.reach_pr = 0
        self.reach_pr_sum = 0

    def update_strategy(self):
        # Accumulate the reach-weighted strategy, then recompute the
        # current strategy from the regrets.
        self.strategy_sum += self.reach_pr * self.strategy
        self.reach_pr_sum += self.reach_pr
        self.strategy = self.get_strategy()
        self.reach_pr = 0
    def get_strategy(self):
        # Regret matching: play each action in proportion to its positive
        # regret. np.clip returns a copy, leaving regret_sum intact.
        regrets = np.clip(self.regret_sum, 0, None)
        normalizing_sum = regrets.sum()
        if normalizing_sum > 0:
            return regrets / normalizing_sum
        return np.repeat(1 / self.n_actions, self.n_actions)
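    # In formula form this is regret matching: sigma(a) = R+(a) / sum_b R+(b)
    # with R+(a) = max(regret_sum[a], 0), falling back to the uniform
    # strategy when no action has positive regret.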
    def get_average_strategy(self):
        # The average strategy (not the final one) is what converges to the
        # Nash equilibrium.
        strategy = self.strategy_sum / self.reach_pr_sum
        # Re-normalize to guard against rounding drift.
        return strategy / sum(strategy)

    def __str__(self):
        strategies = ['{:03.2f}'.format(x)
                      for x in self.get_average_strategy()]
        return '{} {}'.format(self.key.ljust(6), strategies)
def display_results(ev, i_map):
    print('player 1 expected value: {}'.format(ev))
    print('player 2 expected value: {}'.format(-1 * ev))
    print()
    print('player 1 strategies:')
    sorted_items = sorted(i_map.items(), key=lambda x: x[0])
    # Key length is 2 + len(history), so even-length keys belong to
    # player 1 and odd-length keys to player 2.
    for _, v in filter(lambda x: len(x[0]) % 2 == 0, sorted_items):
        print(v)
    print()
    print('player 2 strategies:')
    for _, v in filter(lambda x: len(x[0]) % 2 == 1, sorted_items):
        print(v)
if __name__ == "__main__":
    time1 = time.time()
    trainer = Kuhn()
    trainer.train(n_iterations=25000)
    print('training time: {:.2f}s'.format(time.time() - time1))
    # sys.getsizeof reports only the shallow size of the trainer object,
    # not the nodes it references.
    print(sys.getsizeof(trainer))
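    # For reference, the known analytic game value of Kuhn poker is -1/18
    # for player 1, so the expected value printed above should approach it
    # as n_iterations grows.
    print('theoretical player 1 value: {:.4f}'.format(-1 / 18))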