simpleBattleshipAI.py
# coding: utf-8
# A simplified version of the game Battleship, used to train an agent with Reinforcement Learning.
#
# The board consists of a single row of n squares. Only one ship is allowed, and it occupies m < n squares.
#
# The expectation is for the agent to learn to
# a) fire at intervals of m squares (because the ship spans m squares, this samples the space fastest), and
# b) fire at squares adjacent to the previous hit.
#
# We believe a policy like this will minimize the number of shots required to sink the ship.
#
# Once this is achieved, we will move on to more and larger ships, and a bigger board.
# Let's first develop the game engine:
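#
# For example, with a 10-square board and a 2-square ship, probing squares
# 1, 3, 5, 7, 9 is guaranteed to hit the ship within 5 shots (any two adjacent
# squares include one of them), and a follow-up shot at a neighbour of the hit
# then sinks it. (This worked example only illustrates the policy described
# above; nothing in the code below checks it.)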
import random
import numpy as np
from enum import Enum
import tensorflow as tf
import matplotlib.pyplot as plt
class SHOT_RESULT(Enum):
    INVALID = 0
    HIT = 1
    SINK = 2
    MISS = 3


class GAME_STATUS(Enum):
    IN_PROGRESS = 0
    COMPLETE = 1
RewardFun = {SHOT_RESULT.MISS: 0,
             SHOT_RESULT.HIT: 1,
             SHOT_RESULT.SINK: 2,
             SHOT_RESULT.INVALID: -2}
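# Note on the rewards above: a miss costs nothing, a hit is worth 1, sinking the
# ship is worth 2, and only an invalid shot (a repeated square or an off-board
# square) is penalised -- and, as implemented in fireShot below, an invalid shot
# also ends the episode.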
def genShipCoords(shipSize, minCoord, maxCoord):
    # Place a ship of shipSize contiguous squares; valid squares are minCoord .. maxCoord-1.
    # The ship grows to the right unless that would fall off the board, in which case it grows left.
    coordinates = [0] * shipSize
    coordinates[0] = random.randint(minCoord, maxCoord - 1)
    for i in range(1, shipSize):
        coordinates[i] = coordinates[i - 1] + 1 if coordinates[i - 1] < maxCoord - 1 else coordinates[i - 1] - 1
    return coordinates
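# A quick illustrative use of genShipCoords (kept commented out so it does not
# change the run below; the arguments are just an example):
# print(genShipCoords(2, 0, 10))  # two adjacent squares somewhere in 0..9, e.g. [4, 5]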
class SimpleBattleship(object):
    # Only a one-row board is implemented for now
    def __init__(self, shipCoords, numBoardRows, numBoardCols):
        self.shipCoords = shipCoords
        self.numBoardRows = numBoardRows
        self.numBoardCols = numBoardCols
        self.shotsTakenSofar = []
        # The board always has one extra row to show
        # where the crosshair is currently pointing
        self.board = np.zeros(shape=[self.numBoardRows + 1, self.numBoardCols])
        self.crossHairCoord = 0  # crosshair always starts at the leftmost position
        self.updateCrossHairCoord(self.crossHairCoord)

    def updateCrossHairCoord(self, coord, val=1):
        # Clear the crosshair row
        self.board[self.numBoardRows, range(self.numBoardCols)] = 0
        # Mark the new crosshair location
        self.board[self.numBoardRows, coord] = val
    def fireShot(self, shotCoordinate):
        # Cannot take the same shot twice
        if shotCoordinate in self.shotsTakenSofar:
            self.updateCrossHairCoord(shotCoordinate)
            return self.getOneDimState(), RewardFun[SHOT_RESULT.INVALID], GAME_STATUS.COMPLETE
        else:
            self.shotsTakenSofar.append(shotCoordinate)
        # Cannot play outside the board
        if not 0 <= shotCoordinate <= self.numBoardCols - 1:
            # Invalid: show the crosshair as off the board (crosshair row all zero)
            self.updateCrossHairCoord(range(0, self.numBoardCols), 0)
            return self.getOneDimState(), RewardFun[SHOT_RESULT.INVALID], GAME_STATUS.COMPLETE
        elif shotCoordinate in self.shipCoords:
            # Hit
            self.updateState(shotCoordinate, 1)
            # The crosshair moves to where we fired
            self.updateCrossHairCoord(shotCoordinate)
            # Did we sink the ship?
            sunk = True
            for coord in self.shipCoords:
                if coord not in self.shotsTakenSofar:
                    sunk = False
            if sunk:
                return self.getOneDimState(), RewardFun[SHOT_RESULT.SINK], GAME_STATUS.COMPLETE
            else:
                return self.getOneDimState(), RewardFun[SHOT_RESULT.HIT], GAME_STATUS.IN_PROGRESS
        else:
            # Miss
            self.updateState(shotCoordinate, -1)
            # The crosshair moves to where we fired
            self.updateCrossHairCoord(shotCoordinate)
            return self.getOneDimState(), RewardFun[SHOT_RESULT.MISS], GAME_STATUS.IN_PROGRESS
    def getState(self):
        return self.board

    def getOneDimState(self):
        # Flatten the board into a 1 x (2 * numBoardCols) row vector for the network
        return self.board.reshape((1, self.board.size))

    def updateState(self, coord, val):
        # Mark a square on the shot row: 1 = hit, -1 = miss
        self.board[0, coord] = val
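# Illustrative use of the engine (a sketch, kept commented out so it does not
# interfere with the training run below; the ship location matches the one
# hard-coded further down):
# g = SimpleBattleship([5, 6], 1, 10)
# state, reward, status = g.fireShot(3)   # miss -> reward 0, GAME_STATUS.IN_PROGRESS
# state, reward, status = g.fireShot(5)   # hit  -> reward 1, GAME_STATUS.IN_PROGRESS
# state, reward, status = g.fireShot(6)   # sink -> reward 2, GAME_STATUS.COMPLETE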
# Set-up
NUM_EPISODES = 3000
BOARD_SIZE = (1, 10)  # (rows, cols)
SHIP_SIZE = 2
# ship = genShipCoords(SHIP_SIZE, BOARD_SIZE[0], BOARD_SIZE[1])
ship = [5, 6]
print("SHIP LOCATION:{}".format(ship))
actionSet = (-1, 1)  # LEFT = -1, RIGHT = +1
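# For reference, the (unflattened) board for this set-up is 2 x 10: row 0 records
# shots (1 = hit, -1 = miss) and row 1 marks the crosshair, which starts at square 0.
# The ship itself is never written onto this board, so the agent only ever sees
# its own shot history and the crosshair position.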
# Neural Network for Q-table (aka Q-network)
tf.reset_default_graph()
#These lines establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1,BOARD_SIZE[1]*2],dtype=tf.float32)
W = tf.Variable(tf.random_uniform([BOARD_SIZE[1]*2,len(actionSet)],0,0.01))
Qout = tf.matmul(inputs1,W)
predict = tf.argmax(Qout,1)
#Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[1,len(actionSet)],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)
init = tf.global_variables_initializer()
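# Shape check (for reference): the flattened state fed to inputs1 is 1 x 20
# (2 rows x 10 cols), W is 20 x 2, so Qout is 1 x 2 -- one Q-value per action
# (LEFT, RIGHT) -- and predict is the index of the larger one. Note that this
# uses the TensorFlow 1.x graph API (placeholders and sessions), so it will not
# run unmodified on TensorFlow 2.x without tf.compat.v1.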
# Set learning parameters
y = 0.9  # discount factor (gamma)
e = 0.1  # chance of taking a random action (epsilon-greedy exploration)
# Create lists to record the steps taken and total reward per episode
jList = []
rList = []
with tf.Session() as sess:
    sess.run(init)
    for i in range(NUM_EPISODES):
        # Reset the environment and get the first new observation
        game = SimpleBattleship(ship, BOARD_SIZE[0], BOARD_SIZE[1])
        s = game.getOneDimState()
        crossHair = 0
        rAll = 0
        termEpisode = False
        j = 0
        # The Q-Network
        print("New Episode #" + str(i))
        print(game.getState())
        while j < 20:  # should never need more than 10 shots (the number of legal moves on a 10-square board)
            j += 1
            # Choose an action greedily (with probability e of a random action) from the Q-network
            actionIdx, allQ, Wnn = sess.run([predict, Qout, W], feed_dict={inputs1: s})
            if np.random.rand(1) < e:
                actionIdx = [random.randint(0, len(actionSet) - 1)]
            # print(actionIdx)
            # Get the new state and reward from the environment
            crossHair += actionSet[actionIdx[0]]
            s1, r, gameStatus = game.fireShot(crossHair)
            print(str(j) + "\n")
            print(game.getState())
            print("actionIdx[0]:" + str(actionSet[actionIdx[0]]))
            if gameStatus == GAME_STATUS.COMPLETE:
                termEpisode = True
            # Obtain the Q' values by feeding the new state through the network
            Q1 = sess.run(Qout, feed_dict={inputs1: s1})
            # Obtain maxQ' and set the target value for the chosen action
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0, actionIdx[0]] = r + y * maxQ1
            # Train the network using the target and predicted Q values
            _, Wnn1 = sess.run([updateModel, W], feed_dict={inputs1: s, nextQ: targetQ})
            Wdiff = Wnn - Wnn1
            rAll += r
            s = s1
            if termEpisode:
                # Reduce the chance of a random action as the model trains
                e -= e * 0.01
                break
        jList.append(j)
        rList.append(rAll)
plt.plot(rList)
plt.show()
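# Optionally, the shots-per-episode curve recorded in jList can be plotted the
# same way to see whether the agent sinks the ship in fewer moves as training
# progresses:
# plt.plot(jList)
# plt.ylabel("shots per episode")
# plt.show()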