-
Notifications
You must be signed in to change notification settings - Fork 0
/
agent.asm
272 lines (233 loc) · 7.32 KB
/
agent.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#include p18f87k22.inc
#include constants.inc
global agent_learn
extern player_gridhex, third_check_up, third_check_down, third_check_left
extern third_check_right, handle_D_button, gamestate
extern find_max, q1_H, q2_H, q3_H, q4_H, q1_L, q2_L, q3_L, q4_L
extern current_max_H, q_max_H, q_max_L, reward_L, reward_H, display_score
; Working storage for the Q-learning agent: one byte each, reserved in a
; udata_acs section.
acs0 udata_acs
fsr_start res 1 ; Q-table byte offset (player_gridhex * 4, then +1 per action) used as the PLUSW2 index in store_q_values
state1_gridhex res 1 ; copy of player_gridhex taken in get_action before the move is made
state1_max_i res 1 ; copy of current_max_H after the first find_max; added to the table offset in update_q_table
state1_max_H res 1 ; high byte: q_max_H for state 1, replaced by the updated Q value at the end of q_learn
state1_max_L res 1 ; low byte: q_max_L for state 1, replaced by the updated Q value at the end of q_learn
state1_max_H_tmp res 1 ; q_learn scratch: high byte of the negated state-1 Q value
state1_max_L_tmp res 1 ; q_learn scratch: low byte of the negated state-1 Q value
state2_max_H res 1 ; high byte: q_max_H after the move; q_learn also uses it as its 16-bit accumulator
state2_max_L res 1 ; low byte: q_max_L after the move; q_learn also uses it as its 16-bit accumulator
tmp_move res 1 ; use_max_q scratch: current_max_H minus the direction code being tested
counter res 1 ; q_learn shift-loop counter
is_neg res 1 ; cleared at q_learn entry, set to 1 by convert_from_twos_comp
q_agent code
;-----------------------------------------------------------------------
; agent_learn (global)
; Returns immediately when gamestate is zero. Otherwise repeats
; "display_score + move" until player_gridhex equals the target value
; selected by gamestate:
;     gamestate 0x77 (level 1) -> target 0x28
;     gamestate 0x78 (level 2) -> target 0x26
;     gamestate 0x79 (level 3) -> target 0x0C
; Any other non-zero gamestate restarts from the top without a target
; comparison.
; NOTE(review): the targets are presumably each level's goal cell --
; confirm against the map data.
;-----------------------------------------------------------------------
agent_learn
        tstfsz  gamestate               ; gamestate == 0 -> nothing to do
        bra     al_step
        return

al_step
        call    display_score
        call    move                    ; one act / learn / write-back cycle

        movlw   0x77                    ; level 1?
        cpfseq  gamestate
        bra     al_level2
        movlw   0x28                    ; level 1 target
        bra     al_target_test

al_level2
        movlw   0x78                    ; level 2?
        cpfseq  gamestate
        bra     al_level3
        movlw   0x26                    ; level 2 target
        bra     al_target_test

al_level3
        movlw   0x79                    ; level 3?
        cpfseq  gamestate
        bra     agent_learn             ; unknown state: start over
        movlw   0x0C                    ; level 3 target

al_target_test
        cpfseq  player_gridhex          ; W = target for the current level
        bra     agent_learn             ; not there yet: keep learning
        return
;-----------------------------------------------------------------------
; move
; One agent step: pick and perform an action (get_action), compute the
; new Q value (q_learn), write it into the table (update_q_table), then
; restore FSR0/FSR1 via reset_game_fsr.
;-----------------------------------------------------------------------
move
        call    get_action
        call    q_learn
        call    update_q_table
        goto    reset_game_fsr          ; tail call: its return goes back to our caller
;-----------------------------------------------------------------------
; get_action
; 1. Loads the four Q values of the current cell and runs find_max.
; 2. Snapshots the result as "state 1": cell, best index, best Q value.
; 3. Resets the game pointers and performs the move chosen by use_max_q.
; 4. Loads the Q values again (player_gridhex is re-read) and stores
;    the new maximum as "state 2".
;-----------------------------------------------------------------------
get_action
        call    store_q_values          ; q1..q4 <- table row for player_gridhex
        call    find_max

        ; snapshot state 1
        movff   current_max_H, state1_max_i     ; index of the chosen action
        movff   q_max_L, state1_max_L           ; its Q value, low byte
        movff   q_max_H, state1_max_H           ; its Q value, high byte
        movff   player_gridhex, state1_gridhex  ; cell the action is taken from

        call    reset_game_fsr
        call    use_max_q               ; perform the move selected by current_max_H

        ; evaluate state 2
        call    store_q_values
        call    find_max
        movff   q_max_L, state2_max_L
        movff   q_max_H, state2_max_H
        return
;-----------------------------------------------------------------------
; sq_fetch dest
; Copies the byte at FSR2 + fsr_start into dest. The offset is applied
; through PLUSW2, which interprets W as a signed 8-bit value.
;-----------------------------------------------------------------------
sq_fetch macro dest
        movf    fsr_start, W
        movff   PLUSW2, dest
        endm

;-----------------------------------------------------------------------
; store_q_values
; Copies the four 16-bit Q values of the row selected by player_gridhex
; into q1..q4. Low bytes are read relative to 0x480, high bytes relative
; to 0x580; the row offset is the low byte of player_gridhex * 4.
; Leaves BSR = 5, FSR2 = 0x580, fsr_start = W = row offset + 3.
;-----------------------------------------------------------------------
store_q_values
        ; ---- low bytes ----
        movlb   4
        lfsr    FSR2, 0x480
        movlw   0x04
        mulwf   player_gridhex          ; PRODL = row offset (4 entries per cell)
        movff   PRODL, fsr_start
        sq_fetch q1_L
        incf    fsr_start, F
        sq_fetch q2_L
        incf    fsr_start, F
        sq_fetch q3_L
        incf    fsr_start, F
        sq_fetch q4_L

        ; ---- high bytes ----
        movlb   5
        lfsr    FSR2, 0x580
        movlw   0x04
        mulwf   player_gridhex
        movff   PRODL, fsr_start
        sq_fetch q1_H
        incf    fsr_start, F
        sq_fetch q2_H
        incf    fsr_start, F
        sq_fetch q3_H
        incf    fsr_start, F
        sq_fetch q4_H
        return
;-----------------------------------------------------------------------
; use_max_q
; Dispatches on current_max_H:
;     0 -> third_check_up      1 -> third_check_down
;     2 -> third_check_left    3 -> third_check_right
; Any other value returns without calling anything.
; tmp_move receives (current_max_H - code) for the last code tested, so
; it is zero whenever one of the third_check_* routines is entered.
;-----------------------------------------------------------------------
use_max_q
        movlw   0x00
        subwf   current_max_H, W        ; Z set when index == 0
        movwf   tmp_move                ; movwf leaves STATUS untouched
        bnz     umq_try_down
        call    third_check_up
        return

umq_try_down
        movlw   0x01
        subwf   current_max_H, W
        movwf   tmp_move
        bnz     umq_try_left
        call    third_check_down
        return

umq_try_left
        movlw   0x02
        subwf   current_max_H, W
        movwf   tmp_move
        bnz     umq_try_right
        call    third_check_left
        return

umq_try_right
        movlw   0x03
        subwf   current_max_H, W
        movwf   tmp_move
        bnz     umq_exit                ; no direction matched
        call    third_check_right
umq_exit
        return
;-----------------------------------------------------------------------
; q_learn
; 16-bit two's-complement Q update for the action just taken:
;
;     Q1 <- Q1 + (reward + MAX(Q(state2)) - Q1) / 2
;
; The state-2 maximum is added unscaled (discount_rate = 1). The
; difference is shifted right exactly once, so the effective
; learning_rate is 1/2. The shift is done on the magnitude and the sign
; is re-applied afterwards, i.e. the halving rounds toward zero.
;
; In : state1_max_H:state1_max_L   Q value of the chosen action
;      state2_max_H:state2_max_L   maximum Q value of the new state
;      reward_H:reward_L           reward
; Out: state1_max_H:state1_max_L = updated Q value
;      (state2_max_H:state2_max_L ends up holding the same value)
; Scratch: state1_max_H_tmp, state1_max_L_tmp, counter, is_neg, W, STATUS
;-----------------------------------------------------------------------
q_learn
        clrf    is_neg
        movff   state1_max_L, state1_max_L_tmp
        movff   state1_max_H, state1_max_H_tmp

        ; *** A = reward + MAX(qvalue_state2) *** ;
        movf    reward_L, W
        addwf   state2_max_L, F
        movf    reward_H, W
        addwfc  state2_max_H, F         ; high byte plus carry from the low byte

        ; *** tmp = -qvalue_state1_action (subtract 1, then complement) *** ;
        movlw   0x01
        subwf   state1_max_L_tmp, F
        movlw   0x00
        subwfb  state1_max_H_tmp, F     ; propagate the borrow
        comf    state1_max_L_tmp, F
        comf    state1_max_H_tmp, F

        ; *** B = A + tmp = A - qvalue_state1_action *** ;
        movf    state1_max_L_tmp, W
        addwf   state2_max_L, F
        movf    state1_max_H_tmp, W
        addwfc  state2_max_H, F

        ; *** C = B / 2, computed on |B| *** ;
        clrf    counter
        movlw   0x80
        cpfslt  state2_max_H            ; high byte >= 0x80 means B is negative
        call    convert_from_twos_comp  ; B <- |B|, is_neg <- 1
learning_rate_mul
        incf    counter, F
        bcf     STATUS, C               ; RRCF rotates through Carry, so Carry
                                        ; (not Z) must be 0 to shift in a zero
        rrcf    state2_max_H, F
        rrcf    state2_max_L, F         ; bit 0 of the high byte enters via Carry
        movlw   0x02
        cpfslt  counter                 ; counter is 1 here (< 2), so the branch
        bra     learning_rate_mul       ; is skipped: exactly one shift is made
        ; NOTE(review): if more than one shift was intended, this exit test
        ; needs rework -- as written the loop body runs once.
        movlw   0x01
        cpfseq  is_neg
        bra     final_q_learn
        call    convert_to_twos_comp    ; restore the sign of C

        ; *** D = C + qvalue_state1_action *** ;
final_q_learn
        movf    state1_max_L, W
        addwf   state2_max_L, F
        movf    state1_max_H, W
        addwfc  state2_max_H, F
        movff   state2_max_L, state1_max_L      ; publish the updated Q value
        movff   state2_max_H, state1_max_H
        return
;-----------------------------------------------------------------------
; update_q_table
; Writes state1_max_H:state1_max_L into the table entry for
; (state1_gridhex, state1_max_i). The entry offset is the low byte of
; state1_gridhex * 4 plus state1_max_i, applied through PLUSW2 relative
; to 0x480 (low byte) and 0x580 (high byte).
; Leaves BSR = 5, FSR2 = 0x580, W = entry offset.
;-----------------------------------------------------------------------
update_q_table
        movlb   4
        movlw   0x04
        mulwf   state1_gridhex          ; PRODL = row offset (4 entries per cell)
        movf    state1_max_i, W
        addwf   PRODL, W                ; W = row offset + action index

        lfsr    FSR2, 0x480
        movff   state1_max_L, PLUSW2    ; low byte; movff leaves W intact

        movlb   5
        lfsr    FSR2, 0x580
        movff   state1_max_H, PLUSW2    ; high byte, same offset still in W
        return
;-----------------------------------------------------------------------
; reset_game_fsr
; Points FSR1 at 0x680 (game grid values) and FSR0 at 0x880 (mapmatrix
; level), and leaves bank 8 selected in BSR.
;-----------------------------------------------------------------------
reset_game_fsr
        lfsr    FSR1, 0x680             ; game grid values
        lfsr    FSR0, 0x880             ; mapmatrix level
        movlb   8                       ; lfsr loads a full address, so only
                                        ; the final bank selection is kept
        return
;-----------------------------------------------------------------------
; convert_from_twos_comp
; Sets is_neg to 1 and negates the 16-bit value in
; state2_max_H:state2_max_L by subtracting 1 and then complementing
; both bytes. Returns with W = 0.
;-----------------------------------------------------------------------
convert_from_twos_comp
        movlw   0x01
        movwf   is_neg                  ; remember that the value was negative
        subwf   state2_max_L, F         ; low byte - 1 (W is still 1)
        movlw   0x00
        subwfb  state2_max_H, F         ; propagate the borrow
        comf    state2_max_L, F
        comf    state2_max_H, F
        return
;-----------------------------------------------------------------------
; convert_to_twos_comp
; Negates the 16-bit value in state2_max_H:state2_max_L by complementing
; both bytes and then adding 1. Returns with W = 0.
;-----------------------------------------------------------------------
convert_to_twos_comp
        comf    state2_max_L, F
        comf    state2_max_H, F
        incf    state2_max_L, F         ; +1; Carry is set if the low byte wraps
        movlw   0x00
        addwfc  state2_max_H, F         ; fold that carry into the high byte
        return
end