f_trust_Qlearn_counter_hybrid_regret_purist.m
function [fx] = f_trust_Qlearn_counter_hybrid_regret_purist(x,theta,u,inF)
% Evolution function of the value of sharing for an RL agent playing a
% repeated trust game (adapted from the two-armed-bandit evolution function
% f_Qlearn2 of the VBA toolbox).
% IN:
%   - x     : hidden states (2x1): x(1) = value of sharing, x(2) = previous
%             prediction error (tracked for output only, not used in the update)
%   - theta : evolution parameters: theta(1) = (inverse-sigmoid) learning rate;
%             theta(2) (and theta(3)) = variant-specific parameters (regret
%             weight, choice asymmetry, reputation/humanity/valence sensitivity)
%   - u     : u(1) = previous action (1 = share, 0 = keep); u(2) = trustee
%             feedback (1 = shared, 0 = kept); u(3:7) = additional regressors
%             used by the optional model variants selected via inF
%   - inF   : model-variant flags (regret, assymetry_choices,
%             reputation_sensitive, humanity, valence_n, valence_p)
% OUT:
%   - fx    : evolved hidden states (2x1). The analytic gradients dfdx/dfdP are
%             not returned by this function (see the commented derivation at the
%             bottom of the file).
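%
% Example (illustrative sketch; the values below are chosen only for
% demonstration): a single-trial update with all optional model variants off.
%   x     = [0; 0];
%   theta = [0; 0];                        % sigmoid(0) = 0.5 learning rate
%   u     = [1; 1];                        % subject shared, trustee shared
%   inF   = struct('regret',0,'assymetry_choices',0,'reputation_sensitive',0,...
%                  'humanity',0,'valence_n',0,'valence_p',0);
%   fx    = f_trust_Qlearn_counter_hybrid_regret_purist(x,theta,u,inF);
%   % -> r = 1.5, alpha = 0.5, pe = 1.5, fx = [0.75; 1.5]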
% theta(1) -- basic learning rate
% theta(2) -- punishment sensitivity or "reality of rewards" parameter
% When the trustee shares, the subject's payoff is $1.5; for simplicity this
% could also be coded as r = 1.
% Rewards are coded in two stages: first the actual reward of the experienced
% outcome, then the counterfactual reward weighted by omega.
%% actual rewards (required by the regret model below)
if (u(2)==1 && u(1)==1)      % trustee shared, subject shared
    actual_reward = 1.5;
elseif (u(2)==0 && u(1)==1)  % trustee kept, subject shared
    actual_reward = 0;
elseif (u(2)==0 && u(1)<1)   % trustee kept, subject kept
    actual_reward = -1;
else                         % trustee shared, subject kept
    actual_reward = -1;
end

%% counterfactual rewards
if (u(2)==1 && u(1)==1)      % trustee shared, subject shared
    counter_reward = 0;
elseif (u(2)==0 && u(1)==1)  % trustee kept, subject shared
    counter_reward = -1;
elseif (u(2)==0 && u(1)<1)   % trustee kept, subject kept
    counter_reward = -1;
else                         % trustee shared, subject kept
    counter_reward = 0.5;
end
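
%% 'purist' reward coding of the experienced outcome: reciprocated sharing
%  earns 1.5, betrayed sharing costs 1.5, and keeping yields 0.5 regardless
%  of the trustee's choice.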
if (u(2)==1 && u(1)==1)      % trustee shared, subject shared
    r = 1.5; % BUT why don't they consider their alternative action as a reference?
elseif (u(2)<1 && u(1)==1)   % trustee kept, subject shared
    r = -1.5;
elseif (u(2)<1 && u(1)<1)    % trustee kept, subject kept
    r = 0.5;
else                         % trustee shared, subject kept
    r = 0.5;
end
%% regret weighting (apply the counterfactual reward on all trials OR only on incongruent trials?)
% omega1: a parameter that modulates the "regret" wrt the "share" action
if inF.regret == 1
%omega = 1./(1+exp(-theta(2))); %bounded between 0 and 1.
omega1 = theta(2);
% omega2 = theta(3);
if (u(2)<1 && u(1) ==1) %trustee kept, subject shared
r = actual_reward + counter_reward * (1+omega1);
elseif (u(2)==1 && u(1) <1) %trustee shared, subject kept
r = actual_reward + counter_reward * (1+omega1);
else
r = actual_reward + counter_reward;
end
else
    % no regret weighting: keep the 'purist' reward r coded above
    % r = actual_reward + counter_reward;
end
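% Note: on incongruent trials (one player shared while the other kept) the
% counterfactual component is scaled by (1+omega1), so omega1 > 0 amplifies
% the counterfactual contribution relative to congruent trials; omega1 = 0
% reduces to the plain actual + counterfactual sum.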
if inF.assymetry_choices==1
% alpha=1./(1+exp(-theta(1).*u(1)+theta(2).*(u(1)-1)));
alpha=1./(1+exp(-theta(1)+theta(2).*(u(1)-1)));
else
alpha = 1./(1+exp(-theta(1))); % learning rate is bounded between 0 and 1.
end
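% Note: with inF.assymetry_choices == 1, the expression above gives
% alpha = sigmoid(theta(1)) after the subject shared (u(1) == 1) and
% alpha = sigmoid(theta(1)+theta(2)) after the subject kept (u(1) == 0),
% i.e. separate learning rates depending on the subject's previous choice.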
pe = r-x(1); % prediction error
fx = zeros(length(x),1);
%% introduce reputation sensitivity: this assumes that reputation sensitivity is
%% an additive effect wrt the initial value state
if inF.reputation_sensitive==1
theta(2) = sig(theta(2));
fx(1) = x(1)+alpha*pe + theta(2).*u(3).*u(4);
elseif inF.humanity==1
theta(2) = sig(theta(2));
fx(1) = x(1)+alpha*pe + theta(2).*u(5).*u(4);
elseif inF.valence_n==1 && inF.valence_p==1
% fx(1) = x(1)+alpha*pe +theta(2).*u(6).*u(4)+theta(3)*u(7).*u(4);
theta(2) = sig(theta(2));
theta(3) = sig(theta(3));
fx(1) = x(1)+alpha*pe -theta(2).*u(7).*u(4)+theta(3).*u(6).*u(4);
elseif inF.valence_n==1
% fx(1) = x(1)+alpha*pe +theta(2).*u(6).*u(4);
theta(2) = sig(theta(2));
fx(1) = x(1)+alpha*pe -theta(2).*u(7).*u(4);
elseif inF.valence_p==1
% fx(1) = x(1)+alpha*pe +theta(2).*u(7).*u(4);
theta(2) = sig(theta(2));
fx(1) = x(1)+alpha*pe + theta(2).*u(6).*u(4);
else
fx(1) = x(1) + alpha*pe;
end
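% Note: in the optional variants above, theta(2) (and theta(3)) are first passed
% through sig(), presumably a logistic sigmoid bounding them to (0,1) (cf. the
% commented omega transform in the regret section), before scaling the additive
% modulation by the regressors in u.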
%tracking PEs
fx(2) = pe;
%% one hidden state (value)
% dfdx = zeros(size(x,1),1);
% dfdx(1) = [1-alpha];
% dfdP = [alpha*(1-alpha)*pe];
%% two hidden states (value + pe)
% gradients' derivation
% if u(1)==1
% dfdx = [1-alpha, 0;
% 0, 1];
% dfdP = [alpha*(1-alpha)*pe(1),0];
% else
% dfdx = [1, 0;
% 0, 1-alpha];
% dfdP = [0,alpha*(1-alpha)*pe(2)];
% end
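
%% Example usage (illustrative sketch, not part of the original model code):
% this evolution function follows the VBA-toolbox f_fname convention noted in
% the header, so it could be inverted with a call such as the one below. The
% observation function handle g_fname, the dimensions and the inF flags are
% placeholders to be replaced by the ones actually used in this repository.
%   dim = struct('n',2,'n_theta',2,'n_phi',1);
%   options.inF = struct('regret',0,'assymetry_choices',0,...
%       'reputation_sensitive',0,'humanity',0,'valence_n',0,'valence_p',0);
%   [posterior,out] = VBA_NLStateSpaceModel(y,u,...
%       @f_trust_Qlearn_counter_hybrid_regret_purist,g_fname,dim,options);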