-
Notifications
You must be signed in to change notification settings - Fork 23
/
wmf.py
166 lines (116 loc) · 5.2 KB
/
wmf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import numpy as np
import time
import itertools
def linear_surplus_confidence_matrix(B, alpha):
# To construct the surplus confidence matrix, we need to operate only on the nonzero elements.
# This is not possible: S = alpha * B
S = B.copy()
S.data = alpha * S.data
return S
def log_surplus_confidence_matrix(B, alpha, epsilon):
# To construct the surplus confidence matrix, we need to operate only on the nonzero elements.
# This is not possible: S = alpha * np.log(1 + B / epsilon)
S = B.copy()
S.data = alpha * np.log(1 + S.data / epsilon)
return S
def iter_rows(S):
"""
Helper function to iterate quickly over the data and indices of the
rows of the S matrix. A naive implementation using indexing
on S is much, much slower.
"""
for i in xrange(S.shape[0]):
lo, hi = S.indptr[i], S.indptr[i + 1]
yield i, S.data[lo:hi], S.indices[lo:hi]
def recompute_factors(Y, S, lambda_reg, dtype='float32'):
"""
recompute matrix X from Y.
X = recompute_factors(Y, S, lambda_reg)
This can also be used for the reverse operation as follows:
Y = recompute_factors(X, ST, lambda_reg)
The comments are in terms of X being the users and Y being the items.
"""
m = S.shape[0] # m = number of users
f = Y.shape[1] # f = number of factors
YTY = np.dot(Y.T, Y) # precompute this
YTYpI = YTY + lambda_reg * np.eye(f)
X_new = np.zeros((m, f), dtype=dtype)
for k, s_u, i_u in iter_rows(S):
Y_u = Y[i_u] # exploit sparsity
A = np.dot(s_u + 1, Y_u)
YTSY = np.dot(Y_u.T, (Y_u * s_u.reshape(-1, 1)))
B = YTSY + YTYpI
# Binv = np.linalg.inv(B)
# X_new[k] = np.dot(A, Binv)
X_new[k] = np.linalg.solve(B.T, A.T).T # doesn't seem to make much of a difference in terms of speed, but w/e
return X_new
def recompute_factors_bias(Y, S, lambda_reg, dtype='float32'):
"""
Like recompute_factors, but the last column of X and Y is treated as
a bias vector.
"""
m = S.shape[0] # m = number of users
f = Y.shape[1] - 1 # f = number of factors
b_y = Y[:, f] # vector of biases
Y_e = Y.copy()
Y_e[:, f] = 1 # factors with added column of ones
YTY = np.dot(Y_e.T, Y_e) # precompute this
R = np.eye(f + 1) # regularization matrix
R[f, f] = 0 # don't regularize the biases!
R *= lambda_reg
YTYpR = YTY + R
byY = np.dot(b_y, Y_e) # precompute this as well
X_new = np.zeros((m, f + 1), dtype=dtype)
for k, s_u, i_u in iter_rows(S):
Y_u = Y_e[i_u] # exploit sparsity
b_y_u = b_y[i_u]
A = np.dot((1 - b_y_u) * s_u + 1, Y_u)
A -= byY
YTSY = np.dot(Y_u.T, (Y_u * s_u[:, None]))
B = YTSY + YTYpR
# Binv = np.linalg.inv(B)
# X_new[k] = np.dot(A, Binv)
X_new[k] = np.linalg.solve(B.T, A.T).T # doesn't seem to make much of a difference in terms of speed, but w/e
return X_new
def factorize(S, num_factors, lambda_reg=1e-5, num_iterations=20, init_std=0.01, verbose=False, dtype='float32', recompute_factors=recompute_factors, *args, **kwargs):
"""
factorize a given sparse matrix using the Weighted Matrix Factorization algorithm by
Hu, Koren and Volinsky.
S: 'surplus' confidence matrix, i.e. C - I where C is the matrix with confidence weights.
S is sparse while C is not (and the sparsity pattern of S is the same as that of
the preference matrix, so it doesn't need to be specified separately).
num_factors: the number of factors.
lambda_reg: the value of the regularization constant.
num_iterations: the number of iterations to run the algorithm for. Each iteration consists
of two steps, one to recompute U given V, and one to recompute V given U.
init_std: the standard deviation of the Gaussian with which V is initialized.
verbose: print a bunch of stuff during training, including timing information.
dtype: the dtype of the resulting factor matrices. Using single precision is recommended,
it speeds things up a bit.
recompute_factors: helper function that implements the inner loop.
returns:
U, V: factor matrices. If bias=True, the last columns of the matrices contain the biases.
"""
num_users, num_items = S.shape
if verbose:
print "precompute transpose"
start_time = time.time()
ST = S.T.tocsr()
if verbose:
print " took %.3f seconds" % (time.time() - start_time)
print "run ALS algorithm"
start_time = time.time()
U = None # no need to initialize U, it will be overwritten anyway
V = np.random.randn(num_items, num_factors).astype(dtype) * init_std
for i in xrange(num_iterations):
if verbose:
print " iteration %d" % i
print " recompute user factors U"
U = recompute_factors(V, S, lambda_reg, dtype, *args, **kwargs)
if verbose:
print " time since start: %.3f seconds" % (time.time() - start_time)
print " recompute item factors V"
V = recompute_factors(U, ST, lambda_reg, dtype, *args, **kwargs)
if verbose:
print " time since start: %.3f seconds" % (time.time() - start_time)
return U, V