-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuser-based-filtering-MAE.py
179 lines (146 loc) · 5.47 KB
/
user-based-filtering-MAE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import random
import numpy as np
import pdb
from itertools import combinations
from collections import defaultdict
from pyspark import SparkConf, SparkContext
# Run Spark locally (single-threaded master); the app name is what shows
# up in the Spark UI for this job.
conf = SparkConf().setMaster("local").setAppName("User Based Collaborative Filtering")
sc = SparkContext(conf=conf)
def loadMovieNames(path="ml-100k/u.item"):
    '''
    Load the MovieLens item file and build a movie id -> title mapping.

    path: pipe-delimited item file, one movie per line, with the integer
          movie id in field 0 and the title in field 1.  Defaults to the
          previously hard-coded location, so existing callers are unchanged.

    Returns a dict {int movie_id: str title}.
    '''
    movieNames = {}
    with open(path) as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames
def parseUserInfo(line):
    '''
    Parse one rating line, assuming a "::" delimiter
    (user_id::movie_id::rating).

    Fix: the original docstring claimed a "|" delimiter, but the code
    splits on "::" — the documentation now matches the behavior.

    Returns (user_id, (movie_id, rating)) with the rating as a float;
    user_id and movie_id stay strings.
    '''
    fields = line.split("::")
    return fields[0], (fields[1], float(fields[2]))
def findingUserPairs(movie_id, users_with_rating):
    '''
    Pair up users who rated the same movie.

    users_with_rating: [(user_id, rating), ...] for one movie.

    NOTE(review): the return sits inside the loop, so only the FIRST
    user-user combination is ever produced (not all of them), and lists
    with fewer than two raters fall through and return None.  Behavior
    preserved as-is; a full fix would need the caller to use flatMap.
    '''
    for (id_a, rating_a), (id_b, rating_b) in combinations(users_with_rating, 2):
        return (id_a, id_b), (rating_a, rating_b)
def cosineSim(user_pair, rating_pairs):
    '''
    Compute the cosine similarity between two users' co-rating vectors,
    along with how many movies they co-rated.

    Fix: np.float was deprecated in NumPy 1.20 and removed in 1.24 —
    plain float() is used instead.  The trivial cosine() helper is
    inlined so this block is self-contained.

    rating_pairs: iterable of (rating_by_user1, rating_by_user2).
    Returns (user_pair, (cosine_similarity, co_rating_count));
    similarity is 0.0 when either rating vector has zero norm
    (including the empty iterable).
    '''
    sum_xx, sum_yy, sum_xy = 0.0, 0.0, 0.0
    n_corated = 0
    for rating_pair in rating_pairs:
        x = float(rating_pair[0])
        y = float(rating_pair[1])
        sum_xx += x * x
        sum_yy += y * y
        sum_xy += x * y
        n_corated += 1
    # dot product over the product of the two vector norms
    denom = np.sqrt(sum_xx) * np.sqrt(sum_yy)
    cos_sim = (sum_xy / float(denom)) if denom else 0.0
    return user_pair, (cos_sim, n_corated)
def cosine(dot_product, rating1_norm_squared, rating2_norm_squared):
    '''
    Cosine between two rating vectors: the dot product divided by the
    product of the two norm arguments; returns 0.0 when that product
    is zero (avoids division by zero).
    '''
    denominator = rating1_norm_squared * rating2_norm_squared
    if not denominator:
        return 0.0
    return dot_product / float(denominator)
def nearNeigh(user, users_and_sims, n):
    '''
    Keep a user's n most similar neighbours.

    users_and_sims: [(neighbour_id, (similarity, co_rating_count)), ...]
    The list is sorted in place, highest similarity first (matching the
    original's side effect), and (user, top-n slice) is returned.
    '''
    by_similarity = lambda entry: entry[1][0]
    users_and_sims.sort(key=by_similarity, reverse=True)
    return user, users_and_sims[:n]
def topMovieRecommendations(user_id, user_sims, users_with_rating, n):
    '''
    Calculate the top-N movie recommendations for user_id with the
    weighted-sum approach over the user's nearest neighbours.

    user_sims:         [(neighbour_id, (similarity, co_rating_count)), ...]
    users_with_rating: mapping neighbour_id -> [(movie_id, rating), ...]

    Fix: the original accumulated the weighted scores under the
    NEIGHBOUR id (t[neigh] / sim_s[neigh]) instead of the movie id, so
    the "recommendations" it returned were user ids, not movies — which
    also made the downstream test-set matching by movie id come up
    empty.  Scores are now keyed by movie; the `neigh != movie` guard
    (a user-id vs movie-id comparison) and the unused ranked_items list
    are dropped.

    Returns (user_id, [(predicted_score, movie_id), ...]) with at most
    n entries, best score first.
    '''
    # a movie can be rated by several neighbours, so accumulate
    # weighted totals and similarity mass per movie
    totals = defaultdict(float)
    sim_sums = defaultdict(float)
    for (neigh, (sim, count)) in user_sims:
        # look up the movies this similar neighbour has rated
        neigh_ratings = users_with_rating.get(neigh, None)
        if neigh_ratings:
            for (movie, rating) in neigh_ratings:
                totals[movie] += sim * rating
                sim_sums[movie] += sim
    # normalize each movie's weighted sum by the similarity behind it
    scored_items = [(total / sim_sums[movie], movie)
                    for movie, total in totals.items()]
    # best predicted score first
    scored_items.sort(reverse=True)
    return user_id, scored_items[:n]
def fillMovieNames(movienames, name_dict=None):
    '''
    Translate movie ids into movie titles.

    Fix: the original loop only rebound its own loop variable
    (mid = mname), which has no effect, so the function was a no-op
    that returned its input unchanged.  It now returns the titles.

    movienames: iterable of integer movie ids.
    name_dict:  optional {movie_id: title} mapping; when omitted it is
                loaded from disk via loadMovieNames() as before.

    Returns the list of titles in the same order as movienames.
    Raises KeyError for an id missing from the mapping.
    '''
    if name_dict is None:
        name_dict = loadMovieNames()
    return [name_dict[mid] for mid in movienames]
def calculateMAE(pred):
    '''
    Mean absolute error over (predicted_rating, actual_rating) pairs.
    Both elements are coerced to float, so numeric strings are accepted.

    Fix: the original divided by the pair count unconditionally and so
    raised ZeroDivisionError on an empty prediction list; an empty list
    now yields 0.0.
    '''
    if not pred:
        return 0.0
    diff = 0.0
    n = 0.0
    for (pred_rating, actual_rating) in pred:
        diff += abs(float(pred_rating) - float(actual_rating))
        n += 1.0
    return diff / n
def keyOfFirstUser(user_pair, movie_sim_data):
    '''
    Re-key a user-pair similarity record on the first user's id:
    ((user1, user2), sim_data) -> (user1, (user2, sim_data)).
    '''
    first_id, second_id = user_pair
    return first_id, (second_id, movie_sim_data)
# ---------------------------------------------------------------------------
# Driver: build user-user similarities from the training ratings file.
# Training lines are "user_id::movie_id::rating".
# ---------------------------------------------------------------------------
lines = sc.textFile("file:///SparkCourse/FinalCode/imdbFileTrain.txt")
# (movie_id, [(user_id, rating), ...]) — all raters grouped per movie
movie_user_pairs = lines.map(lambda x: (x.split("::")[1], (x.split("::")[0], float(x.split("::")[2])))).groupByKey()
# de-duplicate identical (user, rating) entries within each movie
movie_user_pairs = movie_user_pairs.map(lambda x: (x[0], list(set(x[1]))))
# a user pair needs at least two raters, so drop singly-rated movies
paired_users = movie_user_pairs.filter(lambda p: len(p[1]) > 1)
# ((user1, user2), [(rating1, rating2), ...]) — co-ratings per user pair.
# NOTE(review): findingUserPairs returns only the FIRST combination per
# movie, so not every co-rating contributes — confirm this is intended.
paired_users = paired_users.map(
    lambda p: findingUserPairs(p[0], p[1])).groupByKey()
# ((user1, user2), (cosine_similarity, co_rating_count))
user_sim = paired_users.map(
    lambda p: cosineSim(p[0], p[1]))
# re-key on the first user: (user1, [(user2, (sim, count)), ...])
user_sim = user_sim.map(
    lambda p: keyOfFirstUser(p[0], p[1])).groupByKey()
# keep each user's 3 most similar neighbours
user_sim = user_sim.map(lambda x: (x[0], list(x[1]))).map(
    lambda p: nearNeigh(p[0], p[1], 3))
# full rating history per user, collected to the driver and broadcast
user_movie_history = lines.map(parseUserInfo).groupByKey().collect()
user_dict = {}
for (user, movie) in user_movie_history:
    user_dict[user] = movie
u = sc.broadcast(user_dict)
'''
Calculate the top-N item recommendations for each user
user_id -> [movie1,movie2,movie3,...]
'''
# (user_id, [(score, movie), ...]) — top 5 recommendations per user
user_movie_recs = user_sim.map(
    lambda p: topMovieRecommendations(p[0], p[1], u.value, 5)).collect()
# ---------------------------------------------------------------------------
# Evaluation: compare predicted scores against the held-out test ratings.
# NOTE(review): the training file was "::"-delimited but this parser splits
# on whitespace, and the filename casing ("imdbFIleTest.txt") looks like a
# typo — confirm both against the actual test data file.
# ---------------------------------------------------------------------------
test_ratings = defaultdict(list)
with open("imdbFIleTest.txt") as f:
    for line in f:
        fields = line.split()
        user = fields[0]
        movieids = fields[1]
        ratings = fields[2]
        test_ratings[user] += [(movieids, ratings)]
# create train-test rating tuples: for every recommended movie that also
# appears in the user's test set, pair the predicted score with the
# actual rating
preds = []
for (user, items_with_rating) in user_movie_recs:
    for (rating, item) in items_with_rating:
        for (test_item, test_rating) in test_ratings[user]:
            if str(test_item) == str(item):
                preds.append((rating, float(test_rating)))
# Fix: "print x" is Python-2-only statement syntax; print(x) is valid in
# both Python 2 and 3.
print(preds)
result = calculateMAE(preds)
print(result)