-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_generator.py
41 lines (30 loc) · 1.39 KB
/
data_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os
import numpy
import pandas
# Directory containing the ratings CSV that get_ratings_data() reads.
# NOTE: this is a machine-specific absolute path.
DATA_PATH = '/Users/sai/dev/datasets/netflix/'

# Largest user ID and largest movie ID. These are maximum ID values, not the
# number of distinct users/movies.
NUM_USERS = 2649429
NUM_MOVIES = 17770

# Batch size constant; not referenced in the visible part of this file.
BATCH_SIZE = 25000
def get_ratings_data(ratings_file=None, random_state=None):
    """Load the ratings CSV and return shuffled, zero-based training arrays.

    The first three columns of the CSV are read; the file must have a header
    row naming them ``userId``, ``movieId`` and ``rating``. All rows are
    shuffled and returned (no train/test split is performed here).

    Args:
        ratings_file: Path of the CSV to read. When ``None`` (the default),
            ``combined_ratings.csv`` inside ``DATA_PATH`` is used.
        random_state: Seed (or random state object) forwarded to
            ``DataFrame.sample`` so the shuffle can be reproduced. ``None``
            (the default) gives a different order on every call.

    Returns:
        A tuple ``(train_users, train_movies, train_ratings)`` of equally
        long arrays. User and movie IDs are shifted down by one so that they
        start at 0; ratings are returned unchanged.
    """
    if ratings_file is None:
        ratings_file = os.path.join(DATA_PATH, 'combined_ratings.csv')

    data = pandas.read_csv(ratings_file, sep=',', usecols=[0, 1, 2])

    # Shuffle row-wise; reset_index drops the old ordering so positional
    # access follows the shuffled order.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Offset by 1 so that the IDs start at 0.
    train_users = data['userId'].values - 1
    train_movies = data['movieId'].values - 1
    train_ratings = data['rating'].values
    return train_users, train_movies, train_ratings
def transform_ratings_into_classes(ratings, num_classes=5):
    """One-hot encode 1-based ratings.

    Args:
        ratings: One-dimensional array-like of ratings in
            ``1..num_classes``. Values are shifted down by one and cast to
            ``int32`` to obtain the class index, so fractional ratings are
            truncated toward zero after the shift.
        num_classes: Number of rating classes (columns of the result).
            Defaults to 5.

    Returns:
        A float array of shape ``(len(ratings), num_classes)`` whose row
        ``i`` is all zeros except for a 1 in column ``ratings[i] - 1``.

    Raises:
        ValueError: If any rating is below 1. Without this check the
            negative index would silently wrap around to the last classes.
        IndexError: If any rating is above ``num_classes`` (raised by numpy).
    """
    ratings = numpy.asarray(ratings)
    num_rows = ratings.shape[0]
    class_idx = (ratings - 1).astype('int32')

    # `.any()` is safe on empty input, unlike `.min()`.
    if (class_idx < 0).any():
        raise ValueError("ratings must be >= 1 to map onto class indices")

    one_hot = numpy.zeros((num_rows, num_classes))
    one_hot[numpy.arange(num_rows), class_idx] = 1
    return one_hot
if __name__ == "__main__":
    # Script entry point: load the shuffled ratings arrays, then drop into an
    # interactive IPython shell so they can be inspected by hand.
    train_users, train_movies, train_ratings = get_ratings_data()

    # Imported here so IPython is only needed when run as a script.
    from IPython import embed
    embed()