# popular-movies-dataframe.py
# Counts how many times each movie ID appears in the ratings data and prints
# the ten most frequent movies by name.
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions
def loadMovieNames(path="ml-100k/u.ITEM"):
    """Build a movie ID -> movie title lookup from a pipe-delimited file.

    Each line is split on '|'. The first field is parsed as the integer
    movie ID and the second field is stored as the title. Lines without a
    '|' separator (for example blank lines) are skipped instead of raising.

    Args:
        path: Location of the item file, read as latin-1. Defaults to the
            path the script has always used, so zero-argument calls behave
            as before.

    Returns:
        A dict mapping int movie ID to title string.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first field of a line is not an integer.
    """
    # NOTE(review): the default file name is spelled "u.ITEM"; on a
    # case-sensitive filesystem it must match the on-disk name exactly -
    # confirm against the dataset directory.
    movieNames = {}
    with open(path, encoding="latin-1") as f:
        for line in f:
            # Drop the line terminator first so it can never end up inside
            # the title when the line has exactly two fields.
            fields = line.rstrip("\n").split('|')
            if len(fields) < 2:
                # Blank line or no delimiter: nothing to record.
                continue
            # fields[0] is the movie ID, fields[1] is the movie name.
            movieNames[int(fields[0])] = fields[1]
    return movieNames
# Create a SparkSession
spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

try:
    # Load up our movie ID -> name dictionary
    nameDict = loadMovieNames()

    # Get the raw data
    lines = spark.sparkContext.textFile("file:///Users/cyprienhenry/Documents/SparkCourse/ml-100k/u.data")

    # Convert it to an RDD of Row objects. The movie ID is taken from the
    # second whitespace-separated field of each input line.
    movies = lines.map(lambda x: Row(movieID=int(x.split()[1])))

    # Convert that to a DataFrame
    movieDataset = spark.createDataFrame(movies)

    # Some SQL-style magic to sort all movies by popularity in one line!
    # The result is cached because it is consumed twice below (show + take).
    topMovieIDs = movieDataset.groupBy("movieID").count().orderBy("count", ascending=False).cache()

    # Show the results at this point:
    #|movieID|count|
    #+-------+-----+
    #|     50|  584|
    #|    258|  509|
    #|    100|  508|
    topMovieIDs.show()

    # Grab the top 10
    top10 = topMovieIDs.take(10)

    # Print the results
    print("\n")
    for movie_id, rating_count in top10:
        # Each row has movieID, count as above.
        print("%s: %d" % (nameDict[movie_id], rating_count))
finally:
    # Stop the session even if one of the steps above failed.
    spark.stop()