Explore-AI · Toka008 · Jul 21, 2023 · Jul 21, 2023 · Jul 23, 2023 · Jul 25, 2023
diff --git a/edsa_recommender.py b/edsa_recommender.py
@@ -31,21 +31,54 @@
 # Data handling dependencies
 import pandas as pd
 import numpy as np
+import matplotlib.pyplot as plt
 
 # Custom Libraries
 from utils.data_loader import load_movie_titles
 from recommenders.collaborative_based import collab_model
 from recommenders.content_based import content_model
+from googleapiclient.discovery import build
 
 # Data Loading
 title_list = load_movie_titles('resources/data/movies.csv')
+movies_df = pd.read_csv('resources/data/rated_movies.csv')
+
+
+
+# Function to fetch the YouTube video ID of a trailer based on the movie title
+def get_youtube_trailer_id(movie_title, api_key, num_results=1):
+    youtube = build('youtube', 'v3', developerKey=api_key)
+    search_response = youtube.search().list(
+        q=f"{movie_title} official trailer",
+        part='id',
+        type='video',
+        maxResults=num_results
+    ).execute()
+
+    # Extract the video ID from the API response
+    video_id = search_response['items'][0]['id']['videoId'] if search_response['items'] else None
+    return video_id
+
+
+def extract_year_from_title(title):
+    year_start = title.find("(") + 1
+    year_end = title.find(")")
+    return title[year_start:year_end]
+
+
+# Extract year from the title and create a new "year" column
+movies_df['year'] = movies_df['title'].apply(extract_year_from_title)
+
+# Convert the "year" column to integer type
+movies_df['year'] = pd.to_numeric(movies_df['year'], errors='coerce')
+
 
 # App declaration
 def main():
 
     # DO NOT REMOVE the 'Recommender System' option below, however,
     # you are welcome to add more options to enrich your app.
-    page_options = ["Recommender System","Solution Overview"]
+    page_options = ["Recommender System","Solution Overview","Movie Search","Top Rated Movies","EDA", "About Us"]
 
     # -------------------------------------------------------------------
     # ----------- !! THIS CODE MUST NOT BE ALTERED !! -------------------
@@ -63,7 +96,7 @@ def main():
 
         # User-based preferences
         st.write('### Enter Your Three Favorite Movies')
-        movie_1 = st.selectbox('Fisrt Option',title_list[14930:15200])
+        movie_1 = st.selectbox('First Option',title_list[14930:15200])
         movie_2 = st.selectbox('Second Option',title_list[25055:25255])
         movie_3 = st.selectbox('Third Option',title_list[21100:21200])
         fav_movies = [movie_1,movie_2,movie_3]
@@ -98,15 +131,209 @@ def main():
 
 
     # -------------------------------------------------------------------
+    # Code for "Movie Search" page
+    if page_selection == "Movie Search":
+        st.title("Movie Search")
+
+        # Sidebar - Movie Search
+        genre = st.sidebar.text_input('Enter a Genre (e.g., Action, Drama, Comedy):')
+        title = st.sidebar.text_input('Enter a Movie Title:')
+            # Function to filter movies based on user criteria
+        def filter_movies(df, genre=None, title=None):
+            if genre:
+                df = df[df['genres'].str.contains(genre, case=False)]
+            if title:
+                df = df[df['title'].str.contains(title, case=False)]
+            df = df.sort_values(by='rating', ascending=False)
+            return df
+
+            # Filter the movies based on user criteria
+        filtered_movies = filter_movies(movies_df, genre=genre, title=title)
+
+            # Display the filtered movie results
+        st.table(filtered_movies[['title', 'genres', 'rating']])
+
+
+        # -------------------------------------------------------------------
+
+        # Code for "Top Rated Movies" page
+    if page_selection == "Top Rated Movies":
+    #     st.title('Top Rated Movies')
+    #     num_top_rated_movies = st.slider('Number of Top Rated Movies to Display:', 5, 20, 10)
+
+    #         # Function to get top-rated movies
+    #     def get_top_rated_movies(df, num_movies=10):
+    #         return df.nlargest(num_movies, 'rating')
+
+    #     top_rated_movies = get_top_rated_movies(movies_df, num_top_rated_movies)
+    #     st.table(top_rated_movies[['title', 'genres', 'rating']])
+
+        st.title('Top Rated Movies')
+        num_top_rated_movies = st.slider('Number of Top Rated Movies to Display:', 5, 20, 10)
+
+        # Function to get top-rated movies
+        def get_top_rated_movies(df, num_movies=10):
+            return df.nlargest(num_movies, 'rating')
+
+        top_rated_movies = get_top_rated_movies(movies_df, num_top_rated_movies)
+
+        # Fetch and display trailers for each top-rated movie
+        st.write("Trailers:")
+        api_key = 'AIzaSyAz-2bMsUmJ6DdJioEFAPZYNdoKjbEABEs'  # Replace with your YouTube API key
+        for _, row in top_rated_movies.iterrows():
+            movie_title = row['title']
+            trailer_id = get_youtube_trailer_id(movie_title, api_key)
+            if trailer_id:
+                st.write(f"**{movie_title}**: ")
+                st.video(f"https://www.youtube.com/watch?v={trailer_id}", format="mp4")
+
+
+
 
     # ------------- SAFE FOR ALTERING/EXTENSION -------------------
     if page_selection == "Solution Overview":
         st.title("Solution Overview")
-        st.write("Describe your winning approach on this page")
+        st.image('resources/imgs/header_image.jpg',use_column_width=True)
+
+        # Button to expand/collapse the "Movie Recommender App" subsection
+        if st.button("Movie Recommender App"):
+            st.write("""
+        **Solution Overview: Movie Recommender App**
+
+        Our Movie Recommender App is an intelligent system designed to help users discover their ideal movies by leveraging the power of collaborative-based and content-based filtering techniques. The primary goal of this app is to provide personalized movie recommendations based on user preferences and movie features.
+        """)
+
+        if st.button("Key Features"):
+            st.write("""
+        **Key Features:**
+
+        1. **User-Friendly Interface:** The app offers a simple and intuitive user interface. Users can easily navigate through different sections, including "Recommender System," "Movie Search," and "Top Rated Movies."
+
+        2. **Recommender System:** Our app presents two advanced recommendation algorithms: Collaborative-Based Filtering and Content-Based Filtering. Users can input their three favorite movies, and the system will generate a list of movie recommendations tailored to their unique tastes.
+
+        3. **Movie Search:** Users have the freedom to search for specific movies or explore movies by genres. The app efficiently filters movies based on user-provided genre criteria, allowing users to quickly discover movies that match their interests.
+
+        4. **Top Rated Movies:** Our app presents a list of top-rated movies based on user ratings or other metrics. Users can adjust the number of movies displayed to explore the best movies based on their preferences.
+        """)
+
+        if st.button("How It Works"):
+            st.write("""
+        **How It Works:**
+
+        1. **Collaborative-Based Filtering:** This approach builds user-item interactions to uncover patterns in user preferences. By analyzing how similar users have rated movies, the system identifies movies that align with a user's taste. The resulting recommendations are personalized and considerate of user behavior.
+
+        2. **Content-Based Filtering:** The content-based approach focuses on movie features such as genres and tags. By comparing movie attributes with user preferences, the app suggests movies that align with a user's previous movie choices.
+        """)
+
+        if st.button("Benefits"):
+            st.write("""
+        **Benefits:**
+
+    1. **Personalized Recommendations:** Our app provides personalized movie recommendations, ensuring that users receive tailored suggestions based on their individual interests.
+
+    2. **Exploration and Discovery:** Users can discover new movies outside their typical choices through the diverse recommendations generated by the app.
+
+    3. **Enhanced Movie Search:** The movie search feature enables users to find movies based on specific genres, empowering them to explore movies relevant to their mood or interests.
+
+        """)
+
+        st.write("""Our Movie Recommender App is committed to delivering an engaging and dynamic movie discovery experience for users. We continuously strive to improve our recommendation algorithms and user interface to ensure movie enthusiasts find their perfect watchlist with ease. Enjoy exploring the world of cinema with our smart and sophisticated movie recommender system!
+    """) 
+
+
 
     # You may want to add more sections here for aspects such as an EDA,
     # or to provide your business pitch.
 
+#--------------------------------------------------------------------------------------------------------------------------------
+#-----------------------------------------------------------------------------------------------------------------------------
+
+
+    # Add code for "About Us" page
+    if page_selection == "About Us":
+        st.title("About Us")
+
+        # Insert information about your team, project, or organization
+
+        st.image('resources/imgs/meet_our_team.jpg',use_column_width=True)
+        st.markdown("""
+        - Ayodele Marcus: Movie Analyst
+        - Toka Ramakau: Data Engineer
+        - Mmatlou Matlakala: Lead Data Scientist
+        - Jacinta Muindi: Machine Learning Engineer
+        - Oladimeji Akanni: Data Scientist
+        - Emmanuel Alabi: App Designer
+
+        Contact us at [[email protected]](mailto:[email protected]) for inquiries.
+        """)
+
+
+#--------------------------------------------------------------------------------------------------
+    elif page_selection == "EDA":
+        st.title("Exploratory Data Analysis (EDA)")
+
+        # Show basic statistics
+        st.header("Basic Statistics")
+        st.write("Total Number of Movies:", len(movies_df))
+        st.write("Overall Average Rating:", movies_df['rating'].mean())
+
+        # Button to show ratings distribution
+        if st.button("Show Ratings Distribution"):
+            st.header("Ratings Distribution")
+            ratings_counts = movies_df['rating'].value_counts().sort_index()
+            fig, ax = plt.subplots(figsize=(6.4, 2))
+            ax.bar(ratings_counts.index, ratings_counts.values)
+            ax.set_xlabel("Rating")
+            ax.set_ylabel("Number of Movies")
+            st.pyplot(fig)
+
+            # Button to show top rated movies
+        #if st.button("Show Top Rated Movies"):
+         #   st.header("Top Rated Movies")
+          #  top_rated_movies = movies_df.groupby('title')['rating'].mean().sort_values(ascending=False).head(10)
+           # st.table(top_rated_movies.reset_index().rename(columns={'rating': 'Average Rating'}))
+
+# Button to show genres distribution
+        if st.button("Show Genres Distribution"):
+            st.header("Genres Distribution")
+            genres_counts = movies_df['genres'].str.split('|', expand=True).stack().value_counts()
+            fig, ax = plt.subplots(figsize=(6.4, 2))
+            ax.bar(genres_counts.index, genres_counts.values)
+            ax.set_xticklabels(genres_counts.index, rotation=90)
+            ax.set_xlabel("Genre")
+            ax.set_ylabel("Number of Movies")
+            st.pyplot(fig)
+
+
+# Button to show most common genres
+        if st.button("Show Most Common Genres"):
+            st.header("Most Common Genres")
+            most_common_genres = movies_df['genres'].str.split('|', expand=True).stack().value_counts().head(10)
+            st.table(most_common_genres.reset_index().rename(columns={'index': 'Genre', 0: 'Count'}))
+
+        # Button to show movie count by year
+        #if st.button("Show Movie Count by Year"):
+         #   st.header("Movie Count by Year")
+          #  movie_count_by_year = movies_df['year'].value_counts().sort_index()
+           # fig, ax = plt.subplots()
+         #   ax.plot(movie_count_by_year.index, movie_count_by_year.values)
+          #  ax.set_xlabel("Year")
+           # ax.set_ylabel("Number of Movies")
+            #st.pyplot(fig)
+
+        # User rating stats
+        st.header("User Rating Statistics")
+        user_rating_stats = movies_df['rating'].describe()
+        st.table(user_rating_stats)
+
+
+        # You can add other EDA visualizations and analysis here
+
+    # -------------------------------------------------------------------
+
+
+
+
 
 if __name__ == '__main__':
     main()
diff --git a/recommenders/collaborative_based.py b/recommenders/collaborative_based.py
@@ -30,17 +30,19 @@
 # Script dependencies
 import pandas as pd
 import numpy as np
+import scipy as sp
 import pickle
 import copy
-from surprise import Reader, Dataset
-from surprise import SVD, NormalPredictor, BaselineOnly, KNNBasic, NMF
+from surprise import Reader, Dataset, SVD
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import CountVectorizer
 
 # Importing data
+#movies_df = pd.read_csv('/home/explore-student/unsupervised_data/unsupervised_movie_data/movies.csv',sep = ',',delimiter=',')
+#ratings_df = pd.read_csv('/home/explore-student/unsupervised_data/unsupervised_movie_data/train.csv')
 movies_df = pd.read_csv('resources/data/movies.csv',sep = ',')
 ratings_df = pd.read_csv('resources/data/ratings.csv')
-ratings_df.drop(['timestamp'], axis=1,inplace=True)
+ratings_df.drop(['timestamp'], axis=1, inplace=True)
 
 # We make use of an SVD model trained on a subset of the MovieLens 10k dataset.
 model=pickle.load(open('resources/models/SVD.pkl', 'rb'))
@@ -99,7 +101,7 @@ def pred_movies(movie_list):
     return id_store
 
 # !! DO NOT CHANGE THIS FUNCTION SIGNATURE !!
-# You are, however, encouraged to change its content.  
+# You are, however, encouraged to change its content.
 def collab_model(movie_list,top_n=10):
     """Performs Collaborative filtering based upon a list of movies supplied
        by the app user.
@@ -117,32 +119,60 @@ def collab_model(movie_list,top_n=10):
         Titles of the top-n movie recommendations to the user.
 
     """
-
-    indices = pd.Series(movies_df['title'])
-    movie_ids = pred_movies(movie_list)
-    df_init_users = ratings_df[ratings_df['userId']==movie_ids[0]]
-    for i in movie_ids :
-        df_init_users=df_init_users.append(ratings_df[ratings_df['userId']==i])
-    # Getting the cosine similarity matrix
-    cosine_sim = cosine_similarity(np.array(df_init_users), np.array(df_init_users))
+    names = movies_df.copy()
+    names.set_index('movieId',inplace=True)
+    indices = pd.Series(names['title'])
+    users_ids = pred_movies(movie_list)
+    # Get movie IDs and ratings for top users
+    df_init_users = ratings_df[ratings_df['userId']==users_ids[0]]
+    for i in users_ids[1:]:
+        df_init_users = df_init_users.append(ratings_df[ratings_df['userId']==i])
+    # Include predictions for chosen movies
+    for j in movie_list:
+        a = pd.DataFrame(prediction_item(j))
+        for i in set(df_init_users['userId']):
+            mid = indices[indices == j].index[0]
+            est = a['est'][a['uid']==i].values[0]
+            df_init_users = df_init_users.append(pd.Series([int(i),int(mid),est], index=['userId','movieId','rating']), ignore_index=True)
+    # Remove duplicate entries
+    df_init_users.drop_duplicates(inplace=True)
+    #Create pivot table
+    util_matrix = df_init_users.pivot_table(index=['userId'], columns=['movieId'], values='rating')
+    # Fill Nan values with 0's and save the utility matrix in scipy's sparse matrix format
+    util_matrix.fillna(0, inplace=True)
+    util_matrix_sparse = sp.sparse.csr_matrix(util_matrix.values)
+    # Compute the similarity matrix using the cosine similarity metric
+    user_similarity = cosine_similarity(util_matrix_sparse.T)
+    # Save the matrix as a dataframe to allow for easier indexing
+    user_sim_df = pd.DataFrame(user_similarity, index = util_matrix.columns, columns = util_matrix.columns)
+    user_similarity = cosine_similarity(np.array(df_init_users), np.array(df_init_users))
+    user_sim_df = pd.DataFrame(user_similarity, index = df_init_users['movieId'].values.astype(int), columns = df_init_users['movieId'].values.astype(int))
+    # Remove duplicate rows from matrix
+    user_sim_df = user_sim_df.loc[~user_sim_df.index.duplicated(keep='first')]
+    # Transpose matrix
+    user_sim_df = user_sim_df.T
+    # Find IDs of chosen load_movie_titles
     idx_1 = indices[indices == movie_list[0]].index[0]
     idx_2 = indices[indices == movie_list[1]].index[0]
     idx_3 = indices[indices == movie_list[2]].index[0]
     # Creating a Series with the similarity scores in descending order
-    rank_1 = cosine_sim[idx_1]
-    rank_2 = cosine_sim[idx_2]
-    rank_3 = cosine_sim[idx_3]
+    rank_1 = user_sim_df[idx_1]
+    rank_2 = user_sim_df[idx_2]
+    rank_3 = user_sim_df[idx_3]
     # Calculating the scores
     score_series_1 = pd.Series(rank_1).sort_values(ascending = False)
     score_series_2 = pd.Series(rank_2).sort_values(ascending = False)
     score_series_3 = pd.Series(rank_3).sort_values(ascending = False)
-     # Appending the names of movies
-    listings = score_series_1.append(score_series_1).append(score_series_3).sort_values(ascending = False)
-    recommended_movies = []
+    # Appending the names of movies
+    listings = score_series_1.append(score_series_2).append(score_series_3).sort_values(ascending = False)
     # Choose top 50
     top_50_indexes = list(listings.iloc[1:50].index)
     # Removing chosen movies
     top_indexes = np.setdiff1d(top_50_indexes,[idx_1,idx_2,idx_3])
+    # Get titles of recommended movies
+    recommended_movies = []
     for i in top_indexes[:top_n]:
-        recommended_movies.append(list(movies_df['title'])[i])
+        recommended_movies.append(list(movies_df[movies_df['movieId']==i]['title']))
+    # Return list of movies
+    recommended_movies = [val for sublist in recommended_movies for val in sublist]
     return recommended_movies