Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filterrating #28

Open
wants to merge 39 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b454474
Updated JSON files
nidhi-mylavarapu Mar 14, 2024
6a81f0a
Testing init json
nidhi-mylavarapu Mar 19, 2024
fa5ea0c
changed init json
nidhi-mylavarapu Mar 19, 2024
d1b2b97
changed init.json and search
nidhi-mylavarapu Mar 19, 2024
db11838
add a slider for ratings
mahithapen Mar 19, 2024
4215a2d
change front-end
mahithapen Mar 19, 2024
f1feeac
Add frontend components
dweizzz Mar 20, 2024
b7dcbee
Cosine Similarity + Querying
ramzd18 Mar 20, 2024
babcf51
added overviews, tried to adjust images, to do: width of suggestion c…
nidhi-mylavarapu Mar 21, 2024
426f881
fixed width, images still weird tho
nidhi-mylavarapu Mar 21, 2024
eea02c2
Requirements+backend change
ramzd18 Mar 26, 2024
9bb2d04
Merge branch 'master' of https://github.com/nidhi-mylavarapu/4300-Fla…
ramzd18 Mar 26, 2024
e48de92
use overview
ramzd18 Mar 26, 2024
9891cc4
change json list to add reviews
nathanpalamuttam Apr 14, 2024
11602d5
dataset changed
mahithapen Apr 15, 2024
7195e6e
nb, posters, data
mahithapen Apr 15, 2024
5910d58
updated html
mahithapen Apr 15, 2024
eaae91e
nb and posters
mahithapen Apr 15, 2024
a6cff4e
naive bayes on reviews
ramzd18 Apr 15, 2024
98f5d99
Review Querying enabled
ramzd18 Apr 15, 2024
168424e
poster update
mahithapen Apr 15, 2024
ab8104a
reviews displayed
mahithapen Apr 15, 2024
0f0b297
showing reviews / review query box
nidhi-mylavarapu Apr 15, 2024
d8df3d3
Merge pull request #1 from nidhi-mylavarapu/new-branch-name
nathanpalamuttam Apr 15, 2024
4e9895a
fix images + popularity
mahithapen Apr 16, 2024
2725f67
change review ordering
nathanpalamuttam Apr 16, 2024
061126e
Merge pull request #2 from nidhi-mylavarapu/new-branch-name
mahithapen Apr 16, 2024
5fbb578
remove rating filter
mahithapen Apr 21, 2024
f5df675
remove disliked genres
mahithapen Apr 21, 2024
b3f8549
vote average display
mahithapen Apr 21, 2024
5d4e211
vote average fixed
mahithapen Apr 21, 2024
91b631b
fixed average
mahithapen Apr 21, 2024
1db7466
loading icon, update css
mahithapen Apr 22, 2024
a1ac570
update styles
mahithapen Apr 22, 2024
bf6374c
Display sentiment scores on front end
dweizzz Apr 23, 2024
597ea45
add multiple genre queries and implement backend to sort based on genres
nathanpalamuttam Apr 23, 2024
a1a610a
add counts variable to return json
nathanpalamuttam Apr 23, 2024
b80e1db
filter by rating
mahithapen Apr 23, 2024
d20e494
modify slider
mahithapen Apr 23, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,6 @@ dist/
build/
*.egg-info/
helpers/*
json_template/
json_template/

python_env
34 changes: 34 additions & 0 deletions backend/add_reviews_method.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import json
import os
import pandas as pd


# One-off data-preparation script: copies the ten most "helpful" reviews from
# each per-movie CSV into the matching records of init.json.
directory = r'c:\Users\Nathan Palamuttam\Downloads\2_reviews_per_movie_raw'

# One (name without its last 9 characters, original file name) pair per CSV.
csv_names = []
for filename in os.listdir(directory):
    if not filename.endswith('.csv'):
        continue
    processed_name = filename[:-9]
    print(processed_name)
    csv_names.append((processed_name, filename))


with open('init.json', 'r') as file:
    data = json.load(file)

for item in data:
    for movie_name, csv_filename in csv_names:
        # A record matches a CSV when its title is a substring of the CSV name.
        if item['title'] is None or item['title'] not in movie_name:
            continue
        ranked = pd.read_csv(os.path.join(directory, csv_filename)).sort_values(
            by='helpful', ascending=False
        )
        reviews = ranked['review'].head(10).tolist()
        item['reviews'] = reviews
        print(item['title'])
        print(reviews)

# Write the enriched records back over the input file.
with open('init.json', 'w') as file:
    json.dump(data, file, indent = 4)



199 changes: 189 additions & 10 deletions backend/app.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
import json
import os
from flask import Flask, render_template, request
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from helpers.MySQLDatabaseHandler import MySQLDatabaseHandler
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
import numpy as np

# ROOT_PATH for linking with all your files.
# Feel free to use a config.py or settings.py with a global export variable.
# os.path.join("..", os.curdir) names the parent of the current working
# directory; abspath() turns it into an absolute path at import time.
os.environ['ROOT_PATH'] = os.path.abspath(os.path.join("..",os.curdir))

Expand All @@ -18,29 +29,197 @@
# Load the movie records from the JSON file named by json_file_path (defined
# earlier in this module) and expose them both as raw records (`data`) and as
# a DataFrame (`movies_df`).
#
# The legacy `episodes_df` / `reviews_df` frames are gone: `data` is iterated
# as a list of dicts elsewhere in this module, so `data['episodes']` and
# `data['reviews']` would raise at import time.
with open(json_file_path, 'r') as file:
    data = json.load(file)

movies_df = pd.DataFrame(data)

app = Flask(__name__)
CORS(app)

# Sample search using json with pandas
def json_search(query):
    """Return the movies whose title or original title contains ``query``.

    The comparison is a case-insensitive, literal substring test.

    Args:
        query: Text to look for. ``None`` is treated as an empty string,
            which matches every movie that has a title.

    Returns:
        A JSON string (``orient='records'``) holding the ``title``,
        ``overview``, ``vote_average``, ``reviews`` and ``image`` fields of
        every matching movie.
    """
    needle = (query or "").lower()

    def contains_needle(column):
        # regex=False: user text such as "(" or "*" must not be parsed as a
        # regular expression.
        # na=False: a missing title counts as "no match" instead of putting
        # NaN into the boolean mask, which pandas refuses to index with.
        return movies_df[column].str.lower().str.contains(
            needle, regex=False, na=False
        )

    matches = movies_df[contains_needle('title') | contains_needle('original_title')]
    matches_filtered = matches[['title', 'overview', 'vote_average', 'reviews', 'image']]
    return matches_filtered.to_json(orient='records')
def genre_search(genre):
    """Return the movies tagged with ``genre`` (case-insensitive).

    Args:
        genre: Name of the genre to look for.

    Returns:
        A JSON string (``orient='records'``) with the ``title``, ``overview``
        and ``vote_average`` of every movie carrying that genre.
    """
    wanted = genre.lower()

    def has_genre(raw_genres):
        # `genres` is parsed elsewhere in this module with ast.literal_eval
        # into a list of dicts carrying a 'name' key; iterating the raw string
        # would compare against single characters. Already-parsed values and
        # plain genre-name strings are accepted as well.
        entries = ast.literal_eval(raw_genres) if isinstance(raw_genres, str) else raw_genres
        names = (entry['name'] if isinstance(entry, dict) else entry for entry in entries)
        return any(wanted == name.lower() for name in names)

    matches = movies_df[movies_df['genres'].apply(has_genre)]
    matches_filtered = matches[['title', 'overview', 'vote_average']]
    return matches_filtered.to_json(orient='records')
def filter_movies_by_genre(genre):
    """Order every movie by how many of the requested genres it carries.

    No movie is dropped: movies matching none of the genres are simply placed
    last.

    Args:
        genre: Comma-separated genre names (for example ``"Action, Comedy"``).
            Names are compared case-insensitively and surrounding whitespace
            is ignored. ``None`` or an empty string requests no genre, so
            every movie gets a count of zero.

    Returns:
        A JSON string (``orient='records'``) with the ``title``, ``overview``,
        ``vote_average``, ``reviews`` and ``image`` of every movie, ordered by
        the number of matched genres (highest first). Movies with the same
        count keep their dataset order.
    """
    wanted = {
        name.strip().lower()
        for name in (genre or "").split(",")
        if name.strip()
    }

    def matched_genres(raw_genres):
        # Each `genres` value is the string form of a list of dicts with a
        # 'name' key.
        return sum(
            entry['name'].lower() in wanted
            for entry in ast.literal_eval(raw_genres)
        )

    counts = movies_df['genres'].apply(matched_genres)
    # A single stable sort gives a true descending order; the previous
    # insert(0)/append bookkeeping could leave a lower count ahead of a higher
    # one and cost O(n) per inserted row.
    ranked = movies_df.assign(counts=counts).sort_values(
        'counts', ascending=False, kind='mergesort'
    )
    df_selected = ranked[['title', 'overview', 'vote_average', 'reviews', 'image']]
    return df_selected.to_json(orient='records')

import math
def compute_similarities(overviews, query):
    """Rank ``overviews`` by TF-IDF similarity to ``query``.

    The overviews and the query are vectorised together (English stop words
    removed) and the query row is compared against every overview row with a
    linear kernel.

    Args:
        overviews: List of overview strings.
        query: Query string appended to the overviews before vectorising.

    Returns:
        Up to 50 ``(score, index)`` tuples in descending order, where
        ``index`` is the position of the overview in ``overviews``.
    """
    corpus = overviews + [query]
    tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(corpus)
    query_row, overview_rows = tfidf_matrix[-1], tfidf_matrix[:-1]
    scores = linear_kernel(query_row, overview_rows).flatten()
    # Tuples sort by score first, then by index, both descending.
    ranked = sorted(
        ((score, position) for position, score in enumerate(scores)),
        reverse=True,
    )
    return ranked[:50]

def tokenize(text):
    """Split ``text`` into lower-cased, whitespace-separated tokens.

    Args:
        text: String to tokenize, or ``None``.

    Returns:
        A list of tokens. ``None`` yields an empty list so the return type is
        always a list (it used to be an empty string in that case).
    """
    if text is None:
        return []
    return text.lower().split()

def build_vocabulary(descriptions):
    """Collect the distinct tokens found in ``descriptions``.

    Args:
        descriptions: Iterable of strings (or ``None`` entries), each passed
            through ``tokenize``.

    Returns:
        A list of the unique tokens in sorted order. Sorting makes the
        vocabulary, and therefore the position of each word in vectors built
        from it, identical from one run to the next; plain set iteration
        order is not.
    """
    vocab = set()
    for description in descriptions:
        vocab.update(tokenize(description))
    return sorted(vocab)

def vectorize(text, vocabulary):
    """Turn ``text`` into a term-count vector aligned with ``vocabulary``.

    Args:
        text: String to vectorise; tokenised with ``tokenize``.
        vocabulary: Sequence of words defining the vector positions.

    Returns:
        A list with one count per vocabulary entry, in vocabulary order.
        Tokens that are not in the vocabulary are ignored.
    """
    tallies = dict.fromkeys(vocabulary, 0)
    for token in tokenize(text):
        if token in tallies:
            tallies[token] += 1
    return [tallies[term] for term in vocabulary]

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two numeric vectors.

    Args:
        vec1: First sequence of numbers.
        vec2: Second sequence of numbers. The dot product pairs elements with
            ``zip``, so it stops at the shorter of the two sequences.

    Returns:
        The dot product divided by the product of the magnitudes, or ``0``
        when either vector has zero magnitude.
    """
    numerator = sum(a * b for a, b in zip(vec1, vec2))
    norm1 = math.sqrt(sum(a ** 2 for a in vec1))
    norm2 = math.sqrt(sum(b ** 2 for b in vec2))
    # A zero vector has no direction; report "no similarity" instead of
    # dividing by zero.
    if 0 in (norm1, norm2):
        return 0
    return numerator / (norm1 * norm2)
def genresuggests(genre):
    """Return the genre names that contain ``genre``, as a JSON response.

    Args:
        genre: Text to look for inside genre names (case-insensitive).
            ``None`` is treated as an empty string, which matches every genre.

    Returns:
        The ``jsonify`` response wrapping a list of the distinct matching
        genre names found across ``data``.
    """
    needle = (genre or "").lower()
    genres = set()
    for item in data:
        genre_list = json.loads(item['genres'].replace("'", "\""))
        # The loop variable must not reuse the name `genre`: doing so replaced
        # the search text with a dict and made the membership test below raise
        # TypeError.
        for entry in genre_list:
            if needle in entry['name'].lower():
                genres.add(entry['name'])

    return jsonify(list(genres))


# Train the sentiment classifier once, at import time.
# The training set is read from a tab-separated file resolved relative to the
# current working directory.
train_data = pd.read_csv('train.tsv.zip', sep='\t')
# For every SentenceId, join all of its Phrase values with spaces; transform()
# writes that joined text onto each row of the group.
train_data['full_sentence'] = train_data.groupby('SentenceId')['Phrase'].transform(lambda x: ' '.join(x))
# Keep one row per SentenceId (the first one encountered), so the Sentiment
# label used below is the one stored on that first row.
train_data = train_data.drop_duplicates('SentenceId').reset_index(drop=True)

# TF-IDF features (English stop words removed) fed into multinomial naive Bayes.
model = make_pipeline(TfidfVectorizer(stop_words='english'), MultinomialNB())
model.fit(train_data['full_sentence'], train_data['Sentiment'])

def classify_and_score_reviews(json_data):
    """Average the predicted sentiment of each movie's reviews.

    Args:
        json_data: JSON string holding a list of movie records, each with a
            ``title`` and a ``reviews`` field.

    Returns:
        A dict mapping each title to the mean of ``model.predict`` over its
        review texts, or to ``None`` when the movie has no usable reviews.
        When several records share a title, the last one wins.
    """
    scores = {}
    # The records are decoded with json.loads rather than pd.read_json:
    # read_json infers column dtypes (so titles may stop matching the keys a
    # json.loads-based caller looks up) and passing it a literal JSON string
    # is deprecated.
    for record in json.loads(json_data):
        reviews = record.get('reviews')
        # Only actual review texts can be classified; a missing value, an
        # empty list or null entries yield no score.
        texts = (
            [text for text in reviews if isinstance(text, str)]
            if isinstance(reviews, list)
            else []
        )
        if texts:
            scores[record['title']] = float(np.mean(model.predict(texts)))
        else:
            scores[record['title']] = None
    return scores


@app.route("/")
def home():
    """Serve the landing page rendered from the ``base.html`` template."""
    page = render_template('base.html', title="sample html")
    return page

def compute_cosine_similarities(texts):
    """Score every text but the last against the last one.

    All texts are TF-IDF vectorised together (English stop words removed);
    the final entry acts as the reference.

    Args:
        texts: Sequence of strings whose last element is the reference text.

    Returns:
        A flat array with one linear-kernel score per non-reference text, in
        input order.
    """
    tfidf = TfidfVectorizer(stop_words='english').fit_transform(texts)
    reference = tfidf[-1:]
    candidates = tfidf[:-1]
    return linear_kernel(reference, candidates).flatten()


@app.route("/episodes")
def episodes_search():
    """Rank movies for the requested genres, free-text query and review text.

    Query parameters:
        title: Comma-separated genre names, forwarded to
            ``filter_movies_by_genre``.
        query: Free text compared against movie overviews.
        review: Free text compared against movie reviews.
        min_popularity: Minimum ``vote_average`` a movie must have. Defaults
            to 0; a non-numeric value also falls back to 0.

    Returns:
        A JSON list of at most 20 movies (``title``, ``overview``,
        ``vote_average``, ``reviews``, ``image``, ``sentiment_score``): the 20
        most similar movies, ordered by sentiment score (highest first,
        movies without a score last) and then filtered by ``min_popularity``.
    """
    # Absent parameters arrive as None; normalise them to "" so "not given"
    # and "given but empty" are handled the same way.
    genres = request.args.get("title") or ""
    query = request.args.get("query") or ""
    review = request.args.get("review") or ""
    # type=float makes get() return the default when conversion fails.
    min_popularity = request.args.get("min_popularity", default=0.0, type=float)

    # The stale `return json_search(text)` that used to sit here made
    # everything below unreachable.
    pure_json = filter_movies_by_genre(genres)
    movies = json.loads(pure_json)
    if not movies:
        return jsonify([])
    sentiment_scores = classify_and_score_reviews(pure_json)

    # Build only the similarity vectors that are actually used.
    similarity_parts = []
    if query:
        overviews = [m['overview'] if m['overview'] is not None else "" for m in movies]
        similarity_parts.append(compute_cosine_similarities(overviews + [query]))
    if review:
        reviews = [str(m['reviews']) if m['reviews'] is not None else "" for m in movies]
        similarity_parts.append(compute_cosine_similarities(reviews + [review]))

    if similarity_parts:
        similarity = np.mean(similarity_parts, axis=0)
    else:
        # No text given: every movie scores zero, so the genre ordering from
        # filter_movies_by_genre decides the top 20.
        similarity = np.zeros(len(movies))

    top_matches = sorted(
        enumerate(similarity), key=lambda pair: pair[1], reverse=True
    )[:20]

    def sentiment_of(index):
        return sentiment_scores.get(movies[index]['title'])

    def sentiment_rank(pair):
        # Movies without a score sort last instead of raising on a
        # None-versus-float comparison.
        sentiment = sentiment_of(pair[0])
        return float("-inf") if sentiment is None else sentiment

    top_matches.sort(key=sentiment_rank, reverse=True)

    # NOTE(review): the rating filter runs after the top-20 cut, as before, so
    # fewer than 20 movies may be returned even when more would qualify.
    filtered_movies = [
        {
            "title": movies[index]['title'],
            "overview": movies[index]['overview'],
            "vote_average": movies[index]['vote_average'],
            "reviews": movies[index]['reviews'],
            "image": movies[index]['image'],
            "sentiment_score": sentiment_of(index),
        }
        for index, _ in top_matches
        # A missing rating is treated as 0.
        if float(movies[index]['vote_average'] or 0) >= min_popularity
    ]

    return jsonify(filtered_movies)

@app.route('/genre_suggestions')
def genre_suggestions():
    """Suggest genre names containing the ``query`` request parameter.

    The comparison is case-insensitive; a missing ``query`` is treated as an
    empty string and therefore matches every genre.

    Returns:
        A JSON list of the distinct matching genre names found across ``data``.
    """
    needle = request.args.get('query', '').lower()
    matching = {
        entry['name']
        for movie in data
        # Single quotes are swapped for double quotes so the stored string
        # can be decoded as JSON.
        for entry in json.loads(movie['genres'].replace("'", "\""))
        if needle in entry['name'].lower()
    }
    return jsonify(list(matching))

# Start Flask's built-in server only when DB_NAME is absent from the
# environment. app.run() blocks until the server stops, so the second,
# duplicated call that used to follow would only have started another server
# after the first one exited.
if 'DB_NAME' not in os.environ:
    # NOTE(review): debug=True together with host 0.0.0.0 exposes the
    # interactive debugger on every network interface; keep this for local
    # development only.
    app.run(debug=True,host="0.0.0.0",port=5000)

6 changes: 6 additions & 0 deletions backend/checkjson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import json

# Sanity check: print the popularity and image fields of every record stored
# in init.json.
with open('init.json', 'r') as file:
    data = json.load(file)

for item in data:
    popularity, image = item['popularity'], item['image']
    print(popularity, image)
Loading