From 89c14cc5f784f830f33ff5696b4fd66ca36b48d1 Mon Sep 17 00:00:00 2001
From: steve
Date: Thu, 1 Jun 2023 11:48:44 -0400
Subject: [PATCH] Add scraping files for written audience reviews found on
 Rotten Tomatoes, add GPT feature files, and modify the Rotten Tomatoes files
 to include foreign keys

---
 Scrapping/audienceReviewsMovie.py |  97 +++++++++++++++++++
 Scrapping/audienceReviewsTv.py    | 100 +++++++++++++++++++
 Scrapping/gptMovie.py             | 153 ++++++++++++++++++++++++++++++
 Scrapping/gptTv.py                | 131 +++++++++++++++++++++++++
 Scrapping/rtMovieReviews.py       |  13 ++-
 Scrapping/rtTvReviews.py          |  13 ++-
 6 files changed, 503 insertions(+), 4 deletions(-)
 create mode 100644 Scrapping/audienceReviewsMovie.py
 create mode 100644 Scrapping/audienceReviewsTv.py
 create mode 100644 Scrapping/gptMovie.py
 create mode 100644 Scrapping/gptTv.py

diff --git a/Scrapping/audienceReviewsMovie.py b/Scrapping/audienceReviewsMovie.py
new file mode 100644
index 0000000..9a93912
--- /dev/null
+++ b/Scrapping/audienceReviewsMovie.py
@@ -0,0 +1,97 @@
+import numpy as np
+import pandas as pd
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support.ui import WebDriverWait
+from sqlalchemy import create_engine
+import time
+from selenium.webdriver.common.keys import Keys
+import urllib.request as request
+import openai
+import re
+from string import punctuation
+from selenium.webdriver.support import expected_conditions as EC
+
+def rt_url(title):
+    new = title.translate(str.maketrans('', '', punctuation))
+    l = re.sub(r'\s','_',new).lower()
+    link = 'https://www.rottentomatoes.com/m/'+ l
+    return link
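+# Illustrative example (hypothetical title): rt_url("The Mother") returns
+# "https://www.rottentomatoes.com/m/the_mother".
+#
+# scrape(movie): opens the movie's Rotten Tomatoes page (falling back to a Google
+# search for "<movie> rotten tomatoes" if the direct URL fails), collects audience
+# reviews of at least 80 words, and keeps clicking the "next" button until more
+# than 15 qualifying reviews are gathered or no further pages exist.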
+def scrape(movie):
+    list = []
+    rotten = rt_url(movie)
+    print(rotten)
+    driver.get(rotten)
+    time.sleep(5)
+    try:
+        driver.find_element(By.XPATH,'//*[@id="scoreboard"]/a[2]').click()
+        time.sleep(4)
+        driver.find_element(By.XPATH,'//*[@id="reviews"]/nav/ul/li[3]').click()
+    except:
+        driver.get("http://www.google.com")
+        google = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "q")))
+        google.send_keys(movie + " rotten tomatoes")
+        time.sleep(5)
+        google.send_keys(Keys.ENTER)
+        result = driver.find_element(By.CSS_SELECTOR,'.LC20lb.MBeuO.DKV0Md')
+        result.click()
+        driver.find_element(By.XPATH,'//*[@id="scoreboard"]/a[2]').click()
+        time.sleep(3)
+    time.sleep(3)
+    reviews = driver.find_elements(By.CSS_SELECTOR,'.audience-reviews__review.js-review-text')
+    rev = reviews
+    for r in rev:
+        res = len(re.findall(r'\w+', r.text))
+        if res >= 80:
+            list.append(r.text)
+    time.sleep(8)
+    try:
+        next = driver.find_element(By.XPATH,'//*[@id="reviews"]/div[1]/rt-button[2]')
+        next.click()
+    except:
+        print(len(list))
+        #df[show] = pd.Series(list)
+        time.sleep(5)
+        return list
+    time.sleep(8)
+    while len(list) <= 15:
+
+        reviews = driver.find_elements(By.CSS_SELECTOR,'.audience-reviews__review.js-review-text')
+        rev = reviews
+        for r in rev:
+            res = len(re.findall(r'\w+', r.text))
+            if res >= 80:
+                list.append(r.text)
+        time.sleep(7)
+        try:
+            next = driver.find_element(By.XPATH,'//*[@id="reviews"]/div[1]/rt-button[2]')
+            next.click()
+            time.sleep(8)
+        except:
+            break
+    print(len(list))
+    #df[show] = pd.Series(list)
+    time.sleep(5)
+    return list
+
+
+
+path = "chromedriver.exe"
+driver = webdriver.Chrome(path)
+engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
+test = pd.read_sql_query('select Movie from netflixTopMovie10',engine)
+Movie = test['Movie'].to_list()
+
+df = pd.DataFrame()
+for m in Movie:
+    try:
+        df[m] = pd.Series(scrape(m))
+
+    except:
+        driver.quit()
+        time.sleep(3)
+        driver = webdriver.Chrome(path)  # restart the browser before retrying
+        df[m] = pd.Series(scrape(m))
+        continue
+print(df)
+df.to_sql('audienceReviewsMovie', con=engine, if_exists='replace')
\ No newline at end of file
diff --git a/Scrapping/audienceReviewsTv.py b/Scrapping/audienceReviewsTv.py
new file mode 100644
index 0000000..c92020c
--- /dev/null
+++ b/Scrapping/audienceReviewsTv.py
@@ -0,0 +1,100 @@
+import numpy as np
+import pandas as pd
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from sqlalchemy import create_engine
+import time
+from selenium.webdriver.common.keys import Keys
+import urllib.request as request
+import openai
+import re
+from string import punctuation
+
+def rt_url(title):
+    name = r'.+(?=:)'
+    na = re.search(name,title)
+    new = na.group(0).translate(str.maketrans('', '', punctuation))
+    l = re.sub(r'\s','_',new).lower()
+
+
+    num = r'\d+$'
+    n = re.search(num,title)
+    if n is None:
+        return 'https://www.rottentomatoes.com/tv/'+ l
+
+
+    nu = n.group(0)
+    link ='https://www.rottentomatoes.com/tv/'+ l +'/s0' + nu
+    return link
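+# rt_url assumes the title is formatted like "Name: Season N": the text before the
+# colon becomes the URL slug and a trailing number becomes the season suffix.
+# Illustrative example (hypothetical title): rt_url("The Night Agent: Season 1")
+# returns "https://www.rottentomatoes.com/tv/the_night_agent/s01".
+#
+# scrape(show): same flow as the movie version, but falls back to the first entry in
+# the seasons list when the scoreboard link is missing, and keeps paging until more
+# than 40 qualifying reviews (at least 80 words each) are collected.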
"content": prompt}] + response = openai.ChatCompletion.create( + model=model, + messages=messages, + temperature=0, # this is the degree of randomness of the model's output + ) + return response.choices[0].message["content"] +#cleaning output +def similar_movies(s): + list = s.split() + + if 'Similar' in list[:2]: + p = r'.+(?<=include)' + n = re.sub(p,'',s) + p = r'(?<=,\s)and' + new = re.sub(p,'',n) + p = r'(.+?)(?:,|$)' + cleaned = re.findall(p,new) + return cleaned + elif 'Other' in list[:2]: + p = r'.+(?<=like)' + n = re.sub(p,'',s) + p = r'(?<=,\s)and' + new = re.sub(p,'',n) + p = r'(.+?)(?:,|$)' + cleaned = re.findall(p,new) + return cleaned + + elif bool(re.search(',',s)): + p = r'(?<=,\s)and' + new = re.sub(p,'',s) + p = r'(.+?)(?:,|$)' + cleaned = re.findall(p,new) + return cleaned + else: + cleaned = [s] + return cleaned + +def rating(s): + p = r'\d(\.\d)?(?=\/)' + new = re.search(p,s) + return new.group(0) + +openai.api_key = "sk-" +engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix") +df = pd.read_sql_query('select * from audienceReviewsMovie',engine) +columns = df.columns.to_list()[1:] +df2 = pd.DataFrame(columns= ["Movie","userOpinion","similarMovie","typeOfViewer","rating"]) +for movie in columns: + count = len(df[movie].value_counts()) > 0 + if not count: + df3 = {'Movie':movie,'userOpinion':None,'similarMovies':None, 'typeOfViewer':None,'rating':None} + df2 = df2.append(df3,ignore_index = True) + print(df2) + continue + + + + # query = 'select {} from audienceReviewsTv'.format(show) + # Queen Charlotte: A Bridgerton Story: Series + # df1 = pd.read_sql_query('select ',engine) + reviews = df[movie].dropna().to_list() + text1= " || ".join(reviews[:15]) + #print(text1) + print(movie) + print() +# # print(text) + +# text2 = f""" +# this show is so good cant believe people are hating on it + +# """ + + prompt = f""" + + # I will be giving you some audience reviews of a movie named {movie}. + # Each audience review is separated by ||. + # An example of the format would be audience review || audience review.....|| audience review. + + # Your task is to perform the following actions: + # 1 - generalize what the users are saying about the movie + # 2 - provide names of shows related to this movie + # 3 - tell me who would like this show based on audience reviews + # 4 - after generalizing users reviews, rate this movie out of 10.Based this number off of the user reviews. \ + # Higher numbers means good. Lower numbers means bad. 
+for movie in columns:
+    has_reviews = len(df[movie].value_counts()) > 0
+    if not has_reviews:
+        df3 = {'Movie':movie,'userOpinion':None,'similarMovies':None, 'typeOfViewer':None,'rating':None}
+        df2 = df2.append(df3,ignore_index = True)
+        print(df2)
+        continue
+
+
+
+    # query = 'select {} from audienceReviewsTv'.format(show)
+    # Queen Charlotte: A Bridgerton Story: Series
+    # df1 = pd.read_sql_query('select ',engine)
+    reviews = df[movie].dropna().to_list()
+    text1= " || ".join(reviews[:15])
+    #print(text1)
+    print(movie)
+    print()
+# # print(text)
+
+# text2 = f"""
+# this show is so good cant believe people are hating on it
+
+# """
+
+    prompt = f"""
+
+    # I will be giving you some audience reviews of a movie named {movie}.
+    # Each audience review is separated by ||.
+    # An example of the format would be audience review || audience review.....|| audience review.
+
+    # Your task is to perform the following actions:
+    # 1 - generalize what the users are saying about the movie
+    # 2 - provide names of movies related to this movie
+    # 3 - tell me who would like this movie based on audience reviews
+    # 4 - after generalizing users' reviews, rate this movie out of 10. Base this number off of the user reviews. \
+    # Higher numbers mean good. Lower numbers mean bad. \
+    #
+
+
+
+    # Use the following format:
+    # user opinion:
+    # similar movies:
+    # type of viewer:
+    # rating:
+    # Text: <{text1}>
+
+    # """
+
+    try:
+        response = get_completion(prompt)
+        # print(response)
+
+        p = r'(?<=:).+'
+
+        output = re.findall(p,response)
+        #print(output)
+        print(similar_movies(output[3]))
+        df3 = {'Movie':movie,'userOpinion':output[0],'similarMovies':str(similar_movies(output[1])), 'typeOfViewer':output[2],'rating':rating(output[3])}
+        df2 = df2.append(df3,ignore_index = True)
+        print(df2)
+        time.sleep(50)
+    except:
+        time.sleep(50)
+        response = get_completion(prompt)
+        # print(response)
+
+        p = r'(?<=:).+'
+
+        output = re.findall(p,response)
+        #print(output)
+        print(similar_movies(output[3]))
+        df3 = {'Movie':movie,'userOpinion':output[0],'similarMovies':str(similar_movies(output[1])), 'typeOfViewer':output[2],'rating':rating(output[3])}
+        df2 = df2.append(df3,ignore_index = True)
+        print(df2)
+        time.sleep(50)
+
+# Read the 'netflixTopMovie10' table into a DataFrame
+df_netflix = pd.read_sql('netflixTopMovie10', engine)
+# Create a mapping between movie names and ids
+movie_id_mapping = dict(zip(df_netflix['Movie'], df_netflix['id']))
+# Map the movies in the 'gptMovie' DataFrame to their ids
+df2['fk_id'] = df2['Movie'].map(movie_id_mapping)
+df2['fk_id'] = df2['fk_id'].astype('int')
+# Start the rank from 1 & Rename the index col:
+df2.index = df2.index + 1
+df2.index.names = ['rank']
+# insert to database
+df2.to_sql('gptMovie', con=engine, if_exists='replace')
\ No newline at end of file
diff --git a/Scrapping/gptTv.py b/Scrapping/gptTv.py
new file mode 100644
index 0000000..9f450e2
--- /dev/null
+++ b/Scrapping/gptTv.py
@@ -0,0 +1,131 @@
+import numpy as np
+import pandas as pd
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from sqlalchemy import create_engine
+import time
+from selenium.webdriver.common.keys import Keys
+import urllib.request as request
+import openai
+import re
+from string import punctuation
+
+
+def get_completion(prompt, model="gpt-3.5-turbo"):
+    messages = [{"role": "user", "content": prompt}]
+    response = openai.ChatCompletion.create(
+        model=model,
+        messages=messages,
+        temperature=0, # this is the degree of randomness of the model's output
+    )
+    return response.choices[0].message["content"]
+def similar_shows(s):
+    list = s.split()
+
+    if 'Similar' in list[:2]:
+        p = r'.+(?<=include)'
+        n = re.sub(p,'',s)
+        p = r'(?<=,\s)and'
+        new = re.sub(p,'',n)
+        p = r'(.+?)(?:,|$)'
+        cleaned = re.findall(p,new)
+        return cleaned
+    elif 'Other' in list[:2]:
+        p = r'.+(?<=like)'
+        n = re.sub(p,'',s)
+        p = r'(?<=,\s)and'
+        new = re.sub(p,'',n)
+        p = r'(.+?)(?:,|$)'
+        cleaned = re.findall(p,new)
+        return cleaned
+
+    elif bool(re.search(',',s)):
+        p = r'(?<=,\s)and'
+        new = re.sub(p,'',s)
+        p = r'(.+?)(?:,|$)'
+        cleaned = re.findall(p,new)
+        return cleaned
+    else:
+        cleaned = [s]
+        return cleaned
+
+def rating(s):
+    p = r'\d(\.\d)?(?=\/)'
+    new = re.search(p,s)
+    return new.group(0)
+
+openai.api_key = "sk-"
+engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
+df = pd.read_sql_query('select * from audienceReviewsTv',engine)
+columns = df.columns.to_list()[1:]
+df2 = pd.DataFrame(columns= ["TV","userOpinion","similarShows","typeOfViewer","rating"])
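+# Same per-title flow as gptMovie.py, applied to each show column in
+# audienceReviewsTv. Note there is no guard here for columns with zero reviews and
+# no retry around the OpenAI call, so a failed request will stop the script.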
+for show in columns:
+
+
+
+    # query = 'select {} from audienceReviewsTv'.format(show)
+    # Queen Charlotte: A Bridgerton Story: Series
+    # df1 = pd.read_sql_query('select ',engine)
+    reviews = df[show].dropna().to_list()
+    text1= " || ".join(reviews[:15])
+    #print(text1)
+    print(show)
+    print()
+# # print(text)
+
+# text2 = f"""
+# this show is so good cant believe people are hating on it
+
+# """
+
+    prompt = f"""
+
+    # I will be giving you some audience reviews of a show named {show}.
+    # Each audience review is separated by ||.
+    # An example of the format would be audience review || audience review.....|| audience review.
+
+    # Your task is to perform the following actions:
+    # 1 - generalize what the users are saying about the show
+    # 2 - provide names of shows related to this show
+    # 3 - tell me who would like this show based on audience reviews
+    # 4 - after generalizing users' reviews, rate this show out of 10. Base this number off of the user reviews. \
+    # Higher numbers mean good. Lower numbers mean bad. \
+    #
+
+
+
+    # Use the following format:
+    # user opinion:
+    # similar shows:
+    # type of viewer:
+    # rating:
+    # Text: <{text1}>
+
+    # """
+
+    response = get_completion(prompt)
+    # print(response)
+
+    p = r'(?<=:).+'
+
+    output = re.findall(p,response)
+    #print(output)
+    print(similar_shows(output[3]))
+    df3 = {'TV':show,'userOpinion':output[0],'similarShows':str(similar_shows(output[1])), 'typeOfViewer':output[2],'rating':rating(output[3])}
+    df2 = df2.append(df3,ignore_index = True)
+    print(df2)
+    time.sleep(50)
+
+# Read the 'netflixTopTv10' table into a DataFrame
+df_netflix = pd.read_sql('netflixTopTv10', engine)
+# Create a mapping between tv names and ids
+tv_id_mapping = dict(zip(df_netflix['TV'], df_netflix['id']))
+# Map the TV titles in the 'gptTv' DataFrame to their ids
+df2['fk_id'] = df2['TV'].map(tv_id_mapping)
+df2['fk_id'] = df2['fk_id'].astype('int')
+# Start the rank from 1 & Rename the index col:
+df2.index = df2.index + 1
+df2.index.names = ['rank']
+# insert to database
+df2.to_sql('gptTV', con=engine, if_exists='replace')
\ No newline at end of file
diff --git a/Scrapping/rtMovieReviews.py b/Scrapping/rtMovieReviews.py
index 4c2e6df..ed91165 100644
--- a/Scrapping/rtMovieReviews.py
+++ b/Scrapping/rtMovieReviews.py
@@ -19,7 +19,7 @@ def rt_url(title):
 path = "chromedriver.exe"
 driver = webdriver.Chrome(path)
 #change Password to actual password. same with endpoint
-engine = create_engine("mysql://admin:Password@endpoint:3306/netflix")
+engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
 test = pd.read_sql_query('select Movie from netflixTopMovie10',engine)
 tvs = test['Movie'].to_list()
 print(tvs)
@@ -62,7 +62,16 @@ def rt_url(title):
 df = pd.DataFrame(nump,columns = ['Movie','audience_score','tomatometer'])
 df = df.astype({"Movie":'category', "audience_score":'Int64','tomatometer':'Int64'})
 print(df)
+# Read the 'netflixTopMovie10' table into a DataFrame
+df_netflix = pd.read_sql('netflixTopMovie10', engine)
+# Create a mapping between movie names and ids
+movie_id_mapping = dict(zip(df_netflix['Movie'], df_netflix['id']))
+# Map the movies in the DataFrame to their ids
+df['fk_id'] = df['Movie'].map(movie_id_mapping)
+df['fk_id'] = df['fk_id'].astype('int')
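+# Note: .astype('int') raises if any Movie title failed to map to an id (NaN fk_id).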
+# Start the rank from 1 & Rename the index col:
+df.index = df.index + 1
 df.index.names = ['rank']
+# insert to database
 df.to_sql('rottenTomatoesMovie', con=engine, if_exists='replace')
-engine.execute('alter table rottenTomatoesMovie add id int primary key auto_increment')
diff --git a/Scrapping/rtTvReviews.py b/Scrapping/rtTvReviews.py
index 5d07211..4898087 100644
--- a/Scrapping/rtTvReviews.py
+++ b/Scrapping/rtTvReviews.py
@@ -29,7 +29,7 @@ def rt_url(title):
 path = "chromedriver.exe"
 driver = webdriver.Chrome(path)
 #change Password to actual password. same with endpoint
-engine = create_engine("mysql://admin:Password@endpoint:3306/netflix")
+engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
 test = pd.read_sql_query('select TV from netflixTopTv10',engine)
 tvs = test['TV'].to_list()
 print(tvs)
@@ -60,6 +60,15 @@ def rt_url(title):
 df = pd.DataFrame(nump,columns = ['TV','audience_score','tomatometer'])
 df = df.astype({"TV":'category', "audience_score":'Int64','tomatometer':'Int64'})
+# Read the 'netflixTopTv10' table into a DataFrame
+df_netflix = pd.read_sql('netflixTopTv10', engine)
+# Create a mapping between tv names and ids
+tv_id_mapping = dict(zip(df_netflix['TV'], df_netflix['id']))
+# Map the TV titles in the DataFrame to their ids
+df['fk_id'] = df['TV'].map(tv_id_mapping)
+df['fk_id'] = df['fk_id'].astype('int')
+# Start the rank from 1 & Rename the index col:
+df.index = df.index + 1
 df.index.names = ['rank']
+# insert to database
 df.to_sql('rottenTomatoesTv', con=engine, if_exists='replace')
-engine.execute('alter table rottenTomatoesTv add id int primary key auto_increment')