From 89c14cc5f784f830f33ff5696b4fd66ca36b48d1 Mon Sep 17 00:00:00 2001
From: steve
Date: Thu, 1 Jun 2023 11:48:44 -0400
Subject: [PATCH] Add scraping files for written audience reviews found on
 Rotten Tomatoes, add GPT feature files, and modify the Rotten Tomatoes files
 to include foreign keys

---
 Scrapping/audienceReviewsMovie.py |  97 +++++++++++++++++++
 Scrapping/audienceReviewsTv.py    | 100 +++++++++++++++++++
 Scrapping/gptMovie.py             | 153 ++++++++++++++++++++++++++++++
 Scrapping/gptTv.py                | 131 +++++++++++++++++++++++++
 Scrapping/rtMovieReviews.py       |  13 ++-
 Scrapping/rtTvReviews.py          |  13 ++-
 6 files changed, 503 insertions(+), 4 deletions(-)
 create mode 100644 Scrapping/audienceReviewsMovie.py
 create mode 100644 Scrapping/audienceReviewsTv.py
 create mode 100644 Scrapping/gptMovie.py
 create mode 100644 Scrapping/gptTv.py

diff --git a/Scrapping/audienceReviewsMovie.py b/Scrapping/audienceReviewsMovie.py
new file mode 100644
index 0000000..9a93912
--- /dev/null
+++ b/Scrapping/audienceReviewsMovie.py
@@ -0,0 +1,97 @@
+import numpy as np
+import pandas as pd
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support.ui import WebDriverWait
+from sqlalchemy import create_engine
+import time
+from selenium.webdriver.common.keys import Keys
+import urllib.request as request
+import openai
+import re
+from string import punctuation
+from selenium.webdriver.support import expected_conditions as EC
+
+def rt_url(title):
+    new = title.translate(str.maketrans('', '', punctuation))
+    l = re.sub(r'\s','_',new).lower()
+    link = 'https://www.rottentomatoes.com/m/'+ l
+    return link
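+# Illustrative example (hypothetical title): rt_url("The Mother") returns
+# "https://www.rottentomatoes.com/m/the_mother".
+#
+# scrape(movie): opens the movie's Rotten Tomatoes page (falling back to a Google
+# search for "<movie> rotten tomatoes" if the direct URL fails), collects audience
+# reviews of at least 80 words, and keeps clicking the "next" button until more
+# than 15 qualifying reviews are gathered or no further pages exist.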
+def scrape(movie):
+    list = []
+    rotten = rt_url(movie)
+    print(rotten)
+    driver.get(rotten)
+    time.sleep(5)
+    try:
+        driver.find_element(By.XPATH,'//*[@id="scoreboard"]/a[2]').click()
+        time.sleep(4)
+        driver.find_element(By.XPATH,'//*[@id="reviews"]/nav/ul/li[3]').click()
+    except:
+        driver.get("http://www.google.com")
+        google = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "q")))
+        google.send_keys(movie + " rotten tomatoes")
+        time.sleep(5)
+        google.send_keys(Keys.ENTER)
+        result = driver.find_element(By.CSS_SELECTOR,'.LC20lb.MBeuO.DKV0Md')
+        result.click()
+        driver.find_element(By.XPATH,'//*[@id="scoreboard"]/a[2]').click()
+        time.sleep(3)
+    time.sleep(3)
+    reviews = driver.find_elements(By.CSS_SELECTOR,'.audience-reviews__review.js-review-text')
+    rev = reviews
+    for r in rev:
+        res = len(re.findall(r'\w+', r.text))
+        if res >= 80:
+            list.append(r.text)
+    time.sleep(8)
+    try:
+        next = driver.find_element(By.XPATH,'//*[@id="reviews"]/div[1]/rt-button[2]')
+        next.click()
+    except:
+        print(len(list))
+        #df[show] = pd.Series(list)
+        time.sleep(5)
+        return list
+    time.sleep(8)
+    while len(list) <= 15:
+
+        reviews = driver.find_elements(By.CSS_SELECTOR,'.audience-reviews__review.js-review-text')
+        rev = reviews
+        for r in rev:
+            res = len(re.findall(r'\w+', r.text))
+            if res >= 80:
+                list.append(r.text)
+        time.sleep(7)
+        try:
+            next = driver.find_element(By.XPATH,'//*[@id="reviews"]/div[1]/rt-button[2]')
+            next.click()
+            time.sleep(8)
+        except:
+            break
+    print(len(list))
+    #df[show] = pd.Series(list)
+    time.sleep(5)
+    return list
+
+
+
+path = "chromedriver.exe"
+driver = webdriver.Chrome(path)
+engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
+test = pd.read_sql_query('select Movie from netflixTopMovie10',engine)
+Movie = test['Movie'].to_list()
+
+df = pd.DataFrame()
+for m in Movie:
+    try:
+        df[m] = pd.Series(scrape(m))
+
+    except:
+        driver.quit()
+        time.sleep(3)
+        driver = webdriver.Chrome(path)  # restart the browser before retrying
+        df[m] = pd.Series(scrape(m))
+        continue
+print(df)
+df.to_sql('audienceReviewsMovie', con=engine, if_exists='replace')
\ No newline at end of file
diff --git a/Scrapping/audienceReviewsTv.py b/Scrapping/audienceReviewsTv.py
new file mode 100644
index 0000000..c92020c
--- /dev/null
+++ b/Scrapping/audienceReviewsTv.py
@@ -0,0 +1,100 @@
+import numpy as np
+import pandas as pd
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from sqlalchemy import create_engine
+import time
+from selenium.webdriver.common.keys import Keys
+import urllib.request as request
+import openai
+import re
+from string import punctuation
+
+def rt_url(title):
+    name = r'.+(?=:)'
+    na = re.search(name,title)
+    new = na.group(0).translate(str.maketrans('', '', punctuation))
+    l = re.sub(r'\s','_',new).lower()
+
+
+    num = r'\d+$'
+    n = re.search(num,title)
+    if n is None:
+        return 'https://www.rottentomatoes.com/tv/'+ l
+
+
+    nu = n.group(0)
+    link ='https://www.rottentomatoes.com/tv/'+ l +'/s0' + nu
+    return link
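+# rt_url assumes the title is formatted like "Name: Season N": the text before the
+# colon becomes the URL slug and a trailing number becomes the season suffix.
+# Illustrative example (hypothetical title): rt_url("The Night Agent: Season 1")
+# returns "https://www.rottentomatoes.com/tv/the_night_agent/s01".
+#
+# scrape(show): same flow as the movie version, but falls back to the first entry in
+# the seasons list when the scoreboard link is missing, and keeps paging until more
+# than 40 qualifying reviews (at least 80 words each) are collected.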
"content": prompt}] + response = openai.ChatCompletion.create( + model=model, + messages=messages, + temperature=0, # this is the degree of randomness of the model's output + ) + return response.choices[0].message["content"] +#cleaning output +def similar_movies(s): + list = s.split() + + if 'Similar' in list[:2]: + p = r'.+(?<=include)' + n = re.sub(p,'',s) + p = r'(?<=,\s)and' + new = re.sub(p,'',n) + p = r'(.+?)(?:,|$)' + cleaned = re.findall(p,new) + return cleaned + elif 'Other' in list[:2]: + p = r'.+(?<=like)' + n = re.sub(p,'',s) + p = r'(?<=,\s)and' + new = re.sub(p,'',n) + p = r'(.+?)(?:,|$)' + cleaned = re.findall(p,new) + return cleaned + + elif bool(re.search(',',s)): + p = r'(?<=,\s)and' + new = re.sub(p,'',s) + p = r'(.+?)(?:,|$)' + cleaned = re.findall(p,new) + return cleaned + else: + cleaned = [s] + return cleaned + +def rating(s): + p = r'\d(\.\d)?(?=\/)' + new = re.search(p,s) + return new.group(0) + +openai.api_key = "sk-" +engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix") +df = pd.read_sql_query('select * from audienceReviewsMovie',engine) +columns = df.columns.to_list()[1:] +df2 = pd.DataFrame(columns= ["Movie","userOpinion","similarMovie","typeOfViewer","rating"]) +for movie in columns: + count = len(df[movie].value_counts()) > 0 + if not count: + df3 = {'Movie':movie,'userOpinion':None,'similarMovies':None, 'typeOfViewer':None,'rating':None} + df2 = df2.append(df3,ignore_index = True) + print(df2) + continue + + + + # query = 'select {} from audienceReviewsTv'.format(show) + # Queen Charlotte: A Bridgerton Story: Series + # df1 = pd.read_sql_query('select ',engine) + reviews = df[movie].dropna().to_list() + text1= " || ".join(reviews[:15]) + #print(text1) + print(movie) + print() +# # print(text) + +# text2 = f""" +# this show is so good cant believe people are hating on it + +# """ + + prompt = f""" + + # I will be giving you some audience reviews of a movie named {movie}. + # Each audience review is separated by ||. + # An example of the format would be audience review || audience review.....|| audience review. + + # Your task is to perform the following actions: + # 1 - generalize what the users are saying about the movie + # 2 - provide names of shows related to this movie + # 3 - tell me who would like this show based on audience reviews + # 4 - after generalizing users reviews, rate this movie out of 10.Based this number off of the user reviews. \ + # Higher numbers means good. Lower numbers means bad. 
+for movie in columns:
+    has_reviews = len(df[movie].value_counts()) > 0
+    if not has_reviews:
+        df3 = {'Movie':movie,'userOpinion':None,'similarMovies':None, 'typeOfViewer':None,'rating':None}
+        df2 = df2.append(df3,ignore_index = True)
+        print(df2)
+        continue
+
+
+
+    # query = 'select {} from audienceReviewsTv'.format(show)
+    # Queen Charlotte: A Bridgerton Story: Series
+    # df1 = pd.read_sql_query('select ',engine)
+    reviews = df[movie].dropna().to_list()
+    text1= " || ".join(reviews[:15])
+    #print(text1)
+    print(movie)
+    print()
+# # print(text)
+
+# text2 = f"""
+# this show is so good cant believe people are hating on it
+
+# """
+
+    prompt = f"""
+
+    # I will be giving you some audience reviews of a movie named {movie}.
+    # Each audience review is separated by ||.
+    # An example of the format would be audience review || audience review.....|| audience review.
+
+    # Your task is to perform the following actions:
+    # 1 - generalize what the users are saying about the movie
+    # 2 - provide names of movies related to this movie
+    # 3 - tell me who would like this movie based on audience reviews
+    # 4 - after generalizing users' reviews, rate this movie out of 10. Base this number off of the user reviews. \
+    # Higher numbers mean good. Lower numbers mean bad. \
+    #
+
+
+
+    # Use the following format:
+    # user opinion:
+    # similar movies:
+    # type of viewer:
+    # rating:
+    # Text: <{text1}>
+
+    # """
+
+    try:
+        response = get_completion(prompt)
+        # print(response)
+
+        p = r'(?<=:).+'
+
+        output = re.findall(p,response)
+        #print(output)
+        print(similar_movies(output[3]))
+        df3 = {'Movie':movie,'userOpinion':output[0],'similarMovies':str(similar_movies(output[1])), 'typeOfViewer':output[2],'rating':rating(output[3])}
+        df2 = df2.append(df3,ignore_index = True)
+        print(df2)
+        time.sleep(50)
+    except:
+        time.sleep(50)
+        response = get_completion(prompt)
+        # print(response)
+
+        p = r'(?<=:).+'
+
+        output = re.findall(p,response)
+        #print(output)
+        print(similar_movies(output[3]))
+        df3 = {'Movie':movie,'userOpinion':output[0],'similarMovies':str(similar_movies(output[1])), 'typeOfViewer':output[2],'rating':rating(output[3])}
+        df2 = df2.append(df3,ignore_index = True)
+        print(df2)
+        time.sleep(50)
+
+# Read the 'netflixTopMovie10' table into a DataFrame
+df_netflix = pd.read_sql('netflixTopMovie10', engine)
+# Create a mapping between movie names and ids
+movie_id_mapping = dict(zip(df_netflix['Movie'], df_netflix['id']))
+# Map the movies in the 'gptMovie' DataFrame to their ids
+df2['fk_id'] = df2['Movie'].map(movie_id_mapping)
+df2['fk_id'] = df2['fk_id'].astype('int')
+# Start the rank from 1 & Rename the index col:
+df2.index = df2.index + 1
+df2.index.names = ['rank']
+# insert to database
+df2.to_sql('gptMovie', con=engine, if_exists='replace')
\ No newline at end of file
diff --git a/Scrapping/gptTv.py b/Scrapping/gptTv.py
new file mode 100644
index 0000000..9f450e2
--- /dev/null
+++ b/Scrapping/gptTv.py
@@ -0,0 +1,131 @@
+import numpy as np
+import pandas as pd
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from sqlalchemy import create_engine
+import time
+from selenium.webdriver.common.keys import Keys
+import urllib.request as request
+import openai
+import re
+from string import punctuation
+
+
+def get_completion(prompt, model="gpt-3.5-turbo"):
+    messages = [{"role": "user", "content": prompt}]
+    response = openai.ChatCompletion.create(
+        model=model,
+        messages=messages,
+        temperature=0, # this is the degree of randomness of the model's output
+    )
+    return response.choices[0].message["content"]
+def similar_shows(s):
+    list = s.split()
+
+    if 'Similar' in list[:2]:
+        p = r'.+(?<=include)'
+        n = re.sub(p,'',s)
+        p = r'(?<=,\s)and'
+        new = re.sub(p,'',n)
+        p = r'(.+?)(?:,|$)'
+        cleaned = re.findall(p,new)
+        return cleaned
+    elif 'Other' in list[:2]:
+        p = r'.+(?<=like)'
+        n = re.sub(p,'',s)
+        p = r'(?<=,\s)and'
+        new = re.sub(p,'',n)
+        p = r'(.+?)(?:,|$)'
+        cleaned = re.findall(p,new)
+        return cleaned
+
+    elif bool(re.search(',',s)):
+        p = r'(?<=,\s)and'
+        new = re.sub(p,'',s)
+        p = r'(.+?)(?:,|$)'
+        cleaned = re.findall(p,new)
+        return cleaned
+    else:
+        cleaned = [s]
+        return cleaned
+
+def rating(s):
+    p = r'\d(\.\d)?(?=\/)'
+    new = re.search(p,s)
+    return new.group(0)
+
+openai.api_key = "sk-"
+engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
+df = pd.read_sql_query('select * from audienceReviewsTv',engine)
+columns = df.columns.to_list()[1:]
+df2 = pd.DataFrame(columns= ["TV","userOpinion","similarShows","typeOfViewer","rating"])
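+# Same per-title flow as gptMovie.py, applied to each show column in
+# audienceReviewsTv. Note there is no guard here for columns with zero reviews and
+# no retry around the OpenAI call, so a failed request will stop the script.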
+for show in columns:
+
+
+
+    # query = 'select {} from audienceReviewsTv'.format(show)
+    # Queen Charlotte: A Bridgerton Story: Series
+    # df1 = pd.read_sql_query('select ',engine)
+    reviews = df[show].dropna().to_list()
+    text1= " || ".join(reviews[:15])
+    #print(text1)
+    print(show)
+    print()
+# # print(text)
+
+# text2 = f"""
+# this show is so good cant believe people are hating on it
+
+# """
+
+    prompt = f"""
+
+    # I will be giving you some audience reviews of a show named {show}.
+    # Each audience review is separated by ||.
+    # An example of the format would be audience review || audience review.....|| audience review.
+
+    # Your task is to perform the following actions:
+    # 1 - generalize what the users are saying about the show
+    # 2 - provide names of shows related to this show
+    # 3 - tell me who would like this show based on audience reviews
+    # 4 - after generalizing users' reviews, rate this show out of 10. Base this number off of the user reviews. \
+    # Higher numbers mean good. Lower numbers mean bad. \
+    #
+
+
+
+    # Use the following format:
+    # user opinion:
+    # similar shows:
+    # type of viewer:
+    # rating:
+    # Text: <{text1}>
+
+    # """
+
+    response = get_completion(prompt)
+    # print(response)
+
+    p = r'(?<=:).+'
+
+    output = re.findall(p,response)
+    #print(output)
+    print(similar_shows(output[3]))
+    df3 = {'TV':show,'userOpinion':output[0],'similarShows':str(similar_shows(output[1])), 'typeOfViewer':output[2],'rating':rating(output[3])}
+    df2 = df2.append(df3,ignore_index = True)
+    print(df2)
+    time.sleep(50)
+
+# Read the 'netflixTopTv10' table into a DataFrame
+df_netflix = pd.read_sql('netflixTopTv10', engine)
+# Create a mapping between tv names and ids
+tv_id_mapping = dict(zip(df_netflix['TV'], df_netflix['id']))
+# Map the TV titles in the 'gptTv' DataFrame to their ids
+df2['fk_id'] = df2['TV'].map(tv_id_mapping)
+df2['fk_id'] = df2['fk_id'].astype('int')
+# Start the rank from 1 & Rename the index col:
+df2.index = df2.index + 1
+df2.index.names = ['rank']
+# insert to database
+df2.to_sql('gptTV', con=engine, if_exists='replace')
\ No newline at end of file
diff --git a/Scrapping/rtMovieReviews.py b/Scrapping/rtMovieReviews.py
index 4c2e6df..ed91165 100644
--- a/Scrapping/rtMovieReviews.py
+++ b/Scrapping/rtMovieReviews.py
@@ -19,7 +19,7 @@ def rt_url(title):
 path = "chromedriver.exe"
 driver = webdriver.Chrome(path)
 #change Password to actual password. same with endpoint
-engine = create_engine("mysql://admin:Password@endpoint:3306/netflix")
+engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
 test = pd.read_sql_query('select Movie from netflixTopMovie10',engine)
 tvs = test['Movie'].to_list()
 print(tvs)
@@ -62,7 +62,16 @@ def rt_url(title):
 df = pd.DataFrame(nump,columns = ['Movie','audience_score','tomatometer'])
 df = df.astype({"Movie":'category', "audience_score":'Int64','tomatometer':'Int64'})
 print(df)
+# Read the 'netflixTopMovie10' table into a DataFrame
+df_netflix = pd.read_sql('netflixTopMovie10', engine)
+# Create a mapping between movie names and ids
+movie_id_mapping = dict(zip(df_netflix['Movie'], df_netflix['id']))
+# Map the movies in the DataFrame to their ids
+df['fk_id'] = df['Movie'].map(movie_id_mapping)
+df['fk_id'] = df['fk_id'].astype('int')
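+# Note: .astype('int') raises if any Movie title failed to map to an id (NaN fk_id).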
+# Start the rank from 1 & Rename the index col:
+df.index = df.index + 1
 df.index.names = ['rank']
+# insert to database
 df.to_sql('rottenTomatoesMovie', con=engine, if_exists='replace')
-engine.execute('alter table rottenTomatoesMovie add id int primary key auto_increment')
diff --git a/Scrapping/rtTvReviews.py b/Scrapping/rtTvReviews.py
index 5d07211..4898087 100644
--- a/Scrapping/rtTvReviews.py
+++ b/Scrapping/rtTvReviews.py
@@ -29,7 +29,7 @@ def rt_url(title):
 path = "chromedriver.exe"
 driver = webdriver.Chrome(path)
 #change Password to actual password. same with endpoint
-engine = create_engine("mysql://admin:Password@endpoint:3306/netflix")
+engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
 test = pd.read_sql_query('select TV from netflixTopTv10',engine)
 tvs = test['TV'].to_list()
 print(tvs)
@@ -60,6 +60,15 @@ def rt_url(title):
 df = pd.DataFrame(nump,columns = ['TV','audience_score','tomatometer'])
 df = df.astype({"TV":'category', "audience_score":'Int64','tomatometer':'Int64'})
+# Read the 'netflixTopTv10' table into a DataFrame
+df_netflix = pd.read_sql('netflixTopTv10', engine)
+# Create a mapping between tv names and ids
+tv_id_mapping = dict(zip(df_netflix['TV'], df_netflix['id']))
+# Map the TV titles in the DataFrame to their ids
+df['fk_id'] = df['TV'].map(tv_id_mapping)
+df['fk_id'] = df['fk_id'].astype('int')
+# Start the rank from 1 & Rename the index col:
+df.index = df.index + 1
 df.index.names = ['rank']
+# insert to database
 df.to_sql('rottenTomatoesTv', con=engine, if_exists='replace')
-engine.execute('alter table rottenTomatoesTv add id int primary key auto_increment')