Skip to content

Commit

Permalink
Merge pull request #13 from deepankarck2/Scrap
Browse files Browse the repository at this point in the history
audience reviews scraping files, GPT files, and modified Rotten Tomatoes files.
  • Loading branch information
deepankarck2 authored Jun 2, 2023
2 parents e37ec13 + 89c14cc commit 5f94345
Show file tree
Hide file tree
Showing 6 changed files with 503 additions and 4 deletions.
97 changes: 97 additions & 0 deletions Scrapping/audienceReviewsMovie.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from sqlalchemy import create_engine
import time
from selenium.webdriver.common.keys import Keys
import urllib.request as request
import openai
import re
from string import punctuation
from selenium.webdriver.support import expected_conditions as EC

def rt_url(title):
    """Build a Rotten Tomatoes movie URL from a movie title.

    Strips all punctuation, replaces each whitespace character with an
    underscore, and lowercases: "The Dark Knight" -> ".../m/the_dark_knight".

    Args:
        title: Movie title as it appears in the Netflix top-10 table.

    Returns:
        Full Rotten Tomatoes movie-page URL string.
    """
    stripped = title.translate(str.maketrans('', '', punctuation))
    # Raw string for the regex — '\s' in a plain literal is an invalid
    # escape sequence (SyntaxWarning on Python 3.12+).
    slug = re.sub(r'\s', '_', stripped).lower()
    return 'https://www.rottentomatoes.com/m/' + slug
def scrape(movie):
    """Collect long audience reviews (>= 80 words) for *movie* from Rotten Tomatoes.

    Uses the module-level Selenium ``driver``. Navigates to the movie's
    guessed URL (falling back to a Google search when that fails), then
    pages through the audience-review list until at least 16 reviews are
    collected or there is no "next" button. Returns the review texts.
    """
    list = []  # NOTE(review): shadows the builtin `list`; kept as-is (doc-only pass)
    rotten = rt_url(movie)
    print(rotten)
    driver.get(rotten)
    time.sleep(5)  # fixed waits throughout — presumably tuned for page-load time
    try:
        # Audience-score link on the scoreboard, then the audience-reviews tab.
        driver.find_element(By.XPATH,'//*[@id="scoreboard"]/a[2]').click()
        time.sleep(4)
        driver.find_element(By.XPATH,'//*[@id="reviews"]/nav/ul/li[3]').click()
    except:
        # Guessed URL was wrong: find the page via a Google search instead.
        driver.get("http://www.google.com")
        google = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "q")))
        google.send_keys(movie + " rotten tomatoes")
        time.sleep(5)
        google.send_keys(Keys.ENTER)
        # First organic result; CSS classes are Google-internal and may change.
        result = driver.find_element(By.CSS_SELECTOR,'.LC20lb.MBeuO.DKV0Md')
        result.click()
        driver.find_element(By.XPATH,'//*[@id="scoreboard"]/a[2]').click()
        time.sleep(3)
    time.sleep(3)
    reviews = driver.find_elements(By.CSS_SELECTOR,'.audience-reviews__review.js-review-text')
    rev = reviews
    for r in rev:
        # Keep only substantial reviews (80 or more word tokens).
        res = len(re.findall(r'\w+', r.text))
        if res >= 80:
            list.append(r.text)
    time.sleep(8)
    try:
        # "next page" control; `next` shadows the builtin of the same name.
        next = driver.find_element(By.XPATH,'//*[@id="reviews"]/div[1]/rt-button[2]')
        next.click()
    except:
        # Only one page of reviews exists: return what we have.
        print(len(list))
        #df[show] = pd.Series(list)
        time.sleep(5)
        return list
    time.sleep(8)
    while len(list) <= 15:

        reviews = driver.find_elements(By.CSS_SELECTOR,'.audience-reviews__review.js-review-text')
        rev = reviews
        for r in rev:
            res = len(re.findall(r'\w+', r.text))
            if res >= 80:
                list.append(r.text)
        time.sleep(7)
        try:
            next = driver.find_element(By.XPATH,'//*[@id="reviews"]/div[1]/rt-button[2]')
            next.click()
            time.sleep(8)
        except:
            # No further pages available.
            break
    print(len(list))
    #df[show] = pd.Series(list)
    time.sleep(5)
    return list



# --- Script entry: scrape audience reviews for the Netflix top-10 movies ---
path = "chromedriver.exe"  # chromedriver binary expected next to this script
driver = webdriver.Chrome(path)
# NOTE(review): {MYSQL_PASSWORD} / {MYSQL_HOST} are literal text here — this is
# not an f-string, so they are never substituted; confirm credentials are
# injected elsewhere before running.
engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
test = pd.read_sql_query('select Movie from netflixTopMovie10',engine)
Movie = test['Movie'].to_list()

df = pd.DataFrame()  # one column per movie; rows hold review texts
for m in Movie:
    try:
        df[m] = pd.Series(scrape(m))

    except:
        # NOTE(review): driver.quit() closes the browser, yet scrape() is
        # retried on that same (now dead) driver — the retry will likely fail.
        # Recreating the driver before retrying looks intended; TODO confirm.
        driver.quit()
        time.sleep(3)
        df[m] = pd.Series(scrape(m))
        continue
print(df)
# Replace the whole table on each run.
df.to_sql('audienceReviewsMovie', con=engine, if_exists='replace')
100 changes: 100 additions & 0 deletions Scrapping/audienceReviewsTv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from sqlalchemy import create_engine
import time
from selenium.webdriver.common.keys import Keys
import urllib.request as request
import openai
import re
from string import punctuation

def rt_url(title):
    """Build a Rotten Tomatoes TV URL from a Netflix-style show title.

    Titles typically look like "Show Name: Season 2": the part before the
    (last) colon becomes the URL slug, and a trailing number becomes the
    season suffix ("/s0<n>").

    Args:
        title: Show title as it appears in the Netflix top-10 table.

    Returns:
        Full Rotten Tomatoes TV-page URL string.
    """
    # Part before the last ':' if present, otherwise the whole title.
    # (The original crashed with AttributeError on titles without a colon.)
    match = re.search(r'.+(?=:)', title)
    base = match.group(0) if match else title
    stripped = base.translate(str.maketrans('', '', punctuation))
    slug = re.sub(r'\s', '_', stripped).lower()

    # A trailing digit run is taken to be the season number.
    season = re.search(r'\d+$', title)
    if season is None:
        return 'https://www.rottentomatoes.com/tv/' + slug

    return 'https://www.rottentomatoes.com/tv/' + slug + '/s0' + season.group(0)
def scrape(show):
    """Collect long audience reviews (>= 80 words) for *show* from Rotten Tomatoes.

    Uses the module-level Selenium ``driver``. Pages through the audience
    review list until at least 41 reviews are collected or there is no
    "next" button. Returns the review texts.
    """
    list = []  # NOTE(review): shadows the builtin `list`; kept as-is (doc-only pass)
    rotten = rt_url(show)
    print(rotten)
    driver.get(rotten)
    time.sleep(5)  # fixed waits throughout — presumably tuned for page-load time
    try:
        # Audience-score link on the scoreboard.
        driver.find_element(By.XPATH,'//*[@id="scoreboard"]/a[2]').click()
    except:
        # Landed on a season-list page instead: open the first season, retry.
        driver.find_element(By.XPATH,'//*[@id="seasons-list"]/div/a/season-list-item').click()
        driver.find_element(By.XPATH,'//*[@id="scoreboard"]/a[2]').click()
    time.sleep(3)

    reviews = driver.find_elements(By.CSS_SELECTOR,'.audience-reviews__review.js-review-text')
    rev = reviews
    for r in rev:
        # Keep only substantial reviews (80 or more word tokens).
        res = len(re.findall(r'\w+', r.text))
        if res >= 80:
            list.append(r.text)
    time.sleep(8)
    try:
        # "next page" control; `next` shadows the builtin of the same name.
        next = driver.find_element(By.XPATH,'//*[@id="reviews"]/div[1]/rt-button[2]')
        next.click()
    except:
        # Only one page of reviews exists: return what we have.
        print(len(list))
        #df[show] = pd.Series(list)
        time.sleep(5)
        return list
    time.sleep(8)
    while len(list) <= 40:

        reviews = driver.find_elements(By.CSS_SELECTOR,'.audience-reviews__review.js-review-text')
        rev = reviews
        for r in rev:
            res = len(re.findall(r'\w+', r.text))
            if res >= 80:
                list.append(r.text)
        time.sleep(7)
        try:
            next = driver.find_element(By.XPATH,'//*[@id="reviews"]/div[1]/rt-button[2]')
            next.click()
            time.sleep(8)
        except:
            # No further pages available.
            break
    print(len(list))
    #df[show] = pd.Series(list)
    time.sleep(5)
    return list



# --- Script entry: scrape audience reviews for the Netflix top-10 TV shows ---
path = "chromedriver.exe"  # chromedriver binary expected next to this script

# NOTE(review): {MYSQL_PASSWORD} / {MYSQL_HOST} are literal text here — this is
# not an f-string, so they are never substituted; confirm credentials are
# injected elsewhere before running.
engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
test = pd.read_sql_query('select TV from netflixTopTv10',engine)
tv = test['TV'].to_list()

driver = webdriver.Chrome(path)


df = pd.DataFrame()  # one column per show; rows hold review texts
for show in tv:
    try:
        df[show] = pd.Series(scrape(show))

    except:
        # NOTE(review): driver.quit() closes the browser, yet scrape() is
        # retried on that same (now dead) driver — the retry will likely fail.
        # Recreating the driver before retrying looks intended; TODO confirm.
        driver.quit()
        df[show] = pd.Series(scrape(show))
        continue
print(df)
# Replace the whole table on each run.
df.to_sql('audienceReviewsTv', con=engine, if_exists='replace')
153 changes: 153 additions & 0 deletions Scrapping/gptMovie.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from sqlalchemy import create_engine
import time
from selenium.webdriver.common.keys import Keys
import urllib.request as request
import openai
import re
from string import punctuation


def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send a single-turn user prompt to the OpenAI chat API and return the reply text.

    Args:
        prompt: The user message to send.
        model: Chat model identifier (defaults to gpt-3.5-turbo).

    Returns:
        The assistant's reply content as a string.
    """
    # temperature=0 minimizes randomness so repeated runs stay comparable.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return response.choices[0].message["content"]
#cleaning output
#cleaning output
def similar_movies(s):
    """Parse the model's "similar shows" line into a list of titles.

    Handles three observed response shapes:
      * "Similar ... include A, B, and C"  -> strip the lead-in, split on commas
      * "Other ... like A, B, and C"       -> strip the lead-in, split on commas
      * a bare comma-separated list        -> split on commas
    Anything else is returned unchanged as a single-element list.

    Args:
        s: One output line from the GPT response.

    Returns:
        List of title substrings (leading whitespace is preserved, as before).
    """
    words = s.split()
    if 'Similar' in words[:2]:
        # Drop everything up to and including the word "include".
        return _split_titles(re.sub(r'.+(?<=include)', '', s))
    if 'Other' in words[:2]:
        # Drop everything up to and including the word "like".
        return _split_titles(re.sub(r'.+(?<=like)', '', s))
    if re.search(',', s):
        return _split_titles(s)
    return [s]


def _split_titles(s):
    """Drop the "and" that follows the last comma, then split on commas."""
    cleaned = re.sub(r'(?<=,\s)and', '', s)
    return re.findall(r'(.+?)(?:,|$)', cleaned)

def rating(s):
    """Extract the numeric rating preceding a '/' (e.g. "8.5/10" -> "8.5").

    Args:
        s: The GPT "rating" output line, expected to contain "<number>/10".

    Returns:
        The number before the slash as a string, or None when no such
        pattern exists (the original raised AttributeError instead).
    """
    # \d+ (not \d) so multi-digit ratings like "10/10" aren't truncated to "0".
    match = re.search(r'\d+(\.\d+)?(?=/)', s)
    return match.group(0) if match else None

# --- Script entry: summarize scraped movie reviews with GPT, store in MySQL ---
openai.api_key = "sk-"  # NOTE(review): placeholder key — must be set before running
# NOTE(review): {MYSQL_PASSWORD} / {MYSQL_HOST} are literal text (not an
# f-string) and are never substituted; verify credential handling.
engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
df = pd.read_sql_query('select * from audienceReviewsMovie',engine)
columns = df.columns.to_list()[1:]  # skip the index column; the rest are movie titles
# NOTE(review): the column is declared "similarMovie" here, but the rows
# appended below use the key 'similarMovies' — the declared column stays
# empty and an extra column is added; confirm which spelling is intended.
df2 = pd.DataFrame(columns= ["Movie","userOpinion","similarMovie","typeOfViewer","rating"])
for movie in columns:
    # True when the movie's column holds at least one non-null review.
    count = len(df[movie].value_counts()) > 0
    if not count:
        # No reviews were scraped: record an all-None row and move on.
        # NOTE(review): DataFrame.append was removed in pandas 2.x; pin
        # pandas < 2 or migrate to pd.concat.
        df3 = {'Movie':movie,'userOpinion':None,'similarMovies':None, 'typeOfViewer':None,'rating':None}
        df2 = df2.append(df3,ignore_index = True)
        print(df2)
        continue



    # query = 'select {} from audienceReviewsTv'.format(show)
    # Queen Charlotte: A Bridgerton Story: Series
    # df1 = pd.read_sql_query('select ',engine)
    reviews = df[movie].dropna().to_list()
    # Cap at 15 reviews to keep the prompt within the model's context window.
    text1= " || ".join(reviews[:15])
    #print(text1)
    print(movie)
    print()
    # # print(text)

    # text2 = f"""
    # this show is so good cant believe people are hating on it

    # """

    # The '#' characters below are part of the prompt text itself, not
    # Python comments — the model is asked for four labeled lines that the
    # regex after the call splits on ':'.
    prompt = f"""
    # I will be giving you some audience reviews of a movie named {movie}.
    # Each audience review is separated by ||.
    # An example of the format would be audience review || audience review.....|| audience review.
    # Your task is to perform the following actions:
    # 1 - generalize what the users are saying about the movie
    # 2 - provide names of shows related to this movie
    # 3 - tell me who would like this show based on audience reviews
    # 4 - after generalizing users reviews, rate this movie out of 10.Based this number off of the user reviews. \
    # Higher numbers means good. Lower numbers means bad. \
    #
    # Use the following format:
    # user opinion: <what are users saying about the movie>
    # similar shows: <only a list of similar movie. each show should be separated by commas. Do not give a sentence, just a list>
    # type of viewer:<who would like this movie>
    # rating: <only rating in the form of a fraction>
    # Text: <{text1}>
    # """

    try:
        response = get_completion(prompt)
        # print(response)

        # Grab the text after each ':' — one capture per labeled output line.
        p = r'(?<=:).+'

        output = re.findall(p,response)
        #print(output)
        print(similar_movies(output[3]))
        df3 = {'Movie':movie,'userOpinion':output[0],'similarMovies':str(similar_movies(output[1])), 'typeOfViewer':output[2],'rating':rating(output[3])}
        df2 = df2.append(df3,ignore_index = True)
        print(df2)
        time.sleep(50)  # presumably rate-limit spacing between API calls
    except:
        # Single retry after a pause (presumably for rate-limit errors);
        # a second failure propagates and aborts the script.
        time.sleep(50)
        response = get_completion(prompt)
        # print(response)

        p = r'(?<=:).+'

        output = re.findall(p,response)
        #print(output)
        print(similar_movies(output[3]))
        df3 = {'Movie':movie,'userOpinion':output[0],'similarMovies':str(similar_movies(output[1])), 'typeOfViewer':output[2],'rating':rating(output[3])}
        df2 = df2.append(df3,ignore_index = True)
        print(df2)
        time.sleep(50)

# Read the 'netflixTopTv10' table into a DataFrame
df_netflix = pd.read_sql('netflixTopMovie10', engine)
# Create a mapping between tv names and ids
movie_id_mapping = dict(zip(df_netflix['Movie'], df_netflix['id']))
# Map the movie in 'gptMovie' DataFrame to their ids
df2['fk_id'] = df2['Movie'].map(movie_id_mapping )
df2['fk_id'] = df2['fk_id'].astype('int')
# Start the rank from 1 & Rename the index col:
df2.index = df2.index + 1
df2.index.names = ['rank']
# insert to database
df2.to_sql('gptMovie', con=engine, if_exists='replace')
Loading

0 comments on commit 5f94345

Please sign in to comment.