Merge pull request #13 from deepankarck2/Scrap
Audience review scraping files, GPT files, and modified rottentomatoes files.
Showing 6 changed files with 503 additions and 4 deletions.
@@ -0,0 +1,97 @@
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from sqlalchemy import create_engine
import time
import urllib.request as request
import openai
import re
from string import punctuation

def rt_url(title):
    """Build a Rotten Tomatoes movie URL from a title."""
    new = title.translate(str.maketrans('', '', punctuation))  # drop punctuation
    slug = re.sub(r'\s', '_', new).lower()                     # spaces -> underscores
    return 'https://www.rottentomatoes.com/m/' + slug
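# Illustrative example (hypothetical title, not taken from the dataset):
#   rt_url("The Mother") -> 'https://www.rottentomatoes.com/m/the_mother'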
def scrape(movie):
    """Collect audience reviews of at least 80 words for a movie from Rotten Tomatoes."""
    reviews_list = []
    rotten = rt_url(movie)
    print(rotten)
    driver.get(rotten)
    time.sleep(5)
    try:
        # Navigate from the score board to the audience reviews page.
        driver.find_element(By.XPATH, '//*[@id="scoreboard"]/a[2]').click()
        time.sleep(4)
        driver.find_element(By.XPATH, '//*[@id="reviews"]/nav/ul/li[3]').click()
    except Exception:
        # The guessed URL did not resolve: fall back to a Google search and
        # follow the first result to the Rotten Tomatoes page.
        driver.get("http://www.google.com")
        google = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "q")))
        google.send_keys(movie + " rotten tomatoes")
        time.sleep(5)
        google.send_keys(Keys.ENTER)
        result = driver.find_element(By.CSS_SELECTOR, '.LC20lb.MBeuO.DKV0Md')
        result.click()
        driver.find_element(By.XPATH, '//*[@id="scoreboard"]/a[2]').click()
        time.sleep(3)
    time.sleep(3)

    # First page of reviews: keep only reviews of at least 80 words.
    reviews = driver.find_elements(By.CSS_SELECTOR, '.audience-reviews__review.js-review-text')
    for r in reviews:
        if len(re.findall(r'\w+', r.text)) >= 80:
            reviews_list.append(r.text)
    time.sleep(8)
    try:
        next_button = driver.find_element(By.XPATH, '//*[@id="reviews"]/div[1]/rt-button[2]')
        next_button.click()
    except Exception:
        # No "next" button: return whatever was collected on the first page.
        print(len(reviews_list))
        time.sleep(5)
        return reviews_list
    time.sleep(8)

    # Keep paging until more than 15 long reviews are collected or the button disappears.
    while len(reviews_list) <= 15:
        reviews = driver.find_elements(By.CSS_SELECTOR, '.audience-reviews__review.js-review-text')
        for r in reviews:
            if len(re.findall(r'\w+', r.text)) >= 80:
                reviews_list.append(r.text)
        time.sleep(7)
        try:
            next_button = driver.find_element(By.XPATH, '//*[@id="reviews"]/div[1]/rt-button[2]')
            next_button.click()
            time.sleep(8)
        except Exception:
            break
    print(len(reviews_list))
    time.sleep(5)
    return reviews_list
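# Illustrative usage (hypothetical title): scrape("The Mother") returns a list of audience
# review strings of at least 80 words each (empty if none qualify).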

path = "chromedriver.exe"
driver = webdriver.Chrome(path)

# Connection-string placeholders ({MYSQL_PASSWORD}, {MYSQL_HOST}) must be filled in before running.
engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
test = pd.read_sql_query('select Movie from netflixTopMovie10', engine)
movies = test['Movie'].to_list()

df = pd.DataFrame()
for m in movies:
    try:
        df[m] = pd.Series(scrape(m))
    except Exception:
        # If a scrape fails, restart the browser session and retry once.
        driver.quit()
        time.sleep(3)
        driver = webdriver.Chrome(path)
        df[m] = pd.Series(scrape(m))
        continue
print(df)
df.to_sql('audienceReviewsMovie', con=engine, if_exists='replace')
@@ -0,0 +1,100 @@
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from sqlalchemy import create_engine
import time
import urllib.request as request
import openai
import re
from string import punctuation

def rt_url(title):
    """Build a Rotten Tomatoes TV URL from a title like 'Show Name: Season 2'."""
    # Everything before the last colon is the show name; strip punctuation and slugify it.
    name = re.search(r'.+(?=:)', title)
    new = name.group(0).translate(str.maketrans('', '', punctuation))
    slug = re.sub(r'\s', '_', new).lower()

    # A trailing number, if present, is the season; otherwise link to the show page.
    season = re.search(r'\d+$', title)
    if season is None:
        return 'https://www.rottentomatoes.com/tv/' + slug
    return 'https://www.rottentomatoes.com/tv/' + slug + '/s0' + season.group(0)
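# Illustrative examples (hypothetical titles, not taken from the dataset):
#   rt_url("Stranger Things: Season 4") -> 'https://www.rottentomatoes.com/tv/stranger_things/s04'
#   rt_url("The Crown: Limited Series") -> 'https://www.rottentomatoes.com/tv/the_crown'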
def scrape(show):
    """Collect audience reviews of at least 80 words for a show's season from Rotten Tomatoes."""
    reviews_list = []
    rotten = rt_url(show)
    print(rotten)
    driver.get(rotten)
    time.sleep(5)
    try:
        # Navigate from the score board to the audience reviews page.
        driver.find_element(By.XPATH, '//*[@id="scoreboard"]/a[2]').click()
    except Exception:
        # If the scoreboard link is not present (e.g. a show landing page),
        # open a season from the seasons list first, then its audience reviews.
        driver.find_element(By.XPATH, '//*[@id="seasons-list"]/div/a/season-list-item').click()
        driver.find_element(By.XPATH, '//*[@id="scoreboard"]/a[2]').click()
    time.sleep(3)

    # First page of reviews: keep only reviews of at least 80 words.
    reviews = driver.find_elements(By.CSS_SELECTOR, '.audience-reviews__review.js-review-text')
    for r in reviews:
        if len(re.findall(r'\w+', r.text)) >= 80:
            reviews_list.append(r.text)
    time.sleep(8)
    try:
        next_button = driver.find_element(By.XPATH, '//*[@id="reviews"]/div[1]/rt-button[2]')
        next_button.click()
    except Exception:
        # No "next" button: return whatever was collected on the first page.
        print(len(reviews_list))
        time.sleep(5)
        return reviews_list
    time.sleep(8)

    # Keep paging until more than 40 long reviews are collected or the button disappears.
    while len(reviews_list) <= 40:
        reviews = driver.find_elements(By.CSS_SELECTOR, '.audience-reviews__review.js-review-text')
        for r in reviews:
            if len(re.findall(r'\w+', r.text)) >= 80:
                reviews_list.append(r.text)
        time.sleep(7)
        try:
            next_button = driver.find_element(By.XPATH, '//*[@id="reviews"]/div[1]/rt-button[2]')
            next_button.click()
            time.sleep(8)
        except Exception:
            break
    print(len(reviews_list))
    time.sleep(5)
    return reviews_list

path = "chromedriver.exe"

# Connection-string placeholders ({MYSQL_PASSWORD}, {MYSQL_HOST}) must be filled in before running.
engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
test = pd.read_sql_query('select TV from netflixTopTv10', engine)
tv = test['TV'].to_list()

driver = webdriver.Chrome(path)

df = pd.DataFrame()
for show in tv:
    try:
        df[show] = pd.Series(scrape(show))
    except Exception:
        # If a scrape fails, restart the browser session and retry once.
        driver.quit()
        driver = webdriver.Chrome(path)
        df[show] = pd.Series(scrape(show))
        continue
print(df)
df.to_sql('audienceReviewsTv', con=engine, if_exists='replace')
@@ -0,0 +1,153 @@
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from sqlalchemy import create_engine
import time
import urllib.request as request
import openai
import re
from string import punctuation

def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send a single-message chat request (openai-python < 1.0 interface)."""
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,  # temperature 0 minimizes randomness in the model's output
    )
    return response.choices[0].message["content"]
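# Illustrative usage (hypothetical prompt): get_completion("Reply with OK") returns the
# assistant's reply as a plain string, e.g. "OK".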
# Clean the "similar movies" line of the model output into a list of titles.
def similar_movies(s):
    words = s.split()

    if 'Similar' in words[:2]:
        # Sentences like "Similar movies include A, B, and C": drop the lead-in.
        n = re.sub(r'.+(?<=include)', '', s)
        new = re.sub(r'(?<=,\s)and', '', n)
        return re.findall(r'(.+?)(?:,|$)', new)
    elif 'Other' in words[:2]:
        # Sentences like "Other movies like A, B, and C": drop the lead-in.
        n = re.sub(r'.+(?<=like)', '', s)
        new = re.sub(r'(?<=,\s)and', '', n)
        return re.findall(r'(.+?)(?:,|$)', new)
    elif re.search(',', s):
        # Plain comma-separated list: split on commas, dropping a trailing "and".
        new = re.sub(r'(?<=,\s)and', '', s)
        return re.findall(r'(.+?)(?:,|$)', new)
    else:
        # Single title (or free text): return it as a one-element list.
        return [s]
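# Illustrative example (hypothetical model output, not real data):
#   similar_movies("Similar movies include Movie A, Movie B, and Movie C")
#   returns [' Movie A', ' Movie B', '  Movie C'] -- leading spaces are kept as-is.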

# Pull the numeric score out of a "rating: x/10" style line.
def rating(s):
    match = re.search(r'\d+(\.\d+)?(?=/)', s)
    return match.group(0)
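# Illustrative example (hypothetical inputs): rating(" 8.5/10") -> '8.5', rating(" 10/10") -> '10'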

openai.api_key = "sk-"  # placeholder: supply a real API key before running
# Connection-string placeholders ({MYSQL_PASSWORD}, {MYSQL_HOST}) must be filled in before running.
engine = create_engine("mysql://admin:{MYSQL_PASSWORD}@{MYSQL_HOST}:3306/netflix")
df = pd.read_sql_query('select * from audienceReviewsMovie', engine)
columns = df.columns.to_list()[1:]  # skip the index column; the rest are movie titles
df2 = pd.DataFrame(columns=["Movie", "userOpinion", "similarMovies", "typeOfViewer", "rating"])
for movie in columns:
    # Movies with no scraped reviews get an empty row.
    has_reviews = len(df[movie].value_counts()) > 0
    if not has_reviews:
        df3 = {'Movie': movie, 'userOpinion': None, 'similarMovies': None, 'typeOfViewer': None, 'rating': None}
        df2 = pd.concat([df2, pd.DataFrame([df3])], ignore_index=True)
        print(df2)
        continue

    # Join up to 15 reviews into one block, separated by "||", for the prompt.
    reviews = df[movie].dropna().to_list()
    text1 = " || ".join(reviews[:15])
    print(movie)
    print()
    prompt = f"""
    I will be giving you some audience reviews of a movie named {movie}.
    Each audience review is separated by ||.
    An example of the format would be: audience review || audience review ... || audience review.
    Your task is to perform the following actions:
    1 - Summarize what the users are saying about the movie.
    2 - Provide names of movies related to this movie.
    3 - Tell me who would like this movie, based on the audience reviews.
    4 - After summarizing the user reviews, rate this movie out of 10, basing the number
    on the user reviews. Higher numbers mean good; lower numbers mean bad.

    Use the following format:
    user opinion: <what users are saying about the movie>
    similar movies: <only a list of similar movies, separated by commas. Do not give a sentence, just a list>
    type of viewer: <who would like this movie>
    rating: <only the rating, in the form of a fraction>
    Text: <{text1}>
    """

    # Call the API; on failure (e.g. a rate-limit error), wait and retry once.
    try:
        response = get_completion(prompt)
    except Exception:
        time.sleep(50)
        response = get_completion(prompt)

    # Each line of the response is "label: value"; grab the text after each colon.
    output = re.findall(r'(?<=:).+', response)
    print(similar_movies(output[1]))
    df3 = {'Movie': movie,
           'userOpinion': output[0],
           'similarMovies': str(similar_movies(output[1])),
           'typeOfViewer': output[2],
           'rating': rating(output[3])}
    df2 = pd.concat([df2, pd.DataFrame([df3])], ignore_index=True)
    print(df2)
    time.sleep(50)  # pause between movies to stay under the API rate limit

# Read the 'netflixTopMovie10' table into a DataFrame
df_netflix = pd.read_sql('netflixTopMovie10', engine)
# Create a mapping between movie titles and ids
movie_id_mapping = dict(zip(df_netflix['Movie'], df_netflix['id']))
# Map the movies in the 'gptMovie' DataFrame to their ids
df2['fk_id'] = df2['Movie'].map(movie_id_mapping)
df2['fk_id'] = df2['fk_id'].astype('int')
# Start the rank from 1 and rename the index column
df2.index = df2.index + 1
df2.index.names = ['rank']
# Insert into the database
df2.to_sql('gptMovie', con=engine, if_exists='replace')