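"""api.py - a small Flask service that classifies a URL as PAID or UNPAID.

It loads the site's homepage in headless Chrome (Selenium), strips the HTML
with BeautifulSoup, and asks an LLM through the `tgpt` CLI whether the site
looks like a manufacturer of physical goods.
"""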
from urllib.parse import urlparse
import re
import subprocess
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from flask import Flask, request


def are_domains_similar(url1, url2):
    # Parse the URLs
    parsed_url1 = urlparse(url1)
    parsed_url2 = urlparse(url2)
    # Normalize the domains (drop the scheme, compare netloc in lowercase)
    domain1 = parsed_url1.netloc.lower()
    domain2 = parsed_url2.netloc.lower()
    # Treat the URLs as similar if one domain is a substring of the other
    return domain1 in domain2 or domain2 in domain1


class SiteScraper:
    def scrape_site(self, url):
        html_content = get_html_content(url)
        if html_content is None:
            return f"Skipping {url} due to redirect.\n{'-'*50}"
        cleaned_content = clean_html_content(html_content)
        return f"\n{'-'*50}\nContent {url}:\n{cleaned_content}\n{'-'*50}"


def get_html_content(url):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")  # headless mode
    prefs = {"profile.managed_default_content_settings.images": 2}  # 2: block images
    chrome_options.add_experimental_option("prefs", prefs)
    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(15)  # page load timeout in seconds
        original_url = url
        driver.get(url)
        # Check for a redirect, using a lenient domain comparison
        current_url = driver.execute_script("return window.location.href;")
        if not are_domains_similar(original_url, current_url):
            print(f"Redirect detected from {original_url} to {current_url}")
            return None
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body'))
        )
        html_content = driver.page_source
        return html_content
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        # driver may never have been created if webdriver.Chrome() raised
        if driver is not None:
            driver.quit()


def clean_html_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Remove the headers
    for header in soup(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        header.decompose()
    # Strip the remaining HTML tags and collapse the visible text
    cleaned_content = ' '.join(soup.stripped_strings)
    return cleaned_content


def paid_detector(url_input):
    scraper = SiteScraper()
    raw_site_data = scraper.scrape_site(url_input)
    if raw_site_data.startswith("Skipping"):
        return "UNPAID"  # redirected sites are treated as unpaid
    if len(raw_site_data) < 60:
        print("Length reason")
        return "UNPAID"  # too little content to classify
    raw_site = " ".join(raw_site_data.splitlines())
    prompt = (
        "I'm providing some raw data taken from the homepage of a website. "
        "Analyse it and tell me whether this website should be categorized as some sort of "
        "manufacturer of anything physical (for example - a screw, a fan, an industrial part, some product). "
        "The answer must be exactly one word - YES or NO (just make your prediction, it doesn't matter if it goes wrong). "
        "Also, any non-English website is NO. "
        f"Raw data ==> {raw_site}"
    )
    if len(prompt) > 3800:
        prompt = prompt[:3800]
    # Pass the prompt as a single argument so quotes in the scraped text
    # cannot break shell quoting.
    comm = ["tgpt", "-q", "--provider", "phind", prompt]
    try:
        output = subprocess.check_output(comm, text=True).strip()
    except Exception as e:
        print(f"ERROR -> {e}")
        print("No output from tgpt")
        return f"UNPAID - {url_input}"
    if output == "YES":
        return f"PAID - {url_input}"
    elif output == "NO":
        return f"UNPAID - {url_input}"
    else:
        return False  # unknown status


def detect_url_work(url):
    # Skip .org, .gov and .edu domains outright
    if re.search(r'\.org\b', url):
        return False
    elif re.search(r'\.gov\b', url):
        return False
    elif re.search(r'\.edu\b', url):
        return False
    else:
        return True


app = Flask(__name__)


@app.route("/", methods=["GET", "POST"])
def main():
    if request.method == "POST":
        url = request.form.get("url")
        if not detect_url_work(url):
            return {"result": "UNPAID"}
        result = paid_detector(url)
        # paid_detector returns "PAID - <url>", "UNPAID - <url>" or False
        if result and result.split(' ')[0] == "PAID":
            return {"result": "PAID"}
        else:
            return {"result": "UNPAID"}
    else:
        return """
        <form action='/' method='POST'>
            <input type='text' name='url' placeholder='Enter Domain'>
            <button>Detect</button>
        </form>
        """


if __name__ == "__main__":
    app.run(host="0.0.0.0")
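
# Example usage (a sketch; it assumes the service was started with
# `python api.py`, so it listens on Flask's default port 5000, and that
# Chrome/chromedriver and the `tgpt` CLI are available on the host):
#
#   curl -X POST -d "url=https://example.com" http://localhost:5000/
#   # -> {"result": "PAID"} or {"result": "UNPAID"}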