
Commit d4728c3

fix web_scrap errors
saeeddhqan committed Jul 12, 2021
1 parent 79beacd commit d4728c3
Showing 2 changed files with 20 additions and 20 deletions.
4 changes: 3 additions & 1 deletion maryam/core/util/urlib.py
@@ -37,7 +37,7 @@ def parse(self):
def unparse(self, urparse):
return urlparse.urlunparse(urparse)

- def sub_service(self, serv=None):
+ def sub_service(self, serv=None, ifany=False):
'''Add protocol to url or replace it or clean it'''
urparse = re.split(r'://', self.url)
if not serv:
@@ -47,6 +47,8 @@ def sub_service(self, serv=None):
# Add protocol
serv = re.sub(r'://', '', serv)
if len(urparse) == 2:
+ if ifany:
+ return self.url
del urparse[0]
url = f"{serv}://{''.join(urparse)}"
else:
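A note on what the new ifany flag does: sub_service normally forces the given protocol onto the URL, and with ifany=True it now backs off when the URL already carries a scheme. A minimal standalone sketch of that logic (an illustration of the behaviour shown in the diff, not the urlib class itself):

import re

def sub_service(url, serv='http', ifany=False):
    # re.split gives two parts when the URL already has a scheme.
    parts = re.split(r'://', url)
    serv = re.sub(r'://', '', serv)
    if len(parts) == 2:
        if ifany:
            # ifany=True: a scheme is already there, so leave the URL untouched.
            return url
        return f"{serv}://{parts[1]}"
    return f"{serv}://{parts[0]}"

print(sub_service('https://example.com', 'http', ifany=True))  # https://example.com
print(sub_service('example.com', 'http', ifany=True))          # http://example.com
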
36 changes: 17 additions & 19 deletions maryam/core/util/web_scrap.py
@@ -18,7 +18,7 @@
import re
import concurrent.futures

- # Web Scraper v5.1
+ # Web Scrap v5.1

class main:

@@ -28,10 +28,11 @@ def __init__(self, url, debug=False, limit=1, thread_count=1):
url : First page address
debug : Show the result at moment
limit : Web scrap level(if it's 1 that's mean just search in first page)
- thread_count : Count of links for open at per lap
+ thread_count : Number of links for each lap
"""
self.framework = main.framework
- self.url = url
+ self.parser = self.framework.urlib(url)
+ self.url = self.parser.sub_service(self.framework._global_options['protocol'], ifany=True)
self.urlib = self.framework.urlib
self.debug = debug
self.limit = limit
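
The constructor change above means the scraper no longer stores the raw url argument: it wraps it in urlib and runs sub_service with the protocol from the framework's global options, so a bare host picks up a scheme while an already-qualified URL is kept as-is thanks to ifany=True. A rough, self-contained sketch of the effect using urllib.parse as a stand-in for the urlib helper (the option name 'protocol' comes from the diff; the rest is illustrative):

from urllib.parse import urlparse

_global_options = {'protocol': 'https'}  # stand-in for self.framework._global_options

def normalize_start_url(url):
    # Same idea as self.parser.sub_service(protocol, ifany=True):
    # only prepend the configured protocol when the URL has no scheme yet.
    if urlparse(url).scheme:
        return url
    return f"{_global_options['protocol']}://{url}"

print(normalize_start_url('example.com'))         # https://example.com
print(normalize_start_url('http://example.com'))  # http://example.com (kept as-is)
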
@@ -106,39 +107,35 @@ def add_email(self, link):
return True

def joiner(self, url):
url = str(url)
# ADD slash to end url
- url = url
urparse = self.urlib(url)
urparse.url = urparse.quote if '%' not in url else url
- urparse2 = self.urlib(str(self.url))
- cond1 = url.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:')
+ cond1 = url.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:') or '.' not in url
cond12 = url.endswith(':')
cond2 = len(
urparse.url) > 1 and '%3a//' not in urparse.url and urparse.url[:2] != '//'
cond3 = urparse.url[:2] == '//'
if cond1 or cond12:
return False
elif cond2:
- urparse.url = urparse2.join(url)
+ urparse.url = self.parser.join(url)
elif cond3:
urparse.url = url
else:
- urparse.url = urparse2.join(url)
- return str(urparse.url)
+ urparse.url = self.parser.join(url)
+ return urparse.url

def link_category(self, urls):
links = []
for url in urls:
join = self.joiner(url)

##########################
# ADD CDN, PHONE and EMAIL
##########################
cond1 = not join or (self.add_cdn(url) or self.add_phone(url) or self.add_email(url))
if cond1:
continue

- ends = join.endswith
join = str(join).replace('\/', '/')
##########################
# ADD OUT SCOPE
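
Stepping back to the joiner changes above: joiner takes whatever shows up in a scraped href and either rejects it or resolves it to an absolute URL against the start page. The commit swaps the throwaway urparse2 object for the shared self.parser and adds one more rejection rule, dropping anything without a dot. A hedged sketch of the same flow, using urllib.parse.urljoin as a stand-in for the framework's join method (the base URL and test links are made up):

from urllib.parse import urljoin

BASE = 'https://example.com/blog/'

def joiner(href):
    href = str(href)
    # New rule from the commit: junk tokens and anything without a '.' are rejected.
    if href.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:') or '.' not in href:
        return False
    if href.endswith(':'):
        return False
    if href.startswith('//'):
        # Protocol-relative links are kept as-is, as in the original code.
        return href
    return urljoin(BASE, href)

print(joiner('../about.html'))  # https://example.com/about.html
print(joiner('#'))              # False
print(joiner('mailto'))         # False (no dot)
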
@@ -154,9 +151,10 @@
if urparse.query != '':
self._QUERY_LINKS = self.rept(join, self._QUERY_LINKS)

+ # If the link is a media link(mp4,..) not a web page
broke = 0
for ext in self.media_exts:
- if (f'.{ext}/' in join) or ends(f'.{ext}'):
+ if (f'.{ext}/' in join) or join.endswith(f'.{ext}'):
self._MEDIA = self.rept(join, self._MEDIA)
broke = 1
break
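
For context, the loop above files a link under media when its path contains or ends with a known file extension; the only change here is calling join.endswith directly instead of the cached ends alias. A small illustration of the same check (the extension list is invented; the scraper uses self.media_exts):

MEDIA_EXTS = ('png', 'jpg', 'mp4', 'css', 'js')  # illustrative stand-in for self.media_exts

def is_media(link):
    # Same test as the diff: '.ext/' somewhere in the link, or the link ends with '.ext'.
    return any(f'.{ext}/' in link or link.endswith(f'.{ext}') for ext in MEDIA_EXTS)

print(is_media('https://example.com/logo.png'))      # True
print(is_media('https://example.com/v.mp4/stream'))  # True
print(is_media('https://example.com/about.html'))    # False
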
Expand All @@ -183,12 +181,12 @@ def get_source(self, url):
return []
self.passed.append(url)
# Send Request
- # try:
- req = self.framework.request(url)
- # except:
- # return False
- # else:
- resp = req.text
+ try:
+ req = self.framework.request(url)
+ except:
+ return False
+ else:
+ resp = req.text

pp = self.framework.page_parse(resp)

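The last hunk re-enables error handling that was previously commented out: a failed page request now makes get_source return False instead of raising. A rough equivalent using the requests library as a stand-in for self.framework.request (the original uses a bare except; the narrower exception type here is an assumption):

import requests

def get_source(url):
    # Mirror the re-enabled try/except from the diff: bail out on any request failure.
    try:
        resp = requests.get(url, timeout=10)
    except requests.RequestException:
        return False
    return resp.text  # handed to the page parser, as the scraper does with framework.page_parse

print(bool(get_source('https://example.com')))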
