diff --git a/maryam/core/util/urlib.py b/maryam/core/util/urlib.py
index 7954a36bc..cf035a4ba 100644
--- a/maryam/core/util/urlib.py
+++ b/maryam/core/util/urlib.py
@@ -37,7 +37,7 @@ def parse(self):
 	def unparse(self, urparse):
 		return urlparse.urlunparse(urparse)
 
-	def sub_service(self, serv=None):
+	def sub_service(self, serv=None, ifany=False):
 		'''Add protocol to url or replace it or clean it'''
 		urparse = re.split(r'://', self.url)
 		if not serv:
@@ -47,6 +47,8 @@ def sub_service(self, serv=None):
 			# Add protocol
 			serv = re.sub(r'://', '', serv)
 			if len(urparse) == 2:
+				if ifany:
+					return self.url
 				del urparse[0]
 			url = f"{serv}://{''.join(urparse)}"
 		else:
diff --git a/maryam/core/util/web_scrap.py b/maryam/core/util/web_scrap.py
index f96b59f81..6889877a3 100644
--- a/maryam/core/util/web_scrap.py
+++ b/maryam/core/util/web_scrap.py
@@ -18,7 +18,7 @@
 import re
 import concurrent.futures
 
-# Web Scraper v5.1
+# Web Scrap v5.1
 
 class main:
 
@@ -28,10 +28,11 @@ def __init__(self, url, debug=False, limit=1, thread_count=1):
 		url           : First page address
 		debug         : Show the result at moment
 		limit         : Web scrap level(if it's 1 that's mean just search in first page)
-		thread_count  : Count of links for open at per lap
+		thread_count  : Number of links for each lap
 		"""
 		self.framework = main.framework
-		self.url = url
+		self.parser = self.framework.urlib(url)
+		self.url = self.parser.sub_service(self.framework._global_options['protocol'], ifany=True)
 		self.urlib = self.framework.urlib
 		self.debug = debug
 		self.limit = limit
@@ -106,12 +107,10 @@ def add_email(self, link):
 		return True
 
 	def joiner(self, url):
-		url = str(url)
-		# ADD slash to end url
+		url = url
 		urparse = self.urlib(url)
 		urparse.url = urparse.quote if '%' not in url else url
-		urparse2 = self.urlib(str(self.url))
-		cond1 = url.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:')
+		cond1 = url.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:') or '.' not in url
 		cond12 = url.endswith(':')
 		cond2 = len(
 			urparse.url) > 1 and '%3a//' not in urparse.url and urparse.url[:2] != '//'
@@ -119,18 +118,17 @@
 		if cond1 or cond12:
			return False
 		elif cond2:
-			urparse.url = urparse2.join(url)
+			urparse.url = self.parser.join(url)
 		elif cond3:
 			urparse.url = url
 		else:
-			urparse.url = urparse2.join(url)
-		return str(urparse.url)
+			urparse.url = self.parser.join(url)
+		return urparse.url
 
 	def link_category(self, urls):
 		links = []
 		for url in urls:
 			join = self.joiner(url)
-
 			##########################
 			# ADD CDN, PHONE and EMAIL
 			##########################
 			if cond1:
 				continue
-			ends = join.endswith
 			join = str(join).replace('\/', '/')
 			##########################
 			# ADD OUT SCOPE
 			##########################
 			if urparse.query != '':
 				self._QUERY_LINKS = self.rept(join, self._QUERY_LINKS)
+			# If the link is a media link(mp4,..) not a web page
 			broke = 0
 			for ext in self.media_exts:
-				if (f'.{ext}/' in join) or ends(f'.{ext}'):
+				if (f'.{ext}/' in join) or join.endswith(f'.{ext}'):
 					self._MEDIA = self.rept(join, self._MEDIA)
 					broke = 1
 					break
@@ -183,12 +181,12 @@ def get_source(self, url):
 			return []
 		self.passed.append(url)
 		# Send Request
-		# try:
-		req = self.framework.request(url)
-		# except:
-		# 	return False
-		# else:
-		resp = req.text
+		try:
+			req = self.framework.request(url)
+		except:
+			return False
+		else:
+			resp = req.text
 		pp = self.framework.page_parse(resp)
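
A minimal standalone sketch of the new ifany behavior added to sub_service() in the first hunk, so it can be exercised without loading the framework. This is a simplified re-implementation for illustration only, assuming the regex-based splitting shown above; it is not the framework method itself (the real one lives on the urlib class and operates on self.url), and the "clean protocol" branch is reduced to a guess since that code is outside the hunk.

# Standalone sketch of the sub_service() change, for illustration only.
# Assumes the same re.split(r'://', ...) handling as maryam/core/util/urlib.py.
import re

def sub_service(url, serv=None, ifany=False):
	'''Add a protocol to url, replace it, or (with ifany=True) keep an existing one.'''
	urparse = re.split(r'://', url)
	if not serv:
		# Simplified stand-in for the original "clean protocol" branch (not shown in the hunk)
		return urparse[-1]
	serv = re.sub(r'://', '', serv)
	if len(urparse) == 2:
		# The url already carries a protocol
		if ifany:
			# Behavior added by this diff: leave the existing protocol untouched
			return url
		del urparse[0]
	return f"{serv}://{''.join(urparse)}"

# web_scrap.__init__ now calls sub_service(protocol, ifany=True), so a scheme
# already present in the user-supplied url wins over the global 'protocol' option:
print(sub_service('https://example.com', 'http', ifany=True))  # https://example.com
print(sub_service('https://example.com', 'http'))              # http://example.com
print(sub_service('example.com', 'http', ifany=True))          # http://example.com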