-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3ae208a
commit fa55b8f
Showing
6 changed files
with
94 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -163,4 +163,5 @@ cython_debug/ | |
|
||
quotes.json | ||
|
||
storage/ | ||
storage/ | ||
results.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
crawlee[beautifulsoup,playwright] | ||
crawlee[beautifulsoup,playwright] | ||
aiohttp | ||
protego |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
""" | ||
Class to respect robot.txt file | ||
""" | ||
|
||
import urllib.parse | ||
|
||
import aiohttp | ||
from protego import Protego | ||
|
||
|
||
class RobotTXT:
    """Check URLs against the robots.txt file of the site they belong to.

    Parsed robots.txt files are cached per site, keyed by the robots.txt URL,
    so a site is only downloaded again after its entry has been evicted.
    """

    # Maximum number of parsed robots.txt files kept in the cache.
    _MAX_CACHED_SITES = 100

    def __init__(self):
        # robots.txt URL -> parsed Protego object. A plain dict keeps
        # insertion order, which is what the eviction below relies on.
        self._robots = {}
        # A URL is crawlable only if every one of these agents is allowed.
        self._user_agent = ["*", "GPTBot", "WaveAICrawler"]

    async def __call__(self, url: str) -> bool:
        """Check if the url is allowed to be crawled.

        Args:
            url (str): url to be checked

        Returns:
            bool: True if every configured user agent may fetch the url,
                False otherwise
        """
        url_parse = urllib.parse.urlparse(url)
        robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"

        if robots_url not in self._robots:
            # First time this site is seen: download and parse its robots.txt.
            async with aiohttp.ClientSession() as session:
                async with session.get(robots_url) as response:
                    robots_content = await response.text()
            self._robots[robots_url] = Protego.parse(robots_content)

        parser = self._robots[robots_url]
        allowed = all(
            parser.can_fetch(url, agent) for agent in self._user_agent
        )

        if len(self._robots) > self._MAX_CACHED_SITES:
            # Drop the oldest cached site. dict.popitem() takes no ``last``
            # argument (that is OrderedDict-only), so remove the first key
            # in insertion order explicitly.
            del self._robots[next(iter(self._robots))]

        return allowed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
""" | ||
Utils function | ||
""" | ||
|
||
import urllib.parse | ||
|
||
|
||
def is_valid_url(url: str) -> bool:
    """Check if url is valid.

    A url is considered valid when it parses and has both a scheme and a
    network location (for example ``https://example.com``).

    Args:
        url (str): the url to check

    Returns:
        bool: True if the url is valid, False otherwise (including when the
            value cannot be parsed at all)
    """
    try:
        result = urllib.parse.urlparse(url)
    except (AttributeError, ValueError):
        # AttributeError: the value is not a str/bytes-like object.
        # ValueError: urlparse rejects malformed network locations, such as
        # an unbalanced IPv6 bracket ("http://[::1").
        return False
    return bool(result.scheme and result.netloc)