feat: added auth
tikazyq committed Apr 7, 2024
1 parent 8e26888 commit 47d456d
Showing 6 changed files with 90 additions and 15 deletions.
21 changes: 17 additions & 4 deletions crawlab_ai/scrapy/list_spider.py
@@ -5,6 +5,7 @@
from scrapy.http import Response

from crawlab_ai.scrapy.base_spider import BaseSpider
from crawlab_ai.utils.auth import get_auth_headers
from crawlab_ai.utils.logger import logger


@@ -55,16 +56,27 @@ def __init__(self):
super(ScrapyListSpider, self).__init__()
self._fetch_rules()

def start_requests(self) -> Iterable[Request]:
yield Request(self.start_urls[0], self.parse)

def _fetch_rules(self):
logger.info('Fetching rules for URL: ' + self.start_urls[0])
res = requests.post(self._api_endpoint + '/list_rules', json={
'url': self.start_urls[0],
'fields': self.fields,
})
res = requests.post(
url=self._api_endpoint + '/list_rules',
headers=get_auth_headers(),
json={
'url': self.start_urls[0],
'fields': self.fields,
},
)
data = res.json()
self._list_element_css_selector = data['model_list'][0]['list_model']['list_element_css_selector']
self._fields = data['model_list'][0]['list_model']['fields']
self._next_page_element_css_selector = data['model_list'][0]['next_page_element_css_selector']
logger.info('Rules fetched.')
logger.info('List element CSS selector: ' + self._list_element_css_selector)
logger.info('Fields: ' + str(self._fields))
logger.info('Next page element CSS selector: ' + str(self._next_page_element_css_selector))

def parse(self, response: Response, **kwargs: Any) -> Any:
list_items = response.css(self._list_element_css_selector)
@@ -100,3 +112,4 @@ class TestScrapyListSpider(ScrapyListSpider):

process = CrawlerProcess(get_project_settings())
process.crawl(TestScrapyListSpider)
process.start()
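Usage sketch (not part of this commit), mirroring the TestScrapyListSpider block above: a downstream spider only needs start_urls and fields, plus a CRAWLAB_TOKEN so that get_auth_headers() can attach the bearer token instead of prompting interactively. The URL and field names below are illustrative.

import os

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from crawlab_ai.scrapy.list_spider import ScrapyListSpider


class QuotesListSpider(ScrapyListSpider):
    name = 'quotes_list'
    start_urls = ['https://quotes.toscrape.com']
    fields = ['text', 'author']  # illustrative field names


if __name__ == '__main__':
    # Provide the token up front so get_auth_headers() does not prompt on stdin.
    os.environ.setdefault('CRAWLAB_TOKEN', '<your-api-token>')  # placeholder value

    process = CrawlerProcess(get_project_settings())
    process.crawl(QuotesListSpider)
    process.start()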
26 changes: 16 additions & 10 deletions crawlab_ai/spider/list_spider.py
@@ -5,6 +5,8 @@
from bs4 import BeautifulSoup
from pandas import DataFrame
from concurrent.futures import ThreadPoolExecutor, as_completed

from crawlab_ai.utils.auth import get_auth_headers
from crawlab_ai.utils.env import get_api_endpoint
from crawlab_ai.utils.logger import logger

@@ -35,10 +37,14 @@ def __init__(self, url: str, fields: List[dict] = None, get_html=None):

def _fetch_rules(self):
logger.info('Fetching rules for URL: ' + self.url)
res = requests.post(get_api_endpoint() + '/list_rules', json={
'url': self.url,
'fields': self.fields,
})
res = requests.post(
url=get_api_endpoint() + '/list_rules',
headers=get_auth_headers(),
json={
'url': self.url,
'fields': self.fields,
},
)
data = res.json()
self._list_element_css_selector = data['model_list'][0]['list_model']['list_element_css_selector']
self._fields = data['model_list'][0]['list_model']['fields']
@@ -122,9 +128,9 @@ def read_list(url: str, fields: List[str] | dict = None, get_html=None, as_dataf
if __name__ == '__main__':
df = read_list('https://quotes.toscrape.com')
print(df)
df = read_list('https://36kr.com/', [
'title',
'author',
'url',
])
print(df)
# df = read_list('https://36kr.com/', [
# 'title',
# 'author',
# 'url',
# ])
# print(df)
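Usage sketch (not part of this commit): because _fetch_rules() now sends an Authorization header, read_list works unchanged as long as a token is available, e.g. via the CRAWLAB_TOKEN environment variable; otherwise get_token() prompts on stdin.

import os

from crawlab_ai.spider.list_spider import read_list

os.environ.setdefault('CRAWLAB_TOKEN', '<your-api-token>')  # placeholder value

df = read_list('https://quotes.toscrape.com')
print(df)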
21 changes: 21 additions & 0 deletions crawlab_ai/utils/auth.py
@@ -0,0 +1,21 @@
import os


def get_token() -> str:
# Get token from environment variable
if 'CRAWLAB_TOKEN' in os.environ:
return os.environ['CRAWLAB_TOKEN']

# Prompt user to enter token
token = input('Please enter your API token for Crawlab AI: ')
return token


def get_auth_headers() -> dict:
return {
'Authorization': f'Bearer {get_token()}'
}


if __name__ == '__main__':
print(get_token())
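A quick sketch (not part of this commit) of how these helpers behave: when CRAWLAB_TOKEN is set, no prompt appears and get_auth_headers() returns a ready-to-use headers dict.

import os

from crawlab_ai.utils.auth import get_auth_headers

os.environ['CRAWLAB_TOKEN'] = 'my-secret-token'  # placeholder value
print(get_auth_headers())
# -> {'Authorization': 'Bearer my-secret-token'}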
2 changes: 1 addition & 1 deletion crawlab_ai/utils/logger.py
@@ -9,7 +9,7 @@
console_handler.setLevel(logging.INFO)

# Define the output format for the handler
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
formatter = logging.Formatter('%(asctime)s [%(name)s] %(levelname)s: %(message)s')
console_handler.setFormatter(formatter)

# Add the handler to the logger
Empty file added test/utils/__init__.py
35 changes: 35 additions & 0 deletions test/utils/test_auth.py
@@ -0,0 +1,35 @@
import os
from unittest.mock import patch, mock_open
from crawlab_ai.utils import auth


def test_set_token_creates_directory_and_file():
    with patch('os.path.exists', return_value=False), \
            patch('os.makedirs'), \
            patch('builtins.open', new_callable=mock_open):
        auth.set_token('test_token')
        os.makedirs.assert_called_once_with(auth.ROOT_DIRECTORY)
        open.assert_called_once_with(f'{auth.ROOT_DIRECTORY}/token', 'w')


def test_set_token_writes_token_to_file():
with patch('os.path.exists', return_value=True), patch('builtins.open', new_callable=mock_open) as new_mock_open:
auth.set_token('test_token')
new_mock_open().write.assert_called_once_with('test_token')


def test_get_token_returns_env_var_token():
with patch.dict(os.environ, {'CRAWLAB_TOKEN': 'env_token'}):
assert auth.get_token() == 'env_token'


def test_get_token_returns_file_token():
    with patch('os.path.exists', return_value=True), \
            patch('builtins.open', new_callable=mock_open, read_data='file_token'):
        assert auth.get_token() == 'file_token'


def test_get_token_prompts_for_token():
    with patch('os.path.exists', return_value=False), \
            patch('builtins.input', return_value='input_token'), \
            patch('crawlab_ai.utils.auth.set_token') as mock_set_token:
        assert auth.get_token() == 'input_token'
        mock_set_token.assert_called_once_with('input_token')
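The tests above reference auth.set_token and auth.ROOT_DIRECTORY and expect get_token to fall back to a persisted token file, none of which appear in the auth.py added in this commit. A minimal sketch consistent with these tests, assuming ROOT_DIRECTORY points at a per-user config directory:

import os

ROOT_DIRECTORY = os.path.expanduser('~/.crawlab-ai')  # assumed location


def set_token(token: str):
    # Create the config directory on first use, then persist the token.
    if not os.path.exists(ROOT_DIRECTORY):
        os.makedirs(ROOT_DIRECTORY)
    with open(f'{ROOT_DIRECTORY}/token', 'w') as f:
        f.write(token)


def get_token() -> str:
    # Environment variable takes precedence.
    if 'CRAWLAB_TOKEN' in os.environ:
        return os.environ['CRAWLAB_TOKEN']

    # Fall back to the persisted token file if present.
    token_file = f'{ROOT_DIRECTORY}/token'
    if os.path.exists(token_file):
        with open(token_file) as f:
            return f.read()

    # Otherwise prompt once and persist for next time.
    token = input('Please enter your API token for Crawlab AI: ')
    set_token(token)
    return token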
