From 15b68e2116bdae6211432ef99c3e9e446f7ed351 Mon Sep 17 00:00:00 2001 From: kaesinol Date: Mon, 20 Nov 2023 16:31:35 +0800 Subject: [PATCH 1/3] :bug::fire: bugfix & remove useless codes --- config.yaml | 3 +-- test.py | 5 ++--- twitter_user_tweet_crawler/__main__.py | 2 +- twitter_user_tweet_crawler/browser.py | 5 ----- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/config.yaml b/config.yaml index 668223c..7c31917 100644 --- a/config.yaml +++ b/config.yaml @@ -1,7 +1,6 @@ { "proxy": {"http": "socks5://127.0.0.1:7890", "https": "socks5://127.0.0.1:7890" }, "max_threads": 1, - "header": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"} -,"user_data_dir": "/media/Data/Project/twitter_user_tweet_crawler/twitter_user_tweet_crawler/userdata", + "header": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"}, "inject_js": "/media/Data/Project/twitter_user_tweet_crawler/script.js", "save": "/media/Data/Project/twitter_user_tweet_crawler/output", "user": "s_nample" diff --git a/test.py b/test.py index 40cb390..cb8d49f 100644 --- a/test.py +++ b/test.py @@ -10,13 +10,12 @@ def get_tweet(): set_work_directory(Path(__file__).absolute().parent) config.load({"proxy": {"http": "socks5://127.0.0.1:7890", "https": "socks5://127.0.0.1:7890"}, "max_threads": 2, "header": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, " - "like Gecko) "}, - "user_data_dir": "/media/Data/Project/twitter_user_tweet_crawler/twitter_user_tweet_crawler/userdata" + "like Gecko) "} , "inject_js": "/media/Data/Project/twitter_user_tweet_crawler/script.js", "save": "/media/Data/Project/twitter_user_tweet_crawler/output/", } ) from twitter_user_tweet_crawler.tweet import Tweet - Path(config.save / 'res').mkdir(parents=True, exist_ok=True) + (Path(config.save) / 'res').mkdir(parents=True, exist_ok=True) browser = get_browser() browser.get('https://twitter.com/404') cookie: list[dict] diff --git a/twitter_user_tweet_crawler/__main__.py b/twitter_user_tweet_crawler/__main__.py index ba47e6d..f21ffe5 100644 --- a/twitter_user_tweet_crawler/__main__.py +++ b/twitter_user_tweet_crawler/__main__.py @@ -39,7 +39,7 @@ def get_items_need_handle(): return driver.find_elements(*selector) selector = (By.XPATH, '//*/div[2]/div/div[3]/a[@role="link"]') - Path(config.save / 'res').mkdir(exist_ok=True, parents=True) + (Path(config.save) / 'res').mkdir(exist_ok=True, parents=True) driver = get_browser() diff --git a/twitter_user_tweet_crawler/browser.py b/twitter_user_tweet_crawler/browser.py index b79836c..f2523f9 100644 --- a/twitter_user_tweet_crawler/browser.py +++ b/twitter_user_tweet_crawler/browser.py @@ -14,11 +14,6 @@ def get_browser(headless: bool = False, id=None) -> WebDriver: chrome_options.add_argument('--disable-gpu') chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--window-size=1200x600"') - - if not id: - chrome_options.add_argument(f'user-data-dir={config["user_data_dir"] + "/" + str(browsers := browsers + 1)}') - else: - chrome_options.add_argument(f'user-data-dir={config["user_data_dir"] + "/" + str(id)}') if headless: chrome_options.add_argument('--headless') driver = webdriver.Chrome(options=chrome_options) From 7e00e866823dbe0e045af69d0712007bb06087c0 Mon Sep 17 00:00:00 2001 From: kaesinol Date: Mon, 20 Nov 2023 16:35:35 +0800 Subject: [PATCH 2/3] :construction_worker: Update chromedriver acquisition method --- .github/workflows/python-app.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/python-app.yaml b/.github/workflows/python-app.yaml index c0e49e7..7bb5759 100644 --- a/.github/workflows/python-app.yaml +++ b/.github/workflows/python-app.yaml @@ -2,9 +2,6 @@ name: Python application on: push: - branches: - - dev - - main paths: - '**.py' - '**.yml' @@ -20,7 +17,7 @@ jobs: - name: Install Chrome run: | wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb - wget https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/119.0.6045.105/linux64/chromedriver-linux64.zip + wget -O chrome-linux64.zip `curl -s https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json | jq -r '.channels.Stable.downloads.chromedriver[0].url'` sudo rm -rf /usr/bin/chromedriver sudo mkdir /usr/bin/chromedriver sudo unzip chromedriver-linux64.zip -d /usr/bin/chromedriver From d99212ff620e91e39488f7ac8eb846aaaf04272c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=80=E6=99=AE=E4=BB=A5=E5=B0=94?= Date: Mon, 20 Nov 2023 17:08:32 +0800 Subject: [PATCH 3/3] :construction_worker: remake ci test (#1) --- .github/workflows/python-app.yaml | 2 +- tests/CI.py | 25 +++++++++++-------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/.github/workflows/python-app.yaml b/.github/workflows/python-app.yaml index 7bb5759..78d3d27 100644 --- a/.github/workflows/python-app.yaml +++ b/.github/workflows/python-app.yaml @@ -17,7 +17,7 @@ jobs: - name: Install Chrome run: | wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb - wget -O chrome-linux64.zip `curl -s https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json | jq -r '.channels.Stable.downloads.chromedriver[0].url'` + wget -O chromedriver-linux64.zip `curl -s https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json | jq -r '.channels.Stable.downloads.chromedriver[0].url'` sudo rm -rf /usr/bin/chromedriver sudo mkdir /usr/bin/chromedriver sudo unzip chromedriver-linux64.zip -d /usr/bin/chromedriver diff --git a/tests/CI.py b/tests/CI.py index 6a3c864..3cbd3f5 100644 --- a/tests/CI.py +++ b/tests/CI.py @@ -5,28 +5,25 @@ from selenium.webdriver.chrome.webdriver import WebDriver from selenium.webdriver.common.by import By -from twitter_user_tweet_crawler.tweet import Tweet +from twitter_user_tweet_crawler.browser import get_browser +from twitter_user_tweet_crawler.util.config import config - -def get_browser() -> WebDriver: - chrome_options = webdriver.ChromeOptions() - chrome_options.add_argument('--blink-settings=imagesEnabled=false') - chrome_options.add_argument('--disable-remote-fonts') - chrome_options.add_argument('--disable-gpu') - chrome_options.add_argument('--no-sandbox') - chrome_options.add_argument('--headless') - driver = webdriver.Chrome(options=chrome_options) - return driver +config.load({"proxy": None, "max_threads": 2, + "header": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, " + "like Gecko) "} + , "inject_js": "script.js", + } + ) class TestCase(unittest.TestCase): def test_spider(self): - browser = get_browser() + browser = get_browser(headless=True) browser.get('https://twitter.com/_CASTSTATION/status/1697029186777706544') sleep(20) - element = browser.find_element(By.XPATH,"//*[contains(text(), '{}')]".format('miku miku oo ee oo')) + element = browser.find_element(By.XPATH, "//*[contains(text(), '{}')]".format('miku miku oo ee oo')) browser.save_screenshot('debug.png') - self.assertIn('miku miku oo ee oo',element.get_attribute('innerHTML')) + self.assertIn('miku miku oo ee oo', element.get_attribute('innerHTML')) if __name__ == '__main__':