diff --git a/twitter_user_tweet_crawler/__main__.py b/twitter_user_tweet_crawler/__main__.py
index acf3839..153b951 100644
--- a/twitter_user_tweet_crawler/__main__.py
+++ b/twitter_user_tweet_crawler/__main__.py
@@ -12,7 +12,6 @@
 from .browser import get_browser, get_multiple_browsers
 from .pool import ThreadPool
 from .util.config import config, work_directory, set_work_directory
-from .util.sql import is_id_exists
 
 
 def main():
@@ -79,7 +78,7 @@ def get_items_need_handle():
     for i in links:
         full_url = i.get_attribute("href")
         tweet_id = urlparse(full_url).path.split('/')[-1]
-        if tweet_id not in data_dict and not is_id_exists(int(tweet_id)):
+        if tweet_id not in data_dict:
             data_dict[tweet_id] = Tweet(full_url)
             pool.jobs.append(data_dict[tweet_id].load_data)
             logger.info(full_url)
@@ -91,7 +90,7 @@
 if __name__ == "__main__":
     set_work_directory(Path(__file__).absolute().parent)
     logger.add(work_directory / "log/{time:YYYY-MM-DD}.log", rotation="00:00",
-               level="INFO",
+               level="ERROR",
                encoding="utf-8", format="{time} | {level} | {message}", enqueue=True)
     Path(Path(__file__).absolute().parent / 'output/res').mkdir(parents=True, exist_ok=True)
     config.load("config.yaml")
diff --git a/twitter_user_tweet_crawler/pool.py b/twitter_user_tweet_crawler/pool.py
index f8cbb01..2f1dc0f 100644
--- a/twitter_user_tweet_crawler/pool.py
+++ b/twitter_user_tweet_crawler/pool.py
@@ -32,9 +32,10 @@ def _on_job_complete(self, index, future):
         elements = self.browser.index(index)
         try:
             future.result()
-        except NoSuchElementException:
-            pass
         # By default, `concurrent.futures` will silently log errors but will not raise them
         # Throw the error directly
-        self.browser[elements].__dict__['is_using'] = False
-        self.check_and_work()
+        except NoSuchElementException:
+            pass
+        finally:
+            self.browser[elements].__dict__['is_using'] = False
+            self.check_and_work()
diff --git a/twitter_user_tweet_crawler/tweet.py b/twitter_user_tweet_crawler/tweet.py
index ce6cebf..89d4650 100644
--- a/twitter_user_tweet_crawler/tweet.py
+++ b/twitter_user_tweet_crawler/tweet.py
@@ -12,7 +12,7 @@
 from urllib.parse import quote
 
 from .__main__ import *
 from .util.config import work_directory
-from .util.sql import insert_new_record
+from .util.sql import insert_new_record, is_id_exists
 
 class Tweet:
@@ -26,7 +26,7 @@ class Tweet:
     link: str
 
     def __init__(self, link: str):
-        self.post_time = int(datetime.now().timestamp())
+        self.post_time = 0
         self.post_id = int(urlparse(link).path.split('/')[-1])
         self.link = link
         self.text = ''
@@ -43,8 +43,10 @@ def write_markdown(self):
 
 
     @logger.catch
     def commit_sqlite(self):
-        insert_new_record(self.post_id, self.post_time, self.location)
+        if not is_id_exists(int(self.post_id)):
+            insert_new_record(self.post_id, self.post_time, self.location)
 
+    @logger.catch
     def load_data(self, available_driver: WebDriver):
         def replace_emoji(string: str) -> str: