Merge pull request #536 from songzy12/ttarticle

Fix the crawling of Toutiao article URLs.
dataabc · Aug 27, 2023 · 4b9d66a
2 parents fb60373 + f6fcb7d
commit 4b9d66a
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py
@@ -75,7 +75,7 @@ def get_one_page(self, weibo_id_list):
                         publish_time = datetime_util.str_to_time(
                             weibo.publish_time)
 
-                        if publish_time < since_date:                            
+                        if publish_time < since_date:
                             # As of 2023.05, there can be at most 2 pinned weibo.
                             # We will continue for at most 2 times before return.
                             if self.page == 1 and cur_pinned_count < MAX_PINNED_COUNT:
@@ -158,9 +158,9 @@ def get_article_url(self, info):
         """获取微博头条文章的url"""
         article_url = ''
         text = handle_garbled(info)
-        if text.startswith(u'发布了头条文章'):
+        if text.startswith(u'发布了头条文章') or text.startswith(u'我发表了头条文章'):
             url = info.xpath('.//a/@href')
-            if url and url[0].startswith('https://weibo.cn/sinaurl'):
+            if url and url[0].startswith('https://weibo.com/ttarticle'):
                 article_url = url[0]
         return article_url