Skip to content

Commit

Permalink
Merge pull request #536 from songzy12/ttarticle
Browse files Browse the repository at this point in the history
Fix the crawling of Toutiao article URLs.
  • Loading branch information
dataabc authored Aug 27, 2023
2 parents fb60373 + f6fcb7d commit 4b9d66a
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions weibo_spider/parser/page_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def get_one_page(self, weibo_id_list):
publish_time = datetime_util.str_to_time(
weibo.publish_time)

- if publish_time < since_date:
+ if publish_time < since_date:
# As of 2023.05, there can be at most 2 pinned weibo.
# We will continue for at most 2 times before return.
if self.page == 1 and cur_pinned_count < MAX_PINNED_COUNT:
Expand Down Expand Up @@ -158,9 +158,9 @@ def get_article_url(self, info):
"""获取微博头条文章的url"""
article_url = ''
text = handle_garbled(info)
- if text.startswith(u'发布了头条文章'):
+ if text.startswith(u'发布了头条文章') or text.startswith(u'我发表了头条文章'):
url = info.xpath('.//a/@href')
- if url and url[0].startswith('https://weibo.cn/sinaurl'):
+ if url and url[0].startswith('https://weibo.com/ttarticle'):
article_url = url[0]
return article_url

Expand Down

0 comments on commit 4b9d66a

Please sign in to comment.