-
Notifications
You must be signed in to change notification settings - Fork 149
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
New pixivCrawler notebooks for image downloads
- Loading branch information
Showing
2 changed files
with
434 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,318 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Setup" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"PIXIV_USERNAME = \"userbay\"\n", | ||
"PIXIV_PASSWORD = \"userpay\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from pixivpy3 import *\n", | ||
"\n", | ||
"\n", | ||
"api = AppPixivAPI()\n", | ||
"# api = ByPassSniApi() # bypass the GFW\n", | ||
"# api.require_appapi_hosts()\n", | ||
"api.set_accept_language('zh-cn') # tags翻译成中文\n", | ||
"\n", | ||
"token = api.login(PIXIV_USERNAME, PIXIV_PASSWORD)\n", | ||
"user_id = token.response.user.id\n", | ||
"print(token.response.user)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# PixivCrawler (with pixivpy)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import json\n", | ||
"import time\n", | ||
"import random\n", | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"import sqlite3 as lite\n", | ||
"from sqlalchemy import create_engine\n", | ||
"from tqdm import tqdm_notebook as tqdm\n", | ||
"\n", | ||
"\n", | ||
"class PixivCrawler(object):\n", | ||
"\n", | ||
" def __init__(self, api, illust_db='pixiv_illusts.db'):\n", | ||
" self.api = api\n", | ||
" self.illust_db = illust_db\n", | ||
" self.user_info = None\n", | ||
"\n", | ||
" def randSleep(self, base=0.1, rand=0.5):\n", | ||
" \"休眠随机的时间\"\n", | ||
" time.sleep(base + rand*random.random())\n", | ||
"\n", | ||
" def GetUserDetail(self, user_id):\n", | ||
" \"查询指定用户的基本信息\"\n", | ||
" self.last_user = self.api.user_detail(user_id)\n", | ||
" return self.last_user\n", | ||
"\n", | ||
" def GetUserBookmarks(self, user_id, restrict='public'):\n", | ||
" \"获取指定用户的收藏列表\"\n", | ||
" df_list = []\n", | ||
" next_qs = {'user_id': user_id, 'restrict': restrict}\n", | ||
"\n", | ||
" user = self.GetUserDetail(user_id)\n", | ||
" self.randSleep(0.1)\n", | ||
"\n", | ||
" with tqdm(total=user.profile.total_illust_bookmarks_public,\n", | ||
" desc=\"api.user_bookmarks_illust\") as pbar:\n", | ||
" while next_qs != None:\n", | ||
" json_result = self.api.user_bookmarks_illust(**next_qs)\n", | ||
" tmp_df = pd.DataFrame.from_dict(json_result.illusts)\n", | ||
" df_list.append(tmp_df)\n", | ||
" pbar.update(tmp_df.shape[0])\n", | ||
" next_qs = self.api.parse_qs(json_result.next_url)\n", | ||
" self.randSleep(0.1)\n", | ||
"\n", | ||
" df = pd.concat(df_list).rename(columns={'id': 'illust_id'})\n", | ||
" df['user_id'] = df.user.apply(lambda d: d['id'])\n", | ||
" return df.set_index('illust_id')\n", | ||
"\n", | ||
" def GetUserIllusts(self, user_id, type='illust'):\n", | ||
" \"获取指定用户的作品列表(illusts/manga)\"\n", | ||
" df_list = []\n", | ||
" next_qs = {'user_id': user_id, 'type': type, 'filter': 'for_ios'}\n", | ||
"\n", | ||
" user = self.GetUserDetail(user_id)\n", | ||
" if type == 'illust':\n", | ||
" total = user.profile.total_illusts\n", | ||
" elif type == 'manga':\n", | ||
" total = user.profile.total_manga\n", | ||
" else:\n", | ||
" raise Exception(\"Unsupported type=%d\" % type)\n", | ||
" self.randSleep(0.1)\n", | ||
"\n", | ||
" with tqdm(total=total, desc=\"api.user_illusts\") as pbar:\n", | ||
" while next_qs != None:\n", | ||
" json_result = self.api.user_illusts(**next_qs)\n", | ||
" tmp_df = pd.DataFrame.from_dict(json_result.illusts)\n", | ||
" df_list.append(tmp_df)\n", | ||
" pbar.update(tmp_df.shape[0])\n", | ||
" next_qs = self.api.parse_qs(json_result.next_url)\n", | ||
" self.randSleep(0.1)\n", | ||
"\n", | ||
" df = pd.concat(df_list).rename(columns={'id': 'illust_id'})\n", | ||
" df['user_id'] = df.user.apply(lambda d: d['id'])\n", | ||
" return df.set_index('illust_id')\n", | ||
"\n", | ||
" def GetIllustRanking(self, mode, date, total=100):\n", | ||
" \"获取作品排行榜\"\n", | ||
" df_list = []\n", | ||
" next_qs = {'mode': mode, 'date': date, 'filter': 'for_ios'}\n", | ||
"\n", | ||
" with tqdm(total=total, desc=\"api.illust_ranking\") as pbar:\n", | ||
" while next_qs != None:\n", | ||
" json_result = self.api.illust_ranking(**next_qs)\n", | ||
" tmp_df = pd.DataFrame.from_dict(json_result.illusts)\n", | ||
" df_list.append(tmp_df)\n", | ||
" pbar.update(tmp_df.shape[0])\n", | ||
" next_qs = self.api.parse_qs(json_result.next_url)\n", | ||
" self.randSleep(0.3)\n", | ||
"\n", | ||
" df = pd.concat(df_list).rename(columns={'id': 'illust_id'})\n", | ||
" df['user_id'] = df.user.apply(lambda d: d['id'])\n", | ||
" return df.set_index('illust_id')\n", | ||
"\n", | ||
" def GetFollowingUsers(self, user_id, restrict='public'):\n", | ||
" \"获取指定用户跟踪的用户列表,返回user_ids\"\n", | ||
" user_ids = []\n", | ||
" next_qs = {'user_id': user_id, 'restrict': restrict}\n", | ||
"\n", | ||
" user = self.GetUserDetail(user_id)\n", | ||
" with tqdm(total=user.profile.total_follow_users,\n", | ||
" desc=\"api.user_following\") as pbar:\n", | ||
" while next_qs != None:\n", | ||
" json_result = self.api.user_following(**next_qs)\n", | ||
" for one_user in json_result.user_previews:\n", | ||
" user_ids.append(one_user.user.id)\n", | ||
" pbar.update(len(json_result.user_previews))\n", | ||
" next_qs = self.api.parse_qs(json_result.next_url)\n", | ||
" self.randSleep(0.3, 0.8)\n", | ||
" return np.array(user_ids)\n", | ||
"\n", | ||
" def UpdateIllusts(self, df_illusts):\n", | ||
" sql_df = df_illusts.copy()\n", | ||
"\n", | ||
" # 数组类字段转json\n", | ||
" sql_df['image_urls'] = sql_df.image_urls.apply(json.dumps)\n", | ||
" sql_df['meta_pages'] = sql_df.meta_pages.apply(json.dumps)\n", | ||
" sql_df['meta_single_page'] = sql_df.meta_single_page.apply(json.dumps)\n", | ||
" sql_df['series'] = sql_df.series.apply(json.dumps)\n", | ||
" sql_df['tags'] = sql_df.tags.apply(json.dumps)\n", | ||
" sql_df['tools'] = sql_df.tools.apply(json.dumps)\n", | ||
" sql_df['user'] = sql_df.user.apply(json.dumps)\n", | ||
"\n", | ||
" # 先读取文件里的illusts存储,并用新的数据代替key相同的内容\n", | ||
" if os.path.isfile(self.illust_db):\n", | ||
" # 读取文件的数据并丢弃同样的illust_id (保留新的illust_id)\n", | ||
" db_df = self.DBIllusts(ensure_json=False)\n", | ||
" db_df = db_df[~db_df.index.isin(sql_df.index)]\n", | ||
" merged_df = pd.concat([sql_df, db_df], sort=False)\n", | ||
" else:\n", | ||
" merged_df = sql_df\n", | ||
"\n", | ||
" # 合并后df写入文件(replace方式)\n", | ||
" engine = create_engine('sqlite:///' + self.illust_db, echo=False)\n", | ||
" merged_df.to_sql('illusts', con=engine, if_exists='replace')\n", | ||
" return merged_df\n", | ||
"\n", | ||
" def DBIllusts(self, sql=\"SELECT * FROM illusts WHERE illust_id > 0\", ensure_json=True):\n", | ||
" with lite.connect(self.illust_db) as conn:\n", | ||
" sql_df = pd.read_sql_query(sql, conn, index_col='illust_id')\n", | ||
"\n", | ||
" # 还原json字段\n", | ||
" if ensure_json:\n", | ||
" sql_df['image_urls'] = sql_df.image_urls.apply(json.loads)\n", | ||
" sql_df['meta_pages'] = sql_df.meta_pages.apply(json.loads)\n", | ||
" sql_df['meta_single_page'] = sql_df.meta_single_page.apply(\n", | ||
" json.loads)\n", | ||
" sql_df['series'] = sql_df.series.apply(json.loads)\n", | ||
" sql_df['tags'] = sql_df.tags.apply(json.loads)\n", | ||
" sql_df['tools'] = sql_df.tools.apply(json.loads)\n", | ||
" sql_df['user'] = sql_df.user.apply(json.loads)\n", | ||
" return sql_df\n", | ||
"\n", | ||
"\n", | ||
"crawl = PixivCrawler(api)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## GetUserBookmarks(public)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df_bookmarks = crawl.GetUserBookmarks(user_id)\n", | ||
"_ = crawl.UpdateIllusts(df_bookmarks)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## GetFollowingUsers(public)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"user_ids = crawl.GetFollowingUsers(user_id)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"random.shuffle(user_ids)\n", | ||
"for uid in tqdm(user_ids, desc=\"GetFollowingUsers\"):\n", | ||
" df = crawl.GetUserIllusts(uid)\n", | ||
" _ = crawl.UpdateIllusts(df)\n", | ||
" crawl.randSleep(1.1, 5.0)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## GetIllustRanking" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# mode: [day, week, month, day_male, day_female, week_original, week_rookie, day_manga]\n", | ||
"# date: '2016-08-01'\n", | ||
"# mode (Past): [day, week, month, day_male, day_female, week_original, week_rookie,\n", | ||
"# day_r18, day_male_r18, day_female_r18, week_r18, week_r18g]\n", | ||
"df_ranking = crawl.GetIllustRanking('week', '2019-11-01')\n", | ||
"_ = crawl.UpdateIllusts(df_ranking)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.3" | ||
}, | ||
"toc": { | ||
"base_numbering": 1, | ||
"nav_menu": {}, | ||
"number_sections": true, | ||
"sideBar": true, | ||
"skip_h1_title": false, | ||
"title_cell": "Table of Contents", | ||
"title_sidebar": "Contents", | ||
"toc_cell": false, | ||
"toc_position": {}, | ||
"toc_section_display": true, | ||
"toc_window_display": true | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.