Skip to content

Commit

Permalink
New pixivCrawler notebooks for image downloads
Browse files Browse the repository at this point in the history
  • Loading branch information
upbit committed Nov 10, 2019
1 parent 16eacc1 commit 0a02866
Show file tree
Hide file tree
Showing 2 changed files with 434 additions and 0 deletions.
318 changes: 318 additions & 0 deletions notebooks/pixivCrawler.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"# Credentials are read from the environment, falling back to an interactive\n",
"# prompt, so that they are never saved or committed with the notebook.\n",
"PIXIV_USERNAME = os.environ.get(\"PIXIV_USERNAME\") or input(\"Pixiv username: \")\n",
"PIXIV_PASSWORD = os.environ.get(\"PIXIV_PASSWORD\") or getpass(\"Pixiv password: \")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ByPassSniApi is imported only for the commented-out alternative below.\n",
"from pixivpy3 import AppPixivAPI, ByPassSniApi\n",
"\n",
"\n",
"api = AppPixivAPI()\n",
"# api = ByPassSniApi()  # bypass the GFW\n",
"# api.require_appapi_hosts()\n",
"api.set_accept_language('zh-cn')  # ask the API for Chinese tag translations\n",
"\n",
"token = api.login(PIXIV_USERNAME, PIXIV_PASSWORD)\n",
"user_id = token.response.user.id\n",
"# Print only the id: the full user record may carry private account details\n",
"# that would otherwise be stored in the saved notebook output.\n",
"print(\"Logged in, user_id =\", user_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PixivCrawler (with pixivpy)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import time\n",
"import random\n",
"import numpy as np\n",
"import pandas as pd\n",
"import sqlite3 as lite\n",
"from sqlalchemy import create_engine\n",
"from tqdm import tqdm_notebook as tqdm\n",
"\n",
"\n",
"class PixivCrawler(object):\n",
"    \"\"\"Crawl illust metadata through a pixivpy client and cache it in SQLite.\n",
"\n",
"    Args:\n",
"        api: pixivpy API client that has already logged in.\n",
"        illust_db: path of the SQLite file holding the ``illusts`` table.\n",
"    \"\"\"\n",
"\n",
"    # Columns whose cells are nested dicts/lists; stored in SQLite as JSON text.\n",
"    JSON_COLUMNS = ('image_urls', 'meta_pages', 'meta_single_page',\n",
"                    'series', 'tags', 'tools', 'user')\n",
"\n",
"    def __init__(self, api, illust_db='pixiv_illusts.db'):\n",
"        self.api = api\n",
"        self.illust_db = illust_db\n",
"        self.user_info = None\n",
"        # Written by GetUserDetail; defined up front so it can always be read.\n",
"        self.last_user = None\n",
"\n",
"    def randSleep(self, base=0.1, rand=0.5):\n",
"        \"\"\"Sleep for ``base`` plus a random extra of up to ``rand`` seconds.\"\"\"\n",
"        time.sleep(base + rand*random.random())\n",
"\n",
"    def GetUserDetail(self, user_id):\n",
"        \"\"\"Fetch the profile of ``user_id`` and keep it in ``self.last_user``.\"\"\"\n",
"        self.last_user = self.api.user_detail(user_id)\n",
"        return self.last_user\n",
"\n",
"    def _fetch_illusts(self, api_call, next_qs, total, desc, pause):\n",
"        \"\"\"Walk every page of an illust listing and return one DataFrame.\n",
"\n",
"        Args:\n",
"            api_call: bound API method invoked as ``api_call(**next_qs)``.\n",
"            next_qs: keyword arguments for the first request.\n",
"            total: expected item count, used only to size the progress bar.\n",
"            desc: progress bar label.\n",
"            pause: ``base`` delay handed to ``randSleep`` after each page.\n",
"\n",
"        Returns:\n",
"            DataFrame indexed by ``illust_id`` with an extra ``user_id``\n",
"            column; an empty frame with that index when nothing came back.\n",
"        \"\"\"\n",
"        pages = []\n",
"        with tqdm(total=total, desc=desc) as pbar:\n",
"            while next_qs is not None:\n",
"                json_result = api_call(**next_qs)\n",
"                page_df = pd.DataFrame.from_dict(json_result.illusts)\n",
"                pages.append(page_df)\n",
"                pbar.update(page_df.shape[0])\n",
"                next_qs = self.api.parse_qs(json_result.next_url)\n",
"                self.randSleep(pause)\n",
"\n",
"        df = pd.concat(pages).rename(columns={'id': 'illust_id'})\n",
"        if df.empty:\n",
"            # Without rows there is no 'user' / 'illust_id' column to work on.\n",
"            return pd.DataFrame(index=pd.Index([], name='illust_id'))\n",
"        df['user_id'] = df.user.apply(lambda d: d['id'])\n",
"        return df.set_index('illust_id')\n",
"\n",
"    def GetUserBookmarks(self, user_id, restrict='public'):\n",
"        \"\"\"Return the bookmarked illusts of ``user_id`` as a DataFrame.\"\"\"\n",
"        user = self.GetUserDetail(user_id)\n",
"        self.randSleep(0.1)\n",
"\n",
"        # The profile only exposes the public bookmark count, so the progress\n",
"        # bar runs without a total for any other ``restrict`` value.\n",
"        if restrict == 'public':\n",
"            total = user.profile.total_illust_bookmarks_public\n",
"        else:\n",
"            total = None\n",
"        return self._fetch_illusts(\n",
"            self.api.user_bookmarks_illust,\n",
"            {'user_id': user_id, 'restrict': restrict},\n",
"            total=total, desc=\"api.user_bookmarks_illust\", pause=0.1)\n",
"\n",
"    def GetUserIllusts(self, user_id, type='illust'):\n",
"        \"\"\"Return the works of ``user_id`` as a DataFrame.\n",
"\n",
"        Args:\n",
"            user_id: id of the user whose works are listed.\n",
"            type: ``'illust'`` or ``'manga'``.\n",
"\n",
"        Raises:\n",
"            ValueError: if ``type`` is neither ``'illust'`` nor ``'manga'``.\n",
"        \"\"\"\n",
"        # Validate before spending an API call on the profile lookup.\n",
"        if type not in ('illust', 'manga'):\n",
"            raise ValueError(\"Unsupported type=%s\" % type)\n",
"\n",
"        user = self.GetUserDetail(user_id)\n",
"        if type == 'illust':\n",
"            total = user.profile.total_illusts\n",
"        else:\n",
"            total = user.profile.total_manga\n",
"        self.randSleep(0.1)\n",
"\n",
"        return self._fetch_illusts(\n",
"            self.api.user_illusts,\n",
"            {'user_id': user_id, 'type': type, 'filter': 'for_ios'},\n",
"            total=total, desc=\"api.user_illusts\", pause=0.1)\n",
"\n",
"    def GetIllustRanking(self, mode, date, total=100):\n",
"        \"\"\"Return the illust ranking for ``mode`` and ``date`` as a DataFrame.\n",
"\n",
"        ``total`` only sizes the progress bar; every page the API offers is\n",
"        fetched regardless of its value.\n",
"        \"\"\"\n",
"        return self._fetch_illusts(\n",
"            self.api.illust_ranking,\n",
"            {'mode': mode, 'date': date, 'filter': 'for_ios'},\n",
"            total=total, desc=\"api.illust_ranking\", pause=0.3)\n",
"\n",
"    def GetFollowingUsers(self, user_id, restrict='public'):\n",
"        \"\"\"Return the ids of the users followed by ``user_id`` as a numpy array.\"\"\"\n",
"        user_ids = []\n",
"        next_qs = {'user_id': user_id, 'restrict': restrict}\n",
"\n",
"        user = self.GetUserDetail(user_id)\n",
"        with tqdm(total=user.profile.total_follow_users,\n",
"                  desc=\"api.user_following\") as pbar:\n",
"            while next_qs is not None:\n",
"                json_result = self.api.user_following(**next_qs)\n",
"                previews = json_result.user_previews\n",
"                user_ids.extend(preview.user.id for preview in previews)\n",
"                pbar.update(len(previews))\n",
"                next_qs = self.api.parse_qs(json_result.next_url)\n",
"                self.randSleep(0.3, 0.8)\n",
"        return np.array(user_ids)\n",
"\n",
"    def _convert_json_columns(self, df, convert):\n",
"        \"\"\"Apply ``convert`` in place to every JSON column present in ``df``.\"\"\"\n",
"        for col in self.JSON_COLUMNS:\n",
"            if col in df.columns:\n",
"                df[col] = df[col].apply(convert)\n",
"        return df\n",
"\n",
"    def UpdateIllusts(self, df_illusts):\n",
"        \"\"\"Merge ``df_illusts`` into the ``illusts`` table, new rows winning.\n",
"\n",
"        Rows already stored under an ``illust_id`` present in ``df_illusts``\n",
"        are dropped and the whole table is rewritten.\n",
"\n",
"        Returns:\n",
"            The merged DataFrame as written, JSON columns serialised as text.\n",
"        \"\"\"\n",
"        db_exists = os.path.isfile(self.illust_db)\n",
"        if df_illusts.empty:\n",
"            # Nothing to merge: leave the database file untouched.\n",
"            if db_exists:\n",
"                return self.DBIllusts(ensure_json=False)\n",
"            return df_illusts.copy()\n",
"\n",
"        # A listing may repeat an illust across pages; keep the first copy so\n",
"        # that illust_id stays unique in the table.\n",
"        sql_df = df_illusts[~df_illusts.index.duplicated(keep='first')].copy()\n",
"\n",
"        # Nested list/dict fields are stored as JSON text.\n",
"        self._convert_json_columns(sql_df, json.dumps)\n",
"\n",
"        if db_exists:\n",
"            # Stored rows that share an illust_id with the new data are dropped.\n",
"            db_df = self.DBIllusts(ensure_json=False)\n",
"            db_df = db_df[~db_df.index.isin(sql_df.index)]\n",
"            merged_df = pd.concat([sql_df, db_df], sort=False)\n",
"        else:\n",
"            merged_df = sql_df\n",
"\n",
"        # Rewrite the table with the merged content.\n",
"        engine = create_engine('sqlite:///' + self.illust_db, echo=False)\n",
"        try:\n",
"            merged_df.to_sql('illusts', con=engine, if_exists='replace')\n",
"        finally:\n",
"            engine.dispose()\n",
"        return merged_df\n",
"\n",
"    def DBIllusts(self, sql=\"SELECT * FROM illusts WHERE illust_id > 0\", ensure_json=True):\n",
"        \"\"\"Run ``sql`` against the illust database and return a DataFrame.\n",
"\n",
"        Args:\n",
"            sql: query to run; its result must contain ``illust_id``.\n",
"            ensure_json: decode the JSON text columns back into Python objects.\n",
"        \"\"\"\n",
"        # sqlite3's connection context manager does not close the connection,\n",
"        # so it is closed explicitly.\n",
"        conn = lite.connect(self.illust_db)\n",
"        try:\n",
"            sql_df = pd.read_sql_query(sql, conn, index_col='illust_id')\n",
"        finally:\n",
"            conn.close()\n",
"\n",
"        if ensure_json:\n",
"            self._convert_json_columns(sql_df, json.loads)\n",
"        return sql_df\n",
"\n",
"\n",
"crawl = PixivCrawler(api)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## GetUserBookmarks(public)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch the public bookmarks of the logged-in user, then merge them into the DB.\n",
"df_bookmarks = crawl.GetUserBookmarks(user_id=user_id, restrict='public')\n",
"_ = crawl.UpdateIllusts(df_illusts=df_bookmarks)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## GetFollowingUsers(public)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ids of every account that the logged-in user follows publicly.\n",
"user_ids = crawl.GetFollowingUsers(user_id=user_id, restrict='public')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Crawl the followed users in random order, pausing a few seconds between users.\n",
"random.shuffle(user_ids)\n",
"progress = tqdm(user_ids, desc=\"GetFollowingUsers\")\n",
"for followed_id in progress:\n",
"    followed_illusts = crawl.GetUserIllusts(followed_id)\n",
"    _ = crawl.UpdateIllusts(followed_illusts)\n",
"    crawl.randSleep(base=1.1, rand=5.0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## GetIllustRanking"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Arguments of GetIllustRanking:\n",
"#   mode:        day, week, month, day_male, day_female, week_original,\n",
"#                week_rookie, day_manga\n",
"#   mode (past): day, week, month, day_male, day_female, week_original,\n",
"#                week_rookie, day_r18, day_male_r18, day_female_r18,\n",
"#                week_r18, week_r18g\n",
"#   date:        a string such as '2016-08-01'\n",
"df_ranking = crawl.GetIllustRanking(mode='week', date='2019-11-01')\n",
"_ = crawl.UpdateIllusts(df_illusts=df_ranking)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 0a02866

Please sign in to comment.