-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathto_pdf.py
134 lines (113 loc) · 5.13 KB
/
to_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import json
import re
import time
import pandas as pd
import pdfkit
import requests
from bs4 import BeautifulSoup
from lxml import etree
from openpyxl import Workbook
# Two independent in-memory workbooks. The active sheet of each one is seeded
# with a single header row: one column for WeChat article links, one column
# for PDF download links.
wb, wb1 = Workbook(), Workbook()
ws, ws1 = wb.active, wb1.active
ws.append(["微信链接"])
ws1.append(["PDF下载链接"])
# Browser identity sent with every request that passes ``headers=headers``.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"

# Cookie string captured from a logged-in Feishu browser session.
# NOTE(review): this is a hard-coded session credential; it presumably expires
# and would be better loaded from the environment than committed to the repo.
_COOKIE = "passport_web_did=6991686627378610179; _csrf_token=006d8b440983f6a55624c75ec805cbb723740b6b-1627878898; lang=zh; garr_version_list=%7B%7D; vt=1; et=d2559f22427792d90a4449b5947f79ba; ot=d2559f22427792d90a4449b5947f79ba; __tea__ug__uid=6991686625102710303; locale=zh-CN; trust_browser_id=45cd2fe3-f4fa-4e14-8ad3-54f0344741ed; _ga=GA1.2.2054286673.1627881995; _gid=GA1.2.1200040649.1627881995; fid=d51de26d-6ecf-4a26-9364-d46043764697; is_anonymous_session=; MONITOR_WEB_ID=6991700032240025602; template-branch-fixed=1; landing_url=https://passport.feishu.cn/suite/passport/page/login/?query_scope=all&app_id=2&should_pass_through=1&utm_from=organic_ccm_share_web&redirect_uri=https%3A%2F%2Fnnf1sjjavr.feishu.cn%2Fdrive%2Fhome%2F&template_id=6882649779491307521&biz=feishu_docs; session=XN0YXJ0-1d426122-efdd-4d76-9604-03d9f342b8bg-WVuZA; template-branch-list=; swp_csrf_token=2f8a4936-b2a1-42b5-8203-f905dc6d6610; t_beda37=8b0ccfcba972fa82b6088e7cd229575634347796047dcda911415bfb71b22f75"

# Shared HTTP request headers used by the download helpers in this script.
headers = {
    "user-agent": _USER_AGENT,
    "cookie": _COOKIE,
}
def get_yuanrenyun_ip():
    """Fetch one proxy from the jghttp extraction API and build a proxy map.

    Returns:
        dict: ``{"http": "http://<ip>:<port>", "https": "http://<ip>:<port>"}``,
        in the shape accepted by the ``proxies=`` argument of ``requests``.

    Raises:
        requests.RequestException: if the API call fails, times out, or
            answers with an HTTP error status.
        KeyError, IndexError: if the JSON payload has no ``data[0]`` entry
            carrying ``ip`` and ``port``.
    """
    # Proxy tunnel extraction endpoint. An alternative provider, kept for reference:
    # url = "http://http.tiqu.letecs.com/getip3?num=1&type=2&pro=&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=2&regions=&gm=4"
    # The query must read "&regions=": "&reg" had been HTML-decoded into the
    # "®" sign, which corrupted the parameter name.
    url = "http://d.jghttp.alicloudecs.com/getip?num=1&type=2&pro=&city=0&yys=0&port=1&time=4&ts=0&ys=0&cs=0&lb=1&sb=0&pb=45&mr=1&regions=&username=chukou01&spec=1"
    # A timeout keeps a stalled proxy API from hanging the whole batch.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    resp = response.json()
    print(resp)
    entry = resp["data"][0]
    meta = f"http://{entry['ip']}:{entry['port']}"
    # The same HTTP proxy endpoint is used for both plain and TLS traffic.
    return {
        "http": meta,
        "https": meta,
    }
# wkhtmltopdf options passed to pdfkit when rendering a page: A4 paper and
# UTF-8 input encoding.
# Switches that were tried and are currently disabled: the four 'margin-*'
# settings ('0.75in' each), 'custom-header', 'debug-javascript',
# 'javascript-delay' (10000), 'no-stop-slow-scripts' and
# 'load-media-error-handling' ('abort').
options = {
    "page-size": "A4",
    "encoding": "UTF-8",
}
def get_ip():
    """Fetch one proxy address from the dailiyun query API as a proxy map.

    Returns:
        dict: ``{"http": <proxy url>, "https": <proxy url>}``, in the shape
        accepted by the ``proxies=`` argument of ``requests``.

    Raises:
        requests.RequestException: if the query fails, times out, or answers
            with an HTTP error status.
        ValueError: if the API answers with an empty body.
    """
    # The query must read "&ltime=0": "&lt" had been HTML-decoded into "<",
    # which corrupted the parameter name.
    resp = requests.get(
        "http://lumingmelody.v4.dailiyun.com/query.txt?key=NP2908A5A2&word=&count=1&rand=false&ltime=0&norepeat=false&detail=false",
        timeout=10,
    )
    resp.raise_for_status()
    # Assumes the body is a single "host:port" line (count=1) — TODO confirm
    # against the provider. Splitting drops the trailing newline/whitespace.
    tokens = resp.text.split()
    if not tokens:
        raise ValueError("proxy API returned an empty response")
    address = tokens[0]
    if "://" not in address:
        address = f"http://{address}"
    # requests looks proxies up by bare URL scheme ("http" / "https"). The
    # former "http://" / "https://" keys never matched any request, so the
    # proxy was silently skipped.
    return {
        "http": address,
        "https": address,
    }
def weixin_to_pdf(url):
    """Render a WeChat article to ``./巨量PDF/<title>.pdf`` with wkhtmltopdf.

    The title is read from the element with id ``activity-name``; the article
    body is obtained through ``get_html(url)``. The wkhtmltopdf executable is
    expected at ``D:\\wkhtmltopdf\\bin\\wkhtmltopdf.exe``.

    Args:
        url: Address of the WeChat article page.

    Raises:
        requests.RequestException: if fetching the page fails or times out.
    """
    # A timeout keeps one unresponsive page from hanging the whole batch.
    resp = requests.get(url, headers=headers, timeout=30)
    html = etree.HTML(resp.text)
    title_parts = html.xpath('//*[@id="activity-name"]/text()')
    title = ''.join(str(part) for part in title_parts).strip()
    print(title)
    # Titles are free text: characters that are illegal in file names
    # (path separators, ? : " < > | * and line breaks) would make the write
    # fail or land in the wrong directory, so they are replaced.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title)
    config = pdfkit.configuration(wkhtmltopdf=r'D:\wkhtmltopdf\bin\wkhtmltopdf.exe')
    pdfkit.from_string(get_html(url), f'./巨量PDF/{safe_title}.pdf', configuration=config, options=options)
    # Pause between articles (presumably to avoid hammering the server).
    time.sleep(3)
def down_load(d_id, d_url):
    """Download one Feishu file into ``./巨量PDF1/`` under its remote title.

    The title is looked up through the Feishu ``space/api/meta`` endpoint
    (requested with the proxies returned by ``get_ip()``); the file itself is
    then fetched from ``d_url``.

    Args:
        d_id: File token used for the metadata lookup.
        d_url: Direct download address of the file.

    Raises:
        requests.RequestException: if a request fails or times out, or the
            download answers with an HTTP error status.
        KeyError: if the metadata response has no ``data`` / ``title`` entry.
    """
    p_url = f"https://bytedance.feishu.cn/space/api/meta/?token={d_id}&type=12"
    proxies = get_ip()
    resp = requests.get(p_url, headers=headers, proxies=proxies, timeout=30).json()
    print(resp)
    time.sleep(3)
    title = resp['data']['title']
    # The remote title is free text: replace characters that are illegal in
    # file names so it cannot break the path or escape the target directory.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title)
    response = requests.get(d_url, headers=headers, timeout=30)
    # Without this check an error page (expired cookie, bad token, ...) would
    # be written to disk as if it were the requested file.
    response.raise_for_status()
    with open(f"./巨量PDF1/{safe_title}", mode='wb') as f:
        f.write(response.content)
    # Pause between downloads (presumably to stay under rate limits).
    time.sleep(3)
def get_html(url):
    """Fetch a WeChat article and return its body as an HTML fragment.

    The returned string is a small ``<style>`` prelude (font selection)
    followed by the markup of the ``div#img-content`` element.

    Args:
        url: Address of the WeChat article page.

    Returns:
        str: HTML ready to be handed to ``pdfkit.from_string``.

    Raises:
        requests.RequestException: if fetching the page fails or times out.
        IndexError: if the page contains no ``div#img-content`` element.
    """
    # A timeout keeps one unresponsive page from hanging the whole batch.
    res = requests.get(url, timeout=30)
    # Images keep their address in data-src, so rename it to src; the body is
    # sometimes returned hidden, so strip the visibility style as well.
    html = res.text.replace("data-src", "src").replace('style="visibility: hidden;"', "")
    # Name the parser explicitly: without it BeautifulSoup guesses, warns, and
    # may parse differently from one machine to the next. lxml is already a
    # dependency of this script.
    soup = BeautifulSoup(html, "lxml")
    # Keep only the article body (drops scripts and the rest of the page).
    matches = soup.select('div#img-content')
    if not matches:
        raise IndexError(f"no div#img-content element found in {url}")
    content = matches[0]
    # The font can be changed here.
    # NOTE(review): the @font-face rule below has no closing brace and uses a
    # backslash Windows path inside url(); left untouched because changing it
    # could alter the rendered output — confirm the intended CSS.
    font = '''
<style type="text/css">
@font-face{font-family: "微软雅黑";src:url("C:\\Windows\\Fonts\\msyh.ttc")
</style>
<style type = "text/css">
p { font-family: '微软雅黑', cursive; }
</style>
'''
    return font + str(content)
if __name__ == '__main__':
    # Batch entry point: download every Feishu file listed in the spreadsheet.
    from urllib.parse import urlsplit

    df = pd.read_excel(r"D:\red_book\red_book_51wom\red_book_8月\red_book_08_02\PDF下载链接.xlsx")
    # Blank spreadsheet cells load as NaN (a float), which would crash the
    # string handling below, so they are skipped.
    urls = df['PDF下载链接'].dropna()
    # To convert WeChat articles instead, read the '未下载微信链接' column of
    # r"D:\red_book\red_book_51wom\red_book_8月\red_book_08_03\未下载微信链接.xlsx"
    # and call weixin_to_pdf(url) for each entry.
    for url in urls:
        # The file token is the last path segment, e.g.
        # ".../space/api/box/stream/download/all/boxcnZr2Ucnj0HKfnA8oDGyLSEd/".
        # Taking it from the parsed path works with or without the trailing
        # slash; the former split("/")[-2] returned "all" when it was missing.
        d_id = urlsplit(str(url)).path.rstrip("/").rsplit("/", 1)[-1]
        down_load(d_id, url)