forked from hcj5206/CnkiAndCqvipCrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
PublicDef.py
191 lines (178 loc) · 8.17 KB
/
PublicDef.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# author:hcj
# @File : PublicDef.py
# datetime:2019/6/27 19:59
import os
import re
import sys
import threading
import time
import psutil as psutil
from HCJ_Buff_Control import Read_buff, Write_buff
def CreatUrlBuffTable(db,TableName):
    """Create the URL buffer table if it does not exist yet.

    The table holds one row per URL (unique on `Url`) together with a
    `State` code, a creation timestamp and the `Source` it came from.
    According to the column comment in the DDL the state codes are:
    -5 wrong date, -10 error occurred, -15 invalid link, 0 initial,
    10 in progress, 20 finished.

    :param db: database wrapper; its ``upda_sql(sql)`` method is called once
        and a falsy return value is treated as failure.
    :param TableName: name of the table, spliced into the statement as-is.
    :return: None. A failure is only reported on stdout.
    """
    ddl_template = (
        "CREATE TABLE IF NOT EXISTS `%s` ("
        "`Index` int(11) unsigned NOT NULL AUTO_INCREMENT,"
        "`Url` VARCHAR(255) DEFAULT NULL,"
        "`State` INT(11) NULL DEFAULT '0' COMMENT '-5 日期不对 -10 出现错误 -15 无效链接 0 初始 10 处理中 20 处理结束',"
        "`Datetime` DATETIME NULL DEFAULT CURRENT_TIMESTAMP,"
        "`Source` VARCHAR(200) NULL DEFAULT NULL,"
        "UNIQUE INDEX `Url` (`Url`),"
        "PRIMARY KEY (`Index`)"
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8; "
    )
    succeeded = db.upda_sql(ddl_template % TableName)
    if succeeded:
        return
    print("创建%s表出现问题" % TableName)
def CreatResultDBTable(db,TableName):
    """Create the result table (one row per collected article) if it is missing.

    Besides the auto-increment `id` primary key, every column is a nullable
    text/varchar field: url, title, authors, unit, publication, keywords,
    abstract, year, volume, issue, pagecode, doi, sponser, type and source.

    :param db: database wrapper; its ``upda_sql(sql)`` method is called once
        and a falsy return value is treated as failure.
    :param TableName: name of the table, spliced into the statement as-is.
    :return: None. A failure is only reported on stdout.
    """
    ddl_fragments = (
        "CREATE TABLE IF NOT EXISTS `%s` (",
        "`id` int(11) unsigned NOT NULL AUTO_INCREMENT,",
        "`url` text DEFAULT NULL, ",
        "`title` varchar(200) DEFAULT NULL,",
        "`authors` varchar(200) DEFAULT NULL,",
        "`unit` text DEFAULT NULL,",
        "`publication` varchar(200) DEFAULT NULL,",
        "`keywords` varchar(200) DEFAULT NULL,",
        "`abstract` text DEFAULT NULL,",
        "`year` varchar(200) DEFAULT NULL,",
        "`volume` varchar(200) DEFAULT NULL,",
        "`issue` varchar(200) DEFAULT NULL,",
        "`pagecode` varchar(200) DEFAULT NULL,",
        "`doi` varchar(200) DEFAULT NULL,",
        "`sponser` text DEFAULT NULL,",
        "`type` varchar(200) DEFAULT NULL,",
        "`source` varchar(200) DEFAULT NULL,",
        "PRIMARY KEY (`id`)",
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8; ",
    )
    statement = "".join(ddl_fragments) % TableName
    if not db.upda_sql(statement):
        print("创建出现问题")
# Matches every character that is NOT a CJK ideograph (U+4E00-U+9FA5), an ASCII
# letter, an ASCII digit or a dot. Inside a character class a caret negates
# only when it is the first character; anywhere else it is a literal '^', so it
# must not be repeated before each range or '^' itself would be kept.
_SPECIAL_CHARACTER_RE = re.compile(r"[^\u4e00-\u9fa5.a-zA-Z0-9]")


def RemoveSpecialCharacter(str1):
    """Strip special symbols, keeping Chinese characters, letters, digits and dots.

    :param str1: value to clean; it is converted with ``str()`` first, so
        non-string input (numbers, None, ...) is accepted.
    :return: the cleaned string (possibly empty).
    """
    return _SPECIAL_CHARACTER_RE.sub('', str(str1))
# Character substitutions applied to every value before it is spliced into a
# SQL string literal: line breaks, tabs and non-breaking spaces are dropped,
# while quote and percent characters are swapped for harmless stand-ins.
_SQL_VALUE_REPLACEMENTS = (
    ('\n', ''), ('\"', "#"), ('\t', ""), ('\r', ""), ('\xa0', ""), ('%', ">"), ('\'', "^"),
)

# Source names recognised inside a URL; when several match, the last one wins.
_KNOWN_SOURCES = ("cqvip", "cnki", "wanfang")


def _sanitize_sql_value(value):
    """Return *value* with every substitution of _SQL_VALUE_REPLACEMENTS applied."""
    for old, new in _SQL_VALUE_REPLACEMENTS:
        value = value.replace(old, new)
    return value


def _detect_source(url):
    """Return the known source name contained in *url*, or '' when none matches."""
    detected = ""
    for name in _KNOWN_SOURCES:
        if name in url:
            detected = name
    return detected


def InsetDbbyDict(table,Dict,db,DbDatabuff,Dbresult):
    """Store one collected article and record the outcome in the URL buffer table.

    If no row with the same title and a matching publication exists in
    ``Dbresult`` the record is inserted into ``table``; the URL's buffer row is
    then set to State 20 on success or State -15 when the insert fails (the
    error is printed). If such a row already exists, the new URL and source are
    appended to it, columns that are still empty are filled from this record,
    and the buffer row is set to State 20.

    :param table: table name used in the INSERT statement.
    :param Dict: column name -> string value. Must contain 'url', 'title' and
        'publication'. The mapping is modified in place: every value is
        sanitized and the book-title marks are removed from 'publication'.
    :param db: database wrapper providing ``do_sql_one(sql)`` (first element of
        the result is the row count), ``insert(sql)`` (returns a mapping with
        'result' and 'err') and ``upda_sql(sql)``.
    :param DbDatabuff: name of the URL buffer table whose `State` is updated.
    :param Dbresult: name of the result table used for the duplicate check and
        for the merge update.
    :return: None.

    NOTE(review): all statements are assembled by string formatting; the only
    protection is the character substitution above (backslashes are not
    handled). Switching to parameterized queries needs support from ``db``.
    """
    for key in list(Dict.keys()):
        Dict[key] = _sanitize_sql_value(Dict[key])
    Dict['publication'] = Dict['publication'].replace('《', '').replace('》', '')

    columns = ','.join(' ' + '`' + key + '`' for key in Dict)
    values = ','.join('"%s"' % str(Dict[key]) for key in Dict)
    source_name = _detect_source(Dict['url'])

    # The buffer table stores the URL as crawled, so the '%' characters that
    # sanitizing turned into '>' are restored before the row is addressed.
    # Every state update must use this form, otherwise URLs containing
    # percent-escapes are never matched.
    buffer_url = str(Dict['url']).replace('>', "%")
    state_sql = "Update `%s` set `State`=%d where `Url`='%s' "

    # Shared WHERE clause identifying "the same article"; the statement text
    # ends up containing LIKE '%%<publication>%%'.
    same_article = "`title`='%s' and `publication` LIKE '%%%%%s%%%%'" % (
        Dict['title'], Dict['publication'])

    result_count = db.do_sql_one(
        "select count(*) from `%s` where %s" % (Dbresult, same_article))

    if result_count[0] == 0 or result_count[0] == '0':
        # New article: insert it and record whether that worked.
        sql_insert = "INSERT INTO %s (%s,`source`) VALUES (%s,'%s');" % (
            table, columns, values, source_name)
        result_dic = db.insert(sql_insert)
        if result_dic['result']:
            db.upda_sql(state_sql % (DbDatabuff, 20, buffer_url))
        else:
            print(result_dic['err'])
            db.upda_sql(state_sql % (DbDatabuff, -15, buffer_url))
        return

    # Known article: append url/source and fill only columns that are empty.
    fill_empty_columns = ','.join(
        "`%s`=(case when `%s`='' then '%s' else `%s` end )" % (key, key, Dict[key], key)
        for key in Dict
        if str(Dict[key]) != ""
    )
    sql_merge = "update `%s` set `url`=concat(`url`,';%s'),`source`=concat(`source`,';%s'),%s where %s" % (
        Dbresult, Dict['url'], source_name, fill_empty_columns, same_article)
    db.upda_sql(sql_merge)
    db.upda_sql(state_sql % (DbDatabuff, 20, buffer_url))
# Print crawl progress for one source and stop the process once it is complete.
#
# db           -- database wrapper; only db.do_sql_one(sql) is used, and the first
#                 element of its result is converted to int as the row count.
# SearchDBName -- source name; compared with the `Source` column and also used as
#                 the section name when reading/writing Config.ini.
# DbDatabuff   -- name of the URL buffer table whose rows are counted.
# Dbresult     -- not used by this function.
# Returns None; may terminate the interpreter through sys.exit(0).
def ShowStatePro(db,SearchDBName,DbDatabuff,Dbresult):
# All buffered rows of this source.
sql_count_all = "select count(*) from `%s` where `Source`='%s'"%(DbDatabuff,SearchDBName)
num_all = int(db.do_sql_one(sql_count_all)[0])
# Rows with State=20.
sql_count_done = "select count(*) from `%s` where `State`=20 and `Source`='%s'"%(DbDatabuff,SearchDBName)
num_done = int(db.do_sql_one(sql_count_done)[0])
# Rows with State=-15 (reported below as invalid links); clamped to be non-negative.
sql_count_error = "select count(*) from `%s` where `State`=-15 and `Source`='%s'"%(DbDatabuff,SearchDBName)
num_error = int(db.do_sql_one(sql_count_error)[0])
num_error = num_error if num_error > 0 else 0
# Rows with State=-5 (reported below as "year does not match"); clamped likewise.
sql_count_done_not_in_year = "select count(*) from `%s` where `State`=-5 and `Source`='%s'"%(DbDatabuff,SearchDBName)
num_done_not_in_year = int(db.do_sql_one(sql_count_done_not_in_year)[0])
num_done_not_in_year = num_done_not_in_year if num_done_not_in_year > 0 else 0
# From here on num_done counts every handled row: State 20, -5 and -15 together.
num_done = num_done + num_done_not_in_year+num_error
# The progress line divides by num_all, hence the guard against an empty buffer.
if num_all > 0:
print(
"%s采集器:#############################################目前有%s条数据,其中已处理的有%s,其中年份不符合的有%s,无效链接%s,处理完成度为%.2f,##############################" % (
SearchDBName,num_all, num_done, num_done_not_in_year,num_error, (int(num_done) / int(num_all)) * 100))
# Completion check: the Config.ini value `flag_get_all_url` of this source contains
# '1' and every buffered row has been handled.
if '1' in str(Read_buff(file_buff="Config.ini", settion=SearchDBName, info='flag_get_all_url')) and num_all == num_done:
# Everything is finished: write stopflag=1 to Config.ini, wait 5 seconds,
# announce the end of the crawl and exit with status 0.
Write_buff(file_buff="Config.ini", settion=SearchDBName, info="stopflag", state=1)
time.sleep(5)
print("%s:爬取结束"%SearchDBName)
sys.exit(0)
# Value of every Chinese numeral character understood by chinese2digits:
# the digits 0-9 (with '两' as an alternative 2) and the unit markers
# 十 (10), 百 (100), 千 (1000), 万 (10^4) and 亿 (10^8).
common_used_numerals_tmp = {'零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
                            '十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
# Lookup table used at runtime; an independent copy of the literal above.
common_used_numerals = dict(common_used_numerals_tmp)


def chinese2digits(uchars_chinese):
    """Convert a string of Chinese numeral characters to an int.

    The string is scanned from right to left. Digits are multiplied by the
    unit currently in effect; 十/百/千 set that unit within the current
    万/亿 section, while 万/亿 open a new section. A unit marker in the
    leading position (as in 十三 or 十万) carries an implicit coefficient
    of one.

    :param uchars_chinese: characters taken from ``common_used_numerals``,
        for example '一千二百万' or '十三'.
    :return: the numeric value; 0 for an empty string.
    :raises TypeError: if a character is not a known numeral. This is the
        exception type callers have always received for such input.
    """
    total = 0
    unit = 1        # multiplier applied to the next digit to the left
    section = 1     # product of the 万/亿 markers governing the current section
    leading = len(uchars_chinese) - 1
    for offset, char in enumerate(reversed(uchars_chinese)):
        val = common_used_numerals.get(char)
        if val is None:
            raise TypeError("not a Chinese numeral character: %r" % (char,))
        if val < 10:
            total += unit * val
            continue
        if val >= 10000:
            # 万 after 亿 (as in 一万亿) multiplies; a larger marker replaces.
            section = val if val > section else section * val
            unit = section
        else:
            # 十/百/千 are always relative to the enclosing 万/亿 section, never
            # to a smaller unit seen earlier (一千二百万 -> 千 means 10^7).
            unit = section * val
        if offset == leading:
            # Leading unit without a digit, e.g. 十三 or 十万: count it once.
            total += unit
    return total
# Characters that may open a Chinese number.
num_str_start_symbol = list('一二两三四五六七八九十')
# Characters that may continue a number once one has been opened.
more_num_str_symbol = list('零一二两三四五六七八九十百千万亿')


def changeChineseNumToArab(oriStr):
    """Replace every Chinese number inside *oriStr* with Arabic digits.

    A number starts at a character from ``num_str_start_symbol`` and extends
    over the following characters from ``more_num_str_symbol``; each such run
    is converted with ``chinese2digits``. All other characters are copied
    unchanged, including numeral characters such as '零' or '百' that appear
    outside a run.

    :param oriStr: text to convert.
    :return: the converted text; '' for empty input.
    """
    converted = []
    pending = ''    # numeral run collected so far; empty when outside a number
    for char in oriStr:
        if char in num_str_start_symbol:
            pending += char
            continue
        if pending:
            if char in more_num_str_symbol:
                pending += char
                continue
            # The run ended on this character: emit its value first.
            converted.append(str(chinese2digits(pending)))
            pending = ''
        converted.append(char)
    if pending:
        # The text ended while a number was still open.
        converted.append(str(chinese2digits(pending)))
    return ''.join(converted)
if __name__ == "__main__":
    # Sample text mixing Chinese characters, digits and letters, kept for
    # manual experiments when the module is run directly.
    str1 = u"我爱你中国123456Zsdfsdf"