-
Notifications
You must be signed in to change notification settings - Fork 144
/
Copy pathzhihu_database.py
169 lines (150 loc) · 6.22 KB
/
zhihu_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 21 15:39:46 2015
@author: Che
"""
import glob
import sqlite3
import pandas as pd
import string
def user_to_sqlite():
'''
User's porfolio information
'''
conn = sqlite3.connect("zhihu.db")
user_path = 'csv/user/edited/*'
user_files = glob.glob(user_path)
for ufile in user_files:
data = pd.read_csv(ufile, header=None)
for row in data.iterrows():
record_id = "NULL"
user_url = "\'" + str(row[1][0]) + "\'"
#translate可以去掉string里所有標點符號, e.g. James Ch'i -> James Chi
user_id = "\'" + str(row[1][1]).translate(string.maketrans("",""), string.punctuation) + "\'"
followee_num = str(row[1][2])
follower_num = str(row[1][3])
answer_num = str(row[1][4])
agree_num = str(row[1][5])
thanks_num = str(row[1][6])
layer = str(row[1][7])
is_crawled = "NULL"
#handle duplicate wih OR REPLACE
try:
command = "INSERT OR REPLACE INTO user VALUES(" + record_id + "," + user_url + "," + user_id + "," + \
followee_num + "," + follower_num + "," + answer_num + "," + \
agree_num + "," + thanks_num + "," + layer + "," + is_crawled + ")"
conn.execute(command)
except sqlite3.Error as e:
print 'Error:', e.args[0]
print 'at command:', command
continue
conn.commit()
conn.close()
def following_to_sqlite_1_to_n():
'''
User's followees
'''
conn = sqlite3.connect("zhihu.db")
following_path = 'csv/following/*'
following_files = glob.glob(following_path)
for ffile in following_files:
data = pd.read_csv(ffile, header=None)
for row in data.iterrows():
record_id = "NULL"
user_url = "\'" + str(row[1][0]) + "\'"
#handle duplicate with distinct index
for followee in str(row[1][1]).split('\t'):
followee_url = "\'" + followee + "\'"
try:
command = "INSERT INTO following VALUES(" + record_id + "," + user_url + "," + followee_url + ")"
conn.execute(command)
except sqlite3.Error as e:
print 'Error:', e.args[0]
print 'at command:', command
continue
conn.commit()
conn.close()
def following_to_sqlite_1_to_1():
'''
User's followees; This method is not used due to the format of crawling result csv.
'''
conn = sqlite3.connect("zhihu.db")
following_path = 'csv/following/*'
following_files = glob.glob(following_path)
for ffile in following_files:
data = pd.read_csv(ffile, header=None)
for row in data.iterrows():
record_id = "NULL"
user_url = "\'" + str(row[1][0]) + "\'"
followee_url = "\'" + str(row[1][1]) + "\'"
#handle duplicate with distinct index
try:
command = "INSERT INTO following VALUES(" + record_id + "," + user_url + "," + followee_url + ")"
conn.execute(command)
except sqlite3.Error as e:
print 'Error:', e.args[0]
print 'at command:', command
continue
conn.commit()
conn.close()
def user_question_to_sqlite():
'''
User answered questions
'''
conn = sqlite3.connect("zhihu.db")
following_path = 'csv/user_question/*'
following_files = glob.glob(following_path)
for ffile in following_files:
data = pd.read_csv(ffile, header=None)
for row in data.iterrows():
record_id = "NULL"
user_quesitons_pair = str(row[1][0]).split('\t', 1)
user_url = "\'" + user_quesitons_pair[0] + "\'"
for q in user_quesitons_pair[1].split('\t'):
question_id = "\'" + q + "\'"
#handle duplicate with distinct index
try:
command = "INSERT INTO UserQuestion VALUES(" + record_id + "," + user_url + "," + question_id + ")"
conn.execute(command)
except sqlite3.Error as e:
print 'Error:', e.args[0]
print 'at command:', command
continue
conn.commit()
conn.close()
def question_topic_to_sqlite():
'''
question's topics
'''
conn = sqlite3.connect("zhihu.db")
following_path = 'csv/question_topic/*'
following_files = glob.glob(following_path)
for ffile in following_files:
#sep to ensure the whole row is the only one field
#use open() can work, too
data = pd.read_csv(ffile, header=None, sep='\t\t')
for row in data.iterrows():
record_id = "NULL"
question_topics_pair = str(row[1][0]).split('\t', 1)
question_id = "\'" + question_topics_pair[0] + "\'"
#delete old topics, effeciency is low
try:
command = "DELETE FROM Question WHERE question_id=" + question_id
conn.execute(command)
except sqlite3.Error as e:
print 'Error:', e.args[0]
print 'at command:', command
continue
#insert new topics, handle duplicate with distinct index
if len(question_topics_pair) == 2: #question has topics
for t in question_topics_pair[1].split('\t'):
topic = "\'" + str(t).replace("'","''") + "\'" #escape "'" to "''" in sqlite
try:
command = "INSERT INTO Question VALUES(" + record_id + "," + question_id + "," + topic + ")"
conn.execute(command)
except sqlite3.Error as e:
print 'Error:', e.args[0]
print 'at command:', command
continue
conn.commit()
conn.close()