# main.py
__author__ = 'ando'
import os
import random
from multiprocessing import cpu_count
import logging as log
import numpy as np
import psutil
from math import floor
from ADSCModel.model import Model
from ADSCModel.context_embeddings import Context2Vec
from ADSCModel.node_embeddings import Node2Vec
from ADSCModel.community_embeddings import Community2Vec
import utils.IO_utils as io_utils
import utils.graph_utils as graph_utils
import utils.plot_utils as plot_utils
import timeit
log.basicConfig(format='%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s', level=log.DEBUG)
# Pin the process to all available CPUs. Older psutil releases expose
# set_cpu_affinity(), newer ones cpu_affinity(); on platforms without
# affinity support (e.g. macOS) both raise AttributeError and the default
# scheduling is left untouched.
p = psutil.Process(os.getpid())
try:
    p.set_cpu_affinity(list(range(cpu_count())))
except AttributeError:
    try:
        p.cpu_affinity(list(range(cpu_count())))
    except AttributeError:
        pass
if __name__ == "__main__":
    # Input parameters (hard-coded here rather than read from a configuration file)
    number_walks = 10               # number of walks per node
    walk_length = 80                # length of each walk
    representation_size = 128       # size of the embedding
    num_workers = 10                # number of worker threads
    num_iter = 1                    # number of overall iterations
    reg_covar = 0.00001             # regularization coefficient to keep the covariance positive-definite
    input_file = 'Dblp'             # name of the input file
    output_file = 'Dblp'            # name of the output file
    batch_size = 50
    window_size = 10                # window size used to compute the context embeddings
    negative = 5                    # number of negative samples
    lr = 0.025                      # learning rate

    alpha_betas = [(0.1, 0.1)]
    down_sampling = 0.0

    ks = [5]
    walks_filebase = os.path.join('data', output_file)      # where to read/write the sampled walks
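
    # --- Added note (not in the original script) ----------------------------
    # The settings above are hard-coded. A minimal sketch of exposing them on
    # the command line instead, using argparse from the standard library
    # (the flag names below are illustrative, not part of this project):
    #
    #   import argparse
    #   parser = argparse.ArgumentParser(description='ComE-style training')
    #   parser.add_argument('--input', default=input_file)
    #   parser.add_argument('--walks', type=int, default=number_walks)
    #   parser.add_argument('--walk-length', type=int, default=walk_length)
    #   parser.add_argument('--size', type=int, default=representation_size)
    #   args = parser.parse_args()
    #   input_file, number_walks = args.input, args.walks
    #   walk_length, representation_size = args.walk_length, args.size
    # -------------------------------------------------------------------------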
    # CONSTRUCT THE GRAPH
    G = graph_utils.load_matfile(os.path.join('./data', input_file, input_file + '.mat'), undirected=True)

    # Sampling the random walks for context
    log.info("sampling the paths")
    walk_files = graph_utils.write_walks_to_disk(G, os.path.join(walks_filebase, "{}.walks".format(output_file)),
                                                 num_paths=number_walks,
                                                 path_length=walk_length,
                                                 alpha=0,
                                                 rand=random.Random(0),
                                                 num_workers=num_workers)
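
    # Added sanity check (not in the original): each line of a .walks file is
    # expected to hold one whitespace-separated node sequence, so peeking at
    # the first line is a cheap way to confirm the sampling step worked.
    with open(walk_files[0]) as f:
        log.debug("first sampled walk: %s", f.readline().strip())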
    vertex_counts = graph_utils.count_textfiles(walk_files, num_workers)

    model = Model(vertex_counts,
                  size=representation_size,
                  down_sampling=down_sampling,
                  table_size=100000000,
                  input_file=os.path.join(input_file, input_file),
                  path_labels="./data")
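
    # Added sanity log (not in the original): model.vocab is used when the
    # embeddings are saved at the end of the run, so it is assumed here to be
    # the node vocabulary built from the walk frequency counts.
    log.debug("vocabulary size: %d", len(model.vocab))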
    # Learning algorithms: one learner per objective (edges, walk contexts, communities)
    node_learner = Node2Vec(workers=num_workers, negative=negative, lr=lr)
    cont_learner = Context2Vec(window_size=window_size, workers=num_workers, negative=negative, lr=lr)
    com_learner = Community2Vec(lr=lr)

    context_total_path = G.number_of_nodes() * number_walks * walk_length
    edges = np.array(G.edges())
    log.debug("context_total_path: %d" % context_total_path)
    log.debug('node total edges: %d' % G.number_of_edges())
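
    # Added note (not in the original): context_total_path is the number of
    # node occurrences in the sampled walk corpus. It is passed below to the
    # context learner as total_nodes, and it is also the basis for deriving
    # how many edge/community passes roughly match one pass over the walks.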
    log.info('\n_______________________________________')
    log.info('\t\tPRE-TRAINING\n')
    ###########################
    #       PRE-TRAINING      #
    ###########################
    node_learner.train(model,
                       edges=edges,
                       iter=1,
                       chunksize=batch_size)
    cont_learner.train(model,
                       paths=graph_utils.combine_files_iter(walk_files),
                       total_nodes=context_total_path,
                       alpha=1,
                       chunksize=batch_size)
    model.save("{}_pre-training".format(output_file))
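
    # Added note (not in the original): this checkpoint is reloaded at the top
    # of every (alpha, beta, k) configuration below via model.load_model, so
    # each configuration starts from the same pre-trained node and context
    # embeddings rather than from the previous configuration's weights.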
    ###########################
    #    EMBEDDING LEARNING   #
    ###########################
    iter_node = floor(context_total_path / G.number_of_edges())
    iter_com = floor(context_total_path / G.number_of_edges())   # same ratio as iter_node in this configuration
    # iter_com = 1
    # alpha, beta = alpha_betas
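
    # Added worked example (illustrative numbers, not from the Dblp data):
    # with |V| = 10,000 nodes, 10 walks of length 80 the corpus holds
    # 10,000 * 10 * 80 = 8,000,000 node occurrences; with |E| = 50,000 edges
    # that gives iter_node = iter_com = floor(8,000,000 / 50,000) = 160, i.e.
    # enough edge/community passes to roughly match one pass over the walks.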
    for it in range(num_iter):
        for alpha, beta in alpha_betas:
            for k in ks:
                log.info('\n_______________________________________\n')
                log.info('\t\tITER-{}\n'.format(it))
                model = model.load_model("{}_pre-training".format(output_file))
                model.reset_communities_weights(k)
                log.info('using alpha:{}\tbeta:{}\titer_com:{}\titer_node: {}'.format(alpha, beta, iter_com, iter_node))
                start_time = timeit.default_timer()

                # Fit the community distribution on the current node
                # embeddings, then interleave the three training objectives.
                com_learner.fit(model, reg_covar=reg_covar, n_init=10)
                node_learner.train(model,
                                   edges=edges,
                                   iter=iter_node,
                                   chunksize=batch_size)
                com_learner.train(G.nodes(), model, beta, chunksize=batch_size, iter=iter_com)
                cont_learner.train(model,
                                   paths=graph_utils.combine_files_iter(walk_files),
                                   total_nodes=context_total_path,
                                   alpha=alpha,
                                   chunksize=batch_size)
                log.info('time: %.2fs' % (timeit.default_timer() - start_time))
                # log.info(model.centroid)
                io_utils.save_embedding(model.node_embedding, model.vocab,
                                        file_name="{}_alpha-{}_beta-{}_ws-{}_neg-{}_lr-{}_icom-{}_ind-{}_k-{}_ds-{}".format(
                                            output_file, alpha, beta, window_size, negative, lr,
                                            iter_com, iter_node, model.k, down_sampling))
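
    # --- Added usage note (not in the original script) -----------------------
    # The saved file name encodes every hyperparameter of the run. A sketch of
    # loading the vectors back for a downstream task, assuming io_utils exposes
    # a loader symmetric to save_embedding (hypothetical helper name):
    #
    #   embedding = io_utils.load_embedding(
    #       file_name="{}_alpha-{}_beta-{}_ws-{}_neg-{}_lr-{}_icom-{}_ind-{}_k-{}_ds-{}".format(
    #           output_file, alpha, beta, window_size, negative, lr,
    #           iter_com, iter_node, model.k, down_sampling))
    #   # e.g. feed `embedding` to a node-classification or clustering model
    # -------------------------------------------------------------------------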