-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdivide.py
496 lines (433 loc) · 22.5 KB
/
divide.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
#!/usr/bin/env python
#spellcheck-off
"""
Divide a corpus into training and test sets.
Before this script, we have selected some subset of lemmas and morph combinations
that we analyse, and in this script we divide the data so that train and test sets:
* have the same distribution of lemmas (minimise atom divergence)
* and the same distribution of morph tags (minimise atom divergence)
* but different distribution of the combinations of these two (maximise (or
set to some other specified value) compound divergence)
"""
from os import path, makedirs
import argparse
import random
import sys
import pickle as pkl
from tqdm import tqdm
import torch
# Identifiers of the three sets a sentence can be assigned to.
TRAIN_SET = 0
TEST_SET = 1
DISCARD_SET = 2
SET_NAMES = {TRAIN_SET: 'TRAIN_SET', TEST_SET: 'TEST_SET', DISCARD_SET: 'DISCARD_SET'}

# Value written into the score mask for candidates that must not be selected.
MASK_VALUE = -torch.inf

# Choose the compute device: CPU when CUDA is unavailable, the default CUDA
# device with one GPU, the second GPU with two GPUs. Any other GPU count is
# rejected.
if not torch.cuda.is_available():
    print("Using only CPU.")
    device = torch.device("cpu")
else:
    print("Using", torch.cuda.device_count(), "GPU(s).")
    if torch.cuda.device_count() not in (1, 2):
        raise ValueError("More than 2 GPUs are not supported.")
    device = torch.device("cuda" if torch.cuda.device_count() == 1 else "cuda:1")

# Device used as map_location when the frequency matrices are loaded from disk.
secondary_device = torch.device("cpu")
def chernoff_coef(matrix1, matrix2, alpha):
    """
    Chernoff coefficient C_{alpha}(P||Q) = sum_k[p_k^alpha * q_k^(1-alpha)]
    between probability distributions P (matrix1) and Q (matrix2).

    The product is evaluated in log space and summed over the last axis of
    matrix1: axis 0 for a 1D input, axis 1 (one value per row) for a 2D input.
    The inputs are used as given; callers are expected to normalise them.
    (Atom divergence is 1 - Chernoff coefficient, with alpha=0.5)
    (Compound divergence is 1 - Chernoff coefficient, with alpha=0.1)

    Returns
    -------
    torch.Tensor
        A scalar tensor for 1D matrix1, or a vector with one coefficient
        per row for 2D matrix1.

    Raises
    ------
    ValueError
        If matrix1 is not 1D or 2D, if a 1D matrix2 does not match the row
        length of a 2D matrix1, or if alpha is outside [0, 1].
    """
    n_dims = len(matrix1.shape)
    if n_dims not in (1, 2):
        raise ValueError("matrix must be 1D or 2D")
    # the length check only applies to the matrix-vs-vector case
    if n_dims == 2 and len(matrix2.shape) == 1 \
            and matrix2.shape[0] != matrix1.shape[1]:
        raise ValueError("matrix second dim must be equal to vector length")
    if alpha < 0 or alpha > 1:
        raise ValueError("alpha must be in [0,1]")

    sum_axis = n_dims - 1
    weighted_log_p = torch.log(matrix1) * alpha
    weighted_log_q = torch.log(matrix2) * (1-alpha)
    return torch.sum(torch.exp(weighted_log_p + weighted_log_q), axis=sum_axis)
def mat_vec_divergence(matrix, vector, alpha):
    """Return 1 - Chernoff coefficient between each row of `matrix` and `vector`.

    Both inputs are normalised to sum to 1 (per row for the matrix) first.
    """
    row_distributions = normalize_matrix(matrix)
    reference_distribution = normalize_vector(vector)
    return 1 - chernoff_coef(row_distributions, reference_distribution, alpha)
def mat_mat_divergence(matrix1, matrix2, alpha):
    """Return 1 - Chernoff coefficient between corresponding rows of two matrices.

    Each matrix is row-normalised before the comparison.
    """
    first_rows = normalize_matrix(matrix1)
    second_rows = normalize_matrix(matrix2)
    return 1 - chernoff_coef(first_rows, second_rows, alpha)
def get_scores(atomdivs, comdivs, target_atom_div, target_com_div):
    """Score every candidate by its distance from the target divergences.

    The score is the negated sum of the absolute deviations of the compound
    and atom divergences from their targets, so 0 is the best possible score.
    """
    com_deviation = torch.abs(comdivs - target_com_div)
    atom_deviation = torch.abs(atomdivs - target_atom_div)
    return - com_deviation - atom_deviation
def remove_row(matrix, row):
    """Return a copy of `matrix` without the entry at index `row`.

    Works on a matrix (2D tensor, a row is dropped) or a vector (1D tensor,
    an element is dropped). The keep-mask is created on the tensor's own
    device, so the function does not depend on which device the module
    selected globally.
    """
    keep_mask = torch.arange(matrix.shape[0], device=matrix.device) != row
    return matrix[keep_mask]
def delete_value(vector, value):
    """Return `vector` without the first element equal to `value`.

    Only the first match is removed if the value occurs more than once
    (vectors are not expected to contain duplicates).
    """
    match_positions = torch.nonzero(vector == value, as_tuple=True)[0]
    first_match = match_positions[0]
    return remove_row(vector, first_match)
def normalize_vector(vector):
    """Scale a vector so that it sums to 1.

    NaNs produced by dividing by a zero total are replaced by 0, so an
    all-zero vector stays all-zero.
    """
    total = torch.sum(vector)
    scaled = torch.divide(vector, total)
    return torch.nan_to_num(scaled)
def normalize_matrix(matrix):
    """Scale each row of a matrix so that it sums to 1.

    Rows whose total is 0 would become NaN; those NaNs are replaced by 0.
    """
    # keepdim makes the row totals a column, so the division broadcasts per row
    row_totals = torch.sum(matrix, dim=1, keepdim=True)
    return torch.nan_to_num(torch.divide(matrix, row_totals))
def load_struct(filename):
    """Load a structure from a file, chosen by its extension.

    `.pkl` files are unpickled; `.txt` files are read as UTF-8 and returned
    as a list of whitespace-stripped lines. Any other extension terminates
    the program through sys.exit.
    """
    if filename.endswith('.pkl'):
        # NOTE(review): pickle executes arbitrary code on load; only use on trusted files.
        with open(filename, 'rb') as handle:
            return pkl.load(handle)
    if filename.endswith('.txt'):
        with open(filename, 'r', encoding='utf-8') as handle:
            return [line.strip() for line in handle]
    sys.exit('Unknown file extension. Supported: pkl, txt.')
def cp_file_names(data_dir, i, atomdiv, comdiv) -> tuple[str, str]:
    """Build the checkpoint file names of the train and test sets.

    The iteration number and both divergences are embedded in the names,
    and the files are placed directly under `data_dir`.
    """
    suffix = f'_iter{i}_comdiv{comdiv}_atomdiv{atomdiv}'
    return (f'{data_dir}/train_set{suffix}.txt',
            f'{data_dir}/test_set{suffix}.txt')
def get_candidates(current_freq_sums_dict, subset, the_other_set, changes) -> dict:
    """Return the frequency sums that would result from moving `changes`
    into `subset` and out of `the_other_set`.

    The result maps `subset` to its current sums plus `changes` and
    `the_other_set` to its current sums minus `changes`.
    """
    candidates = {}
    candidates[subset] = current_freq_sums_dict[subset] + changes
    candidates[the_other_set] = current_freq_sums_dict[the_other_set] - changes
    return candidates
class DivideTrainTest:
    """Divide sample set to train and test sets. This class is inherited by other classes
    that define the actual algorithm in the divide_corpus() method.

    The constructor reads the frequency matrices and id lists from `data_dir`
    and allocates the bookkeeping tensors shared by the algorithms.
    """

    def __init__(self,
                 data_dir=".",
                 subsample_size=None,
                 subsample_iter=None,
                 ):
        """Read the data and allocate the working tensors.

        `subsample_size` and `subsample_iter` must be given together or not
        at all. When given, candidates are drawn from a random subsample of
        `subsample_size` sentences.

        Raises:
            ValueError: if only one of the two subsample arguments is given,
                if subsample_size < subsample_iter, or if subsample_size
                exceeds the number of sentences.
        """
        self._read_data(data_dir)
        self.n_samples = len(self.sent_ids)
        print(f'Size of corpus: {self.n_samples} sentences.')

        if subsample_size is not None:
            # Check for a missing subsample_iter first: comparing an int with
            # None would raise TypeError instead of the intended ValueError.
            if subsample_iter is None:
                raise ValueError('subsample_size defined but subsample_iter not defined!')
            if subsample_size < subsample_iter:
                raise ValueError('subsample_size smaller than subsample_iter!')
            if self.n_samples < subsample_size:
                raise ValueError('subsample_size is larger than the number of sentences!')
            self.do_subsample = True
            self.subsample_size = subsample_size
            self.subsample_iter = subsample_iter
            self.n_matrix_rows = int(self.subsample_size)
            self.random_idxs = torch.zeros(self.subsample_size, device=device)
        else:
            if subsample_iter is not None:
                raise ValueError('subsample_iter defined but subsample_size not defined!')
            self.do_subsample = False
            # keep the attributes defined so that readers never hit AttributeError
            self.subsample_size = None
            self.subsample_iter = None
            self.n_matrix_rows = int(self.n_samples)

        # sizes of matrices
        self.com_dim = self.com_freq_matrix_full.shape[1]
        self.atom_dim = self.atom_freq_matrix_full.shape[1]

        # 2 matrices: one for each set
        self.candidate_com_sums = torch.zeros((2, self.n_matrix_rows, self.com_dim),
                                              device=device)
        self.candidate_atom_sums = torch.zeros((2, self.n_matrix_rows, self.atom_dim),
                                               device=device)

        # keep unnormalised vectors as separate variables to enable updating
        self.subset_com_freq_sum = torch.zeros((2, self.com_dim), device=device)
        self.subset_atom_freq_sum = torch.zeros((2, self.atom_dim), device=device)

        # init subset indices in the discard set
        self.subset_indices = torch.zeros(self.n_samples, device=device) + DISCARD_SET

    def _read_data(self, data_dir: str) -> None:
        """Load the atom/compound frequency matrices and the id structures
        from `data_dir`. The matrices are mapped to `secondary_device`."""
        group_suffix = ''
        print('Reading data from files...')
        self.atom_freq_matrix_full = torch.load(
            path.join(data_dir, f'atom_freqs{group_suffix}.pt'),
            map_location=secondary_device)
        self.com_freq_matrix_full = torch.load(
            path.join(data_dir, f'compound_freqs{group_suffix}.pt'),
            map_location=secondary_device)
        self.atom_ids = load_struct(path.join(data_dir, 'atom_ids.pkl'))
        self.com_ids = load_struct(path.join(data_dir, 'com_ids.pkl'))
        self.sent_ids = load_struct(path.join(data_dir, f'used_sent_ids{group_suffix}.txt'))
        print('Done reading data.')

    def write_ids_to_file(self, set_ids, set_output):
        """Write the ids of the sentences in the set to a file, one per line."""
        with open(set_output, 'w', encoding='utf-8') as f:
            for sent_id in set_ids:
                f.write(f'{sent_id}\n')

    def print_subset_atoms_and_compounds(self, print_all=False, separate=True):
        """Print the atoms and compounds in the train and test sets.

        With `separate` the names of the atoms/compounds with non-zero
        frequency are listed per set. Otherwise the train and test
        frequencies are printed side by side; `print_all` then also includes
        the items with zero frequency in both sets.
        """
        if separate:
            listings = (
                ('\nATOMS in TRAIN SET:', self.atom_ids, self.subset_atom_freq_sum[TRAIN_SET]),
                ('\nATOMS in TEST SET:', self.atom_ids, self.subset_atom_freq_sum[TEST_SET]),
                ('\nCOMS in TRAIN SET:', self.com_ids, self.subset_com_freq_sum[TRAIN_SET]),
                ('\nCOMS in TEST SET:', self.com_ids, self.subset_com_freq_sum[TEST_SET]),
            )
            for header, ids, freqs in listings:
                print(header)
                print([name for name, freq in zip(ids.keys(), freqs) if freq > 0])
            return

        tables = (
            ('\nATOMS; TRAIN SET, TEST SET:', self.atom_ids, self.subset_atom_freq_sum),
            ('\nCOMPOUNDS; TRAIN SET, TEST SET:', self.com_ids, self.subset_com_freq_sum),
        )
        for header, ids, freq_sums in tables:
            print(header)
            for name, freq_train, freq_test in zip(ids.keys(),
                                                   freq_sums[TRAIN_SET],
                                                   freq_sums[TEST_SET]):
                if print_all or freq_train > 0 or freq_test > 0:
                    print(f'{name}: {freq_train} {freq_test}')

    def divide_corpus(self):
        """Divide the corpus into train and test sets."""
        raise NotImplementedError('This method must be implemented in a subclass.')

    def get_divergences(self):
        """Returns the current atom and compound divergences.

        Atom divergence uses alpha=0.5 and compound divergence alpha=0.1,
        both between the normalised train and test frequency sums.
        """
        atomdiv = 1 - chernoff_coef(
            normalize_vector(self.subset_atom_freq_sum[TRAIN_SET]),
            normalize_vector(self.subset_atom_freq_sum[TEST_SET]), 0.5)
        comdiv = 1 - chernoff_coef(
            normalize_vector(self.subset_com_freq_sum[TRAIN_SET]),
            normalize_vector(self.subset_com_freq_sum[TEST_SET]), 0.1)
        return atomdiv, comdiv

    def get_subset_indices(self, subset):
        """Returns the indices of the given subset as an (n, 1) tensor."""
        return (self.subset_indices == subset).nonzero()
class FromEmptySets(DivideTrainTest):
    """Divide sample set to train and test sets, starting from empty sets.

    The train set is seeded with one random sentence. divide_corpus() then
    greedily moves one sentence per iteration from the discard set to the
    train or the test set, choosing the candidate whose addition brings the
    atom and compound divergences closest to their targets.
    """

    def __init__(self, **kwargs):
        """Load the data (see DivideTrainTest) and seed the train set."""
        super().__init__(**kwargs)
        print('Initialising the matrices...')

        # TODO: lose the matrices on CPU and use only the GPU ones
        # move the full matrices to the GPU (if available)
        self.com_freq_matrix = self.com_freq_matrix_full.clone().detach().to(device)
        self.atom_freq_matrix = self.atom_freq_matrix_full.clone().detach().to(device)

        # used_ids_mask contains -inf for sentences that are used, 0 otherwise
        self.used_ids_mask = torch.zeros(self.n_matrix_rows, device=device)

        if self.do_subsample:
            # initialise the random subsample indices (candidate row -> corpus row)
            self.random_idxs, _, _ = self._get_random_subsample(DISCARD_SET)

        # initialize train set with one random sample
        self._add_sample_to_set(TRAIN_SET, random.randrange(self.n_matrix_rows))

        if not self.do_subsample:
            # Without subsampling nothing ever refreshes the candidate
            # matrices, so fill them once: the candidates are all sentences
            # in corpus order.
            dense_com = self.com_freq_matrix.to_dense()
            dense_atom = self.atom_freq_matrix.to_dense()
            for to_set in (TRAIN_SET, TEST_SET):
                self.candidate_com_sums[to_set] = \
                    self.subset_com_freq_sum[to_set] + dense_com
                self.candidate_atom_sums[to_set] = \
                    self.subset_atom_freq_sum[to_set] + dense_atom

        print('Initialisation done. Initialised the train set with one random sentence.')

    def _get_random_subsample(self, subset_id):
        """Take a random subsample of the sentences in the given subset.

        Returns the corpus-level indices of the drawn sentences and the dense
        atom and compound frequency rows of those sentences. At most
        self.subsample_size sentences are drawn when subsampling is enabled.
        """
        # reshape(-1) keeps a 1D tensor even when the subset has a single
        # member (squeeze() would collapse it to 0-d)
        subset_indices = self.get_subset_indices(subset_id).reshape(-1)
        random_subsample_indices = subset_indices[
            torch.randperm(subset_indices.shape[0], device=device)]
        if self.do_subsample:
            random_subsample_indices = random_subsample_indices[:self.subsample_size]
        random_atom = self.atom_freq_matrix.index_select(0, random_subsample_indices).to_dense()
        random_com = self.com_freq_matrix.index_select(0, random_subsample_indices).to_dense()
        return random_subsample_indices, random_atom, random_com

    def _subsample(self, from_set, to_sets):
        """Take a random subsample of sentences in from_set, of size self.subsample_size.
        Update candidate_com_sums and candidate_atom_sums with the new subsample.
        The used-candidate mask is reset because all drawn sentences are unused."""
        self.random_idxs, random_atom, random_com = self._get_random_subsample(from_set)
        self.used_ids_mask = torch.zeros(self.subsample_size, device=device)
        for to_set in to_sets:
            # a new matrix that has the freq sums with each new sample and the new random subset
            self.candidate_com_sums[to_set] = self.subset_com_freq_sum[to_set] + random_com
            self.candidate_atom_sums[to_set] = self.subset_atom_freq_sum[to_set] + random_atom

    def _best_sentence(self, atom_div_vec, com_div_vec, mask=None) -> dict:
        """Return the argmax and max score, and the compound and atom divergences of
        the sample that maximises the score. Score is the linear combination of
        the negated differences between the target divergences and actual divergences.
        Optionally, a mask can be applied to the score vector, e.g. to avoid selecting
        sentences that have already been selected."""
        scores = get_scores(atom_div_vec, com_div_vec, self.target_atom_div, self.target_com_div)
        if mask is not None:
            scores += mask
        best_idx = torch.argmax(scores)
        return {'idx': best_idx,
                'score': scores[best_idx],
                'atomdiv': atom_div_vec[best_idx],
                'comdiv': com_div_vec[best_idx]}

    def _add_sample_to_set(self, subset, selected_idx, from_set=None):
        """Update the data structs after a new sample has been selected to a subset.

        `selected_idx` is a candidate-row index; with subsampling it is mapped
        to the corpus-level index through self.random_idxs. If `from_set` is
        given, the sample's frequencies are also subtracted from that set.

        Returns the dense compound and atom frequency rows of the selected
        sentence, so that callers can update the candidate sums with the rows
        of the sentence that was actually added.
        """
        # mark the selected index as used
        self.used_ids_mask[selected_idx] = MASK_VALUE

        # if we are using a subsample, we need to map the selected index to the original
        if self.do_subsample:
            selected_idx = self.random_idxs[selected_idx]

        self.subset_indices[selected_idx] = subset
        selected_com_row = self.com_freq_matrix[selected_idx].to_dense()
        selected_atom_row = self.atom_freq_matrix[selected_idx].to_dense()
        self.subset_com_freq_sum[subset] += selected_com_row
        self.subset_atom_freq_sum[subset] += selected_atom_row
        if from_set is not None:
            self.subset_com_freq_sum[from_set] -= selected_com_row
            self.subset_atom_freq_sum[from_set] -= selected_atom_row
        return selected_com_row, selected_atom_row

    def _candidate_divergences(self, subset_id: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Compute the atom and compound divergences that would result from adding
        each candidate sentence to the given subset, measured against the current
        frequency sums of the other subset."""
        if subset_id == TRAIN_SET:
            the_other_set_id = TEST_SET
            com_alpha = 0.1
        else:
            the_other_set_id = TRAIN_SET
            com_alpha = 0.9  # because test set is the first argument to chernoff_coef()

        # The atom divergence is 1 - Chernoff coefficient, with alpha=0.5.
        atom_div_add_to_subset = mat_vec_divergence(
            self.candidate_atom_sums[subset_id],
            self.subset_atom_freq_sum[the_other_set_id], 0.5)

        # The compound divergence is 1 - Chernoff coefficient, with alpha=0.1.
        com_div_add_to_subset = mat_vec_divergence(
            self.candidate_com_sums[subset_id],
            self.subset_com_freq_sum[the_other_set_id], com_alpha)

        return atom_div_add_to_subset, com_div_add_to_subset

    def divide_corpus(self,
                      target_com_div=1.0,
                      min_test_percent=0.05,
                      max_test_percent=0.3,
                      select_n_samples=None,
                      max_iters=None,
                      print_every=10000,
                      save_cp=100000,
                      output_dir=".",
                      ):
        """Divide data into train and test sets. At each iteration, select the sample from
        sample_matrix that maximises the score.

        Args:
            target_com_div: compound divergence to aim for (atom divergence
                always targets 0).
            min_test_percent, max_test_percent: bounds on
                test_size / (train_size + test_size); outside the bounds the
                next sample is forced into the set that restores them.
            select_n_samples: number of sentences to assign; capped so that a
                full subsample can always be drawn.
            max_iters: stop after the iteration whose zero-based index equals
                this value (None: no limit).
            print_every, save_cp: report / write a checkpoint every n iterations.
            output_dir: directory of the checkpoint and final id files.
        """
        self.target_com_div = target_com_div
        self.target_atom_div = 0.0

        # not using the last, incomplete subsample
        # TODO: fix this to use all sentences
        if self.do_subsample:
            max_selectable = self.n_samples - self.subsample_size
        else:
            max_selectable = self.n_samples - 1
        if select_n_samples is None or select_n_samples > max_selectable:
            print('Warning: using all samples in corpus.')
            select_n_samples = max_selectable

        best_values = {}
        train_size = self.get_subset_indices(TRAIN_SET).size()[0]
        test_size = self.get_subset_indices(TEST_SET).size()[0]
        # defined up front so the report helpers are safe if the loop never runs
        selected_set = None
        i = -1

        def _print_iteration():
            print(f'After iteration {i+1}: Train set size {train_size}; '
                  f'Test set size {test_size}. '
                  f'Compound divergence {float(best_values[selected_set]["comdiv"])}; '
                  f'Atom divergence {float(best_values[selected_set]["atomdiv"])}')

        def _subset_sent_ids(subset):
            return [self.sent_ids[ind]
                    for ind in self.get_subset_indices(subset).reshape(-1).tolist()]

        def _save_division():
            train_set_out, test_set_out = cp_file_names(
                output_dir, i,
                float(best_values[selected_set]["atomdiv"]),
                float(best_values[selected_set]["comdiv"]))
            self.write_ids_to_file(_subset_sent_ids(TRAIN_SET), train_set_out)
            self.write_ids_to_file(_subset_sent_ids(TEST_SET), test_set_out)

        print('Starting division. '
              f'Train set size: {train_size}, '
              f'Test set size: {test_size}.')
        print(f'Dividing {select_n_samples} sentences...')

        for i in tqdm(range(select_n_samples)):
            test_percent = test_size / (train_size + test_size)

            # First check the size constraints; otherwise both sets compete.
            if test_percent > max_test_percent:
                eligible_sets = (TRAIN_SET,)
            elif test_percent < min_test_percent:
                eligible_sets = (TEST_SET,)
            else:
                eligible_sets = (TRAIN_SET, TEST_SET)

            for subset in eligible_sets:
                best_values[subset] = self._best_sentence(
                    *self._candidate_divergences(subset), mask=self.used_ids_mask)

            if len(eligible_sets) == 1:
                selected_set = eligible_sets[0]
            elif best_values[TRAIN_SET]['score'] > best_values[TEST_SET]['score']:
                selected_set = TRAIN_SET
            else:
                selected_set = TEST_SET

            selected_idx = best_values[selected_set]['idx']
            added_com_row, added_atom_row = self._add_sample_to_set(selected_set, selected_idx)
            if selected_set == TRAIN_SET:
                train_size += 1
            else:
                test_size += 1

            # Only re-subsample while a full subsample can still be drawn;
            # a short subsample would not fit the fixed-size candidate matrices.
            n_unused = self.n_samples - train_size - test_size
            if self.do_subsample and i % self.subsample_iter == 0 \
                    and n_unused >= self.subsample_size:
                self._subsample(DISCARD_SET, [TRAIN_SET, TEST_SET])
            else:
                # Add the rows of the sentence that was actually selected.
                # selected_idx is a candidate-row index and must not be used
                # to index the full matrices when subsampling.
                self.candidate_com_sums[selected_set] += added_com_row
                self.candidate_atom_sums[selected_set] += added_atom_row

            if i % print_every == 0:
                _print_iteration()
            if i % save_cp == 0:
                _save_division()
            # NOTE(review): i is zero-based, so this runs max_iters + 1 iterations.
            if i == max_iters:
                break

        print('Division complete.')
        if selected_set is None:
            # no sentence was selected, so there are no divergences to report
            return
        _print_iteration()
        _save_division()
def create_parser():
    """Build the command-line argument parser of the script."""
    option_specs = (
        ("--data-dir", {"type": str, "default": None,
                        "help": "Path to the directory containing the data files."}),
        ("--min-test-percent", {"type": float, "default": 0.05,
                                "help": "Minimum ratio of test set size to train set size."}),
        ("--max-test-percent", {"type": float, "default": 0.3,
                                "help": "Maximum ratio of test set size to train set size."}),
        ("--subsample-size", {"type": int, "default": None}),
        ("--subsample-iter", {"type": int, "default": None,
                              "help": "Subsample set every n iterations."}),
        ("--compound-divergences", {"nargs": "*", "type": float, "default": [1.0]}),
        ("--leave-out", {"type": float, "default": 0.0}),
        ("--max-iters", {"type": int, "default": None}),
        ("--random-seed", {"type": int, "default": 1234}),
        ("--print-every", {"type": int, "default": 1000}),
        ("--save-cp", {"type": int, "default": 50000,
                       "help": "Write test and train sets to txt file at every n iteration"}),
    )
    parser = argparse.ArgumentParser()
    # options are registered in table order, which is also the --help order
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser
def launch_from_empty_sets(args):
    """Run the greedy algorithm once per requested compound divergence.

    Each target divergence gets its own output directory under args.data_dir
    and its own FromEmptySets instance. The divider keeps its train/test
    assignment after divide_corpus(), so reusing one instance would start
    every later target from sets that are already filled.

    Exits the program if an output directory already exists.
    """
    for compound_div in args.compound_divergences:
        output_dir_name = f'{args.data_dir}/comdiv{compound_div}' \
            + f'_seed{args.random_seed}' \
            + f'_subsample{args.subsample_size}every{args.subsample_iter}iters' \
            + f'_testsize{args.min_test_percent}to{args.max_test_percent}' \
            + f'_leaveout{args.leave_out}'
        # check before loading the data, which is the expensive part
        if path.isdir(output_dir_name):
            sys.exit('Output directory already exists. Exiting.')

        divide_train_test = FromEmptySets(
            data_dir=args.data_dir,
            subsample_size=args.subsample_size,
            subsample_iter=args.subsample_iter,
        )
        # one sentence is already placed in the train set by the constructor
        use_n_samples = int((1 - args.leave_out) * divide_train_test.n_samples) - 1

        makedirs(output_dir_name)
        print('Dividing the corpus, writing to', output_dir_name)
        divide_train_test.divide_corpus(
            target_com_div=compound_div,
            min_test_percent=args.min_test_percent,
            max_test_percent=args.max_test_percent,
            select_n_samples=use_n_samples,
            print_every=args.print_every,
            max_iters=args.max_iters,
            save_cp=args.save_cp,
            output_dir=output_dir_name,
        )
        divide_train_test.print_subset_atoms_and_compounds()
def main():
    """Parse the command-line arguments, seed the random generators and run
    the division."""
    args = create_parser().parse_args()
    random.seed(args.random_seed)
    # The subsampling uses torch.randperm, so torch has to be seeded as well
    # for --random-seed to make runs reproducible.
    torch.manual_seed(args.random_seed)
    launch_from_empty_sets(args)


if __name__ == "__main__":
    main()