-
Notifications
You must be signed in to change notification settings - Fork 0
/
self_training.sh
executable file
·90 lines (69 loc) · 4.54 KB
/
self_training.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/bin/bash
#######################################
# Runs iterative self-training: in every iteration the unlabeled data are
# labeled by semisup_iter.sh and the system-labeled output is added to the
# gold training data for the next iteration. After the last iteration a model
# is trained and evaluated on both the training and the test data, and the
# per-iteration numbers are collected into $RUN_DIR/stats.
#
# Globals (read):
#   ML_FRAMEWORK_DIR  directory containing log.sh, semisup_iter.sh,
#                     makefile.train_test_eval and scripts/
#   config_file       passed to semisup_iter.sh (-f) and make (CONFIG_FILE=)
#   params            associative array; keys used here: RUN_DIR,
#                     ITER_COUNT (10 when unset), UNLABELED_SPLIT_SIZE,
#                     UNLABELED_DATA, DELIBLE, TRAIN_DATA, TEST_DATA,
#                     ML_PARAMS, ML_METHOD
#   print_header      function defined outside of this block
# Globals (written):
#   The working variables (run_dir, iter, train_data, prev_iter, ...) are
#   deliberately not declared local, exactly as before, in case callers
#   read them after the call.
# Arguments:
#   None
#######################################
function self_training() {
    "$ML_FRAMEWORK_DIR/log.sh" INFO "Starting self-training..."

    run_dir=${params[RUN_DIR]}
    iter_count=${params[ITER_COUNT]-"10"}
    unlabeled_split_size=${params[UNLABELED_SPLIT_SIZE]}
    unlabeled_data=${params[UNLABELED_DATA]}
    delible=${params[DELIBLE]}
    train_data=${params[TRAIN_DATA]}
    ml_params=${params[ML_PARAMS]}

    ######################## Unlabeled data splitting #########################

    # check if the unlabeled data is a single file (and should be split) or it is multiple files defined by a wildcard
    unlabeled_data=$("$ML_FRAMEWORK_DIR/scripts/data_split.sh" "$unlabeled_data" "$unlabeled_split_size" "$run_dir")
    "$ML_FRAMEWORK_DIR/log.sh" DEBUG "Unlabeled data stored in: $unlabeled_data"

    # count the number of instances (so far only for the ranking-style data)
    "$ML_FRAMEWORK_DIR/log.sh" INFO "Counting the number of instances in UNLABELED_DATA. This might take a few minutes..."
    unlabeled_base=$(basename "$unlabeled_data")
    # $unlabeled_data is intentionally unquoted: it may be a wildcard that
    # has to expand to the individual parts.
    for file in $unlabeled_data; do
        zcat "$file" | "$ML_FRAMEWORK_DIR/scripts/count_ranking_instances.pl" >> "$run_dir/data/instances_per_part.$unlabeled_base"
    done

    ######################## Self-training iterations ##########################

    # No previous iteration exists before the first one; reset explicitly so
    # that a stale value from an earlier call in the same shell is not reused.
    prev_iter=""

    # iterations
    for (( i=0; i<iter_count; i++ )); do
        iter=$(printf "%03d" "$i")
        mkdir -p "$run_dir/iter_$iter"
        echo "$iter" > "$run_dir/iter_$iter/stats"

        "$ML_FRAMEWORK_DIR/semisup_iter.sh" -f "$config_file" \
            TRAIN_DATA="$train_data" \
            TESTED_TRAIN_DATA="${params[TRAIN_DATA]}" \
            ML_PARAMS="$ml_params" \
            ITER="$i" \
            UNLABELED_PART_SIZES="$run_dir/data/instances_per_part.$unlabeled_base" \
            RUN_DIR="$run_dir/iter_$iter"

        # Two separate tests instead of "-a": the numeric comparison must not
        # be evaluated (and complain) when DELIBLE is empty.
        if [ -n "$delible" ] && [ "$delible" -ne 0 ]; then
            system_labeled_data=$run_dir/iter_$iter/data/$unlabeled_base
            "$ML_FRAMEWORK_DIR/log.sh" INFO "Delible; using gold-labeled training data and $system_labeled_data as a training data for the next iteration."
            #init_model=`make -s -f $ML_FRAMEWORK_DIR/makefile.train_test_eval model_path CONFIG_FILE=$config_file RUN_DIR=$run_dir/iter_000`
            #$ML_FRAMEWORK_DIR/log.sh INFO "Delible; using gold-labeled model $init_model as an initial model for the next iteration."
        else
            # Cumulate: previous cumulated data (if any) + this iteration's output.
            cumulated_inputs=()
            if [ -n "$prev_iter" ]; then
                prev_iter_system_labeled_data=$run_dir/iter_$prev_iter/data/all.system_labeled.table
                cumulated_inputs+=("$prev_iter_system_labeled_data")
            fi
            # $unlabeled_base is intentionally unquoted: it may contain a
            # wildcard matching several labeled parts.
            cumulated_inputs+=("$run_dir/iter_$iter/data/"$unlabeled_base)
            system_labeled_data=$run_dir/iter_$iter/data/all.system_labeled.table
            zcat "${cumulated_inputs[@]}" | gzip -c > "$system_labeled_data"
            "$ML_FRAMEWORK_DIR/log.sh" INFO "Not delible; using gold-labeled training data and cumulated data in $system_labeled_data as a training data for the next iteration."
            #init_model=`make -s -f $ML_FRAMEWORK_DIR/makefile.train_test_eval model_path CONFIG_FILE=$config_file RUN_DIR=$run_dir/iter_$iter TRAIN_DATA=$train_data`
            #$ML_FRAMEWORK_DIR/log.sh INFO "Not delible; using cumulated model $init_model as an initial model for the next iteration."
        fi

        train_data="${params[TRAIN_DATA]} $system_labeled_data"
        #train_data=$run_dir/iter_$iter/data/`basename "$unlabeled_data"`
        prev_iter=$iter
    done

    # the final iteration - just training and testing
    iter=$(printf "%03d" "$iter_count")
    mkdir -p "$run_dir/iter_$iter"
    echo "$iter" > "$run_dir/iter_$iter/stats"
    # evaluated twice: first on the training data, then on the test data
    make -s -f "$ML_FRAMEWORK_DIR/makefile.train_test_eval" eval CONFIG_FILE="$config_file" RUN_DIR="$run_dir/iter_$iter" TRAIN_DATA="$train_data" TEST_DATA="${params[TRAIN_DATA]}" ML_PARAMS="$ml_params" >> "$run_dir/iter_$iter/stats"
    make -s -f "$ML_FRAMEWORK_DIR/makefile.train_test_eval" eval CONFIG_FILE="$config_file" RUN_DIR="$run_dir/iter_$iter" TRAIN_DATA="$train_data" TEST_DATA="${params[TEST_DATA]}" ML_PARAMS="$ml_params" >> "$run_dir/iter_$iter/stats"

    ############################ Collecting statistics #########################

    echo -e "ML_METHOD:\t" ${params[ML_METHOD]} ${params[ML_PARAMS]} > "$run_dir/stats"

    # collecting numbers: one column per iteration
    paste "$run_dir"/iter_*/stats > "$run_dir/stats.numbers"

    # a header used for iter results
    echo "ITER" > "$run_dir/stats.header"
    print_header "$run_dir/stats.numbers" "TRAIN" "TEST" >> "$run_dir/stats.header"

    paste "$run_dir/stats.header" "$run_dir/stats.numbers" >> "$run_dir/stats"
    # terminate every line of the final table with '|'
    sed -i 's/$/|/' "$run_dir/stats"
    rm "$run_dir/stats.header" "$run_dir/stats.numbers"
}