-
Notifications
You must be signed in to change notification settings - Fork 0
/
makefile.train_test_eval
171 lines (131 loc) · 6.74 KB
/
makefile.train_test_eval
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# Recipes in this file use bash-only syntax (e.g. shell arrays), so bash is forced as the recipe shell.
SHELL:=/bin/bash
# Shared framework definitions. ML_FRAMEWORK_DIR is not defined in this file and must come from the
# caller or the environment.
# NOTE(review): LOG_INFO, used by the recipes below, is presumably defined in makefile.common - confirm.
include $(ML_FRAMEWORK_DIR)/makefile.common
#--------------------------------------- HELP --------------------------------------------------------
# Usage text printed by `make help`.
# The text is exported into the environment and echoed by the shell (not expanded by make in the
# recipe), so quotes and other shell-special characters inside it are printed verbatim.
# `$$` inside the definition yields a literal `$` in the printed text.
define HELP_TEXT
Usage: make [train | test | eval | clean]
- train: creates a model from the training data
* performs preprocess before
- test: label the testing data using a trained model
* performs train before
- eval: calculates the score on the testing data
* performs test before
Parameters:
CONFIG_FILE : all parameters can be defined also in a config file;
the value set as a command argument overrides the value set in the config file
TRAIN_DATA : a path to the training data; multiple files can be specified by a wildcard (for the time being, it does not accept space or comma-separated files)
TEST_DATA : a path to the testing data; multiple files can be specified by a wildcard (for the time being, it does not accept space or comma-separated files)
RUN_DIR : a directory where all intermediate files are stored
ML_METHOD : a ML method to be used (default = maxent)
ML_PARAMS : additional ML parameters (default = )
MODEL_DIR : a directory with the model (default = $$RUN_DIR/model)
RESULT_DIR : a directory with the result (default = $$RUN_DIR/result)
endef
export HELP_TEXT
# `help` is a command, not a file: declare it phony so that a file named "help" cannot shadow it.
.PHONY: help
help :
	@echo "$$HELP_TEXT"
#----------- data ----------------
# TRAIN_DATA and TEST_DATA have no default here; they are used below as prerequisites of the
# training and testing rules, so they have to be supplied from outside this block.
#TRAIN_DATA
#TEST_DATA
#------------------------------------------- ML ---------------------------------------------------
# Default ML method. Its value is part of the MODEL_FILE and RESULT_FILE names and thereby selects
# which of the train/test rules below is applied.
ML_METHOD=maxent
#ML_METHOD=vw
#ML_METHOD=sklearn.decision_trees
# Default extra parameters when the method is exactly "vw".
# NOTE(review): this conditional is evaluated before makefile.config is included, so an ML_METHOD=vw
# that comes only from a config file presumably does not trigger this default - confirm.
ifeq ($(ML_METHOD),vw)
ML_PARAMS=--passes 20
endif
# Path to the Vowpal Wabbit executable used by the vw rules (site-specific absolute path).
#VW_APP=/net/work/people/mnovak/tools/x86_64/vowpal_wabbit/vowpalwabbit/vw
VW_APP=/net/cluster/TMP/mnovak/tools/vowpal_wabbit/vowpalwabbit/vw
#-------- directories --------------
# Locations of the helper scripts, of the trained models and of the prediction results.
# The assignments are deliberately recursive (=): MODEL_DIR and RESULT_DIR follow whatever value
# RUN_DIR has at the time they are used.
SCRIPT_DIR=$(ML_FRAMEWORK_DIR)/scripts
MODEL_DIR=$(RUN_DIR)/model
RESULT_DIR=$(RUN_DIR)/result
#========== reading variables from a config file ================
# this must be placed after all definitions of public variables
include $(ML_FRAMEWORK_DIR)/makefile.config
# Create the output directories on demand.
# The rule comes after the config include on purpose: make expands target and prerequisite names
# while reading the rule, so values of RUN_DIR, MODEL_DIR or RESULT_DIR set by the config file must
# already be in effect here; otherwise the rule would be registered for different paths than the
# ones the train/test rules below ask for.
# RUN_DIR is an order-only prerequisite: it must exist, but its timestamp never forces a re-run.
# `mkdir -p` keeps the recipe from failing when the directory already exists (e.g. created by a
# concurrent make run).
$(MODEL_DIR) $(RESULT_DIR) : | $(RUN_DIR)
	mkdir -p $@
#----------- skuska --------------
# Scratch target for experiments ("skuska" is presumably Slovak for "trial").
# It builds a bash array, drops its first two elements and echoes the rest, i.e. it prints "--f log".
# The three lines are joined into one shell invocation so that the array survives between them.
# It produces no file, hence it is declared phony.
.PHONY: skuska
skuska :
	skuska=(--csoaa_ldf mc --f log); \
	skuska=($${skuska[@]:2}); \
	echo $${skuska[@]}
# Further debugging output that can be enabled by moving the lines into the recipe above:
#	@echo -e "[$$(date '+%F %T')]\tasd"
#	@echo $(ML_METHOD)
#	@echo $(ML_PARAMS)
#	@echo $(RUN_DIR)
#	@echo $(TEST_DATA)
#	@echo $(TRAIN_DATA)
#	@echo $(FEAT_LIST)
#	@echo $(abspath $(TRAIN_DATA))
#----------- files ---------------
# Short names ("stems") derived from the data paths by the file_stem.pl helper script.
# They are computed once (:=), so the script runs a single time per make invocation for each of them.
TRAIN_DATA_STEM := $(shell $(SCRIPT_DIR)/file_stem.pl "$(TRAIN_DATA)")
TEST_DATA_STEM := $(shell $(SCRIPT_DIR)/file_stem.pl "$(TEST_DATA)")
# The model is named after the training data and the ML method, the result file after the testing
# data, the training data and the ML method. These names decide which rule below gets applied.
MODEL_FILE = $(MODEL_DIR)/$(TRAIN_DATA_STEM).$(ML_METHOD).model
RESULT_FILE = $(RESULT_DIR)/$(TEST_DATA_STEM).$(TRAIN_DATA_STEM).$(ML_METHOD).res
# Print the stem of the training data. A command, not a file, hence phony.
.PHONY: stem
stem :
	echo $(TRAIN_DATA_STEM)
#----------------------------------------- TRAIN --------------------------------------------------------------
# `train` is only an alias for the model file of the selected ML method. It is declared phony so
# that a file or directory called "train" in the working directory cannot be mistaken for it.
.PHONY: train
train : $(MODEL_FILE)
# Maximum entropy model: all (gzipped) training files are decompressed and streamed into the training
# script, which receives the path of the model to be created as its only argument.
# MODEL_DIR is order-only: it has to exist, but changes in it never trigger re-training.
$(MODEL_DIR)/$(TRAIN_DATA_STEM).maxent.model : $(TRAIN_DATA) | $(MODEL_DIR)
	$(LOG_INFO) "Training a maxent model: $^ => $@" >&2
	zcat $^ | $(SCRIPT_DIR)/maxent.train.pl $@
# Vowpal Wabbit one-against-all model.
# - The second tab-separated column of the data is removed before it is passed to VW.
# - The number of classes for --oaa is the numerically largest first space-separated field of the data.
# - The cache file carries the PID of the recipe shell ($$$$ -> shell $$) in its name, so that
#   concurrent runs do not share it; it is removed whether or not the training succeeds.
# - The exit status of the training pipeline is saved and returned. Chaining the clean-up with a
#   plain `;` would make the recipe succeed whenever `rm` succeeds, even if VW has failed.
# - On failure a possibly incomplete model is removed, so that it cannot be taken for an
#   up-to-date target by the next run.
$(MODEL_DIR)/$(TRAIN_DATA_STEM).vw.model : $(TRAIN_DATA) | $(MODEL_DIR)
	$(LOG_INFO) "Training a VW-oaa model: $^ => $@" >&2
	cache=$(MODEL_DIR)/$(TRAIN_DATA_STEM).vw.$$$$.cache; \
	zcat $^ | cut -f2 --complement | $(VW_APP) -f $@ -b 20 \
		--oaa `zcat $^ | cut -f 1 -d' ' | sort -n | tail -n1` $(ML_PARAMS) \
		--loss_function logistic \
		--holdout_off \
		-k --cache_file $$cache; \
	status=$$?; \
	rm -f $$cache; \
	[ $$status -eq 0 ] || rm -f $@; \
	exit $$status
# Vowpal Wabbit ranking model (csoaa_ldf).
# - The second tab-separated column of the data is removed and the rest is converted by
#   shared_to_nonshared.pl before it is passed to VW.
# - ML_PARAMS directly follows --csoaa_ldf on the command line.
#   NOTE(review): it presumably has to start with the csoaa_ldf mode (e.g. "mc") - confirm.
# - The cache file carries the PID of the recipe shell ($$$$ -> shell $$) in its name; it is removed
#   whether or not the training succeeds.
# - The exit status of the training pipeline is saved and returned. Chaining the clean-up with a
#   plain `;` would make the recipe succeed whenever `rm` succeeds, even if VW has failed.
# - On failure a possibly incomplete model is removed, so that it cannot be taken for an
#   up-to-date target by the next run.
$(MODEL_DIR)/$(TRAIN_DATA_STEM).vw.ranking.model : $(TRAIN_DATA) | $(MODEL_DIR)
	$(LOG_INFO) "Training a VW-csoaa_ldf (ranking) model: $^ => $@" >&2
	cache=$(MODEL_DIR)/$(TRAIN_DATA_STEM).vw.ranking.$$$$.cache; \
	zcat $^ | cut -f2 --complement | $(SCRIPT_DIR)/shared_to_nonshared.pl | $(VW_APP) -f $@ -b 27 \
		--csoaa_ldf $(ML_PARAMS) \
		--holdout_off \
		-k --cache_file $$cache; \
	status=$$?; \
	rm -f $$cache; \
	[ $$status -eq 0 ] || rm -f $@; \
	exit $$status
# Scikit-learn classifier. The pattern stem ($*) is the part of the model name between "sklearn."
# and ".model"; it is handed to the training script as its first argument, followed by ML_PARAMS
# (as one quoted argument) and the path of the model to be written.
$(MODEL_DIR)/$(TRAIN_DATA_STEM).sklearn.%.model : $(TRAIN_DATA) | $(MODEL_DIR)
	$(LOG_INFO) "Training a ScikitLearn ($*) model: $^ => $@" >&2
	zcat $^ \
		| $(SCRIPT_DIR)/sklearn.train.py $* "$(ML_PARAMS)" $@
# Scikit-learn ranker. Same as above, except that the second tab-separated column is dropped, the
# data goes through shared_to_nonshared.pl and the training script is run with --ranking.
$(MODEL_DIR)/$(TRAIN_DATA_STEM).sklearn-ranking.%.model : $(TRAIN_DATA) | $(MODEL_DIR)
	$(LOG_INFO) "Training a ranking ScikitLearn ($*) model: $^ => $@" >&2
	zcat $^ \
		| cut -f2 --complement \
		| $(SCRIPT_DIR)/shared_to_nonshared.pl \
		| $(SCRIPT_DIR)/sklearn.train.py --ranking $* "$(ML_PARAMS)" $@
# Utility targets for the training stage; neither of them creates a file, so both are phony.
.PHONY: model_path clean_train
# Print the path of the model file for the current settings.
model_path:
	echo $(MODEL_FILE)
# Remove the model file. `rm -f` does not complain if the model has not been built yet, while
# genuine failures (e.g. missing permissions) still stop make.
clean_train:
	rm -f $(MODEL_FILE)
# shuffle before training: $(SCRIPT_DIR)/shuffle.pl -s '^\s*$$' -r 1986
#----------------------------------------- TEST --------------------------------------------------------------
# `test` is only an alias for the result file of the selected ML method. It is declared phony so
# that a file or directory called "test" in the working directory cannot be mistaken for it.
.PHONY: test
test : $(RESULT_FILE)
# Maximum entropy predictions.
# - The first prerequisite ($<) is the model. All the remaining prerequisites are testing files:
#   TEST_DATA may be a wildcard matching several files, so every one of them is read
#   ($(wordlist 2,...)), not just the first one.
# - The output is written to a temporary file and renamed only if the pipeline succeeds; a failed
#   run therefore never leaves a truncated result file that would look up to date.
$(RESULT_DIR)/$(TEST_DATA_STEM).$(TRAIN_DATA_STEM).maxent.res : $(MODEL_FILE) $(TEST_DATA) | $(RESULT_DIR)
	$(LOG_INFO) "Predicting labels with a maxent model: $< + $(wordlist 2,$(words $^),$^) => $@" >&2
	zcat $(wordlist 2,$(words $^),$^) | $(SCRIPT_DIR)/maxent.test.pl $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Vowpal Wabbit one-against-all predictions.
# - The first prerequisite ($<) is the model. All the remaining prerequisites are testing files:
#   TEST_DATA may be a wildcard matching several files, so every one of them is read.
# - VW writes its raw predictions into a temporary file whose name contains the PID of the recipe
#   shell ($$$$ -> shell $$). The perl one-liner then rewrites each line: the text before the first
#   dot and the text after the first following space are swapped and joined by a tab.
# - The post-processing runs only if VW has succeeded (&&). The exit status is saved and returned,
#   since a plain `;` chain would report only the status of the final `rm`.
# - The temporary file is always removed; on failure the incomplete result file is removed as well.
$(RESULT_DIR)/$(TEST_DATA_STEM).$(TRAIN_DATA_STEM).vw.res : $(MODEL_FILE) $(TEST_DATA) | $(RESULT_DIR)
	$(LOG_INFO) "Predicting labels with a VW-oaa model: $< + $(wordlist 2,$(words $^),$^) => $@" >&2
	raw=$@.$$$$; \
	zcat $(wordlist 2,$(words $^),$^) | cut -f2 --complement | $(VW_APP) -t -i $< -p $$raw \
		&& perl -pe '$$_ =~ s/^(.*?)\..*? (.*?)$$/$$2\t$$1/;' < $$raw > $@; \
	status=$$?; \
	rm -f $$raw; \
	[ $$status -eq 0 ] || rm -f $@; \
	exit $$status
# Vowpal Wabbit ranking (csoaa_ldf) predictions.
# - The first prerequisite ($<) is the model. All the remaining prerequisites are testing files:
#   TEST_DATA may be a wildcard matching several files, so every one of them is read.
# - VW writes its raw predictions into a temporary file whose name contains the PID of the recipe
#   shell ($$$$ -> shell $$). sed then blanks every line starting with a space and writes the
#   result file. The content is the same as editing the VW output in place, but the target only
#   comes into existence once VW has succeeded.
# - The exit status is saved and returned; the temporary file is always removed and on failure the
#   incomplete result file is removed as well, so it cannot be taken for an up-to-date target.
$(RESULT_DIR)/$(TEST_DATA_STEM).$(TRAIN_DATA_STEM).vw.ranking.res : $(MODEL_FILE) $(TEST_DATA) | $(RESULT_DIR)
	$(LOG_INFO) "Predicting labels with a VW-csoaa_ldf (ranking) model: $< + $(wordlist 2,$(words $^),$^) => $@" >&2
	raw=$@.$$$$; \
	zcat $(wordlist 2,$(words $^),$^) | cut -f2 --complement | $(SCRIPT_DIR)/shared_to_nonshared.pl \
		| $(VW_APP) -t -i $< -p $$raw --loss_function=logistic --probabilities \
		&& sed 's/^ .*$$//' $$raw > $@; \
	status=$$?; \
	rm -f $$raw; \
	[ $$status -eq 0 ] || rm -f $@; \
	exit $$status
# Scikit-learn predictions (classifier and ranker).
# - The pattern stem ($*) is the part of the result name between "sklearn." / "sklearn-ranking."
#   and ".res"; it is used in the log message only.
# - The first prerequisite ($<) is the model. All the remaining prerequisites are testing files:
#   TEST_DATA may be a wildcard matching several files, so every one of them is read.
# - The output is written to a temporary file and renamed only if the pipeline succeeds; a failed
#   run therefore never leaves a truncated result file that would look up to date.
$(RESULT_DIR)/$(TEST_DATA_STEM).$(TRAIN_DATA_STEM).sklearn.%.res : $(MODEL_FILE) $(TEST_DATA) | $(RESULT_DIR)
	$(LOG_INFO) "Predicting labels with a ScikitLearn ($*): $< + $(wordlist 2,$(words $^),$^) => $@" >&2
	zcat $(wordlist 2,$(words $^),$^) | $(SCRIPT_DIR)/sklearn.test.py $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# The ranker additionally drops the second tab-separated column, converts the data by
# shared_to_nonshared.pl and runs the testing script with --ranking.
$(RESULT_DIR)/$(TEST_DATA_STEM).$(TRAIN_DATA_STEM).sklearn-ranking.%.res : $(MODEL_FILE) $(TEST_DATA) | $(RESULT_DIR)
	$(LOG_INFO) "Predicting labels with a ScikitLearn ranker ($*): $< + $(wordlist 2,$(words $^),$^) => $@" >&2
	zcat $(wordlist 2,$(words $^),$^) | cut -f2 --complement | $(SCRIPT_DIR)/shared_to_nonshared.pl | $(SCRIPT_DIR)/sklearn.test.py --ranking $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Utility targets for the testing stage; neither of them creates a file, so both are phony.
.PHONY: clean_test result_path
# Remove the result file. `rm -f` does not complain if the result has not been produced yet, while
# genuine failures (e.g. missing permissions) still stop make.
clean_test:
	rm -f $(RESULT_FILE)
# Print the path of the result file for the current settings.
result_path :
	echo $(RESULT_FILE)
#----------------------------------------- EVAL --------------------------------------------------------------
# Ranking methods need --ranking when the results are converted to triples.
# The decision is based on the ML method only. Looking for "ranking" in the whole result path would
# also fire when the run directory or the name of the data happens to contain that word.
ifeq ($(findstring ranking,$(ML_METHOD)),ranking)
RANK_FLAG=--ranking
endif
# Options passed to eval.pl.
EVAL_FLAG=--acc --prf
# Evaluate the predictions: the result file is converted to triples and scored by eval.pl; the
# scores go to the standard output. No file is created, hence the target is phony - otherwise a file
# called "eval" that is newer than the result file would silently suppress the evaluation.
.PHONY: eval
eval : $(RESULT_FILE)
	$(LOG_INFO) "Evaluating the predicted labelling: $<" >&2
	$(SCRIPT_DIR)/results_to_triples.pl $(RANK_FLAG) < $< | $(SCRIPT_DIR)/eval.pl $(EVAL_FLAG)
#---------------------------------------- CLEAN -------------------------------------------------------
# Remove both the result file and the model file of the current settings.
# Declared phony so that a file called "clean" in the working directory cannot disable the target.
.PHONY: clean
clean : clean_test clean_train