-
Notifications
You must be signed in to change notification settings - Fork 1
/
feature_data.v3.py
95 lines (95 loc) · 3.92 KB
/
feature_data.v3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Feature groups: each key is a group label and each value is the ordered list
# of term-name strings that belong to that group.
# NOTE(review): the strings look like patsy design-matrix column labels, so
# they presumably have to match the fitted model's column names character for
# character -- confirm against the code that consumes this mapping.
feature_dict = {
    "Intercept": [u"Intercept"],
    "Gender": [
        u"C(gender, levels=GENDERS)[T.F]",
        u"C(gender, levels=GENDERS)[T.M]",
    ],
    # One indicator term per listed country level, in the order given here.
    "Affiliation": [
        u"C(source_country, levels=TOP_15_COUNTRIES)[T.%s]" % country
        for country in (
            "UNKNOWN", "UK", "JAPAN", "GERMANY", "FRANCE", "ITALY", "CANADA",
            "CHINA", "AUSTRALIA", "SPAIN", "NETHERLANDS", "SWEDEN", "INDIA",
            "OTHER",
        )
    ],
    # Terms are indexed positionally: [0] through [13].
    "Ethnicity": [
        u"mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[%d]" % level
        for level in range(14)
    ],
    "AuthorAge": [
        u"I(auth_prev_papers == 0)[T.True]",
        u"np.log10(auth_prev_papers + 1)",
    ],
    # NOTE(review): unlike the other count features, the log here has no "+ 1"
    # offset -- confirm that is intentional.
    "SourceCites": [
        u"I(source_ncites == 1)[T.True]",
        u"np.log10(source_ncites)",
        u"I(np.log10(source_ncites) ** 2)",
    ],
    "SourceAuthors": [
        u"I(source_n_authors > 20)[T.True]",
        u"np.log10(np.clip(source_n_authors, 0, 20))",
        u"I(np.log10(np.clip(source_n_authors, 0, 20)) ** 2)",
    ],
    "MeshCounts": [
        u"I(source_n_mesh_ex == 0)[T.True]",
        u"I(sink_n_mesh_ex == 0)[T.True]",
        u"np.log10(source_n_mesh_ex + 1)",
        u"np.log10(sink_n_mesh_ex + 1)",
    ],
    "Journal": [
        u"journal_same[T.True]",
        u"I(jj_sim == 0)[T.True]",
        u"np.log10(jj_sim + 1)",
        u"I(np.log10(jj_sim + 1) ** 2)",
    ],
    "YearSpan": [
        u"I(year_span < 0)[T.True]",
        u"I(year_span == 0)[T.True]",
        u"mf.score_log_1(year_span)",
        u"I(mf.score_log_1(year_span) ** 2)",
    ],
    "SinkCites": [
        u"I(sink_prev_ncites == 0)[T.True]",
        u"np.log10(sink_prev_ncites + 1)",
        u"I(np.log10(sink_prev_ncites + 1) ** 2)",
    ],
    "PubType": [
        u"source_is_journal[T.True]",
        u"source_is_review[T.True]",
        u"source_is_case_rep[T.True]",
        u"source_is_let_ed_com[T.True]",
        u"sink_is_journal[T.True]",
        u"sink_is_review[T.True]",
        u"sink_is_case_rep[T.True]",
        u"sink_is_let_ed_com[T.True]",
    ],
    "Language": [
        u"source_is_eng[T.True]",
        u"sink_is_eng[T.True]",
    ],
    # NOTE(review): only the sink term has a squared companion; the source
    # term does not -- confirm the asymmetry is deliberate.
    "VolumeNovelty": [
        u"np.log10(np.nan_to_num(source_V_novelty) + 1)",
        u"np.log10(np.nan_to_num(sink_V_novelty) + 1)",
        u"I(np.log10(np.nan_to_num(sink_V_novelty) + 1) ** 2)",
    ],
}