-
Notifications
You must be signed in to change notification settings - Fork 13
/
importReuters.php
110 lines (99 loc) · 3.35 KB
/
importReuters.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
<?php
$loader = require 'vendor/autoload.php';
// Elasticsearch client constructed with no arguments (library defaults).
$client = new \Elasticsearch\Client();

// Remove any existing 'reuters' index before recreating it. 'ignore' => 404 is
// passed along - presumably so that a missing index is not reported as an
// error on the first run (confirm against the client version in use).
$client->indices()->delete(['index' => 'reuters', 'ignore' => 404]);

// Alternation of negation words shared by the two char_filter patterns below.
$negationWords = 'never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint';

// Custom analysis components.
// NOTE(review): the 'shingle' token filter and both char filters are declared
// here, but the 'reuters' analyzer below references none of them.
$analysis = [
    'filter' => [
        'shingle' => ['type' => 'shingle'],
    ],
    'char_filter' => [
        // Prefixes the word that precedes a negation word with "~".
        'pre_negs' => [
            'type' => 'pattern_replace',
            'pattern' => '(\\w+)\\s+((?i:' . $negationWords . '))\\b',
            'replacement' => '~$1 $2',
        ],
        // Prefixes the word that follows a negation word with "~".
        'post_negs' => [
            'type' => 'pattern_replace',
            'pattern' => '\\b((?i:' . $negationWords . '))\\s+(\\w+)',
            'replacement' => '$1 ~$2',
        ],
    ],
    'analyzer' => [
        'reuters' => [
            'type' => 'custom',
            'tokenizer' => 'standard',
            'filter' => ['lowercase', 'stop', 'kstem'],
        ],
    ],
];

// Field templates reused in the mapping.
// Analyzed text with term vectors stored.
$analyzedText = [
    'type' => 'string',
    'analyzer' => 'reuters',
    'term_vector' => 'yes',
];
// Same as above, additionally copied into the 'combined' field.
$copiedText = $analyzedText + ['copy_to' => 'combined'];
// Exact-value (not analyzed) string, used for the label fields.
$exactValue = [
    'type' => 'string',
    'index' => 'not_analyzed',
];

// Create the index: one shard, no replicas, and a '_default_' mapping that
// applies to every document type indexed into it.
$indexDefinition = [
    'index' => 'reuters',
    'body' => [
        'settings' => [
            'number_of_shards' => 1,
            'number_of_replicas' => 0,
            'analysis' => $analysis,
        ],
        'mappings' => [
            '_default_' => [
                'properties' => [
                    'title' => $copiedText,
                    'body' => $copiedText,
                    'combined' => $analyzedText,
                    'topics' => $exactValue,
                    'places' => $exactValue,
                ],
            ],
        ],
    ],
];
$client->indices()->create($indexDefinition);
/**
 * Reads one Reuters JSON file and decodes it into an array of documents.
 *
 * @param string $path Path of the JSON file to load.
 *
 * @return array Decoded documents, as associative arrays.
 *
 * @throws \RuntimeException If the file cannot be read or its content does not
 *                           decode to an array.
 */
function readReutersDocuments($path)
{
    // Check up front so a missing file produces a clear error instead of a
    // PHP warning followed by a bulk request without a body.
    if (!is_readable($path)) {
        throw new \RuntimeException(sprintf('Cannot read Reuters data file: %s', $path));
    }

    $json = file_get_contents($path);
    if ($json === false) {
        throw new \RuntimeException(sprintf('Failed to load Reuters data file: %s', $path));
    }

    $documents = json_decode($json, true);
    if (!is_array($documents)) {
        throw new \RuntimeException(sprintf(
            'Reuters data file %s does not contain a JSON array (json error code %d)',
            $path,
            json_last_error()
        ));
    }

    return $documents;
}

/**
 * Bulk-indexes the files reuters-NNN.json for NNN in [$from, $until) into the
 * 'reuters' index under the given document type, one bulk request per file.
 * Prints "\n<file number>" after each file as a progress indicator.
 *
 * @param \Elasticsearch\Client $client  Client used to send the bulk requests.
 * @param string                $dataDir Directory containing the JSON files.
 * @param int                   $from    First file number (inclusive).
 * @param int                   $until   Last file number (exclusive).
 * @param string                $type    Document type to index under.
 *
 * @return void
 *
 * @throws \RuntimeException If a data file is missing or malformed.
 */
function importReutersFiles($client, $dataDir, $from, $until, $type)
{
    for ($i = $from; $i < $until; ++$i) {
        $path = sprintf('%s/reuters-%03d.json', $dataDir, $i);

        // The bulk body alternates an action line with the document it applies to.
        $body = [];
        foreach (readReutersDocuments($path) as $doc) {
            $body[] = ['index' => []];
            $body[] = $doc;
        }

        // An empty file would otherwise produce a bulk request with no body.
        if ($body !== []) {
            $response = $client->bulk([
                'index' => 'reuters',
                'type' => $type,
                'body' => $body,
            ]);

            // Item-level failures are reported in the response rather than
            // raised, so surface them instead of dropping them silently.
            if (is_array($response) && !empty($response['errors'])) {
                trigger_error(
                    sprintf('Bulk request for %s reported item-level errors', $path),
                    E_USER_WARNING
                );
            }
        }

        echo "\n$i";
    }
}

// The data directory sits next to this script.
$dataDir = __DIR__ . '/reuters-21578-json';

// Files 000-016 are indexed as type 'train', files 017-021 as type 'test'.
importReutersFiles($client, $dataDir, 0, 17, 'train');
importReutersFiles($client, $dataDir, 17, 22, 'test');