-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.h
458 lines (381 loc) · 14.5 KB
/
main.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
#ifndef __main_h__
#define __main_h__
#include <mysql.h>
#include <curl/curl.h>
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <sstream>
#include <chrono>
#include <thread>
#include <mutex>
#include "json.hpp"
#include <stdio.h>
#include <string.h>
#include <locale.h>
#include <sys/time.h>
using namespace std ;
using json = nlohmann::json;
// ---- Compile-time constants -------------------------------------------------
// NOTE(review): the meaning of most of these is inferred from their names only;
// confirm against the translation units that use them.

// Debug switch; 0 here. NOTE(review): presumably enables extra diagnostics when non-zero.
#define DEBUG_OUTPUT 0
// Namespace value used as the default for TPageMetadata::ns and for the TPage constructor.
#define NS_UNKNOWN -999
// NOTE(review): presumably the "no item" value for Wikidata item numbers — confirm.
#define UNKNOWN_WIKIDATA_ITEM 0
// NOTE(review): presumably a batch size for database page queries — confirm.
#define DB_PAGE_BATCH_SIZE 1000000
// NOTE(review): presumably the MediaWiki "File" namespace number — confirm.
#define NS_FILE 6
// NOTE(review): presumably a cap on how much of a query is echoed in output — confirm.
#define MAX_QUERY_OUTPUT_LENGTH 2000
// NOTE(review): presumably a cap on the number of rows rendered as HTML — confirm.
#define MAX_HTML_RESULTS 10000
// ---- Forward declarations (both classes are defined further down) -----------
class TPlatform ;
class TPageList ;
// ---- Globals defined in another translation unit ----------------------------
extern vector <string> file_data_keys ;
extern TPlatform *root_platform ;
// NOTE(review): by its name this mutex guards root_platform; the locking itself is not visible here.
extern std::mutex g_root_platform_mutex;
// ---- Free helper functions --------------------------------------------------
// Only the prototypes are visible in this header; the definitions live
// elsewhere, so the notes below are limited to what the signatures show.

// String trimming helpers; each takes and returns its string by value.
// NOTE(review): presumably left / right / both-side whitespace trims — confirm.
string ltrim(std::string s) ;
string rtrim(string s) ;
string trim(string s) ;
// Returns a raw char pointer. NOTE(review): ownership of the returned buffer is
// not visible here — confirm who frees it.
char *loadFileFromDisk ( string filename ) ;
string loadAndCacheFileFromDisk ( string filename ) ;
// Writes into `v`; `max` defaults to 0. NOTE(review): presumably splits `input`
// on `delim`, with 0 meaning "no limit" — confirm.
void split ( const string &input , vector <string> &v , char delim , uint32_t max = 0 ) ;
const std::string urlencode( const std::string& s ) ;
const std::string urldecode ( const std::string& str ) ;
string getWikiServer ( string wiki ) ;
string loadTextfromURL ( string url ) ;
// Both JSON loaders report through a bool and fill `j` by reference.
// NOTE(review): presumably true means success — confirm.
bool loadJSONfromURL ( string url , json &j , bool use_cache = false ) ;
bool loadJSONfromPOST ( string url , const string &post , json &j ) ;
// Modifies `str` through the reference.
void stringReplace(std::string& str, string oldStr, string newStr) ;
// NOTE(review): by their names these convert between ' ' and '_' — confirm.
// The TPage constructor applies them as space2_(trim(_2space(s))).
string space2_ ( string s ) ;
string _2space ( string s ) ;
// Number-to-string helpers for uint32_t and float.
string ui2s ( uint32_t i ) ;
string f2s ( float f ) ;
string escapeURLcomponent ( string s ) ;
// NOTE(review): the unit and sign convention of the result are not visible here — confirm.
double time_diff(struct timeval x , struct timeval y);
string pad ( string s , int num , char c ) ;
/// Holds one MySQL client handle (`MYSQL mysql`) together with the host /
/// database / wiki settings associated with it.
///
/// Apart from the trivial default constructor and the buffer-append callback
/// at the bottom, every member is only declared here and defined elsewhere.
/// NOTE(review): the class owns a `MYSQL` by value and declares a destructor
/// but no copy/move operations — confirm instances are never copied.
class TWikidataDB {
public:
    TWikidataDB () {} ;
    TWikidataDB ( string wiki , TPlatform *_platform = NULL ) ;

    void setHostDB ( string host , string db , bool force_utf8 = false ) ;
    void doConnect ( bool first = false ) ;
    void runQuery ( string sql ) ;
    bool isConnected() ;
    MYSQL_RES *getQueryResults ( string sql ) ;
    string escape ( string s ) ;
    string space2_ ( string s ) ;
    uint32_t lastInsertID () ;

    ~TWikidataDB () ;

protected:
    MYSQL mysql;
    string _host , _config_file , _database , _wiki ;
    TPlatform *platform = NULL ;
    bool did_connect = false ;
    bool _force_utf8 = false ;

    void finishWithError ( string msg = "" , string sql = "" ) ;
    bool setHostDBFromWiki ( string wiki ) ;

    /// Growable, NUL-terminated byte buffer filled by WriteMemoryCallback.
    struct MemoryStruct {
        char *memory;
        size_t size;
    };

    /// Appends `size * nmemb` bytes from `contents` to the MemoryStruct that
    /// `userp` points to and keeps the buffer NUL-terminated.
    ///
    /// @return the number of bytes appended, or 0 if the buffer could not grow.
    ///
    /// The realloc result is checked before it is stored: on failure the
    /// existing buffer stays valid in `mem->memory`, so the caller can still
    /// free it (previously it was overwritten with NULL and leaked).
    static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
        const size_t realsize = size * nmemb;
        MemoryStruct *mem = static_cast<MemoryStruct *>(userp);

        char *grown = static_cast<char *>(realloc(mem->memory, mem->size + realsize + 1));
        if (grown == NULL) {
            /* out of memory! */
            printf("not enough memory (realloc returned NULL)\n");
            return 0;
        }
        mem->memory = grown;

        memcpy(&(mem->memory[mem->size]), contents, realsize);
        mem->size += realsize;
        mem->memory[mem->size] = 0;
        return realsize;
    }
} ;
/// Growable, NUL-terminated byte buffer filled by WriteMemoryCallback.
/// `size` counts the payload bytes and excludes the trailing NUL.
struct CURLMemoryStruct {
    char *memory;
    size_t size;
};

/// Appends `size * nmemb` bytes from `contents` to the CURLMemoryStruct that
/// `userp` points to and keeps the buffer NUL-terminated.
///
/// @param contents  bytes to append
/// @param size      size of one element
/// @param nmemb     number of elements
/// @param userp     pointer to the CURLMemoryStruct to extend
/// @return the number of bytes appended, or 0 if the buffer could not grow.
///
/// The realloc result is checked before it is stored: on failure the existing
/// buffer stays valid in `mem->memory`, so the caller can still free it
/// (previously it was overwritten with NULL and leaked).
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    const size_t realsize = size * nmemb;
    CURLMemoryStruct *mem = static_cast<CURLMemoryStruct *>(userp);

    char *grown = static_cast<char *>(realloc(mem->memory, mem->size + realsize + 1));
    if (grown == NULL) {
        /* out of memory! */
        printf("not enough memory (realloc returned NULL)\n");
        return 0;
    }
    mem->memory = grown;

    memcpy(&(mem->memory[mem->size]), contents, realsize);
    mem->size += realsize;
    mem->memory[mem->size] = 0;
    return realsize;
}
class TPageMetadata {
public:
string getMisc ( const string &key , const string &_default = "" ) ;
uint32_t id = 0 ;
uint32_t size = 0 ;
int16_t ns = NS_UNKNOWN ;
bool is_full_title = true ;
uint32_t q = 0 ;
string timestamp ;
map <string,string> misc ;
} ;
/// A single page: a normalised name plus its metadata.
class TPage {
public:
    /// Stores `space2_(trim(_2space(s)))` as the name and `ns` as the
    /// metadata namespace.
    TPage(string s = "", int ns = NS_UNKNOWN)
        : name(space2_(trim(_2space(s))))
    {
        meta.ns = ns;
    }

    const string getNameWithoutNamespace();
    void determineNamespace(TPageList *pl);

    string name;
    TPageMetadata meta;
};
/// Orders pages by name first; pages with equal names are ordered by namespace.
inline bool operator<(const TPage &t1, const TPage &t2)
{
    if (t1.name != t2.name) {
        return t1.name < t2.name;
    }
    return t1.meta.ns < t2.meta.ns;
}

/// Two pages are equal when neither orders before the other.
inline bool operator==(const TPage &t1, const TPage &t2)
{
    return !(t1 < t2) && !(t2 < t1);
}
// Sort-mode identifiers, numbered consecutively from 0 to 9.
// NOTE(review): presumably the values accepted as `mode` by
// TPageList::customSort(uint8_t mode, bool ascending) — confirm in its definition.
#define PAGE_SORT_DEFAULT 0
#define PAGE_SORT_TITLE 1
#define PAGE_SORT_NS_TITLE 2
#define PAGE_SORT_SIZE 3
#define PAGE_SORT_DATE 4
#define PAGE_SORT_FILE_SIZE 5
#define PAGE_SORT_UPLOAD_DATE 6
#define PAGE_SORT_INCOMING_LINKS 7
#define PAGE_SORT_RANDOM 8
#define PAGE_SORT_REDLINKS_COUNT 9
/// A list of pages belonging to one wiki, plus that wiki's namespace tables.
///
/// This is a polymorphic base class (`error` is virtual and TSource derives
/// from it), so it now has a virtual destructor: deleting a derived object
/// through a base pointer is otherwise undefined behaviour. Copy and move
/// operations are explicitly defaulted so that declaring the destructor does
/// not suppress the implicit move operations.
class TPageList {
public:
    TPageList ( string w = "" ) { wiki = w ; }

    virtual ~TPageList () = default ;
    TPageList ( const TPageList & ) = default ;
    TPageList ( TPageList && ) = default ;
    TPageList &operator = ( const TPageList & ) = default ;
    TPageList &operator = ( TPageList && ) = default ;

    /// Removes all pages; other state is left as it is.
    void clear () { pages.clear() ; }

    void intersect ( TPageList &pl ) ;
    void merge ( TPageList &pl ) ;
    void negate ( TPageList &pl ) ;

    /// Number of pages, narrowed to the int32_t the interface has always returned.
    inline int32_t size () { return static_cast<int32_t> ( pages.size() ) ; }

    string getNamespaceString ( const int16_t ns ) ;
    int16_t getNamespaceNumber ( const string &ns ) ;
    void convertToWiki ( string new_wiki ) ;
    void convertWikidataToWiki ( string new_wiki ) ;
    void swap ( TPageList &pl ) ;
    void customSort ( uint8_t mode , bool ascending ) ;

    /// Base implementation ignores the message and returns false; TSource overrides it.
    virtual bool error ( string s ) { return false ; }

    uint32_t annotateWikidataItem ( TWikidataDB &db , string wiki , map <string,TPage *> &name2o ) ;
    void join ( string cmd , TPageList &pl ) ;
    void loadMissingMetadata ( string wikidata_language , TPlatform *platform = NULL ) ;
    inline bool hasDataLoaded() { return data_loaded ; }
    void regexpFilter ( string regexp ) ;

    string wiki ;
    vector <TPage> pages ;
    map <int16_t,string> ns_canonical , ns_local ;

protected:
    void loadNamespaces () ;
    void convertToWikidata () ;
    void addWikidataLabelsForNamespace ( uint32_t namespace_id , string entity_type , string wikidata_language , TWikidataDB &db , map <int16_t,vector <TPage *> > &ns_page ) ;
    bool is_sorted = false ;
    bool namespaces_loaded = false ;
    void sort() ;
    map <string,int16_t> ns_string2id ;
    bool data_loaded = false ;
} ;
class TSource : public TPageList {
public:
TSource ( TPlatform *p = NULL ) { platform = p ; run_result = false ; }
string getSourceName() { return source_name ; }
virtual bool error ( string s ) ;
virtual bool run () {} ;
virtual bool getLastRunResult() { return run_result ; }
protected:
TPlatform *platform ;
string source_name ;
bool run_result ;
} ;
class TSourceSPARQL : public TSource {
public:
TSourceSPARQL ( TPlatform *p = NULL ) { platform = p ; source_name = "sparql" ; } ;
virtual bool run () ;
protected:
bool runQuery ( string query ) ;
string sparql_prefixes = "PREFIX v: <http://www.wikidata.org/prop/statement/>\nPREFIX q: <http://www.wikidata.org/prop/qualifier/>\nPREFIX ps: <http://www.wikidata.org/prop/statement/>\nPREFIX pq: <http://www.wikidata.org/prop/qualifier/>\n" ;
} ;
class TSourceLabels : public TSource {
public:
TSourceLabels ( TPlatform *p = NULL ) { platform = p ; source_name = "labels" ; } ;
virtual bool run () ;
} ;
class TSourcePagePile : public TSource {
public:
TSourcePagePile ( TPlatform *p = NULL ) { platform = p ; source_name = "pagepile" ; } ;
virtual bool run () ;
protected:
bool getPile ( uint32_t id ) ;
} ;
class TSourceSearch : public TSource {
public:
TSourceSearch ( TPlatform *p = NULL ) { platform = p ; source_name = "search" ; } ;
virtual bool run () ;
protected:
} ;
class TSourceManual : public TSource {
public:
TSourceManual ( TPlatform *p = NULL ) { platform = p ; source_name = "manual" ; } ;
virtual bool run () ;
protected:
bool parseList ( string &text , string &new_wiki ) ;
} ;
class TSourceWikidata : public TSource {
public:
TSourceWikidata ( TPlatform *p = NULL ) { platform = p ; source_name = "wikidata" ; } ;
virtual bool run () ;
protected:
bool getData ( string sites ) ;
} ;
/// A category name paired with a depth; defaults to an empty name and depth -1.
class TSourceDatabaseCatDepth {
public:
    TSourceDatabaseCatDepth(string n = "", int16_t d = -1)
        : name(n), depth(d)
    {
    }

    string name;
    int16_t depth;
};
/// Plain parameter bundle for TSourceDatabase; every member has an in-class
/// default so that a default-constructed object is fully initialised.
class TSourceDatabaseParams {
public:
    vector <TSourceDatabaseCatDepth> positive , negative ;
    vector <uint16_t> page_namespace_ids ;

    vector <string> templates_yes , templates_any , templates_no ;
    bool templates_yes_talk_page = false ;
    bool templates_any_talk_page = false ;
    bool templates_no_talk_page = false ;

    vector <string> linked_from_all , linked_from_any , linked_from_none ;
    vector <string> links_to_all , links_to_any , links_to_none ;

    string wiki = "enwiki" ;
    int16_t default_depth = 0 ;
    string combine = "subset" ;
    string redirects = "either" ;
    string last_edit_bot = "either" ;
    string last_edit_anon = "either" ;
    string last_edit_flagged = "either" ;
    string page_image = "any" ;
    string ores_type = "any" ;
    string ores_prediction = "any" ;
    float ores_prob_from = 0 ;
    float ores_prob_to = 1.0 ;

    // These four were the only members without an initializer, so they held
    // indeterminate values until something assigned them.
    // NOTE(review): -1 is assumed to mean "not set", mirroring the -1 default
    // depth of TSourceDatabaseCatDepth — confirm against the code that reads them.
    int32_t larger = -1 , smaller = -1 , minlinks = -1 , maxlinks = -1 ;

    string before , after , max_age , page_wikidata_item ;
    bool only_new_since = false ;
} ;
class TSourceDatabase : public TSource {
public:
TSourceDatabase ( TPlatform *p = NULL ) { platform = p ; source_name = "categories" ; } ;
static string listEscapedStrings ( TWikidataDB &db , vector <string> &s , bool fix_spaces = true ) ;
virtual void setPageList ( TPageList *pagelist ) { primary_pagelist = pagelist ; }
virtual bool run () ;
protected:
typedef vector <vector<string> > vvs ;
bool getPages () ;
void iterateCategoryBatches ( vector <vvs> &ret , vvs &categories , uint32_t start = 0 ) ;
bool getPagesforPrimary ( TWikidataDB &db , string primary , string sql , string sql_before_after , vector <TPage> &pages_sublist , bool is_before_after_done ) ;
bool parseCategoryList ( TWikidataDB &db , vector <TSourceDatabaseCatDepth> &input , vector <vector<string> > &output ) ;
void getCategoriesInTree ( TWikidataDB &db , string name , int16_t depth , vector <string> &ret ) ;
void goDepth ( TWikidataDB &db , map <string,bool> &tmp , vector <string> &cats , int16_t left ) ;
string templateSubquery ( TWikidataDB &db , vector <string> input , bool use_talk_page , bool find_not ) ;
string linksFromSubquery ( TWikidataDB &db , vector <string> input ) ;
string linksToSubquery ( TWikidataDB &db , vector <string> input ) ;
void groupLinkListByNamespace ( vector <string> &input , map <int32_t,vector <string> > &nslist ) ;
TSourceDatabaseParams params ;
TPageList *primary_pagelist = NULL ;
vvs cat_pos , cat_neg ;
bool has_pos_cats , has_neg_cats , has_pos_templates , has_pos_linked_from ;
} ;
class TPlatform {
public:
bool readConfigFile ( string filename ) ;
void setConfig ( TPlatform &p ) ;
bool error ( string s ) { errors.push_back ( s ) ; return false ; } ;
string process() ;
string getWiki () ;
string getParam ( string key , string default_value = "" , bool ignore_empty = false ) ;
float getQueryTime() { return querytime ; }
string getExistingLabel ( string name ) ;
bool doOutputRedlinks() { return output_redlinks ; }
void setDatabaseParameters ( TSourceDatabaseParams &db_params ) ;
string getLabelBaseSQL ( TWikidataDB &db ) ;
map <string,string> config , params ;
string content_type , query ;
vector <string> errors ;
uint32_t psid = 0 ;
protected:
void parseCats ( string input , vector <TSourceDatabaseCatDepth> &output ) ;
void splitParamIntoVector ( string input , vector <string> &output ) ;
void processFiles ( TPageList &pl ) ;
void annotateFile ( TWikidataDB &db , map <string,TPage *> &name2f , bool file_data , bool file_usage , bool file_usage_data_ns0 ) ;
void processPages ( TPageList &pl ) ;
void processSubpages ( TPageList &pl ) ;
void annotatePage ( TWikidataDB &db , map <uint32_t,vector <TPage *> > &ns_pages , bool add_image , bool add_coordinates , bool add_defaultsort , bool add_disambiguation , bool add_incoming_links ) ;
void processWikidata ( TPageList &pl ) ;
void processSitelinks ( TPageList &pagelist ) ;
void processLabels ( TPageList &pagelist ) ;
void processCreator ( TPageList &pagelist ) ;
void filterWikidata ( TPageList &pagelist ) ;
void processMissingDatabaseFilters ( TPageList &pagelist ) ;
void getCommonWikiAuto ( map <string,TSource *> &sources ) ;
void combine ( TPageList &pagelist , map <string,TSource *> &sources ) ;
void sortResults ( TPageList &pagelist ) ;
void processRedlinks ( TPageList &pagelist ) ;
void applyResultsLimit ( TPageList &pagelist ) ;
void legacyAutoListParameters () ;
void getParameterAsStringArray ( string s , vector <string> &vs ) ;
string legacyCombinationParameters ( map <string,TSource *> &sources ) ;
float querytime = 0 ; // seconds
string wiki , common_wiki ;
map <string,bool> existing_labels ;
bool output_redlinks = false ;
bool label_filter_used_as_primary = false ;
} ;
/// Turns a TPageList into an output string; one private renderer per format.
class TRenderer {
public:
    TRenderer(TPlatform *p = NULL) : platform(p) {}

    string renderPageList(TPageList &pagelist);

    bool only_files = false;

private:
    // Format-specific renderers.
    string renderPageListHTML(TPageList &pagelist);
    string renderPageListJSON(TPageList &pagelist);
    string renderPageListWiki(TPageList &pagelist);
    string renderPageListCTSV(TPageList &pagelist, string mode);
    string renderPageListPagePile(TPageList &pagelist);

    // Shared helpers.
    string getLink(TPage &page);
    void initializeColumns();
    string getTableHeaderHTML();
    string getTableRowHTML(uint32_t cnt, TPage &page, TPageList &pagelist);
    string getTableRowCTSV(uint32_t cnt, TPage &page, TPageList &pagelist, string &mode);
    void escapeCSV(string &out);
    void addPageMetadataToJSON(vector<TPage>::iterator i, json &o);

    vector<string> columns;
    TPlatform *platform = NULL;
    string wiki;
    // No in-class defaults for these three flags.
    bool use_autolist, autolist_creator_mode, is_wikidata;
    string thumnail_size = "120px";
    struct timeval now_ish;
};
/// Works on a page list and a platform, both held as non-owning pointers
/// supplied at construction. All steps are defined elsewhere.
class TWDFIST {
public:
    using string2int32 = map<string, int32_t>;

    TWDFIST(TPageList *pagelist, TPlatform *platform)
        : pagelist(pagelist), platform(platform)
    {
    }

    string run();

protected:
    bool isValidFile(string file);
    string normalizeFilename(string filename);

    // Ignore-list seeding.
    void seedIgnoreFiles();
    void seedIgnoreFilesFromWikiPage();
    void seedIgnoreFilesFromIgnoreDatabase();

    // Filtering.
    void filterFilesFromIgnoreDatabase();
    void filterFilesFiveOrIsUsed();
    void removeItemsWithNoFileCandidates();
    void filterItems();
    void filterFiles();

    // Candidate discovery.
    void followLanguageLinks();
    void followCoordinates();
    void followSearchCommons();
    void followCommonsCats();

    void addFileToQ(string q, string file);

    TPageList *pagelist;
    TPlatform *platform;
    vector<string> items;
    // No in-class defaults for the option flags below.
    bool wdf_langlinks, wdf_coords, wdf_search_commons, wdf_commons_cats;
    bool wdf_only_items_without_p18, wdf_only_files_not_on_wd, wdf_only_jpeg,
         wdf_max_five_results, wdf_only_page_images, wdf_allow_svg;
    map<string, uint8_t> files2ignore;
    map<string, string2int32> q2image;
};
#endif