-
Notifications
You must be signed in to change notification settings - Fork 6
/
grabDeletedText.php
286 lines (250 loc) · 8.93 KB
/
grabDeletedText.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
<?php
/**
* Maintenance script to grab text from a wiki and import it to another wiki.
* Translated from Edward Chernenko's Perl version (text.pl).
*
* @file
* @ingroup Maintenance
* @author Jack Phoenix <[email protected]>
* @author Calimonious the Estrange
* @author Jesús Martínez <[email protected]>
* @version 1.1
* @date 5 August 2019
*/
use MediaWiki\MediaWikiServices;
use MediaWiki\Revision\SlotRecord;
require_once 'includes/TextGrabber.php';
class GrabDeletedText extends TextGrabber {

	/**
	 * Actual start point if bad drcontinues force having to continue from earlier
	 * (mw1.19- issue). Cleared once a page at or past this title has been reached.
	 *
	 * @var string|null
	 */
	protected $badStart;

	/**
	 * Last title to get; useful for working around content with a namespace/interwiki
	 * on top of it in mw1.19-
	 *
	 * @var string|null
	 */
	protected $lastTitle;

	/**
	 * API limit to use instead of 'max'; null means "use the maximum allowed"
	 *
	 * @var int|null
	 */
	protected $apiLimits;

	/**
	 * Array of namespaces to grab deleted revisions
	 *
	 * @var array|null
	 */
	protected $namespaces = null;

	/**
	 * Registers the description and the command line options of this script.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Grab deleted text from an external wiki and import it into one of ours.' );
		# $this->addOption( 'start', 'Revision at which to start', false, true );
		#$this->addOption( 'startdate', 'Not yet implemented.', false, true );
		$this->addOption( 'drcontinue', 'API continue to restart deleted revision process', false, true );
		$this->addOption( 'apilimits', 'API limits to use. Maximum limits for the user will be used by default', false, true );
		$this->addOption( 'lasttitle', 'Last title to get; useful for working around content with a namespace/interwiki on top of it in mw1.19-', false, true );
		$this->addOption( 'badstart', 'Actual start point if bad drcontinues force having to continue from earlier (mw1.19- issue)', false, true );
		$this->addOption( 'namespaces', 'Pipe-separated namespaces (ID) to grab. Defaults to all namespaces', false, true );
	}

	/**
	 * Entry point: reads the options, works out the list of namespaces and then
	 * pages through list=deletedrevs for each of them, handing every returned
	 * page chunk to processDeletedRevisions().
	 */
	public function execute() {
		parent::execute();

		$this->lastTitle = $this->getOption( 'lasttitle' );
		$this->badStart = $this->getOption( 'badstart' );

		# End date isn't necessarily supported by source wikis, but we'll deal with that later.
		$this->endDate = $this->getOption( 'enddate' );
		if ( $this->endDate ) {
			$this->endDate = wfTimestamp( TS_MW, $this->endDate );
			if ( !$this->endDate ) {
				$this->fatalError( 'Invalid enddate format.' );
			}
		} else {
			$this->endDate = wfTimestampNow();
		}

		# Only a positive number is accepted as a limit; anything else means 'max'
		$apiLimits = $this->getOption( 'apilimits' );
		$this->apiLimits = ( is_numeric( $apiLimits ) && (int)$apiLimits > 0 )
			? (int)$apiLimits
			: null;

		$this->output( "Retreiving namespaces list...\n" );

		$params = [
			'meta' => 'siteinfo',
			'siprop' => 'namespaces|statistics|namespacealiases'
		];
		$result = $this->bot->query( $params );
		$siteinfo = $result['query'] ?? null;

		# No data - bail out early
		if ( empty( $siteinfo ) ) {
			$this->fatalError( 'No siteinfo data found...' );
		}

		$textNamespaces = [];
		if ( $this->hasOption( 'namespaces' ) ) {
			# Drop empty entries (e.g. from a trailing pipe) so they are never
			# sent to the API as a namespace
			$textNamespaces = array_values( array_filter(
				explode( '|', $this->getOption( 'namespaces', '' ) ),
				static function ( $ns ) {
					return $ns !== '';
				}
			) );
		} else {
			foreach ( array_keys( $siteinfo['namespaces'] ?? [] ) as $ns ) {
				# Ignore special
				if ( $ns >= 0 ) {
					$textNamespaces[] = $ns;
				}
			}
		}
		if ( !$textNamespaces ) {
			$this->fatalError( 'Got no namespaces...' );
		}

		# The namespace to resume from is the part of the drcontinue option that
		# precedes the first pipe; 0 when there is no such part.
		$startContinue = $this->getOption( 'drcontinue' );
		$nsStart = null;
		if ( $startContinue ) {
			$pipePos = strpos( $startContinue, '|' );
			$nsStart = $pipePos ? substr( $startContinue, 0, $pipePos ) : 0;
		}

		# Get deleted revisions
		$this->output( "\nSaving deleted revisions...\n" );
		$revisions_processed = 0;

		foreach ( $textNamespaces as $ns ) {
			$more = true;
			$drcontinue = null;

			if ( $nsStart !== null ) {
				if ( $ns < $nsStart ) {
					$this->output( "Skipping $ns\n" );
					continue;
				}
				# The continuation given by the user only applies to the namespace
				# it belongs to; later namespaces start from their beginning.
				if ( $nsStart == $ns ) {
					$drcontinue = $startContinue;
				}
			}

			# Count revisions
			$nsRevisions = 0;

			# TODO: list=deletedrevs is deprecated in recent MediaWiki versions.
			# should try to use list=alldeletedrevisions first and fallback to deletedrevs
			$params = [
				'list' => 'deletedrevs',
				'drnamespace' => $ns,
				'drlimit' => $this->getApiLimit(),
				'drdir' => 'newer',
				'drprop' => 'revid|parentid|user|userid|comment|minor|len|content|tags',
			];

			while ( $more ) {
				if ( $drcontinue === null ) {
					unset( $params['drcontinue'] );
				} else {
					# Check for 1.19 bug with the drcontinue that causes the query to jump backward on colonspaces, but we need something to compare back to for this...
					# NOTE(review): by the time this runs, the array_merge() calls further
					# down have already copied the continuation values returned by the API
					# into $params, so $oldcontinue looks like the value just returned
					# rather than the one before it - confirm whether this check can still
					# detect a backward jump.
					if ( isset( $params['drcontinue'] ) ) {
						$oldcontinue = $params['drcontinue'];
						if ( substr( str_replace( ' ', '_', $drcontinue ), 0, -15 ) < substr( str_replace( ' ', '_', $oldcontinue ), 0, -15 ) ) {
							$this->fatalError( 'Bad drcontinue; ' . str_replace( ' ', '_', $drcontinue ) . ' < ' . str_replace( ' ', '_', $oldcontinue ) );
						}
					}
					$params['drcontinue'] = $drcontinue;
				}

				$result = $this->bot->query( $params );

				if ( $result && isset( $result['error'] ) ) {
					# Report what the API actually said instead of guessing at the cause
					$apiError = is_array( $result['error'] ) ? $result['error'] : [];
					$errorCode = $apiError['code'] ?? 'unknown';
					$errorInfo = $apiError['info'] ?? 'no details provided';
					$this->fatalError(
						"API error while fetching deleted revisions ($errorCode): $errorInfo\n" .
						'The remote user may not have the required rights to fetch deleted revisions.'
					);
				}
				if ( empty( $result ) ) {
					# Wait half a second before retrying. sleep() only takes whole
					# seconds, so a fractional value would not wait at all.
					usleep( 500000 );
					$this->output( "Bad result.\n" );
					continue;
				}

				$pageChunks = $result['query']['deletedrevs'] ?? [];
				if ( empty( $pageChunks ) ) {
					$this->output( "No revisions found.\n" );
					$more = false;
				}

				foreach ( $pageChunks as $pageChunk ) {
					$nsRevisions = $this->processDeletedRevisions( $pageChunk, $nsRevisions );
				}

				if ( isset( $result['query-continue'] ) && isset( $result['query-continue']['deletedrevs'] ) ) {
					# Ancient way of api pagination
					# TODO: Document what is this for. Examples welcome
					$drcontinue = str_replace( '&', '%26', $result['query-continue']['deletedrevs']['drcontinue'] );
					$params = array_merge( $params, $result['query-continue']['deletedrevs'] );
				} elseif ( isset( $result['continue'] ) ) {
					# New pagination
					$drcontinue = $result['continue']['drcontinue'];
					$params = array_merge( $params, $result['continue'] );
				} else {
					$more = false;
				}
				$this->output( "drcontinue = $drcontinue\n" );
			}

			$this->output( "$nsRevisions chunks of revisions processed in namespace $ns.\n" );
			$revisions_processed += $nsRevisions;
		}

		$this->output( "\n" );
		$this->output( "Saved $revisions_processed deleted revisions.\n" );
		# Done.
	}

	/**
	 * Add deleted revisions to the archive and text tables
	 * Takes results in chunks because that's how the API returns pages - with chunks of revisions.
	 *
	 * Chunks sorting before the 'badstart' title are skipped, reaching a title
	 * past 'lasttitle' aborts the script, and processing of the chunk stops at
	 * the first revision newer than the end date.
	 *
	 * @param array $pageChunk Chunk of revisions, represents a deleted page
	 * @param int $nsRevisions Count of deleted revisions for this namespace, for progress reports
	 * @return int $nsRevisions updated
	 */
	public function processDeletedRevisions( $pageChunk, $nsRevisions ) {
		# Titles are compared with spaces normalised to underscores
		$chunkTitle = str_replace( ' ', '_', $pageChunk['title'] );

		# Go back if we're not actually to the start point yet.
		if ( $this->badStart ) {
			if ( str_replace( ' ', '_', $this->badStart ) > $chunkTitle ) {
				return $nsRevisions;
			}
			# We're now at the correct position, clear the flag and continue
			$this->badStart = null;
		}

		$ns = $pageChunk['ns'];
		$title = $this->sanitiseTitle( $ns, $pageChunk['title'] );

		# TODO: Document this with examples if possible
		if ( $this->lastTitle && $chunkTitle > str_replace( ' ', '_', $this->lastTitle ) ) {
			$this->fatalError( "Stopping at {$pageChunk['title']}; lasttitle reached." );
		}

		$this->output( "Processing {$pageChunk['title']}\n" );

		# Every revision of the chunk belongs to the same page
		$titleObj = Title::makeTitle( $ns, $title );

		foreach ( $pageChunk['revisions'] ?? [] as $revision ) {
			if ( $nsRevisions % 500 == 0 && $nsRevisions !== 0 ) {
				$this->output( "$nsRevisions revisions inserted\n" );
			}

			# Stop if past the enddate
			$timestamp = wfTimestamp( TS_MW, $revision['timestamp'] );
			if ( $timestamp > $this->endDate ) {
				return $nsRevisions;
			}

			$revisionId = $revision['revid'] ?? null;
			if ( !$revisionId ) {
				# Revision ID is mandatory with the new content tables and things will fail if not provided.
				$this->output( sprintf( "WARNING: Got revision without revision id, " .
					"with timestamp %s. Skipping!\n", $revision['timestamp'] ) );
				continue;
			}

			if ( $this->insertArchivedRevision( $revision, $titleObj ) ) {
				$nsRevisions++;
			}
		}

		return $nsRevisions;
	}

	/**
	 * Returns the standard api result limit for queries
	 *
	 * @return int|string limit provided by user, or 'max' to use the maximum
	 * allowed for the user querying the api
	 */
	public function getApiLimit() {
		return $this->apiLimits ?? 'max';
	}
}
# Name the class to run as the maintenance script, then load the file named by
# RUN_MAINTENANCE_IF_MAIN (a constant defined outside this file).
$maintClass = GrabDeletedText::class;
require_once RUN_MAINTENANCE_IF_MAIN;