-
Notifications
You must be signed in to change notification settings - Fork 3
/
conservatory-import
executable file
·386 lines (334 loc) · 14.9 KB
/
conservatory-import
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
#!/usr/bin/env python3
# Import successive releases into a git repository, for conservatory.github.io
# Copyright (C) 2017, 2018 by Karl Fogel
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the Affero GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Affero
# GNU General Public License for more details.
#
# You should have received a copy of the Affero GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
__doc__ = """\
Import successive releases into a git repository, presumably for
preservation in https://conservatory.github.io/.
Usage:
$ conservatory-import [OPTIONS...] PROJ-0.0.tgz [PROJ-0.1.tgz [...]]
Create a git repository named PROJ, and import successive releases
into it. The arguments are the release tarballs, and they must be
presented as arguments in the right order -- this program does not
try to fix the ordering if you get it wrong.
The releases can be ".tar", ".tar.gz", ".tgz", or ".tar.bz2" format
(support for zip etc is planned, and patches are welcome of course).
Because release package names usually sort lexically, this usage will
typically work:
$ conservatory-import [OPTIONS...] PROJ-*
Options:
-r / --readme-prefix PREFIX_FILE
Prepend the contents of PREFIX_FILE to the "README" file in every
commit. The README is identified by trying reasonable variations
such as "readme.txt", "README.md", "ReadMe.rst", and so on. See
https://github.com/Conservatory/dcled/blob/master/README (the
block at the top of the file) for a good example of prefix text.
-m / --metadata-dir DIRECTORY
For every commit, get commit metadata from files under DIRECTORY
that are named according to the release number. Use the file
DIRECTORY/REL-msg as the commit message for release REL, use
DIRECTORY/REL-date as the git date for the commit, and use
DIRECTORY/REL-author as the git author. Each REL must match
exactly the release identifier portion of the distribution name.
For example, for "PROJ-1.3.tgz", REL would be "1.3", and for
"PROJ-2.1-beta3.tar.gz" REL would be "2.1-beta3". If a file is
not present, then use a default (whatever would have been used
for that metadata on that commit if -m had not been passed).
-k / --known-names NAME_PREFIX_1[,NAME_PREFIX2,...]
Supply a comma-separated list of known names, to help this script
separate them from the release identifier.
This is probably best explained with an example. Take the
MHonArc mail archiver package: it was distributed as
"MHonArc1.2.3.tar.gz", "MHonArc2.0.1.tar.gz", etc, until 2.6.5.
At that point, upstream started putting a hyphen in the tarball:
"MHonArc-2.6.5.tar.gz", then "MHonArc-2.6.6.tar.gz", and so on.
To handle this, you'd pass '--known-names MHonArc'. (You don't
pass '--known-names MHonArc,MHonArc-' because the hyphen is not
part of the package's name; it's rare to list multiple names.)
This program does not push the git repository up to the Conservatory
for you. You have to do that manually, by first creating an empty
repository on GitHub via the web UI, and then pushing all the commits
up to it:
$ cd new_local_repository_with_all_releases_imported
$ git remote add origin git@github.com:Conservatory/PROJ.git
$ git push -u origin master
"""
import getopt
import sys
import shutil
import subprocess
import tempfile
import tarfile
import zipfile
import os
import stat
import re
def maybe_amend_readme(prefix):
    """If a readme file is present in cwd, prepend string PREFIX to it.

    No adjustment is made to PREFIX; it is prepended exactly as is.
    If PREFIX is None or empty there is nothing to prepend, so no
    file is touched.

    Only the first plausible readme file found is modified (see the
    code for the order they're tried in); directories that happen to
    have a readme-like name are skipped.  If the file is read-only,
    it is made writable for the duration of the edit and its original
    permissions are restored afterwards, even if the edit fails.

    The return value is undefined.
    """
    if not prefix:
        return

    def prepend_to(path):
        # 'surrogateescape' lets bytes that are not valid in the
        # locale's encoding round-trip unchanged instead of aborting
        # the whole import with a UnicodeDecodeError.
        with open(path, 'r+', errors='surrogateescape') as f:
            orig = f.read()
            f.seek(0)
            f.write(prefix + orig)

    base = "ReadMe"
    # See https://github.com/github/markup#markups
    extensions = (
        '',
        '.md',
        '.mkdn',
        '.markdown',
        '.mdown',
        '.txt',
        '.text',
        '.rst',
        '.asciidoc',
        '.org',
        '.textile',
        '.rdoc',
        '.creole',
        '.mediawiki',
        '.wiki',
        '.pod',
    )
    for ext in extensions:
        for name in (base.upper(), base.lower(), base, base.capitalize()):
            path = name + ext
            if not os.path.isfile(path):
                continue
            try:
                prepend_to(path)
            except PermissionError:
                orig_perms = stat.S_IMODE(os.stat(path).st_mode)
                os.chmod(path, 0o644)
                try:
                    prepend_to(path)
                finally:
                    # Restore the original read-only permissions.
                    # Git may not preserve them, but that's no
                    # excuse for not preserving them here.
                    os.chmod(path, orig_perms)
            # Only the first readme found is amended.
            return
def do_commit(rel_id, metadata_dir):
    """Commit REL_ID in cwd, using metadata files from METADATA_DIR.

    If METADATA_DIR is None, use no metadata files.  Else it is an
    absolute path to a directory, and whichever of these files are
    present in that directory are used as follows:

      REL_ID-msg:     Whole contents is used as commit message.
      REL_ID-date:    First line is used as date for this commit.
      REL_ID-author:  First line is used as author for this commit.

    A missing file silently selects the default for that piece of
    metadata.  A file that exists but cannot be read, or that yields
    only whitespace, also selects the default; the unreadable case is
    reported on stderr.

    Return the exit status of 'git commit'.
    """
    params = {}

    # Gather any metadata from metadata_dir.
    if metadata_dir is not None:
        for fbase in ("msg", "date", "author"):
            path = os.path.join(metadata_dir, rel_id + "-" + fbase)
            try:
                with open(path, 'r') as f:
                    # TODO: Sanity-check the strings for quoting chars
                    # or other weirdness?
                    if fbase == "msg":
                        value = f.read()
                    else:
                        value = f.readline().strip()
            except FileNotFoundError:
                # Absent metadata is the documented "use the default" case.
                continue
            except (OSError, UnicodeError) as err:
                sys.stderr.write(
                    "WARNING: Ignoring unreadable metadata file '%s': %s\n"
                    % (path, err))
                continue
            # An empty value would be passed to git as an empty
            # message, author or date; fall back to the default instead.
            if value.strip():
                params[fbase] = value

    # Build the 'git commit' command line, using whatever data we
    # found in metadata_dir.
    cmd = ['git', 'commit', '--quiet',
           '-m', params.get("msg", "Import release %s" % rel_id)]
    if "author" in params:
        cmd.extend(("--author", params["author"]))
    if "date" in params:
        cmd.extend(("--date", params["date"]))

    # Do it.
    return subprocess.call(cmd)
def parse_release(release, known_names):
    """Split a release file name into (name, version, extension).

    From a RELEASE like "proj-1.0-beta.tar.gz", return a triple like
    ("proj", "1.0-beta", "tar.gz").  Note that the extension is
    returned without its leading dot.

    If KNOWN_NAMES is not None, it is a list of known name prefixes
    from -k / --known-names, which see.  Empty entries are ignored,
    and when several entries match, the longest one is used.

    If no known name matches, the name is everything before the first
    "-" or "_" that is followed by digits and then a non-digit.  If
    that fails too, the whole file name minus its extension is the
    name and the version is the empty string.  If the extension is
    not recognized, it is returned as the empty string.
    """
    # First work from the right side and grab the extension.
    ext_re = re.compile(
        r"\.(tar\.gz|tar\.bz2|tar\.bz|tar\.xz|tar|tgz|tbz2|txz"
        r"|zip|Zip|ZIP|bzip2|bzip|z|Z)$")
    ext_match = ext_re.search(release)
    if ext_match is None:
        ext = ""
        stem = release
    else:
        ext = ext_match.group(1)
        stem = release[:ext_match.start()]

    # Then work from the front and get the name.
    name = None
    if known_names:
        matches = [prefix for prefix in known_names
                   if prefix and stem.startswith(prefix)]
        if matches:
            name = max(matches, key=len)
    if name is None:  # no prefix match, so grab up to version number
        # The pattern is applied to the full file name so that the dot
        # of the extension can serve as the non-digit after a bare
        # version such as "proj-1.tgz".
        pieces = re.split(r"([-_][0-9]+[^0-9])", release, maxsplit=1)
        name = pieces[0] if len(pieces) > 1 else stem

    # The version is whatever's left in between (minus separator chars).
    # Although "." is accepted as a separator here, the split pattern
    # above does not (yet) allow it, so it only matters after a known
    # name.
    vn = stem[len(name):]
    if vn[:1] in ("-", "_", "."):
        vn = vn[1:]
    return name, vn, ext
class ArchiveError(Exception):
    """Error type for release archives this program cannot handle."""
class Archive:
    """Wrapper class representing a Python3 tarfile or zipfile.

    Files whose name ends in ".zip" (in any letter case) are opened
    with the 'zipfile' module; everything else is handed to
    'tarfile', which detects compression on its own.  Whatever
    exception those modules raise for an unreadable file propagates
    unchanged.

    Instances can be used as context managers; the archive is closed
    on exit.
    """

    # So, this is annoying.  Python3's 'tarfile' and 'zipfile' modules
    # do basically the same thing, but their interfaces have a few
    # needless differences, thus requiring this wrapper class.  If
    # zipfile had open() instead of ZipFile() for its constructor, and
    # had getnames() instead of namelist() for its member lister, then
    # there would be no need for a wrapper at all.

    def __init__(self, infile):
        """Open the archive at path INFILE for reading."""
        if infile.lower().endswith(".zip"):
            self.kind = "zip"
            self.obj = zipfile.ZipFile(infile, 'r')
        else:
            self.kind = "tar"
            self.obj = tarfile.open(infile, 'r')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def members(self):
        """Return the list of member names stored in the archive."""
        if self.kind == "zip":
            return self.obj.namelist()
        return self.obj.getnames()

    def extract(self, path="."):
        """Unpack every member under PATH (default: cwd)."""
        if self.kind == "tar" and hasattr(tarfile, "data_filter"):
            # Where this Python has tarfile extraction filters, use
            # the 'data' filter: it refuses members that would land
            # outside PATH, and makes behavior the same across Python
            # versions with different default filters.
            self.obj.extractall(path, filter="data")
        else:
            self.obj.extractall(path)

    def close(self):
        """Close the underlying archive object."""
        self.obj.close()
def _is_suspect_member(path):
    """Return True if archive member PATH could land outside the unpack dir."""
    if path.startswith("/") or os.path.isabs(path):
        return True
    return ".." in path.replace(os.sep, "/").split("/")


def _common_directory(members):
    """Return the deepest directory under cwd that holds all of MEMBERS.

    MEMBERS are archive member names that have already been unpacked
    into cwd.  Return '' when there is no such directory.
    """
    if not members:
        return ''
    try:
        # Compare whole path components: a character-wise common
        # prefix of "p/README" and "p/README.txt" would be a file.
        common = os.path.commonpath(members)
    except ValueError:
        return ''
    # A single-file archive makes the common path a file; back up to
    # the directory that contains it.
    while common and not os.path.isdir(common):
        common = os.path.dirname(common)
    return common


def _import_release(release_package, known_names, git_repos_abs,
                    readme_prefix, metadata_dir):
    """Commit one release archive on top of the repository's history.

    Unpack RELEASE_PACKAGE into a temporary sandbox, move the ".git"
    directory from GIT_REPOS_ABS into the unpacked tree, commit, and
    move ".git" back.  The ".git" directory is returned, cwd is
    restored and the sandbox is removed even if something fails along
    the way.  Return the (name, release id) parsed from the file name.
    """
    name, rel_id, ignored_ext = parse_release(
        os.path.basename(release_package), known_names)
    start_dir = os.getcwd()
    try:
        rel = Archive(release_package)
    except (ArchiveError, tarfile.TarError, zipfile.BadZipFile,
            OSError) as err:
        sys.stderr.write("ERROR: Cannot open release '%s': %s\n"
                         % (release_package, err))
        sys.exit(1)

    sandbox = None
    moved_git_dir = None
    try:
        rel_members = rel.members()
        # Security check.
        for path in rel_members:
            if _is_suspect_member(path):
                sys.stderr.write("ERROR: Possibly malicious release:\n")
                sys.stderr.write("       Path \"%s\" is suspect.\n" % path)
                sys.stderr.write("       Exiting without unpacking.\n")
                sys.exit(1)
        sandbox = tempfile.mkdtemp()
        os.chdir(sandbox)
        rel.extract()
        common_dir = _common_directory(rel_members)
        if common_dir != '':
            os.chdir(common_dir)
        # Yes, we are really doing it this way.
        work_tree = os.getcwd()
        shutil.move(os.path.join(git_repos_abs, ".git"), work_tree)
        moved_git_dir = os.path.join(work_tree, ".git")
        if readme_prefix is not None:
            maybe_amend_readme(readme_prefix)
        subprocess.call(['git', 'add', '-A', '.'])
        do_commit(rel_id, metadata_dir)
    finally:
        # Whatever happened, never leave the repository's history
        # stranded in the temporary directory.
        if moved_git_dir is not None:
            shutil.move(moved_git_dir, git_repos_abs)
        os.chdir(start_dir)
        if sandbox is not None:
            shutil.rmtree(sandbox, ignore_errors=True)
        rel.close()
    return name, rel_id


def main():
    """Parse the command line and import each release, in order.

    Options and arguments are read from sys.argv (see the usage text
    in __doc__).  The repository is "NAME-repos" in the current
    directory, where NAME is parsed from the first release's file
    name; it is created if absent and reused if it already is a Git
    repository.  Exit with status 1 on a usage or setup error.
    """
    quiet = False
    readme_prefix = None
    metadata_dir = None
    # If this is not None, it is a list of known name prefixes used
    # to separate the project name from the release identifier.
    # See the '--known-names' option documentation for details.
    known_names = None

    try:
        (opts, args) = getopt.getopt(sys.argv[1:], "k:r:m:qh?",
                                     ["known-names=",
                                      "readme-prefix=",
                                      "metadata-dir=",
                                      "quiet",
                                      "help",
                                      "usage"])
    except getopt.GetoptError as err:
        sys.stderr.write(str(err))
        sys.stderr.write("\n")
        sys.exit(1)

    for opt, optarg in opts:
        if opt in ("-h", "-?", "--help", "--usage"):
            print(__doc__)
            sys.exit(0)
        elif opt in ("-q", "--quiet"):
            quiet = True
        elif opt in ("-k", "--known-names"):
            known_names = optarg.split(",")
        elif opt in ("-r", "--readme-prefix"):
            try:
                with open(optarg, 'r') as f:
                    readme_prefix = f.read()
            except OSError as err:
                sys.stderr.write(
                    "ERROR: Cannot read readme prefix file '%s': %s\n"
                    % (optarg, err))
                sys.exit(1)
        elif opt in ("-m", "--metadata-dir"):
            metadata_dir = os.path.abspath(optarg)
            if not os.path.isdir(metadata_dir):
                sys.stderr.write("ERROR: '%s' is not a directory\n"
                                 % metadata_dir)
                sys.exit(1)

    if len(args) < 1:
        sys.stderr.write("ERROR: Need at least one release to work with.\n")
        sys.stderr.write("\n")
        sys.stderr.write("%s\n" % __doc__)
        sys.exit(1)

    name, ignored_rel_id, ignored_ext = parse_release(
        os.path.basename(args[0]), known_names)

    # Place the git repository in cwd, with a "-repos" extension to
    # avoid collisions with tarballs that unpack to just 'name'.
    saved_cwd = os.getcwd()
    git_repos = "%s-repos" % name
    git_repos_abs = os.path.join(saved_cwd, git_repos)

    # Ensure there is a local Git repository, either by making it or by
    # finding one already here (in which case we assume it's the one we
    # want to use -- i.e., that it already has earlier versions in its
    # history and that we're just adding new versions on top of those).
    if os.path.isfile(git_repos):
        sys.stderr.write("ERROR: New git repository blocked by file at '%s'.\n"
                         % git_repos)
        sys.exit(1)
    elif (os.path.isdir(git_repos)
          and not os.path.isdir(os.path.join(git_repos, ".git"))):
        sys.stderr.write("ERROR: Directory at '%s' is not a Git repository.\n"
                         % git_repos)
        sys.exit(1)
    elif not os.path.exists(git_repos):
        if subprocess.call(['git', 'init', '--quiet', git_repos]) != 0:
            sys.stderr.write("ERROR: Unable to create Git repository '%s'.\n"
                             % git_repos)
            sys.exit(1)
    elif not quiet:
        print("Proceeding with '%s' as an existing Git repository."
              % git_repos)

    # Let's hope the user passed the arguments in the right order.
    for release_package in args:
        name, rel_id = _import_release(release_package, known_names,
                                       git_repos_abs, readme_prefix,
                                       metadata_dir)
        if not quiet:
            print("Imported %s %s" % (name, rel_id))

    # Check out a working tree of the latest release, so we don't
    # spook the user with a directory that looks empty.
    os.chdir(git_repos_abs)
    subprocess.call(['git', 'checkout', '--quiet', '--force'])
    os.chdir(saved_cwd)

    if not quiet:
        sys.stderr.write("Done.  Git repository '%s' is ready.\n"
                         % git_repos_abs)
# Run the importer only when this file is executed as a script.
if __name__ == "__main__":
    main()