-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathshard
executable file
·42 lines (35 loc) · 1.21 KB
/
shard
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python
# Split a file by lines into N roughly evenly sized files.
# Usage: shard <shards> [filename ...]
# e.g. shard 3 testfile
# ---> split testfile into testfile.0, testfile.1, testfile.2
# e.g. zcat x | shard 5
# ---> split into shard.0, shard.1, ..., shard.4
# e.g. shard 5 file1 file2 file3
# ---> split concatenation of files 1-3 into shard.0, shard.1, ..., shard.4
# Author: David McClosky
# Homepage: http://zorglish.org
# Code: http://github.com/dmcc
import sys
import itertools
def main(argv=None, stdin=None):
    """Split input lines round-robin into N roughly evenly sized files.

    argv  -- argument list shaped like sys.argv (program name first, then
             the shard count, then zero or more input filenames).
             Defaults to sys.argv.
    stdin -- iterable of lines used when no input filenames are given.
             Defaults to sys.stdin.

    Output files are named <name>.<shardnumber>, where <name> is the input
    filename when exactly one file is given and 'shard' otherwise.

    Exits with an error message when the shard count is missing or is
    less than 1.  Raises ValueError when the shard count is not an integer.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit('Usage: shard <shards> [filename ...]')

    num_shards = int(argv[1])
    if num_shards < 1:
        # With no shard files there is nowhere to write the lines.
        sys.exit('shard: number of shards must be at least 1')
    filenames = argv[2:]

    if len(filenames) == 1:
        filename = filenames[0]
    else:  # many files or stdin
        # TODO should make this configurable
        filename = 'shard'

    # TODO should make this configurable
    # %s is the original filename
    # %d is the shard number
    # The zero-fill width is the digit count of num_shards itself.
    num_digits = len(str(num_shards))
    template = '%%s.%%0%dd' % num_digits
    # template resolves to filename.shardnumber
    # where shardnumber is 0-filled

    shard_files = []
    try:
        # Opened one at a time inside the try so that a failure part-way
        # through still closes the files that were already opened.
        for shard in range(num_shards):
            shard_files.append(open(template % (filename, shard), 'w'))
        shard_file_cycler = itertools.cycle(shard_files)

        if filenames:
            # The cycler is shared across inputs, so the files are
            # sharded as if they were one concatenated stream.
            for input_filename in filenames:
                with open(input_filename) as input_file:
                    for line in input_file:
                        next(shard_file_cycler).write(line)
        else:
            # No filenames given: shard standard input instead.
            if stdin is None:
                stdin = sys.stdin
            for line in stdin:
                next(shard_file_cycler).write(line)
    finally:
        for shard_file in shard_files:
            shard_file.close()


if __name__ == '__main__':
    main()