-
Notifications
You must be signed in to change notification settings - Fork 2
/
aligntable
executable file
·393 lines (345 loc) · 14.8 KB
/
aligntable
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
#!/usr/bin/env python3
import sys
import re
def helpandexit():
    """Print the command-line usage text to stdout and terminate the program.

    This function never returns normally: it ends with ``sys.exit()`` (called
    without an argument), which raises ``SystemExit``.
    """
    # Raw string: backslashes in the regex/LaTeX examples must be shown verbatim.
    # The "Equivalent to" lines mirror what the --tex / --texboldmax branches of
    # the argument parser configure; keep them in sync when changing the parser.
    print(r'''
Aligns lines read from stdin by adding/removing spaces such that columns are
aligned. In the input text, two columns must be separated by the <separator>.The
locations of the columns are determined automatically such that the separators are
aligned across the lines.
In general, whitespaces between the columns are ignored, except those at the beginning of
the lines. Those are assumed to be important and provided for indentation purposes.
COMMON OPTIONS
-h,--help Print help.
-s "<regexp>" Use custom column separator. (Default=" ").
Whatever matches as the separator will be used as the separator after alignment.
Separators may vary in length, although this is rarely needed.
-g,--group Logically group rows with different number of columns. Align them independently.
This is very useful when dealing with things like LaTeX tables. (Default=disabled)
--tex Align latex table.
Equivalent to <aligntable -s "&" --group --sepwrap " " " " --strrep "\t" " " --nomaxif "^[ ]*[%]">
--texboldmax Align latex table and make max-entries bold.
Equivalent to <aligntable -s "&" --group --sepwrap " " " " --maxwrap "\textbf{" "}"
--strrep "\t" " "
--strrep "\\bf" "" --strrep "\\textbf" "" --numignore "\\\\" --numignore "\\[A-Za-z]+"
--numignore "[{}]" --nomaxif "^[ ]*[%]">
ADVANCED OPTIONS
--numignore <regex> Replace matching subtexts with " " (to avoid mistakenly concatenating multiple numbers)
just before conversion of an entry to number where necessary (eg, --maxwrap).
Matching subtexts will be preserved in the final output. Multiple --numignore
can be provided.
--strrep <regex> <string> Before running the alignment, replace each matching entry of <regex> with <string>
within each line. Multiple strrep options are applied in the provided order.
--atleastfirst Indendatation for each column cannot be smaller than the original indendation
of the first row of that column. (Default=disabled)
--sepwrap <prefix> <suffix> Wrap each separator with the provided text. (Default: <prefix>="" <suffix>="").
--maxwrap <prefix> <suffix> Wrap the maximum numeric value within each column. Intentionally does not
support subtext (other than whitespace trimming); instead use --strrep and
--numignore to clean-up irrelevant strings before conversion to number.
--minwrap <prefix> <suffix> Similar to --maxwrap, wraps minimum value within each column. (Default=disabled)
--commentsign <string> Ignore the lines starting with <string> (ignores preceding spaces).
--nomaxif <regexp> Do not allow lines containing <regexp> to be selected by --maxwrap.
Multiple --nomaxif can be provided.
Note that all string arguments are processed as "raw strings" without python-decoding.
Important: TABS typically cause problem. A quick solution is to replace them before running this tool or using the --strrep option.
EXAMPLES
* A basic example:
echo -e "first second third\n fd fourth fifth sixth" | aligntable
* Providing -s " &. " would use any of the following as the separator: " &x "," &y ",... etc.
* In Vim, type the following to process the current file:
:%!aligntable <options>
And also adding following options to .vimrc can be handy:
command! AlignLatexTable :%!aligntable --tex
command! AlignLatexTableBoldMax :%!aligntable --texboldmax --nomaxif "\#NOMAX\#"
SEE alignassign
R. Gokberk Cinbis, March 2013
''')
    sys.exit()
'''
NOT IMPLEMENTED YET
* For lines with fewer columns, give an option to match columns to the closest original column by looking at the
preceding lines. This is useful for aligning comments providing the parameters.
'''
def get_stdin_lines():
    """Read all of stdin and return its lines as a list.

    Trailing newline characters are removed from each line; every other
    character (including leading indentation) is kept.
    """
    return [raw.rstrip('\n') for raw in sys.stdin]
def separate_lines(prog, lines):
    """Split the leading column (and its separator) off every line, in place.

    For each line the first match of ``prog`` is located:

    * match found: the text before it becomes the column text, the matched
      text becomes the separator, and ``lines[i]`` is replaced by the remainder
      (stripped on both sides);
    * no match: the whole (right-stripped) line is the column text, the
      separator is ``None`` and ``lines[i]`` becomes ``''``.

    Args:
        prog: compiled regular expression describing the separator.
        lines: list of strings; MUTATED so that each entry holds what is left
            after the first column and separator were removed.

    Returns:
        ``(prevorglens, prevs, seps)`` -- three lists parallel to ``lines``:
        ``prevorglens[i]`` is the untrimmed length of the column text (``None``
        when the line has no separator), ``prevs[i]`` is the column text with
        trailing whitespace removed, ``seps[i]`` is the matched separator or
        ``None``.
    """
    nline = len(lines)
    prevs = [None] * nline
    prevorglens = [None] * nline
    seps = [None] * nline
    for li, text in enumerate(lines):
        m = prog.search(text)
        if m is None:
            prevs[li] = text.rstrip()
            lines[li] = ''
            continue
        before = text[:m.start()]
        # Store per line. The original rebound the whole list to an int here,
        # which broke every later prevorglens[j] lookup.
        prevorglens[li] = len(before)
        # rstrip only: leading spaces are indentation and must be preserved.
        prevs[li] = before.rstrip()
        # Separators are not stripped: a space itself can be the separator.
        seps[li] = m.group()
        lines[li] = text[m.end():].strip()
    return prevorglens, prevs, seps
def get_num_col(prog, lines):
    """Count, for every line, how many separators it contains.

    The count is obtained by repeatedly searching for ``prog``, cutting the
    line just after the match and stripping the remainder before searching
    again. A plain ``findall`` is not used because the stripping between
    searches matters for separators that can match whitespace.

    The returned value is the number of separator matches (0 for a line
    without any separator), one entry per input line.
    ``lines`` itself is not modified.
    """
    counts = []
    for rest in lines:
        found = 0
        hit = prog.search(rest)
        while hit:
            found += 1
            rest = rest[hit.end():].strip()
            hit = prog.search(rest)
        counts.append(found)
    return counts
def all_empty(arr):
    """Return True when every entry of ``arr`` equals the empty string.

    An empty ``arr`` yields True.
    """
    return not any(entry != '' for entry in arr)
def firstnonneg(arr):
    """Return the first entry of ``arr`` that is >= 0, or None if there is none."""
    return next((value for value in arr if value >= 0), None)
def txt2nums(txts, numignore, is_nowrap_line):
    """Convert column entries to numbers and locate the numeric text in each.

    An entry counts as numeric only if, after every ``numignore`` pattern has
    been replaced by a single space and surrounding whitespace stripped, the
    WHOLE remaining text converts with ``float()``, and that remaining text
    occurs exactly once in the original entry.

    Design note: numeric sub-texts are deliberately NOT extracted
    automatically. A text such as "Foo[2]" would otherwise be read as the
    number 2, and one would then have to declare which columns may be updated.
    Use --strrep and --numignore to clean entries instead.

    Args:
        txts: list of column entry strings.
        numignore: iterable of compiled regular expressions.
        is_nowrap_line: list parallel to ``txts``; a true entry makes the
            corresponding line be skipped (reported as non-numeric).

    Returns:
        ``(nums, starts, ends)`` -- lists parallel to ``txts``. For a numeric
        entry ``nums[i]`` is the float value and ``txts[i][starts[i]:ends[i]]``
        is the numeric text; for every other entry all three are ``None``.
    """
    count = len(txts)
    nums = [None] * count
    starts = [None] * count
    ends = [None] * count
    for li, orgtxt in enumerate(txts):
        if is_nowrap_line[li]:
            continue  # line excluded by the caller
        # Blank out the ignorable sub-texts (replaced by a space so that two
        # neighbouring numbers are not glued together), then trim.
        newtxt = orgtxt
        for pattern in numignore:
            newtxt = pattern.sub(" ", newtxt)
        newtxt = newtxt.strip()
        try:
            value = float(newtxt)  # float("") raises ValueError as well
        except ValueError:
            # Built-in ValueError: the old `exceptions.ValueError` spelling
            # raised NameError (no `exceptions` module in Python 3).
            continue
        # The cleaned text must map back to exactly one place in the original
        # entry, otherwise we cannot know where to put the wrapper.
        ind = findtxt(orgtxt, newtxt)
        if len(ind) == 1:
            nums[li] = value
            starts[li] = ind[0]
            ends[li] = ind[0] + len(newtxt)
    return nums, starts, ends
def findval(nums, val):
    """Return the list of indices ``i`` for which ``nums[i] == val``."""
    hits = []
    for idx, item in enumerate(nums):
        if item == val:
            hits.append(idx)
    return hits
def findtxt(txt, subtxt):
    """Return the start indices of the non-overlapping occurrences of ``subtxt`` in ``txt``.

    The search proceeds left to right and resumes right after each hit, so
    ``findtxt("aaaa", "aa")`` gives ``[0, 2]``.

    Args:
        txt: string to search in (may be empty, which yields ``[]``).
        subtxt: non-empty string to search for.

    Returns:
        List of indices ``i`` with ``txt[i:i+len(subtxt)] == subtxt``.

    Raises:
        Exception: if ``subtxt`` is empty -- an empty string matches at every
            position and the scan would never advance.
    """
    # The guard must be on the needle, not the haystack: an empty needle is
    # what makes str.find() return the same position forever.
    if subtxt == "":
        raise Exception("There Inf-many empty substrings!")
    out = []
    pos = txt.find(subtxt)
    while pos >= 0:
        out.append(pos)
        pos = txt.find(subtxt, pos + len(subtxt))
    return out
def applywrap(prevs, opt, optwrap, numignore, is_nowrap_line):
    """Wrap the maximum (or minimum) numeric entry of a column, in place.

    Args:
        prevs: list of column entry strings; MUTATED -- every entry whose
            numeric value equals the selected extreme gets ``optwrap[0]``
            inserted before and ``optwrap[1]`` inserted after its numeric text.
        opt: ``'max'`` or ``'min'``.
        optwrap: ``[prefix, suffix]``, or ``None`` to disable (no-op).
        numignore: compiled regular expressions forwarded to ``txt2nums``.
        is_nowrap_line: list parallel to ``prevs``; lines flagged true are
            left unchanged and do not take part in the max/min.

    Returns:
        None.

    Raises:
        Exception: if ``opt`` is neither ``'max'`` nor ``'min'`` (only checked
            when wrapping is enabled and ``prevs`` is non-empty).
    """
    if optwrap is None or prevs == []:
        return
    nums, starts, ends = txt2nums(prevs, numignore, is_nowrap_line)
    if opt == 'max':
        pick = max
    elif opt == 'min':
        pick = min
    else:
        raise Exception("Unrecognized opt for applywrap()")
    # Non-numeric entries are None; they must be dropped before max()/min()
    # because None cannot be ordered against floats in Python 3.
    numeric = [n for n in nums if n is not None]
    if not numeric:
        return  # no numeric entry in this column
    val = pick(numeric)
    # Ties are all wrapped; None entries never equal a float so they are skipped.
    for li in findval(nums, val):
        s = starts[li]
        e = ends[li]
        prevs[li] = prevs[li][:s] + optwrap[0] + prevs[li][s:e] + optwrap[1] + prevs[li][e:]
def ignore_lines(lines, outlines, commentsign):
    """Move comment lines straight to the output, in place.

    A line is a comment line when, after removing leading whitespace, it
    starts with any of the strings in ``commentsign``. For such a line ``j``
    the untouched text is stored in ``outlines[j]`` and ``lines[j]`` is set
    to ``''``. All other entries of both lists are left as they are.
    """
    for idx, text in enumerate(lines):
        body = text.lstrip()
        if any(body.startswith(sign) for sign in commentsign):
            outlines[idx] = text
            lines[idx] = ''
def processgroup(prog, separator, sepwrap, maxwrap, minwrap, lines, atleastfirst, numignore, commentsign, nomaxif):
    """Align one group of lines column by column and return the aligned lines.

    Args:
        prog: compiled separator regular expression.
        separator: separator pattern string (not used inside this function).
        sepwrap: ``[prefix, suffix]`` put around every separator.
        maxwrap: ``[prefix, suffix]`` for the per-column maximum, or ``None``.
        minwrap: ``[prefix, suffix]`` for the per-column minimum, or ``None``.
        lines: list of strings; CONSUMED -- entries are progressively reduced
            to ``''`` while their columns are moved to the output.
        atleastfirst: if true, a column is never made narrower than the
            untrimmed width it had in the first line that has a separator
            for that column.
        numignore: compiled regular expressions used for number detection.
        commentsign: strings marking lines that are copied through unaligned.
        nomaxif: compiled regular expressions; matching lines are excluded
            from ``maxwrap``.

    Returns:
        New list, parallel to ``lines``, with the aligned text (trailing
        whitespace removed).
    """
    nline = len(lines)
    outlines = [''] * nline
    ignore_lines(lines, outlines, commentsign)
    # Lines that must not take part in the "maxwrap" selection.
    is_nomaxline = [any(p.search(l) is not None for p in nomaxif) for l in lines]
    min_excluded = [False] * nline  # minwrap has no exclusion option
    # Peel off one column per iteration until every line is used up.
    while not all_empty(lines):
        prevorglens, prevs, seps = separate_lines(prog, lines)
        # Add min/max wrappers before measuring, since they change the widths.
        applywrap(prevs, 'max', maxwrap, numignore, is_nomaxline)
        applywrap(prevs, 'min', minwrap, numignore, min_excluded)
        # Remember which lines really had a separator BEFORE the None entries
        # are replaced by '' below; --atleastfirst needs that information.
        has_sep = [s is not None for s in seps]
        seps = [sepwrap[0] + s + sepwrap[1] if s is not None else '' for s in seps]
        # Widths after all manipulations of the column text and separators.
        maxprevlen = max(len(x) for x in prevs)
        maxseplen = max(len(x) for x in seps)
        if atleastfirst:
            # The first line that has this column may enforce a larger width.
            for j in range(nline):
                if has_sep[j]:
                    maxprevlen = max(maxprevlen, prevorglens[j])
                    break
        for j in range(nline):
            nspace_before = maxprevlen - len(prevs[j])
            nspace_after = maxseplen - len(seps[j])
            outlines[j] += prevs[j] + (' ' * nspace_before) + seps[j] + (' ' * nspace_after)
    return [x.rstrip() for x in outlines]  # remove trailing whitespace
def apply_strrep(strrep, lines):
    """Run the --strrep find & replace rules over ``lines``, in place.

    ``strrep`` is a sequence of ``(regex, replacement)`` pairs. The rules are
    applied in the given order; each one is applied to every line before the
    next rule starts. Nothing is returned.
    """
    for rule in strrep:
        compiled = re.compile(rule[0])
        replacement = rule[1]
        lines[:] = [compiled.sub(replacement, text) for text in lines]
def process(separator, sepwrap, maxwrap, minwrap, groupbyncol, atleastfirst, strrep, numignore, commentsign, nomaxif):
    """Read stdin, align it according to the options and print the result.

    ``separator``, ``numignore`` and ``nomaxif`` arrive as pattern strings and
    are compiled here. With ``groupbyncol`` the lines are partitioned by the
    value ``get_num_col`` reports for them and every partition is aligned on
    its own; the aligned text is written back to the original line positions,
    so the output keeps the input line order.
    """
    lines = get_stdin_lines()
    apply_strrep(strrep, lines)  # preprocess: --strrep
    prog = re.compile(separator)
    numignore = [re.compile(x) for x in numignore]
    nomaxif = [re.compile(x) for x in nomaxif]
    if not groupbyncol:
        lines = processgroup(prog, separator, sepwrap, maxwrap, minwrap, lines, atleastfirst, numignore, commentsign, nomaxif)
    else:
        ncols = get_num_col(prog, lines)
        for ncol in set(ncols):
            # positions of the lines belonging to this group
            lineind = [j for j, count in enumerate(ncols) if count == ncol]
            curlines = [lines[j] for j in lineind]
            curlines = processgroup(prog, separator, sepwrap, maxwrap, minwrap, curlines, atleastfirst, numignore, commentsign, nomaxif)
            for j, aligned in zip(lineind, curlines):
                lines[j] = aligned
    for l in lines:
        print(l)
def getarg(argi):
    """Return ``sys.argv[argi]``.

    When ``argi`` is past the end of the argument list, a short message is
    printed and ``helpandexit()`` is called before the lookup is attempted.
    """
    if argi < len(sys.argv):
        return sys.argv[argi]
    print('incomplete argument list')
    helpandexit()
    return sys.argv[argi]
####### main #######
# defaults
separator = ' '       # <regex>
sepwrap = ['', '']    # [<prefix>,<suffix>]
maxwrap = None        # [<prefix>,<suffix>]
minwrap = None        # [<prefix>,<suffix>]
numignore = []        # Array of regular expressions: [<regex1>,<regex2>,...]
groupbyncol = False   # False | True
atleastfirst = False  # False | True
strrep = []           # array of (<regex>,<string>) tuples.
commentsign = []      # Array of comment signs.
nomaxif = []          # Array of regular expression strings.

# parse arguments
# `ai` always points at the option currently examined. Options taking values
# read them with getarg(ai+k) (which prints help and exits when the value is
# missing) and then skip over them; the final `ai += 1` skips the option itself.
ai = 1
while ai < len(sys.argv):
    a = getarg(ai)
    if a == '-h' or a == '--help':
        helpandexit()
    elif a == '-s':
        ai += 1
        separator = getarg(ai)
    elif a == '-g' or a == '--group':
        groupbyncol = True
    elif a == '--atleastfirst':
        atleastfirst = True
    elif a == '--sepwrap':
        sepwrap = [getarg(ai+1), getarg(ai+2)]
        ai += 2
    elif a == '--tex':
        separator = "&"
        groupbyncol = True
        sepwrap = [r" ", r" "]
        strrep.append((r"\t", r" "))
        # The help text documents --tex as including this --nomaxif pattern,
        # so that commented-out rows are excluded when --maxwrap is also given.
        nomaxif.append(r"^[ ]*[%]")
        # NO, tables become ugly! commentsign.append("%")
    elif a == '--texboldmax':
        separator = "&"
        groupbyncol = True
        sepwrap = [r" ", r" "]  # to replicate console argument, raw-string literals are used here in every possible place.
        # Older variant: maxwrap = [r"\bf ",r""]
        maxwrap = [r"\textbf{", r"}"]
        strrep.append((r"\t", r" "))
        # Drop bold markers left by a previous run so they are not nested.
        strrep.append((r"\\bf", r""))
        strrep.append((r"\\textbf", r""))
        numignore.append(r"\\\\")
        numignore.append(r"\\[A-Za-z]+")
        numignore.append(r"[{}]")
        nomaxif.append(r"^[ ]*[%]")
        # NO, tables become ugly! commentsign.append("%")
    elif a == '--maxwrap':
        maxwrap = [getarg(ai+1), getarg(ai+2)]
        ai += 2
    elif a == '--minwrap':
        minwrap = [getarg(ai+1), getarg(ai+2)]
        ai += 2
    elif a == '--strrep':
        strrep.append((getarg(ai+1), getarg(ai+2)))
        ai += 2
    elif a == '--numignore':
        numignore.append(getarg(ai+1))
        ai += 1
    elif a == '--nomaxif':
        nomaxif.append(getarg(ai+1))
        ai += 1
    elif a == '--commentsign':
        commentsign.append(getarg(ai+1))
        ai += 1
    else:
        print('unrecognized argument!')
        helpandexit()
    ai += 1
process(separator, sepwrap, maxwrap, minwrap, groupbyncol, atleastfirst, strrep, numignore, commentsign, nomaxif)