-
Notifications
You must be signed in to change notification settings - Fork 0
/
field
executable file
·422 lines (349 loc) · 13.3 KB
/
field
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
#!/usr/bin/env python3
r"""
field [options] [args]
select and output fields from each record of stdin
args:
- 1-arg: field [options] <rangelist>
- 2-arg: field [non-field opts] <indelim> <rangelist>
- 3-arg: field [non-field opts] <indelim> <outdelim> <rangelist>
- if -r/--regex, indelim is a python regular expression
- field and record delimiters can also be given as options
- fixed delimiters are python string literals
- regex delimiters are python raw strings
fields:
- defaults to whitespace: 'bytes.split(sep=None)'
- if supplied as $1, uses 'bytes.split(sep="$1")'
- if '-r', uses 're.split(r'$1')'
- if '-i <value>', uses 'bytes.split(sep="<value>")'
- if '-F <value>', uses 're.split(r"<value>")'
- if $IFS set, splits on its chars: 're.split(f"[{getenv(IFS)}]+")'
- if '-e', all empty fields are discarded
- if '-G', only leading or trailing blank fields are discarded
records:
- defaults to newline: 'bytes.split(sep="\n")'
- if '-I <value>', uses 'bytes.split(sep="<value>")'
- if '-R <value>', uses 're.split(r"<value>")'
- if '-E', empty records in input are discarded
- if '-N', trailing separator on last record not emitted
- processes input as bytes, in chunks of io.DEFAULT_BUFFER_SIZE
- chunk size overridden with '-b/--bsz' (optional k or m suffix)
outdelims:
- fields default to space, ie '\x20', or via $2, or via -o/--ofs
- records default to newline, ie '\n', or via -O/--ors
rangelist:
- specifier is a comma separated list of ranges
- ranges are either single numbers or first-last sequences
- unspecified Y in X-Y will default to highest Y in the input
- if first field in range is greater than last, print reversed
- if any specified fields do not exist in the record, skip them
- field '0' is the same as all fields (range '1-')
range:
N: just field number N (1 is first)
-N: just field Nth from the end (-1 is last)
N-M: fields N through M (M > N)
N-M: fields M through N, backwards (N > M)
N-: fields N through the end
-N-: fields from Nth from the end, through the end
N--M: fields N through Mth from end
-N--M: fields Nth from the end through Mth from the end
examples:
"echo one two three four five | field 2" -> "two"
"echo one two three four five | field -2" -> "four"
"echo one two three four five | field 2-4" -> "two three four"
"echo one two three four five | field 4-2" -> "four three two"
"echo one two three four five | field 3-" -> "three four five"
"echo one two three four five | field -2-" -> "four five"
"echo one two three | field 1,0" -> "one one two three"
"echo {1..12} | field -8--10" -> "5 4 3"
"echo {1..12} | field -8--15" -> "5 4 3 2 1"
"echo {1..12} | field 3--15" -> "3 2 1"
"echo {1..12} | field 1-3,-1--2,8,8,10-,-2" -> "1 2 3 12 11 8 8 10 11 12 11"
"""
"""
todo:
- read from files given as additional arguments
- /R/-/S/: starting with /R/ and ending with /S/
- specify record selection criteria (pattern)
- field reformatting, eg wrapping, fit in column or on page
- args like -g columnate_group
- different behaviors for range overlaps (set union, addition)
- way to disinclude fields ("all fields except...")
- multiple delimiters, maybe a -i and -d possible for every -f
- multiple delimiters, as in multiple patterns will serve as one
- implement a "record" in terms of "field?"
- flag to discard empty records in output (no non-empty fields)
"""
__url__ = 'https://github.com/smemsh/field/'
__license__ = 'GPL-2.0'
from sys import exit, hexversion
if hexversion < 0x030900f0: exit("minpython: %x" % hexversion)
import argparse
import re
from io import DEFAULT_BUFFER_SIZE as BUFSIZ
from sys import argv, stdin, stdout, stderr
from types import SimpleNamespace
from select import select
from functools import partial
from traceback import print_exc
from os import (
getenv, isatty, dup,
close as osclose,
EX_OK as EXIT_SUCCESS,
EX_SOFTWARE as EXIT_FAILURE,
)
from os.path import basename
###
def err(*args, **kwargs):
    """Print a message on stderr, prefixed with the invocation name.

    Positional and keyword arguments are handed to print() unchanged,
    after the "<invname>:" prefix.
    """
    prefix = f"{invname}:"
    print(prefix, *args, file=stderr, **kwargs)
def bomb(*args, **kwargs):
    """Report an error through err() and exit with EXIT_FAILURE.

    All arguments are forwarded to err() untouched.
    """
    err(*args, **kwargs)
    status = EXIT_FAILURE
    exit(status)
###
def process_args():
    """Parse the command line and resolve the four delimiters.

    Reads the module-level 'args' (raw argument list) and rebinds it to
    the parsed argparse namespace.  Updates the module-level 'ifs',
    'ofs', 'irs' and 'ors' strings from, in order of precedence: the
    delimiter options, the positional delimiters, and (for 'ifs' only)
    the $IFS environment variable.  A numeric 'args.bsz' is stored when
    -b/--bsz is given.

    Returns:
        the rangelist string, '1-' when no positional args remain

    Exits through bomb()/usagex() on any usage error.
    """
    global ifs, ofs, irs, ors
    global args

    def usagex(*msg, **kwargs):
        # full help text, a blank line, then the error itself; exits
        p.print_help(file=stderr)
        print(file=stderr)
        bomb(*msg, **kwargs)

    # parse_args() gives escaped strings
    def unesc(s):
        if s is None:
            return None
        return s.encode('raw-unicode-escape').decode('unicode-escape')

    # not clear why the interface gives a prefix arg and defaults it, but
    # doesn't allow it to be passed in from anywhere, so we have to override
    #
    class RawTextHelpFormatterEmptyUsageLine(argparse.RawTextHelpFormatter):
        def add_usage(self, usage, actions, groups, prefix=None):
            if prefix is None:
                prefix = ''
            return super().add_usage(usage, actions, groups, prefix)

    p = argparse.ArgumentParser(
        prog            = invname,
        description     = __doc__.strip(),
        allow_abbrev    = False,
        formatter_class = RawTextHelpFormatterEmptyUsageLine,
        usage           = "",
    )

    def addopt(p, flagchar, longopt, help=None, /, **kwargs):
        options = list(("-%s --%s" % (flagchar, longopt)).split())
        p.add_argument(*options, help=help, **kwargs)

    def addflag(*args, **kwargs):
        addopt(*args, action='store_true', **kwargs)

    def hasopt(*options):
        # true if any of the named options was given a truthy value
        return any(getattr(args, a) for a in options)

    addopt  (p, 'F', 'ifsre', "input field separator as python regex")
    addopt  (p, 'R', 'irsre', "input record separator as python regex")
    addopt  (p, 'i', 'ifs', "input field separator as python string")
    addopt  (p, 'I', 'irs', "input record separator as python string")
    addopt  (p, 'o', 'ofs', "output field separator string")
    addopt  (p, 'O', 'ors', "output record separator string")
    addopt  (p, 'b', 'bsz', "bytes per read, optional m or k suffix")
    addflag (p, '0', 'null', "irs is a '\\0' char")
    addflag (p, 'z', 'zero', "ors is a '\\0' char")
    addflag (p, 'r', 'regex', "positional ifs is a python regex")
    addflag (p, 'G', 'noedges', "discard initial or trailing empty fields")
    addflag (p, 'e', 'noempty', "discard empty fields within a record")
    addflag (p, 'E', 'noblanks', "discard blank records with no fields")
    addflag (p, 'N', 'noendrec', "skip ors after last record was emitted")
    addflag (p, 'l', 'flushrecs', "do individual writes every record")

    if not args: usagex("must supply args on stdin")

    # unknown args (including negative field numbers) are left over and
    # interpreted positionally below
    args, left = p.parse_known_args(args)

    n = len(left)
    if n == 0:
        fields = '1-'
    else:
        fields = left[0]
        if n == 1:
            if args.regex:
                bomb("-r/--regex only for positionally specified ifs")
        else:
            # positional arg syntax
            if hasopt('ifs', 'ifsre', 'ofs'):
                usagex("ifs/ofs must be either positional or optional")
            else:
                ifs = left[0]
                if not args.regex:
                    # leave escaped if regex, as in raw string
                    ifs = unesc(ifs)
                if n == 2:
                    fields = left[1]
                elif n == 3:
                    ofs = unesc(left[1])
                    fields = left[2]
                else:
                    usagex("bad args")

    if args.null and args.irs:
        bomb("-0/--null and --irs are mutually exclusive")
    if args.zero and args.ors:
        bomb("-z/--zero, --ors are mutually exclusive")

    if args.null: irs = '\0'
    if args.zero: ors = '\0'

    # options win over positionals, which win over the built-in defaults
    ifs = args.ifsre or unesc(args.ifs) or ifs
    ofs = unesc(args.ofs) or ofs
    irs = args.irsre or unesc(args.irs) or irs
    ors = unesc(args.ors) or ors

    # $IFS is consulted only when no field separator was given any other
    # way, so that it cannot turn a literal -i delimiter into a regex; its
    # characters are escaped because they are used inside a character class
    if not ifs:
        ifsenv = getenv('IFS')
        if ifsenv:
            ifs = f"[{re.escape(ifsenv)}]+"
            args.regex = True

    if b := args.bsz:
        if m := re.fullmatch(r'(?P<bsz>\d+)(?P<unit>[km]?)', b, re.I):
            b = int(m.group('bsz'))
            # the match is case-insensitive, so normalize before comparing
            u = m.group('unit').lower()
            if u == 'k': b <<= 10
            elif u == 'm': b <<= 20
            args.bsz = b
        else:
            bomb("invalid bsz parameter")

    return fields
class FieldRange:
    """One range from a rangelist, with its endpoints held as integers.

    'start' is always converted with int(); 'end' is converted the same
    way unless it is None, in which case it stays None.
    """

    def __init__(self, start, end):
        # start is always supplied
        self.start = int(start)
        if end is None:
            self.end = None
        else:
            self.end = int(end)
def parse_fields(rangelist):
    """Turn a comma-separated rangelist string into FieldRange objects.

    Each item must be a (possibly negative) number, optionally followed
    by '-' and an optional (possibly negative) end number.  A single
    number yields a range whose end equals its start.  In range form,
    neither endpoint may be 0.  Any violation exits through bomb().

    Returns:
        list of FieldRange, in the order given
    """
    pattern = r"""
(?P<start>-?\d+)
(?P<isrange>-
(?P<end>-?\d+)?
)?
"""
    parsed = []
    for spec in rangelist.split(','):
        found = re.fullmatch(pattern, spec, re.VERBOSE)
        if not found:
            bomb(f"bad field range: {spec}")
            continue

        parts = found.groupdict()
        first, last = parts['start'], parts['end']
        if parts['isrange']:
            # '0' means "all fields" and cannot bound a range
            for endpoint in (first, last):
                if endpoint is not None and int(endpoint) == 0:
                    bomb("range spec '0' (all fields) excludes start/end")
        else:
            # lone number: the range is just that one field
            last = first

        parsed.append(FieldRange(start=first, end=last))

    return parsed
def field(rangelist):
    """Write the selected fields of every input record to stdout.

    Parses 'rangelist' with parse_fields(), converts the module-level
    delimiters ('ifs', 'irs', 'ofs', 'ors') to bytes, then reads the
    module-level 'infile' in chunks of 'args.bsz' (or BUFSIZ) bytes.
    Each record is split into fields, the requested fields are joined
    with 'ofs', and the result is written followed by 'ors'.

    The last record is treated specially: with -N its 'ors' is dropped,
    and when stdout is a tty a newline is ensured at the very end.

    Returns None.  Exits with EXIT_SUCCESS if the reader closes the pipe.
    """
    global ors

    ranges = parse_fields(rangelist)
    flush = bool(args.flushrecs)
    out = stdout.buffer

    # input is handled as bytes, so the delimiters have to be bytes too;
    # test the type, not truthiness, so an empty ofs is converted as well
    g = globals()
    for var in ['ifs', 'irs', 'ofs', 'ors']:
        if isinstance(g[var], str):
            g[var] = bytes(g[var], 'utf-8')

    r_isre = args.irsre and not args.null
    f_isre = args.ifsre or args.regex
    reflags = re.MULTILINE | re.DOTALL

    # readlines() does not accept delim, and no plans (cpython #41622)
    def recordize(bufs, delim):
        record = b''
        for buf in bufs:
            if not buf: break
            record += buf
            # flags must go by keyword: the third positional parameter
            # of re.split() is maxsplit
            if r_isre: records = re.split(delim, record, flags=reflags)
            else: records = record.split(delim)
            # last piece may be incomplete, carry it into the next chunk
            record = records.pop()
            yield from records
        if record:
            # delim at eof terminates input, as in awk
            yield record

    bufs = iter(partial(infile.read, args.bsz or BUFSIZ), b'')
    records = recordize(bufs, irs)
    if args.noblanks:
        # drop empty records up front, so the lookahead below only ever
        # sees records that will really be emitted
        records = (rec for rec in records if rec)

    # handle iteration ourselves, so last loop can behave differently
    try: record = next(records) # preload first record before entering loop
    except StopIteration: return # early exit, not even one input record

    lastloop = False
    while True:

        if f_isre: fields = re.split(ifs, record)
        else: fields = record.split(ifs)

        if args.noedges:
            # trim empty fields from both ends, keep interior ones
            lo, hi = 0, len(fields)
            while lo < hi and not fields[lo]: lo += 1
            while hi > lo and not fields[hi - 1]: hi -= 1
            fields = fields[lo:hi]

        if args.noempty:
            fields = [f for f in fields if f]

        n = len(fields)
        indices = []
        for r in ranges:
            # 0 or missing endpoints mean first and last field
            start = r.start or 1
            end = r.end or n
            # negative endpoints count from the end of the record
            if start < 0: start = n - abs(start) + 1
            if end < 0: end = n - abs(end) + 1
            step = 1 if start <= end else -1
            offsets = range(start - 1, end - 1 + step, step)
            # fields that do not exist in this record are skipped
            indices += [i for i in offsets if 0 <= i < n]

        output = ofs.join([fields[i] for i in indices])

        # look ahead, to know whether this is the final record
        try: record = next(records)
        except StopIteration:
            lastloop = True
            if args.noendrec:
                ors = b''
            if isatty(stdout.fileno()):
                if not ors.endswith(b'\n'):
                    ors += b'\n'

        output += ors

        try:
            out.write(output)
            if flush: out.flush()
        except BrokenPipeError:
            exit(EXIT_SUCCESS) # eg "field | head"

        if lastloop:
            break
###
def main():
    """Parse arguments, then run the function named after the invocation.

    When the module-level 'debug' equals 1, a breakpoint is entered
    first.  The function to run is looked up in the module globals
    under 'invname' and is called with the rangelist that
    process_args() returned; its result is returned.
    """
    if debug == 1:
        breakpoint()

    selection = process_args()

    try:
        handler = globals()[invname]
    except (KeyError, TypeError):
        from inspect import trace
        # a one-frame trace means the lookup itself failed in this frame;
        # anything deeper is re-raised
        if len(trace()) != 1:
            raise
        bomb("unimplemented")

    return handler(selection)
###
if __name__ == "__main__":

    # raw arguments and the name we were invoked as; main() dispatches
    # on invname
    args = argv[1:]
    invname = basename(argv[0])

    # move stdin, pdb needs stdio fds itself
    stdinfd = stdin.fileno()
    # select() without a timeout waits until stdin is readable
    if not isatty(stdinfd) and select([stdin], [], [])[0]:
        # input is read in binary from a duplicate of the original fd
        infile = open(dup(stdinfd), 'rb')
        osclose(stdinfd)
        try: stdin = open('/dev/tty')
        except OSError: pass # no ctty, but then pdb would not be in use
    else:
        # no piped input: process_args() rejects this with a usage error
        args = None

    from bdb import BdbQuit
    debug = int(getenv('DEBUG') or 0)
    if debug:
        import pdb
        from pprint import pp
        err('debug: enabled')

    # delimiter defaults, overridden by process_args()
    ifs = None; ofs = '\x20'
    irs = "\n"; ors = "\n"

    try: main()
    except BdbQuit: bomb("debug: stop")
    except SystemExit: raise
    except KeyboardInterrupt: bomb("interrupted")
    except Exception:
        print_exc(file=stderr)
        if debug: pdb.post_mortem()
        # a crash must not be reported to the caller as success
        exit(EXIT_FAILURE)
    finally: # cpython bug 55589
        try: stdout.flush()
        finally:
            try: stdout.close()
            finally:
                try: stderr.flush()
                finally: stderr.close()