#!/usr/bin/env python3
r"""
field [options] [args]
select and output fields from each record of stdin
args:
- 1-arg: field [options] <fields>
- 2-arg: field [non-field opts] <indelim> <fields>
- 3-arg: field [non-field opts] <indelim> <outdelim> <fields>
- if -r/--regex, indelim is a python regular expression
- field and record delimiters can also be given as options
- fixed delimiters are python string literals
- regex delimiters are python raw strings
fields:
- defaults to whitespace: 'bytes.split(sep=None)'
- if supplied as $1, uses 'bytes.split(sep="$1")'
- if '-r', uses 're.split(r'$1')'
- if '-i <str>', uses 'bytes.split(sep="<str>")'
- if '-F <regex>', uses 're.split(r"<regex>")'
- if $IFS set, splits on its chars: 're.split(f"[{getenv(IFS)}]+")'
- if '-e', all empty fields are discarded
- if '-G', only leading or trailing blank fields are discarded
records:
- defaults to newline: 'bytes.split(sep="\n")'
- if '-I <str>', uses 'bytes.split(sep="<str>")'
- if '-R <regex>', uses 're.split(r"<regex>")'
- if '-E', empty records in input are discarded
- if '-N', trailing separator on last record not emitted
- processes input as bytes, in chunks of io.DEFAULT_BUFFER_SIZE
- chunk size overridden with '-b/--bsz' (optional k or m suffix)
outdelims:
- fields default to space, ie '\x20', or via $2, or via -o/--ofs
- records default to newline, ie '\n', or via -O/--ors
rangelist:
- specifier is a comma separated list of ranges
- ranges are either single numbers or first-last sequences
- unspecified Y in X-Y will default to highest Y in the input
- if first field in range greater than last, print reversed
- if any specified fields do not exist in the record, skip them
- field '0' is the same as all fields (range '1-')
range:
N: just field number N (1 is first)
-N: just field Nth from the end (-1 is last)
N-M: fields N through M (M > N)
N-M: fields M through N, backwards (N > M)
N-: fields N through the end
-N-: fields from Nth from the end, through the end
N--M: fields N through Mth from end
-N--M: fields Nth from the end through Mth from the end
examples:
"echo one two three four five | field 2" -> "two"
"echo one two three four five | field -2" -> "four"
"echo one two three four five | field 2-4" -> "two three four"
"echo one two three four five | field 4-2" -> "four three two"
"echo one two three four five | field 3-" -> "three four five"
"echo one two three four five | field -2-" -> "four five"
"echo one two three | field 1,0" -> "one one two three"
"echo {1..12} | field -8--10" -> "5 4 3"
"echo {1..12} | field -8--15" -> "5 4 3 2 1"
"echo {1..12} | field 3--15" -> "3 2 1"
"echo {1..12} | field 1-3,-1--2,8,8,10-,-2" -> "1 2 3 12 11 8 8 10 11 12 11"
"""
"""
todo:
- read from files given as additional arguments
- /R/-/S/: starting with /R/ and ending with /S/
- specify record selection criteria (pattern)
- field reformatting, eg wrapping, fit in column or on page
- args like -g columnate_group
- different behaviors for range overlaps (set union, addition)
- way to disinclude fields ("all fields except...")
- multiple delimiters, maybe a -i and -d possible for every -f
- multiple delimiters, as in multiple patterns will serve as one
- implement a "record" in terms of "field?"
- flag to discard empty records in output (no non-empty fields)
"""
__url__ = 'https://github.com/smemsh/field/'
__author__ = 'Scott Mcdermott '
__license__ = 'GPL-2.0'
from sys import exit, hexversion
if hexversion < 0x030900f0: exit("minpython: %x" % hexversion)
import argparse
import re
from io import DEFAULT_BUFFER_SIZE as BUFSIZ
from sys import argv, stdin, stdout, stderr
from types import SimpleNamespace
from select import select
from functools import partial
from traceback import print_exc
from os import (
getenv, isatty, dup,
close as osclose,
EX_OK as EXIT_SUCCESS,
EX_SOFTWARE as EXIT_FAILURE,
)
from os.path import basename
###
def err(*args, **kwargs):
    """Print a diagnostic on stderr, prefixed with the invocation name.

    All positional and keyword arguments are handed to print() unchanged;
    the destination is always stderr.
    """
    prefix = f"{invname}:"
    print(prefix, *args, file=stderr, **kwargs)
def bomb(*args, **kwargs):
    """Report a fatal error via err() and exit with EXIT_FAILURE."""
    err(*args, **kwargs)
    # same effect as sys.exit(): it raises SystemExit with this status
    raise SystemExit(EXIT_FAILURE)
###
def process_args():
global ifs, ofs, irs, ors
global args
def usagex(*args, **kwargs):
nonlocal p
p.print_help(file=stderr)
print(file=stderr)
bomb(*args, **kwargs)
# parse_args() gives escaped strings
def unesc(s):
if s is None: return
else: return s.encode('raw-unicode-escape').decode('unicode-escape')
# not clear why the interface gives a prefix arg and defaults it, but
# doesn't allow it to be passed in from anywhere, so we have to override
#
class RawTextHelpFormatterEmptyUsageLine(argparse.RawTextHelpFormatter):
def add_usage(self, usage, actions, groups, prefix=None):
if prefix is None:
prefix = ''
return super(RawTextHelpFormatterEmptyUsageLine, self) \
.add_usage(usage, actions, groups, prefix)
p = argparse.ArgumentParser(
prog = invname,
description = __doc__.strip(),
allow_abbrev = False,
formatter_class = RawTextHelpFormatterEmptyUsageLine,
usage = "",
)
def addopt(p, flagchar, longopt, help=None, /, **kwargs):
options = list(("-%s --%s" % (flagchar, longopt)).split())
p.add_argument(*options, help=help, **kwargs)
def addflag(*args, **kwargs):
addopt(*args, action='store_true', **kwargs)
def hasopt(*options):
return any([getattr(args, a) for a in [*options]])
addopt (p, 'F', 'ifsre', "input field separator as python regex")
addopt (p, 'R', 'irsre', "input record separator as python regex")
addopt (p, 'i', 'ifs', "input field separator as python string")
addopt (p, 'I', 'irs', "input record separator as python string")
addopt (p, 'o', 'ofs', "output field separator string")
addopt (p, 'O', 'ors', "output record separator string")
addopt (p, 'b', 'bsz', "bytes per read, optional m or k suffix")
addflag (p, '0', 'null', "irs is a '\\0' char")
addflag (p, 'z', 'zero', "ors is a '\\0' char")
addflag (p, 'r', 'regex', "positional ifs is a python regex")
addflag (p, 'G', 'noedges', "discard initial or trailing empty fields")
addflag (p, 'e', 'noempty', "discard empty fields within a record")
addflag (p, 'E', 'noblanks', "discard blank records with no fields")
addflag (p, 'N', 'noendrec', "skip ors after last record was emitted")
addflag (p, 'l', 'flushrecs', "do individual writes every record")
if not args: usagex("must supply args on stdin")
args, left = p.parse_known_args(args)
n = len(left)
if n == 0:
fields = '1-'
else:
fields = left[0]
if n == 1:
if args.regex:
bomb("-r/--regex only for positionally specified ifs")
else:
# positional arg syntax
if hasopt('ifs', 'ifsre', 'ofs'):
usagex("ifs/ofs must be either positional or optional")
else:
ifs = left[0]
if not args.regex:
# leave escaped if regex, as in raw string
ifs = unesc(ifs)
if n == 2:
fields = left[1]
elif n == 3:
ofs = unesc(left[1])
fields = left[2]
else:
usagex("bad args")
if not ifs:
ifs = getenv('IFS')
if ifs:
ifs = f"[{ifsenv}]+"
args.regex = True
if args.null and args.irs:
bomb("-0/--null and --irs are mutually exclusive")
if args.zero and args.ors:
bomb("-z/--zero, --ors are mutually exclusive")
if args.null: irs = '\0'
if args.zero: ors = '\0'
ifs = args.ifsre or unesc(args.ifs) or ifs
ofs = unesc(args.ofs) or ofs
irs = args.irsre or unesc(args.irs) or irs
ors = unesc(args.ors) or ors
if b := args.bsz:
if m := re.fullmatch(r'(?P\d+)(?P[km]?)', b, re.I):
b = int(m.group('bsz'))
u = m.group('unit')
if u == 'k': b <<= 10
elif u == 'm': b <<= 20
args.bsz = b
else:
bomb("invalid bsz parameter")
return fields
class FieldRange:
    """One range of a rangelist, with integer endpoints.

    ``start`` is always an int.  ``end`` is an int, or None when the range
    was written open-ended (``N-``).
    """
    def __init__(self, start, end):
        self.start = int(start) # start is always supplied
        self.end = None if end is None else int(end)


def parse_fields(rangelist):
    """Parse a comma-separated rangelist into a list of FieldRange objects.

    Each item is ``START``, ``START-`` or ``START-END``, where either
    endpoint may carry a leading minus sign.  A lone number yields a range
    whose end equals its start.  Exits through bomb() if an item does not
    match this syntax, or if 0 is used as an endpoint of an explicit range.
    """
    # START, optionally followed by '-' and an optional END; the 'isrange'
    # group tells "N" apart from "N-"
    spec = re.compile(r"""
        (?P<start>-?\d+)
        (?P<isrange>-
            (?P<end>-?\d+)?
        )?
    """, re.VERBOSE)

    ranges = []
    for fieldrange in rangelist.split(','):
        match = spec.fullmatch(fieldrange)
        if not match:
            bomb(f"bad field range: {fieldrange}")
        else:
            start, end = match['start'], match['end']
            if not match['isrange']:
                # single field: both endpoints are the same
                end = start
            elif any(p is not None and int(p) == 0 for p in (start, end)):
                bomb("range spec '0' (all fields) excludes start/end")
            ranges.append(FieldRange(start, end))

    return ranges
def field(rangelist):
    """Split ``infile`` into records and fields, and write the selection.

    Input is read as bytes from the module-level ``infile`` in chunks of
    ``args.bsz`` (or BUFSIZ) bytes, cut into records on ``irs`` and into
    fields on ``ifs``.  For every record the fields named by ``rangelist``
    are joined with ``ofs``, followed by ``ors``, and written to stdout.

    The module-level ``ifs``/``irs``/``ofs``/``ors`` strings are converted
    to bytes in place, and ``ors`` is modified for the last record (see
    below).  Exits with EXIT_SUCCESS if stdout is a closed pipe.
    """
    global ors

    ranges = parse_fields(rangelist)
    flush = bool(args.flushrecs)
    out = stdout.buffer

    # input is handled as bytes, so the delimiters have to be bytes too
    g = globals()
    for var in ['ifs', 'irs', 'ofs', 'ors']:
        if g[var]: g[var] = bytes(g[var], 'utf-8')

    r_isre = args.irsre and not args.null
    f_isre = args.ifsre or args.regex
    reflags = re.MULTILINE | re.DOTALL

    # build both splitters once instead of deciding again per chunk/record
    if r_isre:
        # flags belong to the pattern; handing them to re.split()
        # positionally would set maxsplit instead
        split_records = re.compile(irs, reflags).split
    else:
        def split_records(data): return data.split(irs)

    if f_isre:
        split_fields = re.compile(ifs).split
    else:
        def split_fields(data): return data.split(ifs)

    def readbufs(size):
        # read() gives an empty bytes object at eof
        while buf := infile.read(size):
            yield buf

    # readlines() does not accept delim, and no plans (cpython #41622)
    def recordize(bufs):
        pending = b''
        for buf in bufs:
            pieces = split_records(pending + buf)
            # the last piece may continue in the next chunk
            pending = pieces.pop()
            yield from pieces
        if pending:
            # delim at eof terminates input, as in awk
            yield pending

    records = recordize(readbufs(args.bsz or BUFSIZ))
    if args.noblanks:
        # drop empty records before the lookahead below, so that "last
        # record" means the last record actually emitted
        records = (rec for rec in records if rec)

    # handle iteration ourselves, so last loop can behave differently
    try: record = next(records) # preload first record before entering loop
    except StopIteration: return # early exit, not even one input record

    lastloop = False
    while not lastloop:

        fields = split_fields(record)

        if args.noedges:
            # trim empty fields from both ends, keep interior ones
            lo, hi = 0, len(fields)
            while lo < hi and not fields[lo]: lo += 1
            while hi > lo and not fields[hi - 1]: hi -= 1
            fields = fields[lo:hi]

        if args.noempty:
            fields = [f for f in fields if f]

        n = len(fields)
        indices = []
        for r in ranges:
            # 0 (all fields) and a missing end both leave the range open
            open_ended = not r.end
            start = r.start or 1
            end = r.end or n
            # negative endpoints count back from the last field
            if start < 0: start += n + 1
            if end < 0: end += n + 1
            # clamp to fields that exist rather than walking the whole
            # requested span and filtering it
            if start <= end or open_ended:
                # an open-ended range never runs backwards: if it starts
                # past the last field it selects nothing
                span = range(max(start, 1) - 1, min(end, n))
            else:
                span = range(min(start, n) - 1, max(end, 1) - 2, -1)
            indices.extend(span)

        output = ofs.join([fields[i] for i in indices])

        # look ahead, so we know whether this record is the final one
        try: record = next(records)
        except StopIteration:
            lastloop = True
            if args.noendrec:
                ors = b''
            # on a terminal, make sure the output ends with a newline
            if isatty(stdout.fileno()):
                if not ors.endswith(b'\n'):
                    ors += b'\n'

        try:
            out.write(output + ors)
            if flush: out.flush()
        except BrokenPipeError:
            exit(EXIT_SUCCESS) # eg "field | head"
###
def main():
    """Parse the arguments and run the function named after the program.

    Drops into the debugger first when ``debug`` is 1.  The function to run
    is looked up in the module globals under ``invname`` and is called with
    the rangelist returned by process_args(); its return value is returned.
    A failed lookup ends in bomb("unimplemented").
    """
    if debug == 1:
        breakpoint()

    rangelist = process_args()

    try:
        entry = globals()[invname]
    except (KeyError, TypeError):
        from inspect import trace
        # a single traceback frame means the lookup itself failed here;
        # anything deeper is passed on untouched
        if len(trace()) != 1:
            raise
        bomb("unimplemented")

    return entry(rangelist)
###
# Script entry: set up the module-level state used by the functions above
# (args, invname, infile, debug, delimiter defaults), run main(), and make
# sure the standard streams are flushed and closed on the way out.
if __name__ == "__main__":

    args = argv[1:]
    invname = basename(argv[0])

    # move stdin, pdb needs stdio fds itself
    stdinfd = stdin.fileno()
    if not isatty(stdinfd) and select([stdin], [], [])[0]:
        # keep reading the input through a duplicate descriptor, in binary
        infile = open(dup(stdinfd), 'rb')
        osclose(stdinfd)
        try: stdin = open('/dev/tty')
        except OSError: pass # no ctty, but then pdb would not be in use
    else:
        # nothing to read: process_args() reports usage when args is empty
        args = None

    from bdb import BdbQuit
    debug = int(getenv('DEBUG') or 0)
    if debug:
        import pdb
        from pprint import pp
        err('debug: enabled')

    # delimiter defaults: whitespace-split fields, newline-split records
    ifs = None; ofs = '\x20'
    irs = "\n"; ors = "\n"

    try: main()
    except BdbQuit: bomb("debug: stop")
    except SystemExit: raise
    except KeyboardInterrupt: bomb("interrupted")
    except Exception:
        print_exc(file=stderr)
        if debug: pdb.post_mortem()
        # an unexpected error must not look like success to the caller
        exit(EXIT_FAILURE)
    finally: # cpython bug 55589
        try: stdout.flush()
        finally:
            try: stdout.close()
            finally:
                try: stderr.flush()
                finally: stderr.close()