#!/usr/bin/env python3
r"""
field [options] [args]
  select and output fields from each record of stdin

args:
  - 1-arg: field [options] <rangelist>
  - 2-arg: field [non-field opts] <indelim> <rangelist>
  - 3-arg: field [non-field opts] <indelim> <outdelim> <rangelist>
  - if -r/--regex, indelim is a python regular expression
  - field and record delimiters can also be given as options
  - fixed delimiters are python string literals
  - regex delimiters are python raw strings

fields:
  - defaults to whitespace: 'bytes.split(sep=None)'
  - if supplied as $1, uses 'bytes.split(sep="$1")'
  - if '-r', uses 're.split(r'$1')'
  - if '-i <str>', uses 'bytes.split(sep="<str>")'
  - if '-F <regex>', uses 're.split(r"<regex>")'
  - if $IFS set, splits on its chars: 're.split(f"[{getenv(IFS)}]+")'
  - if '-e', all empty fields are discarded
  - if '-G', only leading or trailing blank fields are discarded

records:
  - defaults to newline: 'bytes.split(sep="\n")'
  - if '-I <str>', uses 'bytes.split(sep="<str>")'
  - if '-R <regex>', uses 're.split(r"<regex>")'
  - if '-E', empty records in input are discarded
  - if '-N', trailing separator on last record not emitted
  - processes input as bytes, in chunks of io.DEFAULT_BUFFER_SIZE
  - chunk size overridden with '-b/--bsz' (optional k or m suffix)

outdelims:
  - fields default to space, ie '\x20', or via $2, or via -o/--ofs
  - records default to newline, ie '\n', or via -O/--ors

rangelist:
  - specifier is a comma separated list of ranges
  - ranges are either single numbers or first-last sequences
  - unspecified Y in X-Y will default to highest Y in the input
  - if first field in range greater than last, print reversed
  - if any specified fields do not exist in the record, skip them
  - field '0' is the same as all fields (range '1-')

range:
  N: just field number N (1 is first)
  -N: just field Nth from the end (-1 is last)
  N-M: fields N through M (M > N)
  N-M: fields M through N, backwards (N > M)
  N-: fields N through the end
  -N-: fields from Nth from the end, through the end
  N--M: fields N through Mth from end
  -N--M: fields Nth from the end through Mth from the end

examples:
  "echo one two three four five | field 2" -> "two"
  "echo one two three four five | field -2" -> "four"
  "echo one two three four five | field 2-4" -> "two three four"
  "echo one two three four five | field 4-2" -> "four three two"
  "echo one two three four five | field 3-" -> "three four five"
  "echo one two three four five | field -2-" -> "four five"
  "echo one two three | field 1,0" -> "one one two three"
  "echo {1..12} | field -8--10" -> "5 4 3"
  "echo {1..12} | field -8--15" -> "5 4 3 2 1"
  "echo {1..12} | field 3--15" -> "3 2 1"
  "echo {1..12} | field 1-3,-1--2,8,8,10-,-2" -> "1 2 3 12 11 8 8 10 11 12 11"

"""
"""
todo:
  - read from files given as additional arguments
  - /R/-/S/: starting with /R/ and ending with /S/
  - specify record selection criteria (pattern)
  - field reformatting, eg wrapping, fit in column or on page
  - args like -g columnate_group
  - different behaviors for range overlaps (set union, addition)
  - way to disinclude fields ("all fields except...")
  - multiple delimiters, maybe a -i and -d possible for every -f
  - multiple delimiters, as in multiple patterns will serve as one
  - implement a "record" in terms of "field?"
  - flag to discard empty records in output (no non-empty fields)

"""
__url__     = 'https://github.com/smemsh/field/'
__author__  = 'Scott Mcdermott '
__license__ = 'GPL-2.0'

from sys import exit, hexversion
if hexversion < 0x030900f0: exit("minpython: %x" % hexversion)

import argparse
import re

from io import DEFAULT_BUFFER_SIZE as BUFSIZ
from sys import argv, stdin, stdout, stderr
from types import SimpleNamespace
from select import select
from functools import partial
from traceback import print_exc

from os import (
    getenv, isatty, dup,
    close as osclose,
    EX_OK as EXIT_SUCCESS,
    EX_SOFTWARE as EXIT_FAILURE,
)
from os.path import basename

###

def err(*args, **kwargs):
    """Print a message to stderr, prefixed with the invocation name."""
    print(f"{invname}:", *args, file=stderr, **kwargs)

def bomb(*args, **kwargs):
    """Print a message to stderr and exit with EXIT_FAILURE."""
    err(*args, **kwargs)
    exit(EXIT_FAILURE)

###

def process_args():
    """Parse the command line and set the delimiter globals.

    On entry the global 'args' holds the raw argument list (or None when
    there is no readable stdin); on return it holds the parsed argparse
    namespace, and the globals ifs/ofs/irs/ors hold the delimiters as str.

    Returns the rangelist string selecting which fields to output.
    Exits via bomb()/usagex() on any usage error.
    """
    global ifs, ofs, irs, ors
    global args

    def usagex(*args, **kwargs):
        # full help first, then the specific complaint, then exit
        p.print_help(file=stderr)
        print(file=stderr)
        bomb(*args, **kwargs)

    # parse_args() gives escaped strings
    def unesc(s):
        if s is None:
            return None
        return s.encode('raw-unicode-escape').decode('unicode-escape')

    # not clear why the interface gives a prefix arg and defaults it, but
    # doesn't allow it to be passed in from anywhere, so we have to override
    #
    class RawTextHelpFormatterEmptyUsageLine(argparse.RawTextHelpFormatter):
        def add_usage(self, usage, actions, groups, prefix=None):
            if prefix is None:
                prefix = ''
            return super().add_usage(usage, actions, groups, prefix)

    p = argparse.ArgumentParser(
        prog            = invname,
        description     = __doc__.strip(),
        allow_abbrev    = False,
        formatter_class = RawTextHelpFormatterEmptyUsageLine,
        usage           = "",
    )

    def addopt(p, flagchar, longopt, help=None, /, **kwargs):
        options = [f"-{flagchar}", f"--{longopt}"]
        p.add_argument(*options, help=help, **kwargs)

    def addflag(*args, **kwargs):
        addopt(*args, action='store_true', **kwargs)

    def hasopt(*options):
        # 'args' here is the module global (the parsed namespace)
        return any(getattr(args, a) for a in options)

    addopt(p, 'F', 'ifsre', "input field separator as python regex")
    addopt(p, 'R', 'irsre', "input record separator as python regex")
    addopt(p, 'i', 'ifs', "input field separator as python string")
    addopt(p, 'I', 'irs', "input record separator as python string")
    addopt(p, 'o', 'ofs', "output field separator string")
    addopt(p, 'O', 'ors', "output record separator string")
    addopt(p, 'b', 'bsz', "bytes per read, optional m or k suffix")
    addflag(p, '0', 'null', "irs is a '\\0' char")
    addflag(p, 'z', 'zero', "ors is a '\\0' char")
    addflag(p, 'r', 'regex', "positional ifs is a python regex")
    addflag(p, 'G', 'noedges', "discard initial or trailing empty fields")
    addflag(p, 'e', 'noempty', "discard empty fields within a record")
    addflag(p, 'E', 'noblanks', "discard blank records with no fields")
    addflag(p, 'N', 'noendrec', "skip ors after last record was emitted")
    addflag(p, 'l', 'flushrecs', "do individual writes every record")

    if not args:
        usagex("must supply args on stdin")

    args, left = p.parse_known_args(args)
    n = len(left)

    if n <= 1:
        # no positional delimiters: zero positionals selects all fields
        # (previously this fell through to left[0] and raised IndexError)
        fields = left[0] if n else '1-'
        if args.regex:
            bomb("-r/--regex only for positionally specified ifs")
    else:
        # positional arg syntax
        if hasopt('ifs', 'ifsre', 'ofs'):
            usagex("ifs/ofs must be either positional or optional")
        ifs = left[0]
        if not args.regex:
            # leave escaped if regex, as in raw string
            ifs = unesc(ifs)
        if n == 2:
            fields = left[1]
        elif n == 3:
            ofs = unesc(left[1])
            fields = left[2]
        else:
            usagex("bad args")

    # $IFS is only a fallback: an explicit -i/-F must not be forced into
    # regex mode just because the environment happens to export IFS
    if not ifs and not hasopt('ifs', 'ifsre'):
        ifsenv = getenv('IFS')
        if ifsenv:
            # escape so chars like ']' '^' '-' '\' stay literal in the class
            ifs = f"[{re.escape(ifsenv)}]+"
            args.regex = True

    if args.null and args.irs:
        bomb("-0/--null and --irs are mutually exclusive")
    if args.zero and args.ors:
        bomb("-z/--zero, --ors are mutually exclusive")
    if args.null: irs = '\0'
    if args.zero: ors = '\0'

    ifs = args.ifsre or unesc(args.ifs) or ifs
    ofs = unesc(args.ofs) or ofs
    irs = args.irsre or unesc(args.irs) or irs
    ors = unesc(args.ors) or ors

    if b := args.bsz:
        m = re.fullmatch(r'(?P<bsz>\d+)(?P<unit>[km]?)', b, re.I)
        if not m:
            bomb("invalid bsz parameter")
        b = int(m.group('bsz'))
        # suffix matched case-insensitively, so compare that way too
        u = m.group('unit').lower()
        if u == 'k': b <<= 10
        elif u == 'm': b <<= 20
        args.bsz = b

    return fields


class FieldRange:
    """One parsed range: 1-based start/end, negative counts from the end.

    'end' is None for an open-ended range ("N-"); start 0 means all fields.
    """
    def __init__(self, start, end):
        self.start = int(start) # start is always supplied
        self.end = None if end is None else int(end)

    def __repr__(self):
        return f"{type(self).__name__}({self.start!r}, {self.end!r})"


def parse_fields(rangelist):
    """Parse a comma separated rangelist string into FieldRange objects.

    Exits via bomb() on a malformed range, or when '0' (all fields) is
    used as an endpoint of an explicit start-end range.
    """
    ranges = []
    for fieldrange in rangelist.split(','):
        match = re.fullmatch(r"""
            (?P<start>-?\d+)
            (?P<isrange>-
            (?P<end>-?\d+)?
            )?
        """, fieldrange, re.VERBOSE)
        if not match:
            bomb(f"bad field range: {fieldrange}")
        m = SimpleNamespace(**match.groupdict())
        if not m.isrange:
            # single field: a range of one
            m.end = m.start
        else:
            for endpoint in [m.start, m.end]:
                if endpoint is not None and int(endpoint) == 0:
                    bomb("range spec '0' (all fields) excludes start/end")
        del m.isrange # only start/end are FieldRange parameters
        ranges.append(FieldRange(**vars(m)))

    return ranges


# readlines() does not accept delim, and no plans (cpython #41622)
def recordize(bufs, delim, isre=False, flags=re.MULTILINE | re.DOTALL):
    """Yield records from an iterable of bytes chunks, split on delim.

    bufs:  iterable of bytes chunks; an empty chunk ends the input
    delim: bytes separator, or a bytes regex pattern if isre is true
    flags: re flags used when isre is true

    The unterminated tail of each chunk is carried into the next one; a
    non-empty tail at end of input is yielded as the final record.
    """
    # flags must go to compile(): the third positional arg of re.split()
    # is maxsplit, which would silently cap the number of splits
    resplit = re.compile(delim, flags).split if isre else None
    pending = b''
    for buf in bufs:
        if not buf:
            break
        pending += buf
        records = resplit(pending) if isre else pending.split(delim)
        pending = records.pop()
        yield from records
    if pending:
        # delim at eof terminates input, as in awk
        yield pending


def strip_edges(fields):
    """Return fields without its leading and trailing empty entries."""
    first, last = 0, len(fields)
    while first < last and not fields[first]:
        first += 1
    while last > first and not fields[last - 1]:
        last -= 1
    return fields[first:last]


def select_indices(ranges, n):
    """Expand FieldRange objects into 0-based indices for an n-field record.

    Negative endpoints count from the end, a missing end means the last
    field, start > end gives descending order, and any index that falls
    outside the record is skipped.
    """
    indices = []
    for r in ranges:
        start = r.start or 1
        end = r.end or n
        if start < 0: start = n - abs(start) + 1
        if end < 0: end = n - abs(end) + 1
        step = 1 if start <= end else -1
        offsets = range(start - 1, end - 1 + step, step)
        indices += [i for i in offsets if 0 <= i < n]
    return indices


def field(rangelist):
    """Read records from infile and write the selected fields to stdout.

    Uses the module globals ifs/irs/ofs/ors (converted to bytes here),
    the parsed 'args' namespace, and the binary input stream 'infile'.
    """
    global ors

    ranges = parse_fields(rangelist)
    flush = bool(args.flushrecs)
    out = stdout.buffer

    # all processing is done on bytes
    g = globals()
    for var in ['ifs', 'irs', 'ofs', 'ors']:
        if g[var]:
            g[var] = bytes(g[var], 'utf-8')

    r_isre = bool(args.irsre) and not args.null
    f_isre = bool(args.ifsre or args.regex)

    if f_isre:
        split_fields = re.compile(ifs).split # compile once, not per record
    else:
        def split_fields(rec):
            return rec.split(ifs) # ifs None splits on whitespace

    # reads return bytes, so the end-of-input sentinel must be bytes too
    bufs = iter(partial(infile.read, args.bsz or BUFSIZ), b'')
    records = recordize(bufs, irs, r_isre)
    if args.noblanks:
        # drop blank records before the lookahead below, so the "last
        # record" is the last one actually emitted (and so skipping a
        # blank record always advances the input)
        records = (rec for rec in records if rec)

    # handle iteration ourselves, so last loop can behave differently
    try:
        record = next(records) # preload first record before entering loop
    except StopIteration:
        return # early exit, not even one input record

    lastloop = False
    while True:
        fields = split_fields(record)

        if args.noedges:
            fields = strip_edges(fields)
        if args.noempty:
            fields = [f for f in fields if f]

        indices = select_indices(ranges, len(fields))
        output = ofs.join([fields[i] for i in indices])

        try:
            record = next(records)
        except StopIteration:
            lastloop = True
            if args.noendrec:
                ors = b''
            # NOTE(review): applied to the final record only; the collapsed
            # source did not make the original nesting unambiguous
            if isatty(stdout.fileno()):
                if not ors.endswith(b'\n'):
                    ors += b'\n'

        output += ors
        try:
            out.write(output)
            if flush:
                out.flush()
        except BrokenPipeError:
            exit(EXIT_SUCCESS) # eg "field | head"

        if lastloop:
            break

###

def main():
    """Parse arguments, then dispatch to the function named like argv[0]."""
    if debug == 1:
        breakpoint()

    rangelist = process_args()

    try:
        subprogram = globals()[invname]
    except (KeyError, TypeError):
        from inspect import trace
        # only "unimplemented" if the lookup itself failed right here
        if len(trace()) == 1:
            bomb("unimplemented")
        else:
            raise

    return subprogram(rangelist)

###

if __name__ == "__main__":

    args = argv[1:]
    invname = basename(argv[0])

    # move stdin, pdb needs stdio fds itself
    stdinfd = stdin.fileno()
    if not isatty(stdinfd) and select([stdin], [], [])[0]:
        infile = open(dup(stdinfd), 'rb')
        osclose(stdinfd)
        try:
            stdin = open('/dev/tty')
        except OSError:
            pass # no ctty, but then pdb would not be in use
    else:
        args = None

    from bdb import BdbQuit
    debug = int(getenv('DEBUG') or 0)
    if debug:
        import pdb
        from pprint import pp
        err('debug: enabled')

    ifs = None; ofs = '\x20'
    irs = "\n"; ors = "\n"

    try:
        main()
    except BdbQuit:
        bomb("debug: stop")
    except SystemExit:
        raise
    except KeyboardInterrupt:
        bomb("interrupted")
    except Exception:
        print_exc(file=stderr)
        if debug:
            pdb.post_mortem()
        # an unexpected crash must not report success to the caller
        exit(EXIT_FAILURE)
    finally: # cpython bug 55589
        try:
            stdout.flush()
        finally:
            try:
                stdout.close()
            finally:
                try:
                    stderr.flush()
                finally:
                    stderr.close()