/*
 * SPDX-FileCopyrightText: © 2017-2025 Istari Digital, Inc.
 * SPDX-License-Identifier: Apache-2.0
 */

package lex

import (
	"fmt"
	"unicode/utf8"

	"github.com/pkg/errors"

	"github.com/dgraph-io/dgraph/v25/x"
)

// EOF indicates the end of the input.
const EOF = -1

// ItemType is used to set the type of a token. These constants can be defined
// in the file containing state functions. Note that their value should be >= 5.
type ItemType int

const (
	// ItemEOF is emitted when the end of the input is reached.
	ItemEOF ItemType = iota
	// ItemError is emitted when there was an error lexing the input.
	ItemError
)

// StateFn represents the state of the scanner as a function that returns the next state.
type StateFn func(*Lexer) StateFn

// Item represents a unit emitted by the lexer.
type Item struct {
	Typ    ItemType
	Val    string
	line   int
	column int
}

// Errorf returns an error message that includes the line and column where the error occurred.
func (i Item) Errorf(format string, args ...interface{}) error {
	return errors.Errorf("line %d column %d: "+format,
		append([]interface{}{i.line, i.column}, args...)...)
}

// String returns a human-readable representation of the item, satisfying fmt.Stringer.
func (i Item) String() string {
	if i.Typ == ItemEOF {
		return "EOF"
	}
	return fmt.Sprintf("lex.Item [%v] %q at %d:%d", i.Typ, i.Val, i.line, i.column)
}

// ItemIterator iterates over the items emitted by a lexer.
type ItemIterator struct {
	l   *Lexer
	idx int
}

// NewIterator returns a new ItemIterator instance that uses the lexer.
func (l *Lexer) NewIterator() *ItemIterator {
	it := &ItemIterator{
		l:   l,
		idx: -1,
	}
	return it
}

// Errorf returns an error message using the location of the next item in the iterator.
func (p *ItemIterator) Errorf(format string, args ...interface{}) error {
	nextItem, _ := p.PeekOne()
	return nextItem.Errorf(format, args...)
}

// Next advances the iterator by one. It returns false once all items have been consumed.
func (p *ItemIterator) Next() bool {
	p.idx++
	return p.idx < len(p.l.items)
}

// Item returns the current item.
func (p *ItemIterator) Item() Item {
	if p.idx < 0 || p.idx >= len(p.l.items) {
		return Item{
			line:   -1, // using negative numbers to indicate out-of-range item
			column: -1,
		}
	}
	return (p.l.items)[p.idx]
}

// Prev moves the index back by one.
func (p *ItemIterator) Prev() bool {
	if p.idx > 0 {
		p.idx--
		return true
	}
	return false
}

// Restore restores the iterator to the specified position.
func (p *ItemIterator) Restore(pos int) {
	x.AssertTrue(pos <= len(p.l.items) && pos >= -1)
	p.idx = pos
}

// Save returns the current position of the iterator, which can later be passed to Restore.
func (p *ItemIterator) Save() int {
	return p.idx
}

// Peek returns the next num items without consuming them.
func (p *ItemIterator) Peek(num int) ([]Item, error) {
	if (p.idx + num + 1) > len(p.l.items) {
		return nil, errors.Errorf("Out of range for peek")
	}
	return p.l.items[p.idx+1 : p.idx+num+1], nil
}

// PeekOne returns the next item without consuming it.
func (p *ItemIterator) PeekOne() (Item, bool) {
	if p.idx+1 >= len(p.l.items) {
		return Item{
			line:   -1,
			column: -1, // use negative numbers to indicate out of range
		}, false
	}
	return p.l.items[p.idx+1], true
}
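
// exampleIterateItems is an illustrative sketch and not part of the original API. It shows
// how the ItemIterator is typically consumed after a lexer run: Next advances the cursor
// and Item reads the current token. It assumes l.items has already been populated, e.g. by Run.
func exampleIterateItems(l *Lexer) {
	it := l.NewIterator()
	for it.Next() {
		item := it.Item()
		if item.Typ == ItemError {
			fmt.Println("lexing failed:", item.Val)
			return
		}
		if item.Typ == ItemEOF {
			return
		}
		fmt.Println(item) // Item.String() renders the type, value, line and column
	}
}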
// A RuneWidth represents a consecutive string of runes with the same width,
// and the number of runes is stored in count.
// The reason we maintain this information is to properly back up when multiple look-aheads happen.
// For example, if the following sequence of events happens:
//  1. Lexer.Next() consumes 1 byte
//  2. Lexer.Next() consumes 1 byte
//  3. Lexer.Next() consumes 3 bytes
// we would create two RuneWidth trackers, the 1st having width 1 and count 2, and the 2nd having
// width 3 and count 1. Then the following backups can be done properly:
//  4. Lexer.Backup() should decrement the pos by 3
//  5. Lexer.Backup() should decrement the pos by 1
//  6. Lexer.Backup() should decrement the pos by 1
type RuneWidth struct {
	width int
	// count should always be greater than or equal to 1, because we pop a tracker item
	// from the stack when count is about to reach 0.
	count int
}

// Lexer converts a raw input into tokens.
type Lexer struct {
	// NOTE: Using a text scanner wouldn't work because it's designed for parsing
	// Golang. It won't keep track of Start Position, or allow us to retrieve
	// slice from [Start:Pos]. Better to just use normal string.
	Input      string // string being scanned.
	Start      int    // Start Position of this item.
	Pos        int    // current Position of this item.
	Width      int    // Width of last rune read from input.
	widthStack []*RuneWidth
	items      []Item  // scanned items.
	Depth      int     // nesting of {}
	BlockDepth int     // nesting of blocks (e.g. mutation block inside upsert block)
	ArgDepth   int     // nesting of ()
	Mode       StateFn // Default state to go back to after reading a token.
	Line       int     // the current line number corresponding to Start
	Column     int     // the current column number corresponding to Start
}

// Reset resets Lexer fields. It reuses already allocated buffers.
func (l *Lexer) Reset(input string) {
	// Save the slices so we can reuse their underlying arrays.
	item := l.items
	widthStack := l.widthStack
	*l = Lexer{}
	l.Input = input
	l.items = item[:0]
	l.widthStack = widthStack[:0]
	l.Line = 1
}

// ValidateResult verifies whether the entire input can be lexed without errors.
func (l *Lexer) ValidateResult() error {
	it := l.NewIterator()
	for it.Next() {
		item := it.Item()
		if item.Typ == ItemError {
			return errors.New(item.Val)
		}
	}
	return nil
}

// Run executes the given StateFn on the lexer, following the returned states until one of
// them is nil, and returns the lexer.
func (l *Lexer) Run(f StateFn) *Lexer {
	for state := f; state != nil; {
		// The following statement is useful for debugging.
		// fmt.Printf("Func: %v\n", runtime.FuncForPC(reflect.ValueOf(state).Pointer()).Name())
		state = state(l)
	}
	return l
}

// Errorf records an ItemError with the given message and returns a nil StateFn,
// which terminates the lexer.
func (l *Lexer) Errorf(format string, args ...interface{}) StateFn {
	l.items = append(l.items, Item{
		Typ: ItemError,
		Val: fmt.Sprintf("while lexing %v at line %d column %d: "+format,
			append([]interface{}{l.Input, l.Line, l.Column}, args...)...),
		line:   l.Line,
		column: l.Column,
	})
	return nil
}

// Emit emits the item with its type information.
func (l *Lexer) Emit(t ItemType) {
	if t != ItemEOF && l.Pos < l.Start {
		// Let ItemEOF go through.
		return
	}
	item := Item{
		Typ:    t,
		Val:    l.Input[l.Start:l.Pos],
		line:   l.Line,
		column: l.Column,
	}
	l.items = append(l.items, item)
	l.moveStartToPos()
}

// pushWidth records the width of the rune just consumed. If the tracker on top of the stack
// already has the same width, its count is incremented; otherwise a new tracker is pushed.
func (l *Lexer) pushWidth(width int) {
	wl := len(l.widthStack)
	if wl == 0 || l.widthStack[wl-1].width != width {
		l.widthStack = append(l.widthStack, &RuneWidth{
			count: 1,
			width: width,
		})
	} else {
		l.widthStack[wl-1].count++
	}
}

// Next reads the next rune from the Input, records its width on the width stack and advances Pos.
func (l *Lexer) Next() (result rune) {
	if l.Pos >= len(l.Input) {
		l.pushWidth(0)
		return EOF
	}
	r, w := utf8.DecodeRuneInString(l.Input[l.Pos:])
	l.pushWidth(w)
	l.Pos += w
	return r
}
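
// itemExampleWord and exampleLexWords are an illustrative sketch and not part of the original
// API. They show how a StateFn typically drives the lexer: consume runes with Next, undo a
// look-ahead with Backup, skip uninteresting input with IgnoreRun, and emit tokens with Emit.
// Custom item types start at values >= 5, as noted on ItemType above.
const itemExampleWord ItemType = 5

// exampleLexWords splits the input into space-separated words and emits each one.
func exampleLexWords(l *Lexer) StateFn {
	isSpace := func(r rune) bool { return r == ' ' }
	for {
		switch r := l.Next(); {
		case r == EOF:
			l.Emit(ItemEOF)
			return nil
		case r == ' ':
			l.Backup()
			l.IgnoreRun(isSpace) // drop the run of spaces without emitting anything
		default:
			l.AcceptUntil(isSpace) // consume the rest of the word
			l.Emit(itemExampleWord)
		}
	}
}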
// Backup moves the lexer back to its previous position.
func (l *Lexer) Backup() {
	wl := len(l.widthStack)
	x.AssertTruef(wl > 0,
		"Backup should not be called when the width tracker stack is empty")
	rw := l.widthStack[wl-1]
	if rw.count == 1 {
		l.widthStack = l.widthStack[:wl-1] // pop the item from the stack
	} else {
		rw.count--
	}
	l.Pos -= rw.width
}

// Peek returns the next rune without advancing the lexer.
func (l *Lexer) Peek() rune {
	r := l.Next()
	l.Backup()
	return r
}

// PeekTwo returns the next two runes without advancing the lexer.
func (l *Lexer) PeekTwo() []rune {
	r1 := l.Next()
	if r1 == EOF {
		l.Backup()
		return []rune{r1, EOF}
	}
	r2 := l.Next()
	l.Backup()
	l.Backup()
	return []rune{r1, r2}
}

// moveStartToPos advances Start to Pos, updating Line and Column for every rune crossed.
func (l *Lexer) moveStartToPos() {
	// check if we are about to move Start to a new line
	for offset := l.Start; offset < l.Pos; {
		r, w := utf8.DecodeRuneInString(l.Input[offset:l.Pos])
		offset += w
		if IsEndOfLine(r) {
			l.Line++
			l.Column = 0
		} else {
			l.Column += w
		}
	}
	l.Start = l.Pos
}

// Ignore skips the current token. Meant to be used for tokens that do not have any
// syntactical meaning (e.g. comments).
func (l *Lexer) Ignore() {
	l.moveStartToPos()
}

// CheckRune is a predicate signature for accepting valid runes on input.
type CheckRune func(r rune) bool

// CheckRuneRec is like CheckRune, with the Lexer as an extra argument.
// This can be used to recursively call other CheckRune(s).
type CheckRuneRec func(r rune, l *Lexer) bool

// AcceptRun accepts runes based on CheckRune until it returns false or EOF is reached.
// Returns the last rune accepted and a flag indicating whether any rune was accepted.
func (l *Lexer) AcceptRun(c CheckRune) (lastr rune, validr bool) {
	validr = false
	for {
		r := l.Next()
		if r == EOF || !c(r) {
			break
		}
		validr = true
		lastr = r
	}
	l.Backup()
	return lastr, validr
}

// AcceptRunRec accepts runes based on CheckRuneRec until it returns false or EOF is reached.
func (l *Lexer) AcceptRunRec(c CheckRuneRec) {
	for {
		r := l.Next()
		if r == EOF || !c(r, l) {
			break
		}
	}
	l.Backup()
}

// AcceptUntil accepts runes until CheckRune returns true or EOF is reached.
func (l *Lexer) AcceptUntil(c CheckRune) {
	for {
		r := l.Next()
		if r == EOF || c(r) {
			break
		}
	}
	l.Backup()
}

// AcceptRunTimes accepts runes matching CheckRune up to the given number of times.
// It returns the number of runes actually accepted.
func (l *Lexer) AcceptRunTimes(c CheckRune, times int) int {
	i := 0
	for ; i < times; i++ {
		r := l.Next()
		if r == EOF || !c(r) {
			break
		}
	}
	l.Backup()
	return i
}

// IgnoreRun ignores all the runes accepted by the given CheckRune.
func (l *Lexer) IgnoreRun(c CheckRune) {
	l.AcceptRun(c)
	l.Ignore()
}

const (
	quote = '"'
)

// IsEscChar returns true if the rune is an escape character (ECHAR ::= '\' [uvtbnrf"'\])
func (l *Lexer) IsEscChar(r rune) bool {
	switch r {
	case 'u', 'v', 't', 'b', 'n', 'r', 'f', '"', '\'', '\\':
		return true
	}
	return false
}

// IsEndOfLine returns true if the rune is a Linefeed or a Carriage return.
func IsEndOfLine(r rune) bool {
	return r == '\u000A' || r == '\u000D'
}

// LexQuotedString properly processes a quoted string (by taking care of escaped characters).
func (l *Lexer) LexQuotedString() error {
	l.Backup()
	r := l.Next()
	if r != quote {
		return errors.Errorf("String should start with quote.")
	}
	for {
		r := l.Next()
		if r == EOF {
			return errors.Errorf("Unexpected end of input.")
		}
		if r == '\\' {
			r := l.Next()
			if !l.IsEscChar(r) {
				return errors.Errorf("Not a valid escape char: '%c'", r)
			}
			continue // eat the next char
		}
		if r == quote {
			break
		}
	}
	return nil
}
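
// exampleRun is an illustrative sketch and not part of the original API. It ties the pieces
// together: Reset prepares the lexer for fresh input, Run drives the hypothetical
// exampleLexWords state function defined above, ValidateResult reports any ItemError, and
// exampleIterateItems walks the emitted tokens.
func exampleRun(input string) error {
	l := &Lexer{}
	l.Reset(input)
	l.Run(exampleLexWords)
	if err := l.ValidateResult(); err != nil {
		return err
	}
	exampleIterateItems(l)
	return nil
}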