-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
lexer.go
430 lines (383 loc) · 11 KB
/
lexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
/*
* Copyright 2015-2025 Hypermode Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package lex
import (
"fmt"
"unicode/utf8"
"github.com/pkg/errors"
"github.com/hypermodeinc/dgraph/v24/x"
)
// EOF is the sentinel value returned by Lexer.Next once the end of the input
// has been reached.
const EOF = -1
// ItemType identifies the kind of a token. Additional ItemType constants can be
// defined in the file containing the state functions. Note that their value
// should be >= 5; this package itself only defines ItemEOF and ItemError.
type ItemType int
// Item types predefined by this package.
const (
// ItemEOF is emitted when the end of the input is reached. Its value is 0.
ItemEOF ItemType = iota
// ItemError is emitted when there was an error lexing the input. Its value is 1.
ItemError
)
// StateFn represents the state of the scanner as a function that returns the
// next state. Lexer.Run keeps calling the returned function until it is nil.
type StateFn func(*Lexer) StateFn
// Item represents a unit emitted by the lexer.
type Item struct {
Typ ItemType // kind of the item
Val string // text of the item; for ItemError items created by Lexer.Errorf it holds the error message
line int // value of Lexer.Line when the item was created
column int // value of Lexer.Column when the item was created
}
// Errorf formats an error from format and args and prefixes it with the line
// and column stored on the item.
func (i Item) Errorf(format string, args ...interface{}) error {
	// The location values come first because the prefix consumes them.
	params := make([]interface{}, 0, len(args)+2)
	params = append(params, i.line, i.column)
	params = append(params, args...)
	return errors.Errorf("line %d column %d: "+format, params...)
}
// String renders the item for debugging output. An ItemEOF item is rendered as
// "EOF"; every other item shows its type, quoted value and line:column.
func (i Item) String() string {
	switch i.Typ {
	case ItemEOF:
		return "EOF"
	default:
		return fmt.Sprintf("lex.Item [%v] %q at %d:%d", i.Typ, i.Val, i.line, i.column)
	}
}
// ItemIterator iterates over the items emitted by a lexer.
type ItemIterator struct {
l *Lexer // lexer whose items are iterated over
idx int // index of the current item in l.items; NewIterator starts it at -1
}
// NewIterator returns a new ItemIterator over the items of this lexer. The
// iterator starts at index -1, so Next must be called to reach the first item.
func (l *Lexer) NewIterator() *ItemIterator {
	return &ItemIterator{l: l, idx: -1}
}
// Errorf returns an error whose message carries the location of the next item
// in the iterator.
func (p *ItemIterator) Errorf(format string, args ...interface{}) error {
	// The boolean is deliberately ignored: when there is no next item, PeekOne
	// still returns an Item, with line and column set to -1.
	upcoming, _ := p.PeekOne()
	return upcoming.Errorf(format, args...)
}
// Next advances the iterator by one and reports whether the new position
// refers to an existing item. The index is incremented even when it moves
// past the last item.
func (p *ItemIterator) Next() bool {
	p.idx++
	return len(p.l.items) > p.idx
}
// Item returns the item at the current position. When the position is outside
// the item list it returns an Item with line and column set to -1.
func (p *ItemIterator) Item() Item {
	if inRange := 0 <= p.idx && p.idx < len(p.l.items); inRange {
		return p.l.items[p.idx]
	}
	// Negative numbers indicate an out-of-range item.
	return Item{line: -1, column: -1}
}
// Prev moves the index back by one and reports whether it moved. The index is
// left untouched (and false is returned) when it is already at 0 or below.
func (p *ItemIterator) Prev() bool {
	if p.idx <= 0 {
		return false
	}
	p.idx--
	return true
}
// Restore moves the iterator to the given position, typically one obtained
// from Save. The position is checked with x.AssertTrue to lie between -1 and
// the number of items, both inclusive.
func (p *ItemIterator) Restore(pos int) {
	withinBounds := pos >= -1 && pos <= len(p.l.items)
	x.AssertTrue(withinBounds)
	p.idx = pos
}
// Save returns the current position of the iterator, which can be passed to
// Restore later to return to it.
func (p *ItemIterator) Save() int {
return p.idx
}
// Peek returns the next num items without consuming them.
//
// It returns an error when fewer than num items remain after the current
// position, or when num is negative. A negative num used to pass the range
// check and then panic in the slice expression, because the high bound ended
// up below the low bound.
//
// The returned slice shares its backing array with the lexer's item list.
func (p *ItemIterator) Peek(num int) ([]Item, error) {
	if num < 0 || (p.idx+num+1) > len(p.l.items) {
		return nil, errors.Errorf("Out of range for peek")
	}
	return p.l.items[p.idx+1 : p.idx+num+1], nil
}
// PeekOne returns the item right after the current position without consuming
// it. The boolean is false when there is no such item; the returned Item then
// has line and column set to -1.
func (p *ItemIterator) PeekOne() (Item, bool) {
	next := p.idx + 1
	if next < len(p.l.items) {
		return p.l.items[next], true
	}
	// Negative numbers indicate an out-of-range item.
	return Item{line: -1, column: -1}, false
}
// A RuneWidth represents a run of consecutively read runes that all have the
// same byte width; the number of runes in the run is stored in count.
// The lexer keeps a stack of these so that it can back up properly after
// multiple look-aheads. For example, if the following sequence of events happens
// 1. Lexer.Next() consumes 1 byte
// 2. Lexer.Next() consumes 1 byte
// 3. Lexer.Next() consumes 3 bytes
// the stack holds two RuneWidth entries, the 1st having width 1 and count 2 and
// the 2nd having width 3 and count 1, so that the following backups can be done
// properly:
// 4. Lexer.Backup() should decrement the pos by 3
// 5. Lexer.Backup() should decrement the pos by 1
// 6. Lexer.Backup() should decrement the pos by 1
type RuneWidth struct {
width int
// count should always be greater than or equal to 1, because an entry is
// popped from the stack when its count is about to reach 0
count int
}
// Lexer converts a raw input into tokens.
type Lexer struct {
// NOTE: Using a text scanner wouldn't work because it's designed for parsing
// Golang. It won't keep track of Start Position, or allow us to retrieve
// slice from [Start:Pos]. Better to just use normal string.
Input string // string being scanned.
Start int // byte offset in Input where the current item starts.
Pos int // byte offset in Input of the current read position.
Width int // Width of last rune read from input. NOTE(review): no code visible in this file writes this field.
widthStack []*RuneWidth // widths of the runes read by Next, used by Backup.
items []Item // scanned items, in the order they were appended.
Depth int // nesting of {}
BlockDepth int // nesting of blocks (e.g. mutation block inside upsert block)
ArgDepth int // nesting of ()
Mode StateFn // Default state to go back to after reading a token.
Line int // the current line number corresponding to Start; Reset sets it to 1
Column int // the current column number corresponding to Start, counted in bytes since the last line break
}
// Reset prepares the lexer for scanning input. Every field goes back to its
// zero value except Input (set to input) and Line (set to 1). The items and
// widthStack slices are truncated to length zero rather than dropped, so the
// buffers they already allocated are reused.
func (l *Lexer) Reset(input string) {
	// The right-hand side is evaluated in full before *l is overwritten, so the
	// old slices can still be read here.
	*l = Lexer{
		Input:      input,
		items:      l.items[:0],
		widthStack: l.widthStack[:0],
		Line:       1,
	}
}
// ValidateResult checks the items produced so far and returns an error built
// from the value of the first ItemError item found, or nil if there is none.
func (l *Lexer) ValidateResult() error {
	for it := l.NewIterator(); it.Next(); {
		if current := it.Item(); current.Typ == ItemError {
			return errors.New(current.Val)
		}
	}
	return nil
}
// Run drives the lexer: it calls f, then whatever state function each call
// returns, until a call returns nil. It returns the lexer itself.
func (l *Lexer) Run(f StateFn) *Lexer {
	state := f
	for state != nil {
		// The following statement is useful for debugging.
		// fmt.Printf("Func: %v\n", runtime.FuncForPC(reflect.ValueOf(state).Pointer()).Name())
		state = state(l)
	}
	return l
}
// Errorf records an ItemError item whose value is the formatted message,
// prefixed with the input being lexed and the current line and column. It
// always returns a nil StateFn, which makes Run stop when a state function
// returns the result of this call.
func (l *Lexer) Errorf(format string, args ...interface{}) StateFn {
	// The prefix consumes the input, line and column before the caller's args.
	params := make([]interface{}, 0, len(args)+3)
	params = append(params, l.Input, l.Line, l.Column)
	params = append(params, args...)
	message := fmt.Sprintf("while lexing %v at line %d column %d: "+format, params...)

	l.items = append(l.items, Item{
		Typ:    ItemError,
		Val:    message,
		line:   l.Line,
		column: l.Column,
	})
	return nil
}
// Emit emits the text scanned since Start as an item of type t, tagged with
// the current line and column, and then moves Start up to Pos.
//
// Nothing is emitted when no input has been consumed since the last item
// (Pos <= Start), so zero-length tokens never reach the item list. ItemEOF is
// the one exception: it is always emitted even though it carries no text.
func (l *Lexer) Emit(t ItemType) {
	// The check has to be <=, not <: an empty token has Pos == Start, and the
	// ItemEOF exemption only makes sense if such tokens are otherwise dropped.
	if t != ItemEOF && l.Pos <= l.Start {
		return
	}
	l.items = append(l.items, Item{
		Typ:    t,
		Val:    l.Input[l.Start:l.Pos],
		line:   l.Line,
		column: l.Column,
	})
	l.moveStartToPos()
}
// pushWidth records that a rune of the given byte width was just read. If the
// entry on top of the width stack has the same width its count is bumped;
// otherwise a new entry with count 1 is pushed.
func (l *Lexer) pushWidth(width int) {
	if n := len(l.widthStack); n > 0 && l.widthStack[n-1].width == width {
		l.widthStack[n-1].count++
		return
	}
	l.widthStack = append(l.widthStack, &RuneWidth{width: width, count: 1})
}
// Next decodes the rune at Pos, records its byte width on the width stack and
// advances Pos past it. At the end of the input it returns EOF, records a
// width of 0 (so that a later Backup is a no-op on Pos) and leaves Pos alone.
func (l *Lexer) Next() (result rune) {
	if len(l.Input) <= l.Pos {
		l.pushWidth(0)
		return EOF
	}
	decoded, size := utf8.DecodeRuneInString(l.Input[l.Pos:])
	l.pushWidth(size)
	l.Pos += size
	return decoded
}
// Backup undoes the most recent Next by moving Pos back by the width recorded
// on top of the width stack. The stack must not be empty; this is checked with
// x.AssertTruef.
func (l *Lexer) Backup() {
	depth := len(l.widthStack)
	x.AssertTruef(depth > 0,
		"Backup should not be called when the width tracker stack is empty")

	top := l.widthStack[depth-1]
	l.Pos -= top.width
	if top.count != 1 {
		// More runes of this width remain in the run; just shrink it.
		top.count--
		return
	}
	// That was the last rune of the run: pop the entry from the stack.
	l.widthStack = l.widthStack[:depth-1]
}
// Peek returns the next rune (or EOF) without advancing the lexer: it reads
// the rune with Next and immediately undoes the read with Backup.
func (l *Lexer) Peek() rune {
	upcoming := l.Next()
	l.Backup()
	return upcoming
}
// PeekTwo returns the next two runes without advancing the lexer. The result
// always has length two; when the first read hits the end of the input both
// entries are EOF and the second read is not attempted.
func (l *Lexer) PeekTwo() []rune {
	first := l.Next()
	second := rune(EOF)
	reads := 1
	if first != EOF {
		second = l.Next()
		reads++
	}
	// Undo exactly as many reads as were performed.
	for ; reads > 0; reads-- {
		l.Backup()
	}
	return []rune{first, second}
}
// moveStartToPos advances Start to Pos, updating Line and Column for every
// rune in between. A rune for which IsEndOfLine is true increments Line and
// resets Column to 0; any other rune adds its byte width to Column.
// NOTE(review): carriage return and line feed each count as a line break, so
// a "\r\n" pair advances Line twice.
func (l *Lexer) moveStartToPos() {
	cursor := l.Start
	for cursor < l.Pos {
		r, size := utf8.DecodeRuneInString(l.Input[cursor:l.Pos])
		cursor += size
		if !IsEndOfLine(r) {
			l.Column += size
			continue
		}
		l.Line++
		l.Column = 0
	}
	l.Start = l.Pos
}
// Ignore skips the current token: Start is moved up to Pos (keeping Line and
// Column up to date) without appending an item. Meant to be used for tokens
// that do not have any syntactical meaning (e.g. comments).
func (l *Lexer) Ignore() {
l.moveStartToPos()
}
// CheckRune is the predicate signature used by the Accept* helpers to test
// runes read from the input.
type CheckRune func(r rune) bool
// CheckRuneRec is like CheckRune with the Lexer as an extra argument.
// This can be used to recursively call other CheckRune(s).
type CheckRuneRec func(r rune, l *Lexer) bool
// AcceptRun consumes runes for as long as c returns true for them, stopping at
// the first rejected rune or at EOF; that rune is left unread.
// It returns the last rune accepted and whether any rune was accepted at all.
func (l *Lexer) AcceptRun(c CheckRune) (lastr rune, validr bool) {
	for r := l.Next(); r != EOF && c(r); r = l.Next() {
		lastr, validr = r, true
	}
	// The read that ended the loop is undone here.
	l.Backup()
	return lastr, validr
}
// AcceptRunRec consumes runes for as long as c returns true for them, stopping
// at the first rejected rune or at EOF. The predicate also receives the lexer.
// The read that ended the loop is undone with Backup.
func (l *Lexer) AcceptRunRec(c CheckRuneRec) {
	accepting := true
	for accepting {
		r := l.Next()
		accepting = r != EOF && c(r, l)
	}
	l.Backup()
}
// AcceptUntil consumes runes until c returns true for one of them or EOF is
// reached. The rune that satisfied c (or the EOF marker) is left unread.
func (l *Lexer) AcceptUntil(c CheckRune) {
	for r := l.Next(); r != EOF && !c(r); r = l.Next() {
		// Keep consuming; the loop header does all the work.
	}
	l.Backup()
}
// AcceptRunTimes consumes up to times runes for which c returns true and
// returns how many were accepted.
//
// Scanning stops early at the first rejected rune or at EOF; only in that case
// is the offending read undone with Backup. When all times runes are accepted
// they all stay consumed. The previous unconditional Backup un-read the last
// accepted rune in that case, and backed up a rune that was never read when
// times was zero or negative.
func (l *Lexer) AcceptRunTimes(c CheckRune, times int) int {
	accepted := 0
	for ; accepted < times; accepted++ {
		r := l.Next()
		if r == EOF || !c(r) {
			// Un-read only the rune that ended the run.
			l.Backup()
			break
		}
	}
	return accepted
}
// IgnoreRun consumes all the runes accepted by the given CheckRune (via
// AcceptRun) and then discards them with Ignore, so no item is emitted for them.
func (l *Lexer) IgnoreRun(c CheckRune) {
l.AcceptRun(c)
l.Ignore()
}
const (
// quote is the rune that opens and closes a quoted string in LexQuotedString.
quote = '"'
)
// IsEscChar returns true if the rune may follow a backslash in an escape
// sequence (ECHAR ::= '\' [uvtbnrf"'\]).
func (l *Lexer) IsEscChar(r rune) bool {
	escapable := [...]rune{'u', 'v', 't', 'b', 'n', 'r', 'f', '"', '\'', '\\'}
	for _, candidate := range escapable {
		if r == candidate {
			return true
		}
	}
	return false
}
// IsEndOfLine returns true if the rune is a line feed (U+000A) or a carriage
// return (U+000D).
func IsEndOfLine(r rune) bool {
	switch r {
	case '\n', '\r':
		return true
	}
	return false
}
// LexQuotedString consumes a double-quoted string, honouring backslash escapes.
// It expects the opening quote to be the rune most recently read, and on
// success leaves Pos just past the closing quote.
//
// It returns an error if the string does not start with a quote, if the input
// ends before the closing quote, or if a backslash is followed by a rune that
// IsEscChar rejects.
func (l *Lexer) LexQuotedString() error {
	// Step back so that the opening quote can be re-read and verified.
	l.Backup()
	if opening := l.Next(); opening != quote {
		return errors.Errorf("String should start with quote.")
	}
	for {
		switch r := l.Next(); r {
		case EOF:
			return errors.Errorf("Unexpected end of input.")
		case quote:
			return nil
		case '\\':
			// Consume the escaped rune so that an escaped quote does not end
			// the string.
			if escaped := l.Next(); !l.IsEscChar(escaped) {
				return errors.Errorf("Not a valid escape char: '%c'", escaped)
			}
		}
	}
}