// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5package text
6
7import (
8 "bytes"
9 "fmt"
10 "io"
11 "regexp"
12 "strconv"
13 "unicode/utf8"
14
15 "google.golang.org/protobuf/internal/errors"
16)
17
// Decoder is a token-based textproto decoder.
//
// A Decoder is created with NewDecoder and yields tokens one at a time through
// Read and Peek.
type Decoder struct {
	// lastCall is the last method called, either readCall or peekCall.
	// Its initial (zero) value is readCall.
	lastCall call

	// lastToken contains the last read token. Its kind drives what parseNext
	// accepts as the following token.
	lastToken Token

	// lastErr contains the error returned by the Read performed on behalf of
	// the most recent Peek.
	lastErr error

	// openStack is a stack containing the byte characters for MessageOpen and
	// ListOpen kinds. The top of stack represents the message or the list that
	// the current token is nested in. An empty stack means the current token is
	// at the top level message. The characters '{' and '<' both represent the
	// MessageOpen kind.
	openStack []byte

	// orig is the complete input given to NewDecoder. It is used in reporting
	// line and column.
	orig []byte
	// in contains the unconsumed input. It is always a suffix of orig.
	in []byte
}
42
43// NewDecoder returns a Decoder to read the given []byte.
44func NewDecoder(b []byte) *Decoder {
45 return &Decoder{orig: b, in: b}
46}
47
// ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
// Its message is formatted from io.ErrUnexpectedEOF.
var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
50
// call specifies which Decoder method was invoked.
type call uint8

const (
	// readCall records that Read was the last method called. It is the zero
	// value of call, so a freshly created Decoder starts in this state.
	readCall call = iota
	// peekCall records that Peek was the last method called, i.e. that
	// lastToken and lastErr hold a result not yet handed out by Read.
	peekCall
)
58
59// Peek looks ahead and returns the next token and error without advancing a read.
60func (d *Decoder) Peek() (Token, error) {
61 defer func() { d.lastCall = peekCall }()
62 if d.lastCall == readCall {
63 d.lastToken, d.lastErr = d.Read()
64 }
65 return d.lastToken, d.lastErr
66}
67
68// Read returns the next token.
69// It will return an error if there is no valid token.
70func (d *Decoder) Read() (Token, error) {
71 defer func() { d.lastCall = readCall }()
72 if d.lastCall == peekCall {
73 return d.lastToken, d.lastErr
74 }
75
76 tok, err := d.parseNext(d.lastToken.kind)
77 if err != nil {
78 return Token{}, err
79 }
80
81 switch tok.kind {
82 case comma, semicolon:
83 tok, err = d.parseNext(tok.kind)
84 if err != nil {
85 return Token{}, err
86 }
87 }
88 d.lastToken = tok
89 return tok, nil
90}
91
// Format strings for syntax errors reported by parseNext. Each takes the
// offending character as its single argument.
const (
	// mismatchedFmt is used when a message is closed with the close character
	// of the other bracket style (for example '{' closed by '>').
	mismatchedFmt = "mismatched close character %q"
	// unexpectedFmt is used when a character other than ']' or ',' follows a
	// value inside a list.
	unexpectedFmt = "unexpected character %q"
)
96
97// parseNext parses the next Token based on given last kind.
98func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
99 // Trim leading spaces.
100 d.consume(0)
101 isEOF := false
102 if len(d.in) == 0 {
103 isEOF = true
104 }
105
106 switch lastKind {
107 case EOF:
108 return d.consumeToken(EOF, 0, 0), nil
109
110 case bof:
111 // Start of top level message. Next token can be EOF or Name.
112 if isEOF {
113 return d.consumeToken(EOF, 0, 0), nil
114 }
115 return d.parseFieldName()
116
117 case Name:
118 // Next token can be MessageOpen, ListOpen or Scalar.
119 if isEOF {
120 return Token{}, ErrUnexpectedEOF
121 }
122 switch ch := d.in[0]; ch {
123 case '{', '<':
124 d.pushOpenStack(ch)
125 return d.consumeToken(MessageOpen, 1, 0), nil
126 case '[':
127 d.pushOpenStack(ch)
128 return d.consumeToken(ListOpen, 1, 0), nil
129 default:
130 return d.parseScalar()
131 }
132
133 case Scalar:
134 openKind, closeCh := d.currentOpenKind()
135 switch openKind {
136 case bof:
137 // Top level message.
138 // Next token can be EOF, comma, semicolon or Name.
139 if isEOF {
140 return d.consumeToken(EOF, 0, 0), nil
141 }
142 switch d.in[0] {
143 case ',':
144 return d.consumeToken(comma, 1, 0), nil
145 case ';':
146 return d.consumeToken(semicolon, 1, 0), nil
147 default:
148 return d.parseFieldName()
149 }
150
151 case MessageOpen:
152 // Next token can be MessageClose, comma, semicolon or Name.
153 if isEOF {
154 return Token{}, ErrUnexpectedEOF
155 }
156 switch ch := d.in[0]; ch {
157 case closeCh:
158 d.popOpenStack()
159 return d.consumeToken(MessageClose, 1, 0), nil
160 case otherCloseChar[closeCh]:
161 return Token{}, d.newSyntaxError(mismatchedFmt, ch)
162 case ',':
163 return d.consumeToken(comma, 1, 0), nil
164 case ';':
165 return d.consumeToken(semicolon, 1, 0), nil
166 default:
167 return d.parseFieldName()
168 }
169
170 case ListOpen:
171 // Next token can be ListClose or comma.
172 if isEOF {
173 return Token{}, ErrUnexpectedEOF
174 }
175 switch ch := d.in[0]; ch {
176 case ']':
177 d.popOpenStack()
178 return d.consumeToken(ListClose, 1, 0), nil
179 case ',':
180 return d.consumeToken(comma, 1, 0), nil
181 default:
182 return Token{}, d.newSyntaxError(unexpectedFmt, ch)
183 }
184 }
185
186 case MessageOpen:
187 // Next token can be MessageClose or Name.
188 if isEOF {
189 return Token{}, ErrUnexpectedEOF
190 }
191 _, closeCh := d.currentOpenKind()
192 switch ch := d.in[0]; ch {
193 case closeCh:
194 d.popOpenStack()
195 return d.consumeToken(MessageClose, 1, 0), nil
196 case otherCloseChar[closeCh]:
197 return Token{}, d.newSyntaxError(mismatchedFmt, ch)
198 default:
199 return d.parseFieldName()
200 }
201
202 case MessageClose:
203 openKind, closeCh := d.currentOpenKind()
204 switch openKind {
205 case bof:
206 // Top level message.
207 // Next token can be EOF, comma, semicolon or Name.
208 if isEOF {
209 return d.consumeToken(EOF, 0, 0), nil
210 }
211 switch ch := d.in[0]; ch {
212 case ',':
213 return d.consumeToken(comma, 1, 0), nil
214 case ';':
215 return d.consumeToken(semicolon, 1, 0), nil
216 default:
217 return d.parseFieldName()
218 }
219
220 case MessageOpen:
221 // Next token can be MessageClose, comma, semicolon or Name.
222 if isEOF {
223 return Token{}, ErrUnexpectedEOF
224 }
225 switch ch := d.in[0]; ch {
226 case closeCh:
227 d.popOpenStack()
228 return d.consumeToken(MessageClose, 1, 0), nil
229 case otherCloseChar[closeCh]:
230 return Token{}, d.newSyntaxError(mismatchedFmt, ch)
231 case ',':
232 return d.consumeToken(comma, 1, 0), nil
233 case ';':
234 return d.consumeToken(semicolon, 1, 0), nil
235 default:
236 return d.parseFieldName()
237 }
238
239 case ListOpen:
240 // Next token can be ListClose or comma
241 if isEOF {
242 return Token{}, ErrUnexpectedEOF
243 }
244 switch ch := d.in[0]; ch {
245 case closeCh:
246 d.popOpenStack()
247 return d.consumeToken(ListClose, 1, 0), nil
248 case ',':
249 return d.consumeToken(comma, 1, 0), nil
250 default:
251 return Token{}, d.newSyntaxError(unexpectedFmt, ch)
252 }
253 }
254
255 case ListOpen:
256 // Next token can be ListClose, MessageStart or Scalar.
257 if isEOF {
258 return Token{}, ErrUnexpectedEOF
259 }
260 switch ch := d.in[0]; ch {
261 case ']':
262 d.popOpenStack()
263 return d.consumeToken(ListClose, 1, 0), nil
264 case '{', '<':
265 d.pushOpenStack(ch)
266 return d.consumeToken(MessageOpen, 1, 0), nil
267 default:
268 return d.parseScalar()
269 }
270
271 case ListClose:
272 openKind, closeCh := d.currentOpenKind()
273 switch openKind {
274 case bof:
275 // Top level message.
276 // Next token can be EOF, comma, semicolon or Name.
277 if isEOF {
278 return d.consumeToken(EOF, 0, 0), nil
279 }
280 switch ch := d.in[0]; ch {
281 case ',':
282 return d.consumeToken(comma, 1, 0), nil
283 case ';':
284 return d.consumeToken(semicolon, 1, 0), nil
285 default:
286 return d.parseFieldName()
287 }
288
289 case MessageOpen:
290 // Next token can be MessageClose, comma, semicolon or Name.
291 if isEOF {
292 return Token{}, ErrUnexpectedEOF
293 }
294 switch ch := d.in[0]; ch {
295 case closeCh:
296 d.popOpenStack()
297 return d.consumeToken(MessageClose, 1, 0), nil
298 case otherCloseChar[closeCh]:
299 return Token{}, d.newSyntaxError(mismatchedFmt, ch)
300 case ',':
301 return d.consumeToken(comma, 1, 0), nil
302 case ';':
303 return d.consumeToken(semicolon, 1, 0), nil
304 default:
305 return d.parseFieldName()
306 }
307
308 default:
309 // It is not possible to have this case. Let it panic below.
310 }
311
312 case comma, semicolon:
313 openKind, closeCh := d.currentOpenKind()
314 switch openKind {
315 case bof:
316 // Top level message. Next token can be EOF or Name.
317 if isEOF {
318 return d.consumeToken(EOF, 0, 0), nil
319 }
320 return d.parseFieldName()
321
322 case MessageOpen:
323 // Next token can be MessageClose or Name.
324 if isEOF {
325 return Token{}, ErrUnexpectedEOF
326 }
327 switch ch := d.in[0]; ch {
328 case closeCh:
329 d.popOpenStack()
330 return d.consumeToken(MessageClose, 1, 0), nil
331 case otherCloseChar[closeCh]:
332 return Token{}, d.newSyntaxError(mismatchedFmt, ch)
333 default:
334 return d.parseFieldName()
335 }
336
337 case ListOpen:
338 if lastKind == semicolon {
339 // It is not be possible to have this case as logic here
340 // should not have produced a semicolon Token when inside a
341 // list. Let it panic below.
342 break
343 }
344 // Next token can be MessageOpen or Scalar.
345 if isEOF {
346 return Token{}, ErrUnexpectedEOF
347 }
348 switch ch := d.in[0]; ch {
349 case '{', '<':
350 d.pushOpenStack(ch)
351 return d.consumeToken(MessageOpen, 1, 0), nil
352 default:
353 return d.parseScalar()
354 }
355 }
356 }
357
358 line, column := d.Position(len(d.orig) - len(d.in))
359 panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
360}
361
// otherCloseChar maps each message close character to the close character of
// the other bracket style. parseNext uses it to detect a message that was
// opened with one style ('{' or '<') but is being closed with the other.
var otherCloseChar = map[byte]byte{
	'}': '>',
	'>': '}',
}
366
367// currentOpenKind indicates whether current position is inside a message, list
368// or top-level message by returning MessageOpen, ListOpen or bof respectively.
369// If the returned kind is either a MessageOpen or ListOpen, it also returns the
370// corresponding closing character.
371func (d *Decoder) currentOpenKind() (Kind, byte) {
372 if len(d.openStack) == 0 {
373 return bof, 0
374 }
375 openCh := d.openStack[len(d.openStack)-1]
376 switch openCh {
377 case '{':
378 return MessageOpen, '}'
379 case '<':
380 return MessageOpen, '>'
381 case '[':
382 return ListOpen, ']'
383 }
384 panic(fmt.Sprintf("Decoder: openStack contains invalid byte %s", string(openCh)))
385}
386
// pushOpenStack pushes ch onto openStack. Callers in this file pass the
// opening character ('{', '<' or '[') of the message or list being entered.
func (d *Decoder) pushOpenStack(ch byte) {
	d.openStack = append(d.openStack, ch)
}
390
391func (d *Decoder) popOpenStack() {
392 d.openStack = d.openStack[:len(d.openStack)-1]
393}
394
395// parseFieldName parses field name and separator.
396func (d *Decoder) parseFieldName() (tok Token, err error) {
397 defer func() {
398 if err == nil && d.tryConsumeChar(':') {
399 tok.attrs |= hasSeparator
400 }
401 }()
402
403 // Extension or Any type URL.
404 if d.in[0] == '[' {
405 return d.parseTypeName()
406 }
407
408 // Identifier.
409 if size := parseIdent(d.in, false); size > 0 {
410 return d.consumeToken(Name, size, uint8(IdentName)), nil
411 }
412
413 // Field number. Identify if input is a valid number that is not negative
414 // and is decimal integer within 32-bit range.
415 if num := parseNumber(d.in); num.size > 0 {
416 if !num.neg && num.kind == numDec {
417 if _, err := strconv.ParseInt(string(d.in[:num.size]), 10, 32); err == nil {
418 return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
419 }
420 }
421 return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size])
422 }
423
424 return Token{}, d.newSyntaxError("invalid field name: %s", errRegexp.Find(d.in))
425}
426
427// parseTypeName parses Any type URL or extension field name. The name is
428// enclosed in [ and ] characters. The C++ parser does not handle many legal URL
429// strings. This implementation is more liberal and allows for the pattern
430// ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed
431// in between [ ], '.', '/' and the sub names.
432func (d *Decoder) parseTypeName() (Token, error) {
433 startPos := len(d.orig) - len(d.in)
434 // Use alias s to advance first in order to use d.in for error handling.
435 // Caller already checks for [ as first character.
436 s := consume(d.in[1:], 0)
437 if len(s) == 0 {
438 return Token{}, ErrUnexpectedEOF
439 }
440
441 var name []byte
442 for len(s) > 0 && isTypeNameChar(s[0]) {
443 name = append(name, s[0])
444 s = s[1:]
445 }
446 s = consume(s, 0)
447
448 var closed bool
449 for len(s) > 0 && !closed {
450 switch {
451 case s[0] == ']':
452 s = s[1:]
453 closed = true
454
455 case s[0] == '/', s[0] == '.':
456 if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
457 return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
458 d.orig[startPos:len(d.orig)-len(s)+1])
459 }
460 name = append(name, s[0])
461 s = s[1:]
462 s = consume(s, 0)
463 for len(s) > 0 && isTypeNameChar(s[0]) {
464 name = append(name, s[0])
465 s = s[1:]
466 }
467 s = consume(s, 0)
468
469 default:
470 return Token{}, d.newSyntaxError(
471 "invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
472 }
473 }
474
475 if !closed {
476 return Token{}, ErrUnexpectedEOF
477 }
478
479 // First character cannot be '.'. Last character cannot be '.' or '/'.
480 size := len(name)
481 if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
482 return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
483 d.orig[startPos:len(d.orig)-len(s)])
484 }
485
486 d.in = s
487 endPos := len(d.orig) - len(d.in)
488 d.consume(0)
489
490 return Token{
491 kind: Name,
492 attrs: uint8(TypeName),
493 pos: startPos,
494 raw: d.orig[startPos:endPos],
495 str: string(name),
496 }, nil
497}
498
// isTypeNameChar reports whether b may appear in a sub name of a type URL or
// extension field name: an ASCII letter, an ASCII digit, '-' or '_'.
func isTypeNameChar(b byte) bool {
	switch {
	case b == '-', b == '_':
		return true
	case '0' <= b && b <= '9':
		return true
	case 'a' <= b && b <= 'z':
		return true
	case 'A' <= b && b <= 'Z':
		return true
	}
	return false
}
505
// isWhiteSpace reports whether b is a space, newline, carriage return or
// horizontal tab.
func isWhiteSpace(b byte) bool {
	return b == ' ' || b == '\n' || b == '\r' || b == '\t'
}
514
515// parseIdent parses an unquoted proto identifier and returns size.
516// If allowNeg is true, it allows '-' to be the first character in the
517// identifier. This is used when parsing literal values like -infinity, etc.
518// Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
519func parseIdent(input []byte, allowNeg bool) int {
520 var size int
521
522 s := input
523 if len(s) == 0 {
524 return 0
525 }
526
527 if allowNeg && s[0] == '-' {
528 s = s[1:]
529 size++
530 if len(s) == 0 {
531 return 0
532 }
533 }
534
535 switch {
536 case s[0] == '_',
537 'a' <= s[0] && s[0] <= 'z',
538 'A' <= s[0] && s[0] <= 'Z':
539 s = s[1:]
540 size++
541 default:
542 return 0
543 }
544
545 for len(s) > 0 && (s[0] == '_' ||
546 'a' <= s[0] && s[0] <= 'z' ||
547 'A' <= s[0] && s[0] <= 'Z' ||
548 '0' <= s[0] && s[0] <= '9') {
549 s = s[1:]
550 size++
551 }
552
553 if len(s) > 0 && !isDelim(s[0]) {
554 return 0
555 }
556
557 return size
558}
559
560// parseScalar parses for a string, literal or number value.
561func (d *Decoder) parseScalar() (Token, error) {
562 if d.in[0] == '"' || d.in[0] == '\'' {
563 return d.parseStringValue()
564 }
565
566 if tok, ok := d.parseLiteralValue(); ok {
567 return tok, nil
568 }
569
570 if tok, ok := d.parseNumberValue(); ok {
571 return tok, nil
572 }
573
574 return Token{}, d.newSyntaxError("invalid scalar value: %s", errRegexp.Find(d.in))
575}
576
577// parseLiteralValue parses a literal value. A literal value is used for
578// bools, special floats and enums. This function simply identifies that the
579// field value is a literal.
580func (d *Decoder) parseLiteralValue() (Token, bool) {
581 size := parseIdent(d.in, true)
582 if size == 0 {
583 return Token{}, false
584 }
585 return d.consumeToken(Scalar, size, literalValue), true
586}
587
588// consumeToken constructs a Token for given Kind from d.in and consumes given
589// size-length from it.
590func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
591 // Important to compute raw and pos before consuming.
592 tok := Token{
593 kind: kind,
594 attrs: attrs,
595 pos: len(d.orig) - len(d.in),
596 raw: d.in[:size],
597 }
598 d.consume(size)
599 return tok
600}
601
602// newSyntaxError returns a syntax error with line and column information for
603// current position.
604func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
605 e := errors.New(f, x...)
606 line, column := d.Position(len(d.orig) - len(d.in))
607 return errors.New("syntax error (line %d:%d): %v", line, column, e)
608}
609
610// Position returns line and column number of given index of the original input.
611// It will panic if index is out of range.
612func (d *Decoder) Position(idx int) (line int, column int) {
613 b := d.orig[:idx]
614 line = bytes.Count(b, []byte("\n")) + 1
615 if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
616 b = b[i+1:]
617 }
618 column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
619 return line, column
620}
621
622func (d *Decoder) tryConsumeChar(c byte) bool {
623 if len(d.in) > 0 && d.in[0] == c {
624 d.consume(1)
625 return true
626 }
627 return false
628}
629
630// consume consumes n bytes of input and any subsequent whitespace or comments.
631func (d *Decoder) consume(n int) {
632 d.in = consume(d.in, n)
633 return
634}
635
// consume drops the first n bytes of b and any whitespace or comments that
// follow, and returns what remains. A comment starts at '#' and runs through
// the end of the line. If a comment extends to the end of the input, nil is
// returned.
func consume(b []byte, n int) []byte {
	rest := b[n:]
	for len(rest) > 0 {
		c := rest[0]
		if c == '#' {
			eol := bytes.IndexByte(rest, '\n')
			if eol < 0 {
				// Unterminated comment: nothing is left.
				return nil
			}
			rest = rest[eol+len("\n"):]
			continue
		}
		if c != ' ' && c != '\n' && c != '\r' && c != '\t' {
			break
		}
		rest = rest[1:]
	}
	return rest
}
655
// errRegexp matches, at the start of the input, any sequence that looks like
// a non-delimiter or, failing that, a single character other than newline.
// It is used to extract the offending text for error reporting.
var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
658
// isDelim returns true if given byte is a delimiter character, i.e. anything
// other than an ASCII letter, an ASCII digit, '-', '+', '.' or '_'.
func isDelim(c byte) bool {
	switch {
	case c == '-', c == '+', c == '.', c == '_':
		return false
	case 'a' <= c && c <= 'z':
		return false
	case 'A' <= c && c <= 'Z':
		return false
	case '0' <= c && c <= '9':
		return false
	}
	return true
}
665}