blob: c685e565e08cbf752f42119558e107db1bf8fbed [file] [log] [blame]
Zack Williamse940c7a2019-08-21 14:25:39 -07001package protoparse
2
3import (
4 "bufio"
5 "bytes"
6 "errors"
7 "fmt"
8 "io"
9 "strconv"
10 "strings"
11 "unicode/utf8"
12)
13
// runeReader wraps a bufio.Reader and adds an unbounded push-back stack so
// the lexer can look ahead by several runes and then return them.
type runeReader struct {
	rr     *bufio.Reader
	unread []rune // push-back stack; the last element is returned first
	err    error  // first error from the underlying reader; sticky
}

// readRune returns the next rune, its size in bytes, and any error.
//
// Pushed-back runes are returned first, most recently pushed first. Only when
// the stack is empty is the sticky error (if any) reported or the underlying
// reader consulted. Draining the stack before looking at the sticky error
// matters: the lexer routinely pushes runes back after a lookahead read hit
// EOF, and those runes must still be delivered before EOF is reported.
//
// For a pushed-back rune the size is utf8.RuneLen(r), which is -1 if r is not
// a valid rune value.
func (rr *runeReader) readRune() (r rune, size int, err error) {
	if n := len(rr.unread); n > 0 {
		r = rr.unread[n-1]
		rr.unread = rr.unread[:n-1]
		return r, utf8.RuneLen(r), nil
	}
	if rr.err != nil {
		return 0, 0, rr.err
	}
	r, size, err = rr.rr.ReadRune()
	if err != nil {
		// remember the failure so every later read reports it too
		rr.err = err
	}
	return r, size, err
}

// unreadRune pushes r back so that the next call to readRune returns it.
// Any number of runes may be pushed; they come back in reverse order.
func (rr *runeReader) unreadRune(r rune) {
	rr.unread = append(rr.unread, r)
}
39
40func lexError(l protoLexer, pos *SourcePos, err string) {
41 pl := l.(*protoLex)
42 if pl.err == nil {
43 pl.err = ErrorWithSourcePos{Underlying: errors.New(err), Pos: pos}
44 }
45}
46
47type protoLex struct {
48 filename string
49 input *runeReader
50 err error
51 res *fileNode
52
53 lineNo int
54 colNo int
55 offset int
56
57 prevSym terminalNode
58}
59
60func newLexer(in io.Reader) *protoLex {
61 return &protoLex{input: &runeReader{rr: bufio.NewReader(in)}}
62}
63
64var keywords = map[string]int{
65 "syntax": _SYNTAX,
66 "import": _IMPORT,
67 "weak": _WEAK,
68 "public": _PUBLIC,
69 "package": _PACKAGE,
70 "option": _OPTION,
71 "true": _TRUE,
72 "false": _FALSE,
73 "inf": _INF,
74 "nan": _NAN,
75 "repeated": _REPEATED,
76 "optional": _OPTIONAL,
77 "required": _REQUIRED,
78 "double": _DOUBLE,
79 "float": _FLOAT,
80 "int32": _INT32,
81 "int64": _INT64,
82 "uint32": _UINT32,
83 "uint64": _UINT64,
84 "sint32": _SINT32,
85 "sint64": _SINT64,
86 "fixed32": _FIXED32,
87 "fixed64": _FIXED64,
88 "sfixed32": _SFIXED32,
89 "sfixed64": _SFIXED64,
90 "bool": _BOOL,
91 "string": _STRING,
92 "bytes": _BYTES,
93 "group": _GROUP,
94 "oneof": _ONEOF,
95 "map": _MAP,
96 "extensions": _EXTENSIONS,
97 "to": _TO,
98 "max": _MAX,
99 "reserved": _RESERVED,
100 "enum": _ENUM,
101 "message": _MESSAGE,
102 "extend": _EXTEND,
103 "service": _SERVICE,
104 "rpc": _RPC,
105 "stream": _STREAM,
106 "returns": _RETURNS,
107}
108
109func (l *protoLex) cur() *SourcePos {
110 return &SourcePos{
111 Filename: l.filename,
112 Offset: l.offset,
113 Line: l.lineNo + 1,
114 Col: l.colNo + 1,
115 }
116}
117
118func (l *protoLex) prev() *SourcePos {
119 if l.prevSym == nil {
120 return &SourcePos{
121 Filename: l.filename,
122 Offset: 0,
123 Line: 1,
124 Col: 1,
125 }
126 }
127 return l.prevSym.start()
128}
129
130func (l *protoLex) Lex(lval *protoSymType) int {
131 if l.err != nil {
132 // if we are already in a failed state, bail
133 lval.err = l.err
134 return _ERROR
135 }
136
137 prevLineNo := l.lineNo
138 prevColNo := l.colNo
139 prevOffset := l.offset
140 var comments []*comment
141
142 pos := func() posRange {
143 return posRange{
144 start: &SourcePos{
145 Filename: l.filename,
146 Offset: prevOffset,
147 Line: prevLineNo + 1,
148 Col: prevColNo + 1,
149 },
150 end: l.cur(),
151 }
152 }
153 basic := func() basicNode {
154 return basicNode{
155 posRange: pos(),
156 leading: comments,
157 }
158 }
159 setPrev := func(n terminalNode) {
160 nStart := n.start().Line
161 if _, ok := n.(*basicNode); ok {
162 // if the node is a simple rune, don't attribute comments to it
163 // HACK: adjusting the start line makes leading comments appear
164 // detached so logic below will naturally associated trailing
165 // comment to previous symbol
166 nStart += 2
167 }
168 if l.prevSym != nil && len(n.leadingComments()) > 0 && l.prevSym.end().Line < nStart {
169 // we may need to re-attribute the first comment to
170 // instead be previous node's trailing comment
171 prevEnd := l.prevSym.end().Line
172 comments := n.leadingComments()
173 c := comments[0]
174 commentStart := c.start.Line
175 if commentStart == prevEnd {
176 // comment is on same line as previous symbol
177 n.popLeadingComment()
178 l.prevSym.pushTrailingComment(c)
179 } else if commentStart == prevEnd+1 {
180 // comment is right after previous symbol; see if it is detached
181 // and if so re-attribute
182 singleLineStyle := strings.HasPrefix(c.text, "//")
183 line := c.end.Line
184 groupEnd := -1
185 for i := 1; i < len(comments); i++ {
186 c := comments[i]
187 newGroup := false
188 if !singleLineStyle || c.start.Line > line+1 {
189 // we've found a gap between comments, which means the
190 // previous comments were detached
191 newGroup = true
192 } else {
193 line = c.end.Line
194 singleLineStyle = strings.HasPrefix(comments[i].text, "//")
195 if !singleLineStyle {
196 // we've found a switch from // comments to /*
197 // consider that a new group which means the
198 // previous comments were detached
199 newGroup = true
200 }
201 }
202 if newGroup {
203 groupEnd = i
204 break
205 }
206 }
207
208 if groupEnd == -1 {
209 // just one group of comments; we'll mark it as a trailing
210 // comment if it immediately follows previous symbol and is
211 // detached from current symbol
212 c1 := comments[0]
213 c2 := comments[len(comments)-1]
214 if c1.start.Line <= prevEnd+1 && c2.end.Line < nStart-1 {
215 groupEnd = len(comments)
216 }
217 }
218
219 for i := 0; i < groupEnd; i++ {
220 l.prevSym.pushTrailingComment(n.popLeadingComment())
221 }
222 }
223 }
224
225 l.prevSym = n
226 }
227 setString := func(val string) {
228 b := basic()
229 lval.str = &stringLiteralNode{val: val}
230 lval.str.setRange(&b, &b)
231 setPrev(lval.str)
232 }
233 setIdent := func(val string, kind identKind) {
234 lval.id = &identNode{basicNode: basic(), val: val, kind: kind}
235 setPrev(lval.id)
236 }
237 setInt := func(val uint64) {
238 lval.ui = &intLiteralNode{basicNode: basic(), val: val}
239 setPrev(lval.ui)
240 }
241 setFloat := func(val float64) {
242 b := basic()
243 lval.f = &floatLiteralNode{val: val}
244 lval.f.setRange(&b, &b)
245 setPrev(lval.f)
246 }
247 setRune := func() {
248 b := basic()
249 lval.b = &b
250 setPrev(lval.b)
251 }
252 setError := func(err error) {
253 lval.err = err
254 l.err = err
255 }
256
257 for {
258 c, n, err := l.input.readRune()
259 if err == io.EOF {
260 // we're not actually returning a rune, but this will associate
261 // accumulated comments as a trailing comment on last symbol
262 // (if appropriate)
263 setRune()
264 return 0
265 } else if err != nil {
266 setError(err)
267 return _ERROR
268 }
269
270 prevLineNo = l.lineNo
271 prevColNo = l.colNo
272 prevOffset = l.offset
273
274 l.offset += n
275 if c == '\n' {
276 l.colNo = 0
277 l.lineNo++
278 continue
279 } else if c == '\r' {
280 continue
281 }
282 l.colNo++
283 if c == ' ' || c == '\t' {
284 continue
285 }
286
287 if c == '.' {
288 // tokens that start with a dot include type names and decimal literals
289 cn, _, err := l.input.readRune()
290 if err != nil {
291 setRune()
292 return int(c)
293 }
294 if cn == '_' || (cn >= 'a' && cn <= 'z') || (cn >= 'A' && cn <= 'Z') {
295 l.colNo++
296 token := []rune{c, cn}
297 token = l.readIdentifier(token)
298 setIdent(string(token), identTypeName)
299 return _TYPENAME
300 }
301 if cn >= '0' && cn <= '9' {
302 l.colNo++
303 token := []rune{c, cn}
304 token = l.readNumber(token, false, true)
305 f, err := strconv.ParseFloat(string(token), 64)
306 if err != nil {
307 setError(err)
308 return _ERROR
309 }
310 setFloat(f)
311 return _FLOAT_LIT
312 }
313 l.input.unreadRune(cn)
314 setRune()
315 return int(c)
316 }
317
318 if c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') {
319 // identifier
320 token := []rune{c}
321 token = l.readIdentifier(token)
322 str := string(token)
323 if strings.Contains(str, ".") {
324 setIdent(str, identQualified)
325 return _FQNAME
326 }
327 if t, ok := keywords[str]; ok {
328 setIdent(str, identSimpleName)
329 return t
330 }
331 setIdent(str, identSimpleName)
332 return _NAME
333 }
334
335 if c >= '0' && c <= '9' {
336 // integer or float literal
337 if c == '0' {
338 cn, _, err := l.input.readRune()
339 if err != nil {
340 setInt(0)
341 return _INT_LIT
342 }
343 if cn == 'x' || cn == 'X' {
344 cnn, _, err := l.input.readRune()
345 if err != nil {
346 l.input.unreadRune(cn)
347 setInt(0)
348 return _INT_LIT
349 }
350 if (cnn >= '0' && cnn <= '9') || (cnn >= 'a' && cnn <= 'f') || (cnn >= 'A' && cnn <= 'F') {
351 // hexadecimal!
352 l.colNo += 2
353 token := []rune{cnn}
354 token = l.readHexNumber(token)
355 ui, err := strconv.ParseUint(string(token), 16, 64)
356 if err != nil {
357 setError(err)
358 return _ERROR
359 }
360 setInt(ui)
361 return _INT_LIT
362 }
363 l.input.unreadRune(cnn)
364 l.input.unreadRune(cn)
365 setInt(0)
366 return _INT_LIT
367 } else {
368 l.input.unreadRune(cn)
369 }
370 }
371 token := []rune{c}
372 token = l.readNumber(token, true, true)
373 numstr := string(token)
374 if strings.Contains(numstr, ".") || strings.Contains(numstr, "e") || strings.Contains(numstr, "E") {
375 // floating point!
376 f, err := strconv.ParseFloat(numstr, 64)
377 if err != nil {
378 setError(err)
379 return _ERROR
380 }
381 setFloat(f)
382 return _FLOAT_LIT
383 }
384 // integer! (decimal or octal)
385 ui, err := strconv.ParseUint(numstr, 0, 64)
386 if err != nil {
387 setError(err)
388 return _ERROR
389 }
390 setInt(ui)
391 return _INT_LIT
392 }
393
394 if c == '\'' || c == '"' {
395 // string literal
396 str, err := l.readStringLiteral(c)
397 if err != nil {
398 setError(err)
399 return _ERROR
400 }
401 setString(str)
402 return _STRING_LIT
403 }
404
405 if c == '/' {
406 // comment
407 cn, _, err := l.input.readRune()
408 if err != nil {
409 setRune()
410 return int(c)
411 }
412 if cn == '/' {
413 l.colNo++
414 hitNewline, txt := l.skipToEndOfLineComment()
415 commentPos := pos()
416 commentPos.end.Col++
417 if hitNewline {
418 l.colNo = 0
419 l.lineNo++
420 }
421 comments = append(comments, &comment{posRange: commentPos, text: txt})
422 continue
423 }
424 if cn == '*' {
425 l.colNo++
426 if txt, ok := l.skipToEndOfBlockComment(); !ok {
427 setError(errors.New("block comment never terminates, unexpected EOF"))
428 return _ERROR
429 } else {
430 comments = append(comments, &comment{posRange: pos(), text: txt})
431 }
432 continue
433 }
434 l.input.unreadRune(cn)
435 }
436
437 setRune()
438 return int(c)
439 }
440}
441
442func (l *protoLex) readNumber(sofar []rune, allowDot bool, allowExp bool) []rune {
443 token := sofar
444 for {
445 c, _, err := l.input.readRune()
446 if err != nil {
447 break
448 }
449 if c == '.' {
450 if !allowDot {
451 l.input.unreadRune(c)
452 break
453 }
454 allowDot = false
455 cn, _, err := l.input.readRune()
456 if err != nil {
457 l.input.unreadRune(c)
458 break
459 }
460 if cn < '0' || cn > '9' {
461 l.input.unreadRune(cn)
462 l.input.unreadRune(c)
463 break
464 }
465 l.colNo++
466 token = append(token, c)
467 c = cn
468 } else if c == 'e' || c == 'E' {
469 if !allowExp {
470 l.input.unreadRune(c)
471 break
472 }
473 allowExp = false
474 cn, _, err := l.input.readRune()
475 if err != nil {
476 l.input.unreadRune(c)
477 break
478 }
479 if cn == '-' || cn == '+' {
480 cnn, _, err := l.input.readRune()
481 if err != nil {
482 l.input.unreadRune(cn)
483 l.input.unreadRune(c)
484 break
485 }
486 if cnn < '0' || cnn > '9' {
487 l.input.unreadRune(cnn)
488 l.input.unreadRune(cn)
489 l.input.unreadRune(c)
490 break
491 }
492 l.colNo++
493 token = append(token, c)
494 c = cn
495 cn = cnn
496 } else if cn < '0' || cn > '9' {
497 l.input.unreadRune(cn)
498 l.input.unreadRune(c)
499 break
500 }
501 l.colNo++
502 token = append(token, c)
503 c = cn
504 } else if c < '0' || c > '9' {
505 l.input.unreadRune(c)
506 break
507 }
508 l.colNo++
509 token = append(token, c)
510 }
511 return token
512}
513
514func (l *protoLex) readHexNumber(sofar []rune) []rune {
515 token := sofar
516 for {
517 c, _, err := l.input.readRune()
518 if err != nil {
519 break
520 }
521 if (c < 'a' || c > 'f') && (c < 'A' || c > 'F') && (c < '0' || c > '9') {
522 l.input.unreadRune(c)
523 break
524 }
525 l.colNo++
526 token = append(token, c)
527 }
528 return token
529}
530
531func (l *protoLex) readIdentifier(sofar []rune) []rune {
532 token := sofar
533 for {
534 c, _, err := l.input.readRune()
535 if err != nil {
536 break
537 }
538 if c == '.' {
539 cn, _, err := l.input.readRune()
540 if err != nil {
541 l.input.unreadRune(c)
542 break
543 }
544 if cn != '_' && (cn < 'a' || cn > 'z') && (cn < 'A' || cn > 'Z') {
545 l.input.unreadRune(cn)
546 l.input.unreadRune(c)
547 break
548 }
549 l.colNo++
550 token = append(token, c)
551 c = cn
552 } else if c != '_' && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && (c < '0' || c > '9') {
553 l.input.unreadRune(c)
554 break
555 }
556 l.colNo++
557 token = append(token, c)
558 }
559 return token
560}
561
562func (l *protoLex) readStringLiteral(quote rune) (string, error) {
563 var buf bytes.Buffer
564 for {
565 c, _, err := l.input.readRune()
566 if err != nil {
567 if err == io.EOF {
568 err = io.ErrUnexpectedEOF
569 }
570 return "", err
571 }
572 if c == '\n' {
573 l.colNo = 0
574 l.lineNo++
575 return "", errors.New("encountered end-of-line before end of string literal")
576 }
577 l.colNo++
578 if c == quote {
579 break
580 }
581 if c == 0 {
582 return "", errors.New("null character ('\\0') not allowed in string literal")
583 }
584 if c == '\\' {
585 // escape sequence
586 c, _, err = l.input.readRune()
587 if err != nil {
588 return "", err
589 }
590 l.colNo++
591 if c == 'x' || c == 'X' {
592 // hex escape
593 c, _, err := l.input.readRune()
594 if err != nil {
595 return "", err
596 }
597 l.colNo++
598 c2, _, err := l.input.readRune()
599 if err != nil {
600 return "", err
601 }
602 var hex string
603 if (c2 < '0' || c2 > '9') && (c2 < 'a' || c2 > 'f') && (c2 < 'A' || c2 > 'F') {
604 l.input.unreadRune(c2)
605 hex = string(c)
606 } else {
607 l.colNo++
608 hex = string([]rune{c, c2})
609 }
610 i, err := strconv.ParseInt(hex, 16, 32)
611 if err != nil {
612 return "", fmt.Errorf("invalid hex escape: \\x%q", hex)
613 }
614 buf.WriteByte(byte(i))
615
616 } else if c >= '0' && c <= '7' {
617 // octal escape
618 c2, _, err := l.input.readRune()
619 if err != nil {
620 return "", err
621 }
622 var octal string
623 if c2 < '0' || c2 > '7' {
624 l.input.unreadRune(c2)
625 octal = string(c)
626 } else {
627 l.colNo++
628 c3, _, err := l.input.readRune()
629 if err != nil {
630 return "", err
631 }
632 if c3 < '0' || c3 > '7' {
633 l.input.unreadRune(c3)
634 octal = string([]rune{c, c2})
635 } else {
636 l.colNo++
637 octal = string([]rune{c, c2, c3})
638 }
639 }
640 i, err := strconv.ParseInt(octal, 8, 32)
641 if err != nil {
642 return "", fmt.Errorf("invalid octal escape: \\%q", octal)
643 }
644 if i > 0xff {
645 return "", fmt.Errorf("octal escape is out range, must be between 0 and 377: \\%q", octal)
646 }
647 buf.WriteByte(byte(i))
648
649 } else if c == 'u' {
650 // short unicode escape
651 u := make([]rune, 4)
652 for i := range u {
653 c, _, err := l.input.readRune()
654 if err != nil {
655 return "", err
656 }
657 l.colNo++
658 u[i] = c
659 }
660 i, err := strconv.ParseInt(string(u), 16, 32)
661 if err != nil {
662 return "", fmt.Errorf("invalid unicode escape: \\u%q", string(u))
663 }
664 buf.WriteRune(rune(i))
665
666 } else if c == 'U' {
667 // long unicode escape
668 u := make([]rune, 8)
669 for i := range u {
670 c, _, err := l.input.readRune()
671 if err != nil {
672 return "", err
673 }
674 l.colNo++
675 u[i] = c
676 }
677 i, err := strconv.ParseInt(string(u), 16, 32)
678 if err != nil {
679 return "", fmt.Errorf("invalid unicode escape: \\U%q", string(u))
680 }
681 if i > 0x10ffff || i < 0 {
682 return "", fmt.Errorf("unicode escape is out of range, must be between 0 and 0x10ffff: \\U%q", string(u))
683 }
684 buf.WriteRune(rune(i))
685
686 } else if c == 'a' {
687 buf.WriteByte('\a')
688 } else if c == 'b' {
689 buf.WriteByte('\b')
690 } else if c == 'f' {
691 buf.WriteByte('\f')
692 } else if c == 'n' {
693 buf.WriteByte('\n')
694 } else if c == 'r' {
695 buf.WriteByte('\r')
696 } else if c == 't' {
697 buf.WriteByte('\t')
698 } else if c == 'v' {
699 buf.WriteByte('\v')
700 } else if c == '\\' {
701 buf.WriteByte('\\')
702 } else if c == '\'' {
703 buf.WriteByte('\'')
704 } else if c == '"' {
705 buf.WriteByte('"')
706 } else if c == '?' {
707 buf.WriteByte('?')
708 } else {
709 return "", fmt.Errorf("invalid escape sequence: %q", "\\"+string(c))
710 }
711 } else {
712 buf.WriteRune(c)
713 }
714 }
715 return buf.String(), nil
716}
717
718func (l *protoLex) skipToEndOfLineComment() (bool, string) {
719 txt := []rune{'/', '/'}
720 for {
721 c, _, err := l.input.readRune()
722 if err != nil {
723 return false, string(txt)
724 }
725 if c == '\n' {
726 return true, string(txt)
727 }
728 l.colNo++
729 txt = append(txt, c)
730 }
731}
732
733func (l *protoLex) skipToEndOfBlockComment() (string, bool) {
734 txt := []rune{'/', '*'}
735 for {
736 c, _, err := l.input.readRune()
737 if err != nil {
738 return "", false
739 }
740 if c == '\n' {
741 l.colNo = 0
742 l.lineNo++
743 } else {
744 l.colNo++
745 }
746 txt = append(txt, c)
747 if c == '*' {
748 c, _, err := l.input.readRune()
749 if err != nil {
750 return "", false
751 }
752 if c == '/' {
753 l.colNo++
754 txt = append(txt, c)
755 return string(txt), true
756 }
757 l.input.unreadRune(c)
758 }
759 }
760}
761
762func (l *protoLex) Error(s string) {
763 if l.err == nil {
764 l.err = ErrorWithSourcePos{Underlying: errors.New(s), Pos: l.prevSym.start()}
765 }
766}