blob: f11e1a3b14581c46d2386c76124a0f36b4574d2a [file] [log] [blame]
Martin Cosyns0efdc872021-09-27 16:24:30 +00001# Copyright 2020-present Open Networking Foundation
2# Original copyright 2020-present ADTRAN, Inc.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14# -*- coding: utf-8 -*-
15
16# Parser for protocol buffer .proto files
17import enum as stdlib_enum
18from string import ascii_letters, digits, hexdigits, octdigits
19
20import attr
21
22from parsy import char_from, from_enum, generate, regex, seq, string
23
24# This file follows the spec at
25# https://developers.google.com/protocol-buffers/docs/reference/proto3-spec
26# very closely.
27
28# However, because we are parsing into useful objects, we do transformations
29# along the way e.g. turning into integers, strings etc. and custom objects.
30# Some of the lowest level items have been implemented using 'regex' and converting
31# the descriptions to regular expressions. Higher level constructs have been
32# implemented using other parsy primitives and combinators.
33
34# Notes:
35
36# 1. Whitespace is very badly defined in the 'spec', so we guess what is meant.
37# 2. The spec doesn't allow for comments, and neither does this parser.
38# Other places mention that C++ style comments are allowed. To support that,
39# this parser would need to be changed into split lexing/parsing stages
40# (otherwise you hit issues with comments start markers within string literals).
41# 3. Other notes inline.
42
43
44# Our utilities
def optional_string(s):
    """
    Make a parser that matches the literal text `s` zero or one times
    and yields the matched text joined into a single string.
    """
    zero_or_one = string(s).times(0, 1)
    return zero_or_one.concat()
# Decimal digit strings convert with the builtin directly.
convert_decimal = int


def convert_octal(s):
    """Interpret the digit string `s` as a base-8 integer."""
    return int(s, 8)


def convert_hex(s):
    """Interpret the digit string `s` as a base-16 integer."""
    return int(s, 16)


def exclude_none(l):
    """Return a new list holding the items of `l` that are not None."""
    return list(filter(lambda item: item is not None, l))
50
51
def lexeme(p):
    """
    Wrap a parser so that any whitespace before and after it is consumed
    and discarded.  A plain string is first turned into a parser that
    matches exactly that text.
    """
    whitespace = regex(r'\s*')
    parser = string(p) if isinstance(p, str) else p
    return whitespace >> parser << whitespace
60
61
def is_present(p):
    """
    Build a parser that optionally matches `p` (a parser or a string,
    with surrounding whitespace ignored) and yields True when it
    matched and False when it did not.
    """
    def matched(value):
        return value is not None

    return lexeme(p).optional().map(matched)
68
69
70# Our data structures
@attr.s
class Import:
    """An `import` statement, as built by the `import_` parser."""
    # Text of the quoted string literal naming the imported file.
    identifier = attr.ib()
    # ImportOption member when `weak`/`public` was given, otherwise None.
    option = attr.ib()
75
76
@attr.s
class Package:
    """A `package` statement, as built by the `package` parser."""
    # Attribute name is spelled 'identifer' (sic); it is part of the
    # class interface, so it is documented here rather than renamed.
    identifer = attr.ib()
80
81
@attr.s
class Option:
    """A name/value option; used for `option` statements and bracketed field/enum options."""
    # Option name as parsed by `optionName`.
    name = attr.ib()
    # Value parsed by `constant`.
    value = attr.ib()
86
87
@attr.s
class Field:
    """A normal message field, as built by the `field` parser."""
    # True when the field was prefixed with `repeated`.
    repeated = attr.ib()
    # Result of `type_`: a Type member for builtin scalar types,
    # otherwise the type name as a string.
    type = attr.ib()
    # Field name.
    name = attr.ib()
    # Field number, parsed from an integer literal.
    number = attr.ib()
    # List of Option objects from the bracketed option list ([] if absent).
    options = attr.ib()
95
96
@attr.s
class OneOfField:
    """A single field inside a `oneof` block, as built by `oneofField`."""
    # Result of `type_`: a Type member or a type name string.
    type = attr.ib()
    # Field name.
    name = attr.ib()
    # Field number, parsed from an integer literal.
    number = attr.ib()
    # List of Option objects from the bracketed option list ([] if absent).
    options = attr.ib()
103
104
@attr.s
class OneOf:
    """A `oneof` block, as built by the `oneof` parser."""
    # Name of the oneof.
    name = attr.ib()
    # List of OneOfField objects (empty statements are filtered out).
    fields = attr.ib()
109
110
@attr.s
class Map:
    """A `map<key, value>` field, as built by the `mapField` parser."""
    # KeyType member naming the key type.
    key_type = attr.ib()
    # Value type: result of `type_` (a Type member or a type name string).
    type = attr.ib()
    # Field name.
    name = attr.ib()
    # Field number, parsed from an integer literal.
    number = attr.ib()
    # List of Option objects from the bracketed option list ([] if absent).
    options = attr.ib()
118
119
@attr.s
class Reserved:
    """A `reserved` statement, as built by the `reserved` parser."""
    # Either a list of Range objects or a list of quoted field names.
    items = attr.ib()
123
124
@attr.s
class Range:
    """One entry of a reserved number list, as built by `range_`."""
    # First (or only) number of the range.
    from_ = attr.ib()
    # Upper bound after `to`: an integer or the string "max";
    # None when no `to` part was given.
    to = attr.ib()
129
130
@attr.s
class EnumField:
    """A single value declared inside an enum, as built by `enumField`."""
    # Name of the enum value.
    name = attr.ib()
    # Numeric value, parsed from an integer literal.
    value = attr.ib()
    # List of Option objects from the bracketed option list ([] if absent).
    options = attr.ib()
136
137
@attr.s
class Enum:
    """An `enum` definition, as built by the `enum` parser."""
    # Enum name.
    name = attr.ib()
    # List of Option and EnumField objects found between the braces.
    body = attr.ib()
142
143
@attr.s
class Message:
    """A `message` definition, as built by the `message` parser."""
    # Message name.
    name = attr.ib()
    # List of the statements parsed from the message body
    # (fields, nested enums/messages, options, oneofs, maps, reserved).
    body = attr.ib()
148
149
@attr.s
class Service:
    """A `service` definition, as built by the `service` parser."""
    # Service name.
    name = attr.ib()
    # List of Option and Rpc objects found between the braces.
    body = attr.ib()
154
155
@attr.s
class Rpc:
    """An `rpc` declaration inside a service, as built by the `rpc` parser."""
    # Name of the rpc.
    name = attr.ib()
    # True when the request type was prefixed with `stream`.
    request_stream = attr.ib()
    # Request message type name, as parsed by `messageType`.
    request_message_type = attr.ib()
    # True when the response type was prefixed with `stream`.
    response_stream = attr.ib()
    # Response message type name, as parsed by `messageType`.
    response_message_type = attr.ib()
    # Options declared in the rpc's brace-delimited body, if any.
    options = attr.ib()
164
165
@attr.s
class Proto:
    """A whole .proto file, as built by the top-level `proto` parser."""
    # Result of the `syntax` parser (the text "proto3").
    syntax = attr.ib()
    # List of top-level statements: imports, packages, options,
    # messages, enums and services.
    statements = attr.ib()
170
171
172# Enums:
class ImportOption(stdlib_enum.Enum):
    """Modifier keywords recognised by the `import_option` parser."""
    WEAK = "weak"
    PUBLIC = "public"
176
177
class Type(stdlib_enum.Enum):
    """Builtin scalar type keywords recognised by the `type_` parser."""
    DOUBLE = "double"
    FLOAT = "float"
    INT32 = "int32"
    INT64 = "int64"
    UINT32 = "uint32"
    UINT64 = "uint64"
    SINT32 = "sint32"
    SINT64 = "sint64"
    FIXED32 = "fixed32"
    FIXED64 = "fixed64"
    SFIXED32 = "sfixed32"
    SFIXED64 = "sfixed64"
    BOOL = "bool"
    STRING = "string"
    BYTES = "bytes"
194
195
class KeyType(stdlib_enum.Enum):
    """
    Type keywords recognised by the `keyType` parser for map keys.
    Same members as Type except DOUBLE, FLOAT and BYTES.
    """
    INT32 = "int32"
    INT64 = "int64"
    UINT32 = "uint32"
    UINT64 = "uint64"
    SINT32 = "sint32"
    SINT64 = "sint64"
    FIXED32 = "fixed32"
    FIXED64 = "fixed64"
    SFIXED32 = "sfixed32"
    SFIXED64 = "sfixed64"
    BOOL = "bool"
    STRING = "string"
209
210
211# Some extra constants to avoid typing
# One whitespace-tolerant parser per punctuation character, in the same
# order as the characters of the string.
SEMI, EQ, LPAREN, RPAREN, LBRACE, RBRACE, LBRAC, RBRAC = map(lexeme, ";=(){}[]")
213
214
215# -- Beginning of following spec --
216# Letters and digits
# Single-character parsers built from the `string` module constants.
letter = char_from(ascii_letters)    # a-z, A-Z
decimalDigit = char_from(digits)     # 0-9
octalDigit = char_from(octdigits)    # 0-7
hexDigit = char_from(hexdigits)      # 0-9, a-f, A-F
221
222# Identifiers
223
224# Compared to spec, we add some '_' prefixed items which are not wrapped in `lexeme`,
225# on the assumption that spaces in the middle of identifiers are not accepted.
# A letter followed by any number of letters, digits or underscores,
# returned as one string.  No whitespace handling.
_ident = (letter + (letter | decimalDigit | string("_")).many().concat()).desc('ident')
# Same, but with surrounding whitespace skipped.
ident = lexeme(_ident)
# Dot-separated identifiers joined back into a single string.  Built from
# the whitespace-tolerant `ident`, so whitespace around the dots is accepted.
fullIdent = lexeme(ident + (string(".") + ident).many().concat()).desc('fullIdent')
_messageName = _ident
messageName = lexeme(ident).desc('messageName')
# NOTE: unlike `_messageName`, this is based on the whitespace-tolerant `ident`.
_enumName = ident
enumName = lexeme(_enumName).desc('enumName')
fieldName = ident.desc('fieldName')
oneofName = ident.desc('oneofName')
mapName = ident.desc('mapName')
serviceName = ident.desc('serviceName')
rpcName = ident.desc('rpcName')
# Optional leading ".", then zero or more "name." prefixes, then the final
# name; all pieces are concatenated into one string.
messageType = optional_string(".") + (_ident + string(".")).many().concat() + _messageName
enumType = optional_string(".") + (_ident + string(".")).many().concat() + _enumName
240
241# Integer literals
# Each literal parser converts the matched text to a Python int.
decimalLit = regex("[1-9][0-9]*").desc('decimalLit').map(convert_decimal)
octalLit = regex("0[0-7]*").desc('octalLit').map(convert_octal)
# `[xX]` rather than `[x|X]`: inside a character class `|` is a literal
# character, so the old pattern also accepted "0|1F".
hexLit = regex("0[xX][0-9a-fA-F]+").desc('hexLit').map(convert_hex)
# hexLit must be tried before octalLit: octalLit matches the leading "0" of
# "0x1F" on its own, which would make hex literals unreachable.
intLit = hexLit | decimalLit | octalLit
246
247
248# Floating-point literals
# Regex fragments shared by the alternatives of `floatLit`.
decimals = r'[0-9]+'
# `[eE][+-]?` rather than `[e|E][+|-]?`: inside a character class `|` is a
# literal character, so the old pattern accepted "|" as exponent marker/sign.
exponent = r'[eE][+-]?' + decimals
# Three accepted shapes: "1." / "1.5" / "1.5e3", "1e3", and ".5" / ".5e3".
# The matched text is converted with float().
floatLit = regex(
    r'({decimals}\.({decimals})?({exponent})?)|{decimals}{exponent}|\.{decimals}({exponent})?'
    .format(decimals=decimals, exponent=exponent)
).desc('floatLit').map(float)
253
254
255# Boolean
# Matches the keyword "true" or "false" and yields the corresponding bool.
boolLit = regex("true|false").map(lambda text: text == "true").desc('boolLit')
257
258
259# String literals
# Escape sequences; each parser yields the single character it denotes.
# `[xX]` rather than `[x|X]`: `|` inside a character class is a literal.
# One or two hex digits are accepted.
hexEscape = regex(r"\\[xX]") >> regex("[0-9a-fA-F]{1,2}").map(convert_hex).map(chr)
# One to three octal digits.  With exactly two digits, "\101" was decoded
# as "\10" followed by a literal "1".
octEscape = regex(r"\\") >> regex('[0-7]{1,3}').map(convert_octal).map(chr)
charEscape = regex(r"\\") >> (
    string("a").result("\a")
    | string("b").result("\b")
    | string("f").result("\f")
    | string("n").result("\n")
    | string("r").result("\r")
    | string("t").result("\t")
    | string("v").result("\v")
    | string("\\").result("\\")
    | string("'").result("'")
    | string('"').result('"')
)
escapes = hexEscape | octEscape | charEscape
# Correction to spec regarding " and ' inside quoted strings
# The result is the string's content with escapes decoded and without the
# surrounding quotes.  NUL, newline, backslash and the active quote
# character are not allowed unescaped.
strLit = (string("'") >> (escapes | regex(r"[^\0\n\'\\]")).many().concat() << string("'")
          | string('"') >> (escapes | regex(r"[^\0\n\"\\]")).many().concat() << string('"')).desc('strLit')
# Either quote character on its own.
quote = string("'") | string('"')
279
280# EmptyStatement
# A lone ";" yielding None.  Uses the whitespace-tolerant SEMI so that
# whitespace following an empty statement is consumed; with a bare
# string(";") trailing whitespace after a final top-level ";" was left
# unconsumed and the whole parse failed at end of input.
emptyStatement = SEMI.result(None)
282
283# Signed numbers:
284# (Extra compared to spec, to cope with need to produce signed numeric values)
def signedNumberChange(s, num):
    """
    Apply the sign text `s` to the number `num`.

    Returns `-num` when `s` is "-", otherwise `num` unchanged (for "+"
    or an empty sign).  The previous implementation returned -1 or +1
    and discarded `num`.
    """
    return -num if s == "-" else num
# Optional "+" or "-"; yields the matched text ('' when absent).
sign = regex("[-+]?")
# Parse a sign followed by a literal, then hand both to signedNumberChange.
signedIntLit = seq(sign, intLit).map(lambda parts: signedNumberChange(*parts))
signedFloatLit = seq(sign, floatLit).map(lambda parts: signedNumberChange(*parts))
289
290
291# Constant
# signedFloatLit goes before signedIntLit: otherwise "1.5" is matched as the
# integer 1 and the statement then fails on the leftover ".5".  floatLit
# requires a "." or an exponent, so plain integers still reach signedIntLit.
# fullIdent is put at the end to disambiguate from boolLit.
constant = signedFloatLit | signedIntLit | strLit | boolLit | fullIdent
294
295# Syntax
# `syntax = "proto3";` -- yields the text "proto3".  Either quote character
# is accepted on each side; the closing quote and the ";" are consumed and
# discarded.
syntax = ((lexeme("syntax") >> EQ >> quote >> string("proto3")) << quote) << SEMI
297
298# Import Statement
# Matches one of the ImportOption values and yields the enum member.
import_option = from_enum(ImportOption)

# `import [weak|public] "file";` -> Import
import_ = seq(
    lexeme("import") >> import_option.optional(),
    lexeme(strLit) << SEMI,
).combine(lambda modifier, path: Import(identifier=path, option=modifier))
303
304# Package
# `package full.ident;` -> Package
# NOTE(review): the package name is stored wrapped in a one-element list
# (e.g. ['dmi']), which is what the seq(...).map(Package) form produced.
# Kept as-is because callers may depend on that shape.
package = (lexeme("package") >> fullIdent << SEMI).map(lambda name: Package([name]))
306
307# Option
# A plain identifier or a parenthesised full identifier (the parentheses are
# dropped), optionally followed by ".name" parts; yields one string.
optionName = (ident | (LPAREN >> fullIdent << RPAREN)) + (string(".") + ident).many().concat()
# `option name = constant;` -> Option
option = seq(
    lexeme("option") >> optionName,
    EQ >> constant << SEMI,
).combine(lambda name, value: Option(name=name, value=value))
312
313# Normal field
# A builtin scalar type (yields a Type member) or a message/enum type name
# (yields a string).
type_ = lexeme(from_enum(Type) | messageType | enumType)
fieldNumber = lexeme(intLit)

# `name = constant` inside a bracketed option list -> Option
fieldOption = seq(optionName, EQ >> constant).combine(
    lambda name, value: Option(name=name, value=value))
fieldOptions = fieldOption.sep_by(lexeme(","), min=1)
# Optional `[opt, opt, ...]`; yields [] when the brackets are absent.
fieldOptionList = (lexeme("[") >> fieldOptions << lexeme("]")).times(0, 1).map(
    lambda found: found[0] if found else [])

# `[repeated] type name = number [options];` -> Field
field = seq(
    is_present("repeated"),
    type_,
    fieldName << EQ,
    fieldNumber,
    fieldOptionList << SEMI,
).combine(lambda repeated, field_type, name, number, options: Field(
    repeated=repeated, type=field_type, name=name, number=number, options=options))
329
330# Oneof and oneof field
# `type name = number [options];` inside a oneof -> OneOfField
oneofField = seq(
    type_,
    fieldName << EQ,
    fieldNumber,
    fieldOptionList << SEMI,
).combine(lambda field_type, name, number, options: OneOfField(
    type=field_type, name=name, number=number, options=options))
# `oneof name { ... }` -> OneOf; empty statements are dropped from the list.
oneof = seq(
    lexeme("oneof") >> oneofName,
    LBRACE >> (oneofField | emptyStatement).many().map(exclude_none) << RBRACE,
).combine(lambda name, fields: OneOf(name=name, fields=fields))
341
342# Map field
# Yields the KeyType member for the map's key type keyword.
keyType = lexeme(from_enum(KeyType))
# `map<keyType, type> name = number [options];` -> Map
mapField = seq(
    lexeme("map") >> lexeme("<") >> keyType,
    lexeme(",") >> type_,
    lexeme(">") >> mapName,
    EQ >> fieldNumber,
    fieldOptionList << SEMI,
).combine(lambda key_type, value_type, name, number, options: Map(
    key_type=key_type, type=value_type, name=name, number=number, options=options))
350
351# Reserved
# `N` or `N to M` or `N to max` -> Range; `to` is None when absent.
range_ = seq(
    lexeme(intLit),
    (lexeme("to") >> (intLit | lexeme("max"))).optional(),
).combine(lambda start, end: Range(from_=start, to=end))
ranges = range_.sep_by(lexeme(","), min=1)
# The spec for 'reserved' indicates 'fieldName' here, which is never a quoted string.
# But the example has a quoted string. We have changed it to 'strLit'
fieldNames = strLit.sep_by(lexeme(","), min=1)
# `reserved ranges;` or `reserved "name", ...;` -> Reserved
reserved = (lexeme("reserved") >> (ranges | fieldNames) << SEMI).map(Reserved)
361
362# Enum definition
# `name = constant` inside an enum value's bracketed option list -> Option
enumValueOption = seq(optionName.tag('name') << EQ,
                      constant.tag('value')
                      ).combine_dict(Option)
# `NAME = [-]number [options];` -> EnumField
# The spec allows an optional "-" before the number; the sign is applied
# inline here so negative enum values parse and keep their magnitude.
enumField = seq(ident.tag('name') << EQ,
                lexeme(seq(regex("-?"), intLit)
                       .combine(lambda minus, number: -number if minus else number)
                       ).tag('value'),
                (lexeme("[") >> enumValueOption.sep_by(lexeme(","), min=1) << lexeme("]")).optional()
                .map(lambda o: [] if o is None else o).tag('options')
                << SEMI
                ).combine_dict(EnumField)
# Brace-delimited list of options and enum values; empty statements are dropped.
enumBody = (LBRACE
            >> (option | enumField | emptyStatement).many().map(exclude_none)
            << RBRACE)
# `enum Name { ... }` -> Enum
enum = seq(lexeme("enum") >> enumName.tag('name'),
           enumBody.tag('body')
           ).combine_dict(Enum)
378
379
380# Message definition
@generate
def message():
    """
    Parse `message Name { ... }` into a Message.

    Written as a generator so that it can refer to `messageBody`, which
    is defined below and itself refers back to `message` for nested
    message definitions.
    """
    yield lexeme("message")
    name = yield messageName
    body = yield messageBody
    return Message(name=name, body=body)


# Brace-delimited message body.
# The keyword-introduced statements are tried before `field`: a statement
# such as `option foo = 1;` also has the shape `type name = number;` and was
# previously swallowed by `field` as a Field of type "option".  Real fields
# are unaffected, because each keyword alternative fails (and backtracks) on
# input that is a field.
# Empty statements are dropped, consistent with enum, oneof and service bodies.
messageBody = (LBRACE
               >> (enum | message | option | oneof | mapField
                   | reserved | field | emptyStatement).many().map(exclude_none)
               << RBRACE)
393
394
395# Service definition
# `rpc Name ([stream] Req) returns ([stream] Resp) { options } | ;` -> Rpc
# The terminator is optional in this parser.  When it is missing the result
# of `.optional()` is None, which used to be passed straight to exclude_none
# and raised TypeError; it is now treated as "no options".
rpc = seq(lexeme("rpc") >> rpcName.tag('name'),
          LPAREN
          >> (is_present("stream").tag("request_stream")),
          messageType.tag("request_message_type") << RPAREN,
          lexeme("returns") >> LPAREN
          >> (is_present("stream").tag("response_stream")),
          messageType.tag("response_message_type")
          << RPAREN,
          ((LBRACE
            >> (option | emptyStatement).many()
            << RBRACE)
           | SEMI.result([])
           ).optional().map(
              lambda body: [] if body is None else exclude_none(body)
          ).tag('options')
          ).combine_dict(Rpc)
410
# `service Name { ... }` -> Service; the body holds Option and Rpc objects,
# with empty statements dropped.
service = seq(
    lexeme("service") >> serviceName,
    LBRACE >> (option | rpc | emptyStatement).many().map(exclude_none) << RBRACE,
).combine(lambda name, body: Service(name=name, body=body))
416
417
418# Proto file
topLevelDef = message | enum | service
# Whole file: the syntax statement followed by any number of top-level
# statements (empty statements are dropped) -> Proto
proto = seq(
    syntax,
    (import_ | package | option | topLevelDef | emptyStatement).many().map(exclude_none),
).combine(lambda syntax_value, statements: Proto(syntax=syntax_value, statements=statements))
424
425
# Sample proto3 source for the smoke test below.  It contains an import,
# two option statements, a package statement, an enum and a message with a
# nested message, a map field and a oneof.
EXAMPLE = """syntax = "proto3";
import public "other.proto";
option java_package = "com.example.foo";
option java_package = "com.example.foo";
package dmi;

enum EnumAllowingAlias {
 option allow_alias = true;
 UNKNOWN = 0;
 STARTED = 1;
 RUNNING = 2 [(custom_option) = "hello world"];
}
message outer {
 option (my_option).a = true;
 message inner {
 int64 ival = 1;
 }
 repeated inner inner_message = 2;
 EnumAllowingAlias enum_field =3;
 map<int32, string> my_map = 4;
 oneof operation {
 MetricsConfig changes = 2;
 bool reset_to_default = 3;
 }
}
"""
# Smoke test - should find 6 top level statements in the example
# (one import, two options, one package, one enum and one message):
# assert len(proto.parse(EXAMPLE).statements) == 6
# print(proto.parse(EXAMPLE).statements)
# for st in proto.parse(EXAMPLE).statements:
#     print(type(st))
456# print(type(st))