blob: c850d8c876b8009d818c8742abe3b29cbe8961a5 [file] [log] [blame]
# Limitation: this parser cannot handle comments in .proto input.
2
3# -*- coding: utf-8 -*-
4
5# Parser for protocol buffer .proto files
6import enum as stdlib_enum
7from string import ascii_letters, digits, hexdigits, octdigits
8
9import attr
10
11from parsy import char_from, from_enum, generate, regex, seq, string
12
13# This file follows the spec at
14# https://developers.google.com/protocol-buffers/docs/reference/proto3-spec
15# very closely.
16
17# However, because we are parsing into useful objects, we do transformations
18# along the way e.g. turning into integers, strings etc. and custom objects.
19# Some of the lowest level items have been implemented using 'regex' and converting
20# the descriptions to regular expressions. Higher level constructs have been
21# implemented using other parsy primitives and combinators.
22
23# Notes:
24
25# 1. Whitespace is very badly defined in the 'spec', so we guess what is meant.
26# 2. The spec doesn't allow for comments, and neither does this parser.
27# Other places mention that C++ style comments are allowed. To support that,
28# this parser would need to be changed into split lexing/parsing stages
29# (otherwise you hit issues with comments start markers within string literals).
30# 3. Other notes inline.
31
32
33# Our utilities
def optional_string(s):
    """Return a parser matching the literal `s` zero or one times.

    The parser yields the matched text, or the empty string when `s` is absent.
    """
    return string(s).times(0, 1).concat()


# Decimal text needs no special handling: the builtin does the conversion.
convert_decimal = int


def convert_octal(s):
    """Convert the text `s` to an int, reading it as base 8."""
    return int(s, 8)


def convert_hex(s):
    """Convert the text `s` to an int, reading it as base 16."""
    return int(s, 16)


def exclude_none(l):
    """Return a new list holding the items of `l` that are not None.

    Only None is dropped; other falsy values (0, '', []) are kept.
    """
    return [item for item in l if item is not None]
39
40
def lexeme(p):
    """
    Wrap a parser so that whitespace before and after it is skipped.

    `p` may be a parser or a plain string; a string is first converted
    into a literal `string` parser. The result of the wrapped parser is
    returned unchanged.
    """
    parser = string(p) if isinstance(p, str) else p
    whitespace = regex(r'\s*')
    return whitespace >> parser << whitespace
49
50
def is_present(p):
    """
    Build a parser that reports whether `p` (a parser or string) matched.

    The parser never fails on absence: it yields True when `p`, wrapped
    with `lexeme`, matches at the current position, and False otherwise.
    """
    return lexeme(p).optional().map(lambda matched: matched is not None)
57
58
59# Our data structures
@attr.s
class Import:
    """An `import` statement, as built by the `import_` parser in this file."""
    # Imported path, taken from the string literal after `import`.
    identifier = attr.ib()
    # ImportOption member when `weak`/`public` is given, otherwise None.
    option = attr.ib()
64
65
@attr.s
class Package:
    """A `package` statement, as built by the `package` parser in this file."""
    # NOTE: the attribute name is spelled `identifer` (sic); it is part of
    # the public interface of this class.
    identifer = attr.ib()
69
70
@attr.s
class Option:
    """A name/value option pair (statement-level, field-level or enum-value-level)."""
    # Option name as produced by the `optionName` parser.
    name = attr.ib()
    # Parsed constant as produced by the `constant` parser.
    value = attr.ib()
75
76
@attr.s
class Field:
    """A normal message field, as built by the `field` parser in this file."""
    # True when the `repeated` keyword precedes the field.
    repeated = attr.ib()
    # Result of `type_`: a Type member or a message/enum type name string.
    type = attr.ib()
    # Field name.
    name = attr.ib()
    # Field number parsed by `fieldNumber`.
    number = attr.ib()
    # List of Option objects; empty when no `[...]` option list is present.
    options = attr.ib()
84
85
@attr.s
class OneOfField:
    """A single field inside a `oneof`, as built by the `oneofField` parser."""
    # Result of `type_`: a Type member or a message/enum type name string.
    type = attr.ib()
    # Field name.
    name = attr.ib()
    # Field number parsed by `fieldNumber`.
    number = attr.ib()
    # List of Option objects; empty when no `[...]` option list is present.
    options = attr.ib()
92
93
@attr.s
class OneOf:
    """A `oneof` group, as built by the `oneof` parser in this file."""
    # Name of the oneof group.
    name = attr.ib()
    # List of OneOfField objects (empty statements are filtered out).
    fields = attr.ib()
98
99
@attr.s
class Map:
    """A `map<K, V>` field, as built by the `mapField` parser in this file."""
    # KeyType member parsed from the first type argument.
    key_type = attr.ib()
    # Value type: result of `type_` for the second type argument.
    type = attr.ib()
    # Field name.
    name = attr.ib()
    # Field number parsed by `fieldNumber`.
    number = attr.ib()
    # List of Option objects; empty when no `[...]` option list is present.
    options = attr.ib()
107
108
@attr.s
class Reserved:
    """A `reserved` statement, as built by the `reserved` parser in this file."""
    # Either a list of Range objects or a list of quoted field-name strings.
    items = attr.ib()
112
113
@attr.s
class Range:
    """A reserved field-number range, as built by the `range_` parser."""
    # Start of the range (trailing underscore avoids the `from` keyword).
    from_ = attr.ib()
    # End of the range: an int, the string "max", or None when no `to` part
    # is present.
    to = attr.ib()
118
119
@attr.s
class EnumField:
    """One value of an enum, as built by the `enumField` parser in this file."""
    # Name of the enum value.
    name = attr.ib()
    # Integer assigned to the enum value.
    value = attr.ib()
    # List of Option objects; empty when no `[...]` option list is present.
    options = attr.ib()
125
126
@attr.s
class Enum:
    """An `enum` definition, as built by the `enum` parser in this file."""
    # Name of the enum.
    name = attr.ib()
    # List of Option and EnumField objects from the enum body.
    body = attr.ib()
131
132
@attr.s
class Message:
    """A `message` definition, as built by the `message` parser in this file."""
    # Name of the message.
    name = attr.ib()
    # List of parsed body statements (fields, nested messages/enums, options, ...).
    body = attr.ib()
137
138
@attr.s
class Service:
    """A `service` definition, as built by the `service` parser in this file."""
    # Name of the service.
    name = attr.ib()
    # List of Option and Rpc objects from the service body.
    body = attr.ib()
143
144
@attr.s
class Rpc:
    """An `rpc` declaration inside a service, as built by the `rpc` parser."""
    # Name of the rpc.
    name = attr.ib()
    # True when the request type is preceded by `stream`.
    request_stream = attr.ib()
    # Request message type name as parsed by `messageType`.
    request_message_type = attr.ib()
    # True when the response type is preceded by `stream`.
    response_stream = attr.ib()
    # Response message type name as parsed by `messageType`.
    response_message_type = attr.ib()
    # Options parsed from the `{ ... }` body of the rpc.
    options = attr.ib()
153
154
@attr.s
class Proto:
    """A whole .proto file, as built by the `proto` parser in this file."""
    # Result of the `syntax` parser (the text between the quotes).
    syntax = attr.ib()
    # List of top-level statements: imports, package, options and definitions.
    statements = attr.ib()
159
160
161# Enums:
class ImportOption(stdlib_enum.Enum):
    """Modifiers accepted between `import` and the imported path."""
    WEAK = "weak"
    PUBLIC = "public"
165
166
class Type(stdlib_enum.Enum):
    """Built-in scalar field types; each value is the keyword as written in a .proto file."""
    DOUBLE = "double"
    FLOAT = "float"
    INT32 = "int32"
    INT64 = "int64"
    UINT32 = "uint32"
    UINT64 = "uint64"
    SINT32 = "sint32"
    SINT64 = "sint64"
    FIXED32 = "fixed32"
    FIXED64 = "fixed64"
    SFIXED32 = "sfixed32"
    SFIXED64 = "sfixed64"
    BOOL = "bool"
    STRING = "string"
    BYTES = "bytes"
183
184
class KeyType(stdlib_enum.Enum):
    """Types accepted as the key of a `map<K, V>` field.

    Same members as `Type` except that DOUBLE, FLOAT and BYTES are absent.
    """
    INT32 = "int32"
    INT64 = "int64"
    UINT32 = "uint32"
    UINT64 = "uint64"
    SINT32 = "sint32"
    SINT64 = "sint64"
    FIXED32 = "fixed32"
    FIXED64 = "fixed64"
    SFIXED32 = "sfixed32"
    SFIXED64 = "sfixed64"
    BOOL = "bool"
    STRING = "string"
198
199
200# Some extra constants to avoid typing
# Whitespace-tolerant parsers for the punctuation used throughout the grammar.
SEMI = lexeme(";")
EQ = lexeme("=")
LPAREN = lexeme("(")
RPAREN = lexeme(")")
LBRACE = lexeme("{")
RBRACE = lexeme("}")
LBRAC = lexeme("[")
RBRAC = lexeme("]")
202
203
204# -- Beginning of following spec --
205# Letters and digits
# Single-character parsers built from the stdlib `string` constants.
# `hexdigits` covers both lower- and upper-case a-f.
letter = char_from(ascii_letters)
decimalDigit = char_from(digits)
octalDigit = char_from(octdigits)
hexDigit = char_from(hexdigits)
210
211# Identifiers
212
213# Compared to spec, we add some '_' prefixed items which are not wrapped in `lexeme`,
214# on the assumption that spaces in the middle of identifiers are not accepted.
# `_`-prefixed parsers match the bare token only; the unprefixed variants
# additionally skip surrounding whitespace via `lexeme`.
_ident = (letter + (letter | decimalDigit | string("_")).many().concat()).desc('ident')
ident = lexeme(_ident)
fullIdent = lexeme(ident + (string(".") + ident).many().concat()).desc('fullIdent')
_messageName = _ident
messageName = lexeme(_messageName).desc('messageName')
# Was `ident`, which silently allowed whitespace inside a dotted enum type
# and contradicted the "_ means not lexeme-wrapped" convention above.
_enumName = _ident
enumName = lexeme(_enumName).desc('enumName')
fieldName = ident.desc('fieldName')
oneofName = ident.desc('oneofName')
mapName = ident.desc('mapName')
serviceName = ident.desc('serviceName')
rpcName = ident.desc('rpcName')
# Dotted type references: optional leading ".", any number of "pkg." parts,
# then the final name. No whitespace is accepted inside the reference.
messageType = optional_string(".") + (_ident + string(".")).many().concat() + _messageName
enumType = optional_string(".") + (_ident + string(".")).many().concat() + _enumName
229
230# Integer literals
# Each literal is converted to a Python int as it is parsed.
decimalLit = regex("[1-9][0-9]*").desc('decimalLit').map(convert_decimal)
octalLit = regex("0[0-7]*").desc('octalLit').map(convert_octal)
# `[xX]`, not `[x|X]`: inside a character class `|` is a literal pipe.
hexLit = regex("0[xX][0-9a-fA-F]+").desc('hexLit').map(convert_hex)
# hexLit must be tried before octalLit: octalLit matches the leading "0"
# of "0x1F" on its own, which would leave "x1F" unparsed.
intLit = hexLit | decimalLit | octalLit
235
236
237# Floating-point literals
# Regex fragments used to assemble the float literal pattern.
decimals = r'[0-9]+'
# `[eE]` / `[+-]`, not `[e|E]` / `[+|-]`: inside a character class `|` is a
# literal pipe, so the old pattern accepted text like "1|5" and then
# crashed in float().
exponent = r'[eE][+-]?' + decimals
# Three accepted shapes: "1." / "1.5" / "1.5e3", "1e3", and ".5" / ".5e3".
floatLit = regex(r'({decimals}\.({decimals})?({exponent})?)|{decimals}{exponent}|\.{decimals}({exponent})?'
                 .format(decimals=decimals, exponent=exponent)).desc('floatLit').map(float)
242
243
244# Boolean
# Yields the Python booleans True / False for the literal keywords.
boolLit = (string("true").result(True) | string("false").result(False)).desc('boolLit')
246
247
248# String literals
# Escape sequences; each parser yields the single character it denotes.
# `[xX]`, not `[x|X]`: inside a character class `|` is a literal pipe.
# The proto language spec allows one or two hex digits.
hexEscape = regex(r"\\[xX]") >> regex("[0-9a-fA-F]{1,2}").map(convert_hex).map(chr)
# The spec allows one to three octal digits; the previous fixed width of two
# decoded "\101" as chr(0o10) followed by a literal "1".
octEscape = regex(r"\\") >> regex('[0-7]{1,3}').map(convert_octal).map(chr)
charEscape = regex(r"\\") >> (
    string("a").result("\a")
    | string("b").result("\b")
    | string("f").result("\f")
    | string("n").result("\n")
    | string("r").result("\r")
    | string("t").result("\t")
    | string("v").result("\v")
    | string("\\").result("\\")
    | string("'").result("'")
    | string('"').result('"')
)
escapes = hexEscape | octEscape | charEscape
# Correction to spec regarding " and ' inside quoted strings:
# the other kind of quote may appear unescaped inside a literal.
# The parser yields the decoded string content without the quotes.
strLit = (string("'") >> (escapes | regex(r"[^\0\n\'\\]")).many().concat() << string("'")
          | string('"') >> (escapes | regex(r"[^\0\n\"\\]")).many().concat() << string('"')).desc('strLit')
quote = string("'") | string('"')
268
269# EmptyStatement
# A lone ";" yields None so callers can filter it out with `exclude_none`.
# Wrapped in `lexeme` like every other statement: a bare string(";") could
# not match when preceded by whitespace, so input such as "; ;" failed.
emptyStatement = lexeme(";").result(None)
271
272# Signed numbers:
273# (Extra compared to spec, to cope with need to produce signed numeric values)
def signedNumberChange(s, num):
    """Apply the parsed sign `s` to the parsed number `num`.

    `s` is the text matched by the sign parser: "-", "+" or "" (no sign).
    Returns `num` negated when `s` is "-", otherwise `num` unchanged.

    The previous implementation returned only -1 or +1 and discarded
    `num`, so every signed literal parsed to plus or minus one.
    """
    return -num if s == "-" else num
# Optional sign: matches "-", "+" or the empty string, so it never fails.
sign = regex("[-+]?")
# The sign text and the parsed number are passed to signedNumberChange
# as its two positional arguments.
signedIntLit = seq(sign, intLit).combine(signedNumberChange)
signedFloatLit = seq(sign, floatLit).combine(signedNumberChange)
278
279
# Constant
# Order matters because alternatives are tried left to right:
# - signedFloatLit comes before signedIntLit, otherwise "1.5" is consumed
#   as the integer 1 and the enclosing parser then fails on ".5"
#   (floatLit requires a "." or an exponent, so it never steals a plain int);
# - fullIdent is put at the end to disambiguate from boolLit.
constant = signedFloatLit | signedIntLit | strLit | boolLit | fullIdent
283
284# Syntax
# Matches `syntax = "proto3";` (either quote character) and yields the
# text "proto3". The closing quote and the semicolon are consumed and discarded.
syntax = (
    lexeme("syntax")
    >> EQ
    >> quote
    >> string("proto3")
    << quote
    << SEMI
)
286
287# Import Statement
# Matches one of the ImportOption values and yields the enum member.
import_option = from_enum(ImportOption)

# `import [weak|public] "path";` -> Import(option=..., identifier=...).
# `option` is None when no modifier is given. The trailing underscore
# avoids clashing with the `import` keyword.
import_ = seq(lexeme("import") >> import_option.optional().tag('option'),
              lexeme(strLit).tag('identifier') << SEMI).combine_dict(Import)
292
293# Package
# `package foo.bar;` -> Package("foo.bar").
# The previous form wrapped the single parser in seq(...) and then used
# .map(Package), which passed the one-element result *list* to Package
# instead of the identifier itself.
package = (lexeme("package") >> fullIdent << SEMI).map(Package)
295
296# Option
# Option name: a plain identifier or a parenthesised full identifier,
# optionally followed by ".part" suffixes. The parentheses themselves are
# consumed and do not appear in the resulting string.
optionName = (ident | (LPAREN >> fullIdent << RPAREN)) + (string(".") + ident).many().concat()
# `option name = constant;` -> Option(name=..., value=...).
option = seq(lexeme("option") >> optionName.tag('name'),
             EQ >> constant.tag('value') << SEMI,
             ).combine_dict(Option)
301
302# Normal field
# A field type: a built-in scalar (yielding a Type member) is tried first,
# then a message/enum type reference (yielding its name as a string).
type_ = lexeme(from_enum(Type) | messageType | enumType)
# Field number, yielded as an int.
fieldNumber = lexeme(intLit)
305
# One `name = constant` pair inside a field's `[...]` list.
fieldOption = seq(optionName.tag('name'),
                  EQ >> constant.tag('value')).combine_dict(Option)
# One or more comma-separated options.
fieldOptions = fieldOption.sep_by(lexeme(","), min=1)
# The optional bracketed list; yields [] when the brackets are absent so
# callers always get a list.
fieldOptionList = (lexeme("[") >> fieldOptions << lexeme("]")).optional().map(
    lambda o: [] if o is None else o)
311
# `[repeated] type name = number [options];` -> Field(...)
field = seq(is_present("repeated").tag('repeated'),
            type_.tag('type'),
            fieldName.tag('name') << EQ,
            fieldNumber.tag('number'),
            fieldOptionList.tag('options') << SEMI,
            ).combine_dict(Field)
318
319# Oneof and oneof field
# `type name = number [options];` inside a oneof -> OneOfField(...).
# Same shape as `field` but without the `repeated` flag.
oneofField = seq(type_.tag('type'),
                 fieldName.tag('name') << EQ,
                 fieldNumber.tag('number'),
                 fieldOptionList.tag('options') << SEMI,
                 ).combine_dict(OneOfField)
# `oneof name { ... }` -> OneOf(name=..., fields=[...]).
# Empty statements yield None and are filtered out of `fields`.
oneof = seq(lexeme("oneof") >> oneofName.tag('name'),
            LBRACE
            >> (oneofField | emptyStatement).many().map(exclude_none).tag('fields')
            << RBRACE
            ).combine_dict(OneOf)
330
331# Map field
# Map key type; yields a KeyType member.
keyType = lexeme(from_enum(KeyType))
# `map<keyType, type> name = number [options];` -> Map(...)
mapField = seq(lexeme("map") >> lexeme("<") >> keyType.tag('key_type'),
               lexeme(",") >> type_.tag('type'),
               lexeme(">") >> mapName.tag('name'),
               EQ >> fieldNumber.tag('number'),
               fieldOptionList.tag('options') << SEMI
               ).combine_dict(Map)
339
340# Reserved
# `N` or `N to M` or `N to max` -> Range(from_=N, to=M | "max" | None).
range_ = seq(lexeme(intLit).tag('from_'),
             (lexeme("to") >> (intLit | lexeme("max"))).optional().tag('to')
             ).combine_dict(Range)
# One or more comma-separated ranges.
ranges = range_.sep_by(lexeme(","), min=1)
# The spec for 'reserved' indicates 'fieldName' here, which is never a quoted string.
# But the example has a quoted string. We have changed it to 'strLit'
fieldNames = strLit.sep_by(lexeme(","), min=1)
# `reserved <ranges | names>;` -> Reserved(items=[...]).
# seq(...) yields a one-element list, which .combine unpacks as the single
# positional argument to Reserved.
reserved = seq(lexeme("reserved") >> (ranges | fieldNames) << SEMI
               ).combine(Reserved)
350
351# Enum definition
# One `name = constant` pair inside an enum value's `[...]` list.
enumValueOption = seq(optionName.tag('name') << EQ,
                      constant.tag('value')
                      ).combine_dict(Option)
# `NAME = [-]number [options];` -> EnumField(...).
# The spec allows an optional leading "-" on the value
# (enumField = ident "=" [ "-" ] intLit ...); previously negative enum
# values were rejected.
enumField = seq(ident.tag('name') << EQ,
                lexeme(seq(string("-").optional(), intLit)
                       .combine(lambda minus, num: num if minus is None else -num)
                       ).tag('value'),
                # Optional bracketed option list; [] when absent.
                (lexeme("[") >> enumValueOption.sep_by(lexeme(","), min=1) << lexeme("]")).optional()
                .map(lambda o: [] if o is None else o).tag('options')
                << SEMI
                ).combine_dict(EnumField)
# `{ ... }` holding options and enum values; empty statements are dropped.
enumBody = (LBRACE
            >> (option | enumField | emptyStatement).many().map(exclude_none)
            << RBRACE)
# `enum Name { ... }` -> Enum(name=..., body=[...]).
enum = seq(lexeme("enum") >> enumName.tag('name'),
           enumBody.tag('body')
           ).combine_dict(Enum)
367
368
369# Message definition
@generate
def message():
    """Parse `message Name { ... }` into a Message.

    Written as a generator because messages nest: `messageBody` refers back
    to `message`, so it can only be looked up when the parser actually runs.
    """
    name, body = yield seq(lexeme("message") >> messageName, messageBody)
    return Message(name=name, body=body)
376
377
# `{ ... }` body of a message; yields the list of parsed statements.
# - Keyword-introduced statements are tried before `field`: `field` accepts
#   any identifier as its type, so with `field` first a statement such as
#   `option foo = 1;` was parsed as a Field of type "option".
# - Empty statements yield None and are filtered out, consistent with
#   enumBody, oneof, service and proto.
messageBody = (LBRACE
               >> (enum | message | option | oneof | mapField
                   | reserved | field | emptyStatement).many().map(exclude_none)
               << RBRACE)
382
383
384# Service definition
# `rpc Name ([stream] Req) returns ([stream] Resp) { options } | ;` -> Rpc(...)
# The declaration must end with either a `{ ... }` option block or a ";",
# as the spec requires. The terminator used to be wrapped in .optional(),
# but a missing terminator then passed None to exclude_none and raised
# TypeError; it now produces an ordinary parse failure instead.
rpc = seq(lexeme("rpc") >> rpcName.tag('name'),
          LPAREN
          >> (is_present("stream").tag("request_stream")),
          messageType.tag("request_message_type") << RPAREN,
          lexeme("returns") >> LPAREN
          >> (is_present("stream").tag("response_stream")),
          messageType.tag("response_message_type")
          << RPAREN,
          ((LBRACE
            >> (option | emptyStatement).many()
            << RBRACE)
           | SEMI.result([])
           ).map(exclude_none).tag('options')
          ).combine_dict(Rpc)
399
# `service Name { ... }` -> Service(name=..., body=[...]).
# The body holds Option and Rpc objects; empty statements are filtered out.
service = seq(lexeme("service") >> serviceName.tag('name'),
              LBRACE
              >> (option | rpc | emptyStatement).many().map(exclude_none).tag('body')
              << RBRACE
              ).combine_dict(Service)
405
406
407# Proto file
# Definitions allowed at the top level of a file.
topLevelDef = message | enum | service
# Whole file: the syntax declaration followed by any number of statements.
# Yields Proto(syntax=..., statements=[...]) with empty statements removed.
proto = seq(syntax.tag('syntax'),
            (import_ | package | option | topLevelDef | emptyStatement
             ).many().map(exclude_none).tag('statements')
            ).combine_dict(Proto)
413
414
# Sample .proto text exercising imports, options, a package, an enum and a
# message with a nested message, a map field and a oneof.
EXAMPLE = """syntax = "proto3";
import public "other.proto";
option java_package = "com.example.foo";
option java_package = "com.example.foo";
package dmi;

enum EnumAllowingAlias {
 option allow_alias = true;
 UNKNOWN = 0;
 STARTED = 1;
 RUNNING = 2 [(custom_option) = "hello world"];
}
message outer {
 option (my_option).a = true;
 message inner {
 int64 ival = 1;
 }
 repeated inner inner_message = 2;
 EnumAllowingAlias enum_field =3;
 map<int32, string> my_map = 4;
 oneof operation {
 MetricsConfig changes = 2;
 bool reset_to_default = 3;
 }
}
"""
# Smoke test - should find 6 top level statements in the example
# (import, two options, package, enum, message):
# assert len(proto.parse(EXAMPLE).statements) == 6
443# print(proto.parse(EXAMPLE).statements)
444# for st in proto.parse(EXAMPLE).statements:
445# print(type(st))