# Copyright 2020-present Open Networking Foundation
# Original copyright 2020-present ADTRAN, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-

# Parser for protocol buffer .proto files
import enum as stdlib_enum
from string import ascii_letters, digits, hexdigits, octdigits

import attr

from parsy import char_from, from_enum, generate, regex, seq, string

# This file follows the spec at
# https://developers.google.com/protocol-buffers/docs/reference/proto3-spec
# very closely.

# However, because we are parsing into useful objects, we do transformations
# along the way, e.g. turning text into integers, strings etc. and custom objects.
# Some of the lowest level items have been implemented using 'regex' and converting
# the descriptions to regular expressions. Higher level constructs have been
# implemented using other parsy primitives and combinators.

# Notes:

# 1. Whitespace is very badly defined in the 'spec', so we guess what is meant.
# 2. The spec doesn't allow for comments, and neither does this parser.
#    Other places mention that C++ style comments are allowed. To support that,
#    this parser would need to be changed into split lexing/parsing stages
#    (otherwise you hit issues with comment start markers within string literals).
# 3. Other notes inline.


# Our utilities
def optional_string(s):
    """Return a parser matching the literal `s` zero or one time.

    The parser's result is the matched text, or "" when `s` is absent.
    """
    return string(s).times(0, 1).concat()


# Decimal digit strings convert with the plain `int` constructor.
convert_decimal = int


def convert_octal(s):
    """Convert a string of octal digits to an int."""
    return int(s, 8)


def convert_hex(s):
    """Convert a string of hex digits (a 0x/0X prefix is accepted) to an int."""
    return int(s, 16)


def exclude_none(l):  # noqa: E741 - parameter name kept for compatibility
    """Return a new list holding the items of `l` that are not None."""
    return [i for i in l if i is not None]


def lexeme(p):
    """
    From a parser (or string), make a parser that consumes
    whitespace on either side.

    A plain string is first turned into a parser for that exact text.
    The result of the returned parser is the result of `p`; the
    surrounding whitespace is discarded.
    """
    if isinstance(p, str):
        p = string(p)
    whitespace = regex(r'\s*')
    return whitespace >> p << whitespace


def is_present(p):
    """
    Given a parser or string, make a parser that returns
    True if the parser matches, False otherwise.

    `p` is wrapped with `lexeme`, so surrounding whitespace is consumed
    when it matches. The returned parser never fails: an absent `p`
    simply yields False.
    """
    return lexeme(p).optional().map(lambda v: v is not None)


# Our data structures
@attr.s
class Import:
    """An `import` statement."""

    # Imported file name (text of the quoted string literal).
    identifier = attr.ib()
    # ImportOption member, or None when no weak/public modifier is given.
    option = attr.ib()


@attr.s
class Package:
    """A `package` statement."""

    # Package name. NOTE: the attribute name is misspelled ("identifer");
    # it is kept as-is because renaming it would break existing users.
    identifer = attr.ib()


@attr.s
class Option:
    """An option: either an `option` statement or an entry in a [...] list."""

    # Option name as text.
    name = attr.ib()
    # Parsed constant: a number, string, bool or identifier text.
    value = attr.ib()


@attr.s
class Field:
    """A normal message field."""

    # True when the field is declared `repeated`.
    repeated = attr.ib()
    # Type member for scalar types, otherwise the message/enum type name text.
    type = attr.ib()
    # Field name.
    name = attr.ib()
    # Field number (int).
    number = attr.ib()
    # List of Option entries; empty when no [...] list is given.
    options = attr.ib()


@attr.s
class OneOfField:
    """A field declared inside a `oneof` block."""

    # Type member for scalar types, otherwise the message/enum type name text.
    type = attr.ib()
    # Field name.
    name = attr.ib()
    # Field number (int).
    number = attr.ib()
    # List of Option entries; empty when no [...] list is given.
    options = attr.ib()


@attr.s
class OneOf:
    """A `oneof` block."""

    # Name of the oneof.
    name = attr.ib()
    # List of OneOfField entries.
    fields = attr.ib()


@attr.s
class Map:
    """A `map<key, value>` field."""

    # KeyType member for the map key.
    key_type = attr.ib()
    # Value type: Type member, or the message/enum type name text.
    type = attr.ib()
    # Field name.
    name = attr.ib()
    # Field number (int).
    number = attr.ib()
    # List of Option entries; empty when no [...] list is given.
    options = attr.ib()


@attr.s
class Reserved:
    """A `reserved` statement."""

    # Either a list of Range entries or a list of reserved field names.
    items = attr.ib()


@attr.s
class Range:
    """A field-number range inside a `reserved` statement."""

    # First number of the range (int).
    from_ = attr.ib()
    # Upper bound: an int, the text "max", or None for a single number.
    to = attr.ib()


@attr.s
class EnumField:
    """A single value declared inside an enum."""

    # Value name.
    name = attr.ib()
    # Numeric value (int).
    value = attr.ib()
    # List of Option entries; empty when no [...] list is given.
    options = attr.ib()


@attr.s
class Enum:
    """An `enum` definition."""

    # Enum name.
    name = attr.ib()
    # List of the Option and EnumField entries found in the body.
    body = attr.ib()


@attr.s
class Message:
    """A `message` definition."""

    # Message name.
    name = attr.ib()
    # List of the statements parsed from the message body.
    body = attr.ib()


@attr.s
class Service:
    """A `service` definition."""

    # Service name.
    name = attr.ib()
    # List of the Option and Rpc entries found in the body.
    body = attr.ib()


@attr.s
class Rpc:
    """An `rpc` declaration inside a service."""

    # Method name.
    name = attr.ib()
    # True when the request is declared with `stream`.
    request_stream = attr.ib()
    # Request message type name.
    request_message_type = attr.ib()
    # True when the response is declared with `stream`.
    response_stream = attr.ib()
    # Response message type name.
    response_message_type = attr.ib()
    # List of Option entries from the rpc body.
    options = attr.ib()


@attr.s
class Proto:
    """A whole parsed .proto file."""

    # Value of the leading `syntax` statement.
    syntax = attr.ib()
    # List of the top-level statements, in source order.
    statements = attr.ib()


# Enums:
class ImportOption(stdlib_enum.Enum):
    """Modifiers allowed between `import` and the file name."""

    WEAK = "weak"
    PUBLIC = "public"


class Type(stdlib_enum.Enum):
    """Built-in scalar field types; each value is the keyword as written."""

    DOUBLE = "double"
    FLOAT = "float"
    INT32 = "int32"
    INT64 = "int64"
    UINT32 = "uint32"
    UINT64 = "uint64"
    SINT32 = "sint32"
    SINT64 = "sint64"
    FIXED32 = "fixed32"
    FIXED64 = "fixed64"
    SFIXED32 = "sfixed32"
    SFIXED64 = "sfixed64"
    BOOL = "bool"
    STRING = "string"
    BYTES = "bytes"


class KeyType(stdlib_enum.Enum):
    """Types allowed as the key of a map field.

    Same members as `Type` minus DOUBLE, FLOAT and BYTES.
    """

    INT32 = "int32"
    INT64 = "int64"
    UINT32 = "uint32"
    UINT64 = "uint64"
    SINT32 = "sint32"
    SINT64 = "sint64"
    FIXED32 = "fixed32"
    FIXED64 = "fixed64"
    SFIXED32 = "sfixed32"
    SFIXED64 = "sfixed64"
    BOOL = "bool"
    STRING = "string"


# Some extra constants to avoid typing.
# Each is a whitespace-tolerant parser for one punctuation character.
SEMI, EQ, LPAREN, RPAREN, LBRACE, RBRACE, LBRAC, RBRAC = [lexeme(c) for c in ";=(){}[]"]


# -- Beginning of following spec --
# Letters and digits
# Each parser matches exactly one character from the given set.
letter = char_from(ascii_letters)
decimalDigit = char_from(digits)
octalDigit = char_from(octdigits)
hexDigit = char_from(hexdigits)

# Identifiers

# Compared to spec, we add some '_' prefixed items which are not wrapped in `lexeme`,
# on the assumption that spaces in the middle of identifiers are not accepted.
_ident = (letter + (letter | decimalDigit | string("_")).many().concat()).desc('ident')
ident = lexeme(_ident)
fullIdent = lexeme(ident + (string(".") + ident).many().concat()).desc('fullIdent')
_messageName = _ident
messageName = lexeme(_messageName).desc('messageName')
# Must be the bare `_ident` (like `_messageName`): it is used as the last
# component of `enumType`, where inner whitespace is not wanted.
_enumName = _ident
enumName = lexeme(_enumName).desc('enumName')
fieldName = ident.desc('fieldName')
oneofName = ident.desc('oneofName')
mapName = ident.desc('mapName')
serviceName = ident.desc('serviceName')
rpcName = ident.desc('rpcName')
# Optionally dot-prefixed, dot-separated path ending in a message/enum name.
# The result is the full matched text.
messageType = optional_string(".") + (_ident + string(".")).many().concat() + _messageName
enumType = optional_string(".") + (_ident + string(".")).many().concat() + _enumName

# Integer literals
# Each parser yields an int.
decimalLit = regex("[1-9][0-9]*").desc('decimalLit').map(convert_decimal)
octalLit = regex("0[0-7]*").desc('octalLit').map(convert_octal)
hexLit = regex("0[xX][0-9a-fA-F]+").desc('hexLit').map(convert_hex)
# hexLit must be tried before octalLit: octalLit matches the leading "0" of
# "0x..." on its own, which would leave the rest of a hex literal unparsed.
intLit = decimalLit | hexLit | octalLit


# Floating-point literals
# `decimals` and `exponent` are regex fragments (plain strings), not parsers.
decimals = r'[0-9]+'
exponent = r'[eE][+-]?' + decimals
# Three accepted shapes: "1." / "1.5" / "1.5e3", "1e3", and ".5" / ".5e3".
# The result is a Python float.
floatLit = regex(
    r'({decimals}\.({decimals})?({exponent})?)|{decimals}{exponent}|\.{decimals}({exponent})?'
    .format(decimals=decimals, exponent=exponent)
).desc('floatLit').map(float)


# Boolean
# The trailing \b stops "true"/"false" from matching as the prefix of a longer
# identifier (e.g. "trueish"), which must fall through to an identifier parser.
boolLit = (regex(r"true\b").result(True) | regex(r"false\b").result(False)).desc('boolLit')


# String literals
# Each escape parser yields the single character the escape stands for.
# Hex escapes take one or two digits and octal escapes one to three, as in the
# proto3 spec; longest match is taken, so "\101" decodes as one character.
hexEscape = regex(r"\\[xX]") >> regex("[0-9a-fA-F]{1,2}").map(convert_hex).map(chr)
octEscape = regex(r"\\") >> regex('[0-7]{1,3}').map(convert_octal).map(chr)
charEscape = regex(r"\\") >> (
    string("a").result("\a")
    | string("b").result("\b")
    | string("f").result("\f")
    | string("n").result("\n")
    | string("r").result("\r")
    | string("t").result("\t")
    | string("v").result("\v")
    | string("\\").result("\\")
    | string("'").result("'")
    | string('"').result('"')
)
escapes = hexEscape | octEscape | charEscape
# Correction to spec regarding " and ' inside quoted strings:
# the unescaped form of the enclosing quote is not allowed inside the literal.
# The result is the string's content with escapes decoded and quotes removed.
strLit = (string("'") >> (escapes | regex(r"[^\0\n\'\\]")).many().concat() << string("'")
          | string('"') >> (escapes | regex(r"[^\0\n\"\\]")).many().concat() << string('"')).desc('strLit')
quote = string("'") | string('"')

# EmptyStatement
# A lone ";". It yields None so that callers can filter it out of statement lists.
emptyStatement = string(";").result(None)

# Signed numbers:
# (Extra compared to spec, to cope with need to produce signed numeric values)
def signedNumberChange(s, num):
    """Apply the sign text `s` ("-", "+" or "") to the number `num`.

    Returns `num` negated when `s` is "-", otherwise `num` unchanged.
    """
    return -num if s == "-" else num
# Optional sign; yields "-", "+" or "" and never fails.
sign = regex("[-+]?")
# The sign text and the parsed number are passed to signedNumberChange.
signedIntLit = seq(sign, intLit).combine(signedNumberChange)
signedFloatLit = seq(sign, floatLit).combine(signedNumberChange)


# Constant
# Order matters, because alternatives are tried left to right and the first
# success wins:
# - signedFloatLit comes before signedIntLit, otherwise "1.5" is accepted as the
#   integer 1 and the remaining ".5" breaks the enclosing statement;
# - fullIdent is put at the end to disambiguate from boolLit.
constant = signedFloatLit | signedIntLit | strLit | boolLit | fullIdent

# Syntax
# Matches `syntax = "proto3";` (either quote style) and yields "proto3".
# Written as a plain `<<` chain rather than `quote + SEMI`, which only worked
# because `+` binds tighter than `<<`.
syntax = lexeme("syntax") >> EQ >> quote >> string("proto3") << quote << SEMI

# Import Statement
import_option = from_enum(ImportOption)

# `import [weak|public] "file";` -> Import(option=..., identifier=...)
import_ = seq(
    lexeme("import") >> import_option.optional().tag('option'),
    lexeme(strLit).tag('identifier') << SEMI,
).combine_dict(Import)

# Package
# `package foo.bar;` -> Package("foo.bar").
# The name is mapped directly; wrapping the parser in a one-element `seq`
# would hand Package a list instead of the name text.
package = (lexeme("package") >> fullIdent << SEMI).map(Package)

# Option
# Name is a plain identifier or a parenthesised full identifier, optionally
# followed by ".ident" parts; the result is the joined text without parentheses.
optionName = (ident | (LPAREN >> fullIdent << RPAREN)) + (string(".") + ident).many().concat()
# `option name = constant;` -> Option(name=..., value=...)
option = seq(
    lexeme("option") >> optionName.tag('name'),
    EQ >> constant.tag('value') << SEMI,
).combine_dict(Option)

# Normal field
# Scalar keywords yield a Type member; anything else yields the type name text.
type_ = lexeme(from_enum(Type) | messageType | enumType)
fieldNumber = lexeme(intLit)

# One `name = constant` entry of a bracketed option list.
fieldOption = seq(
    optionName.tag('name'),
    EQ >> constant.tag('value'),
).combine_dict(Option)
fieldOptions = fieldOption.sep_by(lexeme(","), min=1)
# Optional `[opt, ...]` list; yields [] when the brackets are absent.
fieldOptionList = (LBRAC >> fieldOptions << RBRAC).optional().map(
    lambda o: [] if o is None else o)

# `[repeated] type name = number [options];` -> Field(...)
field = seq(
    is_present("repeated").tag('repeated'),
    type_.tag('type'),
    fieldName.tag('name') << EQ,
    fieldNumber.tag('number'),
    fieldOptionList.tag('options') << SEMI,
).combine_dict(Field)

# Oneof and oneof field
# `type name = number [options];` -> OneOfField(...)
oneofField = seq(
    type_.tag('type'),
    fieldName.tag('name') << EQ,
    fieldNumber.tag('number'),
    fieldOptionList.tag('options') << SEMI,
).combine_dict(OneOfField)
# `oneof name { fields }` -> OneOf(...); empty statements are dropped.
oneof = seq(
    lexeme("oneof") >> oneofName.tag('name'),
    LBRACE
    >> (oneofField | emptyStatement).many().map(exclude_none).tag('fields')
    << RBRACE,
).combine_dict(OneOf)

# Map field
keyType = lexeme(from_enum(KeyType))
# `map<keyType, type> name = number [options];` -> Map(...)
mapField = seq(
    lexeme("map") >> lexeme("<") >> keyType.tag('key_type'),
    lexeme(",") >> type_.tag('type'),
    lexeme(">") >> mapName.tag('name'),
    EQ >> fieldNumber.tag('number'),
    fieldOptionList.tag('options') << SEMI,
).combine_dict(Map)

# Reserved
# `N` or `N to M` or `N to max`; `to` is None when only one number is given.
range_ = seq(
    lexeme(intLit).tag('from_'),
    (lexeme("to") >> (intLit | lexeme("max"))).optional().tag('to'),
).combine_dict(Range)
ranges = range_.sep_by(lexeme(","), min=1)
# The spec for 'reserved' indicates 'fieldName' here, which is never a quoted string.
# But the example has a quoted string. We have changed it to 'strLit'
fieldNames = strLit.sep_by(lexeme(","), min=1)
# `reserved ranges;` or `reserved "name", ...;` -> Reserved(items)
reserved = seq(
    lexeme("reserved") >> (ranges | fieldNames) << SEMI,
).combine(Reserved)

# Enum definition
# One `name = constant` entry of an enum value's bracketed option list.
enumValueOption = seq(
    optionName.tag('name') << EQ,
    constant.tag('value'),
).combine_dict(Option)
# `NAME = number [options];` -> EnumField(...); options is [] when absent.
enumField = seq(
    ident.tag('name') << EQ,
    lexeme(intLit).tag('value'),
    (lexeme("[") >> enumValueOption.sep_by(lexeme(","), min=1) << lexeme("]")).optional()
    .map(lambda o: [] if o is None else o).tag('options')
    << SEMI,
).combine_dict(EnumField)
# `{ ... }` holding options and enum values; empty statements are dropped.
enumBody = (
    LBRACE
    >> (option | enumField | emptyStatement).many().map(exclude_none)
    << RBRACE
)
# `enum Name { ... }` -> Enum(...)
enum = seq(
    lexeme("enum") >> enumName.tag('name'),
    enumBody.tag('body'),
).combine_dict(Enum)


# Message definition
# Written as a generator so that it can refer to `messageBody`, which is
# defined below and itself refers back to `message` (nested messages).
@generate
def message():
    """Parse `message Name { ... }` into a Message."""
    yield lexeme("message")
    name = yield messageName
    body = yield messageBody
    return Message(name=name, body=body)


# `{ ... }` holding any mix of message-level statements. Empty statements
# yield None and are dropped, as in the enum, oneof and service bodies.
messageBody = (
    LBRACE
    >> (field | enum | message | option | oneof | mapField
        | reserved | emptyStatement).many().map(exclude_none)
    << RBRACE
)


# Service definition
# `rpc Name ([stream] Req) returns ([stream] Resp) { options }` or
# `rpc Name ([stream] Req) returns ([stream] Resp);` -> Rpc(...)
# The declaration must end with a `{ ... }` body or a `;`. The terminator is
# not optional: a missing one is reported as a parse failure rather than
# passing None on to `exclude_none`, which cannot iterate it.
rpc = seq(
    lexeme("rpc") >> rpcName.tag('name'),
    LPAREN >> is_present("stream").tag("request_stream"),
    messageType.tag("request_message_type") << RPAREN,
    lexeme("returns") >> LPAREN >> is_present("stream").tag("response_stream"),
    messageType.tag("response_message_type") << RPAREN,
    (
        (LBRACE
         >> (option | emptyStatement).many().map(exclude_none)
         << RBRACE)
        | SEMI.result([])
    ).tag('options'),
).combine_dict(Rpc)

# `service Name { ... }` -> Service(...); empty statements are dropped.
service = seq(
    lexeme("service") >> serviceName.tag('name'),
    LBRACE
    >> (option | rpc | emptyStatement).many().map(exclude_none).tag('body')
    << RBRACE,
).combine_dict(Service)


# Proto file
topLevelDef = message | enum | service
# A `syntax` statement followed by any number of top-level statements.
# Empty statements are dropped from `statements`.
proto = seq(
    syntax.tag('syntax'),
    (import_ | package | option | topLevelDef | emptyStatement
     ).many().map(exclude_none).tag('statements'),
).combine_dict(Proto)


# Sample .proto source used by the smoke test below.
EXAMPLE = """syntax = "proto3";
import public "other.proto";
option java_package = "com.example.foo";
option java_package = "com.example.foo";
package dmi;

enum EnumAllowingAlias {
option allow_alias = true;
UNKNOWN = 0;
STARTED = 1;
RUNNING = 2 [(custom_option) = "hello world"];
}
message outer {
option (my_option).a = true;
message inner {
int64 ival = 1;
}
repeated inner inner_message = 2;
EnumAllowingAlias enum_field =3;
map<int32, string> my_map = 4;
oneof operation {
MetricsConfig changes = 2;
bool reset_to_default = 3;
}
}
"""
# Smoke test - should find 6 top level statements in the example
# (import, two options, package, enum and message):
# assert len(proto.parse(EXAMPLE).statements) == 6
# print(proto.parse(EXAMPLE).statements)
# for st in proto.parse(EXAMPLE).statements:
#     print(type(st))