Blame - grpc_robot/tools/protobuf_parse.py - grpc-robot

blob: f11e1a3b14581c46d2386c76124a0f36b4574d2a [file] [log] [blame]

Martin Cosyns	0efdc87	2021-09-27 16:24:30 +0000	[diff] [blame^]	1	# Copyright 2020-present Open Networking Foundation
				2	# Original copyright 2020-present ADTRAN, Inc.
				3	#
				4	# Licensed under the Apache License, Version 2.0 (the "License");
				5	# you may not use this file except in compliance with the License.
				6	# You may obtain a copy of the License at
				7	#
				8	# http://www.apache.org/licenses/LICENSE-2.0
				9	#
				10	# Unless required by applicable law or agreed to in writing, software
				11	# distributed under the License is distributed on an "AS IS" BASIS,
				12	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	# See the License for the specific language governing permissions and
				14	# -*- coding: utf-8 -*-
				15
				16	# Parser for protocol buffer .proto files
				17	import enum as stdlib_enum
				18	from string import ascii_letters, digits, hexdigits, octdigits
				19
				20	import attr
				21
				22	from parsy import char_from, from_enum, generate, regex, seq, string
				23
				24	# This file follows the spec at
				25	# https://developers.google.com/protocol-buffers/docs/reference/proto3-spec
				26	# very closely.
				27
				28	# However, because we are parsing into useful objects, we do transformations
				29	# along the way e.g. turning into integers, strings etc. and custom objects.
				30	# Some of the lowest level items have been implemented using 'regex' and converting
				31	# the descriptions to regular expressions. Higher level constructs have been
				32	# implemented using other parsy primitives and combinators.
				33
				34	# Notes:
				35
				36	# 1. Whitespace is very badly defined in the 'spec', so we guess what is meant.
				37	# 2. The spec doesn't allow for comments, and neither does this parser.
				38	# Other places mention that C++ style comments are allowed. To support that,
				39	# this parser would need to be changed into split lexing/parsing stages
				40	# (otherwise you hit issues with comments start markers within string literals).
				41	# 3. Other notes inline.
				42
				43
				44	# Our utilities
				45	optional_string = lambda s: string(s).times(0, 1).concat()
				46	convert_decimal = int
				47	convert_octal = lambda s: int(s, 8)
				48	convert_hex = lambda s: int(s, 16)
				49	exclude_none = lambda l: [i for i in l if i is not None]
				50
				51
				52	def lexeme(p):
				53	"""
				54	From a parser (or string), make a parser that consumes
				55	whitespace on either side.
				56	"""
				57	if isinstance(p, str):
				58	p = string(p)
				59	return regex(r'\s') >> p << regex(r'\s')
				60
				61
				62	def is_present(p):
				63	"""
				64	Given a parser or string, make a parser that returns
				65	True if the parser matches, False otherwise
				66	"""
				67	return lexeme(p).optional().map(lambda v: False if v is None else True)
				68
				69
				70	# Our data structures
				71	@attr.s
				72	class Import:
				73	identifier = attr.ib()
				74	option = attr.ib()
				75
				76
				77	@attr.s
				78	class Package:
				79	identifer = attr.ib()
				80
				81
				82	@attr.s
				83	class Option:
				84	name = attr.ib()
				85	value = attr.ib()
				86
				87
				88	@attr.s
				89	class Field:
				90	repeated = attr.ib()
				91	type = attr.ib()
				92	name = attr.ib()
				93	number = attr.ib()
				94	options = attr.ib()
				95
				96
				97	@attr.s
				98	class OneOfField:
				99	type = attr.ib()
				100	name = attr.ib()
				101	number = attr.ib()
				102	options = attr.ib()
				103
				104
				105	@attr.s
				106	class OneOf:
				107	name = attr.ib()
				108	fields = attr.ib()
				109
				110
				111	@attr.s
				112	class Map:
				113	key_type = attr.ib()
				114	type = attr.ib()
				115	name = attr.ib()
				116	number = attr.ib()
				117	options = attr.ib()
				118
				119
				120	@attr.s
				121	class Reserved:
				122	items = attr.ib()
				123
				124
				125	@attr.s
				126	class Range:
				127	from_ = attr.ib()
				128	to = attr.ib()
				129
				130
				131	@attr.s
				132	class EnumField:
				133	name = attr.ib()
				134	value = attr.ib()
				135	options = attr.ib()
				136
				137
				138	@attr.s
				139	class Enum:
				140	name = attr.ib()
				141	body = attr.ib()
				142
				143
				144	@attr.s
				145	class Message:
				146	name = attr.ib()
				147	body = attr.ib()
				148
				149
				150	@attr.s
				151	class Service:
				152	name = attr.ib()
				153	body = attr.ib()
				154
				155
				156	@attr.s
				157	class Rpc:
				158	name = attr.ib()
				159	request_stream = attr.ib()
				160	request_message_type = attr.ib()
				161	response_stream = attr.ib()
				162	response_message_type = attr.ib()
				163	options = attr.ib()
				164
				165
				166	@attr.s
				167	class Proto:
				168	syntax = attr.ib()
				169	statements = attr.ib()
				170
				171
				172	# Enums:
				173	class ImportOption(stdlib_enum.Enum):
				174	WEAK = "weak"
				175	PUBLIC = "public"
				176
				177
				178	class Type(stdlib_enum.Enum):
				179	DOUBLE = "double"
				180	FLOAT = "float"
				181	INT32 = "int32"
				182	INT64 = "int64"
				183	UINT32 = "uint32"
				184	UINT64 = "uint64"
				185	SINT32 = "sint32"
				186	SINT64 = "sint64"
				187	FIXED32 = "fixed32"
				188	FIXED64 = "fixed64"
				189	SFIXED32 = "sfixed32"
				190	SFIXED64 = "sfixed64"
				191	BOOL = "bool"
				192	STRING = "string"
				193	BYTES = "bytes"
				194
				195
				196	class KeyType(stdlib_enum.Enum):
				197	INT32 = "int32"
				198	INT64 = "int64"
				199	UINT32 = "uint32"
				200	UINT64 = "uint64"
				201	SINT32 = "sint32"
				202	SINT64 = "sint64"
				203	FIXED32 = "fixed32"
				204	FIXED64 = "fixed64"
				205	SFIXED32 = "sfixed32"
				206	SFIXED64 = "sfixed64"
				207	BOOL = "bool"
				208	STRING = "string"
				209
				210
				211	# Some extra constants to avoid typing
				212	SEMI, EQ, LPAREN, RPAREN, LBRACE, RBRACE, LBRAC, RBRAC = [lexeme(c) for c in ";=(){}[]"]
				213
				214
				215	# -- Beginning of following spec --
				216	# Letters and digits
				217	letter = char_from(ascii_letters)
				218	decimalDigit = char_from(digits)
				219	octalDigit = char_from(octdigits)
				220	hexDigit = char_from(hexdigits)
				221
				222	# Identifiers
				223
				224	# Compared to spec, we add some '_' prefixed items which are not wrapped in `lexeme`,
				225	# on the assumption that spaces in the middle of identifiers are not accepted.
				226	_ident = (letter + (letter \| decimalDigit \| string("_")).many().concat()).desc('ident')
				227	ident = lexeme(_ident)
				228	fullIdent = lexeme(ident + (string(".") + ident).many().concat()).desc('fullIdent')
				229	_messageName = _ident
				230	messageName = lexeme(ident).desc('messageName')
				231	_enumName = ident
				232	enumName = lexeme(_enumName).desc('enumName')
				233	fieldName = ident.desc('fieldName')
				234	oneofName = ident.desc('oneofName')
				235	mapName = ident.desc('mapName')
				236	serviceName = ident.desc('serviceName')
				237	rpcName = ident.desc('rpcName')
				238	messageType = optional_string(".") + (_ident + string(".")).many().concat() + _messageName
				239	enumType = optional_string(".") + (_ident + string(".")).many().concat() + _enumName
				240
				241	# Integer literals
				242	decimalLit = regex("[1-9][0-9]*").desc('decimalLit').map(convert_decimal)
				243	octalLit = regex("0[0-7]*").desc('octalLit').map(convert_octal)
				244	hexLit = regex("0[x\|X][0-9a-fA-F]+").desc('octalLit').map(convert_hex)
				245	intLit = decimalLit \| octalLit \| hexLit
				246
				247
				248	# Floating-point literals
				249	decimals = r'[0-9]+'
				250	exponent = r'[e\|E][+\|-]?' + decimals
				251	floatLit = regex(r'({decimals}\.({decimals})?({exponent})?)\|{decimals}{exponent}\|\.{decimals}({exponent})?'
				252	.format(decimals=decimals, exponent=exponent)).desc('floatLit').map(float)
				253
				254
				255	# Boolean
				256	boolLit = (string("true").result(True) \| string("false").result(False)).desc('boolLit')
				257
				258
				259	# String literals
				260	hexEscape = regex(r"\\[x\|X]") >> regex("[0-9a-fA-F]{2}").map(convert_hex).map(chr)
				261	octEscape = regex(r"\\") >> regex('[0-7]{2}').map(convert_octal).map(chr)
				262	charEscape = regex(r"\\") >> (
				263	string("a").result("\a")
				264	\| string("b").result("\b")
				265	\| string("f").result("\f")
				266	\| string("n").result("\n")
				267	\| string("r").result("\r")
				268	\| string("t").result("\t")
				269	\| string("v").result("\v")
				270	\| string("\\").result("\\")
				271	\| string("'").result("'")
				272	\| string('"').result('"')
				273	)
				274	escapes = hexEscape \| octEscape \| charEscape
				275	# Correction to spec regarding " and ' inside quoted strings
				276	strLit = (string("'") >> (escapes \| regex(r"[^\0\n\'\\]")).many().concat() << string("'")
				277	\| string('"') >> (escapes \| regex(r"[^\0\n\"\\]")).many().concat() << string('"')).desc('strLit')
				278	quote = string("'") \| string('"')
				279
				280	# EmptyStatement
				281	emptyStatement = string(";").result(None)
				282
				283	# Signed numbers:
				284	# (Extra compared to spec, to cope with need to produce signed numeric values)
				285	signedNumberChange = lambda s, num: (-1) if s == "-" else (+1)
				286	sign = regex("[-+]?")
				287	signedIntLit = seq(sign, intLit).combine(signedNumberChange)
				288	signedFloatLit = seq(sign, floatLit).combine(signedNumberChange)
				289
				290
				291	# Constant
				292	# put fullIdent at end to disabmiguate from boolLit
				293	constant = signedIntLit \| signedFloatLit \| strLit \| boolLit \| fullIdent
				294
				295	# Syntax
				296	syntax = lexeme("syntax") >> EQ >> quote >> string("proto3") << quote + SEMI
				297
				298	# Import Statement
				299	import_option = from_enum(ImportOption)
				300
				301	import_ = seq(lexeme("import") >> import_option.optional().tag('option'),
				302	lexeme(strLit).tag('identifier') << SEMI).combine_dict(Import)
				303
				304	# Package
				305	package = seq(lexeme("package") >> fullIdent << SEMI).map(Package)
				306
				307	# Option
				308	optionName = (ident \| (LPAREN >> fullIdent << RPAREN)) + (string(".") + ident).many().concat()
				309	option = seq(lexeme("option") >> optionName.tag('name'),
				310	EQ >> constant.tag('value') << SEMI,
				311	).combine_dict(Option)
				312
				313	# Normal field
				314	type_ = lexeme(from_enum(Type) \| messageType \| enumType)
				315	fieldNumber = lexeme(intLit)
				316
				317	fieldOption = seq(optionName.tag('name'),
				318	EQ >> constant.tag('value')).combine_dict(Option)
				319	fieldOptions = fieldOption.sep_by(lexeme(","), min=1)
				320	fieldOptionList = (lexeme("[") >> fieldOptions << lexeme("]")).optional().map(
				321	lambda o: [] if o is None else o)
				322
				323	field = seq(is_present("repeated").tag('repeated'),
				324	type_.tag('type'),
				325	fieldName.tag('name') << EQ,
				326	fieldNumber.tag('number'),
				327	fieldOptionList.tag('options') << SEMI,
				328	).combine_dict(Field)
				329
				330	# Oneof and oneof field
				331	oneofField = seq(type_.tag('type'),
				332	fieldName.tag('name') << EQ,
				333	fieldNumber.tag('number'),
				334	fieldOptionList.tag('options') << SEMI,
				335	).combine_dict(OneOfField)
				336	oneof = seq(lexeme("oneof") >> oneofName.tag('name'),
				337	LBRACE
				338	>> (oneofField \| emptyStatement).many().map(exclude_none).tag('fields')
				339	<< RBRACE
				340	).combine_dict(OneOf)
				341
				342	# Map field
				343	keyType = lexeme(from_enum(KeyType))
				344	mapField = seq(lexeme("map") >> lexeme("<") >> keyType.tag('key_type'),
				345	lexeme(",") >> type_.tag('type'),
				346	lexeme(">") >> mapName.tag('name'),
				347	EQ >> fieldNumber.tag('number'),
				348	fieldOptionList.tag('options') << SEMI
				349	).combine_dict(Map)
				350
				351	# Reserved
				352	range_ = seq(lexeme(intLit).tag('from_'),
				353	(lexeme("to") >> (intLit \| lexeme("max"))).optional().tag('to')
				354	).combine_dict(Range)
				355	ranges = range_.sep_by(lexeme(","), min=1)
				356	# The spec for 'reserved' indicates 'fieldName' here, which is never a quoted string.
				357	# But the example has a quoted string. We have changed it to 'strLit'
				358	fieldNames = strLit.sep_by(lexeme(","), min=1)
				359	reserved = seq(lexeme("reserved") >> (ranges \| fieldNames) << SEMI
				360	).combine(Reserved)
				361
				362	# Enum definition
				363	enumValueOption = seq(optionName.tag('name') << EQ,
				364	constant.tag('value')
				365	).combine_dict(Option)
				366	enumField = seq(ident.tag('name') << EQ,
				367	lexeme(intLit).tag('value'),
				368	(lexeme("[") >> enumValueOption.sep_by(lexeme(","), min=1) << lexeme("]")).optional()
				369	.map(lambda o: [] if o is None else o).tag('options')
				370	<< SEMI
				371	).combine_dict(EnumField)
				372	enumBody = (LBRACE
				373	>> (option \| enumField \| emptyStatement).many().map(exclude_none)
				374	<< RBRACE)
				375	enum = seq(lexeme("enum") >> enumName.tag('name'),
				376	enumBody.tag('body')
				377	).combine_dict(Enum)
				378
				379
				380	# Message definition
				381	@generate
				382	def message():
				383	yield lexeme("message")
				384	name = yield messageName
				385	body = yield messageBody
				386	return Message(name=name, body=body)
				387
				388
				389	messageBody = (LBRACE
				390	>> (field \| enum \| message \| option \| oneof \| mapField
				391	\| reserved \| emptyStatement).many()
				392	<< RBRACE)
				393
				394
				395	# Service definition
				396	rpc = seq(lexeme("rpc") >> rpcName.tag('name'),
				397	LPAREN
				398	>> (is_present("stream").tag("request_stream")),
				399	messageType.tag("request_message_type") << RPAREN,
				400	lexeme("returns") >> LPAREN
				401	>> (is_present("stream").tag("response_stream")),
				402	messageType.tag("response_message_type")
				403	<< RPAREN,
				404	((LBRACE
				405	>> (option \| emptyStatement).many()
				406	<< RBRACE)
				407	\| SEMI.result([])
				408	).optional().map(exclude_none).tag('options')
				409	).combine_dict(Rpc)
				410
				411	service = seq(lexeme("service") >> serviceName.tag('name'),
				412	LBRACE
				413	>> (option \| rpc \| emptyStatement).many().map(exclude_none).tag('body')
				414	<< RBRACE
				415	).combine_dict(Service)
				416
				417
				418	# Proto file
				419	topLevelDef = message \| enum \| service
				420	proto = seq(syntax.tag('syntax'),
				421	(import_ \| package \| option \| topLevelDef \| emptyStatement
				422	).many().map(exclude_none).tag('statements')
				423	).combine_dict(Proto)
				424
				425
				426	EXAMPLE = """syntax = "proto3";
				427	import public "other.proto";
				428	option java_package = "com.example.foo";
				429	option java_package = "com.example.foo";
				430	package dmi;
				431
				432	enum EnumAllowingAlias {
				433	option allow_alias = true;
				434	UNKNOWN = 0;
				435	STARTED = 1;
				436	RUNNING = 2 [(custom_option) = "hello world"];
				437	}
				438	message outer {
				439	option (my_option).a = true;
				440	message inner {
				441	int64 ival = 1;
				442	}
				443	repeated inner inner_message = 2;
				444	EnumAllowingAlias enum_field =3;
				445	map<int32, string> my_map = 4;
				446	oneof operation {
				447	MetricsConfig changes = 2;
				448	bool reset_to_default = 3;
				449	}
				450	}
				451	"""
				452	# Smoke test - should find 6 top level statements in the example:
				453	# assert len(proto.parse(EXAMPLE).statements) == 6
				454	# print(proto.parse(EXAMPLE).statements)
				455	# for st in proto.parse(EXAMPLE).statements:
				456	# print(type(st))