blob: e75eb525b2f6a63b63a92b873cd785605df57331 [file] [log] [blame]
Dusan Klinecccaa0d92014-11-09 03:21:31 +01001__author__ = "Dusan (Ph4r05) Klinec"
2__copyright__ = "Copyright (C) 2014 Dusan (ph4r05) Klinec"
3__license__ = "Apache License, Version 2.0"
4__version__ = "1.0"
5
6import ply.lex as lex
7import ply.yacc as yacc
8from .model import *
9
class ProtobufLexer(object):
    """Token rules for protobuf (.proto) sources, consumed by ``ply.lex``.

    Function rules keep their regular expression in the docstring, so those
    docstrings are part of the lexer's behavior and must not be reworded.
    """

    # Reserved words; t_NAME re-tags a matching identifier with the
    # upper-cased keyword as its token type.
    keywords = ('double', 'float', 'int32', 'int64', 'uint32', 'uint64', 'sint32', 'sint64',
                'fixed32', 'fixed64', 'sfixed32', 'sfixed64', 'bool', 'string', 'bytes',
                'message', 'required', 'optional', 'repeated', 'enum', 'extensions', 'max', 'extends', 'extend',
                'to', 'package', 'service', 'rpc', 'returns', 'true', 'false', 'option', 'import')

    tokens = [
        'NAME',
        'NUM',
        'STRING_LITERAL',
        'LINE_COMMENT', 'BLOCK_COMMENT',

        'LBRACE', 'RBRACE', 'LBRACK', 'RBRACK',
        'LPAR', 'RPAR', 'EQ', 'SEMI', 'DOT',
        'STARTTOKEN'

    ] + [k.upper() for k in keywords]
    literals = '()+-*/=?:,.^|&~!=[]{};<>@%'

    t_NUM = r'[+-]?\d+'
    t_STRING_LITERAL = r'\"([^\\\n]|(\\.))*?\"'

    t_ignore_LINE_COMMENT = '//.*'

    def t_BLOCK_COMMENT(self, t):
        r'/\*(.|\n)*?\*/'
        # Nothing is returned, so the comment yields no token; only the line
        # counter is advanced past the newlines it contains.
        t.lexer.lineno += t.value.count('\n')

    t_LBRACE = '{'
    t_RBRACE = '}'
    t_LBRACK = '\\['
    t_RBRACK = '\\]'
    t_LPAR = '\\('
    t_RPAR = '\\)'
    t_EQ = '='
    t_SEMI = ';'
    t_DOT = '\\.'
    t_ignore = ' \t\f'
    t_STARTTOKEN = '\\+'

    def t_NAME(self, t):
        '[A-Za-z_$][A-Za-z0-9_$]*'
        # Identifiers that spell a keyword get that keyword's token type.
        if t.value in ProtobufLexer.keywords:
            t.type = t.value.upper()
        return t

    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)

    def t_newline2(self, t):
        r'(\r\n)+'
        # Each line break is two characters here. Floor division keeps
        # lineno an int (true division would turn it into a float on
        # Python 3).
        t.lexer.lineno += len(t.value) // 2

    def t_error(self, t):
        # Report the offending character and resume one character later.
        print("Illegal character '{}' ({}) in line {}".format(t.value[0], hex(ord(t.value[0])), t.lexer.lineno))
        t.lexer.skip(1)
67
class LexHelper:
    """Computes the overall line/lex span covered by a grammar production."""

    # Amount subtracted from both ends of every non-empty span returned.
    # NOTE(review): the same value is applied to line spans and to lex
    # (character) spans -- confirm that is intended for line numbers.
    offset = 0

    def _max_span(self, p, kind):
        """Return the (min start, max end) span over all symbols of ``p``.

        ``kind`` is either ``"linespan"`` or ``"lexspan"``. For each index
        the span reported by ``p.<kind>(index)`` is used; when that is
        (0, 0) the symbol's own ``<kind>`` attribute is used instead, if it
        has one. Spans that are None, not of length 2, or still (0, 0) are
        skipped. Returns (0, 0) when no usable span is found, otherwise the
        merged span with ``self.offset`` subtracted from both ends.
        """
        span_of = getattr(p, kind)
        low, high = 1e60, -1
        for index in range(len(p)):
            span = span_of(index)
            if span[0] == 0 and span[1] == 0:
                symbol = p[index]
                if not hasattr(symbol, kind):
                    continue
                span = getattr(symbol, kind)
            if span is None or len(span) != 2:
                continue
            if span[0] == 0 and span[1] == 0:
                continue
            if span[0] < low:
                low = span[0]
            if span[1] > high:
                high = span[1]
        # Sentinels untouched: no symbol contributed a span.
        if low == 1e60 and high == -1:
            return (0, 0)
        return (low - self.offset, high - self.offset)

    def get_max_linespan(self, p):
        """Return the merged line span of production ``p`` (see _max_span)."""
        return self._max_span(p, "linespan")

    def get_max_lexspan(self, p):
        """Return the merged lex span of production ``p`` (see _max_span)."""
        return self._max_span(p, "lexspan")

    def set_parse_object(self, dst, p):
        """Store the merged spans of ``p`` and ``p`` itself on ``dst``.

        Calls ``dst.setLexData(linespan=..., lexspan=...)`` followed by
        ``dst.setLexObj(p)``.
        """
        dst.setLexData(linespan=self.get_max_linespan(p), lexspan=self.get_max_lexspan(p))
        dst.setLexObj(p)
107
class ProtobufParser(object):
    """Grammar rules for protobuf (.proto) sources, consumed by ``ply.yacc``.

    Every ``p_*`` method carries its grammar production in the docstring, so
    those docstrings are part of the parser's behavior and must not be
    reworded. Most rules build a model object and attach the production's
    line/lex spans to it through ``self.lh.set_parse_object``.
    """

    tokens = ProtobufLexer.tokens
    offset = 0
    # Class-level default helper, kept for code that reads ProtobufParser.lh.
    lh = LexHelper()

    def __init__(self):
        # Give each parser its own helper so that setOffset() on one
        # instance does not change the offset used by every other instance
        # through the shared class-level default.
        self.lh = LexHelper()

    def setOffset(self, of):
        """Record ``of`` on this parser and on its span helper."""
        self.offset = of
        self.lh.offset = of

    def p_empty(self, p):
        '''empty :'''
        pass

    def p_field_modifier(self, p):
        '''field_modifier : REQUIRED
                          | OPTIONAL
                          | REPEATED'''
        p[0] = LU.i(p, 1)

    def p_primitive_type(self, p):
        '''primitive_type : DOUBLE
                          | FLOAT
                          | INT32
                          | INT64
                          | UINT32
                          | UINT64
                          | SINT32
                          | SINT64
                          | FIXED32
                          | FIXED64
                          | SFIXED32
                          | SFIXED64
                          | BOOL
                          | STRING
                          | BYTES'''
        p[0] = LU.i(p, 1)

    def p_field_id(self, p):
        '''field_id : NUM'''
        p[0] = LU.i(p, 1)

    def p_rvalue(self, p):
        '''rvalue : NUM
                  | TRUE
                  | FALSE'''
        p[0] = LU.i(p, 1)

    def p_rvalue2(self, p):
        '''rvalue : NAME'''
        p[0] = Name(LU.i(p, 1))
        self.lh.set_parse_object(p[0], p)
        p[0].deriveLex()

    def p_field_directive(self, p):
        '''field_directive : LBRACK NAME EQ rvalue RBRACK'''
        p[0] = FieldDirective(Name(LU.i(p, 2)), LU.i(p, 4))
        self.lh.set_parse_object(p[0], p)

    def p_field_directive_times(self, p):
        '''field_directive_times : field_directive_plus'''
        p[0] = p[1]

    def p_field_directive_times2(self, p):
        '''field_directive_times : empty'''
        p[0] = []

    def p_field_directive_plus(self, p):
        '''field_directive_plus : field_directive
                                | field_directive_plus field_directive'''
        # len(p) == 2 is the single-element alternative; otherwise append
        # to the list built so far.
        if len(p) == 2:
            p[0] = [LU(p, 1)]
        else:
            p[0] = p[1] + [LU(p, 2)]

    def p_dotname(self, p):
        '''dotname : NAME
                   | dotname DOT NAME'''
        if len(p) == 2:
            p[0] = [LU(p, 1)]
        else:
            p[0] = p[1] + [LU(p, 3)]

    # Hack for cases when there is a field named 'message' or 'max'
    def p_fieldName(self, p):
        '''field_name : NAME
                      | MESSAGE
                      | MAX'''
        p[0] = Name(LU.i(p, 1))
        self.lh.set_parse_object(p[0], p)
        p[0].deriveLex()

    def p_field_type(self, p):
        '''field_type : primitive_type'''
        p[0] = FieldType(LU.i(p, 1))
        self.lh.set_parse_object(p[0], p)

    def p_field_type2(self, p):
        '''field_type : dotname'''
        p[0] = DotName(LU.i(p, 1))
        self.lh.set_parse_object(p[0], p)
        p[0].deriveLex()

    # Root of the field declaration.
    def p_field_definition(self, p):
        '''field_definition : field_modifier field_type field_name EQ field_id field_directive_times SEMI'''
        p[0] = FieldDefinition(LU.i(p, 1), LU.i(p, 2), LU.i(p, 3), LU.i(p, 5), LU.i(p, 6))
        self.lh.set_parse_object(p[0], p)

    # Root of the enum field declaration.
    def p_enum_field(self, p):
        '''enum_field : field_name EQ NUM SEMI'''
        p[0] = EnumFieldDefinition(LU.i(p, 1), LU.i(p, 3))
        self.lh.set_parse_object(p[0], p)

    def p_enum_body_part(self, p):
        '''enum_body_part : enum_field
                          | option_directive'''
        p[0] = p[1]

    def p_enum_body(self, p):
        '''enum_body : enum_body_part
                     | enum_body enum_body_part'''
        if len(p) == 2:
            p[0] = [p[1]]
        else:
            p[0] = p[1] + [p[2]]

    def p_enum_body_opt(self, p):
        '''enum_body_opt : empty'''
        p[0] = []

    def p_enum_body_opt2(self, p):
        '''enum_body_opt : enum_body'''
        p[0] = p[1]

    # Root of the enum declaration.
    # enum_definition ::= 'enum' ident '{' { ident '=' integer ';' }* '}'
    def p_enum_definition(self, p):
        '''enum_definition : ENUM NAME LBRACE enum_body_opt RBRACE'''
        p[0] = EnumDefinition(Name(LU.i(p, 2)), LU.i(p, 4))
        self.lh.set_parse_object(p[0], p)

    def p_extensions_to(self, p):
        '''extensions_to : MAX'''
        p[0] = ExtensionsMax()
        self.lh.set_parse_object(p[0], p)

    def p_extensions_to2(self, p):
        '''extensions_to : NUM'''
        p[0] = LU.i(p, 1)

    # extensions_definition ::= 'extensions' integer 'to' integer ';'
    def p_extensions_definition(self, p):
        '''extensions_definition : EXTENSIONS NUM TO extensions_to SEMI'''
        p[0] = ExtensionsDirective(LU.i(p, 2), LU.i(p, 4))
        self.lh.set_parse_object(p[0], p)

    # message_extension ::= 'extend' ident '{' message_body '}'
    def p_message_extension(self, p):
        '''message_extension : EXTEND NAME LBRACE message_body RBRACE'''
        p[0] = MessageExtension(Name(LU.i(p, 2)), LU.i(p, 4))
        self.lh.set_parse_object(p[0], p)

    def p_message_body_part(self, p):
        '''message_body_part : field_definition
                             | enum_definition
                             | message_definition
                             | extensions_definition
                             | message_extension'''
        p[0] = p[1]

    # message_body ::= { field_definition | enum_definition | message_definition | extensions_definition | message_extension }*
    def p_message_body(self, p):
        '''message_body : empty'''
        p[0] = []

    # message_body ::= { field_definition | enum_definition | message_definition | extensions_definition | message_extension }*
    def p_message_body2(self, p):
        '''message_body : message_body_part
                        | message_body message_body_part'''
        if len(p) == 2:
            p[0] = [p[1]]
        else:
            p[0] = p[1] + [p[2]]

    # Root of the message declaration.
    # message_definition = MESSAGE_ - ident("messageId") + LBRACE + message_body("body") + RBRACE
    def p_message_definition(self, p):
        '''message_definition : MESSAGE NAME LBRACE message_body RBRACE'''
        p[0] = MessageDefinition(Name(LU.i(p, 2)), LU.i(p, 4))
        self.lh.set_parse_object(p[0], p)

    # method_definition ::= 'rpc' ident '(' [ ident ] ')' 'returns' '(' [ ident ] ')' ';'
    # NOTE(review): the production below has no trailing SEMI and requires
    # both identifiers, unlike the description above -- confirm which is
    # intended.
    def p_method_definition(self, p):
        '''method_definition : RPC NAME LPAR NAME RPAR RETURNS LPAR NAME RPAR'''
        p[0] = MethodDefinition(Name(LU.i(p, 2)), Name(LU.i(p, 4)), Name(LU.i(p, 8)))
        self.lh.set_parse_object(p[0], p)

    def p_method_definition_opt(self, p):
        '''method_definition_opt : empty'''
        p[0] = []

    def p_method_definition_opt2(self, p):
        '''method_definition_opt : method_definition
                                 | method_definition_opt method_definition'''
        if len(p) == 2:
            p[0] = [p[1]]
        else:
            p[0] = p[1] + [p[2]]

    # service_definition ::= 'service' ident '{' method_definition* '}'
    # service_definition = SERVICE_ - ident("serviceName") + LBRACE + ZeroOrMore(Group(method_definition)) + RBRACE
    def p_service_definition(self, p):
        '''service_definition : SERVICE NAME LBRACE method_definition_opt RBRACE'''
        p[0] = ServiceDefinition(Name(LU.i(p, 2)), LU.i(p, 4))
        self.lh.set_parse_object(p[0], p)

    # package_directive ::= 'package' ident [ '.' ident]* ';'
    def p_package_directive(self, p):
        '''package_directive : PACKAGE dotname SEMI'''
        p[0] = PackageStatement(Name(LU.i(p, 2)))
        self.lh.set_parse_object(p[0], p)

    # import_directive = IMPORT_ - quotedString("importFileSpec") + SEMI
    def p_import_directive(self, p):
        '''import_directive : IMPORT STRING_LITERAL SEMI'''
        p[0] = ImportStatement(Literal(LU.i(p, 2)))
        self.lh.set_parse_object(p[0], p)

    def p_option_rvalue(self, p):
        '''option_rvalue : NUM
                         | TRUE
                         | FALSE'''
        p[0] = LU(p, 1)

    def p_option_rvalue2(self, p):
        '''option_rvalue : STRING_LITERAL'''
        p[0] = Literal(LU(p, 1))

    def p_option_rvalue3(self, p):
        '''option_rvalue : NAME'''
        p[0] = Name(LU.i(p, 1))

    # option_directive = OPTION_ - ident("optionName") + EQ + quotedString("optionValue") + SEMI
    def p_option_directive(self, p):
        '''option_directive : OPTION NAME EQ option_rvalue SEMI'''
        p[0] = OptionStatement(Name(LU.i(p, 2)), LU.i(p, 4))
        self.lh.set_parse_object(p[0], p)

    # topLevelStatement = Group(message_definition | message_extension | enum_definition | service_definition | import_directive | option_directive)
    def p_topLevel(self, p):
        '''topLevel : message_definition
                    | message_extension
                    | enum_definition
                    | service_definition
                    | import_directive
                    | option_directive'''
        p[0] = p[1]

    def p_package_definition(self, p):
        '''package_definition : package_directive'''
        p[0] = p[1]

    def p_packages2(self, p):
        '''package_definition : empty'''
        p[0] = []

    def p_statements2(self, p):
        '''statements : topLevel
                      | statements topLevel'''
        if len(p) == 2:
            p[0] = [p[1]]
        else:
            p[0] = p[1] + [p[2]]

    def p_statements(self, p):
        '''statements : empty'''
        p[0] = []

    # parser = Optional(package_directive) + ZeroOrMore(topLevelStatement)
    def p_protofile(self, p):
        '''protofile : package_definition statements'''
        p[0] = ProtoFile(LU.i(p, 1), LU.i(p, 2))
        self.lh.set_parse_object(p[0], p)

    # Parsing starting point
    def p_goal(self, p):
        '''goal : STARTTOKEN protofile'''
        # The leading STARTTOKEN is discarded; the result is the protofile.
        p[0] = p[2]

    def p_error(self, p):
        # Syntax errors are only reported on stdout.
        print('error: {}'.format(p))
400
class ProtobufAnalyzer(object):
    """Front end that tokenizes or parses protobuf sources.

    Builds a ``ply.lex`` lexer from ProtobufLexer and a ``ply.yacc`` parser
    from ProtobufParser with ``goal`` as the start symbol.
    """

    def __init__(self):
        self.lexer = lex.lex(module=ProtobufLexer(), optimize=1)
        self.parser = yacc.yacc(module=ProtobufParser(), start='goal', optimize=1)

    @staticmethod
    def _read_source(_file):
        """Return the full text of ``_file``.

        A ``str`` is treated as a path: the file is opened, read and closed.
        Anything else is treated as an iterable of text chunks (for example
        an already open file object) and is left open for the caller.
        """
        if isinstance(_file, str):
            with open(_file) as handle:
                return ''.join(handle)
        return ''.join(_file)

    def tokenize_string(self, code):
        """Feed ``code`` to the lexer and print every token it yields."""
        self.lexer.input(code)
        for token in self.lexer:
            print(token)

    def tokenize_file(self, _file):
        """Print the tokens of ``_file`` (a path or an iterable of lines)."""
        return self.tokenize_string(self._read_source(_file))

    def parse_string(self, code, debug=0, lineno=1, prefix='+'):
        """Parse ``code`` and return the parser's result.

        ``prefix`` is prepended to ``code`` before parsing; the default '+'
        matches the lexer's STARTTOKEN rule required by the ``goal``
        production. ``lineno`` is the line number the lexer starts
        counting from.
        """
        self.lexer.lineno = lineno
        # NOTE(review): this sets an attribute on the object returned by
        # yacc.yacc(); ProtobufParser.setOffset() is never called from this
        # class, so the span helper's offset presumably stays at its
        # default -- confirm whether that is intended.
        self.parser.offset = len(prefix)
        return self.parser.parse(prefix + code, lexer=self.lexer, debug=debug)

    def parse_file(self, _file, debug=0):
        """Parse ``_file`` (a path or an iterable of lines)."""
        return self.parse_string(self._read_source(_file), debug=debug)