// Copyright (c) 2012-2018 Ugorji Nwoke. All rights reserved.
2// Use of this source code is governed by a MIT license found in the LICENSE file.
3
4// +build ignore
5
6package codec
7
8import "reflect"
9
10/*
11
12A strict Non-validating namespace-aware XML 1.0 parser and (en|de)coder.
13
14We are attempting this due to perceived issues with encoding/xml:
  - Complicated. It tries to do too much, and is not as simple to use as json.
16 - Due to over-engineering, reflection is over-used AND performance suffers:
17 java is 6X faster:http://fabsk.eu/blog/category/informatique/dev/golang/
18 even PYTHON performs better: http://outgoing.typepad.com/outgoing/2014/07/exploring-golang.html
19
20codec framework will offer the following benefits
21 - VASTLY improved performance (when using reflection-mode or codecgen)
22 - simplicity and consistency: with the rest of the supported formats
23 - all other benefits of codec framework (streaming, codegeneration, etc)
24
25codec is not a drop-in replacement for encoding/xml.
26It is a replacement, based on the simplicity and performance of codec.
27Look at it like JAXB for Go.
28
29Challenges:
30 - Need to output XML preamble, with all namespaces at the right location in the output.
31 - Each "end" block is dynamic, so we need to maintain a context-aware stack
32 - How to decide when to use an attribute VS an element
33 - How to handle chardata, attr, comment EXPLICITLY.
34 - Should it output fragments?
35 e.g. encoding a bool should just output true OR false, which is not well-formed XML.
36
37Extend the struct tag. See representative example:
38 type X struct {
39 ID uint8 `codec:"http://ugorji.net/x-namespace xid id,omitempty,toarray,attr,cdata"`
40 // format: [namespace-uri ][namespace-prefix ]local-name, ...
41 }
42
43Based on this, we encode
44 - fields as elements, BUT
45 encode as attributes if struct tag contains ",attr" and is a scalar (bool, number or string)
46 - text as entity-escaped text, BUT encode as CDATA if struct tag contains ",cdata".
47
48To handle namespaces:
49 - XMLHandle is denoted as being namespace-aware.
50 Consequently, we WILL use the ns:name pair to encode and decode if defined, else use the plain name.
51 - *Encoder and *Decoder know whether the Handle "prefers" namespaces.
52 - add *Encoder.getEncName(*structFieldInfo).
53 No one calls *structFieldInfo.indexForEncName directly anymore
54 - OR better yet: indexForEncName is namespace-aware, and helper.go is all namespace-aware
55 indexForEncName takes a parameter of the form namespace:local-name OR local-name
56 - add *Decoder.getStructFieldInfo(encName string) // encName here is either like abc, or h1:nsabc
57 by being a method on *Decoder, or maybe a method on the Handle itself.
58 No one accesses .encName anymore
59 - let encode.go and decode.go use these (for consistency)
60 - only problem exists for gen.go, where we create a big switch on encName.
61 Now, we also have to add a switch on strings.endsWith(kName, encNsName)
62 - gen.go will need to have many more methods, and then double-on the 2 switch loops like:
63 switch k {
64 case "abc" : x.abc()
65 case "def" : x.def()
66 default {
67 switch {
68 case !nsAware: panic(...)
69 case strings.endsWith(":abc"): x.abc()
70 case strings.endsWith(":def"): x.def()
71 default: panic(...)
72 }
73 }
74 }
75
76The structure below accommodates this:
77
78 type typeInfo struct {
79 sfi []*structFieldInfo // sorted by encName
80 sfins // sorted by namespace
81 sfia // sorted, to have those with attributes at the top. Needed to write XML appropriately.
82 sfip // unsorted
83 }
84 type structFieldInfo struct {
85 encName
86 nsEncName
87 ns string
88 attr bool
89 cdata bool
90 }
91
92indexForEncName is now an internal helper function that takes a sorted array
(one of ti.sfins or ti.sfi). It is only used by *Decoder.getStructFieldInfo(...)
94
95There will be a separate parser from the builder.
96The parser will have a method: next() xmlToken method. It has lookahead support,
97so you can pop multiple tokens, make a determination, and push them back in the order popped.
98This will be needed to determine whether we are "nakedly" decoding a container or not.
99The stack will be implemented using a slice and push/pop happens at the [0] element.
100
101xmlToken has fields:
102 - type uint8: 0 | ElementStart | ElementEnd | AttrKey | AttrVal | Text
103 - value string
104 - ns string
105
106SEE: http://www.xml.com/pub/a/98/10/guide0.html?page=3#ENTDECL
107
108The following are skipped when parsing:
109 - External Entities (from external file)
110 - Notation Declaration e.g. <!NOTATION GIF87A SYSTEM "GIF">
111 - Entity Declarations & References
112 - XML Declaration (assume UTF-8)
113 - XML Directive i.e. <! ... >
114 - Other Declarations: Notation, etc.
115 - Comment
116 - Processing Instruction
117 - schema / DTD for validation:
118 We are not a VALIDATING parser. Validation is done elsewhere.
119 However, some parts of the DTD internal subset are used (SEE BELOW).
120 For Attribute List Declarations e.g.
121 <!ATTLIST foo:oldjoke name ID #REQUIRED label CDATA #IMPLIED status ( funny | notfunny ) 'funny' >
122 We considered using the ATTLIST to get "default" value, but not to validate the contents. (VETOED)
123
124The following XML features are supported
125 - Namespace
126 - Element
127 - Attribute
128 - cdata
129 - Unicode escape
130
131The following DTD (when as an internal sub-set) features are supported:
132 - Internal Entities e.g.
    <!ENTITY burns "ugorji is cool" > AND entities for the set: [<>&"']
134 - Parameter entities e.g.
135 <!ENTITY % personcontent "ugorji is cool"> <!ELEMENT burns (%personcontent;)*>
136
137At decode time, a structure containing the following is kept
138 - namespace mapping
139 - default attribute values
140 - all internal entities (<>&"' and others written in the document)
141
142When decode starts, it parses XML namespace declarations and creates a map in the
143xmlDecDriver. While parsing, that map continuously gets updated.
144The only problem happens when a namespace declaration happens on the node that it defines.
145e.g. <hn:name xmlns:hn="http://www.ugorji.net" >
146To handle this, each Element must be fully parsed at a time,
147even if it amounts to multiple tokens which are returned one at a time on request.
148
149xmlns is a special attribute name.
150 - It is used to define namespaces, including the default
151 - It is never returned as an AttrKey or AttrVal.
152 *We may decide later to allow user to use it e.g. you want to parse the xmlns mappings into a field.*
153
154Number, bool, null, mapKey, etc can all be decoded from any xmlToken.
155This accommodates map[int]string for example.
156
157It should be possible to create a schema from the types,
158or vice versa (generate types from schema with appropriate tags).
159This is however out-of-scope from this parsing project.
160
161We should write all namespace information at the first point that it is referenced in the tree,
162and use the mapping for all child nodes and attributes. This means that state is maintained
163at a point in the tree. This also means that calls to Decode or MustDecode will reset some state.
164
165When decoding, it is important to keep track of entity references and default attribute values.
166It seems these can only be stored in the DTD components. We should honor them when decoding.
167
168Configuration for XMLHandle will look like this:
169
170 XMLHandle
171 DefaultNS string
172 // Encoding:
173 NS map[string]string // ns URI to key, used for encoding
174 // Decoding: in case ENTITY declared in external schema or dtd, store info needed here
175 Entities map[string]string // map of entity rep to character
176
177
178During encode, if a namespace mapping is not defined for a namespace found on a struct,
179then we create a mapping for it using nsN (where N is 1..1000000, and doesn't conflict
180with any other namespace mapping).
181
182Note that different fields in a struct can have different namespaces.
183However, all fields will default to the namespace on the _struct field (if defined).
184
185An XML document is a name, a map of attributes and a list of children.
186Consequently, we cannot "DecodeNaked" into a map[string]interface{} (for example).
187We have to "DecodeNaked" into something that resembles XML data.
188
189To support DecodeNaked (decode into nil interface{}), we have to define some "supporting" types:
190 type Name struct { // Preferred. Less allocations due to conversions.
191 Local string
192 Space string
193 }
194 type Element struct {
195 Name Name
196 Attrs map[Name]string
197 Children []interface{} // each child is either *Element or string
198 }
199Only two "supporting" types are exposed for XML: Name and Element.
200
201// ------------------
202
203We considered 'type Name string' where Name is like "Space Local" (space-separated).
204We decided against it, because each creation of a name would lead to
205double allocation (first convert []byte to string, then concatenate them into a string).
206The benefit is that it is faster to read Attrs from a map. But given that Element is a value
207object, we want to eschew methods and have public exposed variables.
208
209We also considered the following, where xml types were not value objects, and we used
210intelligent accessor methods to extract information and for performance.
211*** WE DECIDED AGAINST THIS. ***
212 type Attr struct {
213 Name Name
214 Value string
215 }
216 // Element is a ValueObject: There are no accessor methods.
217 // Make element self-contained.
218 type Element struct {
219 Name Name
220 attrsMap map[string]string // where key is "Space Local"
221 attrs []Attr
222 childrenT []string
223 childrenE []Element
224 childrenI []int // each child is a index into T or E.
225 }
226 func (x *Element) child(i) interface{} // returns string or *Element
227
228// ------------------
229
230Per XML spec and our default handling, white space is always treated as
231insignificant between elements, except in a text node. The xml:space='preserve'
232attribute is ignored.
233
234**Note: there is no xml: namespace. The xml: attributes were defined before namespaces.**
235**So treat them as just "directives" that should be interpreted to mean something**.
236
237On encoding, we support indenting aka prettifying markup in the same way we support it for json.
238
239A document or element can only be encoded/decoded from/to a struct. In this mode:
240 - struct name maps to element name (or tag-info from _struct field)
241 - fields are mapped to child elements or attributes
242
243A map is either encoded as attributes on current element, or as a set of child elements.
244Maps are encoded as attributes iff their keys and values are primitives (number, bool, string).
245
246A list is encoded as a set of child elements.
247
248Primitives (number, bool, string) are encoded as an element, attribute or text
249depending on the context.
250
251Extensions must encode themselves as a text string.
252
253Encoding is tough, specifically when encoding mappings, because we need to encode
254as either attribute or element. To do this, we need to default to encoding as attributes,
255and then let Encoder inform the Handle when to start encoding as nodes.
256i.e. Encoder does something like:
257
258 h.EncodeMapStart()
259 h.Encode(), h.Encode(), ...
260 h.EncodeMapNotAttrSignal() // this is not a bool, because it's a signal
261 h.Encode(), h.Encode(), ...
262 h.EncodeEnd()
263
264Only XMLHandle understands this, and will set itself to start encoding as elements.
265
266This support extends to maps. For example, if a struct field is a map, and it has
267the struct tag signifying it should be attr, then all its fields are encoded as attributes.
268e.g.
269
270 type X struct {
271 M map[string]int `codec:"m,attr"` // encode keys as attributes named
272 }
273
274Question:
275 - if encoding a map, what if map keys have spaces in them???
276 Then they cannot be attributes or child elements. Error.
277
278Options to consider adding later:
279 - For attribute values, normalize by trimming beginning and ending white space,
280 and converting every white space sequence to a single space.
281 - ATTLIST restrictions are enforced.
282 e.g. default value of xml:space, skipping xml:XYZ style attributes, etc.
283 - Consider supporting NON-STRICT mode (e.g. to handle HTML parsing).
284 Some elements e.g. br, hr, etc need not close and should be auto-closed
285 ... (see http://www.w3.org/TR/html4/loose.dtd)
286 An expansive set of entities are pre-defined.
287 - Have easy way to create a HTML parser:
288 add a HTML() method to XMLHandle, that will set Strict=false, specify AutoClose,
289 and add HTML Entities to the list.
290 - Support validating element/attribute XMLName before writing it.
291 Keep this behind a flag, which is set to false by default (for performance).
292 type XMLHandle struct {
293 CheckName bool
294 }
295
296Misc:
297
ROADMAP (1 week):
299 - build encoder (1 day)
300 - build decoder (based off xmlParser) (1 day)
301 - implement xmlParser (2 days).
302 Look at encoding/xml for inspiration.
  - integrate and TEST (1 day)
304 - write article and post it (1 day)
305
306// ---------- MORE NOTES FROM 2017-11-30 ------------
307
308when parsing
309- parse the attributes first
310- then parse the nodes
311
312basically:
313- if encoding a field: we use the field name for the wrapper
314- if encoding a non-field, then just use the element type name
315
316 map[string]string ==> <map><key>abc</key><value>val</value></map>... or
317 <map key="abc">val</map>... OR
                          <key1>val1</key1><key2>val2</key2>... <- PREFERRED
319 []string ==> <string>v1</string><string>v2</string>...
320 string v1 ==> <string>v1</string>
321 bool true ==> <bool>true</bool>
322 float 1.0 ==> <float>1.0</float>
323 ...
324
325 F1 map[string]string ==> <F1><key>abc</key><value>val</value></F1>... OR
326 <F1 key="abc">val</F1>... OR
                          <F1><abc>val</abc>...</F1> <- PREFERRED
328 F2 []string ==> <F2>v1</F2><F2>v2</F2>...
329 F3 bool ==> <F3>true</F3>
330 ...
331
332- a scalar is encoded as:
333 (value) of type T ==> <T><value/></T>
334 (value) of field F ==> <F><value/></F>
335- A kv-pair is encoded as:
336 (key,value) ==> <map><key><value/></key></map> OR <map key="value">
337 (key,value) of field F ==> <F><key><value/></key></F> OR <F key="value">
338- A map or struct is just a list of kv-pairs
339- A list is encoded as sequences of same node e.g.
340 <F1 key1="value11">
341 <F1 key2="value12">
342 <F2>value21</F2>
343 <F2>value22</F2>
- we may have to singularize the field name when encoding into xml,
  and pluralize it again when decoding.
346- bi-directional encode->decode->encode is not a MUST.
347 even encoding/xml cannot decode correctly what was encoded:
348
349 see https://play.golang.org/p/224V_nyhMS
350 func main() {
351 fmt.Println("Hello, playground")
352 v := []interface{}{"hello", 1, true, nil, time.Now()}
353 s, err := xml.Marshal(v)
354 fmt.Printf("err: %v, \ns: %s\n", err, s)
355 var v2 []interface{}
356 err = xml.Unmarshal(s, &v2)
357 fmt.Printf("err: %v, \nv2: %v\n", err, v2)
358 type T struct {
359 V []interface{}
360 }
361 v3 := T{V: v}
362 s, err = xml.Marshal(v3)
363 fmt.Printf("err: %v, \ns: %s\n", err, s)
364 var v4 T
365 err = xml.Unmarshal(s, &v4)
366 fmt.Printf("err: %v, \nv4: %v\n", err, v4)
367 }
368 Output:
369 err: <nil>,
370 s: <string>hello</string><int>1</int><bool>true</bool><Time>2009-11-10T23:00:00Z</Time>
371 err: <nil>,
372 v2: [<nil>]
373 err: <nil>,
374 s: <T><V>hello</V><V>1</V><V>true</V><V>2009-11-10T23:00:00Z</V></T>
375 err: <nil>,
376 v4: {[<nil> <nil> <nil> <nil>]}
377-
378*/
379
380// ----------- PARSER -------------------
381
// xmlTokenType identifies the kind of an xmlToken.
type xmlTokenType uint8

// Token kinds. The zero value is reserved (unused); the named kinds take the
// successive even values 2, 4, 6, 8 and 10.
//
// NOTE(review): these values are not distinct bit flags. If flags were
// intended, the initializer would need to be 1 << iota — confirm.
const (
	_                 xmlTokenType = 2 * iota // 0: reserved
	xmlTokenElemStart                         // 2
	xmlTokenElemEnd                           // 4
	xmlTokenAttrKey                           // 6
	xmlTokenAttrVal                           // 8
	xmlTokenText                              // 10
)
392
393type xmlToken struct {
394 Type xmlTokenType
395 Value string
396 Namespace string // blank for AttrVal and Text
397}
398
399type xmlParser struct {
400 r decReader
401 toks []xmlToken // list of tokens.
402 ptr int // ptr into the toks slice
403 done bool // nothing else to parse. r now returns EOF.
404}
405
406func (x *xmlParser) next() (t *xmlToken) {
407 // once x.done, or x.ptr == len(x.toks) == 0, then return nil (to signify finish)
408 if !x.done && len(x.toks) == 0 {
409 x.nextTag()
410 }
411 // parses one element at a time (into possible many tokens)
412 if x.ptr < len(x.toks) {
413 t = &(x.toks[x.ptr])
414 x.ptr++
415 if x.ptr == len(x.toks) {
416 x.ptr = 0
417 x.toks = x.toks[:0]
418 }
419 }
420 return
421}
422
423// nextTag will parses the next element and fill up toks.
424// It set done flag if/once EOF is reached.
425func (x *xmlParser) nextTag() {
426 // TODO: implement.
427}
428
429// ----------- ENCODER -------------------
430
431type xmlEncDriver struct {
432 e *Encoder
433 w encWriter
434 h *XMLHandle
435 b [64]byte // scratch
436 bs []byte // scratch
437 // s jsonStack
438 noBuiltInTypes
439}
440
441// ----------- DECODER -------------------
442
443type xmlDecDriver struct {
444 d *Decoder
445 h *XMLHandle
446 r decReader // *bytesDecReader decReader
447 ct valueType // container type. one of unset, array or map.
448 bstr [8]byte // scratch used for string \UXXX parsing
449 b [64]byte // scratch
450
451 // wsSkipped bool // whitespace skipped
452
453 // s jsonStack
454
455 noBuiltInTypes
456}
457
458// DecodeNaked will decode into an XMLNode
459
// XMLName is a value object representing a namespace-aware NAME.
type XMLName struct {
	// Local is the local part of the name.
	Local string
	// Space is the namespace the name belongs to.
	Space string
}

// XMLNode represents a "union" of the different types of XML nodes.
// Only one of the fields (Element or Text) is set.
type XMLNode struct {
	// Element is set when the node is an element.
	Element *XMLElement
	// Text is set when the node is text.
	Text string
}

// XMLElement is a value object representing a fully-parsed XML element.
type XMLElement struct {
	// Name is the namespace-aware name of the element.
	Name XMLName
	// Attrs maps each attribute name to its value.
	Attrs map[XMLName]string
	// Children is the list of child nodes, each holding either an
	// *XMLElement or text.
	Children []XMLNode
}
480
481// ----------- HANDLE -------------------
482
483type XMLHandle struct {
484 BasicHandle
485 textEncodingType
486
487 DefaultNS string
488 NS map[string]string // ns URI to key, for encoding
489 Entities map[string]string // entity representation to string, for encoding.
490}
491
492func (h *XMLHandle) newEncDriver(e *Encoder) encDriver {
493 return &xmlEncDriver{e: e, w: e.w, h: h}
494}
495
496func (h *XMLHandle) newDecDriver(d *Decoder) decDriver {
497 // d := xmlDecDriver{r: r.(*bytesDecReader), h: h}
498 hd := xmlDecDriver{d: d, r: d.r, h: h}
499 hd.n.bytes = d.b[:]
500 return &hd
501}
502
503func (h *XMLHandle) SetInterfaceExt(rt reflect.Type, tag uint64, ext InterfaceExt) (err error) {
504 return h.SetExt(rt, tag, &extWrapper{bytesExtFailer{}, ext})
505}
506
507var _ decDriver = (*xmlDecDriver)(nil)
508var _ encDriver = (*xmlEncDriver)(nil)