sslobodr | d046be8 | 2019-01-16 10:02:22 -0500 | [diff] [blame] | 1 | // Copyright (c) 2012-2018 Ugorji Nwoke. All rights reserved. |
| 2 | // Use of this source code is governed by a MIT license found in the LICENSE file. |
| 3 | |
| 4 | // +build ignore |
| 5 | |
| 6 | package codec |
| 7 | |
| 8 | import "reflect" |
| 9 | |
| 10 | /* |
| 11 | |
| 12 | A strict Non-validating namespace-aware XML 1.0 parser and (en|de)coder. |
| 13 | |
| 14 | We are attempting this due to perceived issues with encoding/xml: |
| 15 | - Complicated. It tried to do too much, and is not as simple to use as json. |
| 16 | - Due to over-engineering, reflection is over-used AND performance suffers: |
| 17 | java is 6X faster:http://fabsk.eu/blog/category/informatique/dev/golang/ |
| 18 | even PYTHON performs better: http://outgoing.typepad.com/outgoing/2014/07/exploring-golang.html |
| 19 | |
| 20 | codec framework will offer the following benefits |
| 21 | - VASTLY improved performance (when using reflection-mode or codecgen) |
| 22 | - simplicity and consistency: with the rest of the supported formats |
| 23 | - all other benefits of codec framework (streaming, codegeneration, etc) |
| 24 | |
| 25 | codec is not a drop-in replacement for encoding/xml. |
| 26 | It is a replacement, based on the simplicity and performance of codec. |
| 27 | Look at it like JAXB for Go. |
| 28 | |
| 29 | Challenges: |
| 30 | - Need to output XML preamble, with all namespaces at the right location in the output. |
| 31 | - Each "end" block is dynamic, so we need to maintain a context-aware stack |
| 32 | - How to decide when to use an attribute VS an element |
| 33 | - How to handle chardata, attr, comment EXPLICITLY. |
| 34 | - Should it output fragments? |
| 35 | e.g. encoding a bool should just output true OR false, which is not well-formed XML. |
| 36 | |
| 37 | Extend the struct tag. See representative example: |
| 38 | type X struct { |
| 39 | ID uint8 `codec:"http://ugorji.net/x-namespace xid id,omitempty,toarray,attr,cdata"` |
| 40 | // format: [namespace-uri ][namespace-prefix ]local-name, ... |
| 41 | } |
| 42 | |
| 43 | Based on this, we encode |
| 44 | - fields as elements, BUT |
| 45 | encode as attributes if struct tag contains ",attr" and is a scalar (bool, number or string) |
| 46 | - text as entity-escaped text, BUT encode as CDATA if struct tag contains ",cdata". |
| 47 | |
| 48 | To handle namespaces: |
| 49 | - XMLHandle is denoted as being namespace-aware. |
| 50 | Consequently, we WILL use the ns:name pair to encode and decode if defined, else use the plain name. |
| 51 | - *Encoder and *Decoder know whether the Handle "prefers" namespaces. |
| 52 | - add *Encoder.getEncName(*structFieldInfo). |
| 53 | No one calls *structFieldInfo.indexForEncName directly anymore |
| 54 | - OR better yet: indexForEncName is namespace-aware, and helper.go is all namespace-aware |
| 55 | indexForEncName takes a parameter of the form namespace:local-name OR local-name |
| 56 | - add *Decoder.getStructFieldInfo(encName string) // encName here is either like abc, or h1:nsabc |
| 57 | by being a method on *Decoder, or maybe a method on the Handle itself. |
| 58 | No one accesses .encName anymore |
| 59 | - let encode.go and decode.go use these (for consistency) |
| 60 | - only problem exists for gen.go, where we create a big switch on encName. |
| 61 | Now, we also have to add a switch on strings.endsWith(kName, encNsName) |
| 62 | - gen.go will need to have many more methods, and then double-on the 2 switch loops like: |
| 63 | switch k { |
| 64 | case "abc" : x.abc() |
| 65 | case "def" : x.def() |
| 66 | default { |
| 67 | switch { |
| 68 | case !nsAware: panic(...) |
| 69 | case strings.endsWith(":abc"): x.abc() |
| 70 | case strings.endsWith(":def"): x.def() |
| 71 | default: panic(...) |
| 72 | } |
| 73 | } |
| 74 | } |
| 75 | |
| 76 | The structure below accommodates this: |
| 77 | |
| 78 | type typeInfo struct { |
| 79 | sfi []*structFieldInfo // sorted by encName |
| 80 | sfins // sorted by namespace |
| 81 | sfia // sorted, to have those with attributes at the top. Needed to write XML appropriately. |
| 82 | sfip // unsorted |
| 83 | } |
| 84 | type structFieldInfo struct { |
| 85 | encName |
| 86 | nsEncName |
| 87 | ns string |
| 88 | attr bool |
| 89 | cdata bool |
| 90 | } |
| 91 | |
| 92 | indexForEncName is now an internal helper function that takes a sorted array |
| 93 | (one of ti.sfins or ti.sfi). It is only used by *Encoder.getStructFieldInfo(...) |
| 94 | |
| 95 | There will be a separate parser from the builder. |
| 96 | The parser will have a method: next() xmlToken method. It has lookahead support, |
| 97 | so you can pop multiple tokens, make a determination, and push them back in the order popped. |
| 98 | This will be needed to determine whether we are "nakedly" decoding a container or not. |
| 99 | The stack will be implemented using a slice and push/pop happens at the [0] element. |
| 100 | |
| 101 | xmlToken has fields: |
| 102 | - type uint8: 0 | ElementStart | ElementEnd | AttrKey | AttrVal | Text |
| 103 | - value string |
| 104 | - ns string |
| 105 | |
| 106 | SEE: http://www.xml.com/pub/a/98/10/guide0.html?page=3#ENTDECL |
| 107 | |
| 108 | The following are skipped when parsing: |
| 109 | - External Entities (from external file) |
| 110 | - Notation Declaration e.g. <!NOTATION GIF87A SYSTEM "GIF"> |
| 111 | - Entity Declarations & References |
| 112 | - XML Declaration (assume UTF-8) |
| 113 | - XML Directive i.e. <! ... > |
| 114 | - Other Declarations: Notation, etc. |
| 115 | - Comment |
| 116 | - Processing Instruction |
| 117 | - schema / DTD for validation: |
| 118 | We are not a VALIDATING parser. Validation is done elsewhere. |
| 119 | However, some parts of the DTD internal subset are used (SEE BELOW). |
| 120 | For Attribute List Declarations e.g. |
| 121 | <!ATTLIST foo:oldjoke name ID #REQUIRED label CDATA #IMPLIED status ( funny | notfunny ) 'funny' > |
| 122 | We considered using the ATTLIST to get "default" value, but not to validate the contents. (VETOED) |
| 123 | |
| 124 | The following XML features are supported |
| 125 | - Namespace |
| 126 | - Element |
| 127 | - Attribute |
| 128 | - cdata |
| 129 | - Unicode escape |
| 130 | |
| 131 | The following DTD (when as an internal sub-set) features are supported: |
| 132 | - Internal Entities e.g. |
       <!ENTITY burns "ugorji is cool" > AND entities for the set: [<>&"']
| 134 | - Parameter entities e.g. |
| 135 | <!ENTITY % personcontent "ugorji is cool"> <!ELEMENT burns (%personcontent;)*> |
| 136 | |
| 137 | At decode time, a structure containing the following is kept |
| 138 | - namespace mapping |
| 139 | - default attribute values |
| 140 | - all internal entities (<>&"' and others written in the document) |
| 141 | |
| 142 | When decode starts, it parses XML namespace declarations and creates a map in the |
| 143 | xmlDecDriver. While parsing, that map continuously gets updated. |
| 144 | The only problem happens when a namespace declaration happens on the node that it defines. |
| 145 | e.g. <hn:name xmlns:hn="http://www.ugorji.net" > |
| 146 | To handle this, each Element must be fully parsed at a time, |
| 147 | even if it amounts to multiple tokens which are returned one at a time on request. |
| 148 | |
| 149 | xmlns is a special attribute name. |
| 150 | - It is used to define namespaces, including the default |
| 151 | - It is never returned as an AttrKey or AttrVal. |
| 152 | *We may decide later to allow user to use it e.g. you want to parse the xmlns mappings into a field.* |
| 153 | |
| 154 | Number, bool, null, mapKey, etc can all be decoded from any xmlToken. |
| 155 | This accommodates map[int]string for example. |
| 156 | |
| 157 | It should be possible to create a schema from the types, |
| 158 | or vice versa (generate types from schema with appropriate tags). |
| 159 | This is however out-of-scope from this parsing project. |
| 160 | |
| 161 | We should write all namespace information at the first point that it is referenced in the tree, |
| 162 | and use the mapping for all child nodes and attributes. This means that state is maintained |
| 163 | at a point in the tree. This also means that calls to Decode or MustDecode will reset some state. |
| 164 | |
| 165 | When decoding, it is important to keep track of entity references and default attribute values. |
| 166 | It seems these can only be stored in the DTD components. We should honor them when decoding. |
| 167 | |
| 168 | Configuration for XMLHandle will look like this: |
| 169 | |
| 170 | XMLHandle |
| 171 | DefaultNS string |
| 172 | // Encoding: |
| 173 | NS map[string]string // ns URI to key, used for encoding |
| 174 | // Decoding: in case ENTITY declared in external schema or dtd, store info needed here |
| 175 | Entities map[string]string // map of entity rep to character |
| 176 | |
| 177 | |
| 178 | During encode, if a namespace mapping is not defined for a namespace found on a struct, |
| 179 | then we create a mapping for it using nsN (where N is 1..1000000, and doesn't conflict |
| 180 | with any other namespace mapping). |
| 181 | |
| 182 | Note that different fields in a struct can have different namespaces. |
| 183 | However, all fields will default to the namespace on the _struct field (if defined). |
| 184 | |
| 185 | An XML document is a name, a map of attributes and a list of children. |
| 186 | Consequently, we cannot "DecodeNaked" into a map[string]interface{} (for example). |
| 187 | We have to "DecodeNaked" into something that resembles XML data. |
| 188 | |
| 189 | To support DecodeNaked (decode into nil interface{}), we have to define some "supporting" types: |
| 190 | type Name struct { // Preferred. Less allocations due to conversions. |
| 191 | Local string |
| 192 | Space string |
| 193 | } |
| 194 | type Element struct { |
| 195 | Name Name |
| 196 | Attrs map[Name]string |
| 197 | Children []interface{} // each child is either *Element or string |
| 198 | } |
| 199 | Only two "supporting" types are exposed for XML: Name and Element. |
| 200 | |
| 201 | // ------------------ |
| 202 | |
| 203 | We considered 'type Name string' where Name is like "Space Local" (space-separated). |
| 204 | We decided against it, because each creation of a name would lead to |
| 205 | double allocation (first convert []byte to string, then concatenate them into a string). |
| 206 | The benefit is that it is faster to read Attrs from a map. But given that Element is a value |
| 207 | object, we want to eschew methods and have public exposed variables. |
| 208 | |
| 209 | We also considered the following, where xml types were not value objects, and we used |
| 210 | intelligent accessor methods to extract information and for performance. |
| 211 | *** WE DECIDED AGAINST THIS. *** |
| 212 | type Attr struct { |
| 213 | Name Name |
| 214 | Value string |
| 215 | } |
| 216 | // Element is a ValueObject: There are no accessor methods. |
| 217 | // Make element self-contained. |
| 218 | type Element struct { |
| 219 | Name Name |
| 220 | attrsMap map[string]string // where key is "Space Local" |
| 221 | attrs []Attr |
| 222 | childrenT []string |
| 223 | childrenE []Element |
| 224 | childrenI []int // each child is a index into T or E. |
| 225 | } |
| 226 | func (x *Element) child(i) interface{} // returns string or *Element |
| 227 | |
| 228 | // ------------------ |
| 229 | |
| 230 | Per XML spec and our default handling, white space is always treated as |
| 231 | insignificant between elements, except in a text node. The xml:space='preserve' |
| 232 | attribute is ignored. |
| 233 | |
| 234 | **Note: there is no xml: namespace. The xml: attributes were defined before namespaces.** |
| 235 | **So treat them as just "directives" that should be interpreted to mean something**. |
| 236 | |
| 237 | On encoding, we support indenting aka prettifying markup in the same way we support it for json. |
| 238 | |
| 239 | A document or element can only be encoded/decoded from/to a struct. In this mode: |
| 240 | - struct name maps to element name (or tag-info from _struct field) |
| 241 | - fields are mapped to child elements or attributes |
| 242 | |
| 243 | A map is either encoded as attributes on current element, or as a set of child elements. |
| 244 | Maps are encoded as attributes iff their keys and values are primitives (number, bool, string). |
| 245 | |
| 246 | A list is encoded as a set of child elements. |
| 247 | |
| 248 | Primitives (number, bool, string) are encoded as an element, attribute or text |
| 249 | depending on the context. |
| 250 | |
| 251 | Extensions must encode themselves as a text string. |
| 252 | |
| 253 | Encoding is tough, specifically when encoding mappings, because we need to encode |
| 254 | as either attribute or element. To do this, we need to default to encoding as attributes, |
| 255 | and then let Encoder inform the Handle when to start encoding as nodes. |
| 256 | i.e. Encoder does something like: |
| 257 | |
| 258 | h.EncodeMapStart() |
| 259 | h.Encode(), h.Encode(), ... |
| 260 | h.EncodeMapNotAttrSignal() // this is not a bool, because it's a signal |
| 261 | h.Encode(), h.Encode(), ... |
| 262 | h.EncodeEnd() |
| 263 | |
| 264 | Only XMLHandle understands this, and will set itself to start encoding as elements. |
| 265 | |
| 266 | This support extends to maps. For example, if a struct field is a map, and it has |
| 267 | the struct tag signifying it should be attr, then all its fields are encoded as attributes. |
| 268 | e.g. |
| 269 | |
| 270 | type X struct { |
| 271 | M map[string]int `codec:"m,attr"` // encode keys as attributes named |
| 272 | } |
| 273 | |
| 274 | Question: |
| 275 | - if encoding a map, what if map keys have spaces in them??? |
| 276 | Then they cannot be attributes or child elements. Error. |
| 277 | |
| 278 | Options to consider adding later: |
| 279 | - For attribute values, normalize by trimming beginning and ending white space, |
| 280 | and converting every white space sequence to a single space. |
| 281 | - ATTLIST restrictions are enforced. |
| 282 | e.g. default value of xml:space, skipping xml:XYZ style attributes, etc. |
| 283 | - Consider supporting NON-STRICT mode (e.g. to handle HTML parsing). |
| 284 | Some elements e.g. br, hr, etc need not close and should be auto-closed |
| 285 | ... (see http://www.w3.org/TR/html4/loose.dtd) |
| 286 | An expansive set of entities are pre-defined. |
| 287 | - Have easy way to create a HTML parser: |
| 288 | add a HTML() method to XMLHandle, that will set Strict=false, specify AutoClose, |
| 289 | and add HTML Entities to the list. |
| 290 | - Support validating element/attribute XMLName before writing it. |
| 291 | Keep this behind a flag, which is set to false by default (for performance). |
| 292 | type XMLHandle struct { |
| 293 | CheckName bool |
| 294 | } |
| 295 | |
| 296 | Misc: |
| 297 | |
ROADMAP (1 week):
| 299 | - build encoder (1 day) |
| 300 | - build decoder (based off xmlParser) (1 day) |
| 301 | - implement xmlParser (2 days). |
| 302 | Look at encoding/xml for inspiration. |
  - integrate and TEST (1 day)
| 304 | - write article and post it (1 day) |
| 305 | |
| 306 | // ---------- MORE NOTES FROM 2017-11-30 ------------ |
| 307 | |
| 308 | when parsing |
| 309 | - parse the attributes first |
| 310 | - then parse the nodes |
| 311 | |
| 312 | basically: |
| 313 | - if encoding a field: we use the field name for the wrapper |
| 314 | - if encoding a non-field, then just use the element type name |
| 315 | |
| 316 | map[string]string ==> <map><key>abc</key><value>val</value></map>... or |
| 317 | <map key="abc">val</map>... OR |
                          <key1>val1</key1><key2>val2</key2>...                <- PREFERRED
| 319 | []string ==> <string>v1</string><string>v2</string>... |
| 320 | string v1 ==> <string>v1</string> |
| 321 | bool true ==> <bool>true</bool> |
| 322 | float 1.0 ==> <float>1.0</float> |
| 323 | ... |
| 324 | |
| 325 | F1 map[string]string ==> <F1><key>abc</key><value>val</value></F1>... OR |
| 326 | <F1 key="abc">val</F1>... OR |
                          <F1><abc>val</abc>...</F1>                        <- PREFERRED
| 328 | F2 []string ==> <F2>v1</F2><F2>v2</F2>... |
| 329 | F3 bool ==> <F3>true</F3> |
| 330 | ... |
| 331 | |
| 332 | - a scalar is encoded as: |
| 333 | (value) of type T ==> <T><value/></T> |
| 334 | (value) of field F ==> <F><value/></F> |
| 335 | - A kv-pair is encoded as: |
| 336 | (key,value) ==> <map><key><value/></key></map> OR <map key="value"> |
| 337 | (key,value) of field F ==> <F><key><value/></key></F> OR <F key="value"> |
| 338 | - A map or struct is just a list of kv-pairs |
| 339 | - A list is encoded as sequences of same node e.g. |
| 340 | <F1 key1="value11"> |
| 341 | <F1 key2="value12"> |
| 342 | <F2>value21</F2> |
| 343 | <F2>value22</F2> |
  - we may have to singularize the field name when encoding into xml,
    and pluralize it again when decoding.
| 346 | - bi-directional encode->decode->encode is not a MUST. |
| 347 | even encoding/xml cannot decode correctly what was encoded: |
| 348 | |
| 349 | see https://play.golang.org/p/224V_nyhMS |
| 350 | func main() { |
| 351 | fmt.Println("Hello, playground") |
| 352 | v := []interface{}{"hello", 1, true, nil, time.Now()} |
| 353 | s, err := xml.Marshal(v) |
| 354 | fmt.Printf("err: %v, \ns: %s\n", err, s) |
| 355 | var v2 []interface{} |
| 356 | err = xml.Unmarshal(s, &v2) |
| 357 | fmt.Printf("err: %v, \nv2: %v\n", err, v2) |
| 358 | type T struct { |
| 359 | V []interface{} |
| 360 | } |
| 361 | v3 := T{V: v} |
| 362 | s, err = xml.Marshal(v3) |
| 363 | fmt.Printf("err: %v, \ns: %s\n", err, s) |
| 364 | var v4 T |
| 365 | err = xml.Unmarshal(s, &v4) |
| 366 | fmt.Printf("err: %v, \nv4: %v\n", err, v4) |
| 367 | } |
| 368 | Output: |
| 369 | err: <nil>, |
| 370 | s: <string>hello</string><int>1</int><bool>true</bool><Time>2009-11-10T23:00:00Z</Time> |
| 371 | err: <nil>, |
| 372 | v2: [<nil>] |
| 373 | err: <nil>, |
| 374 | s: <T><V>hello</V><V>1</V><V>true</V><V>2009-11-10T23:00:00Z</V></T> |
| 375 | err: <nil>, |
| 376 | v4: {[<nil> <nil> <nil> <nil>]} |
| 377 | - |
| 378 | */ |
| 379 | |
| 380 | // ----------- PARSER ------------------- |
| 381 | |
// xmlTokenType identifies the kind of a token emitted by the XML parser.
type xmlTokenType uint8

// Token kinds. The values are consecutive even numbers (2, 4, 6, 8, 10);
// the zero value is reserved and does not name a token kind.
const (
	_                 xmlTokenType = 2 * iota // 0: reserved
	xmlTokenElemStart                         // 2: start of an element
	xmlTokenElemEnd                           // 4: end of an element
	xmlTokenAttrKey                           // 6: attribute key
	xmlTokenAttrVal                           // 8: attribute value
	xmlTokenText                              // 10: text
)
| 392 | |
// xmlToken is a single token handed out by xmlParser.
type xmlToken struct {
	Type      xmlTokenType // kind of token (one of the xmlToken* constants)
	Value     string       // the token's string value
	Namespace string       // blank for AttrVal and Text
}
| 398 | |
// xmlParser parses one element at a time into a buffer of tokens (toks)
// and hands those tokens out one by one through next.
type xmlParser struct {
	r    decReader  // input reader
	toks []xmlToken // list of tokens.
	ptr  int        // ptr into the toks slice
	done bool       // nothing else to parse. r now returns EOF.
}
| 405 | |
| 406 | func (x *xmlParser) next() (t *xmlToken) { |
| 407 | // once x.done, or x.ptr == len(x.toks) == 0, then return nil (to signify finish) |
| 408 | if !x.done && len(x.toks) == 0 { |
| 409 | x.nextTag() |
| 410 | } |
| 411 | // parses one element at a time (into possible many tokens) |
| 412 | if x.ptr < len(x.toks) { |
| 413 | t = &(x.toks[x.ptr]) |
| 414 | x.ptr++ |
| 415 | if x.ptr == len(x.toks) { |
| 416 | x.ptr = 0 |
| 417 | x.toks = x.toks[:0] |
| 418 | } |
| 419 | } |
| 420 | return |
| 421 | } |
| 422 | |
// nextTag parses the next element and fills up toks.
// It sets the done flag if/once EOF is reached.
//
// NOTE: not implemented yet; the body is currently empty.
func (x *xmlParser) nextTag() {
	// TODO: implement.
}
| 428 | |
| 429 | // ----------- ENCODER ------------------- |
| 430 | |
// xmlEncDriver is the encoding driver created by XMLHandle.newEncDriver.
type xmlEncDriver struct {
	e  *Encoder   // the Encoder this driver was created for
	w  encWriter  // output writer (taken from the Encoder)
	h  *XMLHandle // the handle that created this driver
	b  [64]byte   // scratch
	bs []byte     // scratch
	// s jsonStack
	noBuiltInTypes
}
| 440 | |
| 441 | // ----------- DECODER ------------------- |
| 442 | |
// xmlDecDriver is the decoding driver created by XMLHandle.newDecDriver.
type xmlDecDriver struct {
	d    *Decoder   // the Decoder this driver was created for
	h    *XMLHandle // the handle that created this driver
	r    decReader  // *bytesDecReader decReader
	ct   valueType  // container type. one of unset, array or map.
	bstr [8]byte    // scratch used for string \UXXX parsing
	b    [64]byte   // scratch

	// wsSkipped bool // whitespace skipped

	// s jsonStack

	noBuiltInTypes
}
| 457 | |
| 458 | // DecodeNaked will decode into an XMLNode |
| 459 | |
// XMLName is a value object representing a namespace-aware NAME.
type XMLName struct {
	Local string // local part of the name
	Space string // namespace part of the name
}
| 465 | |
| 466 | // XMLNode represents a "union" of the different types of XML Nodes. |
| 467 | // Only one of fields (Text or *Element) is set. |
| 468 | type XMLNode struct { |
| 469 | Element *Element |
| 470 | Text string |
| 471 | } |
| 472 | |
| 473 | // XMLElement is a value object representing an fully-parsed XML element. |
| 474 | type XMLElement struct { |
| 475 | Name Name |
| 476 | Attrs map[XMLName]string |
| 477 | // Children is a list of child nodes, each being a *XMLElement or string |
| 478 | Children []XMLNode |
| 479 | } |
| 480 | |
| 481 | // ----------- HANDLE ------------------- |
| 482 | |
// XMLHandle holds the configuration for encoding and decoding XML,
// and creates the XML encode/decode drivers.
type XMLHandle struct {
	BasicHandle
	textEncodingType

	// DefaultNS is the default namespace.
	// NOTE(review): how it is applied is not visible in this file - confirm.
	DefaultNS string
	// NS maps a namespace URI to its key, for encoding.
	NS map[string]string // ns URI to key, for encoding
	// Entities maps an entity representation to its string.
	// NOTE(review): the design notes at the top of this file describe Entities
	// as decode-side information, while the trailing comment says "for encoding" - confirm.
	Entities map[string]string // entity representation to string, for encoding.
}
| 491 | |
| 492 | func (h *XMLHandle) newEncDriver(e *Encoder) encDriver { |
| 493 | return &xmlEncDriver{e: e, w: e.w, h: h} |
| 494 | } |
| 495 | |
| 496 | func (h *XMLHandle) newDecDriver(d *Decoder) decDriver { |
| 497 | // d := xmlDecDriver{r: r.(*bytesDecReader), h: h} |
| 498 | hd := xmlDecDriver{d: d, r: d.r, h: h} |
| 499 | hd.n.bytes = d.b[:] |
| 500 | return &hd |
| 501 | } |
| 502 | |
| 503 | func (h *XMLHandle) SetInterfaceExt(rt reflect.Type, tag uint64, ext InterfaceExt) (err error) { |
| 504 | return h.SetExt(rt, tag, &extWrapper{bytesExtFailer{}, ext}) |
| 505 | } |
| 506 | |
| 507 | var _ decDriver = (*xmlDecDriver)(nil) |
| 508 | var _ encDriver = (*xmlEncDriver)(nil) |