David K. Bainbridge | 215e024 | 2017-09-05 23:18:24 -0700 | [diff] [blame] | 1 | // Copyright 2009 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // Package xml implements a simple XML 1.0 parser that |
| 6 | // understands XML name spaces. |
| 7 | package xml |
| 8 | |
| 9 | // References: |
| 10 | // Annotated XML spec: http://www.xml.com/axml/testaxml.htm |
| 11 | // XML name spaces: http://www.w3.org/TR/REC-xml-names/ |
| 12 | |
| 13 | // TODO(rsc): |
| 14 | // Test error handling. |
| 15 | |
| 16 | import ( |
| 17 | "bufio" |
| 18 | "bytes" |
| 19 | "errors" |
| 20 | "fmt" |
| 21 | "io" |
| 22 | "strconv" |
| 23 | "strings" |
| 24 | "unicode" |
| 25 | "unicode/utf8" |
| 26 | ) |
| 27 | |
// A SyntaxError represents a syntax error in the XML input stream.
type SyntaxError struct {
	// Msg describes what was wrong with the input.
	Msg string
	// Line is the input line number at which the error was detected.
	Line int
}
| 33 | |
| 34 | func (e *SyntaxError) Error() string { |
| 35 | return "XML syntax error on line " + strconv.Itoa(e.Line) + ": " + e.Msg |
| 36 | } |
| 37 | |
// A Name represents an XML name (Local) annotated with a name space
// identifier (Space). In tokens returned by Decoder.Token, the Space
// identifier is given as a canonical URL, not the short prefix used in
// the document being parsed.
//
// As a special case, XML namespace declarations will use the literal
// string "xmlns" for the Space field instead of the fully resolved URL.
// See Encoder.EncodeToken for more information on namespace encoding
// behaviour.
type Name struct {
	Space, Local string
}
| 50 | |
| 51 | // isNamespace reports whether the name is a namespace-defining name. |
| 52 | func (name Name) isNamespace() bool { |
| 53 | return name.Local == "xmlns" || name.Space == "xmlns" |
| 54 | } |
| 55 | |
// An Attr represents an attribute in an XML element (Name=Value).
type Attr struct {
	// Name is the attribute name.
	Name Name
	// Value is the attribute value as a string.
	Value string
}
| 61 | |
// A Token is an interface holding one of the token types:
// StartElement, EndElement, CharData, Comment, ProcInst, or Directive.
type Token interface{}
| 65 | |
// A StartElement represents an XML start element.
type StartElement struct {
	// Name is the element name.
	Name Name
	// Attr holds the element's attributes.
	Attr []Attr
}
| 71 | |
| 72 | func (e StartElement) Copy() StartElement { |
| 73 | attrs := make([]Attr, len(e.Attr)) |
| 74 | copy(attrs, e.Attr) |
| 75 | e.Attr = attrs |
| 76 | return e |
| 77 | } |
| 78 | |
| 79 | // End returns the corresponding XML end element. |
| 80 | func (e StartElement) End() EndElement { |
| 81 | return EndElement{e.Name} |
| 82 | } |
| 83 | |
| 84 | // setDefaultNamespace sets the namespace of the element |
| 85 | // as the default for all elements contained within it. |
| 86 | func (e *StartElement) setDefaultNamespace() { |
| 87 | if e.Name.Space == "" { |
| 88 | // If there's no namespace on the element, don't |
| 89 | // set the default. Strictly speaking this might be wrong, as |
| 90 | // we can't tell if the element had no namespace set |
| 91 | // or was just using the default namespace. |
| 92 | return |
| 93 | } |
| 94 | // Don't add a default name space if there's already one set. |
| 95 | for _, attr := range e.Attr { |
| 96 | if attr.Name.Space == "" && attr.Name.Local == "xmlns" { |
| 97 | return |
| 98 | } |
| 99 | } |
| 100 | e.Attr = append(e.Attr, Attr{ |
| 101 | Name: Name{ |
| 102 | Local: "xmlns", |
| 103 | }, |
| 104 | Value: e.Name.Space, |
| 105 | }) |
| 106 | } |
| 107 | |
// An EndElement represents an XML end element.
type EndElement struct {
	// Name is the name of the element being closed.
	Name Name
}
| 112 | |
// A CharData represents XML character data (raw text),
// in which XML escape sequences have been replaced by
// the characters they represent.
type CharData []byte
| 117 | |
// makeCopy returns a newly allocated slice holding the same bytes as b.
// The result is never nil, even when b is nil or empty.
func makeCopy(b []byte) []byte {
	dup := make([]byte, 0, len(b))
	return append(dup, b...)
}
| 123 | |
| 124 | func (c CharData) Copy() CharData { return CharData(makeCopy(c)) } |
| 125 | |
// A Comment represents an XML comment of the form <!--comment-->.
// The bytes do not include the <!-- and --> comment markers.
type Comment []byte
| 129 | |
| 130 | func (c Comment) Copy() Comment { return Comment(makeCopy(c)) } |
| 131 | |
// A ProcInst represents an XML processing instruction of the form <?target inst?>.
type ProcInst struct {
	// Target is the name that follows "<?".
	Target string
	// Inst holds the instruction bytes that follow the target, without
	// the closing "?>".
	Inst []byte
}
| 137 | |
| 138 | func (p ProcInst) Copy() ProcInst { |
| 139 | p.Inst = makeCopy(p.Inst) |
| 140 | return p |
| 141 | } |
| 142 | |
// A Directive represents an XML directive of the form <!text>.
// The bytes do not include the <! and > markers.
type Directive []byte
| 146 | |
| 147 | func (d Directive) Copy() Directive { return Directive(makeCopy(d)) } |
| 148 | |
| 149 | // CopyToken returns a copy of a Token. |
| 150 | func CopyToken(t Token) Token { |
| 151 | switch v := t.(type) { |
| 152 | case CharData: |
| 153 | return v.Copy() |
| 154 | case Comment: |
| 155 | return v.Copy() |
| 156 | case Directive: |
| 157 | return v.Copy() |
| 158 | case ProcInst: |
| 159 | return v.Copy() |
| 160 | case StartElement: |
| 161 | return v.Copy() |
| 162 | } |
| 163 | return t |
| 164 | } |
| 165 | |
// A Decoder represents an XML parser reading a particular input stream.
// The parser assumes that its input is encoded in UTF-8.
type Decoder struct {
	// Strict defaults to true, enforcing the requirements
	// of the XML specification.
	// If set to false, the parser allows input containing common
	// mistakes:
	//	* If an element is missing an end tag, the parser invents
	//	  end tags as necessary to keep the return values from Token
	//	  properly balanced.
	//	* In attribute values and character data, unknown or malformed
	//	  character entities (sequences beginning with &) are left alone.
	//
	// Setting:
	//
	//	d.Strict = false;
	//	d.AutoClose = HTMLAutoClose;
	//	d.Entity = HTMLEntity
	//
	// creates a parser that can handle typical HTML.
	//
	// Strict mode does not enforce the requirements of the XML name spaces TR.
	// In particular it does not reject name space tags using undefined prefixes.
	// Such tags are recorded with the unknown prefix as the name space URL.
	Strict bool

	// When Strict == false, AutoClose indicates a set of elements to
	// consider closed immediately after they are opened, regardless
	// of whether an end element is present.
	AutoClose []string

	// Entity can be used to map non-standard entity names to string replacements.
	// The parser behaves as if these standard mappings are present in the map,
	// regardless of the actual map content:
	//
	//	"lt": "<",
	//	"gt": ">",
	//	"amp": "&",
	//	"apos": "'",
	//	"quot": `"`,
	Entity map[string]string

	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// non-UTF-8 charset into UTF-8. If CharsetReader is nil or
	// returns an error, parsing stops with an error. One of
	// the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)

	// DefaultSpace sets the default name space used for unadorned tags,
	// as if the entire XML stream were wrapped in an element containing
	// the attribute xmlns="DefaultSpace".
	DefaultSpace string

	r              io.ByteReader     // input source; installed by switchToReader
	buf            bytes.Buffer      // scratch buffer reused while assembling token bytes
	saved          *bytes.Buffer     // if non-nil, getc appends every byte it reads from r
	stk            *stack            // top of the stack of open elements and namespace undo records
	free           *stack            // free list of stack nodes recycled by push and pop
	needClose      bool              // rawToken must return EndElement{toClose} next
	toClose        Name              // name of the end element owed when needClose is set
	nextToken      Token             // token held back by Token after inventing an auto-close end tag
	nextByte       int               // byte pushed back by ungetc, or -1 if none
	ns             map[string]string // current name space prefix -> URL translations
	err            error             // sticky error; once set, getc and rawToken keep failing
	line           int               // current input line number, maintained by getc and ungetc
	offset         int64             // input byte offset, maintained by getc and ungetc
	unmarshalDepth int               // while > 0, RawToken returns errRawToken
}
| 235 | |
| 236 | // NewDecoder creates a new XML parser reading from r. |
| 237 | // If r does not implement io.ByteReader, NewDecoder will |
| 238 | // do its own buffering. |
| 239 | func NewDecoder(r io.Reader) *Decoder { |
| 240 | d := &Decoder{ |
| 241 | ns: make(map[string]string), |
| 242 | nextByte: -1, |
| 243 | line: 1, |
| 244 | Strict: true, |
| 245 | } |
| 246 | d.switchToReader(r) |
| 247 | return d |
| 248 | } |
| 249 | |
| 250 | // Token returns the next XML token in the input stream. |
| 251 | // At the end of the input stream, Token returns nil, io.EOF. |
| 252 | // |
| 253 | // Slices of bytes in the returned token data refer to the |
| 254 | // parser's internal buffer and remain valid only until the next |
| 255 | // call to Token. To acquire a copy of the bytes, call CopyToken |
| 256 | // or the token's Copy method. |
| 257 | // |
| 258 | // Token expands self-closing elements such as <br/> |
| 259 | // into separate start and end elements returned by successive calls. |
| 260 | // |
| 261 | // Token guarantees that the StartElement and EndElement |
| 262 | // tokens it returns are properly nested and matched: |
| 263 | // if Token encounters an unexpected end element, |
| 264 | // it will return an error. |
| 265 | // |
| 266 | // Token implements XML name spaces as described by |
| 267 | // http://www.w3.org/TR/REC-xml-names/. Each of the |
| 268 | // Name structures contained in the Token has the Space |
| 269 | // set to the URL identifying its name space when known. |
| 270 | // If Token encounters an unrecognized name space prefix, |
| 271 | // it uses the prefix as the Space rather than report an error. |
| 272 | func (d *Decoder) Token() (t Token, err error) { |
| 273 | if d.stk != nil && d.stk.kind == stkEOF { |
| 274 | err = io.EOF |
| 275 | return |
| 276 | } |
| 277 | if d.nextToken != nil { |
| 278 | t = d.nextToken |
| 279 | d.nextToken = nil |
| 280 | } else if t, err = d.rawToken(); err != nil { |
| 281 | return |
| 282 | } |
| 283 | |
| 284 | if !d.Strict { |
| 285 | if t1, ok := d.autoClose(t); ok { |
| 286 | d.nextToken = t |
| 287 | t = t1 |
| 288 | } |
| 289 | } |
| 290 | switch t1 := t.(type) { |
| 291 | case StartElement: |
| 292 | // In XML name spaces, the translations listed in the |
| 293 | // attributes apply to the element name and |
| 294 | // to the other attribute names, so process |
| 295 | // the translations first. |
| 296 | for _, a := range t1.Attr { |
| 297 | if a.Name.Space == "xmlns" { |
| 298 | v, ok := d.ns[a.Name.Local] |
| 299 | d.pushNs(a.Name.Local, v, ok) |
| 300 | d.ns[a.Name.Local] = a.Value |
| 301 | } |
| 302 | if a.Name.Space == "" && a.Name.Local == "xmlns" { |
| 303 | // Default space for untagged names |
| 304 | v, ok := d.ns[""] |
| 305 | d.pushNs("", v, ok) |
| 306 | d.ns[""] = a.Value |
| 307 | } |
| 308 | } |
| 309 | |
| 310 | d.translate(&t1.Name, true) |
| 311 | for i := range t1.Attr { |
| 312 | d.translate(&t1.Attr[i].Name, false) |
| 313 | } |
| 314 | d.pushElement(t1.Name) |
| 315 | t = t1 |
| 316 | |
| 317 | case EndElement: |
| 318 | d.translate(&t1.Name, true) |
| 319 | if !d.popElement(&t1) { |
| 320 | return nil, d.err |
| 321 | } |
| 322 | t = t1 |
| 323 | } |
| 324 | return |
| 325 | } |
| 326 | |
// xmlURL is the name space URL that translate substitutes for the
// "xml" prefix.
const xmlURL = "http://www.w3.org/XML/1998/namespace"
| 328 | |
| 329 | // Apply name space translation to name n. |
| 330 | // The default name space (for Space=="") |
| 331 | // applies only to element names, not to attribute names. |
| 332 | func (d *Decoder) translate(n *Name, isElementName bool) { |
| 333 | switch { |
| 334 | case n.Space == "xmlns": |
| 335 | return |
| 336 | case n.Space == "" && !isElementName: |
| 337 | return |
| 338 | case n.Space == "xml": |
| 339 | n.Space = xmlURL |
| 340 | case n.Space == "" && n.Local == "xmlns": |
| 341 | return |
| 342 | } |
| 343 | if v, ok := d.ns[n.Space]; ok { |
| 344 | n.Space = v |
| 345 | } else if n.Space == "" { |
| 346 | n.Space = d.DefaultSpace |
| 347 | } |
| 348 | } |
| 349 | |
| 350 | func (d *Decoder) switchToReader(r io.Reader) { |
| 351 | // Get efficient byte at a time reader. |
| 352 | // Assume that if reader has its own |
| 353 | // ReadByte, it's efficient enough. |
| 354 | // Otherwise, use bufio. |
| 355 | if rb, ok := r.(io.ByteReader); ok { |
| 356 | d.r = rb |
| 357 | } else { |
| 358 | d.r = bufio.NewReader(r) |
| 359 | } |
| 360 | } |
| 361 | |
// Parsing state - stack holds old name space translations
// and the current set of open elements. The translations to pop when
// ending a given tag are *below* it on the stack, which is
// more work but forced on us by XML.
type stack struct {
	next *stack // next (older) entry, or nil at the bottom
	kind int    // one of stkStart, stkNs, stkEOF
	name Name   // stkStart: element name; stkNs: Local is the prefix, Space its previous URL
	ok   bool   // stkNs only: whether the prefix had a previous translation
}
| 372 | |
// Kinds of stack entry.
const (
	stkStart = iota // an open element (see pushElement)
	stkNs           // an undo record for a name space translation (see pushNs)
	stkEOF          // a marker that makes Token report io.EOF (see pushEOF)
)
| 378 | |
| 379 | func (d *Decoder) push(kind int) *stack { |
| 380 | s := d.free |
| 381 | if s != nil { |
| 382 | d.free = s.next |
| 383 | } else { |
| 384 | s = new(stack) |
| 385 | } |
| 386 | s.next = d.stk |
| 387 | s.kind = kind |
| 388 | d.stk = s |
| 389 | return s |
| 390 | } |
| 391 | |
| 392 | func (d *Decoder) pop() *stack { |
| 393 | s := d.stk |
| 394 | if s != nil { |
| 395 | d.stk = s.next |
| 396 | s.next = d.free |
| 397 | d.free = s |
| 398 | } |
| 399 | return s |
| 400 | } |
| 401 | |
| 402 | // Record that after the current element is finished |
| 403 | // (that element is already pushed on the stack) |
| 404 | // Token should return EOF until popEOF is called. |
| 405 | func (d *Decoder) pushEOF() { |
| 406 | // Walk down stack to find Start. |
| 407 | // It might not be the top, because there might be stkNs |
| 408 | // entries above it. |
| 409 | start := d.stk |
| 410 | for start.kind != stkStart { |
| 411 | start = start.next |
| 412 | } |
| 413 | // The stkNs entries below a start are associated with that |
| 414 | // element too; skip over them. |
| 415 | for start.next != nil && start.next.kind == stkNs { |
| 416 | start = start.next |
| 417 | } |
| 418 | s := d.free |
| 419 | if s != nil { |
| 420 | d.free = s.next |
| 421 | } else { |
| 422 | s = new(stack) |
| 423 | } |
| 424 | s.kind = stkEOF |
| 425 | s.next = start.next |
| 426 | start.next = s |
| 427 | } |
| 428 | |
| 429 | // Undo a pushEOF. |
| 430 | // The element must have been finished, so the EOF should be at the top of the stack. |
| 431 | func (d *Decoder) popEOF() bool { |
| 432 | if d.stk == nil || d.stk.kind != stkEOF { |
| 433 | return false |
| 434 | } |
| 435 | d.pop() |
| 436 | return true |
| 437 | } |
| 438 | |
| 439 | // Record that we are starting an element with the given name. |
| 440 | func (d *Decoder) pushElement(name Name) { |
| 441 | s := d.push(stkStart) |
| 442 | s.name = name |
| 443 | } |
| 444 | |
| 445 | // Record that we are changing the value of ns[local]. |
| 446 | // The old value is url, ok. |
| 447 | func (d *Decoder) pushNs(local string, url string, ok bool) { |
| 448 | s := d.push(stkNs) |
| 449 | s.name.Local = local |
| 450 | s.name.Space = url |
| 451 | s.ok = ok |
| 452 | } |
| 453 | |
| 454 | // Creates a SyntaxError with the current line number. |
| 455 | func (d *Decoder) syntaxError(msg string) error { |
| 456 | return &SyntaxError{Msg: msg, Line: d.line} |
| 457 | } |
| 458 | |
| 459 | // Record that we are ending an element with the given name. |
| 460 | // The name must match the record at the top of the stack, |
| 461 | // which must be a pushElement record. |
| 462 | // After popping the element, apply any undo records from |
| 463 | // the stack to restore the name translations that existed |
| 464 | // before we saw this element. |
| 465 | func (d *Decoder) popElement(t *EndElement) bool { |
| 466 | s := d.pop() |
| 467 | name := t.Name |
| 468 | switch { |
| 469 | case s == nil || s.kind != stkStart: |
| 470 | d.err = d.syntaxError("unexpected end element </" + name.Local + ">") |
| 471 | return false |
| 472 | case s.name.Local != name.Local: |
| 473 | if !d.Strict { |
| 474 | d.needClose = true |
| 475 | d.toClose = t.Name |
| 476 | t.Name = s.name |
| 477 | return true |
| 478 | } |
| 479 | d.err = d.syntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">") |
| 480 | return false |
| 481 | case s.name.Space != name.Space: |
| 482 | d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space + |
| 483 | "closed by </" + name.Local + "> in space " + name.Space) |
| 484 | return false |
| 485 | } |
| 486 | |
| 487 | // Pop stack until a Start or EOF is on the top, undoing the |
| 488 | // translations that were associated with the element we just closed. |
| 489 | for d.stk != nil && d.stk.kind != stkStart && d.stk.kind != stkEOF { |
| 490 | s := d.pop() |
| 491 | if s.ok { |
| 492 | d.ns[s.name.Local] = s.name.Space |
| 493 | } else { |
| 494 | delete(d.ns, s.name.Local) |
| 495 | } |
| 496 | } |
| 497 | |
| 498 | return true |
| 499 | } |
| 500 | |
| 501 | // If the top element on the stack is autoclosing and |
| 502 | // t is not the end tag, invent the end tag. |
| 503 | func (d *Decoder) autoClose(t Token) (Token, bool) { |
| 504 | if d.stk == nil || d.stk.kind != stkStart { |
| 505 | return nil, false |
| 506 | } |
| 507 | name := strings.ToLower(d.stk.name.Local) |
| 508 | for _, s := range d.AutoClose { |
| 509 | if strings.ToLower(s) == name { |
| 510 | // This one should be auto closed if t doesn't close it. |
| 511 | et, ok := t.(EndElement) |
| 512 | if !ok || et.Name.Local != name { |
| 513 | return EndElement{d.stk.name}, true |
| 514 | } |
| 515 | break |
| 516 | } |
| 517 | } |
| 518 | return nil, false |
| 519 | } |
| 520 | |
// errRawToken is the error RawToken returns when it is called while
// d.unmarshalDepth is greater than zero.
var errRawToken = errors.New("xml: cannot use RawToken from UnmarshalXML method")
| 522 | |
| 523 | // RawToken is like Token but does not verify that |
| 524 | // start and end elements match and does not translate |
| 525 | // name space prefixes to their corresponding URLs. |
| 526 | func (d *Decoder) RawToken() (Token, error) { |
| 527 | if d.unmarshalDepth > 0 { |
| 528 | return nil, errRawToken |
| 529 | } |
| 530 | return d.rawToken() |
| 531 | } |
| 532 | |
// rawToken reads the next token from the input without checking
// element nesting or translating name space prefixes.
//
// Depending on what follows in the input it returns a CharData (plain
// text or a CDATA section), EndElement, ProcInst, Comment, Directive
// or StartElement. A self-closing element is returned as a
// StartElement, with needClose set so that the next call returns the
// matching EndElement. On failure it returns nil and the error stored
// in d.err; once d.err is set every later call fails the same way.
func (d *Decoder) rawToken() (Token, error) {
	if d.err != nil {
		return nil, d.err
	}
	if d.needClose {
		// The last element we read was self-closing and
		// we returned just the StartElement half.
		// Return the EndElement half now.
		// (popElement also sets needClose, in non-strict mode, when an
		// end tag does not match the open element.)
		d.needClose = false
		return EndElement{d.toClose}, nil
	}

	b, ok := d.getc()
	if !ok {
		return nil, d.err
	}

	if b != '<' {
		// Text section.
		d.ungetc(b)
		data := d.text(-1, false)
		if data == nil {
			return nil, d.err
		}
		return CharData(data), nil
	}

	// From here on we are inside markup, so running out of input is a
	// syntax error: use mustgetc rather than getc.
	if b, ok = d.mustgetc(); !ok {
		return nil, d.err
	}
	switch b {
	case '/':
		// </: End element
		var name Name
		if name, ok = d.nsname(); !ok {
			if d.err == nil {
				d.err = d.syntaxError("expected element name after </")
			}
			return nil, d.err
		}
		d.space()
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		if b != '>' {
			d.err = d.syntaxError("invalid characters between </" + name.Local + " and >")
			return nil, d.err
		}
		return EndElement{name}, nil

	case '?':
		// <?: Processing instruction.
		var target string
		if target, ok = d.name(); !ok {
			if d.err == nil {
				d.err = d.syntaxError("expected target name after <?")
			}
			return nil, d.err
		}
		d.space()
		d.buf.Reset()
		// Accumulate bytes until the two-byte terminator ?> is seen;
		// b0 remembers the previous byte.
		var b0 byte
		for {
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			d.buf.WriteByte(b)
			if b0 == '?' && b == '>' {
				break
			}
			b0 = b
		}
		data := d.buf.Bytes()
		data = data[0 : len(data)-2] // chop ?>

		if target == "xml" {
			// The XML declaration: check the declared version and
			// switch readers if a non-UTF-8 encoding is declared.
			content := string(data)
			ver := procInst("version", content)
			if ver != "" && ver != "1.0" {
				d.err = fmt.Errorf("xml: unsupported version %q; only version 1.0 is supported", ver)
				return nil, d.err
			}
			enc := procInst("encoding", content)
			if enc != "" && enc != "utf-8" && enc != "UTF-8" {
				if d.CharsetReader == nil {
					d.err = fmt.Errorf("xml: encoding %q declared but Decoder.CharsetReader is nil", enc)
					return nil, d.err
				}
				newr, err := d.CharsetReader(enc, d.r.(io.Reader))
				if err != nil {
					d.err = fmt.Errorf("xml: opening charset %q: %v", enc, err)
					return nil, d.err
				}
				if newr == nil {
					panic("CharsetReader returned a nil Reader for charset " + enc)
				}
				d.switchToReader(newr)
			}
		}
		return ProcInst{target, data}, nil

	case '!':
		// <!: Maybe comment, maybe CDATA.
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		switch b {
		case '-': // <!-
			// Probably <!-- for a comment.
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			if b != '-' {
				d.err = d.syntaxError("invalid sequence <!- not part of <!--")
				return nil, d.err
			}
			// Look for terminator.
			// b0 and b1 hold the two bytes read before b.
			d.buf.Reset()
			var b0, b1 byte
			for {
				if b, ok = d.mustgetc(); !ok {
					return nil, d.err
				}
				d.buf.WriteByte(b)
				if b0 == '-' && b1 == '-' && b == '>' {
					break
				}
				b0, b1 = b1, b
			}
			data := d.buf.Bytes()
			data = data[0 : len(data)-3] // chop -->
			return Comment(data), nil

		case '[': // <![
			// Probably <![CDATA[.
			for i := 0; i < 6; i++ {
				if b, ok = d.mustgetc(); !ok {
					return nil, d.err
				}
				if b != "CDATA["[i] {
					d.err = d.syntaxError("invalid <![ sequence")
					return nil, d.err
				}
			}
			// Have <![CDATA[. Read text until ]]>.
			data := d.text(-1, true)
			if data == nil {
				return nil, d.err
			}
			return CharData(data), nil
		}

		// Probably a directive: <!DOCTYPE ...>, <!ENTITY ...>, etc.
		// We don't care, but accumulate for caller. Quoted angle
		// brackets do not count for nesting.
		d.buf.Reset()
		d.buf.WriteByte(b)
		inquote := uint8(0) // the open quote character, or 0 when not inside quotes
		depth := 0          // nesting depth of unquoted < ... > pairs
		for {
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			if inquote == 0 && b == '>' && depth == 0 {
				break
			}
		HandleB:
			d.buf.WriteByte(b)
			switch {
			case b == inquote:
				inquote = 0

			case inquote != 0:
				// in quotes, no special action

			case b == '\'' || b == '"':
				inquote = b

			case b == '>' && inquote == 0:
				depth--

			case b == '<' && inquote == 0:
				// Look for <!-- to begin comment.
				s := "!--"
				for i := 0; i < len(s); i++ {
					if b, ok = d.mustgetc(); !ok {
						return nil, d.err
					}
					if b != s[i] {
						// Not a comment after all: write out the
						// part of "!--" that did match, count the
						// < as a nested bracket, and process the
						// mismatching byte as ordinary input.
						for j := 0; j < i; j++ {
							d.buf.WriteByte(s[j])
						}
						depth++
						goto HandleB
					}
				}

				// Remove < that was written above.
				d.buf.Truncate(d.buf.Len() - 1)

				// Look for terminator.
				// The comment's bytes are read but not stored.
				var b0, b1 byte
				for {
					if b, ok = d.mustgetc(); !ok {
						return nil, d.err
					}
					if b0 == '-' && b1 == '-' && b == '>' {
						break
					}
					b0, b1 = b1, b
				}
			}
		}
		return Directive(d.buf.Bytes()), nil
	}

	// Must be an open element like <a href="foo">
	d.ungetc(b)

	var (
		name  Name
		empty bool
		attr  []Attr
	)
	if name, ok = d.nsname(); !ok {
		if d.err == nil {
			d.err = d.syntaxError("expected element name after <")
		}
		return nil, d.err
	}

	attr = []Attr{}
	for {
		d.space()
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		if b == '/' {
			// Self-closing element: /> must follow immediately.
			empty = true
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			if b != '>' {
				d.err = d.syntaxError("expected /> in element")
				return nil, d.err
			}
			break
		}
		if b == '>' {
			break
		}
		d.ungetc(b)

		// Make room for one more attribute, doubling the capacity
		// (starting at 4) when the slice is full.
		n := len(attr)
		if n >= cap(attr) {
			nCap := 2 * cap(attr)
			if nCap == 0 {
				nCap = 4
			}
			nattr := make([]Attr, n, nCap)
			copy(nattr, attr)
			attr = nattr
		}
		attr = attr[0 : n+1]
		a := &attr[n]
		if a.Name, ok = d.nsname(); !ok {
			if d.err == nil {
				d.err = d.syntaxError("expected attribute name in element")
			}
			return nil, d.err
		}
		d.space()
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		if b != '=' {
			if d.Strict {
				d.err = d.syntaxError("attribute name without = in element")
				return nil, d.err
			} else {
				// Non-strict mode: an attribute without a value
				// takes its own local name as its value.
				d.ungetc(b)
				a.Value = a.Name.Local
			}
		} else {
			d.space()
			data := d.attrval()
			if data == nil {
				return nil, d.err
			}
			a.Value = string(data)
		}
	}
	if empty {
		// Arrange for the next call to return the EndElement half.
		d.needClose = true
		d.toClose = name
	}
	return StartElement{name, attr}, nil
}
| 831 | |
| 832 | func (d *Decoder) attrval() []byte { |
| 833 | b, ok := d.mustgetc() |
| 834 | if !ok { |
| 835 | return nil |
| 836 | } |
| 837 | // Handle quoted attribute values |
| 838 | if b == '"' || b == '\'' { |
| 839 | return d.text(int(b), false) |
| 840 | } |
| 841 | // Handle unquoted attribute values for strict parsers |
| 842 | if d.Strict { |
| 843 | d.err = d.syntaxError("unquoted or missing attribute value in element") |
| 844 | return nil |
| 845 | } |
| 846 | // Handle unquoted attribute values for unstrict parsers |
| 847 | d.ungetc(b) |
| 848 | d.buf.Reset() |
| 849 | for { |
| 850 | b, ok = d.mustgetc() |
| 851 | if !ok { |
| 852 | return nil |
| 853 | } |
| 854 | // http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.2 |
| 855 | if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || |
| 856 | '0' <= b && b <= '9' || b == '_' || b == ':' || b == '-' { |
| 857 | d.buf.WriteByte(b) |
| 858 | } else { |
| 859 | d.ungetc(b) |
| 860 | break |
| 861 | } |
| 862 | } |
| 863 | return d.buf.Bytes() |
| 864 | } |
| 865 | |
| 866 | // Skip spaces if any |
| 867 | func (d *Decoder) space() { |
| 868 | for { |
| 869 | b, ok := d.getc() |
| 870 | if !ok { |
| 871 | return |
| 872 | } |
| 873 | switch b { |
| 874 | case ' ', '\r', '\n', '\t': |
| 875 | default: |
| 876 | d.ungetc(b) |
| 877 | return |
| 878 | } |
| 879 | } |
| 880 | } |
| 881 | |
| 882 | // Read a single byte. |
| 883 | // If there is no byte to read, return ok==false |
| 884 | // and leave the error in d.err. |
| 885 | // Maintain line number. |
| 886 | func (d *Decoder) getc() (b byte, ok bool) { |
| 887 | if d.err != nil { |
| 888 | return 0, false |
| 889 | } |
| 890 | if d.nextByte >= 0 { |
| 891 | b = byte(d.nextByte) |
| 892 | d.nextByte = -1 |
| 893 | } else { |
| 894 | b, d.err = d.r.ReadByte() |
| 895 | if d.err != nil { |
| 896 | return 0, false |
| 897 | } |
| 898 | if d.saved != nil { |
| 899 | d.saved.WriteByte(b) |
| 900 | } |
| 901 | } |
| 902 | if b == '\n' { |
| 903 | d.line++ |
| 904 | } |
| 905 | d.offset++ |
| 906 | return b, true |
| 907 | } |
| 908 | |
// InputOffset returns the input stream byte offset of the current decoder position.
// The offset gives the location of the end of the most recently returned token
// and the beginning of the next token.
func (d *Decoder) InputOffset() int64 {
	// d.offset is incremented by getc and decremented by ungetc.
	return d.offset
}
| 915 | |
| 916 | // Return saved offset. |
| 917 | // If we did ungetc (nextByte >= 0), have to back up one. |
| 918 | func (d *Decoder) savedOffset() int { |
| 919 | n := d.saved.Len() |
| 920 | if d.nextByte >= 0 { |
| 921 | n-- |
| 922 | } |
| 923 | return n |
| 924 | } |
| 925 | |
| 926 | // Must read a single byte. |
| 927 | // If there is no byte to read, |
| 928 | // set d.err to SyntaxError("unexpected EOF") |
| 929 | // and return ok==false |
| 930 | func (d *Decoder) mustgetc() (b byte, ok bool) { |
| 931 | if b, ok = d.getc(); !ok { |
| 932 | if d.err == io.EOF { |
| 933 | d.err = d.syntaxError("unexpected EOF") |
| 934 | } |
| 935 | } |
| 936 | return |
| 937 | } |
| 938 | |
| 939 | // Unread a single byte. |
| 940 | func (d *Decoder) ungetc(b byte) { |
| 941 | if b == '\n' { |
| 942 | d.line-- |
| 943 | } |
| 944 | d.nextByte = int(b) |
| 945 | d.offset-- |
| 946 | } |
| 947 | |
// entity maps the names of the standard entities (lt, gt, amp, apos
// and quot) to the characters they stand for. text consults it before
// falling back to Decoder.Entity.
var entity = map[string]int{
	"lt":   '<',
	"gt":   '>',
	"amp":  '&',
	"apos": '\'',
	"quot": '"',
}
| 955 | |
| 956 | // Read plain text section (XML calls it character data). |
| 957 | // If quote >= 0, we are in a quoted string and need to find the matching quote. |
| 958 | // If cdata == true, we are in a <![CDATA[ section and need to find ]]>. |
| 959 | // On failure return nil and leave the error in d.err. |
| 960 | func (d *Decoder) text(quote int, cdata bool) []byte { |
| 961 | var b0, b1 byte |
| 962 | var trunc int |
| 963 | d.buf.Reset() |
| 964 | Input: |
| 965 | for { |
| 966 | b, ok := d.getc() |
| 967 | if !ok { |
| 968 | if cdata { |
| 969 | if d.err == io.EOF { |
| 970 | d.err = d.syntaxError("unexpected EOF in CDATA section") |
| 971 | } |
| 972 | return nil |
| 973 | } |
| 974 | break Input |
| 975 | } |
| 976 | |
| 977 | // <![CDATA[ section ends with ]]>. |
| 978 | // It is an error for ]]> to appear in ordinary text. |
| 979 | if b0 == ']' && b1 == ']' && b == '>' { |
| 980 | if cdata { |
| 981 | trunc = 2 |
| 982 | break Input |
| 983 | } |
| 984 | d.err = d.syntaxError("unescaped ]]> not in CDATA section") |
| 985 | return nil |
| 986 | } |
| 987 | |
| 988 | // Stop reading text if we see a <. |
| 989 | if b == '<' && !cdata { |
| 990 | if quote >= 0 { |
| 991 | d.err = d.syntaxError("unescaped < inside quoted string") |
| 992 | return nil |
| 993 | } |
| 994 | d.ungetc('<') |
| 995 | break Input |
| 996 | } |
| 997 | if quote >= 0 && b == byte(quote) { |
| 998 | break Input |
| 999 | } |
| 1000 | if b == '&' && !cdata { |
| 1001 | // Read escaped character expression up to semicolon. |
| 1002 | // XML in all its glory allows a document to define and use |
| 1003 | // its own character names with <!ENTITY ...> directives. |
| 1004 | // Parsers are required to recognize lt, gt, amp, apos, and quot |
| 1005 | // even if they have not been declared. |
| 1006 | before := d.buf.Len() |
| 1007 | d.buf.WriteByte('&') |
| 1008 | var ok bool |
| 1009 | var text string |
| 1010 | var haveText bool |
| 1011 | if b, ok = d.mustgetc(); !ok { |
| 1012 | return nil |
| 1013 | } |
| 1014 | if b == '#' { |
| 1015 | d.buf.WriteByte(b) |
| 1016 | if b, ok = d.mustgetc(); !ok { |
| 1017 | return nil |
| 1018 | } |
| 1019 | base := 10 |
| 1020 | if b == 'x' { |
| 1021 | base = 16 |
| 1022 | d.buf.WriteByte(b) |
| 1023 | if b, ok = d.mustgetc(); !ok { |
| 1024 | return nil |
| 1025 | } |
| 1026 | } |
| 1027 | start := d.buf.Len() |
| 1028 | for '0' <= b && b <= '9' || |
| 1029 | base == 16 && 'a' <= b && b <= 'f' || |
| 1030 | base == 16 && 'A' <= b && b <= 'F' { |
| 1031 | d.buf.WriteByte(b) |
| 1032 | if b, ok = d.mustgetc(); !ok { |
| 1033 | return nil |
| 1034 | } |
| 1035 | } |
| 1036 | if b != ';' { |
| 1037 | d.ungetc(b) |
| 1038 | } else { |
| 1039 | s := string(d.buf.Bytes()[start:]) |
| 1040 | d.buf.WriteByte(';') |
| 1041 | n, err := strconv.ParseUint(s, base, 64) |
| 1042 | if err == nil && n <= unicode.MaxRune { |
| 1043 | text = string(n) |
| 1044 | haveText = true |
| 1045 | } |
| 1046 | } |
| 1047 | } else { |
| 1048 | d.ungetc(b) |
| 1049 | if !d.readName() { |
| 1050 | if d.err != nil { |
| 1051 | return nil |
| 1052 | } |
| 1053 | ok = false |
| 1054 | } |
| 1055 | if b, ok = d.mustgetc(); !ok { |
| 1056 | return nil |
| 1057 | } |
| 1058 | if b != ';' { |
| 1059 | d.ungetc(b) |
| 1060 | } else { |
| 1061 | name := d.buf.Bytes()[before+1:] |
| 1062 | d.buf.WriteByte(';') |
| 1063 | if isName(name) { |
| 1064 | s := string(name) |
| 1065 | if r, ok := entity[s]; ok { |
| 1066 | text = string(r) |
| 1067 | haveText = true |
| 1068 | } else if d.Entity != nil { |
| 1069 | text, haveText = d.Entity[s] |
| 1070 | } |
| 1071 | } |
| 1072 | } |
| 1073 | } |
| 1074 | |
| 1075 | if haveText { |
| 1076 | d.buf.Truncate(before) |
| 1077 | d.buf.Write([]byte(text)) |
| 1078 | b0, b1 = 0, 0 |
| 1079 | continue Input |
| 1080 | } |
| 1081 | if !d.Strict { |
| 1082 | b0, b1 = 0, 0 |
| 1083 | continue Input |
| 1084 | } |
| 1085 | ent := string(d.buf.Bytes()[before:]) |
| 1086 | if ent[len(ent)-1] != ';' { |
| 1087 | ent += " (no semicolon)" |
| 1088 | } |
| 1089 | d.err = d.syntaxError("invalid character entity " + ent) |
| 1090 | return nil |
| 1091 | } |
| 1092 | |
| 1093 | // We must rewrite unescaped \r and \r\n into \n. |
| 1094 | if b == '\r' { |
| 1095 | d.buf.WriteByte('\n') |
| 1096 | } else if b1 == '\r' && b == '\n' { |
| 1097 | // Skip \r\n--we already wrote \n. |
| 1098 | } else { |
| 1099 | d.buf.WriteByte(b) |
| 1100 | } |
| 1101 | |
| 1102 | b0, b1 = b1, b |
| 1103 | } |
| 1104 | data := d.buf.Bytes() |
| 1105 | data = data[0 : len(data)-trunc] |
| 1106 | |
| 1107 | // Inspect each rune for being a disallowed character. |
| 1108 | buf := data |
| 1109 | for len(buf) > 0 { |
| 1110 | r, size := utf8.DecodeRune(buf) |
| 1111 | if r == utf8.RuneError && size == 1 { |
| 1112 | d.err = d.syntaxError("invalid UTF-8") |
| 1113 | return nil |
| 1114 | } |
| 1115 | buf = buf[size:] |
| 1116 | if !isInCharacterRange(r) { |
| 1117 | d.err = d.syntaxError(fmt.Sprintf("illegal character code %U", r)) |
| 1118 | return nil |
| 1119 | } |
| 1120 | } |
| 1121 | |
| 1122 | return data |
| 1123 | } |
| 1124 | |
// isInCharacterRange reports whether the rune r is in the XML
// character range, per the Char production of
// http://www.xml.com/axml/testaxml.htm, Section 2.2 Characters:
//
//	Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// The surrogate block U+D800-U+DFFF and the noncharacters U+FFFE and
// U+FFFF are excluded.
func isInCharacterRange(r rune) (inrange bool) {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		r >= 0x20 && r <= 0xD7FF ||
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}
| 1136 | |
| 1137 | // Get name space name: name with a : stuck in the middle. |
| 1138 | // The part before the : is the name space identifier. |
| 1139 | func (d *Decoder) nsname() (name Name, ok bool) { |
| 1140 | s, ok := d.name() |
| 1141 | if !ok { |
| 1142 | return |
| 1143 | } |
| 1144 | i := strings.Index(s, ":") |
| 1145 | if i < 0 { |
| 1146 | name.Local = s |
| 1147 | } else { |
| 1148 | name.Space = s[0:i] |
| 1149 | name.Local = s[i+1:] |
| 1150 | } |
| 1151 | return name, true |
| 1152 | } |
| 1153 | |
| 1154 | // Get name: /first(first|second)*/ |
| 1155 | // Do not set d.err if the name is missing (unless unexpected EOF is received): |
| 1156 | // let the caller provide better context. |
| 1157 | func (d *Decoder) name() (s string, ok bool) { |
| 1158 | d.buf.Reset() |
| 1159 | if !d.readName() { |
| 1160 | return "", false |
| 1161 | } |
| 1162 | |
| 1163 | // Now we check the characters. |
| 1164 | b := d.buf.Bytes() |
| 1165 | if !isName(b) { |
| 1166 | d.err = d.syntaxError("invalid XML name: " + string(b)) |
| 1167 | return "", false |
| 1168 | } |
| 1169 | return string(b), true |
| 1170 | } |
| 1171 | |
| 1172 | // Read a name and append its bytes to d.buf. |
| 1173 | // The name is delimited by any single-byte character not valid in names. |
| 1174 | // All multi-byte characters are accepted; the caller must check their validity. |
| 1175 | func (d *Decoder) readName() (ok bool) { |
| 1176 | var b byte |
| 1177 | if b, ok = d.mustgetc(); !ok { |
| 1178 | return |
| 1179 | } |
| 1180 | if b < utf8.RuneSelf && !isNameByte(b) { |
| 1181 | d.ungetc(b) |
| 1182 | return false |
| 1183 | } |
| 1184 | d.buf.WriteByte(b) |
| 1185 | |
| 1186 | for { |
| 1187 | if b, ok = d.mustgetc(); !ok { |
| 1188 | return |
| 1189 | } |
| 1190 | if b < utf8.RuneSelf && !isNameByte(b) { |
| 1191 | d.ungetc(b) |
| 1192 | break |
| 1193 | } |
| 1194 | d.buf.WriteByte(b) |
| 1195 | } |
| 1196 | return true |
| 1197 | } |
| 1198 | |
// isNameByte reports whether the single byte c may appear in a name:
// an ASCII letter or digit, or one of '_', ':', '.', '-'.
func isNameByte(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	case '0' <= c && c <= '9':
		return true
	}
	switch c {
	case '_', ':', '.', '-':
		return true
	}
	return false
}
| 1205 | |
| 1206 | func isName(s []byte) bool { |
| 1207 | if len(s) == 0 { |
| 1208 | return false |
| 1209 | } |
| 1210 | c, n := utf8.DecodeRune(s) |
| 1211 | if c == utf8.RuneError && n == 1 { |
| 1212 | return false |
| 1213 | } |
| 1214 | if !unicode.Is(first, c) { |
| 1215 | return false |
| 1216 | } |
| 1217 | for n < len(s) { |
| 1218 | s = s[n:] |
| 1219 | c, n = utf8.DecodeRune(s) |
| 1220 | if c == utf8.RuneError && n == 1 { |
| 1221 | return false |
| 1222 | } |
| 1223 | if !unicode.Is(first, c) && !unicode.Is(second, c) { |
| 1224 | return false |
| 1225 | } |
| 1226 | } |
| 1227 | return true |
| 1228 | } |
| 1229 | |
| 1230 | func isNameString(s string) bool { |
| 1231 | if len(s) == 0 { |
| 1232 | return false |
| 1233 | } |
| 1234 | c, n := utf8.DecodeRuneInString(s) |
| 1235 | if c == utf8.RuneError && n == 1 { |
| 1236 | return false |
| 1237 | } |
| 1238 | if !unicode.Is(first, c) { |
| 1239 | return false |
| 1240 | } |
| 1241 | for n < len(s) { |
| 1242 | s = s[n:] |
| 1243 | c, n = utf8.DecodeRuneInString(s) |
| 1244 | if c == utf8.RuneError && n == 1 { |
| 1245 | return false |
| 1246 | } |
| 1247 | if !unicode.Is(first, c) && !unicode.Is(second, c) { |
| 1248 | return false |
| 1249 | } |
| 1250 | } |
| 1251 | return true |
| 1252 | } |
| 1253 | |
| 1254 | // These tables were generated by cut and paste from Appendix B of |
| 1255 | // the XML spec at http://www.xml.com/axml/testaxml.htm |
| 1256 | // and then reformatting. First corresponds to (Letter | '_' | ':') |
| 1257 | // and second corresponds to NameChar. |
| 1258 | |
// first is the table of runes that may begin a name; it corresponds to
// (Letter | '_' | ':') in Appendix B of the XML spec. isName and
// isNameString require a name's first rune to be in this table and
// also accept these runes in any later position.
// Each entry is {Lo, Hi, Stride}.
var first = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x003A, 0x003A, 1},
		{0x0041, 0x005A, 1},
		{0x005F, 0x005F, 1},
		{0x0061, 0x007A, 1},
		{0x00C0, 0x00D6, 1},
		{0x00D8, 0x00F6, 1},
		{0x00F8, 0x00FF, 1},
		{0x0100, 0x0131, 1},
		{0x0134, 0x013E, 1},
		{0x0141, 0x0148, 1},
		{0x014A, 0x017E, 1},
		{0x0180, 0x01C3, 1},
		{0x01CD, 0x01F0, 1},
		{0x01F4, 0x01F5, 1},
		{0x01FA, 0x0217, 1},
		{0x0250, 0x02A8, 1},
		{0x02BB, 0x02C1, 1},
		{0x0386, 0x0386, 1},
		{0x0388, 0x038A, 1},
		{0x038C, 0x038C, 1},
		{0x038E, 0x03A1, 1},
		{0x03A3, 0x03CE, 1},
		{0x03D0, 0x03D6, 1},
		{0x03DA, 0x03E0, 2},
		{0x03E2, 0x03F3, 1},
		{0x0401, 0x040C, 1},
		{0x040E, 0x044F, 1},
		{0x0451, 0x045C, 1},
		{0x045E, 0x0481, 1},
		{0x0490, 0x04C4, 1},
		{0x04C7, 0x04C8, 1},
		{0x04CB, 0x04CC, 1},
		{0x04D0, 0x04EB, 1},
		{0x04EE, 0x04F5, 1},
		{0x04F8, 0x04F9, 1},
		{0x0531, 0x0556, 1},
		{0x0559, 0x0559, 1},
		{0x0561, 0x0586, 1},
		{0x05D0, 0x05EA, 1},
		{0x05F0, 0x05F2, 1},
		{0x0621, 0x063A, 1},
		{0x0641, 0x064A, 1},
		{0x0671, 0x06B7, 1},
		{0x06BA, 0x06BE, 1},
		{0x06C0, 0x06CE, 1},
		{0x06D0, 0x06D3, 1},
		{0x06D5, 0x06D5, 1},
		{0x06E5, 0x06E6, 1},
		{0x0905, 0x0939, 1},
		{0x093D, 0x093D, 1},
		{0x0958, 0x0961, 1},
		{0x0985, 0x098C, 1},
		{0x098F, 0x0990, 1},
		{0x0993, 0x09A8, 1},
		{0x09AA, 0x09B0, 1},
		{0x09B2, 0x09B2, 1},
		{0x09B6, 0x09B9, 1},
		{0x09DC, 0x09DD, 1},
		{0x09DF, 0x09E1, 1},
		{0x09F0, 0x09F1, 1},
		{0x0A05, 0x0A0A, 1},
		{0x0A0F, 0x0A10, 1},
		{0x0A13, 0x0A28, 1},
		{0x0A2A, 0x0A30, 1},
		{0x0A32, 0x0A33, 1},
		{0x0A35, 0x0A36, 1},
		{0x0A38, 0x0A39, 1},
		{0x0A59, 0x0A5C, 1},
		{0x0A5E, 0x0A5E, 1},
		{0x0A72, 0x0A74, 1},
		{0x0A85, 0x0A8B, 1},
		{0x0A8D, 0x0A8D, 1},
		{0x0A8F, 0x0A91, 1},
		{0x0A93, 0x0AA8, 1},
		{0x0AAA, 0x0AB0, 1},
		{0x0AB2, 0x0AB3, 1},
		{0x0AB5, 0x0AB9, 1},
		{0x0ABD, 0x0AE0, 0x23},
		{0x0B05, 0x0B0C, 1},
		{0x0B0F, 0x0B10, 1},
		{0x0B13, 0x0B28, 1},
		{0x0B2A, 0x0B30, 1},
		{0x0B32, 0x0B33, 1},
		{0x0B36, 0x0B39, 1},
		{0x0B3D, 0x0B3D, 1},
		{0x0B5C, 0x0B5D, 1},
		{0x0B5F, 0x0B61, 1},
		{0x0B85, 0x0B8A, 1},
		{0x0B8E, 0x0B90, 1},
		{0x0B92, 0x0B95, 1},
		{0x0B99, 0x0B9A, 1},
		{0x0B9C, 0x0B9C, 1},
		{0x0B9E, 0x0B9F, 1},
		{0x0BA3, 0x0BA4, 1},
		{0x0BA8, 0x0BAA, 1},
		{0x0BAE, 0x0BB5, 1},
		{0x0BB7, 0x0BB9, 1},
		{0x0C05, 0x0C0C, 1},
		{0x0C0E, 0x0C10, 1},
		{0x0C12, 0x0C28, 1},
		{0x0C2A, 0x0C33, 1},
		{0x0C35, 0x0C39, 1},
		{0x0C60, 0x0C61, 1},
		{0x0C85, 0x0C8C, 1},
		{0x0C8E, 0x0C90, 1},
		{0x0C92, 0x0CA8, 1},
		{0x0CAA, 0x0CB3, 1},
		{0x0CB5, 0x0CB9, 1},
		{0x0CDE, 0x0CDE, 1},
		{0x0CE0, 0x0CE1, 1},
		{0x0D05, 0x0D0C, 1},
		{0x0D0E, 0x0D10, 1},
		{0x0D12, 0x0D28, 1},
		{0x0D2A, 0x0D39, 1},
		{0x0D60, 0x0D61, 1},
		{0x0E01, 0x0E2E, 1},
		{0x0E30, 0x0E30, 1},
		{0x0E32, 0x0E33, 1},
		{0x0E40, 0x0E45, 1},
		{0x0E81, 0x0E82, 1},
		{0x0E84, 0x0E84, 1},
		{0x0E87, 0x0E88, 1},
		{0x0E8A, 0x0E8D, 3},
		{0x0E94, 0x0E97, 1},
		{0x0E99, 0x0E9F, 1},
		{0x0EA1, 0x0EA3, 1},
		{0x0EA5, 0x0EA7, 2},
		{0x0EAA, 0x0EAB, 1},
		{0x0EAD, 0x0EAE, 1},
		{0x0EB0, 0x0EB0, 1},
		{0x0EB2, 0x0EB3, 1},
		{0x0EBD, 0x0EBD, 1},
		{0x0EC0, 0x0EC4, 1},
		{0x0F40, 0x0F47, 1},
		{0x0F49, 0x0F69, 1},
		{0x10A0, 0x10C5, 1},
		{0x10D0, 0x10F6, 1},
		{0x1100, 0x1100, 1},
		{0x1102, 0x1103, 1},
		{0x1105, 0x1107, 1},
		{0x1109, 0x1109, 1},
		{0x110B, 0x110C, 1},
		{0x110E, 0x1112, 1},
		{0x113C, 0x1140, 2},
		{0x114C, 0x1150, 2},
		{0x1154, 0x1155, 1},
		{0x1159, 0x1159, 1},
		{0x115F, 0x1161, 1},
		{0x1163, 0x1169, 2},
		{0x116D, 0x116E, 1},
		{0x1172, 0x1173, 1},
		{0x1175, 0x119E, 0x119E - 0x1175},
		{0x11A8, 0x11AB, 0x11AB - 0x11A8},
		{0x11AE, 0x11AF, 1},
		{0x11B7, 0x11B8, 1},
		{0x11BA, 0x11BA, 1},
		{0x11BC, 0x11C2, 1},
		{0x11EB, 0x11F0, 0x11F0 - 0x11EB},
		{0x11F9, 0x11F9, 1},
		{0x1E00, 0x1E9B, 1},
		{0x1EA0, 0x1EF9, 1},
		{0x1F00, 0x1F15, 1},
		{0x1F18, 0x1F1D, 1},
		{0x1F20, 0x1F45, 1},
		{0x1F48, 0x1F4D, 1},
		{0x1F50, 0x1F57, 1},
		{0x1F59, 0x1F5B, 0x1F5B - 0x1F59},
		{0x1F5D, 0x1F5D, 1},
		{0x1F5F, 0x1F7D, 1},
		{0x1F80, 0x1FB4, 1},
		{0x1FB6, 0x1FBC, 1},
		{0x1FBE, 0x1FBE, 1},
		{0x1FC2, 0x1FC4, 1},
		{0x1FC6, 0x1FCC, 1},
		{0x1FD0, 0x1FD3, 1},
		{0x1FD6, 0x1FDB, 1},
		{0x1FE0, 0x1FEC, 1},
		{0x1FF2, 0x1FF4, 1},
		{0x1FF6, 0x1FFC, 1},
		{0x2126, 0x2126, 1},
		{0x212A, 0x212B, 1},
		{0x212E, 0x212E, 1},
		{0x2180, 0x2182, 1},
		{0x3007, 0x3007, 1},
		{0x3021, 0x3029, 1},
		{0x3041, 0x3094, 1},
		{0x30A1, 0x30FA, 1},
		{0x3105, 0x312C, 1},
		{0x4E00, 0x9FA5, 1},
		{0xAC00, 0xD7A3, 1},
	},
}
| 1453 | |
// second is the table of additional runes that isName and isNameString
// accept after the first rune of a name, on top of those in the first
// table; together they correspond to NameChar in Appendix B of the XML
// spec. Each entry is {Lo, Hi, Stride}.
var second = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x002D, 0x002E, 1},
		{0x0030, 0x0039, 1},
		{0x00B7, 0x00B7, 1},
		{0x02D0, 0x02D1, 1},
		{0x0300, 0x0345, 1},
		{0x0360, 0x0361, 1},
		{0x0387, 0x0387, 1},
		{0x0483, 0x0486, 1},
		{0x0591, 0x05A1, 1},
		{0x05A3, 0x05B9, 1},
		{0x05BB, 0x05BD, 1},
		{0x05BF, 0x05BF, 1},
		{0x05C1, 0x05C2, 1},
		{0x05C4, 0x0640, 0x0640 - 0x05C4},
		{0x064B, 0x0652, 1},
		{0x0660, 0x0669, 1},
		{0x0670, 0x0670, 1},
		{0x06D6, 0x06DC, 1},
		{0x06DD, 0x06DF, 1},
		{0x06E0, 0x06E4, 1},
		{0x06E7, 0x06E8, 1},
		{0x06EA, 0x06ED, 1},
		{0x06F0, 0x06F9, 1},
		{0x0901, 0x0903, 1},
		{0x093C, 0x093C, 1},
		{0x093E, 0x094C, 1},
		{0x094D, 0x094D, 1},
		{0x0951, 0x0954, 1},
		{0x0962, 0x0963, 1},
		{0x0966, 0x096F, 1},
		{0x0981, 0x0983, 1},
		{0x09BC, 0x09BC, 1},
		{0x09BE, 0x09BF, 1},
		{0x09C0, 0x09C4, 1},
		{0x09C7, 0x09C8, 1},
		{0x09CB, 0x09CD, 1},
		{0x09D7, 0x09D7, 1},
		{0x09E2, 0x09E3, 1},
		{0x09E6, 0x09EF, 1},
		{0x0A02, 0x0A3C, 0x3A},
		{0x0A3E, 0x0A3F, 1},
		{0x0A40, 0x0A42, 1},
		{0x0A47, 0x0A48, 1},
		{0x0A4B, 0x0A4D, 1},
		{0x0A66, 0x0A6F, 1},
		{0x0A70, 0x0A71, 1},
		{0x0A81, 0x0A83, 1},
		{0x0ABC, 0x0ABC, 1},
		{0x0ABE, 0x0AC5, 1},
		{0x0AC7, 0x0AC9, 1},
		{0x0ACB, 0x0ACD, 1},
		{0x0AE6, 0x0AEF, 1},
		{0x0B01, 0x0B03, 1},
		{0x0B3C, 0x0B3C, 1},
		{0x0B3E, 0x0B43, 1},
		{0x0B47, 0x0B48, 1},
		{0x0B4B, 0x0B4D, 1},
		{0x0B56, 0x0B57, 1},
		{0x0B66, 0x0B6F, 1},
		{0x0B82, 0x0B83, 1},
		{0x0BBE, 0x0BC2, 1},
		{0x0BC6, 0x0BC8, 1},
		{0x0BCA, 0x0BCD, 1},
		{0x0BD7, 0x0BD7, 1},
		{0x0BE7, 0x0BEF, 1},
		{0x0C01, 0x0C03, 1},
		{0x0C3E, 0x0C44, 1},
		{0x0C46, 0x0C48, 1},
		{0x0C4A, 0x0C4D, 1},
		{0x0C55, 0x0C56, 1},
		{0x0C66, 0x0C6F, 1},
		{0x0C82, 0x0C83, 1},
		{0x0CBE, 0x0CC4, 1},
		{0x0CC6, 0x0CC8, 1},
		{0x0CCA, 0x0CCD, 1},
		{0x0CD5, 0x0CD6, 1},
		{0x0CE6, 0x0CEF, 1},
		{0x0D02, 0x0D03, 1},
		{0x0D3E, 0x0D43, 1},
		{0x0D46, 0x0D48, 1},
		{0x0D4A, 0x0D4D, 1},
		{0x0D57, 0x0D57, 1},
		{0x0D66, 0x0D6F, 1},
		{0x0E31, 0x0E31, 1},
		{0x0E34, 0x0E3A, 1},
		{0x0E46, 0x0E46, 1},
		{0x0E47, 0x0E4E, 1},
		{0x0E50, 0x0E59, 1},
		{0x0EB1, 0x0EB1, 1},
		{0x0EB4, 0x0EB9, 1},
		{0x0EBB, 0x0EBC, 1},
		{0x0EC6, 0x0EC6, 1},
		{0x0EC8, 0x0ECD, 1},
		{0x0ED0, 0x0ED9, 1},
		{0x0F18, 0x0F19, 1},
		{0x0F20, 0x0F29, 1},
		{0x0F35, 0x0F39, 2},
		{0x0F3E, 0x0F3F, 1},
		{0x0F71, 0x0F84, 1},
		{0x0F86, 0x0F8B, 1},
		{0x0F90, 0x0F95, 1},
		{0x0F97, 0x0F97, 1},
		{0x0F99, 0x0FAD, 1},
		{0x0FB1, 0x0FB7, 1},
		{0x0FB9, 0x0FB9, 1},
		{0x20D0, 0x20DC, 1},
		{0x20E1, 0x3005, 0x3005 - 0x20E1},
		{0x302A, 0x302F, 1},
		{0x3031, 0x3035, 1},
		{0x3099, 0x309A, 1},
		{0x309D, 0x309E, 1},
		{0x30FC, 0x30FE, 1},
	},
}
| 1570 | |
// HTMLEntity is an entity map containing translations for the
// standard HTML entity characters.
var HTMLEntity = htmlEntity

// htmlEntity is the table behind HTMLEntity. It maps an HTML entity
// name (without the surrounding '&' and ';') to the text it stands for.
// The entries were generated from the HTML 4 entity list with the
// script quoted below.
var htmlEntity = map[string]string{
	/*
		hget http://www.w3.org/TR/html4/sgml/entities.html |
		ssam '
			,y /\>/ x/\<(.|\n)+/ s/\n/ /g
			,x v/^\<!ENTITY/d
			,s/\<!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/ "\1": "\\u\2",/g
		'
	*/
	"nbsp":     "\u00A0",
	"iexcl":    "\u00A1",
	"cent":     "\u00A2",
	"pound":    "\u00A3",
	"curren":   "\u00A4",
	"yen":      "\u00A5",
	"brvbar":   "\u00A6",
	"sect":     "\u00A7",
	"uml":      "\u00A8",
	"copy":     "\u00A9",
	"ordf":     "\u00AA",
	"laquo":    "\u00AB",
	"not":      "\u00AC",
	"shy":      "\u00AD",
	"reg":      "\u00AE",
	"macr":     "\u00AF",
	"deg":      "\u00B0",
	"plusmn":   "\u00B1",
	"sup2":     "\u00B2",
	"sup3":     "\u00B3",
	"acute":    "\u00B4",
	"micro":    "\u00B5",
	"para":     "\u00B6",
	"middot":   "\u00B7",
	"cedil":    "\u00B8",
	"sup1":     "\u00B9",
	"ordm":     "\u00BA",
	"raquo":    "\u00BB",
	"frac14":   "\u00BC",
	"frac12":   "\u00BD",
	"frac34":   "\u00BE",
	"iquest":   "\u00BF",
	"Agrave":   "\u00C0",
	"Aacute":   "\u00C1",
	"Acirc":    "\u00C2",
	"Atilde":   "\u00C3",
	"Auml":     "\u00C4",
	"Aring":    "\u00C5",
	"AElig":    "\u00C6",
	"Ccedil":   "\u00C7",
	"Egrave":   "\u00C8",
	"Eacute":   "\u00C9",
	"Ecirc":    "\u00CA",
	"Euml":     "\u00CB",
	"Igrave":   "\u00CC",
	"Iacute":   "\u00CD",
	"Icirc":    "\u00CE",
	"Iuml":     "\u00CF",
	"ETH":      "\u00D0",
	"Ntilde":   "\u00D1",
	"Ograve":   "\u00D2",
	"Oacute":   "\u00D3",
	"Ocirc":    "\u00D4",
	"Otilde":   "\u00D5",
	"Ouml":     "\u00D6",
	"times":    "\u00D7",
	"Oslash":   "\u00D8",
	"Ugrave":   "\u00D9",
	"Uacute":   "\u00DA",
	"Ucirc":    "\u00DB",
	"Uuml":     "\u00DC",
	"Yacute":   "\u00DD",
	"THORN":    "\u00DE",
	"szlig":    "\u00DF",
	"agrave":   "\u00E0",
	"aacute":   "\u00E1",
	"acirc":    "\u00E2",
	"atilde":   "\u00E3",
	"auml":     "\u00E4",
	"aring":    "\u00E5",
	"aelig":    "\u00E6",
	"ccedil":   "\u00E7",
	"egrave":   "\u00E8",
	"eacute":   "\u00E9",
	"ecirc":    "\u00EA",
	"euml":     "\u00EB",
	"igrave":   "\u00EC",
	"iacute":   "\u00ED",
	"icirc":    "\u00EE",
	"iuml":     "\u00EF",
	"eth":      "\u00F0",
	"ntilde":   "\u00F1",
	"ograve":   "\u00F2",
	"oacute":   "\u00F3",
	"ocirc":    "\u00F4",
	"otilde":   "\u00F5",
	"ouml":     "\u00F6",
	"divide":   "\u00F7",
	"oslash":   "\u00F8",
	"ugrave":   "\u00F9",
	"uacute":   "\u00FA",
	"ucirc":    "\u00FB",
	"uuml":     "\u00FC",
	"yacute":   "\u00FD",
	"thorn":    "\u00FE",
	"yuml":     "\u00FF",
	"fnof":     "\u0192",
	"Alpha":    "\u0391",
	"Beta":     "\u0392",
	"Gamma":    "\u0393",
	"Delta":    "\u0394",
	"Epsilon":  "\u0395",
	"Zeta":     "\u0396",
	"Eta":      "\u0397",
	"Theta":    "\u0398",
	"Iota":     "\u0399",
	"Kappa":    "\u039A",
	"Lambda":   "\u039B",
	"Mu":       "\u039C",
	"Nu":       "\u039D",
	"Xi":       "\u039E",
	"Omicron":  "\u039F",
	"Pi":       "\u03A0",
	"Rho":      "\u03A1",
	"Sigma":    "\u03A3",
	"Tau":      "\u03A4",
	"Upsilon":  "\u03A5",
	"Phi":      "\u03A6",
	"Chi":      "\u03A7",
	"Psi":      "\u03A8",
	"Omega":    "\u03A9",
	"alpha":    "\u03B1",
	"beta":     "\u03B2",
	"gamma":    "\u03B3",
	"delta":    "\u03B4",
	"epsilon":  "\u03B5",
	"zeta":     "\u03B6",
	"eta":      "\u03B7",
	"theta":    "\u03B8",
	"iota":     "\u03B9",
	"kappa":    "\u03BA",
	"lambda":   "\u03BB",
	"mu":       "\u03BC",
	"nu":       "\u03BD",
	"xi":       "\u03BE",
	"omicron":  "\u03BF",
	"pi":       "\u03C0",
	"rho":      "\u03C1",
	"sigmaf":   "\u03C2",
	"sigma":    "\u03C3",
	"tau":      "\u03C4",
	"upsilon":  "\u03C5",
	"phi":      "\u03C6",
	"chi":      "\u03C7",
	"psi":      "\u03C8",
	"omega":    "\u03C9",
	"thetasym": "\u03D1",
	"upsih":    "\u03D2",
	"piv":      "\u03D6",
	"bull":     "\u2022",
	"hellip":   "\u2026",
	"prime":    "\u2032",
	"Prime":    "\u2033",
	"oline":    "\u203E",
	"frasl":    "\u2044",
	"weierp":   "\u2118",
	"image":    "\u2111",
	"real":     "\u211C",
	"trade":    "\u2122",
	"alefsym":  "\u2135",
	"larr":     "\u2190",
	"uarr":     "\u2191",
	"rarr":     "\u2192",
	"darr":     "\u2193",
	"harr":     "\u2194",
	"crarr":    "\u21B5",
	"lArr":     "\u21D0",
	"uArr":     "\u21D1",
	"rArr":     "\u21D2",
	"dArr":     "\u21D3",
	"hArr":     "\u21D4",
	"forall":   "\u2200",
	"part":     "\u2202",
	"exist":    "\u2203",
	"empty":    "\u2205",
	"nabla":    "\u2207",
	"isin":     "\u2208",
	"notin":    "\u2209",
	"ni":       "\u220B",
	"prod":     "\u220F",
	"sum":      "\u2211",
	"minus":    "\u2212",
	"lowast":   "\u2217",
	"radic":    "\u221A",
	"prop":     "\u221D",
	"infin":    "\u221E",
	"ang":      "\u2220",
	"and":      "\u2227",
	"or":       "\u2228",
	"cap":      "\u2229",
	"cup":      "\u222A",
	"int":      "\u222B",
	"there4":   "\u2234",
	"sim":      "\u223C",
	"cong":     "\u2245",
	"asymp":    "\u2248",
	"ne":       "\u2260",
	"equiv":    "\u2261",
	"le":       "\u2264",
	"ge":       "\u2265",
	"sub":      "\u2282",
	"sup":      "\u2283",
	"nsub":     "\u2284",
	"sube":     "\u2286",
	"supe":     "\u2287",
	"oplus":    "\u2295",
	"otimes":   "\u2297",
	"perp":     "\u22A5",
	"sdot":     "\u22C5",
	"lceil":    "\u2308",
	"rceil":    "\u2309",
	"lfloor":   "\u230A",
	"rfloor":   "\u230B",
	"lang":     "\u2329",
	"rang":     "\u232A",
	"loz":      "\u25CA",
	"spades":   "\u2660",
	"clubs":    "\u2663",
	"hearts":   "\u2665",
	"diams":    "\u2666",
	"quot":     "\u0022",
	"amp":      "\u0026",
	"lt":       "\u003C",
	"gt":       "\u003E",
	"OElig":    "\u0152",
	"oelig":    "\u0153",
	"Scaron":   "\u0160",
	"scaron":   "\u0161",
	"Yuml":     "\u0178",
	"circ":     "\u02C6",
	"tilde":    "\u02DC",
	"ensp":     "\u2002",
	"emsp":     "\u2003",
	"thinsp":   "\u2009",
	"zwnj":     "\u200C",
	"zwj":      "\u200D",
	"lrm":      "\u200E",
	"rlm":      "\u200F",
	"ndash":    "\u2013",
	"mdash":    "\u2014",
	"lsquo":    "\u2018",
	"rsquo":    "\u2019",
	"sbquo":    "\u201A",
	"ldquo":    "\u201C",
	"rdquo":    "\u201D",
	"bdquo":    "\u201E",
	"dagger":   "\u2020",
	"Dagger":   "\u2021",
	"permil":   "\u2030",
	"lsaquo":   "\u2039",
	"rsaquo":   "\u203A",
	"euro":     "\u20AC",
}
| 1837 | |
// HTMLAutoClose is the set of HTML elements that
// should be considered to close automatically.
var HTMLAutoClose = htmlAutoClose

// htmlAutoClose is the list behind HTMLAutoClose. The names are the
// lower-cased elements that the HTML 4 loose DTD declares as
// "- O EMPTY", extracted with the command quoted below.
var htmlAutoClose = []string{
	/*
		hget http://www.w3.org/TR/html4/loose.dtd |
		9 sed -n 's/<!ELEMENT ([^ ]*) +- O EMPTY.+/ "\1",/p' | tr A-Z a-z
	*/
	"basefont",
	"br",
	"area",
	"link",
	"img",
	"param",
	"hr",
	"input",
	"col",
	"frame",
	"isindex",
	"base",
	"meta",
}
| 1861 | |
// Replacement sequences written by escapeText and EscapeString in place
// of characters that must not appear literally in escaped XML text.
// Each value is the escaped form, not the raw character; numeric
// character references are used where they are shorter than the named
// entity or where no named entity exists.
var (
	esc_quot = []byte("&#34;") // shorter than "&quot;"
	esc_apos = []byte("&#39;") // shorter than "&apos;"
	esc_amp  = []byte("&amp;")
	esc_lt   = []byte("&lt;")
	esc_gt   = []byte("&gt;")
	esc_tab  = []byte("&#x9;")
	esc_nl   = []byte("&#xA;")
	esc_cr   = []byte("&#xD;")
	esc_fffd = []byte("\uFFFD") // Unicode replacement character
)
| 1873 | |
// EscapeText writes to w the properly escaped XML equivalent
// of the plain text data s.
// Newline characters are escaped as well. The error returned is
// whatever escapeText reports, which comes from writing to w.
func EscapeText(w io.Writer, s []byte) error {
	return escapeText(w, s, true)
}
| 1879 | |
| 1880 | // escapeText writes to w the properly escaped XML equivalent |
| 1881 | // of the plain text data s. If escapeNewline is true, newline |
| 1882 | // characters will be escaped. |
| 1883 | func escapeText(w io.Writer, s []byte, escapeNewline bool) error { |
| 1884 | var esc []byte |
| 1885 | last := 0 |
| 1886 | for i := 0; i < len(s); { |
| 1887 | r, width := utf8.DecodeRune(s[i:]) |
| 1888 | i += width |
| 1889 | switch r { |
| 1890 | case '"': |
| 1891 | esc = esc_quot |
| 1892 | case '\'': |
| 1893 | esc = esc_apos |
| 1894 | case '&': |
| 1895 | esc = esc_amp |
| 1896 | case '<': |
| 1897 | esc = esc_lt |
| 1898 | case '>': |
| 1899 | esc = esc_gt |
| 1900 | case '\t': |
| 1901 | esc = esc_tab |
| 1902 | case '\n': |
| 1903 | if !escapeNewline { |
| 1904 | continue |
| 1905 | } |
| 1906 | esc = esc_nl |
| 1907 | case '\r': |
| 1908 | esc = esc_cr |
| 1909 | default: |
| 1910 | if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) { |
| 1911 | esc = esc_fffd |
| 1912 | break |
| 1913 | } |
| 1914 | continue |
| 1915 | } |
| 1916 | if _, err := w.Write(s[last : i-width]); err != nil { |
| 1917 | return err |
| 1918 | } |
| 1919 | if _, err := w.Write(esc); err != nil { |
| 1920 | return err |
| 1921 | } |
| 1922 | last = i |
| 1923 | } |
| 1924 | if _, err := w.Write(s[last:]); err != nil { |
| 1925 | return err |
| 1926 | } |
| 1927 | return nil |
| 1928 | } |
| 1929 | |
| 1930 | // EscapeString writes to p the properly escaped XML equivalent |
| 1931 | // of the plain text data s. |
| 1932 | func (p *printer) EscapeString(s string) { |
| 1933 | var esc []byte |
| 1934 | last := 0 |
| 1935 | for i := 0; i < len(s); { |
| 1936 | r, width := utf8.DecodeRuneInString(s[i:]) |
| 1937 | i += width |
| 1938 | switch r { |
| 1939 | case '"': |
| 1940 | esc = esc_quot |
| 1941 | case '\'': |
| 1942 | esc = esc_apos |
| 1943 | case '&': |
| 1944 | esc = esc_amp |
| 1945 | case '<': |
| 1946 | esc = esc_lt |
| 1947 | case '>': |
| 1948 | esc = esc_gt |
| 1949 | case '\t': |
| 1950 | esc = esc_tab |
| 1951 | case '\n': |
| 1952 | esc = esc_nl |
| 1953 | case '\r': |
| 1954 | esc = esc_cr |
| 1955 | default: |
| 1956 | if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) { |
| 1957 | esc = esc_fffd |
| 1958 | break |
| 1959 | } |
| 1960 | continue |
| 1961 | } |
| 1962 | p.WriteString(s[last : i-width]) |
| 1963 | p.Write(esc) |
| 1964 | last = i |
| 1965 | } |
| 1966 | p.WriteString(s[last:]) |
| 1967 | } |
| 1968 | |
// Escape is like EscapeText but omits the error return value.
// It is provided for backwards compatibility with Go 1.0.
// Code targeting Go 1.1 or later should use EscapeText.
func Escape(w io.Writer, s []byte) {
	// The error from EscapeText is dropped on purpose: this legacy
	// signature has no way to report it.
	EscapeText(w, s)
}
| 1975 | |
// procInst extracts the value of the pseudo-attribute param from s,
// which is expected to contain `param="..."` or `param='...'`.
// It returns "" if param= does not occur in s, if its first occurrence
// is not immediately followed by a quote, or if the closing quote is
// missing.
func procInst(param, s string) string {
	// TODO: this parsing is somewhat lame and not exact.
	// It works for all actual cases, though.
	key := param + "="
	at := strings.Index(s, key)
	if at < 0 {
		return ""
	}
	rest := s[at+len(key):]
	if len(rest) == 0 {
		return ""
	}
	quote := rest[0]
	if quote != '"' && quote != '\'' {
		return ""
	}
	value := rest[1:]
	end := strings.IndexByte(value, quote)
	if end < 0 {
		return ""
	}
	return value[:end]
}