// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| 4 | |
| 5 | package html |
| 6 | |
| 7 | import ( |
| 8 | "bytes" |
| 9 | "errors" |
| 10 | "io" |
| 11 | "strconv" |
| 12 | "strings" |
| 13 | |
| 14 | "golang.org/x/net/html/atom" |
| 15 | ) |
| 16 | |
// TokenType identifies the kind of a Token (text, start tag, comment, ...).
type TokenType uint32
| 19 | |
| 20 | const ( |
| 21 | // ErrorToken means that an error occurred during tokenization. |
| 22 | ErrorToken TokenType = iota |
| 23 | // TextToken means a text node. |
| 24 | TextToken |
| 25 | // A StartTagToken looks like <a>. |
| 26 | StartTagToken |
| 27 | // An EndTagToken looks like </a>. |
| 28 | EndTagToken |
| 29 | // A SelfClosingTagToken tag looks like <br/>. |
| 30 | SelfClosingTagToken |
| 31 | // A CommentToken looks like <!--x-->. |
| 32 | CommentToken |
| 33 | // A DoctypeToken looks like <!DOCTYPE x> |
| 34 | DoctypeToken |
| 35 | ) |
| 36 | |
// ErrBufferExceeded is the error reported when the tokenizer's buffering
// limit has been exceeded.
var ErrBufferExceeded error = errors.New("max buffer exceeded")
| 39 | |
| 40 | // String returns a string representation of the TokenType. |
| 41 | func (t TokenType) String() string { |
| 42 | switch t { |
| 43 | case ErrorToken: |
| 44 | return "Error" |
| 45 | case TextToken: |
| 46 | return "Text" |
| 47 | case StartTagToken: |
| 48 | return "StartTag" |
| 49 | case EndTagToken: |
| 50 | return "EndTag" |
| 51 | case SelfClosingTagToken: |
| 52 | return "SelfClosingTag" |
| 53 | case CommentToken: |
| 54 | return "Comment" |
| 55 | case DoctypeToken: |
| 56 | return "Doctype" |
| 57 | } |
| 58 | return "Invalid(" + strconv.Itoa(int(t)) + ")" |
| 59 | } |
| 60 | |
// An Attribute is a namespace-key-value triple describing one attribute.
// Namespace is non-empty for foreign attributes like xlink. Key is alphabetic
// (and hence does not contain escapable characters like '&', '<' or '>').
// Val is unescaped: it looks like "a<b" rather than "a&lt;b".
//
// Namespace is only used by the parser, not the tokenizer.
type Attribute struct {
	Namespace string
	Key       string
	Val       string
}
| 70 | |
| 71 | // A Token consists of a TokenType and some Data (tag name for start and end |
| 72 | // tags, content for text, comments and doctypes). A tag Token may also contain |
| 73 | // a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b" |
| 74 | // rather than "a<b"). For tag Tokens, DataAtom is the atom for Data, or |
| 75 | // zero if Data is not a known tag name. |
| 76 | type Token struct { |
| 77 | Type TokenType |
| 78 | DataAtom atom.Atom |
| 79 | Data string |
| 80 | Attr []Attribute |
| 81 | } |
| 82 | |
| 83 | // tagString returns a string representation of a tag Token's Data and Attr. |
| 84 | func (t Token) tagString() string { |
| 85 | if len(t.Attr) == 0 { |
| 86 | return t.Data |
| 87 | } |
| 88 | buf := bytes.NewBufferString(t.Data) |
| 89 | for _, a := range t.Attr { |
| 90 | buf.WriteByte(' ') |
| 91 | buf.WriteString(a.Key) |
| 92 | buf.WriteString(`="`) |
| 93 | escape(buf, a.Val) |
| 94 | buf.WriteByte('"') |
| 95 | } |
| 96 | return buf.String() |
| 97 | } |
| 98 | |
| 99 | // String returns a string representation of the Token. |
| 100 | func (t Token) String() string { |
| 101 | switch t.Type { |
| 102 | case ErrorToken: |
| 103 | return "" |
| 104 | case TextToken: |
| 105 | return EscapeString(t.Data) |
| 106 | case StartTagToken: |
| 107 | return "<" + t.tagString() + ">" |
| 108 | case EndTagToken: |
| 109 | return "</" + t.tagString() + ">" |
| 110 | case SelfClosingTagToken: |
| 111 | return "<" + t.tagString() + "/>" |
| 112 | case CommentToken: |
| 113 | return "<!--" + t.Data + "-->" |
| 114 | case DoctypeToken: |
| 115 | return "<!DOCTYPE " + t.Data + ">" |
| 116 | } |
| 117 | return "Invalid(" + strconv.Itoa(int(t.Type)) + ")" |
| 118 | } |
| 119 | |
// span describes a half-open byte range [start, end) within a Tokenizer's
// buffer: start is inclusive and end is exclusive.
type span struct {
	start int
	end   int
}
| 125 | |
| 126 | // A Tokenizer returns a stream of HTML Tokens. |
| 127 | type Tokenizer struct { |
| 128 | // r is the source of the HTML text. |
| 129 | r io.Reader |
| 130 | // tt is the TokenType of the current token. |
| 131 | tt TokenType |
| 132 | // err is the first error encountered during tokenization. It is possible |
| 133 | // for tt != Error && err != nil to hold: this means that Next returned a |
| 134 | // valid token but the subsequent Next call will return an error token. |
| 135 | // For example, if the HTML text input was just "plain", then the first |
| 136 | // Next call would set z.err to io.EOF but return a TextToken, and all |
| 137 | // subsequent Next calls would return an ErrorToken. |
| 138 | // err is never reset. Once it becomes non-nil, it stays non-nil. |
| 139 | err error |
| 140 | // readErr is the error returned by the io.Reader r. It is separate from |
| 141 | // err because it is valid for an io.Reader to return (n int, err1 error) |
| 142 | // such that n > 0 && err1 != nil, and callers should always process the |
| 143 | // n > 0 bytes before considering the error err1. |
| 144 | readErr error |
| 145 | // buf[raw.start:raw.end] holds the raw bytes of the current token. |
| 146 | // buf[raw.end:] is buffered input that will yield future tokens. |
| 147 | raw span |
| 148 | buf []byte |
| 149 | // maxBuf limits the data buffered in buf. A value of 0 means unlimited. |
| 150 | maxBuf int |
| 151 | // buf[data.start:data.end] holds the raw bytes of the current token's data: |
| 152 | // a text token's text, a tag token's tag name, etc. |
| 153 | data span |
| 154 | // pendingAttr is the attribute key and value currently being tokenized. |
| 155 | // When complete, pendingAttr is pushed onto attr. nAttrReturned is |
| 156 | // incremented on each call to TagAttr. |
| 157 | pendingAttr [2]span |
| 158 | attr [][2]span |
| 159 | nAttrReturned int |
| 160 | // rawTag is the "script" in "</script>" that closes the next token. If |
| 161 | // non-empty, the subsequent call to Next will return a raw or RCDATA text |
| 162 | // token: one that treats "<p>" as text instead of an element. |
| 163 | // rawTag's contents are lower-cased. |
| 164 | rawTag string |
| 165 | // textIsRaw is whether the current text token's data is not escaped. |
| 166 | textIsRaw bool |
| 167 | // convertNUL is whether NUL bytes in the current token's data should |
| 168 | // be converted into \ufffd replacement characters. |
| 169 | convertNUL bool |
| 170 | // allowCDATA is whether CDATA sections are allowed in the current context. |
| 171 | allowCDATA bool |
| 172 | } |
| 173 | |
| 174 | // AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as |
| 175 | // the text "foo". The default value is false, which means to recognize it as |
| 176 | // a bogus comment "<!-- [CDATA[foo]] -->" instead. |
| 177 | // |
| 178 | // Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and |
| 179 | // only if tokenizing foreign content, such as MathML and SVG. However, |
| 180 | // tracking foreign-contentness is difficult to do purely in the tokenizer, |
| 181 | // as opposed to the parser, due to HTML integration points: an <svg> element |
| 182 | // can contain a <foreignObject> that is foreign-to-SVG but not foreign-to- |
| 183 | // HTML. For strict compliance with the HTML5 tokenization algorithm, it is the |
| 184 | // responsibility of the user of a tokenizer to call AllowCDATA as appropriate. |
| 185 | // In practice, if using the tokenizer without caring whether MathML or SVG |
| 186 | // CDATA is text or comments, such as tokenizing HTML to find all the anchor |
| 187 | // text, it is acceptable to ignore this responsibility. |
| 188 | func (z *Tokenizer) AllowCDATA(allowCDATA bool) { |
| 189 | z.allowCDATA = allowCDATA |
| 190 | } |
| 191 | |
| 192 | // NextIsNotRawText instructs the tokenizer that the next token should not be |
| 193 | // considered as 'raw text'. Some elements, such as script and title elements, |
| 194 | // normally require the next token after the opening tag to be 'raw text' that |
| 195 | // has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>" |
| 196 | // yields a start tag token for "<title>", a text token for "a<b>c</b>d", and |
| 197 | // an end tag token for "</title>". There are no distinct start tag or end tag |
| 198 | // tokens for the "<b>" and "</b>". |
| 199 | // |
| 200 | // This tokenizer implementation will generally look for raw text at the right |
| 201 | // times. Strictly speaking, an HTML5 compliant tokenizer should not look for |
| 202 | // raw text if in foreign content: <title> generally needs raw text, but a |
| 203 | // <title> inside an <svg> does not. Another example is that a <textarea> |
| 204 | // generally needs raw text, but a <textarea> is not allowed as an immediate |
| 205 | // child of a <select>; in normal parsing, a <textarea> implies </select>, but |
| 206 | // one cannot close the implicit element when parsing a <select>'s InnerHTML. |
| 207 | // Similarly to AllowCDATA, tracking the correct moment to override raw-text- |
| 208 | // ness is difficult to do purely in the tokenizer, as opposed to the parser. |
| 209 | // For strict compliance with the HTML5 tokenization algorithm, it is the |
| 210 | // responsibility of the user of a tokenizer to call NextIsNotRawText as |
| 211 | // appropriate. In practice, like AllowCDATA, it is acceptable to ignore this |
| 212 | // responsibility for basic usage. |
| 213 | // |
| 214 | // Note that this 'raw text' concept is different from the one offered by the |
| 215 | // Tokenizer.Raw method. |
| 216 | func (z *Tokenizer) NextIsNotRawText() { |
| 217 | z.rawTag = "" |
| 218 | } |
| 219 | |
| 220 | // Err returns the error associated with the most recent ErrorToken token. |
| 221 | // This is typically io.EOF, meaning the end of tokenization. |
| 222 | func (z *Tokenizer) Err() error { |
| 223 | if z.tt != ErrorToken { |
| 224 | return nil |
| 225 | } |
| 226 | return z.err |
| 227 | } |
| 228 | |
| 229 | // readByte returns the next byte from the input stream, doing a buffered read |
| 230 | // from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte |
| 231 | // slice that holds all the bytes read so far for the current token. |
| 232 | // It sets z.err if the underlying reader returns an error. |
| 233 | // Pre-condition: z.err == nil. |
| 234 | func (z *Tokenizer) readByte() byte { |
| 235 | if z.raw.end >= len(z.buf) { |
| 236 | // Our buffer is exhausted and we have to read from z.r. Check if the |
| 237 | // previous read resulted in an error. |
| 238 | if z.readErr != nil { |
| 239 | z.err = z.readErr |
| 240 | return 0 |
| 241 | } |
| 242 | // We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length |
| 243 | // z.raw.end - z.raw.start is more than half the capacity of z.buf, then we |
| 244 | // allocate a new buffer before the copy. |
| 245 | c := cap(z.buf) |
| 246 | d := z.raw.end - z.raw.start |
| 247 | var buf1 []byte |
| 248 | if 2*d > c { |
| 249 | buf1 = make([]byte, d, 2*c) |
| 250 | } else { |
| 251 | buf1 = z.buf[:d] |
| 252 | } |
| 253 | copy(buf1, z.buf[z.raw.start:z.raw.end]) |
| 254 | if x := z.raw.start; x != 0 { |
| 255 | // Adjust the data/attr spans to refer to the same contents after the copy. |
| 256 | z.data.start -= x |
| 257 | z.data.end -= x |
| 258 | z.pendingAttr[0].start -= x |
| 259 | z.pendingAttr[0].end -= x |
| 260 | z.pendingAttr[1].start -= x |
| 261 | z.pendingAttr[1].end -= x |
| 262 | for i := range z.attr { |
| 263 | z.attr[i][0].start -= x |
| 264 | z.attr[i][0].end -= x |
| 265 | z.attr[i][1].start -= x |
| 266 | z.attr[i][1].end -= x |
| 267 | } |
| 268 | } |
| 269 | z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d] |
| 270 | // Now that we have copied the live bytes to the start of the buffer, |
| 271 | // we read from z.r into the remainder. |
| 272 | var n int |
| 273 | n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)]) |
| 274 | if n == 0 { |
| 275 | z.err = z.readErr |
| 276 | return 0 |
| 277 | } |
| 278 | z.buf = buf1[:d+n] |
| 279 | } |
| 280 | x := z.buf[z.raw.end] |
| 281 | z.raw.end++ |
| 282 | if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf { |
| 283 | z.err = ErrBufferExceeded |
| 284 | return 0 |
| 285 | } |
| 286 | return x |
| 287 | } |
| 288 | |
| 289 | // Buffered returns a slice containing data buffered but not yet tokenized. |
| 290 | func (z *Tokenizer) Buffered() []byte { |
| 291 | return z.buf[z.raw.end:] |
| 292 | } |
| 293 | |
// readAtLeastOneByte reads from r into b, retrying while r.Read returns
// (0, nil), so that the caller never observes (0, nil). After 100 such
// empty reads in succession it gives up and returns io.ErrNoProgress.
func readAtLeastOneByte(r io.Reader, b []byte) (int, error) {
	for attemptsLeft := 100; attemptsLeft > 0; attemptsLeft-- {
		if n, err := r.Read(b); n != 0 || err != nil {
			return n, err
		}
	}
	return 0, io.ErrNoProgress
}
| 306 | |
| 307 | // skipWhiteSpace skips past any white space. |
| 308 | func (z *Tokenizer) skipWhiteSpace() { |
| 309 | if z.err != nil { |
| 310 | return |
| 311 | } |
| 312 | for { |
| 313 | c := z.readByte() |
| 314 | if z.err != nil { |
| 315 | return |
| 316 | } |
| 317 | switch c { |
| 318 | case ' ', '\n', '\r', '\t', '\f': |
| 319 | // No-op. |
| 320 | default: |
| 321 | z.raw.end-- |
| 322 | return |
| 323 | } |
| 324 | } |
| 325 | } |
| 326 | |
| 327 | // readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and |
| 328 | // is typically something like "script" or "textarea". |
| 329 | func (z *Tokenizer) readRawOrRCDATA() { |
| 330 | if z.rawTag == "script" { |
| 331 | z.readScript() |
| 332 | z.textIsRaw = true |
| 333 | z.rawTag = "" |
| 334 | return |
| 335 | } |
| 336 | loop: |
| 337 | for { |
| 338 | c := z.readByte() |
| 339 | if z.err != nil { |
| 340 | break loop |
| 341 | } |
| 342 | if c != '<' { |
| 343 | continue loop |
| 344 | } |
| 345 | c = z.readByte() |
| 346 | if z.err != nil { |
| 347 | break loop |
| 348 | } |
| 349 | if c != '/' { |
| 350 | continue loop |
| 351 | } |
| 352 | if z.readRawEndTag() || z.err != nil { |
| 353 | break loop |
| 354 | } |
| 355 | } |
| 356 | z.data.end = z.raw.end |
| 357 | // A textarea's or title's RCDATA can contain escaped entities. |
| 358 | z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title" |
| 359 | z.rawTag = "" |
| 360 | } |
| 361 | |
| 362 | // readRawEndTag attempts to read a tag like "</foo>", where "foo" is z.rawTag. |
| 363 | // If it succeeds, it backs up the input position to reconsume the tag and |
| 364 | // returns true. Otherwise it returns false. The opening "</" has already been |
| 365 | // consumed. |
| 366 | func (z *Tokenizer) readRawEndTag() bool { |
| 367 | for i := 0; i < len(z.rawTag); i++ { |
| 368 | c := z.readByte() |
| 369 | if z.err != nil { |
| 370 | return false |
| 371 | } |
| 372 | if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') { |
| 373 | z.raw.end-- |
| 374 | return false |
| 375 | } |
| 376 | } |
| 377 | c := z.readByte() |
| 378 | if z.err != nil { |
| 379 | return false |
| 380 | } |
| 381 | switch c { |
| 382 | case ' ', '\n', '\r', '\t', '\f', '/', '>': |
| 383 | // The 3 is 2 for the leading "</" plus 1 for the trailing character c. |
| 384 | z.raw.end -= 3 + len(z.rawTag) |
| 385 | return true |
| 386 | } |
| 387 | z.raw.end-- |
| 388 | return false |
| 389 | } |
| 390 | |
| 391 | // readScript reads until the next </script> tag, following the byzantine |
| 392 | // rules for escaping/hiding the closing tag. |
| 393 | func (z *Tokenizer) readScript() { |
| 394 | defer func() { |
| 395 | z.data.end = z.raw.end |
| 396 | }() |
| 397 | var c byte |
| 398 | |
| 399 | scriptData: |
| 400 | c = z.readByte() |
| 401 | if z.err != nil { |
| 402 | return |
| 403 | } |
| 404 | if c == '<' { |
| 405 | goto scriptDataLessThanSign |
| 406 | } |
| 407 | goto scriptData |
| 408 | |
| 409 | scriptDataLessThanSign: |
| 410 | c = z.readByte() |
| 411 | if z.err != nil { |
| 412 | return |
| 413 | } |
| 414 | switch c { |
| 415 | case '/': |
| 416 | goto scriptDataEndTagOpen |
| 417 | case '!': |
| 418 | goto scriptDataEscapeStart |
| 419 | } |
| 420 | z.raw.end-- |
| 421 | goto scriptData |
| 422 | |
| 423 | scriptDataEndTagOpen: |
| 424 | if z.readRawEndTag() || z.err != nil { |
| 425 | return |
| 426 | } |
| 427 | goto scriptData |
| 428 | |
| 429 | scriptDataEscapeStart: |
| 430 | c = z.readByte() |
| 431 | if z.err != nil { |
| 432 | return |
| 433 | } |
| 434 | if c == '-' { |
| 435 | goto scriptDataEscapeStartDash |
| 436 | } |
| 437 | z.raw.end-- |
| 438 | goto scriptData |
| 439 | |
| 440 | scriptDataEscapeStartDash: |
| 441 | c = z.readByte() |
| 442 | if z.err != nil { |
| 443 | return |
| 444 | } |
| 445 | if c == '-' { |
| 446 | goto scriptDataEscapedDashDash |
| 447 | } |
| 448 | z.raw.end-- |
| 449 | goto scriptData |
| 450 | |
| 451 | scriptDataEscaped: |
| 452 | c = z.readByte() |
| 453 | if z.err != nil { |
| 454 | return |
| 455 | } |
| 456 | switch c { |
| 457 | case '-': |
| 458 | goto scriptDataEscapedDash |
| 459 | case '<': |
| 460 | goto scriptDataEscapedLessThanSign |
| 461 | } |
| 462 | goto scriptDataEscaped |
| 463 | |
| 464 | scriptDataEscapedDash: |
| 465 | c = z.readByte() |
| 466 | if z.err != nil { |
| 467 | return |
| 468 | } |
| 469 | switch c { |
| 470 | case '-': |
| 471 | goto scriptDataEscapedDashDash |
| 472 | case '<': |
| 473 | goto scriptDataEscapedLessThanSign |
| 474 | } |
| 475 | goto scriptDataEscaped |
| 476 | |
| 477 | scriptDataEscapedDashDash: |
| 478 | c = z.readByte() |
| 479 | if z.err != nil { |
| 480 | return |
| 481 | } |
| 482 | switch c { |
| 483 | case '-': |
| 484 | goto scriptDataEscapedDashDash |
| 485 | case '<': |
| 486 | goto scriptDataEscapedLessThanSign |
| 487 | case '>': |
| 488 | goto scriptData |
| 489 | } |
| 490 | goto scriptDataEscaped |
| 491 | |
| 492 | scriptDataEscapedLessThanSign: |
| 493 | c = z.readByte() |
| 494 | if z.err != nil { |
| 495 | return |
| 496 | } |
| 497 | if c == '/' { |
| 498 | goto scriptDataEscapedEndTagOpen |
| 499 | } |
| 500 | if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { |
| 501 | goto scriptDataDoubleEscapeStart |
| 502 | } |
| 503 | z.raw.end-- |
| 504 | goto scriptData |
| 505 | |
| 506 | scriptDataEscapedEndTagOpen: |
| 507 | if z.readRawEndTag() || z.err != nil { |
| 508 | return |
| 509 | } |
| 510 | goto scriptDataEscaped |
| 511 | |
| 512 | scriptDataDoubleEscapeStart: |
| 513 | z.raw.end-- |
| 514 | for i := 0; i < len("script"); i++ { |
| 515 | c = z.readByte() |
| 516 | if z.err != nil { |
| 517 | return |
| 518 | } |
| 519 | if c != "script"[i] && c != "SCRIPT"[i] { |
| 520 | z.raw.end-- |
| 521 | goto scriptDataEscaped |
| 522 | } |
| 523 | } |
| 524 | c = z.readByte() |
| 525 | if z.err != nil { |
| 526 | return |
| 527 | } |
| 528 | switch c { |
| 529 | case ' ', '\n', '\r', '\t', '\f', '/', '>': |
| 530 | goto scriptDataDoubleEscaped |
| 531 | } |
| 532 | z.raw.end-- |
| 533 | goto scriptDataEscaped |
| 534 | |
| 535 | scriptDataDoubleEscaped: |
| 536 | c = z.readByte() |
| 537 | if z.err != nil { |
| 538 | return |
| 539 | } |
| 540 | switch c { |
| 541 | case '-': |
| 542 | goto scriptDataDoubleEscapedDash |
| 543 | case '<': |
| 544 | goto scriptDataDoubleEscapedLessThanSign |
| 545 | } |
| 546 | goto scriptDataDoubleEscaped |
| 547 | |
| 548 | scriptDataDoubleEscapedDash: |
| 549 | c = z.readByte() |
| 550 | if z.err != nil { |
| 551 | return |
| 552 | } |
| 553 | switch c { |
| 554 | case '-': |
| 555 | goto scriptDataDoubleEscapedDashDash |
| 556 | case '<': |
| 557 | goto scriptDataDoubleEscapedLessThanSign |
| 558 | } |
| 559 | goto scriptDataDoubleEscaped |
| 560 | |
| 561 | scriptDataDoubleEscapedDashDash: |
| 562 | c = z.readByte() |
| 563 | if z.err != nil { |
| 564 | return |
| 565 | } |
| 566 | switch c { |
| 567 | case '-': |
| 568 | goto scriptDataDoubleEscapedDashDash |
| 569 | case '<': |
| 570 | goto scriptDataDoubleEscapedLessThanSign |
| 571 | case '>': |
| 572 | goto scriptData |
| 573 | } |
| 574 | goto scriptDataDoubleEscaped |
| 575 | |
| 576 | scriptDataDoubleEscapedLessThanSign: |
| 577 | c = z.readByte() |
| 578 | if z.err != nil { |
| 579 | return |
| 580 | } |
| 581 | if c == '/' { |
| 582 | goto scriptDataDoubleEscapeEnd |
| 583 | } |
| 584 | z.raw.end-- |
| 585 | goto scriptDataDoubleEscaped |
| 586 | |
| 587 | scriptDataDoubleEscapeEnd: |
| 588 | if z.readRawEndTag() { |
| 589 | z.raw.end += len("</script>") |
| 590 | goto scriptDataEscaped |
| 591 | } |
| 592 | if z.err != nil { |
| 593 | return |
| 594 | } |
| 595 | goto scriptDataDoubleEscaped |
| 596 | } |
| 597 | |
| 598 | // readComment reads the next comment token starting with "<!--". The opening |
| 599 | // "<!--" has already been consumed. |
| 600 | func (z *Tokenizer) readComment() { |
| 601 | z.data.start = z.raw.end |
| 602 | defer func() { |
| 603 | if z.data.end < z.data.start { |
| 604 | // It's a comment with no data, like <!-->. |
| 605 | z.data.end = z.data.start |
| 606 | } |
| 607 | }() |
| 608 | for dashCount := 2; ; { |
| 609 | c := z.readByte() |
| 610 | if z.err != nil { |
| 611 | // Ignore up to two dashes at EOF. |
| 612 | if dashCount > 2 { |
| 613 | dashCount = 2 |
| 614 | } |
| 615 | z.data.end = z.raw.end - dashCount |
| 616 | return |
| 617 | } |
| 618 | switch c { |
| 619 | case '-': |
| 620 | dashCount++ |
| 621 | continue |
| 622 | case '>': |
| 623 | if dashCount >= 2 { |
| 624 | z.data.end = z.raw.end - len("-->") |
| 625 | return |
| 626 | } |
| 627 | case '!': |
| 628 | if dashCount >= 2 { |
| 629 | c = z.readByte() |
| 630 | if z.err != nil { |
| 631 | z.data.end = z.raw.end |
| 632 | return |
| 633 | } |
| 634 | if c == '>' { |
| 635 | z.data.end = z.raw.end - len("--!>") |
| 636 | return |
| 637 | } |
| 638 | } |
| 639 | } |
| 640 | dashCount = 0 |
| 641 | } |
| 642 | } |
| 643 | |
| 644 | // readUntilCloseAngle reads until the next ">". |
| 645 | func (z *Tokenizer) readUntilCloseAngle() { |
| 646 | z.data.start = z.raw.end |
| 647 | for { |
| 648 | c := z.readByte() |
| 649 | if z.err != nil { |
| 650 | z.data.end = z.raw.end |
| 651 | return |
| 652 | } |
| 653 | if c == '>' { |
| 654 | z.data.end = z.raw.end - len(">") |
| 655 | return |
| 656 | } |
| 657 | } |
| 658 | } |
| 659 | |
| 660 | // readMarkupDeclaration reads the next token starting with "<!". It might be |
| 661 | // a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or |
| 662 | // "<!a bogus comment". The opening "<!" has already been consumed. |
| 663 | func (z *Tokenizer) readMarkupDeclaration() TokenType { |
| 664 | z.data.start = z.raw.end |
| 665 | var c [2]byte |
| 666 | for i := 0; i < 2; i++ { |
| 667 | c[i] = z.readByte() |
| 668 | if z.err != nil { |
| 669 | z.data.end = z.raw.end |
| 670 | return CommentToken |
| 671 | } |
| 672 | } |
| 673 | if c[0] == '-' && c[1] == '-' { |
| 674 | z.readComment() |
| 675 | return CommentToken |
| 676 | } |
| 677 | z.raw.end -= 2 |
| 678 | if z.readDoctype() { |
| 679 | return DoctypeToken |
| 680 | } |
| 681 | if z.allowCDATA && z.readCDATA() { |
| 682 | z.convertNUL = true |
| 683 | return TextToken |
| 684 | } |
| 685 | // It's a bogus comment. |
| 686 | z.readUntilCloseAngle() |
| 687 | return CommentToken |
| 688 | } |
| 689 | |
| 690 | // readDoctype attempts to read a doctype declaration and returns true if |
| 691 | // successful. The opening "<!" has already been consumed. |
| 692 | func (z *Tokenizer) readDoctype() bool { |
| 693 | const s = "DOCTYPE" |
| 694 | for i := 0; i < len(s); i++ { |
| 695 | c := z.readByte() |
| 696 | if z.err != nil { |
| 697 | z.data.end = z.raw.end |
| 698 | return false |
| 699 | } |
| 700 | if c != s[i] && c != s[i]+('a'-'A') { |
| 701 | // Back up to read the fragment of "DOCTYPE" again. |
| 702 | z.raw.end = z.data.start |
| 703 | return false |
| 704 | } |
| 705 | } |
| 706 | if z.skipWhiteSpace(); z.err != nil { |
| 707 | z.data.start = z.raw.end |
| 708 | z.data.end = z.raw.end |
| 709 | return true |
| 710 | } |
| 711 | z.readUntilCloseAngle() |
| 712 | return true |
| 713 | } |
| 714 | |
| 715 | // readCDATA attempts to read a CDATA section and returns true if |
| 716 | // successful. The opening "<!" has already been consumed. |
| 717 | func (z *Tokenizer) readCDATA() bool { |
| 718 | const s = "[CDATA[" |
| 719 | for i := 0; i < len(s); i++ { |
| 720 | c := z.readByte() |
| 721 | if z.err != nil { |
| 722 | z.data.end = z.raw.end |
| 723 | return false |
| 724 | } |
| 725 | if c != s[i] { |
| 726 | // Back up to read the fragment of "[CDATA[" again. |
| 727 | z.raw.end = z.data.start |
| 728 | return false |
| 729 | } |
| 730 | } |
| 731 | z.data.start = z.raw.end |
| 732 | brackets := 0 |
| 733 | for { |
| 734 | c := z.readByte() |
| 735 | if z.err != nil { |
| 736 | z.data.end = z.raw.end |
| 737 | return true |
| 738 | } |
| 739 | switch c { |
| 740 | case ']': |
| 741 | brackets++ |
| 742 | case '>': |
| 743 | if brackets >= 2 { |
| 744 | z.data.end = z.raw.end - len("]]>") |
| 745 | return true |
| 746 | } |
| 747 | brackets = 0 |
| 748 | default: |
| 749 | brackets = 0 |
| 750 | } |
| 751 | } |
| 752 | } |
| 753 | |
| 754 | // startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end] |
| 755 | // case-insensitively matches any element of ss. |
| 756 | func (z *Tokenizer) startTagIn(ss ...string) bool { |
| 757 | loop: |
| 758 | for _, s := range ss { |
| 759 | if z.data.end-z.data.start != len(s) { |
| 760 | continue loop |
| 761 | } |
| 762 | for i := 0; i < len(s); i++ { |
| 763 | c := z.buf[z.data.start+i] |
| 764 | if 'A' <= c && c <= 'Z' { |
| 765 | c += 'a' - 'A' |
| 766 | } |
| 767 | if c != s[i] { |
| 768 | continue loop |
| 769 | } |
| 770 | } |
| 771 | return true |
| 772 | } |
| 773 | return false |
| 774 | } |
| 775 | |
| 776 | // readStartTag reads the next start tag token. The opening "<a" has already |
| 777 | // been consumed, where 'a' means anything in [A-Za-z]. |
| 778 | func (z *Tokenizer) readStartTag() TokenType { |
| 779 | z.readTag(true) |
| 780 | if z.err != nil { |
| 781 | return ErrorToken |
| 782 | } |
| 783 | // Several tags flag the tokenizer's next token as raw. |
| 784 | c, raw := z.buf[z.data.start], false |
| 785 | if 'A' <= c && c <= 'Z' { |
| 786 | c += 'a' - 'A' |
| 787 | } |
| 788 | switch c { |
| 789 | case 'i': |
| 790 | raw = z.startTagIn("iframe") |
| 791 | case 'n': |
| 792 | raw = z.startTagIn("noembed", "noframes", "noscript") |
| 793 | case 'p': |
| 794 | raw = z.startTagIn("plaintext") |
| 795 | case 's': |
| 796 | raw = z.startTagIn("script", "style") |
| 797 | case 't': |
| 798 | raw = z.startTagIn("textarea", "title") |
| 799 | case 'x': |
| 800 | raw = z.startTagIn("xmp") |
| 801 | } |
| 802 | if raw { |
| 803 | z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end])) |
| 804 | } |
| 805 | // Look for a self-closing token like "<br/>". |
| 806 | if z.err == nil && z.buf[z.raw.end-2] == '/' { |
| 807 | return SelfClosingTagToken |
| 808 | } |
| 809 | return StartTagToken |
| 810 | } |
| 811 | |
| 812 | // readTag reads the next tag token and its attributes. If saveAttr, those |
| 813 | // attributes are saved in z.attr, otherwise z.attr is set to an empty slice. |
| 814 | // The opening "<a" or "</a" has already been consumed, where 'a' means anything |
| 815 | // in [A-Za-z]. |
| 816 | func (z *Tokenizer) readTag(saveAttr bool) { |
| 817 | z.attr = z.attr[:0] |
| 818 | z.nAttrReturned = 0 |
| 819 | // Read the tag name and attribute key/value pairs. |
| 820 | z.readTagName() |
| 821 | if z.skipWhiteSpace(); z.err != nil { |
| 822 | return |
| 823 | } |
| 824 | for { |
| 825 | c := z.readByte() |
| 826 | if z.err != nil || c == '>' { |
| 827 | break |
| 828 | } |
| 829 | z.raw.end-- |
| 830 | z.readTagAttrKey() |
| 831 | z.readTagAttrVal() |
| 832 | // Save pendingAttr if saveAttr and that attribute has a non-empty key. |
| 833 | if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end { |
| 834 | z.attr = append(z.attr, z.pendingAttr) |
| 835 | } |
| 836 | if z.skipWhiteSpace(); z.err != nil { |
| 837 | break |
| 838 | } |
| 839 | } |
| 840 | } |
| 841 | |
| 842 | // readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end) |
| 843 | // is positioned such that the first byte of the tag name (the "d" in "<div") |
| 844 | // has already been consumed. |
| 845 | func (z *Tokenizer) readTagName() { |
| 846 | z.data.start = z.raw.end - 1 |
| 847 | for { |
| 848 | c := z.readByte() |
| 849 | if z.err != nil { |
| 850 | z.data.end = z.raw.end |
| 851 | return |
| 852 | } |
| 853 | switch c { |
| 854 | case ' ', '\n', '\r', '\t', '\f': |
| 855 | z.data.end = z.raw.end - 1 |
| 856 | return |
| 857 | case '/', '>': |
| 858 | z.raw.end-- |
| 859 | z.data.end = z.raw.end |
| 860 | return |
| 861 | } |
| 862 | } |
| 863 | } |
| 864 | |
| 865 | // readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>". |
| 866 | // Precondition: z.err == nil. |
| 867 | func (z *Tokenizer) readTagAttrKey() { |
| 868 | z.pendingAttr[0].start = z.raw.end |
| 869 | for { |
| 870 | c := z.readByte() |
| 871 | if z.err != nil { |
| 872 | z.pendingAttr[0].end = z.raw.end |
| 873 | return |
| 874 | } |
| 875 | switch c { |
| 876 | case ' ', '\n', '\r', '\t', '\f', '/': |
| 877 | z.pendingAttr[0].end = z.raw.end - 1 |
| 878 | return |
| 879 | case '=', '>': |
| 880 | z.raw.end-- |
| 881 | z.pendingAttr[0].end = z.raw.end |
| 882 | return |
| 883 | } |
| 884 | } |
| 885 | } |
| 886 | |
| 887 | // readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>". |
| 888 | func (z *Tokenizer) readTagAttrVal() { |
| 889 | z.pendingAttr[1].start = z.raw.end |
| 890 | z.pendingAttr[1].end = z.raw.end |
| 891 | if z.skipWhiteSpace(); z.err != nil { |
| 892 | return |
| 893 | } |
| 894 | c := z.readByte() |
| 895 | if z.err != nil { |
| 896 | return |
| 897 | } |
| 898 | if c != '=' { |
| 899 | z.raw.end-- |
| 900 | return |
| 901 | } |
| 902 | if z.skipWhiteSpace(); z.err != nil { |
| 903 | return |
| 904 | } |
| 905 | quote := z.readByte() |
| 906 | if z.err != nil { |
| 907 | return |
| 908 | } |
| 909 | switch quote { |
| 910 | case '>': |
| 911 | z.raw.end-- |
| 912 | return |
| 913 | |
| 914 | case '\'', '"': |
| 915 | z.pendingAttr[1].start = z.raw.end |
| 916 | for { |
| 917 | c := z.readByte() |
| 918 | if z.err != nil { |
| 919 | z.pendingAttr[1].end = z.raw.end |
| 920 | return |
| 921 | } |
| 922 | if c == quote { |
| 923 | z.pendingAttr[1].end = z.raw.end - 1 |
| 924 | return |
| 925 | } |
| 926 | } |
| 927 | |
| 928 | default: |
| 929 | z.pendingAttr[1].start = z.raw.end - 1 |
| 930 | for { |
| 931 | c := z.readByte() |
| 932 | if z.err != nil { |
| 933 | z.pendingAttr[1].end = z.raw.end |
| 934 | return |
| 935 | } |
| 936 | switch c { |
| 937 | case ' ', '\n', '\r', '\t', '\f': |
| 938 | z.pendingAttr[1].end = z.raw.end - 1 |
| 939 | return |
| 940 | case '>': |
| 941 | z.raw.end-- |
| 942 | z.pendingAttr[1].end = z.raw.end |
| 943 | return |
| 944 | } |
| 945 | } |
| 946 | } |
| 947 | } |
| 948 | |
| 949 | // Next scans the next token and returns its type. |
| 950 | func (z *Tokenizer) Next() TokenType { |
| 951 | z.raw.start = z.raw.end |
| 952 | z.data.start = z.raw.end |
| 953 | z.data.end = z.raw.end |
| 954 | if z.err != nil { |
| 955 | z.tt = ErrorToken |
| 956 | return z.tt |
| 957 | } |
| 958 | if z.rawTag != "" { |
| 959 | if z.rawTag == "plaintext" { |
| 960 | // Read everything up to EOF. |
| 961 | for z.err == nil { |
| 962 | z.readByte() |
| 963 | } |
| 964 | z.data.end = z.raw.end |
| 965 | z.textIsRaw = true |
| 966 | } else { |
| 967 | z.readRawOrRCDATA() |
| 968 | } |
| 969 | if z.data.end > z.data.start { |
| 970 | z.tt = TextToken |
| 971 | z.convertNUL = true |
| 972 | return z.tt |
| 973 | } |
| 974 | } |
| 975 | z.textIsRaw = false |
| 976 | z.convertNUL = false |
| 977 | |
| 978 | loop: |
| 979 | for { |
| 980 | c := z.readByte() |
| 981 | if z.err != nil { |
| 982 | break loop |
| 983 | } |
| 984 | if c != '<' { |
| 985 | continue loop |
| 986 | } |
| 987 | |
| 988 | // Check if the '<' we have just read is part of a tag, comment |
| 989 | // or doctype. If not, it's part of the accumulated text token. |
| 990 | c = z.readByte() |
| 991 | if z.err != nil { |
| 992 | break loop |
| 993 | } |
| 994 | var tokenType TokenType |
| 995 | switch { |
| 996 | case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': |
| 997 | tokenType = StartTagToken |
| 998 | case c == '/': |
| 999 | tokenType = EndTagToken |
| 1000 | case c == '!' || c == '?': |
| 1001 | // We use CommentToken to mean any of "<!--actual comments-->", |
| 1002 | // "<!DOCTYPE declarations>" and "<?xml processing instructions?>". |
| 1003 | tokenType = CommentToken |
| 1004 | default: |
| 1005 | // Reconsume the current character. |
| 1006 | z.raw.end-- |
| 1007 | continue |
| 1008 | } |
| 1009 | |
| 1010 | // We have a non-text token, but we might have accumulated some text |
| 1011 | // before that. If so, we return the text first, and return the non- |
| 1012 | // text token on the subsequent call to Next. |
| 1013 | if x := z.raw.end - len("<a"); z.raw.start < x { |
| 1014 | z.raw.end = x |
| 1015 | z.data.end = x |
| 1016 | z.tt = TextToken |
| 1017 | return z.tt |
| 1018 | } |
| 1019 | switch tokenType { |
| 1020 | case StartTagToken: |
| 1021 | z.tt = z.readStartTag() |
| 1022 | return z.tt |
| 1023 | case EndTagToken: |
| 1024 | c = z.readByte() |
| 1025 | if z.err != nil { |
| 1026 | break loop |
| 1027 | } |
| 1028 | if c == '>' { |
| 1029 | // "</>" does not generate a token at all. Generate an empty comment |
| 1030 | // to allow passthrough clients to pick up the data using Raw. |
| 1031 | // Reset the tokenizer state and start again. |
| 1032 | z.tt = CommentToken |
| 1033 | return z.tt |
| 1034 | } |
| 1035 | if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { |
| 1036 | z.readTag(false) |
| 1037 | if z.err != nil { |
| 1038 | z.tt = ErrorToken |
| 1039 | } else { |
| 1040 | z.tt = EndTagToken |
| 1041 | } |
| 1042 | return z.tt |
| 1043 | } |
| 1044 | z.raw.end-- |
| 1045 | z.readUntilCloseAngle() |
| 1046 | z.tt = CommentToken |
| 1047 | return z.tt |
| 1048 | case CommentToken: |
| 1049 | if c == '!' { |
| 1050 | z.tt = z.readMarkupDeclaration() |
| 1051 | return z.tt |
| 1052 | } |
| 1053 | z.raw.end-- |
| 1054 | z.readUntilCloseAngle() |
| 1055 | z.tt = CommentToken |
| 1056 | return z.tt |
| 1057 | } |
| 1058 | } |
| 1059 | if z.raw.start < z.raw.end { |
| 1060 | z.data.end = z.raw.end |
| 1061 | z.tt = TextToken |
| 1062 | return z.tt |
| 1063 | } |
| 1064 | z.tt = ErrorToken |
| 1065 | return z.tt |
| 1066 | } |
| 1067 | |
| 1068 | // Raw returns the unmodified text of the current token. Calling Next, Token, |
| 1069 | // Text, TagName or TagAttr may change the contents of the returned slice. |
| 1070 | func (z *Tokenizer) Raw() []byte { |
| 1071 | return z.buf[z.raw.start:z.raw.end] |
| 1072 | } |
| 1073 | |
// convertNewlines rewrites every "\r\n" pair and every lone "\r" in s as a
// single "\n". The rewrite is done in place, so s is modified; the returned
// slice shares s's backing array and is shorter than s by one byte for each
// "\r\n" pair that was collapsed.
func convertNewlines(s []byte) []byte {
	// in is the read cursor and out is the write cursor. They only drift
	// apart once a "\r\n" pair has been collapsed.
	out := 0
	for in := 0; in < len(s); in, out = in+1, out+1 {
		ch := s[in]
		if ch != '\r' {
			// Ordinary byte: it only needs moving once the cursors differ.
			if out != in {
				s[out] = ch
			}
			continue
		}
		// A carriage return swallows an immediately following line feed.
		if next := in + 1; next < len(s) && s[next] == '\n' {
			in = next
		}
		s[out] = '\n'
	}
	return s[:out]
}
| 1105 | |
// nul is the single NUL byte searched for in token text.
var nul = []byte("\x00")

// replacement is the UTF-8 encoding of U+FFFD, the Unicode replacement
// character, which is substituted for NUL bytes.
var replacement = []byte("\ufffd")
| 1110 | |
| 1111 | // Text returns the unescaped text of a text, comment or doctype token. The |
| 1112 | // contents of the returned slice may change on the next call to Next. |
| 1113 | func (z *Tokenizer) Text() []byte { |
| 1114 | switch z.tt { |
| 1115 | case TextToken, CommentToken, DoctypeToken: |
| 1116 | s := z.buf[z.data.start:z.data.end] |
| 1117 | z.data.start = z.raw.end |
| 1118 | z.data.end = z.raw.end |
| 1119 | s = convertNewlines(s) |
| 1120 | if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) { |
| 1121 | s = bytes.Replace(s, nul, replacement, -1) |
| 1122 | } |
| 1123 | if !z.textIsRaw { |
| 1124 | s = unescape(s, false) |
| 1125 | } |
| 1126 | return s |
| 1127 | } |
| 1128 | return nil |
| 1129 | } |
| 1130 | |
| 1131 | // TagName returns the lower-cased name of a tag token (the `img` out of |
| 1132 | // `<IMG SRC="foo">`) and whether the tag has attributes. |
| 1133 | // The contents of the returned slice may change on the next call to Next. |
| 1134 | func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { |
| 1135 | if z.data.start < z.data.end { |
| 1136 | switch z.tt { |
| 1137 | case StartTagToken, EndTagToken, SelfClosingTagToken: |
| 1138 | s := z.buf[z.data.start:z.data.end] |
| 1139 | z.data.start = z.raw.end |
| 1140 | z.data.end = z.raw.end |
| 1141 | return lower(s), z.nAttrReturned < len(z.attr) |
| 1142 | } |
| 1143 | } |
| 1144 | return nil, false |
| 1145 | } |
| 1146 | |
| 1147 | // TagAttr returns the lower-cased key and unescaped value of the next unparsed |
| 1148 | // attribute for the current tag token and whether there are more attributes. |
| 1149 | // The contents of the returned slices may change on the next call to Next. |
| 1150 | func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { |
| 1151 | if z.nAttrReturned < len(z.attr) { |
| 1152 | switch z.tt { |
| 1153 | case StartTagToken, SelfClosingTagToken: |
| 1154 | x := z.attr[z.nAttrReturned] |
| 1155 | z.nAttrReturned++ |
| 1156 | key = z.buf[x[0].start:x[0].end] |
| 1157 | val = z.buf[x[1].start:x[1].end] |
| 1158 | return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr) |
| 1159 | } |
| 1160 | } |
| 1161 | return nil, nil, false |
| 1162 | } |
| 1163 | |
| 1164 | // Token returns the next Token. The result's Data and Attr values remain valid |
| 1165 | // after subsequent Next calls. |
| 1166 | func (z *Tokenizer) Token() Token { |
| 1167 | t := Token{Type: z.tt} |
| 1168 | switch z.tt { |
| 1169 | case TextToken, CommentToken, DoctypeToken: |
| 1170 | t.Data = string(z.Text()) |
| 1171 | case StartTagToken, SelfClosingTagToken, EndTagToken: |
| 1172 | name, moreAttr := z.TagName() |
| 1173 | for moreAttr { |
| 1174 | var key, val []byte |
| 1175 | key, val, moreAttr = z.TagAttr() |
| 1176 | t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)}) |
| 1177 | } |
| 1178 | if a := atom.Lookup(name); a != 0 { |
| 1179 | t.DataAtom, t.Data = a, a.String() |
| 1180 | } else { |
| 1181 | t.DataAtom, t.Data = 0, string(name) |
| 1182 | } |
| 1183 | } |
| 1184 | return t |
| 1185 | } |
| 1186 | |
| 1187 | // SetMaxBuf sets a limit on the amount of data buffered during tokenization. |
| 1188 | // A value of 0 means unlimited. |
| 1189 | func (z *Tokenizer) SetMaxBuf(n int) { |
| 1190 | z.maxBuf = n |
| 1191 | } |
| 1192 | |
| 1193 | // NewTokenizer returns a new HTML Tokenizer for the given Reader. |
| 1194 | // The input is assumed to be UTF-8 encoded. |
| 1195 | func NewTokenizer(r io.Reader) *Tokenizer { |
| 1196 | return NewTokenizerFragment(r, "") |
| 1197 | } |
| 1198 | |
| 1199 | // NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for |
| 1200 | // tokenizing an existing element's InnerHTML fragment. contextTag is that |
| 1201 | // element's tag, such as "div" or "iframe". |
| 1202 | // |
| 1203 | // For example, how the InnerHTML "a<b" is tokenized depends on whether it is |
| 1204 | // for a <p> tag or a <script> tag. |
| 1205 | // |
| 1206 | // The input is assumed to be UTF-8 encoded. |
| 1207 | func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer { |
| 1208 | z := &Tokenizer{ |
| 1209 | r: r, |
| 1210 | buf: make([]byte, 0, 4096), |
| 1211 | } |
| 1212 | if contextTag != "" { |
| 1213 | switch s := strings.ToLower(contextTag); s { |
| 1214 | case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp": |
| 1215 | z.rawTag = s |
| 1216 | } |
| 1217 | } |
| 1218 | return z |
| 1219 | } |