// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| 4 | |
| 5 | package html |
| 6 | |
| 7 | import ( |
| 8 | "strings" |
| 9 | ) |
| 10 | |
| 11 | // parseDoctype parses the data from a DoctypeToken into a name, |
| 12 | // public identifier, and system identifier. It returns a Node whose Type |
| 13 | // is DoctypeNode, whose Data is the name, and which has attributes |
| 14 | // named "system" and "public" for the two identifiers if they were present. |
| 15 | // quirks is whether the document should be parsed in "quirks mode". |
| 16 | func parseDoctype(s string) (n *Node, quirks bool) { |
| 17 | n = &Node{Type: DoctypeNode} |
| 18 | |
| 19 | // Find the name. |
| 20 | space := strings.IndexAny(s, whitespace) |
| 21 | if space == -1 { |
| 22 | space = len(s) |
| 23 | } |
| 24 | n.Data = s[:space] |
| 25 | // The comparison to "html" is case-sensitive. |
| 26 | if n.Data != "html" { |
| 27 | quirks = true |
| 28 | } |
| 29 | n.Data = strings.ToLower(n.Data) |
| 30 | s = strings.TrimLeft(s[space:], whitespace) |
| 31 | |
| 32 | if len(s) < 6 { |
| 33 | // It can't start with "PUBLIC" or "SYSTEM". |
| 34 | // Ignore the rest of the string. |
| 35 | return n, quirks || s != "" |
| 36 | } |
| 37 | |
| 38 | key := strings.ToLower(s[:6]) |
| 39 | s = s[6:] |
| 40 | for key == "public" || key == "system" { |
| 41 | s = strings.TrimLeft(s, whitespace) |
| 42 | if s == "" { |
| 43 | break |
| 44 | } |
| 45 | quote := s[0] |
| 46 | if quote != '"' && quote != '\'' { |
| 47 | break |
| 48 | } |
| 49 | s = s[1:] |
| 50 | q := strings.IndexRune(s, rune(quote)) |
| 51 | var id string |
| 52 | if q == -1 { |
| 53 | id = s |
| 54 | s = "" |
| 55 | } else { |
| 56 | id = s[:q] |
| 57 | s = s[q+1:] |
| 58 | } |
| 59 | n.Attr = append(n.Attr, Attribute{Key: key, Val: id}) |
| 60 | if key == "public" { |
| 61 | key = "system" |
| 62 | } else { |
| 63 | key = "" |
| 64 | } |
| 65 | } |
| 66 | |
| 67 | if key != "" || s != "" { |
| 68 | quirks = true |
| 69 | } else if len(n.Attr) > 0 { |
| 70 | if n.Attr[0].Key == "public" { |
| 71 | public := strings.ToLower(n.Attr[0].Val) |
| 72 | switch public { |
| 73 | case "-//w3o//dtd w3 html strict 3.0//en//", "-/w3d/dtd html 4.0 transitional/en", "html": |
| 74 | quirks = true |
| 75 | default: |
| 76 | for _, q := range quirkyIDs { |
| 77 | if strings.HasPrefix(public, q) { |
| 78 | quirks = true |
| 79 | break |
| 80 | } |
| 81 | } |
| 82 | } |
| 83 | // The following two public IDs only cause quirks mode if there is no system ID. |
| 84 | if len(n.Attr) == 1 && (strings.HasPrefix(public, "-//w3c//dtd html 4.01 frameset//") || |
| 85 | strings.HasPrefix(public, "-//w3c//dtd html 4.01 transitional//")) { |
| 86 | quirks = true |
| 87 | } |
| 88 | } |
| 89 | if lastAttr := n.Attr[len(n.Attr)-1]; lastAttr.Key == "system" && |
| 90 | strings.ToLower(lastAttr.Val) == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" { |
| 91 | quirks = true |
| 92 | } |
| 93 | } |
| 94 | |
| 95 | return n, quirks |
| 96 | } |
| 97 | |
// quirkyIDs holds the public doctype identifier prefixes that put a
// document into quirks mode. parseDoctype lower-cases the public identifier
// and tests it against each entry with strings.HasPrefix, so every entry
// must be written in lower case.
var quirkyIDs = []string{
	// Silmaril, AdvaSoft and AS.
	"+//silmaril//dtd html pro v0r11 19970101//",
	"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
	"-//as//dtd html 3.0 aswedit + extensions//",

	// IETF.
	"-//ietf//dtd html 2.0 level 1//",
	"-//ietf//dtd html 2.0 level 2//",
	"-//ietf//dtd html 2.0 strict level 1//",
	"-//ietf//dtd html 2.0 strict level 2//",
	"-//ietf//dtd html 2.0 strict//",
	"-//ietf//dtd html 2.0//",
	"-//ietf//dtd html 2.1e//",
	"-//ietf//dtd html 3.0//",
	"-//ietf//dtd html 3.2 final//",
	"-//ietf//dtd html 3.2//",
	"-//ietf//dtd html 3//",
	"-//ietf//dtd html level 0//",
	"-//ietf//dtd html level 1//",
	"-//ietf//dtd html level 2//",
	"-//ietf//dtd html level 3//",
	"-//ietf//dtd html strict level 0//",
	"-//ietf//dtd html strict level 1//",
	"-//ietf//dtd html strict level 2//",
	"-//ietf//dtd html strict level 3//",
	"-//ietf//dtd html strict//",
	"-//ietf//dtd html//",

	// Metrius, Microsoft and Netscape.
	"-//metrius//dtd metrius presentational//",
	"-//microsoft//dtd internet explorer 2.0 html strict//",
	"-//microsoft//dtd internet explorer 2.0 html//",
	"-//microsoft//dtd internet explorer 2.0 tables//",
	"-//microsoft//dtd internet explorer 3.0 html strict//",
	"-//microsoft//dtd internet explorer 3.0 html//",
	"-//microsoft//dtd internet explorer 3.0 tables//",
	"-//netscape comm. corp.//dtd html//",
	"-//netscape comm. corp.//dtd strict html//",

	// O'Reilly, SoftQuad, Spyglass, SQ and Sun.
	"-//o'reilly and associates//dtd html 2.0//",
	"-//o'reilly and associates//dtd html extended 1.0//",
	"-//o'reilly and associates//dtd html extended relaxed 1.0//",
	"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
	"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
	"-//spyglass//dtd html 2.0 extended//",
	"-//sq//dtd html 2.0 hotmetal + extensions//",
	"-//sun microsystems corp.//dtd hotjava html//",
	"-//sun microsystems corp.//dtd hotjava strict html//",

	// W3C, W3O and WebTechs.
	"-//w3c//dtd html 3 1995-03-24//",
	"-//w3c//dtd html 3.2 draft//",
	"-//w3c//dtd html 3.2 final//",
	"-//w3c//dtd html 3.2//",
	"-//w3c//dtd html 3.2s draft//",
	"-//w3c//dtd html 4.0 frameset//",
	"-//w3c//dtd html 4.0 transitional//",
	"-//w3c//dtd html experimental 19960712//",
	"-//w3c//dtd html experimental 970421//",
	"-//w3c//dtd w3 html//",
	"-//w3o//dtd w3 html 3.0//",
	"-//webtechs//dtd mozilla html 2.0//",
	"-//webtechs//dtd mozilla html//",
}