David K. Bainbridge | 215e024 | 2017-09-05 23:18:24 -0700 | [diff] [blame] | 1 | // Copyright 2012 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // +build ignore |
| 6 | |
| 7 | package main |
| 8 | |
| 9 | // This program generates table.go and table_test.go. |
| 10 | // Invoke as |
| 11 | // |
| 12 | // go run gen.go |gofmt >table.go |
| 13 | // go run gen.go -test |gofmt >table_test.go |
| 14 | |
| 15 | import ( |
| 16 | "flag" |
| 17 | "fmt" |
| 18 | "math/rand" |
| 19 | "os" |
| 20 | "sort" |
| 21 | "strings" |
| 22 | ) |
| 23 | |
| 24 | // identifier converts s to a Go exported identifier. |
| 25 | // It converts "div" to "Div" and "accept-charset" to "AcceptCharset". |
| 26 | func identifier(s string) string { |
| 27 | b := make([]byte, 0, len(s)) |
| 28 | cap := true |
| 29 | for _, c := range s { |
| 30 | if c == '-' { |
| 31 | cap = true |
| 32 | continue |
| 33 | } |
| 34 | if cap && 'a' <= c && c <= 'z' { |
| 35 | c -= 'a' - 'A' |
| 36 | } |
| 37 | cap = false |
| 38 | b = append(b, byte(c)) |
| 39 | } |
| 40 | return string(b) |
| 41 | } |
| 42 | |
| 43 | var test = flag.Bool("test", false, "generate table_test.go") |
| 44 | |
| 45 | func main() { |
| 46 | flag.Parse() |
| 47 | |
| 48 | var all []string |
| 49 | all = append(all, elements...) |
| 50 | all = append(all, attributes...) |
| 51 | all = append(all, eventHandlers...) |
| 52 | all = append(all, extra...) |
| 53 | sort.Strings(all) |
| 54 | |
| 55 | if *test { |
| 56 | fmt.Printf("// generated by go run gen.go -test; DO NOT EDIT\n\n") |
| 57 | fmt.Printf("package atom\n\n") |
| 58 | fmt.Printf("var testAtomList = []string{\n") |
| 59 | for _, s := range all { |
| 60 | fmt.Printf("\t%q,\n", s) |
| 61 | } |
| 62 | fmt.Printf("}\n") |
| 63 | return |
| 64 | } |
| 65 | |
| 66 | // uniq - lists have dups |
| 67 | // compute max len too |
| 68 | maxLen := 0 |
| 69 | w := 0 |
| 70 | for _, s := range all { |
| 71 | if w == 0 || all[w-1] != s { |
| 72 | if maxLen < len(s) { |
| 73 | maxLen = len(s) |
| 74 | } |
| 75 | all[w] = s |
| 76 | w++ |
| 77 | } |
| 78 | } |
| 79 | all = all[:w] |
| 80 | |
| 81 | // Find hash that minimizes table size. |
| 82 | var best *table |
| 83 | for i := 0; i < 1000000; i++ { |
| 84 | if best != nil && 1<<(best.k-1) < len(all) { |
| 85 | break |
| 86 | } |
| 87 | h := rand.Uint32() |
| 88 | for k := uint(0); k <= 16; k++ { |
| 89 | if best != nil && k >= best.k { |
| 90 | break |
| 91 | } |
| 92 | var t table |
| 93 | if t.init(h, k, all) { |
| 94 | best = &t |
| 95 | break |
| 96 | } |
| 97 | } |
| 98 | } |
| 99 | if best == nil { |
| 100 | fmt.Fprintf(os.Stderr, "failed to construct string table\n") |
| 101 | os.Exit(1) |
| 102 | } |
| 103 | |
| 104 | // Lay out strings, using overlaps when possible. |
| 105 | layout := append([]string{}, all...) |
| 106 | |
| 107 | // Remove strings that are substrings of other strings |
| 108 | for changed := true; changed; { |
| 109 | changed = false |
| 110 | for i, s := range layout { |
| 111 | if s == "" { |
| 112 | continue |
| 113 | } |
| 114 | for j, t := range layout { |
| 115 | if i != j && t != "" && strings.Contains(s, t) { |
| 116 | changed = true |
| 117 | layout[j] = "" |
| 118 | } |
| 119 | } |
| 120 | } |
| 121 | } |
| 122 | |
| 123 | // Join strings where one suffix matches another prefix. |
| 124 | for { |
| 125 | // Find best i, j, k such that layout[i][len-k:] == layout[j][:k], |
| 126 | // maximizing overlap length k. |
| 127 | besti := -1 |
| 128 | bestj := -1 |
| 129 | bestk := 0 |
| 130 | for i, s := range layout { |
| 131 | if s == "" { |
| 132 | continue |
| 133 | } |
| 134 | for j, t := range layout { |
| 135 | if i == j { |
| 136 | continue |
| 137 | } |
| 138 | for k := bestk + 1; k <= len(s) && k <= len(t); k++ { |
| 139 | if s[len(s)-k:] == t[:k] { |
| 140 | besti = i |
| 141 | bestj = j |
| 142 | bestk = k |
| 143 | } |
| 144 | } |
| 145 | } |
| 146 | } |
| 147 | if bestk > 0 { |
| 148 | layout[besti] += layout[bestj][bestk:] |
| 149 | layout[bestj] = "" |
| 150 | continue |
| 151 | } |
| 152 | break |
| 153 | } |
| 154 | |
| 155 | text := strings.Join(layout, "") |
| 156 | |
| 157 | atom := map[string]uint32{} |
| 158 | for _, s := range all { |
| 159 | off := strings.Index(text, s) |
| 160 | if off < 0 { |
| 161 | panic("lost string " + s) |
| 162 | } |
| 163 | atom[s] = uint32(off<<8 | len(s)) |
| 164 | } |
| 165 | |
| 166 | // Generate the Go code. |
| 167 | fmt.Printf("// generated by go run gen.go; DO NOT EDIT\n\n") |
| 168 | fmt.Printf("package atom\n\nconst (\n") |
| 169 | for _, s := range all { |
| 170 | fmt.Printf("\t%s Atom = %#x\n", identifier(s), atom[s]) |
| 171 | } |
| 172 | fmt.Printf(")\n\n") |
| 173 | |
| 174 | fmt.Printf("const hash0 = %#x\n\n", best.h0) |
| 175 | fmt.Printf("const maxAtomLen = %d\n\n", maxLen) |
| 176 | |
| 177 | fmt.Printf("var table = [1<<%d]Atom{\n", best.k) |
| 178 | for i, s := range best.tab { |
| 179 | if s == "" { |
| 180 | continue |
| 181 | } |
| 182 | fmt.Printf("\t%#x: %#x, // %s\n", i, atom[s], s) |
| 183 | } |
| 184 | fmt.Printf("}\n") |
| 185 | datasize := (1 << best.k) * 4 |
| 186 | |
| 187 | fmt.Printf("const atomText =\n") |
| 188 | textsize := len(text) |
| 189 | for len(text) > 60 { |
| 190 | fmt.Printf("\t%q +\n", text[:60]) |
| 191 | text = text[60:] |
| 192 | } |
| 193 | fmt.Printf("\t%q\n\n", text) |
| 194 | |
| 195 | fmt.Fprintf(os.Stderr, "%d atoms; %d string bytes + %d tables = %d total data\n", len(all), textsize, datasize, textsize+datasize) |
| 196 | } |
| 197 | |
| 198 | type byLen []string |
| 199 | |
| 200 | func (x byLen) Less(i, j int) bool { return len(x[i]) > len(x[j]) } |
| 201 | func (x byLen) Swap(i, j int) { x[i], x[j] = x[j], x[i] } |
| 202 | func (x byLen) Len() int { return len(x) } |
| 203 | |
| 204 | // fnv computes the FNV hash with an arbitrary starting value h. |
| 205 | func fnv(h uint32, s string) uint32 { |
| 206 | for i := 0; i < len(s); i++ { |
| 207 | h ^= uint32(s[i]) |
| 208 | h *= 16777619 |
| 209 | } |
| 210 | return h |
| 211 | } |
| 212 | |
| 213 | // A table represents an attempt at constructing the lookup table. |
| 214 | // The lookup table uses cuckoo hashing, meaning that each string |
| 215 | // can be found in one of two positions. |
| 216 | type table struct { |
| 217 | h0 uint32 |
| 218 | k uint |
| 219 | mask uint32 |
| 220 | tab []string |
| 221 | } |
| 222 | |
| 223 | // hash returns the two hashes for s. |
| 224 | func (t *table) hash(s string) (h1, h2 uint32) { |
| 225 | h := fnv(t.h0, s) |
| 226 | h1 = h & t.mask |
| 227 | h2 = (h >> 16) & t.mask |
| 228 | return |
| 229 | } |
| 230 | |
| 231 | // init initializes the table with the given parameters. |
| 232 | // h0 is the initial hash value, |
| 233 | // k is the number of bits of hash value to use, and |
| 234 | // x is the list of strings to store in the table. |
| 235 | // init returns false if the table cannot be constructed. |
| 236 | func (t *table) init(h0 uint32, k uint, x []string) bool { |
| 237 | t.h0 = h0 |
| 238 | t.k = k |
| 239 | t.tab = make([]string, 1<<k) |
| 240 | t.mask = 1<<k - 1 |
| 241 | for _, s := range x { |
| 242 | if !t.insert(s) { |
| 243 | return false |
| 244 | } |
| 245 | } |
| 246 | return true |
| 247 | } |
| 248 | |
| 249 | // insert inserts s in the table. |
| 250 | func (t *table) insert(s string) bool { |
| 251 | h1, h2 := t.hash(s) |
| 252 | if t.tab[h1] == "" { |
| 253 | t.tab[h1] = s |
| 254 | return true |
| 255 | } |
| 256 | if t.tab[h2] == "" { |
| 257 | t.tab[h2] = s |
| 258 | return true |
| 259 | } |
| 260 | if t.push(h1, 0) { |
| 261 | t.tab[h1] = s |
| 262 | return true |
| 263 | } |
| 264 | if t.push(h2, 0) { |
| 265 | t.tab[h2] = s |
| 266 | return true |
| 267 | } |
| 268 | return false |
| 269 | } |
| 270 | |
| 271 | // push attempts to push aside the entry in slot i. |
| 272 | func (t *table) push(i uint32, depth int) bool { |
| 273 | if depth > len(t.tab) { |
| 274 | return false |
| 275 | } |
| 276 | s := t.tab[i] |
| 277 | h1, h2 := t.hash(s) |
| 278 | j := h1 + h2 - i |
| 279 | if t.tab[j] != "" && !t.push(j, depth+1) { |
| 280 | return false |
| 281 | } |
| 282 | t.tab[j] = s |
| 283 | return true |
| 284 | } |
| 285 | |
| 286 | // The lists of element names and attribute keys were taken from |
| 287 | // https://html.spec.whatwg.org/multipage/indices.html#index |
| 288 | // as of the "HTML Living Standard - Last Updated 21 February 2015" version. |
| 289 | |
| 290 | var elements = []string{ |
| 291 | "a", |
| 292 | "abbr", |
| 293 | "address", |
| 294 | "area", |
| 295 | "article", |
| 296 | "aside", |
| 297 | "audio", |
| 298 | "b", |
| 299 | "base", |
| 300 | "bdi", |
| 301 | "bdo", |
| 302 | "blockquote", |
| 303 | "body", |
| 304 | "br", |
| 305 | "button", |
| 306 | "canvas", |
| 307 | "caption", |
| 308 | "cite", |
| 309 | "code", |
| 310 | "col", |
| 311 | "colgroup", |
| 312 | "command", |
| 313 | "data", |
| 314 | "datalist", |
| 315 | "dd", |
| 316 | "del", |
| 317 | "details", |
| 318 | "dfn", |
| 319 | "dialog", |
| 320 | "div", |
| 321 | "dl", |
| 322 | "dt", |
| 323 | "em", |
| 324 | "embed", |
| 325 | "fieldset", |
| 326 | "figcaption", |
| 327 | "figure", |
| 328 | "footer", |
| 329 | "form", |
| 330 | "h1", |
| 331 | "h2", |
| 332 | "h3", |
| 333 | "h4", |
| 334 | "h5", |
| 335 | "h6", |
| 336 | "head", |
| 337 | "header", |
| 338 | "hgroup", |
| 339 | "hr", |
| 340 | "html", |
| 341 | "i", |
| 342 | "iframe", |
| 343 | "img", |
| 344 | "input", |
| 345 | "ins", |
| 346 | "kbd", |
| 347 | "keygen", |
| 348 | "label", |
| 349 | "legend", |
| 350 | "li", |
| 351 | "link", |
| 352 | "map", |
| 353 | "mark", |
| 354 | "menu", |
| 355 | "menuitem", |
| 356 | "meta", |
| 357 | "meter", |
| 358 | "nav", |
| 359 | "noscript", |
| 360 | "object", |
| 361 | "ol", |
| 362 | "optgroup", |
| 363 | "option", |
| 364 | "output", |
| 365 | "p", |
| 366 | "param", |
| 367 | "pre", |
| 368 | "progress", |
| 369 | "q", |
| 370 | "rp", |
| 371 | "rt", |
| 372 | "ruby", |
| 373 | "s", |
| 374 | "samp", |
| 375 | "script", |
| 376 | "section", |
| 377 | "select", |
| 378 | "small", |
| 379 | "source", |
| 380 | "span", |
| 381 | "strong", |
| 382 | "style", |
| 383 | "sub", |
| 384 | "summary", |
| 385 | "sup", |
| 386 | "table", |
| 387 | "tbody", |
| 388 | "td", |
| 389 | "template", |
| 390 | "textarea", |
| 391 | "tfoot", |
| 392 | "th", |
| 393 | "thead", |
| 394 | "time", |
| 395 | "title", |
| 396 | "tr", |
| 397 | "track", |
| 398 | "u", |
| 399 | "ul", |
| 400 | "var", |
| 401 | "video", |
| 402 | "wbr", |
| 403 | } |
| 404 | |
| 405 | // https://html.spec.whatwg.org/multipage/indices.html#attributes-3 |
| 406 | |
| 407 | var attributes = []string{ |
| 408 | "abbr", |
| 409 | "accept", |
| 410 | "accept-charset", |
| 411 | "accesskey", |
| 412 | "action", |
| 413 | "alt", |
| 414 | "async", |
| 415 | "autocomplete", |
| 416 | "autofocus", |
| 417 | "autoplay", |
| 418 | "challenge", |
| 419 | "charset", |
| 420 | "checked", |
| 421 | "cite", |
| 422 | "class", |
| 423 | "cols", |
| 424 | "colspan", |
| 425 | "command", |
| 426 | "content", |
| 427 | "contenteditable", |
| 428 | "contextmenu", |
| 429 | "controls", |
| 430 | "coords", |
| 431 | "crossorigin", |
| 432 | "data", |
| 433 | "datetime", |
| 434 | "default", |
| 435 | "defer", |
| 436 | "dir", |
| 437 | "dirname", |
| 438 | "disabled", |
| 439 | "download", |
| 440 | "draggable", |
| 441 | "dropzone", |
| 442 | "enctype", |
| 443 | "for", |
| 444 | "form", |
| 445 | "formaction", |
| 446 | "formenctype", |
| 447 | "formmethod", |
| 448 | "formnovalidate", |
| 449 | "formtarget", |
| 450 | "headers", |
| 451 | "height", |
| 452 | "hidden", |
| 453 | "high", |
| 454 | "href", |
| 455 | "hreflang", |
| 456 | "http-equiv", |
| 457 | "icon", |
| 458 | "id", |
| 459 | "inputmode", |
| 460 | "ismap", |
| 461 | "itemid", |
| 462 | "itemprop", |
| 463 | "itemref", |
| 464 | "itemscope", |
| 465 | "itemtype", |
| 466 | "keytype", |
| 467 | "kind", |
| 468 | "label", |
| 469 | "lang", |
| 470 | "list", |
| 471 | "loop", |
| 472 | "low", |
| 473 | "manifest", |
| 474 | "max", |
| 475 | "maxlength", |
| 476 | "media", |
| 477 | "mediagroup", |
| 478 | "method", |
| 479 | "min", |
| 480 | "minlength", |
| 481 | "multiple", |
| 482 | "muted", |
| 483 | "name", |
| 484 | "novalidate", |
| 485 | "open", |
| 486 | "optimum", |
| 487 | "pattern", |
| 488 | "ping", |
| 489 | "placeholder", |
| 490 | "poster", |
| 491 | "preload", |
| 492 | "radiogroup", |
| 493 | "readonly", |
| 494 | "rel", |
| 495 | "required", |
| 496 | "reversed", |
| 497 | "rows", |
| 498 | "rowspan", |
| 499 | "sandbox", |
| 500 | "spellcheck", |
| 501 | "scope", |
| 502 | "scoped", |
| 503 | "seamless", |
| 504 | "selected", |
| 505 | "shape", |
| 506 | "size", |
| 507 | "sizes", |
| 508 | "sortable", |
| 509 | "sorted", |
| 510 | "span", |
| 511 | "src", |
| 512 | "srcdoc", |
| 513 | "srclang", |
| 514 | "start", |
| 515 | "step", |
| 516 | "style", |
| 517 | "tabindex", |
| 518 | "target", |
| 519 | "title", |
| 520 | "translate", |
| 521 | "type", |
| 522 | "typemustmatch", |
| 523 | "usemap", |
| 524 | "value", |
| 525 | "width", |
| 526 | "wrap", |
| 527 | } |
| 528 | |
| 529 | var eventHandlers = []string{ |
| 530 | "onabort", |
| 531 | "onautocomplete", |
| 532 | "onautocompleteerror", |
| 533 | "onafterprint", |
| 534 | "onbeforeprint", |
| 535 | "onbeforeunload", |
| 536 | "onblur", |
| 537 | "oncancel", |
| 538 | "oncanplay", |
| 539 | "oncanplaythrough", |
| 540 | "onchange", |
| 541 | "onclick", |
| 542 | "onclose", |
| 543 | "oncontextmenu", |
| 544 | "oncuechange", |
| 545 | "ondblclick", |
| 546 | "ondrag", |
| 547 | "ondragend", |
| 548 | "ondragenter", |
| 549 | "ondragleave", |
| 550 | "ondragover", |
| 551 | "ondragstart", |
| 552 | "ondrop", |
| 553 | "ondurationchange", |
| 554 | "onemptied", |
| 555 | "onended", |
| 556 | "onerror", |
| 557 | "onfocus", |
| 558 | "onhashchange", |
| 559 | "oninput", |
| 560 | "oninvalid", |
| 561 | "onkeydown", |
| 562 | "onkeypress", |
| 563 | "onkeyup", |
| 564 | "onlanguagechange", |
| 565 | "onload", |
| 566 | "onloadeddata", |
| 567 | "onloadedmetadata", |
| 568 | "onloadstart", |
| 569 | "onmessage", |
| 570 | "onmousedown", |
| 571 | "onmousemove", |
| 572 | "onmouseout", |
| 573 | "onmouseover", |
| 574 | "onmouseup", |
| 575 | "onmousewheel", |
| 576 | "onoffline", |
| 577 | "ononline", |
| 578 | "onpagehide", |
| 579 | "onpageshow", |
| 580 | "onpause", |
| 581 | "onplay", |
| 582 | "onplaying", |
| 583 | "onpopstate", |
| 584 | "onprogress", |
| 585 | "onratechange", |
| 586 | "onreset", |
| 587 | "onresize", |
| 588 | "onscroll", |
| 589 | "onseeked", |
| 590 | "onseeking", |
| 591 | "onselect", |
| 592 | "onshow", |
| 593 | "onsort", |
| 594 | "onstalled", |
| 595 | "onstorage", |
| 596 | "onsubmit", |
| 597 | "onsuspend", |
| 598 | "ontimeupdate", |
| 599 | "ontoggle", |
| 600 | "onunload", |
| 601 | "onvolumechange", |
| 602 | "onwaiting", |
| 603 | } |
| 604 | |
| 605 | // extra are ad-hoc values not covered by any of the lists above. |
| 606 | var extra = []string{ |
| 607 | "align", |
| 608 | "annotation", |
| 609 | "annotation-xml", |
| 610 | "applet", |
| 611 | "basefont", |
| 612 | "bgsound", |
| 613 | "big", |
| 614 | "blink", |
| 615 | "center", |
| 616 | "color", |
| 617 | "desc", |
| 618 | "face", |
| 619 | "font", |
| 620 | "foreignObject", // HTML is case-insensitive, but SVG-embedded-in-HTML is case-sensitive. |
| 621 | "foreignobject", |
| 622 | "frame", |
| 623 | "frameset", |
| 624 | "image", |
| 625 | "isindex", |
| 626 | "listing", |
| 627 | "malignmark", |
| 628 | "marquee", |
| 629 | "math", |
| 630 | "mglyph", |
| 631 | "mi", |
| 632 | "mn", |
| 633 | "mo", |
| 634 | "ms", |
| 635 | "mtext", |
| 636 | "nobr", |
| 637 | "noembed", |
| 638 | "noframes", |
| 639 | "plaintext", |
| 640 | "prompt", |
| 641 | "public", |
| 642 | "spacer", |
| 643 | "strike", |
| 644 | "svg", |
| 645 | "system", |
| 646 | "tt", |
| 647 | "xmp", |
| 648 | } |