William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 1 | // Copyright 2012 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package build |
| 6 | |
| 7 | import ( |
| 8 | "fmt" |
| 9 | "log" |
| 10 | "sort" |
| 11 | "strings" |
| 12 | "unicode" |
| 13 | |
| 14 | "golang.org/x/text/internal/colltab" |
| 15 | "golang.org/x/text/unicode/norm" |
| 16 | ) |
| 17 | |
| 18 | type logicalAnchor int |
| 19 | |
| 20 | const ( |
| 21 | firstAnchor logicalAnchor = -1 |
| 22 | noAnchor = 0 |
| 23 | lastAnchor = 1 |
| 24 | ) |
| 25 | |
| 26 | // entry is used to keep track of a single entry in the collation element table |
| 27 | // during building. Examples of entries can be found in the Default Unicode |
| 28 | // Collation Element Table. |
| 29 | // See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt. |
| 30 | type entry struct { |
| 31 | str string // same as string(runes) |
| 32 | runes []rune |
| 33 | elems []rawCE // the collation elements |
| 34 | extend string // weights of extend to be appended to elems |
| 35 | before bool // weights relative to next instead of previous. |
| 36 | lock bool // entry is used in extension and can no longer be moved. |
| 37 | |
| 38 | // prev, next, and level are used to keep track of tailorings. |
| 39 | prev, next *entry |
| 40 | level colltab.Level // next differs at this level |
| 41 | skipRemove bool // do not unlink when removed |
| 42 | |
| 43 | decompose bool // can use NFKD decomposition to generate elems |
| 44 | exclude bool // do not include in table |
| 45 | implicit bool // derived, is not included in the list |
| 46 | modified bool // entry was modified in tailoring |
| 47 | logical logicalAnchor |
| 48 | |
| 49 | expansionIndex int // used to store index into expansion table |
| 50 | contractionHandle ctHandle |
| 51 | contractionIndex int // index into contraction elements |
| 52 | } |
| 53 | |
| 54 | func (e *entry) String() string { |
| 55 | return fmt.Sprintf("%X (%q) -> %X (ch:%x; ci:%d, ei:%d)", |
| 56 | e.runes, e.str, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex) |
| 57 | } |
| 58 | |
| 59 | func (e *entry) skip() bool { |
| 60 | return e.contraction() |
| 61 | } |
| 62 | |
| 63 | func (e *entry) expansion() bool { |
| 64 | return !e.decompose && len(e.elems) > 1 |
| 65 | } |
| 66 | |
| 67 | func (e *entry) contraction() bool { |
| 68 | return len(e.runes) > 1 |
| 69 | } |
| 70 | |
| 71 | func (e *entry) contractionStarter() bool { |
| 72 | return e.contractionHandle.n != 0 |
| 73 | } |
| 74 | |
| 75 | // nextIndexed gets the next entry that needs to be stored in the table. |
| 76 | // It returns the entry and the collation level at which the next entry differs |
| 77 | // from the current entry. |
| 78 | // Entries that can be explicitly derived and logical reset positions are |
| 79 | // examples of entries that will not be indexed. |
| 80 | func (e *entry) nextIndexed() (*entry, colltab.Level) { |
| 81 | level := e.level |
| 82 | for e = e.next; e != nil && (e.exclude || len(e.elems) == 0); e = e.next { |
| 83 | if e.level < level { |
| 84 | level = e.level |
| 85 | } |
| 86 | } |
| 87 | return e, level |
| 88 | } |
| 89 | |
| 90 | // remove unlinks entry e from the sorted chain and clears the collation |
| 91 | // elements. e may not be at the front or end of the list. This should always |
| 92 | // be the case, as the front and end of the list are always logical anchors, |
| 93 | // which may not be removed. |
| 94 | func (e *entry) remove() { |
| 95 | if e.logical != noAnchor { |
| 96 | log.Fatalf("may not remove anchor %q", e.str) |
| 97 | } |
| 98 | // TODO: need to set e.prev.level to e.level if e.level is smaller? |
| 99 | e.elems = nil |
| 100 | if !e.skipRemove { |
| 101 | if e.prev != nil { |
| 102 | e.prev.next = e.next |
| 103 | } |
| 104 | if e.next != nil { |
| 105 | e.next.prev = e.prev |
| 106 | } |
| 107 | } |
| 108 | e.skipRemove = false |
| 109 | } |
| 110 | |
| 111 | // insertAfter inserts n after e. |
| 112 | func (e *entry) insertAfter(n *entry) { |
| 113 | if e == n { |
| 114 | panic("e == anchor") |
| 115 | } |
| 116 | if e == nil { |
| 117 | panic("unexpected nil anchor") |
| 118 | } |
| 119 | n.remove() |
| 120 | n.decompose = false // redo decomposition test |
| 121 | |
| 122 | n.next = e.next |
| 123 | n.prev = e |
| 124 | if e.next != nil { |
| 125 | e.next.prev = n |
| 126 | } |
| 127 | e.next = n |
| 128 | } |
| 129 | |
| 130 | // insertBefore inserts n before e. |
| 131 | func (e *entry) insertBefore(n *entry) { |
| 132 | if e == n { |
| 133 | panic("e == anchor") |
| 134 | } |
| 135 | if e == nil { |
| 136 | panic("unexpected nil anchor") |
| 137 | } |
| 138 | n.remove() |
| 139 | n.decompose = false // redo decomposition test |
| 140 | |
| 141 | n.prev = e.prev |
| 142 | n.next = e |
| 143 | if e.prev != nil { |
| 144 | e.prev.next = n |
| 145 | } |
| 146 | e.prev = n |
| 147 | } |
| 148 | |
| 149 | func (e *entry) encodeBase() (ce uint32, err error) { |
| 150 | switch { |
| 151 | case e.expansion(): |
| 152 | ce, err = makeExpandIndex(e.expansionIndex) |
| 153 | default: |
| 154 | if e.decompose { |
| 155 | log.Fatal("decompose should be handled elsewhere") |
| 156 | } |
| 157 | ce, err = makeCE(e.elems[0]) |
| 158 | } |
| 159 | return |
| 160 | } |
| 161 | |
| 162 | func (e *entry) encode() (ce uint32, err error) { |
| 163 | if e.skip() { |
| 164 | log.Fatal("cannot build colElem for entry that should be skipped") |
| 165 | } |
| 166 | switch { |
| 167 | case e.decompose: |
| 168 | t1 := e.elems[0].w[2] |
| 169 | t2 := 0 |
| 170 | if len(e.elems) > 1 { |
| 171 | t2 = e.elems[1].w[2] |
| 172 | } |
| 173 | ce, err = makeDecompose(t1, t2) |
| 174 | case e.contractionStarter(): |
| 175 | ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex) |
| 176 | default: |
| 177 | if len(e.runes) > 1 { |
| 178 | log.Fatal("colElem: contractions are handled in contraction trie") |
| 179 | } |
| 180 | ce, err = e.encodeBase() |
| 181 | } |
| 182 | return |
| 183 | } |
| 184 | |
| 185 | // entryLess returns true if a sorts before b and false otherwise. |
| 186 | func entryLess(a, b *entry) bool { |
| 187 | if res, _ := compareWeights(a.elems, b.elems); res != 0 { |
| 188 | return res == -1 |
| 189 | } |
| 190 | if a.logical != noAnchor { |
| 191 | return a.logical == firstAnchor |
| 192 | } |
| 193 | if b.logical != noAnchor { |
| 194 | return b.logical == lastAnchor |
| 195 | } |
| 196 | return a.str < b.str |
| 197 | } |
| 198 | |
| 199 | type sortedEntries []*entry |
| 200 | |
| 201 | func (s sortedEntries) Len() int { |
| 202 | return len(s) |
| 203 | } |
| 204 | |
| 205 | func (s sortedEntries) Swap(i, j int) { |
| 206 | s[i], s[j] = s[j], s[i] |
| 207 | } |
| 208 | |
| 209 | func (s sortedEntries) Less(i, j int) bool { |
| 210 | return entryLess(s[i], s[j]) |
| 211 | } |
| 212 | |
| 213 | type ordering struct { |
| 214 | id string |
| 215 | entryMap map[string]*entry |
| 216 | ordered []*entry |
| 217 | handle *trieHandle |
| 218 | } |
| 219 | |
| 220 | // insert inserts e into both entryMap and ordered. |
| 221 | // Note that insert simply appends e to ordered. To reattain a sorted |
| 222 | // order, o.sort() should be called. |
| 223 | func (o *ordering) insert(e *entry) { |
| 224 | if e.logical == noAnchor { |
| 225 | o.entryMap[e.str] = e |
| 226 | } else { |
| 227 | // Use key format as used in UCA rules. |
| 228 | o.entryMap[fmt.Sprintf("[%s]", e.str)] = e |
| 229 | // Also add index entry for XML format. |
| 230 | o.entryMap[fmt.Sprintf("<%s/>", strings.Replace(e.str, " ", "_", -1))] = e |
| 231 | } |
| 232 | o.ordered = append(o.ordered, e) |
| 233 | } |
| 234 | |
| 235 | // newEntry creates a new entry for the given info and inserts it into |
| 236 | // the index. |
| 237 | func (o *ordering) newEntry(s string, ces []rawCE) *entry { |
| 238 | e := &entry{ |
| 239 | runes: []rune(s), |
| 240 | elems: ces, |
| 241 | str: s, |
| 242 | } |
| 243 | o.insert(e) |
| 244 | return e |
| 245 | } |
| 246 | |
| 247 | // find looks up and returns the entry for the given string. |
| 248 | // It returns nil if str is not in the index and if an implicit value |
| 249 | // cannot be derived, that is, if str represents more than one rune. |
| 250 | func (o *ordering) find(str string) *entry { |
| 251 | e := o.entryMap[str] |
| 252 | if e == nil { |
| 253 | r := []rune(str) |
| 254 | if len(r) == 1 { |
| 255 | const ( |
| 256 | firstHangul = 0xAC00 |
| 257 | lastHangul = 0xD7A3 |
| 258 | ) |
| 259 | if r[0] >= firstHangul && r[0] <= lastHangul { |
| 260 | ce := []rawCE{} |
| 261 | nfd := norm.NFD.String(str) |
| 262 | for _, r := range nfd { |
| 263 | ce = append(ce, o.find(string(r)).elems...) |
| 264 | } |
| 265 | e = o.newEntry(nfd, ce) |
| 266 | } else { |
| 267 | e = o.newEntry(string(r[0]), []rawCE{ |
| 268 | {w: []int{ |
| 269 | implicitPrimary(r[0]), |
| 270 | defaultSecondary, |
| 271 | defaultTertiary, |
| 272 | int(r[0]), |
| 273 | }, |
| 274 | }, |
| 275 | }) |
| 276 | e.modified = true |
| 277 | } |
| 278 | e.exclude = true // do not index implicits |
| 279 | } |
| 280 | } |
| 281 | return e |
| 282 | } |
| 283 | |
| 284 | // makeRootOrdering returns a newly initialized ordering value and populates |
| 285 | // it with a set of logical reset points that can be used as anchors. |
| 286 | // The anchors first_tertiary_ignorable and __END__ will always sort at |
| 287 | // the beginning and end, respectively. This means that prev and next are non-nil |
| 288 | // for any indexed entry. |
| 289 | func makeRootOrdering() ordering { |
| 290 | const max = unicode.MaxRune |
| 291 | o := ordering{ |
| 292 | entryMap: make(map[string]*entry), |
| 293 | } |
| 294 | insert := func(typ logicalAnchor, s string, ce []int) { |
| 295 | e := &entry{ |
| 296 | elems: []rawCE{{w: ce}}, |
| 297 | str: s, |
| 298 | exclude: true, |
| 299 | logical: typ, |
| 300 | } |
| 301 | o.insert(e) |
| 302 | } |
| 303 | insert(firstAnchor, "first tertiary ignorable", []int{0, 0, 0, 0}) |
| 304 | insert(lastAnchor, "last tertiary ignorable", []int{0, 0, 0, max}) |
| 305 | insert(lastAnchor, "last primary ignorable", []int{0, defaultSecondary, defaultTertiary, max}) |
| 306 | insert(lastAnchor, "last non ignorable", []int{maxPrimary, defaultSecondary, defaultTertiary, max}) |
| 307 | insert(lastAnchor, "__END__", []int{1 << maxPrimaryBits, defaultSecondary, defaultTertiary, max}) |
| 308 | return o |
| 309 | } |
| 310 | |
| 311 | // patchForInsert eleminates entries from the list with more than one collation element. |
| 312 | // The next and prev fields of the eliminated entries still point to appropriate |
| 313 | // values in the newly created list. |
| 314 | // It requires that sort has been called. |
| 315 | func (o *ordering) patchForInsert() { |
| 316 | for i := 0; i < len(o.ordered)-1; { |
| 317 | e := o.ordered[i] |
| 318 | lev := e.level |
| 319 | n := e.next |
| 320 | for ; n != nil && len(n.elems) > 1; n = n.next { |
| 321 | if n.level < lev { |
| 322 | lev = n.level |
| 323 | } |
| 324 | n.skipRemove = true |
| 325 | } |
| 326 | for ; o.ordered[i] != n; i++ { |
| 327 | o.ordered[i].level = lev |
| 328 | o.ordered[i].next = n |
| 329 | o.ordered[i+1].prev = e |
| 330 | } |
| 331 | } |
| 332 | } |
| 333 | |
| 334 | // clone copies all ordering of es into a new ordering value. |
| 335 | func (o *ordering) clone() *ordering { |
| 336 | o.sort() |
| 337 | oo := ordering{ |
| 338 | entryMap: make(map[string]*entry), |
| 339 | } |
| 340 | for _, e := range o.ordered { |
| 341 | ne := &entry{ |
| 342 | runes: e.runes, |
| 343 | elems: e.elems, |
| 344 | str: e.str, |
| 345 | decompose: e.decompose, |
| 346 | exclude: e.exclude, |
| 347 | logical: e.logical, |
| 348 | } |
| 349 | oo.insert(ne) |
| 350 | } |
| 351 | oo.sort() // link all ordering. |
| 352 | oo.patchForInsert() |
| 353 | return &oo |
| 354 | } |
| 355 | |
| 356 | // front returns the first entry to be indexed. |
| 357 | // It assumes that sort() has been called. |
| 358 | func (o *ordering) front() *entry { |
| 359 | e := o.ordered[0] |
| 360 | if e.prev != nil { |
| 361 | log.Panicf("unexpected first entry: %v", e) |
| 362 | } |
| 363 | // The first entry is always a logical position, which should not be indexed. |
| 364 | e, _ = e.nextIndexed() |
| 365 | return e |
| 366 | } |
| 367 | |
| 368 | // sort sorts all ordering based on their collation elements and initializes |
| 369 | // the prev, next, and level fields accordingly. |
| 370 | func (o *ordering) sort() { |
| 371 | sort.Sort(sortedEntries(o.ordered)) |
| 372 | l := o.ordered |
| 373 | for i := 1; i < len(l); i++ { |
| 374 | k := i - 1 |
| 375 | l[k].next = l[i] |
| 376 | _, l[k].level = compareWeights(l[k].elems, l[i].elems) |
| 377 | l[i].prev = l[k] |
| 378 | } |
| 379 | } |
| 380 | |
| 381 | // genColElems generates a collation element array from the runes in str. This |
| 382 | // assumes that all collation elements have already been added to the Builder. |
| 383 | func (o *ordering) genColElems(str string) []rawCE { |
| 384 | elems := []rawCE{} |
| 385 | for _, r := range []rune(str) { |
| 386 | for _, ce := range o.find(string(r)).elems { |
| 387 | if ce.w[0] != 0 || ce.w[1] != 0 || ce.w[2] != 0 { |
| 388 | elems = append(elems, ce) |
| 389 | } |
| 390 | } |
| 391 | } |
| 392 | return elems |
| 393 | } |