Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame^] | 1 | // Copyright 2011 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package norm |
| 6 | |
| 7 | import ( |
| 8 | "fmt" |
| 9 | "unicode/utf8" |
| 10 | ) |
| 11 | |
| 12 | // MaxSegmentSize is the maximum size of a byte buffer needed to consider any |
| 13 | // sequence of starter and non-starter runes for the purpose of normalization. |
| 14 | const MaxSegmentSize = maxByteBufferSize |
| 15 | |
| 16 | // An Iter iterates over a string or byte slice, while normalizing it |
| 17 | // to a given Form. |
| 18 | type Iter struct { |
| 19 | rb reorderBuffer |
| 20 | buf [maxByteBufferSize]byte |
| 21 | info Properties // first character saved from previous iteration |
| 22 | next iterFunc // implementation of next depends on form |
| 23 | asciiF iterFunc |
| 24 | |
| 25 | p int // current position in input source |
| 26 | multiSeg []byte // remainder of multi-segment decomposition |
| 27 | } |
| 28 | |
| 29 | type iterFunc func(*Iter) []byte |
| 30 | |
| 31 | // Init initializes i to iterate over src after normalizing it to Form f. |
| 32 | func (i *Iter) Init(f Form, src []byte) { |
| 33 | i.p = 0 |
| 34 | if len(src) == 0 { |
| 35 | i.setDone() |
| 36 | i.rb.nsrc = 0 |
| 37 | return |
| 38 | } |
| 39 | i.multiSeg = nil |
| 40 | i.rb.init(f, src) |
| 41 | i.next = i.rb.f.nextMain |
| 42 | i.asciiF = nextASCIIBytes |
| 43 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 44 | i.rb.ss.first(i.info) |
| 45 | } |
| 46 | |
| 47 | // InitString initializes i to iterate over src after normalizing it to Form f. |
| 48 | func (i *Iter) InitString(f Form, src string) { |
| 49 | i.p = 0 |
| 50 | if len(src) == 0 { |
| 51 | i.setDone() |
| 52 | i.rb.nsrc = 0 |
| 53 | return |
| 54 | } |
| 55 | i.multiSeg = nil |
| 56 | i.rb.initString(f, src) |
| 57 | i.next = i.rb.f.nextMain |
| 58 | i.asciiF = nextASCIIString |
| 59 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 60 | i.rb.ss.first(i.info) |
| 61 | } |
| 62 | |
| 63 | // Seek sets the segment to be returned by the next call to Next to start |
| 64 | // at position p. It is the responsibility of the caller to set p to the |
| 65 | // start of a segment. |
| 66 | func (i *Iter) Seek(offset int64, whence int) (int64, error) { |
| 67 | var abs int64 |
| 68 | switch whence { |
| 69 | case 0: |
| 70 | abs = offset |
| 71 | case 1: |
| 72 | abs = int64(i.p) + offset |
| 73 | case 2: |
| 74 | abs = int64(i.rb.nsrc) + offset |
| 75 | default: |
| 76 | return 0, fmt.Errorf("norm: invalid whence") |
| 77 | } |
| 78 | if abs < 0 { |
| 79 | return 0, fmt.Errorf("norm: negative position") |
| 80 | } |
| 81 | if int(abs) >= i.rb.nsrc { |
| 82 | i.setDone() |
| 83 | return int64(i.p), nil |
| 84 | } |
| 85 | i.p = int(abs) |
| 86 | i.multiSeg = nil |
| 87 | i.next = i.rb.f.nextMain |
| 88 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 89 | i.rb.ss.first(i.info) |
| 90 | return abs, nil |
| 91 | } |
| 92 | |
| 93 | // returnSlice returns a slice of the underlying input type as a byte slice. |
| 94 | // If the underlying is of type []byte, it will simply return a slice. |
| 95 | // If the underlying is of type string, it will copy the slice to the buffer |
| 96 | // and return that. |
| 97 | func (i *Iter) returnSlice(a, b int) []byte { |
| 98 | if i.rb.src.bytes == nil { |
| 99 | return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] |
| 100 | } |
| 101 | return i.rb.src.bytes[a:b] |
| 102 | } |
| 103 | |
| 104 | // Pos returns the byte position at which the next call to Next will commence processing. |
| 105 | func (i *Iter) Pos() int { |
| 106 | return i.p |
| 107 | } |
| 108 | |
| 109 | func (i *Iter) setDone() { |
| 110 | i.next = nextDone |
| 111 | i.p = i.rb.nsrc |
| 112 | } |
| 113 | |
| 114 | // Done returns true if there is no more input to process. |
| 115 | func (i *Iter) Done() bool { |
| 116 | return i.p >= i.rb.nsrc |
| 117 | } |
| 118 | |
| 119 | // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. |
| 120 | // For any input a and b for which f(a) == f(b), subsequent calls |
| 121 | // to Next will return the same segments. |
| 122 | // Modifying runes are grouped together with the preceding starter, if such a starter exists. |
| 123 | // Although not guaranteed, n will typically be the smallest possible n. |
| 124 | func (i *Iter) Next() []byte { |
| 125 | return i.next(i) |
| 126 | } |
| 127 | |
| 128 | func nextASCIIBytes(i *Iter) []byte { |
| 129 | p := i.p + 1 |
| 130 | if p >= i.rb.nsrc { |
| 131 | p0 := i.p |
| 132 | i.setDone() |
| 133 | return i.rb.src.bytes[p0:p] |
| 134 | } |
| 135 | if i.rb.src.bytes[p] < utf8.RuneSelf { |
| 136 | p0 := i.p |
| 137 | i.p = p |
| 138 | return i.rb.src.bytes[p0:p] |
| 139 | } |
| 140 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 141 | i.next = i.rb.f.nextMain |
| 142 | return i.next(i) |
| 143 | } |
| 144 | |
| 145 | func nextASCIIString(i *Iter) []byte { |
| 146 | p := i.p + 1 |
| 147 | if p >= i.rb.nsrc { |
| 148 | i.buf[0] = i.rb.src.str[i.p] |
| 149 | i.setDone() |
| 150 | return i.buf[:1] |
| 151 | } |
| 152 | if i.rb.src.str[p] < utf8.RuneSelf { |
| 153 | i.buf[0] = i.rb.src.str[i.p] |
| 154 | i.p = p |
| 155 | return i.buf[:1] |
| 156 | } |
| 157 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 158 | i.next = i.rb.f.nextMain |
| 159 | return i.next(i) |
| 160 | } |
| 161 | |
| 162 | func nextHangul(i *Iter) []byte { |
| 163 | p := i.p |
| 164 | next := p + hangulUTF8Size |
| 165 | if next >= i.rb.nsrc { |
| 166 | i.setDone() |
| 167 | } else if i.rb.src.hangul(next) == 0 { |
| 168 | i.rb.ss.next(i.info) |
| 169 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 170 | i.next = i.rb.f.nextMain |
| 171 | return i.next(i) |
| 172 | } |
| 173 | i.p = next |
| 174 | return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] |
| 175 | } |
| 176 | |
| 177 | func nextDone(i *Iter) []byte { |
| 178 | return nil |
| 179 | } |
| 180 | |
| 181 | // nextMulti is used for iterating over multi-segment decompositions |
| 182 | // for decomposing normal forms. |
| 183 | func nextMulti(i *Iter) []byte { |
| 184 | j := 0 |
| 185 | d := i.multiSeg |
| 186 | // skip first rune |
| 187 | for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { |
| 188 | } |
| 189 | for j < len(d) { |
| 190 | info := i.rb.f.info(input{bytes: d}, j) |
| 191 | if info.BoundaryBefore() { |
| 192 | i.multiSeg = d[j:] |
| 193 | return d[:j] |
| 194 | } |
| 195 | j += int(info.size) |
| 196 | } |
| 197 | // treat last segment as normal decomposition |
| 198 | i.next = i.rb.f.nextMain |
| 199 | return i.next(i) |
| 200 | } |
| 201 | |
| 202 | // nextMultiNorm is used for iterating over multi-segment decompositions |
| 203 | // for composing normal forms. |
| 204 | func nextMultiNorm(i *Iter) []byte { |
| 205 | j := 0 |
| 206 | d := i.multiSeg |
| 207 | for j < len(d) { |
| 208 | info := i.rb.f.info(input{bytes: d}, j) |
| 209 | if info.BoundaryBefore() { |
| 210 | i.rb.compose() |
| 211 | seg := i.buf[:i.rb.flushCopy(i.buf[:])] |
| 212 | i.rb.insertUnsafe(input{bytes: d}, j, info) |
| 213 | i.multiSeg = d[j+int(info.size):] |
| 214 | return seg |
| 215 | } |
| 216 | i.rb.insertUnsafe(input{bytes: d}, j, info) |
| 217 | j += int(info.size) |
| 218 | } |
| 219 | i.multiSeg = nil |
| 220 | i.next = nextComposed |
| 221 | return doNormComposed(i) |
| 222 | } |
| 223 | |
| 224 | // nextDecomposed is the implementation of Next for forms NFD and NFKD. |
| 225 | func nextDecomposed(i *Iter) (next []byte) { |
| 226 | outp := 0 |
| 227 | inCopyStart, outCopyStart := i.p, 0 |
| 228 | for { |
| 229 | if sz := int(i.info.size); sz <= 1 { |
| 230 | i.rb.ss = 0 |
| 231 | p := i.p |
| 232 | i.p++ // ASCII or illegal byte. Either way, advance by 1. |
| 233 | if i.p >= i.rb.nsrc { |
| 234 | i.setDone() |
| 235 | return i.returnSlice(p, i.p) |
| 236 | } else if i.rb.src._byte(i.p) < utf8.RuneSelf { |
| 237 | i.next = i.asciiF |
| 238 | return i.returnSlice(p, i.p) |
| 239 | } |
| 240 | outp++ |
| 241 | } else if d := i.info.Decomposition(); d != nil { |
| 242 | // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero. |
| 243 | // Case 1: there is a leftover to copy. In this case the decomposition |
| 244 | // must begin with a modifier and should always be appended. |
| 245 | // Case 2: no leftover. Simply return d if followed by a ccc == 0 value. |
| 246 | p := outp + len(d) |
| 247 | if outp > 0 { |
| 248 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
| 249 | // TODO: this condition should not be possible, but we leave it |
| 250 | // in for defensive purposes. |
| 251 | if p > len(i.buf) { |
| 252 | return i.buf[:outp] |
| 253 | } |
| 254 | } else if i.info.multiSegment() { |
| 255 | // outp must be 0 as multi-segment decompositions always |
| 256 | // start a new segment. |
| 257 | if i.multiSeg == nil { |
| 258 | i.multiSeg = d |
| 259 | i.next = nextMulti |
| 260 | return nextMulti(i) |
| 261 | } |
| 262 | // We are in the last segment. Treat as normal decomposition. |
| 263 | d = i.multiSeg |
| 264 | i.multiSeg = nil |
| 265 | p = len(d) |
| 266 | } |
| 267 | prevCC := i.info.tccc |
| 268 | if i.p += sz; i.p >= i.rb.nsrc { |
| 269 | i.setDone() |
| 270 | i.info = Properties{} // Force BoundaryBefore to succeed. |
| 271 | } else { |
| 272 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 273 | } |
| 274 | switch i.rb.ss.next(i.info) { |
| 275 | case ssOverflow: |
| 276 | i.next = nextCGJDecompose |
| 277 | fallthrough |
| 278 | case ssStarter: |
| 279 | if outp > 0 { |
| 280 | copy(i.buf[outp:], d) |
| 281 | return i.buf[:p] |
| 282 | } |
| 283 | return d |
| 284 | } |
| 285 | copy(i.buf[outp:], d) |
| 286 | outp = p |
| 287 | inCopyStart, outCopyStart = i.p, outp |
| 288 | if i.info.ccc < prevCC { |
| 289 | goto doNorm |
| 290 | } |
| 291 | continue |
| 292 | } else if r := i.rb.src.hangul(i.p); r != 0 { |
| 293 | outp = decomposeHangul(i.buf[:], r) |
| 294 | i.p += hangulUTF8Size |
| 295 | inCopyStart, outCopyStart = i.p, outp |
| 296 | if i.p >= i.rb.nsrc { |
| 297 | i.setDone() |
| 298 | break |
| 299 | } else if i.rb.src.hangul(i.p) != 0 { |
| 300 | i.next = nextHangul |
| 301 | return i.buf[:outp] |
| 302 | } |
| 303 | } else { |
| 304 | p := outp + sz |
| 305 | if p > len(i.buf) { |
| 306 | break |
| 307 | } |
| 308 | outp = p |
| 309 | i.p += sz |
| 310 | } |
| 311 | if i.p >= i.rb.nsrc { |
| 312 | i.setDone() |
| 313 | break |
| 314 | } |
| 315 | prevCC := i.info.tccc |
| 316 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 317 | if v := i.rb.ss.next(i.info); v == ssStarter { |
| 318 | break |
| 319 | } else if v == ssOverflow { |
| 320 | i.next = nextCGJDecompose |
| 321 | break |
| 322 | } |
| 323 | if i.info.ccc < prevCC { |
| 324 | goto doNorm |
| 325 | } |
| 326 | } |
| 327 | if outCopyStart == 0 { |
| 328 | return i.returnSlice(inCopyStart, i.p) |
| 329 | } else if inCopyStart < i.p { |
| 330 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
| 331 | } |
| 332 | return i.buf[:outp] |
| 333 | doNorm: |
| 334 | // Insert what we have decomposed so far in the reorderBuffer. |
| 335 | // As we will only reorder, there will always be enough room. |
| 336 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
| 337 | i.rb.insertDecomposed(i.buf[0:outp]) |
| 338 | return doNormDecomposed(i) |
| 339 | } |
| 340 | |
| 341 | func doNormDecomposed(i *Iter) []byte { |
| 342 | for { |
| 343 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
| 344 | if i.p += int(i.info.size); i.p >= i.rb.nsrc { |
| 345 | i.setDone() |
| 346 | break |
| 347 | } |
| 348 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 349 | if i.info.ccc == 0 { |
| 350 | break |
| 351 | } |
| 352 | if s := i.rb.ss.next(i.info); s == ssOverflow { |
| 353 | i.next = nextCGJDecompose |
| 354 | break |
| 355 | } |
| 356 | } |
| 357 | // new segment or too many combining characters: exit normalization |
| 358 | return i.buf[:i.rb.flushCopy(i.buf[:])] |
| 359 | } |
| 360 | |
| 361 | func nextCGJDecompose(i *Iter) []byte { |
| 362 | i.rb.ss = 0 |
| 363 | i.rb.insertCGJ() |
| 364 | i.next = nextDecomposed |
| 365 | i.rb.ss.first(i.info) |
| 366 | buf := doNormDecomposed(i) |
| 367 | return buf |
| 368 | } |
| 369 | |
| 370 | // nextComposed is the implementation of Next for forms NFC and NFKC. |
| 371 | func nextComposed(i *Iter) []byte { |
| 372 | outp, startp := 0, i.p |
| 373 | var prevCC uint8 |
| 374 | for { |
| 375 | if !i.info.isYesC() { |
| 376 | goto doNorm |
| 377 | } |
| 378 | prevCC = i.info.tccc |
| 379 | sz := int(i.info.size) |
| 380 | if sz == 0 { |
| 381 | sz = 1 // illegal rune: copy byte-by-byte |
| 382 | } |
| 383 | p := outp + sz |
| 384 | if p > len(i.buf) { |
| 385 | break |
| 386 | } |
| 387 | outp = p |
| 388 | i.p += sz |
| 389 | if i.p >= i.rb.nsrc { |
| 390 | i.setDone() |
| 391 | break |
| 392 | } else if i.rb.src._byte(i.p) < utf8.RuneSelf { |
| 393 | i.rb.ss = 0 |
| 394 | i.next = i.asciiF |
| 395 | break |
| 396 | } |
| 397 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 398 | if v := i.rb.ss.next(i.info); v == ssStarter { |
| 399 | break |
| 400 | } else if v == ssOverflow { |
| 401 | i.next = nextCGJCompose |
| 402 | break |
| 403 | } |
| 404 | if i.info.ccc < prevCC { |
| 405 | goto doNorm |
| 406 | } |
| 407 | } |
| 408 | return i.returnSlice(startp, i.p) |
| 409 | doNorm: |
| 410 | // reset to start position |
| 411 | i.p = startp |
| 412 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 413 | i.rb.ss.first(i.info) |
| 414 | if i.info.multiSegment() { |
| 415 | d := i.info.Decomposition() |
| 416 | info := i.rb.f.info(input{bytes: d}, 0) |
| 417 | i.rb.insertUnsafe(input{bytes: d}, 0, info) |
| 418 | i.multiSeg = d[int(info.size):] |
| 419 | i.next = nextMultiNorm |
| 420 | return nextMultiNorm(i) |
| 421 | } |
| 422 | i.rb.ss.first(i.info) |
| 423 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
| 424 | return doNormComposed(i) |
| 425 | } |
| 426 | |
| 427 | func doNormComposed(i *Iter) []byte { |
| 428 | // First rune should already be inserted. |
| 429 | for { |
| 430 | if i.p += int(i.info.size); i.p >= i.rb.nsrc { |
| 431 | i.setDone() |
| 432 | break |
| 433 | } |
| 434 | i.info = i.rb.f.info(i.rb.src, i.p) |
| 435 | if s := i.rb.ss.next(i.info); s == ssStarter { |
| 436 | break |
| 437 | } else if s == ssOverflow { |
| 438 | i.next = nextCGJCompose |
| 439 | break |
| 440 | } |
| 441 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
| 442 | } |
| 443 | i.rb.compose() |
| 444 | seg := i.buf[:i.rb.flushCopy(i.buf[:])] |
| 445 | return seg |
| 446 | } |
| 447 | |
| 448 | func nextCGJCompose(i *Iter) []byte { |
| 449 | i.rb.ss = 0 // instead of first |
| 450 | i.rb.insertCGJ() |
| 451 | i.next = nextComposed |
| 452 | // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter, |
| 453 | // even if they are not. This is particularly dubious for U+FF9E and UFF9A. |
| 454 | // If we ever change that, insert a check here. |
| 455 | i.rb.ss.first(i.info) |
| 456 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
| 457 | return doNormComposed(i) |
| 458 | } |