blob: 95efcf26e81d7ab5608a42dddaa6b331200c8fbd [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Note: the file data_test.go that is generated should not be checked in.
6//go:generate go run maketables.go triegen.go
7//go:generate go test -tags test
8
9// Package norm contains types and functions for normalizing Unicode strings.
10package norm // import "golang.org/x/text/unicode/norm"
11
12import (
13 "unicode/utf8"
14
15 "golang.org/x/text/transform"
16)
17
18// A Form denotes a canonical representation of Unicode code points.
19// The Unicode-defined normalization and equivalence forms are:
20//
21// NFC Unicode Normalization Form C
22// NFD Unicode Normalization Form D
23// NFKC Unicode Normalization Form KC
24// NFKD Unicode Normalization Form KD
25//
26// For a Form f, this documentation uses the notation f(x) to mean
27// the bytes or string x converted to the given form.
28// A position n in x is called a boundary if conversion to the form can
29// proceed independently on both sides:
30// f(x) == append(f(x[0:n]), f(x[n:])...)
31//
Don Newton7577f072020-01-06 12:41:11 -050032// References: https://unicode.org/reports/tr15/ and
33// https://unicode.org/notes/tn5/.
Don Newton98fd8812019-09-23 15:15:02 -040034type Form int
35
36const (
37 NFC Form = iota
38 NFD
39 NFKC
40 NFKD
41)
42
43// Bytes returns f(b). May return b if f(b) = b.
44func (f Form) Bytes(b []byte) []byte {
45 src := inputBytes(b)
46 ft := formTable[f]
47 n, ok := ft.quickSpan(src, 0, len(b), true)
48 if ok {
49 return b
50 }
51 out := make([]byte, n, len(b))
52 copy(out, b[0:n])
53 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
54 return doAppendInner(&rb, n)
55}
56
57// String returns f(s).
58func (f Form) String(s string) string {
59 src := inputString(s)
60 ft := formTable[f]
61 n, ok := ft.quickSpan(src, 0, len(s), true)
62 if ok {
63 return s
64 }
65 out := make([]byte, n, len(s))
66 copy(out, s[0:n])
67 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
68 return string(doAppendInner(&rb, n))
69}
70
71// IsNormal returns true if b == f(b).
72func (f Form) IsNormal(b []byte) bool {
73 src := inputBytes(b)
74 ft := formTable[f]
75 bp, ok := ft.quickSpan(src, 0, len(b), true)
76 if ok {
77 return true
78 }
79 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
80 rb.setFlusher(nil, cmpNormalBytes)
81 for bp < len(b) {
82 rb.out = b[bp:]
83 if bp = decomposeSegment(&rb, bp, true); bp < 0 {
84 return false
85 }
86 bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
87 }
88 return true
89}
90
91func cmpNormalBytes(rb *reorderBuffer) bool {
92 b := rb.out
93 for i := 0; i < rb.nrune; i++ {
94 info := rb.rune[i]
95 if int(info.size) > len(b) {
96 return false
97 }
98 p := info.pos
99 pe := p + info.size
100 for ; p < pe; p++ {
101 if b[0] != rb.byte[p] {
102 return false
103 }
104 b = b[1:]
105 }
106 }
107 return true
108}
109
110// IsNormalString returns true if s == f(s).
111func (f Form) IsNormalString(s string) bool {
112 src := inputString(s)
113 ft := formTable[f]
114 bp, ok := ft.quickSpan(src, 0, len(s), true)
115 if ok {
116 return true
117 }
118 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
119 rb.setFlusher(nil, func(rb *reorderBuffer) bool {
120 for i := 0; i < rb.nrune; i++ {
121 info := rb.rune[i]
122 if bp+int(info.size) > len(s) {
123 return false
124 }
125 p := info.pos
126 pe := p + info.size
127 for ; p < pe; p++ {
128 if s[bp] != rb.byte[p] {
129 return false
130 }
131 bp++
132 }
133 }
134 return true
135 })
136 for bp < len(s) {
137 if bp = decomposeSegment(&rb, bp, true); bp < 0 {
138 return false
139 }
140 bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
141 }
142 return true
143}
144
145// patchTail fixes a case where a rune may be incorrectly normalized
146// if it is followed by illegal continuation bytes. It returns the
147// patched buffer and whether the decomposition is still in progress.
148func patchTail(rb *reorderBuffer) bool {
149 info, p := lastRuneStart(&rb.f, rb.out)
150 if p == -1 || info.size == 0 {
151 return true
152 }
153 end := p + int(info.size)
154 extra := len(rb.out) - end
155 if extra > 0 {
156 // Potentially allocating memory. However, this only
157 // happens with ill-formed UTF-8.
158 x := make([]byte, 0)
159 x = append(x, rb.out[len(rb.out)-extra:]...)
160 rb.out = rb.out[:end]
161 decomposeToLastBoundary(rb)
162 rb.doFlush()
163 rb.out = append(rb.out, x...)
164 return false
165 }
166 buf := rb.out[p:]
167 rb.out = rb.out[:p]
168 decomposeToLastBoundary(rb)
169 if s := rb.ss.next(info); s == ssStarter {
170 rb.doFlush()
171 rb.ss.first(info)
172 } else if s == ssOverflow {
173 rb.doFlush()
174 rb.insertCGJ()
175 rb.ss = 0
176 }
177 rb.insertUnsafe(inputBytes(buf), 0, info)
178 return true
179}
180
181func appendQuick(rb *reorderBuffer, i int) int {
182 if rb.nsrc == i {
183 return i
184 }
185 end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
186 rb.out = rb.src.appendSlice(rb.out, i, end)
187 return end
188}
189
190// Append returns f(append(out, b...)).
191// The buffer out must be nil, empty, or equal to f(out).
192func (f Form) Append(out []byte, src ...byte) []byte {
193 return f.doAppend(out, inputBytes(src), len(src))
194}
195
196func (f Form) doAppend(out []byte, src input, n int) []byte {
197 if n == 0 {
198 return out
199 }
200 ft := formTable[f]
201 // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
202 if len(out) == 0 {
203 p, _ := ft.quickSpan(src, 0, n, true)
204 out = src.appendSlice(out, 0, p)
205 if p == n {
206 return out
207 }
208 rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
209 return doAppendInner(&rb, p)
210 }
211 rb := reorderBuffer{f: *ft, src: src, nsrc: n}
212 return doAppend(&rb, out, 0)
213}
214
215func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
216 rb.setFlusher(out, appendFlush)
217 src, n := rb.src, rb.nsrc
218 doMerge := len(out) > 0
219 if q := src.skipContinuationBytes(p); q > p {
220 // Move leading non-starters to destination.
221 rb.out = src.appendSlice(rb.out, p, q)
222 p = q
223 doMerge = patchTail(rb)
224 }
225 fd := &rb.f
226 if doMerge {
227 var info Properties
228 if p < n {
229 info = fd.info(src, p)
230 if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
231 if p == 0 {
232 decomposeToLastBoundary(rb)
233 }
234 p = decomposeSegment(rb, p, true)
235 }
236 }
237 if info.size == 0 {
238 rb.doFlush()
239 // Append incomplete UTF-8 encoding.
240 return src.appendSlice(rb.out, p, n)
241 }
242 if rb.nrune > 0 {
243 return doAppendInner(rb, p)
244 }
245 }
246 p = appendQuick(rb, p)
247 return doAppendInner(rb, p)
248}
249
250func doAppendInner(rb *reorderBuffer, p int) []byte {
251 for n := rb.nsrc; p < n; {
252 p = decomposeSegment(rb, p, true)
253 p = appendQuick(rb, p)
254 }
255 return rb.out
256}
257
258// AppendString returns f(append(out, []byte(s))).
259// The buffer out must be nil, empty, or equal to f(out).
260func (f Form) AppendString(out []byte, src string) []byte {
261 return f.doAppend(out, inputString(src), len(src))
262}
263
264// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
265// It is not guaranteed to return the largest such n.
266func (f Form) QuickSpan(b []byte) int {
267 n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
268 return n
269}
270
271// Span implements transform.SpanningTransformer. It returns a boundary n such
272// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
273func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
274 n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
275 if n < len(b) {
276 if !ok {
277 err = transform.ErrEndOfSpan
278 } else {
279 err = transform.ErrShortSrc
280 }
281 }
282 return n, err
283}
284
285// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
286// It is not guaranteed to return the largest such n.
287func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
288 n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
289 if n < len(s) {
290 if !ok {
291 err = transform.ErrEndOfSpan
292 } else {
293 err = transform.ErrShortSrc
294 }
295 }
296 return n, err
297}
298
299// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
300// whether any non-normalized parts were found. If atEOF is false, n will
301// not point past the last segment if this segment might be become
302// non-normalized by appending other runes.
303func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
304 var lastCC uint8
305 ss := streamSafe(0)
306 lastSegStart := i
307 for n = end; i < n; {
308 if j := src.skipASCII(i, n); i != j {
309 i = j
310 lastSegStart = i - 1
311 lastCC = 0
312 ss = 0
313 continue
314 }
315 info := f.info(src, i)
316 if info.size == 0 {
317 if atEOF {
318 // include incomplete runes
319 return n, true
320 }
321 return lastSegStart, true
322 }
323 // This block needs to be before the next, because it is possible to
324 // have an overflow for runes that are starters (e.g. with U+FF9E).
325 switch ss.next(info) {
326 case ssStarter:
327 lastSegStart = i
328 case ssOverflow:
329 return lastSegStart, false
330 case ssSuccess:
331 if lastCC > info.ccc {
332 return lastSegStart, false
333 }
334 }
335 if f.composing {
336 if !info.isYesC() {
337 break
338 }
339 } else {
340 if !info.isYesD() {
341 break
342 }
343 }
344 lastCC = info.ccc
345 i += int(info.size)
346 }
347 if i == n {
348 if !atEOF {
349 n = lastSegStart
350 }
351 return n, true
352 }
353 return lastSegStart, false
354}
355
356// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
357// It is not guaranteed to return the largest such n.
358func (f Form) QuickSpanString(s string) int {
359 n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
360 return n
361}
362
363// FirstBoundary returns the position i of the first boundary in b
364// or -1 if b contains no boundary.
365func (f Form) FirstBoundary(b []byte) int {
366 return f.firstBoundary(inputBytes(b), len(b))
367}
368
369func (f Form) firstBoundary(src input, nsrc int) int {
370 i := src.skipContinuationBytes(0)
371 if i >= nsrc {
372 return -1
373 }
374 fd := formTable[f]
375 ss := streamSafe(0)
376 // We should call ss.first here, but we can't as the first rune is
377 // skipped already. This means FirstBoundary can't really determine
378 // CGJ insertion points correctly. Luckily it doesn't have to.
379 for {
380 info := fd.info(src, i)
381 if info.size == 0 {
382 return -1
383 }
384 if s := ss.next(info); s != ssSuccess {
385 return i
386 }
387 i += int(info.size)
388 if i >= nsrc {
389 if !info.BoundaryAfter() && !ss.isMax() {
390 return -1
391 }
392 return nsrc
393 }
394 }
395}
396
397// FirstBoundaryInString returns the position i of the first boundary in s
398// or -1 if s contains no boundary.
399func (f Form) FirstBoundaryInString(s string) int {
400 return f.firstBoundary(inputString(s), len(s))
401}
402
403// NextBoundary reports the index of the boundary between the first and next
404// segment in b or -1 if atEOF is false and there are not enough bytes to
405// determine this boundary.
406func (f Form) NextBoundary(b []byte, atEOF bool) int {
407 return f.nextBoundary(inputBytes(b), len(b), atEOF)
408}
409
410// NextBoundaryInString reports the index of the boundary between the first and
411// next segment in b or -1 if atEOF is false and there are not enough bytes to
412// determine this boundary.
413func (f Form) NextBoundaryInString(s string, atEOF bool) int {
414 return f.nextBoundary(inputString(s), len(s), atEOF)
415}
416
417func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
418 if nsrc == 0 {
419 if atEOF {
420 return 0
421 }
422 return -1
423 }
424 fd := formTable[f]
425 info := fd.info(src, 0)
426 if info.size == 0 {
427 if atEOF {
428 return 1
429 }
430 return -1
431 }
432 ss := streamSafe(0)
433 ss.first(info)
434
435 for i := int(info.size); i < nsrc; i += int(info.size) {
436 info = fd.info(src, i)
437 if info.size == 0 {
438 if atEOF {
439 return i
440 }
441 return -1
442 }
443 // TODO: Using streamSafe to determine the boundary isn't the same as
444 // using BoundaryBefore. Determine which should be used.
445 if s := ss.next(info); s != ssSuccess {
446 return i
447 }
448 }
449 if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
450 return -1
451 }
452 return nsrc
453}
454
455// LastBoundary returns the position i of the last boundary in b
456// or -1 if b contains no boundary.
457func (f Form) LastBoundary(b []byte) int {
458 return lastBoundary(formTable[f], b)
459}
460
461func lastBoundary(fd *formInfo, b []byte) int {
462 i := len(b)
463 info, p := lastRuneStart(fd, b)
464 if p == -1 {
465 return -1
466 }
467 if info.size == 0 { // ends with incomplete rune
468 if p == 0 { // starts with incomplete rune
469 return -1
470 }
471 i = p
472 info, p = lastRuneStart(fd, b[:i])
473 if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
474 return i
475 }
476 }
477 if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
478 return i
479 }
480 if info.BoundaryAfter() {
481 return i
482 }
483 ss := streamSafe(0)
484 v := ss.backwards(info)
485 for i = p; i >= 0 && v != ssStarter; i = p {
486 info, p = lastRuneStart(fd, b[:i])
487 if v = ss.backwards(info); v == ssOverflow {
488 break
489 }
490 if p+int(info.size) != i {
491 if p == -1 { // no boundary found
492 return -1
493 }
494 return i // boundary after an illegal UTF-8 encoding
495 }
496 }
497 return i
498}
499
500// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
501// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
502// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
503func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
504 // Force one character to be consumed.
505 info := rb.f.info(rb.src, sp)
506 if info.size == 0 {
507 return 0
508 }
509 if s := rb.ss.next(info); s == ssStarter {
510 // TODO: this could be removed if we don't support merging.
511 if rb.nrune > 0 {
512 goto end
513 }
514 } else if s == ssOverflow {
515 rb.insertCGJ()
516 goto end
517 }
518 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
519 return int(err)
520 }
521 for {
522 sp += int(info.size)
523 if sp >= rb.nsrc {
524 if !atEOF && !info.BoundaryAfter() {
525 return int(iShortSrc)
526 }
527 break
528 }
529 info = rb.f.info(rb.src, sp)
530 if info.size == 0 {
531 if !atEOF {
532 return int(iShortSrc)
533 }
534 break
535 }
536 if s := rb.ss.next(info); s == ssStarter {
537 break
538 } else if s == ssOverflow {
539 rb.insertCGJ()
540 break
541 }
542 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
543 return int(err)
544 }
545 }
546end:
547 if !rb.doFlush() {
548 return int(iShortDst)
549 }
550 return sp
551}
552
553// lastRuneStart returns the runeInfo and position of the last
554// rune in buf or the zero runeInfo and -1 if no rune was found.
555func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
556 p := len(buf) - 1
557 for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
558 }
559 if p < 0 {
560 return Properties{}, -1
561 }
562 return fd.info(inputBytes(buf), p), p
563}
564
565// decomposeToLastBoundary finds an open segment at the end of the buffer
566// and scans it into rb. Returns the buffer minus the last segment.
567func decomposeToLastBoundary(rb *reorderBuffer) {
568 fd := &rb.f
569 info, i := lastRuneStart(fd, rb.out)
570 if int(info.size) != len(rb.out)-i {
571 // illegal trailing continuation bytes
572 return
573 }
574 if info.BoundaryAfter() {
575 return
576 }
577 var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
578 padd := 0
579 ss := streamSafe(0)
580 p := len(rb.out)
581 for {
582 add[padd] = info
583 v := ss.backwards(info)
584 if v == ssOverflow {
585 // Note that if we have an overflow, it the string we are appending to
586 // is not correctly normalized. In this case the behavior is undefined.
587 break
588 }
589 padd++
590 p -= int(info.size)
591 if v == ssStarter || p < 0 {
592 break
593 }
594 info, i = lastRuneStart(fd, rb.out[:p])
595 if int(info.size) != p-i {
596 break
597 }
598 }
599 rb.ss = ss
600 // Copy bytes for insertion as we may need to overwrite rb.out.
601 var buf [maxBufferSize * utf8.UTFMax]byte
602 cp := buf[:copy(buf[:], rb.out[p:])]
603 rb.out = rb.out[:p]
604 for padd--; padd >= 0; padd-- {
605 info = add[padd]
606 rb.insertUnsafe(inputBytes(cp), 0, info)
607 cp = cp[info.size:]
608 }
609}