blob: 4747ad07a839c12bf7d40fc68d450c54e4e1afc2 [file] [log] [blame]
khenaidoo5fc5cea2021-08-11 17:39:16 -04001// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Note: the file data_test.go that is generated should not be checked in.
6//go:generate go run maketables.go triegen.go
7//go:generate go test -tags test
8
9// Package norm contains types and functions for normalizing Unicode strings.
10package norm // import "golang.org/x/text/unicode/norm"
11
12import (
13 "unicode/utf8"
14
15 "golang.org/x/text/transform"
16)
17
18// A Form denotes a canonical representation of Unicode code points.
19// The Unicode-defined normalization and equivalence forms are:
20//
Joey Armstrongba3d9d12024-01-15 14:22:11 -050021// NFC Unicode Normalization Form C
22// NFD Unicode Normalization Form D
23// NFKC Unicode Normalization Form KC
24// NFKD Unicode Normalization Form KD
khenaidoo5fc5cea2021-08-11 17:39:16 -040025//
26// For a Form f, this documentation uses the notation f(x) to mean
27// the bytes or string x converted to the given form.
28// A position n in x is called a boundary if conversion to the form can
29// proceed independently on both sides:
Joey Armstrongba3d9d12024-01-15 14:22:11 -050030//
31// f(x) == append(f(x[0:n]), f(x[n:])...)
khenaidoo5fc5cea2021-08-11 17:39:16 -040032//
33// References: https://unicode.org/reports/tr15/ and
34// https://unicode.org/notes/tn5/.
35type Form int
36
37const (
38 NFC Form = iota
39 NFD
40 NFKC
41 NFKD
42)
43
44// Bytes returns f(b). May return b if f(b) = b.
45func (f Form) Bytes(b []byte) []byte {
46 src := inputBytes(b)
47 ft := formTable[f]
48 n, ok := ft.quickSpan(src, 0, len(b), true)
49 if ok {
50 return b
51 }
52 out := make([]byte, n, len(b))
53 copy(out, b[0:n])
54 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
55 return doAppendInner(&rb, n)
56}
57
58// String returns f(s).
59func (f Form) String(s string) string {
60 src := inputString(s)
61 ft := formTable[f]
62 n, ok := ft.quickSpan(src, 0, len(s), true)
63 if ok {
64 return s
65 }
66 out := make([]byte, n, len(s))
67 copy(out, s[0:n])
68 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
69 return string(doAppendInner(&rb, n))
70}
71
72// IsNormal returns true if b == f(b).
73func (f Form) IsNormal(b []byte) bool {
74 src := inputBytes(b)
75 ft := formTable[f]
76 bp, ok := ft.quickSpan(src, 0, len(b), true)
77 if ok {
78 return true
79 }
80 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
81 rb.setFlusher(nil, cmpNormalBytes)
82 for bp < len(b) {
83 rb.out = b[bp:]
84 if bp = decomposeSegment(&rb, bp, true); bp < 0 {
85 return false
86 }
87 bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
88 }
89 return true
90}
91
92func cmpNormalBytes(rb *reorderBuffer) bool {
93 b := rb.out
94 for i := 0; i < rb.nrune; i++ {
95 info := rb.rune[i]
96 if int(info.size) > len(b) {
97 return false
98 }
99 p := info.pos
100 pe := p + info.size
101 for ; p < pe; p++ {
102 if b[0] != rb.byte[p] {
103 return false
104 }
105 b = b[1:]
106 }
107 }
108 return true
109}
110
111// IsNormalString returns true if s == f(s).
112func (f Form) IsNormalString(s string) bool {
113 src := inputString(s)
114 ft := formTable[f]
115 bp, ok := ft.quickSpan(src, 0, len(s), true)
116 if ok {
117 return true
118 }
119 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
120 rb.setFlusher(nil, func(rb *reorderBuffer) bool {
121 for i := 0; i < rb.nrune; i++ {
122 info := rb.rune[i]
123 if bp+int(info.size) > len(s) {
124 return false
125 }
126 p := info.pos
127 pe := p + info.size
128 for ; p < pe; p++ {
129 if s[bp] != rb.byte[p] {
130 return false
131 }
132 bp++
133 }
134 }
135 return true
136 })
137 for bp < len(s) {
138 if bp = decomposeSegment(&rb, bp, true); bp < 0 {
139 return false
140 }
141 bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
142 }
143 return true
144}
145
146// patchTail fixes a case where a rune may be incorrectly normalized
147// if it is followed by illegal continuation bytes. It returns the
148// patched buffer and whether the decomposition is still in progress.
149func patchTail(rb *reorderBuffer) bool {
150 info, p := lastRuneStart(&rb.f, rb.out)
151 if p == -1 || info.size == 0 {
152 return true
153 }
154 end := p + int(info.size)
155 extra := len(rb.out) - end
156 if extra > 0 {
157 // Potentially allocating memory. However, this only
158 // happens with ill-formed UTF-8.
159 x := make([]byte, 0)
160 x = append(x, rb.out[len(rb.out)-extra:]...)
161 rb.out = rb.out[:end]
162 decomposeToLastBoundary(rb)
163 rb.doFlush()
164 rb.out = append(rb.out, x...)
165 return false
166 }
167 buf := rb.out[p:]
168 rb.out = rb.out[:p]
169 decomposeToLastBoundary(rb)
170 if s := rb.ss.next(info); s == ssStarter {
171 rb.doFlush()
172 rb.ss.first(info)
173 } else if s == ssOverflow {
174 rb.doFlush()
175 rb.insertCGJ()
176 rb.ss = 0
177 }
178 rb.insertUnsafe(inputBytes(buf), 0, info)
179 return true
180}
181
182func appendQuick(rb *reorderBuffer, i int) int {
183 if rb.nsrc == i {
184 return i
185 }
186 end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
187 rb.out = rb.src.appendSlice(rb.out, i, end)
188 return end
189}
190
191// Append returns f(append(out, b...)).
192// The buffer out must be nil, empty, or equal to f(out).
193func (f Form) Append(out []byte, src ...byte) []byte {
194 return f.doAppend(out, inputBytes(src), len(src))
195}
196
197func (f Form) doAppend(out []byte, src input, n int) []byte {
198 if n == 0 {
199 return out
200 }
201 ft := formTable[f]
202 // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
203 if len(out) == 0 {
204 p, _ := ft.quickSpan(src, 0, n, true)
205 out = src.appendSlice(out, 0, p)
206 if p == n {
207 return out
208 }
209 rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
210 return doAppendInner(&rb, p)
211 }
212 rb := reorderBuffer{f: *ft, src: src, nsrc: n}
213 return doAppend(&rb, out, 0)
214}
215
216func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
217 rb.setFlusher(out, appendFlush)
218 src, n := rb.src, rb.nsrc
219 doMerge := len(out) > 0
220 if q := src.skipContinuationBytes(p); q > p {
221 // Move leading non-starters to destination.
222 rb.out = src.appendSlice(rb.out, p, q)
223 p = q
224 doMerge = patchTail(rb)
225 }
226 fd := &rb.f
227 if doMerge {
228 var info Properties
229 if p < n {
230 info = fd.info(src, p)
231 if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
232 if p == 0 {
233 decomposeToLastBoundary(rb)
234 }
235 p = decomposeSegment(rb, p, true)
236 }
237 }
238 if info.size == 0 {
239 rb.doFlush()
240 // Append incomplete UTF-8 encoding.
241 return src.appendSlice(rb.out, p, n)
242 }
243 if rb.nrune > 0 {
244 return doAppendInner(rb, p)
245 }
246 }
247 p = appendQuick(rb, p)
248 return doAppendInner(rb, p)
249}
250
251func doAppendInner(rb *reorderBuffer, p int) []byte {
252 for n := rb.nsrc; p < n; {
253 p = decomposeSegment(rb, p, true)
254 p = appendQuick(rb, p)
255 }
256 return rb.out
257}
258
259// AppendString returns f(append(out, []byte(s))).
260// The buffer out must be nil, empty, or equal to f(out).
261func (f Form) AppendString(out []byte, src string) []byte {
262 return f.doAppend(out, inputString(src), len(src))
263}
264
265// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
266// It is not guaranteed to return the largest such n.
267func (f Form) QuickSpan(b []byte) int {
268 n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
269 return n
270}
271
272// Span implements transform.SpanningTransformer. It returns a boundary n such
273// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
274func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
275 n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
276 if n < len(b) {
277 if !ok {
278 err = transform.ErrEndOfSpan
279 } else {
280 err = transform.ErrShortSrc
281 }
282 }
283 return n, err
284}
285
286// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
287// It is not guaranteed to return the largest such n.
288func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
289 n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
290 if n < len(s) {
291 if !ok {
292 err = transform.ErrEndOfSpan
293 } else {
294 err = transform.ErrShortSrc
295 }
296 }
297 return n, err
298}
299
300// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
301// whether any non-normalized parts were found. If atEOF is false, n will
302// not point past the last segment if this segment might be become
303// non-normalized by appending other runes.
304func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
305 var lastCC uint8
306 ss := streamSafe(0)
307 lastSegStart := i
308 for n = end; i < n; {
309 if j := src.skipASCII(i, n); i != j {
310 i = j
311 lastSegStart = i - 1
312 lastCC = 0
313 ss = 0
314 continue
315 }
316 info := f.info(src, i)
317 if info.size == 0 {
318 if atEOF {
319 // include incomplete runes
320 return n, true
321 }
322 return lastSegStart, true
323 }
324 // This block needs to be before the next, because it is possible to
325 // have an overflow for runes that are starters (e.g. with U+FF9E).
326 switch ss.next(info) {
327 case ssStarter:
328 lastSegStart = i
329 case ssOverflow:
330 return lastSegStart, false
331 case ssSuccess:
332 if lastCC > info.ccc {
333 return lastSegStart, false
334 }
335 }
336 if f.composing {
337 if !info.isYesC() {
338 break
339 }
340 } else {
341 if !info.isYesD() {
342 break
343 }
344 }
345 lastCC = info.ccc
346 i += int(info.size)
347 }
348 if i == n {
349 if !atEOF {
350 n = lastSegStart
351 }
352 return n, true
353 }
354 return lastSegStart, false
355}
356
357// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
358// It is not guaranteed to return the largest such n.
359func (f Form) QuickSpanString(s string) int {
360 n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
361 return n
362}
363
364// FirstBoundary returns the position i of the first boundary in b
365// or -1 if b contains no boundary.
366func (f Form) FirstBoundary(b []byte) int {
367 return f.firstBoundary(inputBytes(b), len(b))
368}
369
370func (f Form) firstBoundary(src input, nsrc int) int {
371 i := src.skipContinuationBytes(0)
372 if i >= nsrc {
373 return -1
374 }
375 fd := formTable[f]
376 ss := streamSafe(0)
377 // We should call ss.first here, but we can't as the first rune is
378 // skipped already. This means FirstBoundary can't really determine
379 // CGJ insertion points correctly. Luckily it doesn't have to.
380 for {
381 info := fd.info(src, i)
382 if info.size == 0 {
383 return -1
384 }
385 if s := ss.next(info); s != ssSuccess {
386 return i
387 }
388 i += int(info.size)
389 if i >= nsrc {
390 if !info.BoundaryAfter() && !ss.isMax() {
391 return -1
392 }
393 return nsrc
394 }
395 }
396}
397
398// FirstBoundaryInString returns the position i of the first boundary in s
399// or -1 if s contains no boundary.
400func (f Form) FirstBoundaryInString(s string) int {
401 return f.firstBoundary(inputString(s), len(s))
402}
403
404// NextBoundary reports the index of the boundary between the first and next
405// segment in b or -1 if atEOF is false and there are not enough bytes to
406// determine this boundary.
407func (f Form) NextBoundary(b []byte, atEOF bool) int {
408 return f.nextBoundary(inputBytes(b), len(b), atEOF)
409}
410
411// NextBoundaryInString reports the index of the boundary between the first and
412// next segment in b or -1 if atEOF is false and there are not enough bytes to
413// determine this boundary.
414func (f Form) NextBoundaryInString(s string, atEOF bool) int {
415 return f.nextBoundary(inputString(s), len(s), atEOF)
416}
417
418func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
419 if nsrc == 0 {
420 if atEOF {
421 return 0
422 }
423 return -1
424 }
425 fd := formTable[f]
426 info := fd.info(src, 0)
427 if info.size == 0 {
428 if atEOF {
429 return 1
430 }
431 return -1
432 }
433 ss := streamSafe(0)
434 ss.first(info)
435
436 for i := int(info.size); i < nsrc; i += int(info.size) {
437 info = fd.info(src, i)
438 if info.size == 0 {
439 if atEOF {
440 return i
441 }
442 return -1
443 }
444 // TODO: Using streamSafe to determine the boundary isn't the same as
445 // using BoundaryBefore. Determine which should be used.
446 if s := ss.next(info); s != ssSuccess {
447 return i
448 }
449 }
450 if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
451 return -1
452 }
453 return nsrc
454}
455
456// LastBoundary returns the position i of the last boundary in b
457// or -1 if b contains no boundary.
458func (f Form) LastBoundary(b []byte) int {
459 return lastBoundary(formTable[f], b)
460}
461
462func lastBoundary(fd *formInfo, b []byte) int {
463 i := len(b)
464 info, p := lastRuneStart(fd, b)
465 if p == -1 {
466 return -1
467 }
468 if info.size == 0 { // ends with incomplete rune
469 if p == 0 { // starts with incomplete rune
470 return -1
471 }
472 i = p
473 info, p = lastRuneStart(fd, b[:i])
474 if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
475 return i
476 }
477 }
478 if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
479 return i
480 }
481 if info.BoundaryAfter() {
482 return i
483 }
484 ss := streamSafe(0)
485 v := ss.backwards(info)
486 for i = p; i >= 0 && v != ssStarter; i = p {
487 info, p = lastRuneStart(fd, b[:i])
488 if v = ss.backwards(info); v == ssOverflow {
489 break
490 }
491 if p+int(info.size) != i {
492 if p == -1 { // no boundary found
493 return -1
494 }
495 return i // boundary after an illegal UTF-8 encoding
496 }
497 }
498 return i
499}
500
501// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
502// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
503// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
504func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
505 // Force one character to be consumed.
506 info := rb.f.info(rb.src, sp)
507 if info.size == 0 {
508 return 0
509 }
510 if s := rb.ss.next(info); s == ssStarter {
511 // TODO: this could be removed if we don't support merging.
512 if rb.nrune > 0 {
513 goto end
514 }
515 } else if s == ssOverflow {
516 rb.insertCGJ()
517 goto end
518 }
519 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
520 return int(err)
521 }
522 for {
523 sp += int(info.size)
524 if sp >= rb.nsrc {
525 if !atEOF && !info.BoundaryAfter() {
526 return int(iShortSrc)
527 }
528 break
529 }
530 info = rb.f.info(rb.src, sp)
531 if info.size == 0 {
532 if !atEOF {
533 return int(iShortSrc)
534 }
535 break
536 }
537 if s := rb.ss.next(info); s == ssStarter {
538 break
539 } else if s == ssOverflow {
540 rb.insertCGJ()
541 break
542 }
543 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
544 return int(err)
545 }
546 }
547end:
548 if !rb.doFlush() {
549 return int(iShortDst)
550 }
551 return sp
552}
553
554// lastRuneStart returns the runeInfo and position of the last
555// rune in buf or the zero runeInfo and -1 if no rune was found.
556func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
557 p := len(buf) - 1
558 for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
559 }
560 if p < 0 {
561 return Properties{}, -1
562 }
563 return fd.info(inputBytes(buf), p), p
564}
565
566// decomposeToLastBoundary finds an open segment at the end of the buffer
567// and scans it into rb. Returns the buffer minus the last segment.
568func decomposeToLastBoundary(rb *reorderBuffer) {
569 fd := &rb.f
570 info, i := lastRuneStart(fd, rb.out)
571 if int(info.size) != len(rb.out)-i {
572 // illegal trailing continuation bytes
573 return
574 }
575 if info.BoundaryAfter() {
576 return
577 }
578 var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
579 padd := 0
580 ss := streamSafe(0)
581 p := len(rb.out)
582 for {
583 add[padd] = info
584 v := ss.backwards(info)
585 if v == ssOverflow {
586 // Note that if we have an overflow, it the string we are appending to
587 // is not correctly normalized. In this case the behavior is undefined.
588 break
589 }
590 padd++
591 p -= int(info.size)
592 if v == ssStarter || p < 0 {
593 break
594 }
595 info, i = lastRuneStart(fd, rb.out[:p])
596 if int(info.size) != p-i {
597 break
598 }
599 }
600 rb.ss = ss
601 // Copy bytes for insertion as we may need to overwrite rb.out.
602 var buf [maxBufferSize * utf8.UTFMax]byte
603 cp := buf[:copy(buf[:], rb.out[p:])]
604 rb.out = rb.out[:p]
605 for padd--; padd >= 0; padd-- {
606 info = add[padd]
607 rb.insertUnsafe(inputBytes(cp), 0, info)
608 cp = cp[info.size:]
609 }
610}