blob: 4f837372ca2ff30a0d26aab8fcfbdde6e2c047d1 [file] [log] [blame]
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

5package norm
6
7import (
8 "bytes"
9 "flag"
10 "fmt"
11 "io"
12 "log"
13 "strings"
14 "testing"
15 "unicode/utf8"
16
17 "golang.org/x/text/internal/testtext"
18 "golang.org/x/text/transform"
19)
20
var (
	// testn selects a single test case by index in runAppendTests;
	// the default of -1 runs all cases.
	testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
)
24
// pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
func pc(s string) []byte {
	var out bytes.Buffer
	out.Grow(len(s))
	i := 0
	for i < len(s) {
		r, sz := utf8.DecodeRuneInString(s[i:])
		// Count the run length of the rune starting at i.
		count := 0
		if sz == 1 {
			// Count raw bytes so runs of invalid UTF-8 are compacted too.
			for j := i; j < len(s) && s[j] == s[i]; j++ {
				count++
			}
		} else {
			for _, r2 := range s[i:] {
				if r2 != r {
					break
				}
				count++
			}
		}
		out.WriteString(s[i : i+sz])
		if count > 1 {
			fmt.Fprintf(&out, "{%d}", count)
		}
		i += sz * count
	}
	return out.Bytes()
}
51
// pidx finds the index from which two strings start to differ, plus context.
// It returns the index and ellipsis if the index is greater than 0.
func pidx(a, b string) (i int, prefix string) {
	limit := len(a)
	if len(b) < limit {
		limit = len(b)
	}
	for i < limit && a[i] == b[i] {
		i++
	}
	if i < 8 {
		return 0, ""
	}
	i -= 3 // ensure taking at least one full rune before the difference.
	for lo := i - 7; i > lo; i-- {
		if utf8.RuneStart(a[i]) {
			break
		}
	}
	return i, "..."
}
65
// PositionTest is a shared test case for functions that report a byte
// position in their input and, optionally, produce buffered output.
type PositionTest struct {
	input  string
	pos    int    // expected position result
	buffer string // expected contents of reorderBuffer, if applicable
}

// positionFunc runs the function under test on s, using rb as scratch
// state, and returns the reported position and any produced bytes.
type positionFunc func(rb *reorderBuffer, s string) (int, []byte)
73
74func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
75 rb := reorderBuffer{}
76 rb.init(f, nil)
77 for i, test := range tests {
78 rb.reset()
79 rb.src = inputString(test.input)
80 rb.nsrc = len(test.input)
81 pos, out := fn(&rb, test.input)
82 if pos != test.pos {
83 t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
84 }
85 if outs := string(out); outs != test.buffer {
86 k, pfx := pidx(outs, test.buffer)
87 t.Errorf("%s:%d: buffer \nwas %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
88 }
89 }
90}
91
// grave returns a string of n COMBINING GRAVE ACCENT (U+0300) runes.
func grave(n int) string {
	return strings.Repeat("\u0300", n)
}
95
// rep returns the rune r repeated n times as a string.
func rep(r rune, n int) string {
	var sb strings.Builder
	for i := 0; i < n; i++ {
		sb.WriteRune(r)
	}
	return sb.String()
}
99
// segSize is the maximum byte size of a segment buffer.
const segSize = maxByteBufferSize

// cgj is shorthand for the COMBINING GRAPHEME JOINER, which the
// normalizer inserts when a segment's non-starter limit is exceeded.
var cgj = GraphemeJoiner
103
// decomposeSegmentTests drives decomposeSegment via decomposeSegmentF:
// pos is the expected return position and buffer the flushed NFD output.
// NOTE: cases are index-addressed by failure messages; do not reorder.
var decomposeSegmentTests = []PositionTest{
	// illegal runes
	{"\xC2", 0, ""},
	{"\xC0", 1, "\xC0"},
	{"\u00E0\x80", 2, "\u0061\u0300"},
	// starter
	{"a", 1, "a"},
	{"ab", 1, "a"},
	// starter + composing
	{"a\u0300", 3, "a\u0300"},
	{"a\u0300b", 3, "a\u0300"},
	// with decomposition
	{"\u00C0", 2, "A\u0300"},
	{"\u00C0b", 2, "A\u0300"},
	// long
	{grave(31), 60, grave(30) + cgj},
	{"a" + grave(31), 61, "a" + grave(30) + cgj},

	// Stability tests: see http://www.unicode.org/review/pr-29.html.
	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
	{"\u1100\u1161", 6, "\u1100\u1161"},

	// U+04DA MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	// Sequence of decomposing characters that are starters and modifiers.
	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},

	{grave(30), 60, grave(30)},
	// U+FF9E is a starter, but decomposes to U+3099, which is not.
	{grave(30) + "\uff9e", 60, grave(30) + cgj},
	// ends with incomplete UTF-8 encoding
	{"\xCC", 0, ""},
	{"\u0300\xCC", 2, "\u0300"},
}
144
// decomposeSegmentF decomposes the first segment of s under NFD,
// flushing into rb.out, and returns decomposeSegment's position result.
func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
	rb.initString(NFD, s)
	rb.setFlusher(nil, appendFlush)
	p := decomposeSegment(rb, 0, true)
	return p, rb.out
}
151
// TestDecomposeSegment runs decomposeSegmentTests through decomposeSegmentF.
func TestDecomposeSegment(t *testing.T) {
	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
}
155
// firstBoundaryTests: pos is the expected FirstBoundary result; -1 means
// no boundary was found. buffer is unused for these tests.
var firstBoundaryTests = []PositionTest{
	// no boundary
	{"", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	// illegal runes
	{"\xff", 0, ""},
	{"\u0300\xff", 2, ""},
	{"\u0300\xc0\x80\x80", 2, ""},
	// boundaries
	{"a", 0, ""},
	{"\u0300a", 2, ""},
	// Hangul
	{"\u1103\u1161", 0, ""},
	{"\u110B\u1173\u11B7", 0, ""},
	{"\u1161\u110B\u1173\u11B7", 3, ""},
	{"\u1173\u11B7\u1103\u1161", 6, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	{grave(maxNonStarters), 60, ""},
	{grave(maxNonStarters + 1), 60, ""},
}
178
// firstBoundaryF adapts Form.FirstBoundary to the positionFunc signature.
func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.FirstBoundary([]byte(s)), nil
}

// firstBoundaryStringF adapts Form.FirstBoundaryInString to positionFunc.
func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.FirstBoundaryInString(s), nil
}
186
// TestFirstBoundary runs firstBoundaryTests against both the byte and
// string variants of FirstBoundary.
func TestFirstBoundary(t *testing.T) {
	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
}
191
// TestNextBoundary checks NextBoundary and NextBoundaryInString for NFC.
// want is the expected boundary position; -1 means more input is needed,
// which can only be reported when atEOF is false.
func TestNextBoundary(t *testing.T) {
	testCases := []struct {
		input string
		atEOF bool
		want  int
	}{
		// no boundary
		{"", true, 0},
		{"", false, -1},
		{"\u0300", true, 2},
		{"\u0300", false, -1},
		{"\x80\x80", true, 1},
		{"\x80\x80", false, 1},
		// illegal runes
		{"\xff", false, 1},
		{"\u0300\xff", false, 2},
		{"\u0300\xc0\x80\x80", false, 2},
		{"\xc2\x80\x80", false, 2},
		{"\xc2", false, -1},
		{"\xc2", true, 1},
		{"a\u0300\xc2", false, -1},
		{"a\u0300\xc2", true, 3},
		// boundaries
		{"a", true, 1},
		{"a", false, -1},
		{"aa", false, 1},
		{"\u0300", true, 2},
		{"\u0300", false, -1},
		{"\u0300a", false, 2},
		// Hangul
		{"\u1103\u1161", true, 6},
		{"\u1103\u1161", false, -1},
		{"\u110B\u1173\u11B7", false, -1},
		{"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
		{"\u1161\u110B\u1173\u11B7", false, 3},
		{"\u1173\u11B7\u1103\u1161", false, 6},
		// too many combining characters.
		{grave(maxNonStarters - 1), false, -1},
		{grave(maxNonStarters), false, 60},
		{grave(maxNonStarters + 1), false, 60},
	}

	for _, tc := range testCases {
		if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
			t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
		}
		if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
			t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
		}
	}
}
243
// decomposeToLastTests drives decomposeToLastBoundary via decomposeToLast:
// pos is the byte count left in rb.out (the last boundary) and buffer the
// decomposition flushed from the reorder buffer.
var decomposeToLastTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	{"a\u0301\u0635", 5, ""},
	// ends with non-inert starter
	{"a", 0, "a"},
	{"a\u0301a", 3, "a"},
	{"a\u0301\u03B9", 3, "\u03B9"},
	{"a\u0327", 0, "a\u0327"},
	// illegal runes
	{"\xFF", 1, ""},
	{"aa\xFF", 3, ""},
	{"\xC0\x80\x80", 3, ""},
	{"\xCC\x80\x80", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"a\xCC", 2, ""},
	// ends with combining characters
	{"\u0300\u0301", 0, "\u0300\u0301"},
	{"a\u0300\u0301", 0, "a\u0300\u0301"},
	{"a\u0301\u0308", 0, "a\u0301\u0308"},
	{"a\u0308\u0301", 0, "a\u0308\u0301"},
	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
	{"\u00C0", 0, "A\u0300"},
	{"a\u00C0", 1, "A\u0300"},
	// decomposing
	{"a\u0300\u00E0", 3, "a\u0300"},
	// multisegment decompositions (flushes leading segments)
	{"a\u0300\uFDC0", 7, "\u064A"},
	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
	{"\uFDC0" + grave(31), 5, grave(30)},
	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
	// Overflow
	{"\u00E0" + grave(29), 0, "a" + grave(30)},
	{"\u00E0" + grave(30), 2, grave(30)},
	// Hangul
	{"a\u1103", 1, "\u1103"},
	{"a\u110B", 1, "\u110B"},
	{"a\u110B\u1173", 1, "\u110B\u1173"},
	// See comment in composition.go:compBoundaryAfter.
	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
	{"a\uC73C", 1, "\u110B\u1173"},
	{"다음", 3, "\u110B\u1173\u11B7"},
	{"다", 0, "\u1103\u1161"},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
	{"다음음", 6, "\u110B\u1173\u11B7"},
	{"음다다", 6, "\u1103\u1161"},
	// maximized buffer
	{"a" + grave(30), 0, "a" + grave(30)},
	// Buffer overflow
	{"a" + grave(31), 3, grave(30)},
	// weird UTF-8
	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
}
301
// decomposeToLast seeds rb's flush buffer with s, runs
// decomposeToLastBoundary, and returns the bytes remaining in rb.out
// (the position of the last boundary) plus the flushed decomposition.
func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
	rb.setFlusher([]byte(s), appendFlush)
	decomposeToLastBoundary(rb)
	buf := rb.flush(nil)
	return len(rb.out), buf
}
308
// TestDecomposeToLastBoundary runs decomposeToLastTests under NFKC.
func TestDecomposeToLastBoundary(t *testing.T) {
	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
}
312
// lastBoundaryTests: pos is the expected LastBoundary result; -1 means
// no boundary exists in the input. buffer is unused.
var lastBoundaryTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	// ends with non-inert starter
	{"a", 0, ""},
	// illegal runes
	{"\xff", 1, ""},
	{"aa\xff", 3, ""},
	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
	{"\xc0\x80\x80", 3, ""},
	{"\xc0\x80\x80\u0300", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"\xCC", -1, ""},
	{"\xE0\x80", -1, ""},
	{"\xF0\x80\x80", -1, ""},
	{"a\xCC", 0, ""},
	{"\x80\xCC", 1, ""},
	{"\xCC\xCC", 1, ""},
	// ends with combining characters
	{"a\u0300\u0301", 0, ""},
	{"aaaa\u0300\u0301", 3, ""},
	{"\u0300a\u0300\u0301", 2, ""},
	{"\u00C2", 0, ""},
	{"a\u00C2", 1, ""},
	// decomposition may recombine
	{"\u0226", 0, ""},
	// no boundary
	{"", -1, ""},
	{"\u0300\u0301", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	{"\x80\x80\u0301", -1, ""},
	// Hangul
	{"다음", 3, ""},
	{"다", 0, ""},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	// May still be preceded with a non-starter.
	{grave(maxNonStarters), -1, ""},
	// May still need to insert a cgj after the last combiner.
	{grave(maxNonStarters + 1), 2, ""},
	{grave(maxNonStarters + 2), 4, ""},

	{"a" + grave(maxNonStarters-1), 0, ""},
	{"a" + grave(maxNonStarters), 0, ""},
	// May still need to insert a cgj after the last combiner.
	{"a" + grave(maxNonStarters+1), 3, ""},
	{"a" + grave(maxNonStarters+2), 5, ""},
}
365
// lastBoundaryF adapts Form.LastBoundary to the positionFunc signature.
func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.LastBoundary([]byte(s)), nil
}

// TestLastBoundary runs lastBoundaryTests against NFC.
func TestLastBoundary(t *testing.T) {
	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
}
373
// spanTest describes one Span/SpanString call: n is the expected number
// of bytes spanned and err the expected error (nil when the whole input
// is in normal form).
type spanTest struct {
	input string
	atEOF bool
	n     int
	err   error
}
380
// quickSpanTests hold form-independent cases; TestSpan runs them under
// both NFD and NFC.
var quickSpanTests = []spanTest{
	{"", true, 0, nil},
	// starters
	{"a", true, 1, nil},
	{"abc", true, 3, nil},
	{"\u043Eb", true, 3, nil},
	// incomplete last rune.
	{"\xCC", true, 1, nil},
	{"\xCC", false, 0, transform.ErrShortSrc},
	{"a\xCC", true, 2, nil},
	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
	// incorrectly ordered combining characters
	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
	// have a maximum number of combining characters.
	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},

	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
}
411
// quickSpanNFDTests hold NFD-specific Span cases.
var quickSpanNFDTests = []spanTest{
	// needs decomposing
	{"\u00C0", true, 0, transform.ErrEndOfSpan},
	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
	// correctly ordered combining characters
	{"\u0300", true, 2, nil},
	{"ab\u0300", true, 4, nil},
	{"ab\u0300cd", true, 6, nil},
	{"\u0300cd", true, 4, nil},
	{"\u0316\u0300", true, 4, nil},
	{"ab\u0316\u0300", true, 6, nil},
	{"ab\u0316\u0300cd", true, 8, nil},
	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 6, nil},
	{"\u043E\u0308b", true, 5, nil},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 0, transform.ErrEndOfSpan},
}
433
// quickSpanNFCTests hold NFC-specific Span cases.
var quickSpanNFCTests = []spanTest{
	// okay composed
	{"\u00C0", true, 2, nil},
	{"abc\u00C0", true, 5, nil},
	// correctly ordered combining characters
	// TODO: b may combine with modifiers, which is why this fails. We could
	// make a more precise test that actually checks whether the last
	// characters combine. Probably not worth it.
	{"ab\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"\u00C0\u035D", true, 4, nil},
	// we do not special case leading combining characters
	{"\u0300cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 6, nil},
	{"같은", false, 3, transform.ErrShortSrc},
	// We return the start of the violating segment in case of overflow.
	{grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
	{grave(30), true, 0, transform.ErrEndOfSpan},
}
462
// runSpanTests checks both Span (on bytes) and SpanString for every
// case; the String variant is skipped when the Bytes variant fails so a
// single root cause is reported once.
func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
	for i, tc := range testCases {
		s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
		ok := testtext.Run(t, s, func(t *testing.T) {
			n, err := f.Span([]byte(tc.input), tc.atEOF)
			if n != tc.n || err != tc.err {
				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
			}
		})
		if !ok {
			continue // Don't do the String variant if the Bytes variant failed.
		}
		s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
		testtext.Run(t, s, func(t *testing.T) {
			n, err := f.SpanString(tc.input, tc.atEOF)
			if n != tc.n || err != tc.err {
				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
			}
		})
	}
}
484
// TestSpan runs the form-independent quickSpanTests under both NFD and
// NFC, plus each form's specific table.
func TestSpan(t *testing.T) {
	runSpanTests(t, "NFD", NFD, quickSpanTests)
	runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
	runSpanTests(t, "NFC", NFC, quickSpanTests)
	runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
}
491
// isNormalTests encode IsNormal's boolean result in pos: 1 means the
// input is in normal form, 0 means it is not. Applies to all forms.
var isNormalTests = []PositionTest{
	{"", 1, ""},
	// illegal runes
	{"\xff", 1, ""},
	// starters
	{"a", 1, ""},
	{"abc", 1, ""},
	{"\u043Eb", 1, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"ab\u0300\u0316", 0, ""},
	{"ab\u0300\u0316cd", 0, ""},
	{"\u0300\u0316cd", 0, ""},
}
// isNormalNFDTests hold decomposed-form-specific IsNormal cases
// (1 = normal, 0 = not).
var isNormalNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 0, ""},
	// correctly ordered combining characters
	{"\u0300", 1, ""},
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"\u0300cd", 1, ""},
	{"\u0316\u0300", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	{"\u043E\u0308b", 1, ""},
	// Hangul
	{"같은", 0, ""},
}
// isNormalNFCTests hold composed-form-specific IsNormal cases
// (1 = normal, 0 = not).
var isNormalNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 1, ""},
	{"abc\u00C0", 1, ""},
	// need reordering
	{"a\u0300", 0, ""},
	{"a\u0300cd", 0, ""},
	{"a\u0316\u0300", 0, ""},
	{"a\u0316\u0300cd", 0, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 1, ""},
	{"\u0300", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	// Hangul
	{"같은", 1, ""},
}
543
// isNormalNFKXTests apply only to the compatibility forms (NFKC/NFKD).
var isNormalNFKXTests = []PositionTest{
	// Special case.
	{"\u00BC", 0, ""},
}
548
549func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
550 if rb.f.form.IsNormal([]byte(s)) {
551 return 1, nil
552 }
553 return 0, nil
554}
555
556func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
557 if rb.f.form.IsNormalString(s) {
558 return 1, nil
559 }
560 return 0, nil
561}
562
// TestIsNormal checks IsNormal for all four forms, combining the shared
// table with form-specific and compatibility-specific tables.
func TestIsNormal(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
	runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
	runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
	runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
}
575
// TestIsNormalString checks the string variant for NFD and NFC.
func TestIsNormalString(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
}
582
// AppendTest describes one append operation: out is the expected result
// of appending the normalization of right to (already normalized) left.
type AppendTest struct {
	left  string
	right string
	out   string
}

// appendFunc appends the normalization (in form f) of s to out.
type appendFunc func(f Form, out []byte, s string) []byte

// fstr names the forms; indexed by Form.
var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
592
// runNormTests runs fn against each form's test table (normTests is
// indexed by Form, covering NFC through NFKD).
func runNormTests(t *testing.T, name string, fn appendFunc) {
	for f := NFC; f <= NFKD; f++ {
		runAppendTests(t, name, f, fn, normTests[f])
	}
}
598
// runAppendTests runs each AppendTest through fn for form f and then
// bootstraps a cross-check: for every other form g, appending right to
// the g-normalized left must match g.String of the concatenation.
func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
	for i, test := range tests {
		t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
			id := pc(test.left + test.right)
			// Honor the -testn flag: skip every case but the selected index.
			if *testn >= 0 && i != *testn {
				return
			}
			t.Run("fn", func(t *testing.T) {
				out := []byte(test.left)
				have := string(fn(f, out, test.right))
				if len(have) != len(test.out) {
					t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
				}
				if have != test.out {
					k, pf := pidx(have, test.out)
					t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
				}
			})

			// Bootstrap by normalizing input. Ensures that the various variants
			// behave the same.
			for g := NFC; g <= NFKD; g++ {
				if f == g {
					continue
				}
				t.Run(fstr[g], func(t *testing.T) {
					want := g.String(test.left + test.right)
					have := string(fn(g, g.AppendString(nil, test.left), test.right))
					if len(have) != len(want) {
						t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
					}
					if have != want {
						k, pf := pidx(have, want)
						t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
					}
				})
			}
		})
	}
}
639
// normTests is indexed by Form; the order must match the NFC, NFD,
// NFKC, NFKD constant values.
var normTests = [][]AppendTest{
	appendTestsNFC,
	appendTestsNFD,
	appendTestsNFKC,
	appendTestsNFKD,
}
646
// appendTestsNFC holds NFC-specific append cases; cases are
// index-addressed by -testn and failure messages, so do not reorder.
var appendTestsNFC = []AppendTest{
	{"", ascii, ascii},
	{"", txt_all, txt_all},
	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},

	// Tests designed for Iter.
	{ // ordering of non-composing combining characters
		"",
		"\u0305\u0316",
		"\u0316\u0305",
	},
	{ // segment overflow
		"",
		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
	},

	{ // Combine across non-blocking non-starters.
		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
		"", "a\u0327\u0325", "\u1e01\u0327",
	},

	{ // Jamo V+T does not combine.
		"",
		"\u1161\u11a8",
		"\u1161\u11a8",
	},

	// Stability tests: see http://www.unicode.org/review/pr-29.html.
	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
	{"", "\u0b47\u0b3e", "\u0b4b"},
	{"", "\u1100\u1161", "\uac00"},

	// U+04DA MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	{ // 0d4a starts a new segment.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
	},

	{ // Split combining characters.
		// TODO: don't insert CGJ before starters.
		"",
		"\u0d46" + strings.Repeat("\u0d3e", 31),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // Split combining characters.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 30),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // https://golang.org/issues/20079
		"",
		"\xeb\u0344",
		"\xeb\u0308\u0301",
	},

	{ // https://golang.org/issues/20079
		"",
		"\uac00" + strings.Repeat("\u0300", 30),
		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
	},

	{ // https://golang.org/issues/20079
		"",
		"\xeb" + strings.Repeat("\u0300", 31),
		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
	},
}
721
// appendTestsNFD is currently empty; NFD behavior is exercised
// indirectly through the cross-form bootstrap in runAppendTests.
var appendTestsNFD = []AppendTest{
	// TODO: Move some of the tests here.
}
725
// appendTestsNFKC holds NFKC-specific append cases, including buffer
// splits, malformed UTF-8, and combining-character overflow handling.
// Cases are index-addressed by -testn and failures; do not reorder.
var appendTestsNFKC = []AppendTest{
	// empty buffers
	{"", "", ""},
	{"a", "", "a"},
	{"", "a", "a"},
	{"", "\u0041\u0307\u0304", "\u01E0"},
	// segment split across buffers
	{"", "a\u0300b", "\u00E0b"},
	{"a", "\u0300b", "\u00E0b"},
	{"a", "\u0300\u0316", "\u00E0\u0316"},
	{"a", "\u0316\u0300", "\u00E0\u0316"},
	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
	{"a\u0300", "\u0327", "\u00E0\u0327"},
	{"a\u0327", "\u0300", "\u00E0\u0327"},
	{"a\u0316", "\u0300", "\u00E0\u0316"},
	{"\u0041\u0307", "\u0304", "\u01E0"},
	// Hangul
	{"", "\u110B\u1173", "\uC73C"},
	{"", "\u1103\u1161", "\uB2E4"},
	{"", "\u110B\u1173\u11B7", "\uC74C"},
	{"", "\u320E", "\x28\uAC00\x29"},
	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
	{"\u1103", "\u1161", "\uB2E4"},
	{"\u110B", "\u1173\u11B7", "\uC74C"},
	{"\u110B\u1173", "\u11B7", "\uC74C"},
	{"\uC73C", "\u11B7", "\uC74C"},
	// UTF-8 encoding split across buffers
	{"a\xCC", "\x80", "\u00E0"},
	{"a\xCC", "\x80b", "\u00E0b"},
	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
	{"a\xCC", "\x80\x80", "\u00E0\x80"},
	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
	// ending in incomplete UTF-8 encoding
	{"", "\xCC", "\xCC"},
	{"a", "\xCC", "a\xCC"},
	{"a", "b\xCC", "ab\xCC"},
	{"\u0226", "\xCC", "\u0226\xCC"},
	// illegal runes
	{"", "\x80", "\x80"},
	{"", "\x80\x80\x80", "\x80\x80\x80"},
	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
	{"", "a\x80", "a\x80"},
	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
	// overflow
	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
	// overflow of combining characters
	{"", grave(34), grave(30) + cgj + grave(4)},
	{"", grave(36), grave(30) + cgj + grave(6)},
	{grave(29), grave(5), grave(30) + cgj + grave(4)},
	{grave(30), grave(4), grave(30) + cgj + grave(4)},
	{grave(30), grave(3), grave(30) + cgj + grave(3)},
	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
	// - First rune has a trailing non-starter.
	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
	//   inserted even when FF9E starts a new segment.
	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
	// - Many non-starter decompositions in a row causing overflow.
	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},

	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},

	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},

	// weird UTF-8
	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},

	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
	// large input.
	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
	{"", "\u0041\u0307\u0304", "\u01E0"},
}
823
// appendTestsNFKD holds NFKD-specific append cases, focusing on
// segment-overflow and CGJ insertion behavior. Do not reorder.
var appendTestsNFKD = []AppendTest{
	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},

	{ // segment overflow on unchanged character
		"",
		"a" + grave(64) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
	},
	{ // segment overflow on unchanged character + start value
		"",
		"a" + grave(98) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
	},
	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
		"",
		"a" + grave(59) + "\u0340",
		"a" + grave(30) + cgj + grave(30),
	},
	{ // segment overflow on non-starter decomposition
		"",
		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after ASCII overflow
		"",
		rep('a', segSize) + grave(32) + "\u0320",
		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
	},
	{ // Jamo overflow
		"",
		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul
		"",
		"\uac00",
		"\u1100\u1161",
	},
	{ // Hangul overflow
		"",
		"\uac00" + grave(32) + "\u0320",
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		"\uac00\uac00" + grave(32) + "\u0320",
		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // start value after cc=0
		"",
		"您您" + grave(34) + "\u0320",
		"您您" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after normalization
		"",
		"\u0300\u0320a" + grave(34) + "\u0320",
		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{
		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
		"",
		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
	},
}
894
// TestAppend checks Form.Append against the per-form tables.
func TestAppend(t *testing.T) {
	runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
		return f.Append(out, []byte(s)...)
	})
}
900
// TestAppendString checks Form.AppendString against the per-form tables.
func TestAppendString(t *testing.T) {
	runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
		return f.AppendString(out, s)
	})
}
906
907func TestBytes(t *testing.T) {
908 runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
909 buf := []byte{}
910 buf = append(buf, out...)
911 buf = append(buf, s...)
912 return f.Bytes(buf)
913 })
914}
915
916func TestString(t *testing.T) {
917 runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
918 outs := string(out) + s
919 return []byte(f.String(outs))
920 })
921}
922
// TestLinking verifies that the normalization tables can be dropped by
// the linker: a program that only references MaxSegmentSize must be
// substantially smaller than one that calls NFC.String.
func TestLinking(t *testing.T) {
	const prog = `
	package main
	import "fmt"
	import "golang.org/x/text/unicode/norm"
	func main() { fmt.Println(norm.%s) }
	`
	baseline, errB := testtext.CodeSize(fmt.Sprintf(prog, "MaxSegmentSize"))
	withTables, errT := testtext.CodeSize(fmt.Sprintf(prog, `NFC.String("")`))
	if errB != nil || errT != nil {
		t.Skipf("code size failed: %v and %v", errB, errT)
	}
	// Tables are at least 50K
	if d := withTables - baseline; d < 50*1024 {
		t.Errorf("tables appear not to be dropped: %d - %d = %d",
			withTables, baseline, d)
	}
}
941
// appendBench returns a closure that appends the normalization of in to
// a pre-sized buffer; the result is deliberately discarded — only the
// normalization work is being measured.
func appendBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		f.Append(buf, in...)
	}
}
948
949func bytesBench(f Form, in []byte) func() {
950 return func() {
951 f.Bytes(in)
952 }
953}
954
955func iterBench(f Form, in []byte) func() {
956 iter := Iter{}
957 return func() {
958 iter.Init(f, in)
959 for !iter.Done() {
960 iter.Next()
961 }
962 }
963}
964
965func transformBench(f Form, in []byte) func() {
966 buf := make([]byte, 4*len(in))
967 return func() {
968 if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
969 log.Panic(n, len(in), err)
970 }
971 }
972}
973
974func readerBench(f Form, in []byte) func() {
975 buf := make([]byte, 4*len(in))
976 return func() {
977 r := f.Reader(bytes.NewReader(in))
978 var err error
979 for err == nil {
980 _, err = r.Read(buf)
981 }
982 if err != io.EOF {
983 panic("")
984 }
985 }
986}
987
988func writerBench(f Form, in []byte) func() {
989 buf := make([]byte, 0, 4*len(in))
990 return func() {
991 r := f.Writer(bytes.NewBuffer(buf))
992 if _, err := r.Write(in); err != nil {
993 panic("")
994 }
995 }
996}
997
998func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
999 bm = append(bm, appendBench(f, in))
1000 bm = append(bm, iterBench(f, in))
1001 bm = append(bm, transformBench(f, in))
1002 bm = append(bm, readerBench(f, in))
1003 bm = append(bm, writerBench(f, in))
1004 return bm
1005}
1006
1007func doFormBenchmark(b *testing.B, inf, f Form, s string) {
1008 b.StopTimer()
1009 in := inf.Bytes([]byte(s))
1010 bm := appendBenchmarks(nil, f, in)
1011 b.SetBytes(int64(len(in) * len(bm)))
1012 b.StartTimer()
1013 for i := 0; i < b.N; i++ {
1014 for _, fn := range bm {
1015 fn()
1016 }
1017 }
1018}
1019
1020func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
1021 b.StopTimer()
1022 fn := f(NFC, s)
1023 b.SetBytes(int64(len(s)))
1024 b.StartTimer()
1025 for i := 0; i < b.N; i++ {
1026 fn()
1027 }
1028}
1029
var (
	// smallNoChange contains a precomposed ö and so is unchanged by NFC;
	// smallChange contains o followed by U+0308 and must be recomposed.
	smallNoChange = []byte("nörmalization")
	smallChange   = []byte("No\u0308rmalization")
	// ascii is a large input that normalization leaves untouched.
	ascii = strings.Repeat("There is nothing to change here! ", 500)
)
1035
1036func lowerBench(f Form, in []byte) func() {
1037 // Use package strings instead of bytes as it doesn't allocate memory
1038 // if there aren't any changes.
1039 s := string(in)
1040 return func() {
1041 strings.ToLower(s)
1042 }
1043}
1044
// BenchmarkLowerCaseNoChange measures strings.ToLower on input it
// leaves unchanged.
func BenchmarkLowerCaseNoChange(b *testing.B) {
	doSingle(b, lowerBench, smallNoChange)
}

// BenchmarkLowerCaseChange measures strings.ToLower on input it must
// rewrite.
func BenchmarkLowerCaseChange(b *testing.B) {
	doSingle(b, lowerBench, smallChange)
}
1051
1052func quickSpanBench(f Form, in []byte) func() {
1053 return func() {
1054 f.QuickSpan(in)
1055 }
1056}
1057
// BenchmarkQuickSpanChangeNFC measures QuickSpan on small input.
// NOTE(review): despite the "Change" in the name, this uses
// smallNoChange — confirm this is intentional.
func BenchmarkQuickSpanChangeNFC(b *testing.B) {
	doSingle(b, quickSpanBench, smallNoChange)
}

// The Bytes benchmarks measure Form.Bytes on input that is already
// normalized (NoChange) or needs rewriting (Change).
func BenchmarkBytesNoChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallNoChange)
}
func BenchmarkBytesChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallChange)
}

// The Append benchmarks measure Form.Append on the same inputs, plus a
// large mixed-script text.
func BenchmarkAppendNoChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallNoChange)
}
func BenchmarkAppendChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallChange)
}
func BenchmarkAppendLargeNFC(b *testing.B) {
	doSingle(b, appendBench, txt_all_bytes)
}

// The Iter benchmarks measure segment iteration with Iter.
func BenchmarkIterNoChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallNoChange)
}
func BenchmarkIterChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallChange)
}
func BenchmarkIterLargeNFC(b *testing.B) {
	doSingle(b, iterBench, txt_all_bytes)
}

// The Transform benchmarks measure the transform.Transformer interface.
func BenchmarkTransformNoChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallNoChange)
}
func BenchmarkTransformChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallChange)
}
func BenchmarkTransformLargeNFC(b *testing.B) {
	doSingle(b, transformBench, txt_all_bytes)
}
1098
// The NormalizeAscii benchmarks normalize ASCII-only input into each of
// the four forms.
func BenchmarkNormalizeAsciiNFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, ascii)
}
func BenchmarkNormalizeAsciiNFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, ascii)
}
func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
	doFormBenchmark(b, NFC, NFKC, ascii)
}
func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
	doFormBenchmark(b, NFC, NFKD, ascii)
}

// The NormalizeX2Y benchmarks convert mixed-script sample text from
// form X to form Y.
func BenchmarkNormalizeNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_all)
}
func BenchmarkNormalizeNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_all)
}
func BenchmarkNormalizeNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_all)
}
func BenchmarkNormalizeNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_all)
}

// Hangul is often special-cased, so we test it separately.
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_kr)
}
func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_kr)
}
func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_kr)
}
func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}
1138
// forms lists all normalization forms exercised by the text benchmarks.
var forms = []Form{NFC, NFD, NFKC, NFKD}
1140
1141func doTextBenchmark(b *testing.B, s string) {
1142 b.StopTimer()
1143 in := []byte(s)
1144 bm := []func(){}
1145 for _, f := range forms {
1146 bm = appendBenchmarks(bm, f, in)
1147 }
1148 b.SetBytes(int64(len(s) * len(bm)))
1149 b.StartTimer()
1150 for i := 0; i < b.N; i++ {
1151 for _, f := range bm {
1152 f()
1153 }
1154 }
1155}
1156
// The benchmarks below run the full API for all four forms over sample
// texts covering different scripts and UTF-8 encoding lengths.
func BenchmarkCanonicalOrdering(b *testing.B) {
	doTextBenchmark(b, txt_canon)
}
func BenchmarkExtendedLatin(b *testing.B) {
	doTextBenchmark(b, txt_vn)
}
func BenchmarkMiscTwoByteUtf8(b *testing.B) {
	doTextBenchmark(b, twoByteUtf8)
}
func BenchmarkMiscThreeByteUtf8(b *testing.B) {
	doTextBenchmark(b, threeByteUtf8)
}
func BenchmarkHangul(b *testing.B) {
	doTextBenchmark(b, txt_kr)
}
func BenchmarkJapanese(b *testing.B) {
	doTextBenchmark(b, txt_jp)
}
func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}

// BenchmarkOverflow exercises an input consisting of an extremely long
// run of combining characters.
func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}
1181
// overflow is 4096 copies of a combining mark (U+035D) followed by one
// more (U+035B) — presumably far exceeding the reorder buffer's
// capacity; used by BenchmarkOverflow.
var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"
1183
// Tests sampled from the Canonical ordering tests (Part 2) of
// http://unicode.org/Public/UNIDATA/NormalizationTest.txt
// NOTE(review): this is a raw (backquoted) string, so the \uXXXX
// sequences are literal backslash text, not decoded runes — confirm the
// consumer expands them (or that benchmarking the escaped form is
// intended).
const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
\u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
\u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
\u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062
\u0061\u059A\u0316\u302A\u0339 \u0061\u0341\u0315\u0300\u05AE\u0062
\u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
\u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
\u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
\u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
\u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
\u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
\u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
\u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
\u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
\u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
\u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
\u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
\u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`
1204
// txt_vn is Vietnamese sample text: Latin script with many diacritics.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
1216
// txt_ru is Cyrillic (plus some Greek) sample text.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`
1224
// txt_gr is Greek sample text.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`
1233
// txt_ar is Arabic sample text.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`
1241
// txt_il is Hebrew sample text.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`

// twoByteUtf8 combines samples whose characters mostly encode as two
// bytes in UTF-8.
const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il
1250
// txt_kr is Korean (Hangul) sample text.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다).
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`
1258
// txt_th is Thai sample text.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`

// threeByteUtf8 holds text whose characters mostly encode as three
// bytes in UTF-8.
const threeByteUtf8 = txt_th
1268
// txt_jp is Japanese sample text.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`
1275
// txt_cn is Chinese sample text.
// http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`

// txt_cjk and txt_all aggregate the samples above; txt_all covers all
// scripts and UTF-8 encoding lengths used by the benchmarks.
const txt_cjk = txt_cn + txt_jp + txt_kr
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk

// txt_all_bytes is txt_all as a byte slice, for the []byte-based APIs.
var txt_all_bytes = []byte(txt_all)