blob: ce17f96c2e0caa84271b430e7afcf5975c800511 [file] [log] [blame]
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5package norm
6
7import (
8 "fmt"
9 "unicode/utf8"
10)
11
12// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
13// sequence of starter and non-starter runes for the purpose of normalization.
14const MaxSegmentSize = maxByteBufferSize
15
16// An Iter iterates over a string or byte slice, while normalizing it
17// to a given Form.
18type Iter struct {
19 rb reorderBuffer
20 buf [maxByteBufferSize]byte
21 info Properties // first character saved from previous iteration
22 next iterFunc // implementation of next depends on form
23 asciiF iterFunc
24
25 p int // current position in input source
26 multiSeg []byte // remainder of multi-segment decomposition
27}
28
29type iterFunc func(*Iter) []byte
30
31// Init initializes i to iterate over src after normalizing it to Form f.
32func (i *Iter) Init(f Form, src []byte) {
33 i.p = 0
34 if len(src) == 0 {
35 i.setDone()
36 i.rb.nsrc = 0
37 return
38 }
39 i.multiSeg = nil
40 i.rb.init(f, src)
41 i.next = i.rb.f.nextMain
42 i.asciiF = nextASCIIBytes
43 i.info = i.rb.f.info(i.rb.src, i.p)
44 i.rb.ss.first(i.info)
45}
46
47// InitString initializes i to iterate over src after normalizing it to Form f.
48func (i *Iter) InitString(f Form, src string) {
49 i.p = 0
50 if len(src) == 0 {
51 i.setDone()
52 i.rb.nsrc = 0
53 return
54 }
55 i.multiSeg = nil
56 i.rb.initString(f, src)
57 i.next = i.rb.f.nextMain
58 i.asciiF = nextASCIIString
59 i.info = i.rb.f.info(i.rb.src, i.p)
60 i.rb.ss.first(i.info)
61}
62
63// Seek sets the segment to be returned by the next call to Next to start
64// at position p. It is the responsibility of the caller to set p to the
65// start of a segment.
66func (i *Iter) Seek(offset int64, whence int) (int64, error) {
67 var abs int64
68 switch whence {
69 case 0:
70 abs = offset
71 case 1:
72 abs = int64(i.p) + offset
73 case 2:
74 abs = int64(i.rb.nsrc) + offset
75 default:
76 return 0, fmt.Errorf("norm: invalid whence")
77 }
78 if abs < 0 {
79 return 0, fmt.Errorf("norm: negative position")
80 }
81 if int(abs) >= i.rb.nsrc {
82 i.setDone()
83 return int64(i.p), nil
84 }
85 i.p = int(abs)
86 i.multiSeg = nil
87 i.next = i.rb.f.nextMain
88 i.info = i.rb.f.info(i.rb.src, i.p)
89 i.rb.ss.first(i.info)
90 return abs, nil
91}
92
93// returnSlice returns a slice of the underlying input type as a byte slice.
94// If the underlying is of type []byte, it will simply return a slice.
95// If the underlying is of type string, it will copy the slice to the buffer
96// and return that.
97func (i *Iter) returnSlice(a, b int) []byte {
98 if i.rb.src.bytes == nil {
99 return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
100 }
101 return i.rb.src.bytes[a:b]
102}
103
104// Pos returns the byte position at which the next call to Next will commence processing.
105func (i *Iter) Pos() int {
106 return i.p
107}
108
109func (i *Iter) setDone() {
110 i.next = nextDone
111 i.p = i.rb.nsrc
112}
113
114// Done returns true if there is no more input to process.
115func (i *Iter) Done() bool {
116 return i.p >= i.rb.nsrc
117}
118
119// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
120// For any input a and b for which f(a) == f(b), subsequent calls
121// to Next will return the same segments.
122// Modifying runes are grouped together with the preceding starter, if such a starter exists.
123// Although not guaranteed, n will typically be the smallest possible n.
124func (i *Iter) Next() []byte {
125 return i.next(i)
126}
127
128func nextASCIIBytes(i *Iter) []byte {
129 p := i.p + 1
130 if p >= i.rb.nsrc {
Don Newton98fd8812019-09-23 15:15:02 -0400131 i.setDone()
Don Newtone0d34a82019-11-14 10:58:06 -0500132 return i.rb.src.bytes[i.p:p]
Don Newton98fd8812019-09-23 15:15:02 -0400133 }
134 if i.rb.src.bytes[p] < utf8.RuneSelf {
135 p0 := i.p
136 i.p = p
137 return i.rb.src.bytes[p0:p]
138 }
139 i.info = i.rb.f.info(i.rb.src, i.p)
140 i.next = i.rb.f.nextMain
141 return i.next(i)
142}
143
144func nextASCIIString(i *Iter) []byte {
145 p := i.p + 1
146 if p >= i.rb.nsrc {
147 i.buf[0] = i.rb.src.str[i.p]
148 i.setDone()
149 return i.buf[:1]
150 }
151 if i.rb.src.str[p] < utf8.RuneSelf {
152 i.buf[0] = i.rb.src.str[i.p]
153 i.p = p
154 return i.buf[:1]
155 }
156 i.info = i.rb.f.info(i.rb.src, i.p)
157 i.next = i.rb.f.nextMain
158 return i.next(i)
159}
160
161func nextHangul(i *Iter) []byte {
162 p := i.p
163 next := p + hangulUTF8Size
164 if next >= i.rb.nsrc {
165 i.setDone()
166 } else if i.rb.src.hangul(next) == 0 {
167 i.rb.ss.next(i.info)
168 i.info = i.rb.f.info(i.rb.src, i.p)
169 i.next = i.rb.f.nextMain
170 return i.next(i)
171 }
172 i.p = next
173 return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
174}
175
176func nextDone(i *Iter) []byte {
177 return nil
178}
179
180// nextMulti is used for iterating over multi-segment decompositions
181// for decomposing normal forms.
182func nextMulti(i *Iter) []byte {
183 j := 0
184 d := i.multiSeg
185 // skip first rune
186 for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
187 }
188 for j < len(d) {
189 info := i.rb.f.info(input{bytes: d}, j)
190 if info.BoundaryBefore() {
191 i.multiSeg = d[j:]
192 return d[:j]
193 }
194 j += int(info.size)
195 }
196 // treat last segment as normal decomposition
197 i.next = i.rb.f.nextMain
198 return i.next(i)
199}
200
201// nextMultiNorm is used for iterating over multi-segment decompositions
202// for composing normal forms.
203func nextMultiNorm(i *Iter) []byte {
204 j := 0
205 d := i.multiSeg
206 for j < len(d) {
207 info := i.rb.f.info(input{bytes: d}, j)
208 if info.BoundaryBefore() {
209 i.rb.compose()
210 seg := i.buf[:i.rb.flushCopy(i.buf[:])]
211 i.rb.insertUnsafe(input{bytes: d}, j, info)
212 i.multiSeg = d[j+int(info.size):]
213 return seg
214 }
215 i.rb.insertUnsafe(input{bytes: d}, j, info)
216 j += int(info.size)
217 }
218 i.multiSeg = nil
219 i.next = nextComposed
220 return doNormComposed(i)
221}
222
223// nextDecomposed is the implementation of Next for forms NFD and NFKD.
224func nextDecomposed(i *Iter) (next []byte) {
225 outp := 0
226 inCopyStart, outCopyStart := i.p, 0
227 for {
228 if sz := int(i.info.size); sz <= 1 {
229 i.rb.ss = 0
230 p := i.p
231 i.p++ // ASCII or illegal byte. Either way, advance by 1.
232 if i.p >= i.rb.nsrc {
233 i.setDone()
234 return i.returnSlice(p, i.p)
235 } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
236 i.next = i.asciiF
237 return i.returnSlice(p, i.p)
238 }
239 outp++
240 } else if d := i.info.Decomposition(); d != nil {
241 // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
242 // Case 1: there is a leftover to copy. In this case the decomposition
243 // must begin with a modifier and should always be appended.
244 // Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
245 p := outp + len(d)
246 if outp > 0 {
247 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
248 // TODO: this condition should not be possible, but we leave it
249 // in for defensive purposes.
250 if p > len(i.buf) {
251 return i.buf[:outp]
252 }
253 } else if i.info.multiSegment() {
254 // outp must be 0 as multi-segment decompositions always
255 // start a new segment.
256 if i.multiSeg == nil {
257 i.multiSeg = d
258 i.next = nextMulti
259 return nextMulti(i)
260 }
261 // We are in the last segment. Treat as normal decomposition.
262 d = i.multiSeg
263 i.multiSeg = nil
264 p = len(d)
265 }
266 prevCC := i.info.tccc
267 if i.p += sz; i.p >= i.rb.nsrc {
268 i.setDone()
269 i.info = Properties{} // Force BoundaryBefore to succeed.
270 } else {
271 i.info = i.rb.f.info(i.rb.src, i.p)
272 }
273 switch i.rb.ss.next(i.info) {
274 case ssOverflow:
275 i.next = nextCGJDecompose
276 fallthrough
277 case ssStarter:
278 if outp > 0 {
279 copy(i.buf[outp:], d)
280 return i.buf[:p]
281 }
282 return d
283 }
284 copy(i.buf[outp:], d)
285 outp = p
286 inCopyStart, outCopyStart = i.p, outp
287 if i.info.ccc < prevCC {
288 goto doNorm
289 }
290 continue
291 } else if r := i.rb.src.hangul(i.p); r != 0 {
292 outp = decomposeHangul(i.buf[:], r)
293 i.p += hangulUTF8Size
294 inCopyStart, outCopyStart = i.p, outp
295 if i.p >= i.rb.nsrc {
296 i.setDone()
297 break
298 } else if i.rb.src.hangul(i.p) != 0 {
299 i.next = nextHangul
300 return i.buf[:outp]
301 }
302 } else {
303 p := outp + sz
304 if p > len(i.buf) {
305 break
306 }
307 outp = p
308 i.p += sz
309 }
310 if i.p >= i.rb.nsrc {
311 i.setDone()
312 break
313 }
314 prevCC := i.info.tccc
315 i.info = i.rb.f.info(i.rb.src, i.p)
316 if v := i.rb.ss.next(i.info); v == ssStarter {
317 break
318 } else if v == ssOverflow {
319 i.next = nextCGJDecompose
320 break
321 }
322 if i.info.ccc < prevCC {
323 goto doNorm
324 }
325 }
326 if outCopyStart == 0 {
327 return i.returnSlice(inCopyStart, i.p)
328 } else if inCopyStart < i.p {
329 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
330 }
331 return i.buf[:outp]
332doNorm:
333 // Insert what we have decomposed so far in the reorderBuffer.
334 // As we will only reorder, there will always be enough room.
335 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
336 i.rb.insertDecomposed(i.buf[0:outp])
337 return doNormDecomposed(i)
338}
339
340func doNormDecomposed(i *Iter) []byte {
341 for {
342 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
343 if i.p += int(i.info.size); i.p >= i.rb.nsrc {
344 i.setDone()
345 break
346 }
347 i.info = i.rb.f.info(i.rb.src, i.p)
348 if i.info.ccc == 0 {
349 break
350 }
351 if s := i.rb.ss.next(i.info); s == ssOverflow {
352 i.next = nextCGJDecompose
353 break
354 }
355 }
356 // new segment or too many combining characters: exit normalization
357 return i.buf[:i.rb.flushCopy(i.buf[:])]
358}
359
360func nextCGJDecompose(i *Iter) []byte {
361 i.rb.ss = 0
362 i.rb.insertCGJ()
363 i.next = nextDecomposed
364 i.rb.ss.first(i.info)
365 buf := doNormDecomposed(i)
366 return buf
367}
368
369// nextComposed is the implementation of Next for forms NFC and NFKC.
370func nextComposed(i *Iter) []byte {
371 outp, startp := 0, i.p
372 var prevCC uint8
373 for {
374 if !i.info.isYesC() {
375 goto doNorm
376 }
377 prevCC = i.info.tccc
378 sz := int(i.info.size)
379 if sz == 0 {
380 sz = 1 // illegal rune: copy byte-by-byte
381 }
382 p := outp + sz
383 if p > len(i.buf) {
384 break
385 }
386 outp = p
387 i.p += sz
388 if i.p >= i.rb.nsrc {
389 i.setDone()
390 break
391 } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
392 i.rb.ss = 0
393 i.next = i.asciiF
394 break
395 }
396 i.info = i.rb.f.info(i.rb.src, i.p)
397 if v := i.rb.ss.next(i.info); v == ssStarter {
398 break
399 } else if v == ssOverflow {
400 i.next = nextCGJCompose
401 break
402 }
403 if i.info.ccc < prevCC {
404 goto doNorm
405 }
406 }
407 return i.returnSlice(startp, i.p)
408doNorm:
409 // reset to start position
410 i.p = startp
411 i.info = i.rb.f.info(i.rb.src, i.p)
412 i.rb.ss.first(i.info)
413 if i.info.multiSegment() {
414 d := i.info.Decomposition()
415 info := i.rb.f.info(input{bytes: d}, 0)
416 i.rb.insertUnsafe(input{bytes: d}, 0, info)
417 i.multiSeg = d[int(info.size):]
418 i.next = nextMultiNorm
419 return nextMultiNorm(i)
420 }
421 i.rb.ss.first(i.info)
422 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
423 return doNormComposed(i)
424}
425
426func doNormComposed(i *Iter) []byte {
427 // First rune should already be inserted.
428 for {
429 if i.p += int(i.info.size); i.p >= i.rb.nsrc {
430 i.setDone()
431 break
432 }
433 i.info = i.rb.f.info(i.rb.src, i.p)
434 if s := i.rb.ss.next(i.info); s == ssStarter {
435 break
436 } else if s == ssOverflow {
437 i.next = nextCGJCompose
438 break
439 }
440 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
441 }
442 i.rb.compose()
443 seg := i.buf[:i.rb.flushCopy(i.buf[:])]
444 return seg
445}
446
447func nextCGJCompose(i *Iter) []byte {
448 i.rb.ss = 0 // instead of first
449 i.rb.insertCGJ()
450 i.next = nextComposed
451 // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
452 // even if they are not. This is particularly dubious for U+FF9E and UFF9A.
453 // If we ever change that, insert a check here.
454 i.rb.ss.first(i.info)
455 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
456 return doNormComposed(i)
457}