Blame - vendor/golang.org/x/text/unicode/norm/iter.go - voltha-openonu-adapter-go

blob: 417c6b26894da4d3fa68883b6d049fa7a399c569 [file] [log] [blame]

Holger Hildebrandt	fa07499	2020-03-27 15:42:06 +0000	[diff] [blame^]	1	// Copyright 2011 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	package norm
				6
				7	import (
				8	"fmt"
				9	"unicode/utf8"
				10	)
				11
				12	// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
				13	// sequence of starter and non-starter runes for the purpose of normalization.
				14	const MaxSegmentSize = maxByteBufferSize
				15
				16	// An Iter iterates over a string or byte slice, while normalizing it
				17	// to a given Form.
				18	type Iter struct {
				19	rb reorderBuffer
				20	buf [maxByteBufferSize]byte
				21	info Properties // first character saved from previous iteration
				22	next iterFunc // implementation of next depends on form
				23	asciiF iterFunc
				24
				25	p int // current position in input source
				26	multiSeg []byte // remainder of multi-segment decomposition
				27	}
				28
				29	type iterFunc func(*Iter) []byte
				30
				31	// Init initializes i to iterate over src after normalizing it to Form f.
				32	func (i *Iter) Init(f Form, src []byte) {
				33	i.p = 0
				34	if len(src) == 0 {
				35	i.setDone()
				36	i.rb.nsrc = 0
				37	return
				38	}
				39	i.multiSeg = nil
				40	i.rb.init(f, src)
				41	i.next = i.rb.f.nextMain
				42	i.asciiF = nextASCIIBytes
				43	i.info = i.rb.f.info(i.rb.src, i.p)
				44	i.rb.ss.first(i.info)
				45	}
				46
				47	// InitString initializes i to iterate over src after normalizing it to Form f.
				48	func (i *Iter) InitString(f Form, src string) {
				49	i.p = 0
				50	if len(src) == 0 {
				51	i.setDone()
				52	i.rb.nsrc = 0
				53	return
				54	}
				55	i.multiSeg = nil
				56	i.rb.initString(f, src)
				57	i.next = i.rb.f.nextMain
				58	i.asciiF = nextASCIIString
				59	i.info = i.rb.f.info(i.rb.src, i.p)
				60	i.rb.ss.first(i.info)
				61	}
				62
				63	// Seek sets the segment to be returned by the next call to Next to start
				64	// at position p. It is the responsibility of the caller to set p to the
				65	// start of a segment.
				66	func (i *Iter) Seek(offset int64, whence int) (int64, error) {
				67	var abs int64
				68	switch whence {
				69	case 0:
				70	abs = offset
				71	case 1:
				72	abs = int64(i.p) + offset
				73	case 2:
				74	abs = int64(i.rb.nsrc) + offset
				75	default:
				76	return 0, fmt.Errorf("norm: invalid whence")
				77	}
				78	if abs < 0 {
				79	return 0, fmt.Errorf("norm: negative position")
				80	}
				81	if int(abs) >= i.rb.nsrc {
				82	i.setDone()
				83	return int64(i.p), nil
				84	}
				85	i.p = int(abs)
				86	i.multiSeg = nil
				87	i.next = i.rb.f.nextMain
				88	i.info = i.rb.f.info(i.rb.src, i.p)
				89	i.rb.ss.first(i.info)
				90	return abs, nil
				91	}
				92
				93	// returnSlice returns a slice of the underlying input type as a byte slice.
				94	// If the underlying is of type []byte, it will simply return a slice.
				95	// If the underlying is of type string, it will copy the slice to the buffer
				96	// and return that.
				97	func (i *Iter) returnSlice(a, b int) []byte {
				98	if i.rb.src.bytes == nil {
				99	return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
				100	}
				101	return i.rb.src.bytes[a:b]
				102	}
				103
				104	// Pos returns the byte position at which the next call to Next will commence processing.
				105	func (i *Iter) Pos() int {
				106	return i.p
				107	}
				108
				109	func (i *Iter) setDone() {
				110	i.next = nextDone
				111	i.p = i.rb.nsrc
				112	}
				113
				114	// Done returns true if there is no more input to process.
				115	func (i *Iter) Done() bool {
				116	return i.p >= i.rb.nsrc
				117	}
				118
				119	// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
				120	// For any input a and b for which f(a) == f(b), subsequent calls
				121	// to Next will return the same segments.
				122	// Modifying runes are grouped together with the preceding starter, if such a starter exists.
				123	// Although not guaranteed, n will typically be the smallest possible n.
				124	func (i *Iter) Next() []byte {
				125	return i.next(i)
				126	}
				127
				128	func nextASCIIBytes(i *Iter) []byte {
				129	p := i.p + 1
				130	if p >= i.rb.nsrc {
				131	p0 := i.p
				132	i.setDone()
				133	return i.rb.src.bytes[p0:p]
				134	}
				135	if i.rb.src.bytes[p] < utf8.RuneSelf {
				136	p0 := i.p
				137	i.p = p
				138	return i.rb.src.bytes[p0:p]
				139	}
				140	i.info = i.rb.f.info(i.rb.src, i.p)
				141	i.next = i.rb.f.nextMain
				142	return i.next(i)
				143	}
				144
				145	func nextASCIIString(i *Iter) []byte {
				146	p := i.p + 1
				147	if p >= i.rb.nsrc {
				148	i.buf[0] = i.rb.src.str[i.p]
				149	i.setDone()
				150	return i.buf[:1]
				151	}
				152	if i.rb.src.str[p] < utf8.RuneSelf {
				153	i.buf[0] = i.rb.src.str[i.p]
				154	i.p = p
				155	return i.buf[:1]
				156	}
				157	i.info = i.rb.f.info(i.rb.src, i.p)
				158	i.next = i.rb.f.nextMain
				159	return i.next(i)
				160	}
				161
				162	func nextHangul(i *Iter) []byte {
				163	p := i.p
				164	next := p + hangulUTF8Size
				165	if next >= i.rb.nsrc {
				166	i.setDone()
				167	} else if i.rb.src.hangul(next) == 0 {
				168	i.rb.ss.next(i.info)
				169	i.info = i.rb.f.info(i.rb.src, i.p)
				170	i.next = i.rb.f.nextMain
				171	return i.next(i)
				172	}
				173	i.p = next
				174	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
				175	}
				176
				177	func nextDone(i *Iter) []byte {
				178	return nil
				179	}
				180
				181	// nextMulti is used for iterating over multi-segment decompositions
				182	// for decomposing normal forms.
				183	func nextMulti(i *Iter) []byte {
				184	j := 0
				185	d := i.multiSeg
				186	// skip first rune
				187	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
				188	}
				189	for j < len(d) {
				190	info := i.rb.f.info(input{bytes: d}, j)
				191	if info.BoundaryBefore() {
				192	i.multiSeg = d[j:]
				193	return d[:j]
				194	}
				195	j += int(info.size)
				196	}
				197	// treat last segment as normal decomposition
				198	i.next = i.rb.f.nextMain
				199	return i.next(i)
				200	}
				201
				202	// nextMultiNorm is used for iterating over multi-segment decompositions
				203	// for composing normal forms.
				204	func nextMultiNorm(i *Iter) []byte {
				205	j := 0
				206	d := i.multiSeg
				207	for j < len(d) {
				208	info := i.rb.f.info(input{bytes: d}, j)
				209	if info.BoundaryBefore() {
				210	i.rb.compose()
				211	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
				212	i.rb.insertUnsafe(input{bytes: d}, j, info)
				213	i.multiSeg = d[j+int(info.size):]
				214	return seg
				215	}
				216	i.rb.insertUnsafe(input{bytes: d}, j, info)
				217	j += int(info.size)
				218	}
				219	i.multiSeg = nil
				220	i.next = nextComposed
				221	return doNormComposed(i)
				222	}
				223
				224	// nextDecomposed is the implementation of Next for forms NFD and NFKD.
				225	func nextDecomposed(i *Iter) (next []byte) {
				226	outp := 0
				227	inCopyStart, outCopyStart := i.p, 0
				228	for {
				229	if sz := int(i.info.size); sz <= 1 {
				230	i.rb.ss = 0
				231	p := i.p
				232	i.p++ // ASCII or illegal byte. Either way, advance by 1.
				233	if i.p >= i.rb.nsrc {
				234	i.setDone()
				235	return i.returnSlice(p, i.p)
				236	} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				237	i.next = i.asciiF
				238	return i.returnSlice(p, i.p)
				239	}
				240	outp++
				241	} else if d := i.info.Decomposition(); d != nil {
				242	// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
				243	// Case 1: there is a leftover to copy. In this case the decomposition
				244	// must begin with a modifier and should always be appended.
				245	// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
				246	p := outp + len(d)
				247	if outp > 0 {
				248	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				249	// TODO: this condition should not be possible, but we leave it
				250	// in for defensive purposes.
				251	if p > len(i.buf) {
				252	return i.buf[:outp]
				253	}
				254	} else if i.info.multiSegment() {
				255	// outp must be 0 as multi-segment decompositions always
				256	// start a new segment.
				257	if i.multiSeg == nil {
				258	i.multiSeg = d
				259	i.next = nextMulti
				260	return nextMulti(i)
				261	}
				262	// We are in the last segment. Treat as normal decomposition.
				263	d = i.multiSeg
				264	i.multiSeg = nil
				265	p = len(d)
				266	}
				267	prevCC := i.info.tccc
				268	if i.p += sz; i.p >= i.rb.nsrc {
				269	i.setDone()
				270	i.info = Properties{} // Force BoundaryBefore to succeed.
				271	} else {
				272	i.info = i.rb.f.info(i.rb.src, i.p)
				273	}
				274	switch i.rb.ss.next(i.info) {
				275	case ssOverflow:
				276	i.next = nextCGJDecompose
				277	fallthrough
				278	case ssStarter:
				279	if outp > 0 {
				280	copy(i.buf[outp:], d)
				281	return i.buf[:p]
				282	}
				283	return d
				284	}
				285	copy(i.buf[outp:], d)
				286	outp = p
				287	inCopyStart, outCopyStart = i.p, outp
				288	if i.info.ccc < prevCC {
				289	goto doNorm
				290	}
				291	continue
				292	} else if r := i.rb.src.hangul(i.p); r != 0 {
				293	outp = decomposeHangul(i.buf[:], r)
				294	i.p += hangulUTF8Size
				295	inCopyStart, outCopyStart = i.p, outp
				296	if i.p >= i.rb.nsrc {
				297	i.setDone()
				298	break
				299	} else if i.rb.src.hangul(i.p) != 0 {
				300	i.next = nextHangul
				301	return i.buf[:outp]
				302	}
				303	} else {
				304	p := outp + sz
				305	if p > len(i.buf) {
				306	break
				307	}
				308	outp = p
				309	i.p += sz
				310	}
				311	if i.p >= i.rb.nsrc {
				312	i.setDone()
				313	break
				314	}
				315	prevCC := i.info.tccc
				316	i.info = i.rb.f.info(i.rb.src, i.p)
				317	if v := i.rb.ss.next(i.info); v == ssStarter {
				318	break
				319	} else if v == ssOverflow {
				320	i.next = nextCGJDecompose
				321	break
				322	}
				323	if i.info.ccc < prevCC {
				324	goto doNorm
				325	}
				326	}
				327	if outCopyStart == 0 {
				328	return i.returnSlice(inCopyStart, i.p)
				329	} else if inCopyStart < i.p {
				330	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				331	}
				332	return i.buf[:outp]
				333	doNorm:
				334	// Insert what we have decomposed so far in the reorderBuffer.
				335	// As we will only reorder, there will always be enough room.
				336	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				337	i.rb.insertDecomposed(i.buf[0:outp])
				338	return doNormDecomposed(i)
				339	}
				340
				341	func doNormDecomposed(i *Iter) []byte {
				342	for {
				343	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
				344	if i.p += int(i.info.size); i.p >= i.rb.nsrc {
				345	i.setDone()
				346	break
				347	}
				348	i.info = i.rb.f.info(i.rb.src, i.p)
				349	if i.info.ccc == 0 {
				350	break
				351	}
				352	if s := i.rb.ss.next(i.info); s == ssOverflow {
				353	i.next = nextCGJDecompose
				354	break
				355	}
				356	}
				357	// new segment or too many combining characters: exit normalization
				358	return i.buf[:i.rb.flushCopy(i.buf[:])]
				359	}
				360
				361	func nextCGJDecompose(i *Iter) []byte {
				362	i.rb.ss = 0
				363	i.rb.insertCGJ()
				364	i.next = nextDecomposed
				365	i.rb.ss.first(i.info)
				366	buf := doNormDecomposed(i)
				367	return buf
				368	}
				369
				370	// nextComposed is the implementation of Next for forms NFC and NFKC.
				371	func nextComposed(i *Iter) []byte {
				372	outp, startp := 0, i.p
				373	var prevCC uint8
				374	for {
				375	if !i.info.isYesC() {
				376	goto doNorm
				377	}
				378	prevCC = i.info.tccc
				379	sz := int(i.info.size)
				380	if sz == 0 {
				381	sz = 1 // illegal rune: copy byte-by-byte
				382	}
				383	p := outp + sz
				384	if p > len(i.buf) {
				385	break
				386	}
				387	outp = p
				388	i.p += sz
				389	if i.p >= i.rb.nsrc {
				390	i.setDone()
				391	break
				392	} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				393	i.rb.ss = 0
				394	i.next = i.asciiF
				395	break
				396	}
				397	i.info = i.rb.f.info(i.rb.src, i.p)
				398	if v := i.rb.ss.next(i.info); v == ssStarter {
				399	break
				400	} else if v == ssOverflow {
				401	i.next = nextCGJCompose
				402	break
				403	}
				404	if i.info.ccc < prevCC {
				405	goto doNorm
				406	}
				407	}
				408	return i.returnSlice(startp, i.p)
				409	doNorm:
				410	// reset to start position
				411	i.p = startp
				412	i.info = i.rb.f.info(i.rb.src, i.p)
				413	i.rb.ss.first(i.info)
				414	if i.info.multiSegment() {
				415	d := i.info.Decomposition()
				416	info := i.rb.f.info(input{bytes: d}, 0)
				417	i.rb.insertUnsafe(input{bytes: d}, 0, info)
				418	i.multiSeg = d[int(info.size):]
				419	i.next = nextMultiNorm
				420	return nextMultiNorm(i)
				421	}
				422	i.rb.ss.first(i.info)
				423	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
				424	return doNormComposed(i)
				425	}
				426
				427	func doNormComposed(i *Iter) []byte {
				428	// First rune should already be inserted.
				429	for {
				430	if i.p += int(i.info.size); i.p >= i.rb.nsrc {
				431	i.setDone()
				432	break
				433	}
				434	i.info = i.rb.f.info(i.rb.src, i.p)
				435	if s := i.rb.ss.next(i.info); s == ssStarter {
				436	break
				437	} else if s == ssOverflow {
				438	i.next = nextCGJCompose
				439	break
				440	}
				441	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
				442	}
				443	i.rb.compose()
				444	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
				445	return seg
				446	}
				447
				448	func nextCGJCompose(i *Iter) []byte {
				449	i.rb.ss = 0 // instead of first
				450	i.rb.insertCGJ()
				451	i.next = nextComposed
				452	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
				453	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
				454	// If we ever change that, insert a check here.
				455	i.rb.ss.first(i.info)
				456	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
				457	return doNormComposed(i)
				458	}