Blame - vendor/golang.org/x/text/unicode/norm/composition.go - ofagent-go

blob: e2087bce52771ac4fc906f1afa9d27efecbac975 [file] [log] [blame]

Don Newton	98fd881	2019-09-23 15:15:02 -0400	[diff] [blame]	1	// Copyright 2011 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	package norm
				6
				7	import "unicode/utf8"
				8
				9	const (
				10	maxNonStarters = 30
				11	// The maximum number of characters needed for a buffer is
				12	// maxNonStarters + 1 for the starter + 1 for the GCJ
				13	maxBufferSize = maxNonStarters + 2
				14	maxNFCExpansion = 3 // NFC(0x1D160)
				15	maxNFKCExpansion = 18 // NFKC(0xFDFA)
				16
				17	maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
				18	)
				19
				20	// ssState is used for reporting the segment state after inserting a rune.
				21	// It is returned by streamSafe.next.
				22	type ssState int
				23
				24	const (
				25	// Indicates a rune was successfully added to the segment.
				26	ssSuccess ssState = iota
				27	// Indicates a rune starts a new segment and should not be added.
				28	ssStarter
				29	// Indicates a rune caused a segment overflow and a CGJ should be inserted.
				30	ssOverflow
				31	)
				32
				33	// streamSafe implements the policy of when a CGJ should be inserted.
				34	type streamSafe uint8
				35
				36	// first inserts the first rune of a segment. It is a faster version of next if
				37	// it is known p represents the first rune in a segment.
				38	func (ss *streamSafe) first(p Properties) {
				39	*ss = streamSafe(p.nTrailingNonStarters())
				40	}
				41
				42	// insert returns a ssState value to indicate whether a rune represented by p
				43	// can be inserted.
				44	func (ss *streamSafe) next(p Properties) ssState {
				45	if *ss > maxNonStarters {
				46	panic("streamSafe was not reset")
				47	}
				48	n := p.nLeadingNonStarters()
				49	if ss += streamSafe(n); ss > maxNonStarters {
				50	*ss = 0
				51	return ssOverflow
				52	}
				53	// The Stream-Safe Text Processing prescribes that the counting can stop
				54	// as soon as a starter is encountered. However, there are some starters,
				55	// like Jamo V and T, that can combine with other runes, leaving their
				56	// successive non-starters appended to the previous, possibly causing an
				57	// overflow. We will therefore consider any rune with a non-zero nLead to
				58	// be a non-starter. Note that it always hold that if nLead > 0 then
				59	// nLead == nTrail.
				60	if n == 0 {
				61	*ss = streamSafe(p.nTrailingNonStarters())
				62	return ssStarter
				63	}
				64	return ssSuccess
				65	}
				66
				67	// backwards is used for checking for overflow and segment starts
				68	// when traversing a string backwards. Users do not need to call first
				69	// for the first rune. The state of the streamSafe retains the count of
				70	// the non-starters loaded.
				71	func (ss *streamSafe) backwards(p Properties) ssState {
				72	if *ss > maxNonStarters {
				73	panic("streamSafe was not reset")
				74	}
				75	c := *ss + streamSafe(p.nTrailingNonStarters())
				76	if c > maxNonStarters {
				77	return ssOverflow
				78	}
				79	*ss = c
				80	if p.nLeadingNonStarters() == 0 {
				81	return ssStarter
				82	}
				83	return ssSuccess
				84	}
				85
				86	func (ss streamSafe) isMax() bool {
				87	return ss == maxNonStarters
				88	}
				89
				90	// GraphemeJoiner is inserted after maxNonStarters non-starter runes.
				91	const GraphemeJoiner = "\u034F"
				92
				93	// reorderBuffer is used to normalize a single segment. Characters inserted with
				94	// insert are decomposed and reordered based on CCC. The compose method can
				95	// be used to recombine characters. Note that the byte buffer does not hold
				96	// the UTF-8 characters in order. Only the rune array is maintained in sorted
				97	// order. flush writes the resulting segment to a byte array.
				98	type reorderBuffer struct {
				99	rune [maxBufferSize]Properties // Per character info.
				100	byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos.
				101	nbyte uint8 // Number or bytes.
				102	ss streamSafe // For limiting length of non-starter sequence.
				103	nrune int // Number of runeInfos.
				104	f formInfo
				105
				106	src input
				107	nsrc int
				108	tmpBytes input
				109
				110	out []byte
				111	flushF func(*reorderBuffer) bool
				112	}
				113
				114	func (rb *reorderBuffer) init(f Form, src []byte) {
				115	rb.f = *formTable[f]
				116	rb.src.setBytes(src)
				117	rb.nsrc = len(src)
				118	rb.ss = 0
				119	}
				120
				121	func (rb *reorderBuffer) initString(f Form, src string) {
				122	rb.f = *formTable[f]
				123	rb.src.setString(src)
				124	rb.nsrc = len(src)
				125	rb.ss = 0
				126	}
				127
				128	func (rb reorderBuffer) setFlusher(out []byte, f func(reorderBuffer) bool) {
				129	rb.out = out
				130	rb.flushF = f
				131	}
				132
				133	// reset discards all characters from the buffer.
				134	func (rb *reorderBuffer) reset() {
				135	rb.nrune = 0
				136	rb.nbyte = 0
				137	}
				138
				139	func (rb *reorderBuffer) doFlush() bool {
				140	if rb.f.composing {
				141	rb.compose()
				142	}
				143	res := rb.flushF(rb)
				144	rb.reset()
				145	return res
				146	}
				147
				148	// appendFlush appends the normalized segment to rb.out.
				149	func appendFlush(rb *reorderBuffer) bool {
				150	for i := 0; i < rb.nrune; i++ {
				151	start := rb.rune[i].pos
				152	end := start + rb.rune[i].size
				153	rb.out = append(rb.out, rb.byte[start:end]...)
				154	}
				155	return true
				156	}
				157
				158	// flush appends the normalized segment to out and resets rb.
				159	func (rb *reorderBuffer) flush(out []byte) []byte {
				160	for i := 0; i < rb.nrune; i++ {
				161	start := rb.rune[i].pos
				162	end := start + rb.rune[i].size
				163	out = append(out, rb.byte[start:end]...)
				164	}
				165	rb.reset()
				166	return out
				167	}
				168
				169	// flushCopy copies the normalized segment to buf and resets rb.
				170	// It returns the number of bytes written to buf.
				171	func (rb *reorderBuffer) flushCopy(buf []byte) int {
				172	p := 0
				173	for i := 0; i < rb.nrune; i++ {
				174	runep := rb.rune[i]
				175	p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
				176	}
				177	rb.reset()
				178	return p
				179	}
				180
				181	// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
				182	// It returns false if the buffer is not large enough to hold the rune.
				183	// It is used internally by insert and insertString only.
				184	func (rb *reorderBuffer) insertOrdered(info Properties) {
				185	n := rb.nrune
				186	b := rb.rune[:]
				187	cc := info.ccc
				188	if cc > 0 {
				189	// Find insertion position + move elements to make room.
				190	for ; n > 0; n-- {
				191	if b[n-1].ccc <= cc {
				192	break
				193	}
				194	b[n] = b[n-1]
				195	}
				196	}
				197	rb.nrune += 1
				198	pos := uint8(rb.nbyte)
				199	rb.nbyte += utf8.UTFMax
				200	info.pos = pos
				201	b[n] = info
				202	}
				203
				204	// insertErr is an error code returned by insert. Using this type instead
				205	// of error improves performance up to 20% for many of the benchmarks.
				206	type insertErr int
				207
				208	const (
				209	iSuccess insertErr = -iota
				210	iShortDst
				211	iShortSrc
				212	)
				213
				214	// insertFlush inserts the given rune in the buffer ordered by CCC.
				215	// If a decomposition with multiple segments are encountered, they leading
				216	// ones are flushed.
				217	// It returns a non-zero error code if the rune was not inserted.
				218	func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr {
				219	if rune := src.hangul(i); rune != 0 {
				220	rb.decomposeHangul(rune)
				221	return iSuccess
				222	}
				223	if info.hasDecomposition() {
				224	return rb.insertDecomposed(info.Decomposition())
				225	}
				226	rb.insertSingle(src, i, info)
				227	return iSuccess
				228	}
				229
				230	// insertUnsafe inserts the given rune in the buffer ordered by CCC.
				231	// It is assumed there is sufficient space to hold the runes. It is the
				232	// responsibility of the caller to ensure this. This can be done by checking
				233	// the state returned by the streamSafe type.
				234	func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
				235	if rune := src.hangul(i); rune != 0 {
				236	rb.decomposeHangul(rune)
				237	}
				238	if info.hasDecomposition() {
				239	// TODO: inline.
				240	rb.insertDecomposed(info.Decomposition())
				241	} else {
				242	rb.insertSingle(src, i, info)
				243	}
				244	}
				245
				246	// insertDecomposed inserts an entry in to the reorderBuffer for each rune
				247	// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
				248	// It flushes the buffer on each new segment start.
				249	func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
				250	rb.tmpBytes.setBytes(dcomp)
				251	// As the streamSafe accounting already handles the counting for modifiers,
				252	// we don't have to call next. However, we do need to keep the accounting
				253	// intact when flushing the buffer.
				254	for i := 0; i < len(dcomp); {
				255	info := rb.f.info(rb.tmpBytes, i)
				256	if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
				257	return iShortDst
				258	}
				259	i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)])
				260	rb.insertOrdered(info)
				261	}
				262	return iSuccess
				263	}
				264
				265	// insertSingle inserts an entry in the reorderBuffer for the rune at
				266	// position i. info is the runeInfo for the rune at position i.
				267	func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) {
				268	src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size))
				269	rb.insertOrdered(info)
				270	}
				271
				272	// insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
				273	func (rb *reorderBuffer) insertCGJ() {
				274	rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))})
				275	}
				276
				277	// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
				278	func (rb *reorderBuffer) appendRune(r rune) {
				279	bn := rb.nbyte
				280	sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
				281	rb.nbyte += utf8.UTFMax
				282	rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
				283	rb.nrune++
				284	}
				285
				286	// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
				287	func (rb *reorderBuffer) assignRune(pos int, r rune) {
				288	bn := rb.rune[pos].pos
				289	sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
				290	rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
				291	}
				292
				293	// runeAt returns the rune at position n. It is used for Hangul and recomposition.
				294	func (rb *reorderBuffer) runeAt(n int) rune {
				295	inf := rb.rune[n]
				296	r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
				297	return r
				298	}
				299
				300	// bytesAt returns the UTF-8 encoding of the rune at position n.
				301	// It is used for Hangul and recomposition.
				302	func (rb *reorderBuffer) bytesAt(n int) []byte {
				303	inf := rb.rune[n]
				304	return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
				305	}
				306
				307	// For Hangul we combine algorithmically, instead of using tables.
				308	const (
				309	hangulBase = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
				310	hangulBase0 = 0xEA
				311	hangulBase1 = 0xB0
				312	hangulBase2 = 0x80
				313
				314	hangulEnd = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
				315	hangulEnd0 = 0xED
				316	hangulEnd1 = 0x9E
				317	hangulEnd2 = 0xA4
				318
				319	jamoLBase = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
				320	jamoLBase0 = 0xE1
				321	jamoLBase1 = 0x84
				322	jamoLEnd = 0x1113
				323	jamoVBase = 0x1161
				324	jamoVEnd = 0x1176
				325	jamoTBase = 0x11A7
				326	jamoTEnd = 0x11C3
				327
				328	jamoTCount = 28
				329	jamoVCount = 21
				330	jamoVTCount = 21 * 28
				331	jamoLVTCount = 19 * 21 * 28
				332	)
				333
				334	const hangulUTF8Size = 3
				335
				336	func isHangul(b []byte) bool {
				337	if len(b) < hangulUTF8Size {
				338	return false
				339	}
				340	b0 := b[0]
				341	if b0 < hangulBase0 {
				342	return false
				343	}
				344	b1 := b[1]
				345	switch {
				346	case b0 == hangulBase0:
				347	return b1 >= hangulBase1
				348	case b0 < hangulEnd0:
				349	return true
				350	case b0 > hangulEnd0:
				351	return false
				352	case b1 < hangulEnd1:
				353	return true
				354	}
				355	return b1 == hangulEnd1 && b[2] < hangulEnd2
				356	}
				357
				358	func isHangulString(b string) bool {
				359	if len(b) < hangulUTF8Size {
				360	return false
				361	}
				362	b0 := b[0]
				363	if b0 < hangulBase0 {
				364	return false
				365	}
				366	b1 := b[1]
				367	switch {
				368	case b0 == hangulBase0:
				369	return b1 >= hangulBase1
				370	case b0 < hangulEnd0:
				371	return true
				372	case b0 > hangulEnd0:
				373	return false
				374	case b1 < hangulEnd1:
				375	return true
				376	}
				377	return b1 == hangulEnd1 && b[2] < hangulEnd2
				378	}
				379
				380	// Caller must ensure len(b) >= 2.
				381	func isJamoVT(b []byte) bool {
				382	// True if (rune & 0xff00) == jamoLBase
				383	return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
				384	}
				385
				386	func isHangulWithoutJamoT(b []byte) bool {
				387	c, _ := utf8.DecodeRune(b)
				388	c -= hangulBase
				389	return c < jamoLVTCount && c%jamoTCount == 0
				390	}
				391
				392	// decomposeHangul writes the decomposed Hangul to buf and returns the number
				393	// of bytes written. len(buf) should be at least 9.
				394	func decomposeHangul(buf []byte, r rune) int {
				395	const JamoUTF8Len = 3
				396	r -= hangulBase
				397	x := r % jamoTCount
				398	r /= jamoTCount
				399	utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
				400	utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
				401	if x != 0 {
				402	utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
				403	return 3 * JamoUTF8Len
				404	}
				405	return 2 * JamoUTF8Len
				406	}
				407
				408	// decomposeHangul algorithmically decomposes a Hangul rune into
				409	// its Jamo components.
Don Newton	7577f07	2020-01-06 12:41:11 -0500	[diff] [blame^]	410	// See https://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
Don Newton	98fd881	2019-09-23 15:15:02 -0400	[diff] [blame]	411	func (rb *reorderBuffer) decomposeHangul(r rune) {
				412	r -= hangulBase
				413	x := r % jamoTCount
				414	r /= jamoTCount
				415	rb.appendRune(jamoLBase + r/jamoVCount)
				416	rb.appendRune(jamoVBase + r%jamoVCount)
				417	if x != 0 {
				418	rb.appendRune(jamoTBase + x)
				419	}
				420	}
				421
				422	// combineHangul algorithmically combines Jamo character components into Hangul.
Don Newton	7577f07	2020-01-06 12:41:11 -0500	[diff] [blame^]	423	// See https://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
Don Newton	98fd881	2019-09-23 15:15:02 -0400	[diff] [blame]	424	func (rb *reorderBuffer) combineHangul(s, i, k int) {
				425	b := rb.rune[:]
				426	bn := rb.nrune
				427	for ; i < bn; i++ {
				428	cccB := b[k-1].ccc
				429	cccC := b[i].ccc
				430	if cccB == 0 {
				431	s = k - 1
				432	}
				433	if s != k-1 && cccB >= cccC {
				434	// b[i] is blocked by greater-equal cccX below it
				435	b[k] = b[i]
				436	k++
				437	} else {
				438	l := rb.runeAt(s) // also used to compare to hangulBase
				439	v := rb.runeAt(i) // also used to compare to jamoT
				440	switch {
				441	case jamoLBase <= l && l < jamoLEnd &&
				442	jamoVBase <= v && v < jamoVEnd:
				443	// 11xx plus 116x to LV
				444	rb.assignRune(s, hangulBase+
				445	(l-jamoLBase)jamoVTCount+(v-jamoVBase)jamoTCount)
				446	case hangulBase <= l && l < hangulEnd &&
				447	jamoTBase < v && v < jamoTEnd &&
				448	((l-hangulBase)%jamoTCount) == 0:
				449	// ACxx plus 11Ax to LVT
				450	rb.assignRune(s, l+v-jamoTBase)
				451	default:
				452	b[k] = b[i]
				453	k++
				454	}
				455	}
				456	}
				457	rb.nrune = k
				458	}
				459
				460	// compose recombines the runes in the buffer.
				461	// It should only be used to recompose a single segment, as it will not
				462	// handle alternations between Hangul and non-Hangul characters correctly.
				463	func (rb *reorderBuffer) compose() {
Don Newton	7577f07	2020-01-06 12:41:11 -0500	[diff] [blame^]	464	// Lazily load the map used by the combine func below, but do
				465	// it outside of the loop.
				466	recompMapOnce.Do(buildRecompMap)
				467
Don Newton	98fd881	2019-09-23 15:15:02 -0400	[diff] [blame]	468	// UAX #15, section X5 , including Corrigendum #5
				469	// "In any character sequence beginning with starter S, a character C is
				470	// blocked from S if and only if there is some character B between S
				471	// and C, and either B is a starter or it has the same or higher
				472	// combining class as C."
				473	bn := rb.nrune
				474	if bn == 0 {
				475	return
				476	}
				477	k := 1
				478	b := rb.rune[:]
				479	for s, i := 0, 1; i < bn; i++ {
				480	if isJamoVT(rb.bytesAt(i)) {
				481	// Redo from start in Hangul mode. Necessary to support
				482	// U+320E..U+321E in NFKC mode.
				483	rb.combineHangul(s, i, k)
				484	return
				485	}
				486	ii := b[i]
				487	// We can only use combineForward as a filter if we later
				488	// get the info for the combined character. This is more
				489	// expensive than using the filter. Using combinesBackward()
				490	// is safe.
				491	if ii.combinesBackward() {
				492	cccB := b[k-1].ccc
				493	cccC := ii.ccc
				494	blocked := false // b[i] blocked by starter or greater or equal CCC?
				495	if cccB == 0 {
				496	s = k - 1
				497	} else {
				498	blocked = s != k-1 && cccB >= cccC
				499	}
				500	if !blocked {
				501	combined := combine(rb.runeAt(s), rb.runeAt(i))
				502	if combined != 0 {
				503	rb.assignRune(s, combined)
				504	continue
				505	}
				506	}
				507	}
				508	b[k] = b[i]
				509	k++
				510	}
				511	rb.nrune = k
				512	}