Blame - vendor/golang.org/x/text/collate/build/colelem.go - ofagent-go

blob: 04fc3bfb72b0395fa1e79fea9cabefef2493b6b7 [file] [log] [blame]

Don Newton	98fd881	2019-09-23 15:15:02 -0400	[diff] [blame^]	1	// Copyright 2012 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	package build
				6
				7	import (
				8	"fmt"
				9	"unicode"
				10
				11	"golang.org/x/text/internal/colltab"
				12	)
				13
				14	const (
				15	defaultSecondary = 0x20
				16	defaultTertiary = 0x2
				17	maxTertiary = 0x1F
				18	)
				19
				20	type rawCE struct {
				21	w []int
				22	ccc uint8
				23	}
				24
				25	func makeRawCE(w []int, ccc uint8) rawCE {
				26	ce := rawCE{w: make([]int, 4), ccc: ccc}
				27	copy(ce.w, w)
				28	return ce
				29	}
				30
				31	// A collation element is represented as an uint32.
				32	// In the typical case, a rune maps to a single collation element. If a rune
				33	// can be the start of a contraction or expands into multiple collation elements,
				34	// then the collation element that is associated with a rune will have a special
				35	// form to represent such m to n mappings. Such special collation elements
				36	// have a value >= 0x80000000.
				37
				38	const (
				39	maxPrimaryBits = 21
				40	maxSecondaryBits = 12
				41	maxTertiaryBits = 8
				42	)
				43
				44	func makeCE(ce rawCE) (uint32, error) {
				45	v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
				46	return uint32(v), e
				47	}
				48
				49	// For contractions, collation elements are of the form
				50	// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
				51	// - n* is the size of the first node in the contraction trie.
				52	// - i* is the index of the first node in the contraction trie.
				53	// - b* is the offset into the contraction collation element table.
				54	// See contract.go for details on the contraction trie.
				55	const (
				56	contractID = 0xC0000000
				57	maxNBits = 4
				58	maxTrieIndexBits = 12
				59	maxContractOffsetBits = 13
				60	)
				61
				62	func makeContractIndex(h ctHandle, offset int) (uint32, error) {
				63	if h.n >= 1<<maxNBits {
				64	return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
				65	}
				66	if h.index >= 1<<maxTrieIndexBits {
				67	return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
				68	}
				69	if offset >= 1<<maxContractOffsetBits {
				70	return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
				71	}
				72	ce := uint32(contractID)
				73	ce += uint32(offset << (maxNBits + maxTrieIndexBits))
				74	ce += uint32(h.index << maxNBits)
				75	ce += uint32(h.n)
				76	return ce, nil
				77	}
				78
				79	// For expansions, collation elements are of the form
				80	// 11100000 00000000 bbbbbbbb bbbbbbbb,
				81	// where b* is the index into the expansion sequence table.
				82	const (
				83	expandID = 0xE0000000
				84	maxExpandIndexBits = 16
				85	)
				86
				87	func makeExpandIndex(index int) (uint32, error) {
				88	if index >= 1<<maxExpandIndexBits {
				89	return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
				90	}
				91	return expandID + uint32(index), nil
				92	}
				93
				94	// Each list of collation elements corresponding to an expansion starts with
				95	// a header indicating the length of the sequence.
				96	func makeExpansionHeader(n int) (uint32, error) {
				97	return uint32(n), nil
				98	}
				99
				100	// Some runes can be expanded using NFKD decomposition. Instead of storing the full
				101	// sequence of collation elements, we decompose the rune and lookup the collation
				102	// elements for each rune in the decomposition and modify the tertiary weights.
				103	// The collation element, in this case, is of the form
				104	// 11110000 00000000 wwwwwwww vvvvvvvv, where
				105	// - v* is the replacement tertiary weight for the first rune,
				106	// - w* is the replacement tertiary weight for the second rune,
				107	// Tertiary weights of subsequent runes should be replaced with maxTertiary.
				108	// See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
				109	const (
				110	decompID = 0xF0000000
				111	)
				112
				113	func makeDecompose(t1, t2 int) (uint32, error) {
				114	if t1 >= 256 \|\| t1 < 0 {
				115	return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
				116	}
				117	if t2 >= 256 \|\| t2 < 0 {
				118	return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
				119	}
				120	return uint32(t2<<8+t1) + decompID, nil
				121	}
				122
				123	const (
				124	// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
				125	minUnified rune = 0x4E00
				126	maxUnified = 0x9FFF
				127	minCompatibility = 0xF900
				128	maxCompatibility = 0xFAFF
				129	minRare = 0x3400
				130	maxRare = 0x4DBF
				131	)
				132	const (
				133	commonUnifiedOffset = 0x10000
				134	rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
				135	otherOffset = 0x50000 // largest rune in rare is U+2FA1D
				136	illegalOffset = otherOffset + int(unicode.MaxRune)
				137	maxPrimary = illegalOffset + 1
				138	)
				139
				140	// implicitPrimary returns the primary weight for the a rune
				141	// for which there is no entry for the rune in the collation table.
				142	// We take a different approach from the one specified in
				143	// https://unicode.org/reports/tr10/#Implicit_Weights,
				144	// but preserve the resulting relative ordering of the runes.
				145	func implicitPrimary(r rune) int {
				146	if unicode.Is(unicode.Ideographic, r) {
				147	if r >= minUnified && r <= maxUnified {
				148	// The most common case for CJK.
				149	return int(r) + commonUnifiedOffset
				150	}
				151	if r >= minCompatibility && r <= maxCompatibility {
				152	// This will typically not hit. The DUCET explicitly specifies mappings
				153	// for all characters that do not decompose.
				154	return int(r) + commonUnifiedOffset
				155	}
				156	return int(r) + rareUnifiedOffset
				157	}
				158	return int(r) + otherOffset
				159	}
				160
				161	// convertLargeWeights converts collation elements with large
				162	// primaries (either double primaries or for illegal runes)
				163	// to our own representation.
				164	// A CJK character C is represented in the DUCET as
				165	// [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
				166	// We will rewrite these characters to a single CE.
				167	// We assume the CJK values start at 0x8000.
				168	// See https://unicode.org/reports/tr10/#Implicit_Weights
				169	func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
				170	const (
				171	cjkPrimaryStart = 0xFB40
				172	rarePrimaryStart = 0xFB80
				173	otherPrimaryStart = 0xFBC0
				174	illegalPrimary = 0xFFFE
				175	highBitsMask = 0x3F
				176	lowBitsMask = 0x7FFF
				177	lowBitsFlag = 0x8000
				178	shiftBits = 15
				179	)
				180	for i := 0; i < len(elems); i++ {
				181	ce := elems[i].w
				182	p := ce[0]
				183	if p < cjkPrimaryStart {
				184	continue
				185	}
				186	if p > 0xFFFF {
				187	return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
				188	}
				189	if p >= illegalPrimary {
				190	ce[0] = illegalOffset + p - illegalPrimary
				191	} else {
				192	if i+1 >= len(elems) {
				193	return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
				194	}
				195	if elems[i+1].w[0]&lowBitsFlag == 0 {
				196	return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
				197	}
				198	np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
				199	switch {
				200	case p < rarePrimaryStart:
				201	np += commonUnifiedOffset
				202	case p < otherPrimaryStart:
				203	np += rareUnifiedOffset
				204	default:
				205	p += otherOffset
				206	}
				207	ce[0] = np
				208	for j := i + 1; j+1 < len(elems); j++ {
				209	elems[j] = elems[j+1]
				210	}
				211	elems = elems[:len(elems)-1]
				212	}
				213	}
				214	return elems, nil
				215	}
				216
				217	// nextWeight computes the first possible collation weights following elems
				218	// for the given level.
				219	func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
				220	if level == colltab.Identity {
				221	next := make([]rawCE, len(elems))
				222	copy(next, elems)
				223	return next
				224	}
				225	next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
				226	next[0].w[level]++
				227	if level < colltab.Secondary {
				228	next[0].w[colltab.Secondary] = defaultSecondary
				229	}
				230	if level < colltab.Tertiary {
				231	next[0].w[colltab.Tertiary] = defaultTertiary
				232	}
				233	// Filter entries that cannot influence ordering.
				234	for _, ce := range elems[1:] {
				235	skip := true
				236	for i := colltab.Primary; i < level; i++ {
				237	skip = skip && ce.w[i] == 0
				238	}
				239	if !skip {
				240	next = append(next, ce)
				241	}
				242	}
				243	return next
				244	}
				245
				246	func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
				247	for ; i < len(elems) && elems[i].w[level] == 0; i++ {
				248	}
				249	if i < len(elems) {
				250	return i, elems[i].w[level]
				251	}
				252	return i, 0
				253	}
				254
				255	// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
				256	// It also returns the collation level at which the difference is found.
				257	func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
				258	for level := colltab.Primary; level < colltab.Identity; level++ {
				259	var va, vb int
				260	for ia, ib := 0, 0; ia < len(a) \|\| ib < len(b); ia, ib = ia+1, ib+1 {
				261	ia, va = nextVal(a, ia, level)
				262	ib, vb = nextVal(b, ib, level)
				263	if va != vb {
				264	if va < vb {
				265	return -1, level
				266	} else {
				267	return 1, level
				268	}
				269	}
				270	}
				271	}
				272	return 0, colltab.Identity
				273	}
				274
				275	func equalCE(a, b rawCE) bool {
				276	for i := 0; i < 3; i++ {
				277	if b.w[i] != a.w[i] {
				278	return false
				279	}
				280	}
				281	return true
				282	}
				283
				284	func equalCEArrays(a, b []rawCE) bool {
				285	if len(a) != len(b) {
				286	return false
				287	}
				288	for i := range a {
				289	if !equalCE(a[i], b[i]) {
				290	return false
				291	}
				292	}
				293	return true
				294	}