Blame - vendor/golang.org/x/text/collate/build/contract.go - ofagent-go

blob: e2df64f0c656bc38f23b27c0fc27f51759e66f10 [file] [log] [blame]

Don Newton	98fd881	2019-09-23 15:15:02 -0400	[diff] [blame^]	1	// Copyright 2012 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	package build
				6
				7	import (
				8	"fmt"
				9	"io"
				10	"reflect"
				11	"sort"
				12	"strings"
				13
				14	"golang.org/x/text/internal/colltab"
				15	)
				16
				17	// This file contains code for detecting contractions and generating
				18	// the necessary tables.
				19	// Any Unicode Collation Algorithm (UCA) table entry that has more than
				20	// one rune one the left-hand side is called a contraction.
				21	// See https://www.unicode.org/reports/tr10/#Contractions for more details.
				22	//
				23	// We define the following terms:
				24	// initial: a rune that appears as the first rune in a contraction.
				25	// suffix: a sequence of runes succeeding the initial rune
				26	// in a given contraction.
				27	// non-initial: a rune that appears in a suffix.
				28	//
				29	// A rune may be both an initial and a non-initial and may be so in
				30	// many contractions. An initial may typically also appear by itself.
				31	// In case of ambiguities, the UCA requires we match the longest
				32	// contraction.
				33	//
				34	// Many contraction rules share the same set of possible suffixes.
				35	// We store sets of suffixes in a trie that associates an index with
				36	// each suffix in the set. This index can be used to look up a
				37	// collation element associated with the (starter rune, suffix) pair.
				38	//
				39	// The trie is defined on a UTF-8 byte sequence.
				40	// The overall trie is represented as an array of ctEntries. Each node of the trie
				41	// is represented as a subsequence of ctEntries, where each entry corresponds to
				42	// a possible match of a next character in the search string. An entry
				43	// also includes the length and offset to the next sequence of entries
				44	// to check in case of a match.
				45
				46	const (
				47	final = 0
				48	noIndex = 0xFF
				49	)
				50
				51	// ctEntry associates to a matching byte an offset and/or next sequence of
				52	// bytes to check. A ctEntry c is called final if a match means that the
				53	// longest suffix has been found. An entry c is final if c.N == 0.
				54	// A single final entry can match a range of characters to an offset.
				55	// A non-final entry always matches a single byte. Note that a non-final
				56	// entry might still resemble a completed suffix.
				57	// Examples:
				58	// The suffix strings "ab" and "ac" can be represented as:
				59	// []ctEntry{
				60	// {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF.
				61	// {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2
				62	// }
				63	//
				64	// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
				65	// []ctEntry{
				66	// {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'.
				67	// {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'.
				68	// {'d', 'd', final, 3}, // "abd" -> 3
				69	// {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'.
				70	// {'d', 'd', final, 4}, // "abcd" -> 4
				71	// }
				72	// See genStateTests in contract_test.go for more examples.
				73	type ctEntry struct {
				74	L uint8 // non-final: byte value to match; final: lowest match in range.
				75	H uint8 // non-final: relative index to next block; final: highest match in range.
				76	N uint8 // non-final: length of next block; final: final
				77	I uint8 // result offset. Will be noIndex if more bytes are needed to complete.
				78	}
				79
				80	// contractTrieSet holds a set of contraction tries. The tries are stored
				81	// consecutively in the entry field.
				82	type contractTrieSet []struct{ l, h, n, i uint8 }
				83
				84	// ctHandle is used to identify a trie in the trie set, consisting in an offset
				85	// in the array and the size of the first node.
				86	type ctHandle struct {
				87	index, n int
				88	}
				89
				90	// appendTrie adds a new trie for the given suffixes to the trie set and returns
				91	// a handle to it. The handle will be invalid on error.
				92	func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) {
				93	es := make([]stridx, len(suffixes))
				94	for i, s := range suffixes {
				95	es[i].str = s
				96	}
				97	sort.Sort(offsetSort(es))
				98	for i := range es {
				99	es[i].index = i + 1
				100	}
				101	sort.Sort(genidxSort(es))
				102	i := len(*ct)
				103	n, err := genStates(ct, es)
				104	if err != nil {
				105	ct = (ct)[:i]
				106	return ctHandle{}, err
				107	}
				108	return ctHandle{i, n}, nil
				109	}
				110
				111	// genStates generates ctEntries for a given suffix set and returns
				112	// the number of entries for the first node.
				113	func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) {
				114	if len(sis) == 0 {
				115	return 0, fmt.Errorf("genStates: list of suffices must be non-empty")
				116	}
				117	start := len(*ct)
				118	// create entries for differing first bytes.
				119	for _, si := range sis {
				120	s := si.str
				121	if len(s) == 0 {
				122	continue
				123	}
				124	added := false
				125	c := s[0]
				126	if len(s) > 1 {
				127	for j := len(*ct) - 1; j >= start; j-- {
				128	if (*ct)[j].L == c {
				129	added = true
				130	break
				131	}
				132	}
				133	if !added {
				134	ct = append(ct, ctEntry{L: c, I: noIndex})
				135	}
				136	} else {
				137	for j := len(*ct) - 1; j >= start; j-- {
				138	// Update the offset for longer suffixes with the same byte.
				139	if (*ct)[j].L == c {
				140	(*ct)[j].I = uint8(si.index)
				141	added = true
				142	}
				143	// Extend range of final ctEntry, if possible.
				144	if (*ct)[j].H+1 == c {
				145	(*ct)[j].H = c
				146	added = true
				147	}
				148	}
				149	if !added {
				150	ct = append(ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)})
				151	}
				152	}
				153	}
				154	n := len(*ct) - start
				155	// Append nodes for the remainder of the suffixes for each ctEntry.
				156	sp := 0
				157	for i, end := start, len(*ct); i < end; i++ {
				158	fe := (*ct)[i]
				159	if fe.H == 0 { // uninitialized non-final
				160	ln := len(*ct) - start - n
				161	if ln > 0xFF {
				162	return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
				163	}
				164	fe.H = uint8(ln)
				165	// Find first non-final strings with same byte as current entry.
				166	for ; sis[sp].str[0] != fe.L; sp++ {
				167	}
				168	se := sp + 1
				169	for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ {
				170	}
				171	sl := sis[sp:se]
				172	sp = se
				173	for i, si := range sl {
				174	sl[i].str = si.str[1:]
				175	}
				176	nn, err := genStates(ct, sl)
				177	if err != nil {
				178	return 0, err
				179	}
				180	fe.N = uint8(nn)
				181	(*ct)[i] = fe
				182	}
				183	}
				184	sort.Sort(entrySort((*ct)[start : start+n]))
				185	return n, nil
				186	}
				187
				188	// There may be both a final and non-final entry for a byte if the byte
				189	// is implied in a range of matches in the final entry.
				190	// We need to ensure that the non-final entry comes first in that case.
				191	type entrySort colltab.ContractTrieSet
				192
				193	func (fe entrySort) Len() int { return len(fe) }
				194	func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] }
				195	func (fe entrySort) Less(i, j int) bool {
				196	return fe[i].L > fe[j].L
				197	}
				198
				199	// stridx is used for sorting suffixes and their associated offsets.
				200	type stridx struct {
				201	str string
				202	index int
				203	}
				204
				205	// For computing the offsets, we first sort by size, and then by string.
				206	// This ensures that strings that only differ in the last byte by 1
				207	// are sorted consecutively in increasing order such that they can
				208	// be packed as a range in a final ctEntry.
				209	type offsetSort []stridx
				210
				211	func (si offsetSort) Len() int { return len(si) }
				212	func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
				213	func (si offsetSort) Less(i, j int) bool {
				214	if len(si[i].str) != len(si[j].str) {
				215	return len(si[i].str) > len(si[j].str)
				216	}
				217	return si[i].str < si[j].str
				218	}
				219
				220	// For indexing, we want to ensure that strings are sorted in string order, where
				221	// for strings with the same prefix, we put longer strings before shorter ones.
				222	type genidxSort []stridx
				223
				224	func (si genidxSort) Len() int { return len(si) }
				225	func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
				226	func (si genidxSort) Less(i, j int) bool {
				227	if strings.HasPrefix(si[j].str, si[i].str) {
				228	return false
				229	}
				230	if strings.HasPrefix(si[i].str, si[j].str) {
				231	return true
				232	}
				233	return si[i].str < si[j].str
				234	}
				235
				236	// lookup matches the longest suffix in str and returns the associated offset
				237	// and the number of bytes consumed.
				238	func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) {
				239	states := (*ct)[h.index:]
				240	p := 0
				241	n := h.n
				242	for i := 0; i < n && p < len(str); {
				243	e := states[i]
				244	c := str[p]
				245	if c >= e.L {
				246	if e.L == c {
				247	p++
				248	if e.I != noIndex {
				249	index, ns = int(e.I), p
				250	}
				251	if e.N != final {
				252	// set to new state
				253	i, states, n = 0, states[int(e.H)+n:], int(e.N)
				254	} else {
				255	return
				256	}
				257	continue
				258	} else if e.N == final && c <= e.H {
				259	p++
				260	return int(c-e.L) + int(e.I), p
				261	}
				262	}
				263	i++
				264	}
				265	return
				266	}
				267
				268	// print writes the contractTrieSet t as compilable Go code to w. It returns
				269	// the total number of bytes written and the size of the resulting data structure in bytes.
				270	func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
				271	update3 := func(nn, sz int, e error) {
				272	n += nn
				273	if err == nil {
				274	err = e
				275	}
				276	size += sz
				277	}
				278	update2 := func(nn int, e error) { update3(nn, 0, e) }
				279
				280	update3(printArray(*t, w, name))
				281	update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name))
				282	update3(printStruct(*t, w, name))
				283	update2(fmt.Fprintln(w))
				284	return
				285	}
				286
				287	func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
				288	p := func(f string, a ...interface{}) {
				289	nn, e := fmt.Fprintf(w, f, a...)
				290	n += nn
				291	if err == nil {
				292	err = e
				293	}
				294	}
				295	size = len(ct) * 4
				296	p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size)
				297	p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct))
				298	for _, fe := range ct {
				299	p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I)
				300	}
				301	p("}\n")
				302	return
				303	}
				304
				305	func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
				306	n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name)
				307	size = int(reflect.TypeOf(ct).Size())
				308	return
				309	}