Blame - vendor/golang.org/x/text/internal/colltab/table.go - ofagent-go

blob: e26e36da93542cdafb23581da1954f3e1b002dfd [file] [log] [blame]

Don Newton	98fd881	2019-09-23 15:15:02 -0400	[diff] [blame^]	1	// Copyright 2012 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	package colltab
				6
				7	import (
				8	"unicode/utf8"
				9
				10	"golang.org/x/text/unicode/norm"
				11	)
				12
				13	// Table holds all collation data for a given collation ordering.
				14	type Table struct {
				15	Index Trie // main trie
				16
				17	// expansion info
				18	ExpandElem []uint32
				19
				20	// contraction info
				21	ContractTries ContractTrieSet
				22	ContractElem []uint32
				23	MaxContractLen int
				24	VariableTop uint32
				25	}
				26
				27	func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
				28	return t.appendNext(w, source{bytes: b})
				29	}
				30
				31	func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
				32	return t.appendNext(w, source{str: s})
				33	}
				34
				35	func (t *Table) Start(p int, b []byte) int {
				36	// TODO: implement
				37	panic("not implemented")
				38	}
				39
				40	func (t *Table) StartString(p int, s string) int {
				41	// TODO: implement
				42	panic("not implemented")
				43	}
				44
				45	func (t *Table) Domain() []string {
				46	// TODO: implement
				47	panic("not implemented")
				48	}
				49
				50	func (t *Table) Top() uint32 {
				51	return t.VariableTop
				52	}
				53
				54	type source struct {
				55	str string
				56	bytes []byte
				57	}
				58
				59	func (src source) lookup(t Table) (ce Elem, sz int) {
				60	if src.bytes == nil {
				61	return t.Index.lookupString(src.str)
				62	}
				63	return t.Index.lookup(src.bytes)
				64	}
				65
				66	func (src *source) tail(sz int) {
				67	if src.bytes == nil {
				68	src.str = src.str[sz:]
				69	} else {
				70	src.bytes = src.bytes[sz:]
				71	}
				72	}
				73
				74	func (src *source) nfd(buf []byte, end int) []byte {
				75	if src.bytes == nil {
				76	return norm.NFD.AppendString(buf[:0], src.str[:end])
				77	}
				78	return norm.NFD.Append(buf[:0], src.bytes[:end]...)
				79	}
				80
				81	func (src *source) rune() (r rune, sz int) {
				82	if src.bytes == nil {
				83	return utf8.DecodeRuneInString(src.str)
				84	}
				85	return utf8.DecodeRune(src.bytes)
				86	}
				87
				88	func (src *source) properties(f norm.Form) norm.Properties {
				89	if src.bytes == nil {
				90	return f.PropertiesString(src.str)
				91	}
				92	return f.Properties(src.bytes)
				93	}
				94
				95	// appendNext appends the weights corresponding to the next rune or
				96	// contraction in s. If a contraction is matched to a discontinuous
				97	// sequence of runes, the weights for the interstitial runes are
				98	// appended as well. It returns a new slice that includes the appended
				99	// weights and the number of bytes consumed from s.
				100	func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) {
				101	ce, sz := src.lookup(t)
				102	tp := ce.ctype()
				103	if tp == ceNormal {
				104	if ce == 0 {
				105	r, _ := src.rune()
				106	const (
				107	hangulSize = 3
				108	firstHangul = 0xAC00
				109	lastHangul = 0xD7A3
				110	)
				111	if r >= firstHangul && r <= lastHangul {
				112	// TODO: performance can be considerably improved here.
				113	n = sz
				114	var buf [16]byte // Used for decomposing Hangul.
				115	for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
				116	ce, sz = t.Index.lookup(b)
				117	w = append(w, ce)
				118	}
				119	return w, n
				120	}
				121	ce = makeImplicitCE(implicitPrimary(r))
				122	}
				123	w = append(w, ce)
				124	} else if tp == ceExpansionIndex {
				125	w = t.appendExpansion(w, ce)
				126	} else if tp == ceContractionIndex {
				127	n := 0
				128	src.tail(sz)
				129	if src.bytes == nil {
				130	w, n = t.matchContractionString(w, ce, src.str)
				131	} else {
				132	w, n = t.matchContraction(w, ce, src.bytes)
				133	}
				134	sz += n
				135	} else if tp == ceDecompose {
				136	// Decompose using NFKD and replace tertiary weights.
				137	t1, t2 := splitDecompose(ce)
				138	i := len(w)
				139	nfkd := src.properties(norm.NFKD).Decomposition()
				140	for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
				141	w, p = t.appendNext(w, source{bytes: nfkd})
				142	}
				143	w[i] = w[i].updateTertiary(t1)
				144	if i++; i < len(w) {
				145	w[i] = w[i].updateTertiary(t2)
				146	for i++; i < len(w); i++ {
				147	w[i] = w[i].updateTertiary(maxTertiary)
				148	}
				149	}
				150	}
				151	return w, sz
				152	}
				153
				154	func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem {
				155	i := splitExpandIndex(ce)
				156	n := int(t.ExpandElem[i])
				157	i++
				158	for _, ce := range t.ExpandElem[i : i+n] {
				159	w = append(w, Elem(ce))
				160	}
				161	return w
				162	}
				163
				164	func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
				165	index, n, offset := splitContractIndex(ce)
				166
				167	scan := t.ContractTries.scanner(index, n, suffix)
				168	buf := [norm.MaxSegmentSize]byte{}
				169	bufp := 0
				170	p := scan.scan(0)
				171
				172	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
				173	// By now we should have filtered most cases.
				174	p0 := p
				175	bufn := 0
				176	rune := norm.NFD.Properties(suffix[p:])
				177	p += rune.Size()
				178	if rune.LeadCCC() != 0 {
				179	prevCC := rune.TrailCCC()
				180	// A gap may only occur in the last normalization segment.
				181	// This also ensures that len(scan.s) < norm.MaxSegmentSize.
				182	if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
				183	scan.s = suffix[:p+end]
				184	}
				185	for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
				186	rune = norm.NFD.Properties(suffix[p:])
				187	if ccc := rune.LeadCCC(); ccc == 0 \|\| prevCC >= ccc {
				188	break
				189	}
				190	prevCC = rune.TrailCCC()
				191	if pp := scan.scan(p); pp != p {
				192	// Copy the interstitial runes for later processing.
				193	bufn += copy(buf[bufn:], suffix[p0:p])
				194	if scan.pindex == pp {
				195	bufp = bufn
				196	}
				197	p, p0 = pp, pp
				198	} else {
				199	p += rune.Size()
				200	}
				201	}
				202	}
				203	}
				204	// Append weights for the matched contraction, which may be an expansion.
				205	i, n := scan.result()
				206	ce = Elem(t.ContractElem[i+offset])
				207	if ce.ctype() == ceNormal {
				208	w = append(w, ce)
				209	} else {
				210	w = t.appendExpansion(w, ce)
				211	}
				212	// Append weights for the runes in the segment not part of the contraction.
				213	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
				214	w, p = t.appendNext(w, source{bytes: b})
				215	}
				216	return w, n
				217	}
				218
				219	// TODO: unify the two implementations. This is best done after first simplifying
				220	// the algorithm taking into account the inclusion of both NFC and NFD forms
				221	// in the table.
				222	func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
				223	index, n, offset := splitContractIndex(ce)
				224
				225	scan := t.ContractTries.scannerString(index, n, suffix)
				226	buf := [norm.MaxSegmentSize]byte{}
				227	bufp := 0
				228	p := scan.scan(0)
				229
				230	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
				231	// By now we should have filtered most cases.
				232	p0 := p
				233	bufn := 0
				234	rune := norm.NFD.PropertiesString(suffix[p:])
				235	p += rune.Size()
				236	if rune.LeadCCC() != 0 {
				237	prevCC := rune.TrailCCC()
				238	// A gap may only occur in the last normalization segment.
				239	// This also ensures that len(scan.s) < norm.MaxSegmentSize.
				240	if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
				241	scan.s = suffix[:p+end]
				242	}
				243	for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
				244	rune = norm.NFD.PropertiesString(suffix[p:])
				245	if ccc := rune.LeadCCC(); ccc == 0 \|\| prevCC >= ccc {
				246	break
				247	}
				248	prevCC = rune.TrailCCC()
				249	if pp := scan.scan(p); pp != p {
				250	// Copy the interstitial runes for later processing.
				251	bufn += copy(buf[bufn:], suffix[p0:p])
				252	if scan.pindex == pp {
				253	bufp = bufn
				254	}
				255	p, p0 = pp, pp
				256	} else {
				257	p += rune.Size()
				258	}
				259	}
				260	}
				261	}
				262	// Append weights for the matched contraction, which may be an expansion.
				263	i, n := scan.result()
				264	ce = Elem(t.ContractElem[i+offset])
				265	if ce.ctype() == ceNormal {
				266	w = append(w, ce)
				267	} else {
				268	w = t.appendExpansion(w, ce)
				269	}
				270	// Append weights for the runes in the segment not part of the contraction.
				271	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
				272	w, p = t.appendNext(w, source{bytes: b})
				273	}
				274	return w, n
				275	}