// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package bidirule implements the Bidi Rule defined by RFC 5893.
//
// This package is under development. The API may change without notice and
// without preserving backward compatibility.
9package bidirule
10
11import (
12 "errors"
13 "unicode/utf8"
14
15 "golang.org/x/text/transform"
16 "golang.org/x/text/unicode/bidi"
17)
18
// This file contains an implementation of RFC 5893: Right-to-Left Scripts for
// Internationalized Domain Names for Applications (IDNA)
//
// A label is an individual component of a domain name. Labels are usually
// shown separated by dots; for example, the domain name "www.example.com" is
// composed of three labels: "www", "example", and "com".
//
// An RTL label is a label that contains at least one character of class R, AL,
// or AN. An LTR label is any label that is not an RTL label.
//
// A "Bidi domain name" is a domain name that contains at least one RTL label.
//
// The following guarantees can be made based on the above:
//
//  o  In a domain name consisting of only labels that satisfy the rule,
//     the requirements of Section 3 are satisfied. Note that even LTR
//     labels and pure ASCII labels have to be tested.
//
//  o  In a domain name consisting of only LDH labels (as defined in the
//     Definitions document [RFC5890]) and labels that satisfy the rule,
//     the requirements of Section 3 are satisfied as long as a label
//     that starts with an ASCII digit does not come after a
//     right-to-left label.
//
// No guarantee is given for other combinations.
44
// ErrInvalid is the sentinel error reported when a label does not satisfy the
// Bidi Rule.
var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
47
// ruleState identifies a state of the state machine that checks a label
// against the Bidi Rule. Its values are used as indices into transitions.
type ruleState uint8

const (
	// ruleInitial is the zero value: no character has been consumed yet.
	ruleInitial ruleState = iota
	// ruleLTR: inside an LTR label, last character may not end the label.
	ruleLTR
	// ruleLTRFinal: inside an LTR label, last character may end the label.
	ruleLTRFinal
	// ruleRTL: inside an RTL label, last character may not end the label.
	ruleRTL
	// ruleRTLFinal: inside an RTL label, last character may end the label.
	ruleRTLFinal
	// ruleInvalid: a character was seen that no transition allows.
	ruleInvalid
)
58
59type ruleTransition struct {
60 next ruleState
61 mask uint16
62}
63
64var transitions = [...][2]ruleTransition{
65 // [2.1] The first character must be a character with Bidi property L, R, or
66 // AL. If it has the R or AL property, it is an RTL label; if it has the L
67 // property, it is an LTR label.
68 ruleInitial: {
69 {ruleLTRFinal, 1 << bidi.L},
70 {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
71 },
72 ruleRTL: {
73 // [2.3] In an RTL label, the end of the label must be a character with
74 // Bidi property R, AL, EN, or AN, followed by zero or more characters
75 // with Bidi property NSM.
76 {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
77
78 // [2.2] In an RTL label, only characters with the Bidi properties R,
79 // AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
80 // We exclude the entries from [2.3]
81 {ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
82 },
83 ruleRTLFinal: {
84 // [2.3] In an RTL label, the end of the label must be a character with
85 // Bidi property R, AL, EN, or AN, followed by zero or more characters
86 // with Bidi property NSM.
87 {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
88
89 // [2.2] In an RTL label, only characters with the Bidi properties R,
90 // AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
91 // We exclude the entries from [2.3] and NSM.
92 {ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
93 },
94 ruleLTR: {
95 // [2.6] In an LTR label, the end of the label must be a character with
96 // Bidi property L or EN, followed by zero or more characters with Bidi
97 // property NSM.
98 {ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
99
100 // [2.5] In an LTR label, only characters with the Bidi properties L,
101 // EN, ES, CS, ET, ON, BN, or NSM are allowed.
102 // We exclude the entries from [2.6].
103 {ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
104 },
105 ruleLTRFinal: {
106 // [2.6] In an LTR label, the end of the label must be a character with
107 // Bidi property L or EN, followed by zero or more characters with Bidi
108 // property NSM.
109 {ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
110
111 // [2.5] In an LTR label, only characters with the Bidi properties L,
112 // EN, ES, CS, ET, ON, BN, or NSM are allowed.
113 // We exclude the entries from [2.6].
114 {ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
115 },
116 ruleInvalid: {
117 {ruleInvalid, 0},
118 {ruleInvalid, 0},
119 },
120}
121
122// [2.4] In an RTL label, if an EN is present, no AN may be present, and
123// vice versa.
124const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
125
// From RFC 5893
// An RTL label is a label that contains at least one character of type
// R, AL, or AN.
//
// An LTR label is any label that is not an RTL label.
131
132// Direction reports the direction of the given label as defined by RFC 5893.
133// The Bidi Rule does not have to be applied to labels of the category
134// LeftToRight.
135func Direction(b []byte) bidi.Direction {
136 for i := 0; i < len(b); {
137 e, sz := bidi.Lookup(b[i:])
138 if sz == 0 {
139 i++
140 }
141 c := e.Class()
142 if c == bidi.R || c == bidi.AL || c == bidi.AN {
143 return bidi.RightToLeft
144 }
145 i += sz
146 }
147 return bidi.LeftToRight
148}
149
150// DirectionString reports the direction of the given label as defined by RFC
151// 5893. The Bidi Rule does not have to be applied to labels of the category
152// LeftToRight.
153func DirectionString(s string) bidi.Direction {
154 for i := 0; i < len(s); {
155 e, sz := bidi.LookupString(s[i:])
156 if sz == 0 {
157 i++
158 continue
159 }
160 c := e.Class()
161 if c == bidi.R || c == bidi.AL || c == bidi.AN {
162 return bidi.RightToLeft
163 }
164 i += sz
165 }
166 return bidi.LeftToRight
167}
168
169// Valid reports whether b conforms to the BiDi rule.
170func Valid(b []byte) bool {
171 var t Transformer
172 if n, ok := t.advance(b); !ok || n < len(b) {
173 return false
174 }
175 return t.isFinal()
176}
177
178// ValidString reports whether s conforms to the BiDi rule.
179func ValidString(s string) bool {
180 var t Transformer
181 if n, ok := t.advanceString(s); !ok || n < len(s) {
182 return false
183 }
184 return t.isFinal()
185}
186
187// New returns a Transformer that verifies that input adheres to the Bidi Rule.
188func New() *Transformer {
189 return &Transformer{}
190}
191
192// Transformer implements transform.Transform.
193type Transformer struct {
194 state ruleState
195 hasRTL bool
196 seen uint16
197}
198
199// A rule can only be violated for "Bidi Domain names", meaning if one of the
200// following categories has been observed.
201func (t *Transformer) isRTL() bool {
202 const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
203 return t.seen&isRTL != 0
204}
205
206func (t *Transformer) isFinal() bool {
207 return t.state == ruleLTRFinal || t.state == ruleRTLFinal || t.state == ruleInitial
208}
209
210// Reset implements transform.Transformer.
211func (t *Transformer) Reset() { *t = Transformer{} }
212
213// Transform implements transform.Transformer. This Transformer has state and
214// needs to be reset between uses.
215func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
216 if len(dst) < len(src) {
217 src = src[:len(dst)]
218 atEOF = false
219 err = transform.ErrShortDst
220 }
221 n, err1 := t.Span(src, atEOF)
222 copy(dst, src[:n])
223 if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
224 err = err1
225 }
226 return n, n, err
227}
228
229// Span returns the first n bytes of src that conform to the Bidi rule.
230func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
231 if t.state == ruleInvalid && t.isRTL() {
232 return 0, ErrInvalid
233 }
234 n, ok := t.advance(src)
235 switch {
236 case !ok:
237 err = ErrInvalid
238 case n < len(src):
239 if !atEOF {
240 err = transform.ErrShortSrc
241 break
242 }
243 err = ErrInvalid
244 case !t.isFinal():
245 err = ErrInvalid
246 }
247 return n, err
248}
249
250// Precomputing the ASCII values decreases running time for the ASCII fast path
251// by about 30%.
252var asciiTable [128]bidi.Properties
253
254func init() {
255 for i := range asciiTable {
256 p, _ := bidi.LookupRune(rune(i))
257 asciiTable[i] = p
258 }
259}
260
261func (t *Transformer) advance(s []byte) (n int, ok bool) {
262 var e bidi.Properties
263 var sz int
264 for n < len(s) {
265 if s[n] < utf8.RuneSelf {
266 e, sz = asciiTable[s[n]], 1
267 } else {
268 e, sz = bidi.Lookup(s[n:])
269 if sz <= 1 {
270 if sz == 1 {
271 // We always consider invalid UTF-8 to be invalid, even if
272 // the string has not yet been determined to be RTL.
273 // TODO: is this correct?
274 return n, false
275 }
276 return n, true // incomplete UTF-8 encoding
277 }
278 }
279 // TODO: using CompactClass would result in noticeable speedup.
280 // See unicode/bidi/prop.go:Properties.CompactClass.
281 c := uint16(1 << e.Class())
282 t.seen |= c
283 if t.seen&exclusiveRTL == exclusiveRTL {
284 t.state = ruleInvalid
285 return n, false
286 }
287 switch tr := transitions[t.state]; {
288 case tr[0].mask&c != 0:
289 t.state = tr[0].next
290 case tr[1].mask&c != 0:
291 t.state = tr[1].next
292 default:
293 t.state = ruleInvalid
294 if t.isRTL() {
295 return n, false
296 }
297 }
298 n += sz
299 }
300 return n, true
301}
302
303func (t *Transformer) advanceString(s string) (n int, ok bool) {
304 var e bidi.Properties
305 var sz int
306 for n < len(s) {
307 if s[n] < utf8.RuneSelf {
308 e, sz = asciiTable[s[n]], 1
309 } else {
310 e, sz = bidi.LookupString(s[n:])
311 if sz <= 1 {
312 if sz == 1 {
313 return n, false // invalid UTF-8
314 }
315 return n, true // incomplete UTF-8 encoding
316 }
317 }
318 // TODO: using CompactClass results in noticeable speedup.
319 // See unicode/bidi/prop.go:Properties.CompactClass.
320 c := uint16(1 << e.Class())
321 t.seen |= c
322 if t.seen&exclusiveRTL == exclusiveRTL {
323 t.state = ruleInvalid
324 return n, false
325 }
326 switch tr := transitions[t.state]; {
327 case tr[0].mask&c != 0:
328 t.state = tr[0].next
329 case tr[1].mask&c != 0:
330 t.state = tr[1].next
331 default:
332 t.state = ruleInvalid
333 if t.isRTL() {
334 return n, false
335 }
336 }
337 n += sz
338 }
339 return n, true
340}