// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package idna implements IDNA2008 using the compatibility processing
// defined by UTS (Unicode Technical Standard) #46, which defines a standard to
// deal with the transition from IDNA2003.
//
// IDNA2008 (Internationalized Domain Names for Applications), is defined in RFC
// 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894.
// UTS #46 is defined in http://www.unicode.org/reports/tr46.
// See http://unicode.org/cldr/utility/idna.jsp for a visualization of the
// differences between these two standards.
16package idna // import "golang.org/x/net/idna"
17
18import (
19 "fmt"
20 "strings"
21 "unicode/utf8"
22
23 "golang.org/x/text/secure/bidirule"
24 "golang.org/x/text/unicode/bidi"
25 "golang.org/x/text/unicode/norm"
26)
27
// NOTE: Unlike common practice in Go APIs, the functions will return a
// sanitized domain name in case of errors. Browsers sometimes use a partially
// evaluated string as lookup.
// TODO: the current error handling is, in my opinion, the least opinionated.
// Other strategies are also viable, though:
// Option 1) Return an empty string in case of error, but allow the user to
//    specify explicitly which errors to ignore.
// Option 2) Return the partially evaluated string if it is itself a valid
//    string, otherwise return the empty string in case of error.
// Option 3) Option 1 and 2.
// Option 4) Always return an empty string for now and implement Option 1 as
//    needed, and document that the return string may not be empty in case of
//    error in the future.
// I think Option 1 is best, but it is quite opinionated.
42
43// ToASCII is a wrapper for Punycode.ToASCII.
44func ToASCII(s string) (string, error) {
45 return Punycode.process(s, true)
46}
47
48// ToUnicode is a wrapper for Punycode.ToUnicode.
49func ToUnicode(s string) (string, error) {
50 return Punycode.process(s, false)
51}
52
53// An Option configures a Profile at creation time.
54type Option func(*options)
55
56// Transitional sets a Profile to use the Transitional mapping as defined in UTS
57// #46. This will cause, for example, "ß" to be mapped to "ss". Using the
58// transitional mapping provides a compromise between IDNA2003 and IDNA2008
59// compatibility. It is used by most browsers when resolving domain names. This
60// option is only meaningful if combined with MapForLookup.
61func Transitional(transitional bool) Option {
62 return func(o *options) { o.transitional = true }
63}
64
65// VerifyDNSLength sets whether a Profile should fail if any of the IDN parts
66// are longer than allowed by the RFC.
67func VerifyDNSLength(verify bool) Option {
68 return func(o *options) { o.verifyDNSLength = verify }
69}
70
71// RemoveLeadingDots removes leading label separators. Leading runes that map to
72// dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well.
73//
74// This is the behavior suggested by the UTS #46 and is adopted by some
75// browsers.
76func RemoveLeadingDots(remove bool) Option {
77 return func(o *options) { o.removeLeadingDots = remove }
78}
79
80// ValidateLabels sets whether to check the mandatory label validation criteria
81// as defined in Section 5.4 of RFC 5891. This includes testing for correct use
82// of hyphens ('-'), normalization, validity of runes, and the context rules.
83func ValidateLabels(enable bool) Option {
84 return func(o *options) {
85 // Don't override existing mappings, but set one that at least checks
86 // normalization if it is not set.
87 if o.mapping == nil && enable {
88 o.mapping = normalize
89 }
90 o.trie = trie
91 o.validateLabels = enable
92 o.fromPuny = validateFromPunycode
93 }
94}
95
96// StrictDomainName limits the set of permissible ASCII characters to those
97// allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the
98// hyphen). This is set by default for MapForLookup and ValidateForRegistration.
99//
100// This option is useful, for instance, for browsers that allow characters
101// outside this range, for example a '_' (U+005F LOW LINE). See
102// http://www.rfc-editor.org/std/std3.txt for more details This option
103// corresponds to the UseSTD3ASCIIRules option in UTS #46.
104func StrictDomainName(use bool) Option {
105 return func(o *options) {
106 o.trie = trie
107 o.useSTD3Rules = use
108 o.fromPuny = validateFromPunycode
109 }
110}
111
// NOTE: the following options pull in tables. The tables should not be linked
// in as long as the options are not used.
114
115// BidiRule enables the Bidi rule as defined in RFC 5893. Any application
116// that relies on proper validation of labels should include this rule.
117func BidiRule() Option {
118 return func(o *options) { o.bidirule = bidirule.ValidString }
119}
120
121// ValidateForRegistration sets validation options to verify that a given IDN is
122// properly formatted for registration as defined by Section 4 of RFC 5891.
123func ValidateForRegistration() Option {
124 return func(o *options) {
125 o.mapping = validateRegistration
126 StrictDomainName(true)(o)
127 ValidateLabels(true)(o)
128 VerifyDNSLength(true)(o)
129 BidiRule()(o)
130 }
131}
132
133// MapForLookup sets validation and mapping options such that a given IDN is
134// transformed for domain name lookup according to the requirements set out in
135// Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894,
136// RFC 5895 and UTS 46. It does not add the Bidi Rule. Use the BidiRule option
137// to add this check.
138//
139// The mappings include normalization and mapping case, width and other
140// compatibility mappings.
141func MapForLookup() Option {
142 return func(o *options) {
143 o.mapping = validateAndMap
144 StrictDomainName(true)(o)
145 ValidateLabels(true)(o)
146 }
147}
148
149type options struct {
150 transitional bool
151 useSTD3Rules bool
152 validateLabels bool
153 verifyDNSLength bool
154 removeLeadingDots bool
155
156 trie *idnaTrie
157
158 // fromPuny calls validation rules when converting A-labels to U-labels.
159 fromPuny func(p *Profile, s string) error
160
161 // mapping implements a validation and mapping step as defined in RFC 5895
162 // or UTS 46, tailored to, for example, domain registration or lookup.
163 mapping func(p *Profile, s string) (mapped string, isBidi bool, err error)
164
165 // bidirule, if specified, checks whether s conforms to the Bidi Rule
166 // defined in RFC 5893.
167 bidirule func(s string) bool
168}
169
170// A Profile defines the configuration of an IDNA mapper.
171type Profile struct {
172 options
173}
174
175func apply(o *options, opts []Option) {
176 for _, f := range opts {
177 f(o)
178 }
179}
180
181// New creates a new Profile.
182//
183// With no options, the returned Profile is the most permissive and equals the
184// Punycode Profile. Options can be passed to further restrict the Profile. The
185// MapForLookup and ValidateForRegistration options set a collection of options,
186// for lookup and registration purposes respectively, which can be tailored by
187// adding more fine-grained options, where later options override earlier
188// options.
189func New(o ...Option) *Profile {
190 p := &Profile{}
191 apply(&p.options, o)
192 return p
193}
194
195// ToASCII converts a domain or domain label to its ASCII form. For example,
196// ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and
197// ToASCII("golang") is "golang". If an error is encountered it will return
198// an error and a (partially) processed result.
199func (p *Profile) ToASCII(s string) (string, error) {
200 return p.process(s, true)
201}
202
203// ToUnicode converts a domain or domain label to its Unicode form. For example,
204// ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and
205// ToUnicode("golang") is "golang". If an error is encountered it will return
206// an error and a (partially) processed result.
207func (p *Profile) ToUnicode(s string) (string, error) {
208 pp := *p
209 pp.transitional = false
210 return pp.process(s, false)
211}
212
213// String reports a string with a description of the profile for debugging
214// purposes. The string format may change with different versions.
215func (p *Profile) String() string {
216 s := ""
217 if p.transitional {
218 s = "Transitional"
219 } else {
220 s = "NonTransitional"
221 }
222 if p.useSTD3Rules {
223 s += ":UseSTD3Rules"
224 }
225 if p.validateLabels {
226 s += ":ValidateLabels"
227 }
228 if p.verifyDNSLength {
229 s += ":VerifyDNSLength"
230 }
231 return s
232}
233
234var (
235 // Punycode is a Profile that does raw punycode processing with a minimum
236 // of validation.
237 Punycode *Profile = punycode
238
239 // Lookup is the recommended profile for looking up domain names, according
240 // to Section 5 of RFC 5891. The exact configuration of this profile may
241 // change over time.
242 Lookup *Profile = lookup
243
244 // Display is the recommended profile for displaying domain names.
245 // The configuration of this profile may change over time.
246 Display *Profile = display
247
248 // Registration is the recommended profile for checking whether a given
249 // IDN is valid for registration, according to Section 4 of RFC 5891.
250 Registration *Profile = registration
251
252 punycode = &Profile{}
253 lookup = &Profile{options{
254 transitional: true,
255 useSTD3Rules: true,
256 validateLabels: true,
257 trie: trie,
258 fromPuny: validateFromPunycode,
259 mapping: validateAndMap,
260 bidirule: bidirule.ValidString,
261 }}
262 display = &Profile{options{
263 useSTD3Rules: true,
264 validateLabels: true,
265 trie: trie,
266 fromPuny: validateFromPunycode,
267 mapping: validateAndMap,
268 bidirule: bidirule.ValidString,
269 }}
270 registration = &Profile{options{
271 useSTD3Rules: true,
272 validateLabels: true,
273 verifyDNSLength: true,
274 trie: trie,
275 fromPuny: validateFromPunycode,
276 mapping: validateRegistration,
277 bidirule: bidirule.ValidString,
278 }}
279
280 // TODO: profiles
281 // Register: recommended for approving domain names: don't do any mappings
282 // but rather reject on invalid input. Bundle or block deviation characters.
283)
284
// labelError reports a label (or whole name) that failed validation, together
// with a short code identifying the violated rule (for example "A4" or "V6").
type labelError struct {
	label string
	code_ string
}

// code returns the rule code recorded in the error.
func (e labelError) code() string {
	return e.code_
}

// Error implements the error interface. The message quotes the offending
// label and does not include the rule code.
func (e labelError) Error() string {
	return fmt.Sprintf("idna: invalid label %q", e.label)
}
291
// runeError is an error whose value is the disallowed rune itself.
type runeError rune

// code returns the rule code for a disallowed rune, which is always "P1".
func (e runeError) code() string {
	return "P1"
}

// Error implements the error interface, formatting the rune in U+XXXX form.
func (e runeError) Error() string {
	return fmt.Sprintf("idna: disallowed rune %U", rune(e))
}
298
299// process implements the algorithm described in section 4 of UTS #46,
Scott Baker4a35a702019-11-26 08:17:33 -0800300// see http://www.unicode.org/reports/tr46.
Zack Williamse940c7a2019-08-21 14:25:39 -0700301func (p *Profile) process(s string, toASCII bool) (string, error) {
302 var err error
303 var isBidi bool
304 if p.mapping != nil {
305 s, isBidi, err = p.mapping(p, s)
306 }
307 // Remove leading empty labels.
308 if p.removeLeadingDots {
309 for ; len(s) > 0 && s[0] == '.'; s = s[1:] {
310 }
311 }
312 // TODO: allow for a quick check of the tables data.
313 // It seems like we should only create this error on ToASCII, but the
314 // UTS 46 conformance tests suggests we should always check this.
315 if err == nil && p.verifyDNSLength && s == "" {
316 err = &labelError{s, "A4"}
317 }
318 labels := labelIter{orig: s}
319 for ; !labels.done(); labels.next() {
320 label := labels.label()
321 if label == "" {
322 // Empty labels are not okay. The label iterator skips the last
323 // label if it is empty.
324 if err == nil && p.verifyDNSLength {
325 err = &labelError{s, "A4"}
326 }
327 continue
328 }
329 if strings.HasPrefix(label, acePrefix) {
330 u, err2 := decode(label[len(acePrefix):])
331 if err2 != nil {
332 if err == nil {
333 err = err2
334 }
335 // Spec says keep the old label.
336 continue
337 }
338 isBidi = isBidi || bidirule.DirectionString(u) != bidi.LeftToRight
339 labels.set(u)
340 if err == nil && p.validateLabels {
341 err = p.fromPuny(p, u)
342 }
343 if err == nil {
344 // This should be called on NonTransitional, according to the
345 // spec, but that currently does not have any effect. Use the
346 // original profile to preserve options.
347 err = p.validateLabel(u)
348 }
349 } else if err == nil {
350 err = p.validateLabel(label)
351 }
352 }
353 if isBidi && p.bidirule != nil && err == nil {
354 for labels.reset(); !labels.done(); labels.next() {
355 if !p.bidirule(labels.label()) {
356 err = &labelError{s, "B"}
357 break
358 }
359 }
360 }
361 if toASCII {
362 for labels.reset(); !labels.done(); labels.next() {
363 label := labels.label()
364 if !ascii(label) {
365 a, err2 := encode(acePrefix, label)
366 if err == nil {
367 err = err2
368 }
369 label = a
370 labels.set(a)
371 }
372 n := len(label)
373 if p.verifyDNSLength && err == nil && (n == 0 || n > 63) {
374 err = &labelError{label, "A4"}
375 }
376 }
377 }
378 s = labels.result()
379 if toASCII && p.verifyDNSLength && err == nil {
380 // Compute the length of the domain name minus the root label and its dot.
381 n := len(s)
382 if n > 0 && s[n-1] == '.' {
383 n--
384 }
385 if len(s) < 1 || n > 253 {
386 err = &labelError{s, "A4"}
387 }
388 }
389 return s, err
390}
391
392func normalize(p *Profile, s string) (mapped string, isBidi bool, err error) {
393 // TODO: consider first doing a quick check to see if any of these checks
394 // need to be done. This will make it slower in the general case, but
395 // faster in the common case.
396 mapped = norm.NFC.String(s)
397 isBidi = bidirule.DirectionString(mapped) == bidi.RightToLeft
398 return mapped, isBidi, nil
399}
400
401func validateRegistration(p *Profile, s string) (idem string, bidi bool, err error) {
402 // TODO: filter need for normalization in loop below.
403 if !norm.NFC.IsNormalString(s) {
404 return s, false, &labelError{s, "V1"}
405 }
406 for i := 0; i < len(s); {
407 v, sz := trie.lookupString(s[i:])
408 if sz == 0 {
409 return s, bidi, runeError(utf8.RuneError)
410 }
411 bidi = bidi || info(v).isBidi(s[i:])
412 // Copy bytes not copied so far.
413 switch p.simplify(info(v).category()) {
414 // TODO: handle the NV8 defined in the Unicode idna data set to allow
415 // for strict conformance to IDNA2008.
416 case valid, deviation:
417 case disallowed, mapped, unknown, ignored:
418 r, _ := utf8.DecodeRuneInString(s[i:])
419 return s, bidi, runeError(r)
420 }
421 i += sz
422 }
423 return s, bidi, nil
424}
425
426func (c info) isBidi(s string) bool {
427 if !c.isMapped() {
428 return c&attributesMask == rtl
429 }
430 // TODO: also store bidi info for mapped data. This is possible, but a bit
431 // cumbersome and not for the common case.
432 p, _ := bidi.LookupString(s)
433 switch p.Class() {
434 case bidi.R, bidi.AL, bidi.AN:
435 return true
436 }
437 return false
438}
439
440func validateAndMap(p *Profile, s string) (vm string, bidi bool, err error) {
441 var (
442 b []byte
443 k int
444 )
445 // combinedInfoBits contains the or-ed bits of all runes. We use this
446 // to derive the mayNeedNorm bit later. This may trigger normalization
447 // overeagerly, but it will not do so in the common case. The end result
448 // is another 10% saving on BenchmarkProfile for the common case.
449 var combinedInfoBits info
450 for i := 0; i < len(s); {
451 v, sz := trie.lookupString(s[i:])
452 if sz == 0 {
453 b = append(b, s[k:i]...)
454 b = append(b, "\ufffd"...)
455 k = len(s)
456 if err == nil {
457 err = runeError(utf8.RuneError)
458 }
459 break
460 }
461 combinedInfoBits |= info(v)
462 bidi = bidi || info(v).isBidi(s[i:])
463 start := i
464 i += sz
465 // Copy bytes not copied so far.
466 switch p.simplify(info(v).category()) {
467 case valid:
468 continue
469 case disallowed:
470 if err == nil {
471 r, _ := utf8.DecodeRuneInString(s[start:])
472 err = runeError(r)
473 }
474 continue
475 case mapped, deviation:
476 b = append(b, s[k:start]...)
477 b = info(v).appendMapping(b, s[start:i])
478 case ignored:
479 b = append(b, s[k:start]...)
480 // drop the rune
481 case unknown:
482 b = append(b, s[k:start]...)
483 b = append(b, "\ufffd"...)
484 }
485 k = i
486 }
487 if k == 0 {
488 // No changes so far.
489 if combinedInfoBits&mayNeedNorm != 0 {
490 s = norm.NFC.String(s)
491 }
492 } else {
493 b = append(b, s[k:]...)
494 if norm.NFC.QuickSpan(b) != len(b) {
495 b = norm.NFC.Bytes(b)
496 }
497 // TODO: the punycode converters require strings as input.
498 s = string(b)
499 }
500 return s, bidi, err
501}
502
// A labelIter allows iterating over domain name labels.
//
// Labels are read directly from orig until set is called for the first time;
// from then on they are read from slice, which holds orig split on '.'.
type labelIter struct {
	orig     string
	slice    []string
	curStart int
	curEnd   int
	i        int
}

// reset rewinds the iterator to the first label. Labels replaced with set are
// kept.
func (l *labelIter) reset() {
	l.curStart, l.curEnd, l.i = 0, 0, 0
}

// done reports whether the iteration has moved past the last label.
func (l *labelIter) done() bool {
	return len(l.orig) <= l.curStart
}

// result returns the domain name with any replaced labels joined by '.', or
// the original string if no label was replaced.
func (l *labelIter) result() string {
	if l.slice == nil {
		return l.orig
	}
	return strings.Join(l.slice, ".")
}

// label returns the current label. While reading from orig it also records
// where the label ends so that next can advance past it.
func (l *labelIter) label() string {
	if l.slice != nil {
		return l.slice[l.i]
	}
	if dot := strings.IndexByte(l.orig[l.curStart:], '.'); dot >= 0 {
		l.curEnd = l.curStart + dot
	} else {
		l.curEnd = len(l.orig)
	}
	return l.orig[l.curStart:l.curEnd]
}

// next sets the value to the next label. It skips the last label if it is empty.
func (l *labelIter) next() {
	l.i++
	if l.slice == nil {
		l.curStart = l.curEnd + 1
		if l.curStart == len(l.orig)-1 && l.orig[l.curStart] == '.' {
			l.curStart = len(l.orig)
		}
		return
	}
	last := len(l.slice) - 1
	if l.i > last || (l.i == last && l.slice[l.i] == "") {
		// Mark the iteration as finished for done.
		l.curStart = len(l.orig)
	}
}

// set replaces the current label with s, splitting orig into labels on first
// use.
func (l *labelIter) set(s string) {
	if l.slice == nil {
		l.slice = strings.Split(l.orig, ".")
	}
	l.slice[l.i] = s
}
562
// acePrefix is the ASCII Compatible Encoding prefix that marks a label as
// Punycode-encoded.
const acePrefix = "xn--"
565
566func (p *Profile) simplify(cat category) category {
567 switch cat {
568 case disallowedSTD3Mapped:
569 if p.useSTD3Rules {
570 cat = disallowed
571 } else {
572 cat = mapped
573 }
574 case disallowedSTD3Valid:
575 if p.useSTD3Rules {
576 cat = disallowed
577 } else {
578 cat = valid
579 }
580 case deviation:
581 if !p.transitional {
582 cat = valid
583 }
584 case validNV8, validXV8:
585 // TODO: handle V2008
586 cat = valid
587 }
588 return cat
589}
590
591func validateFromPunycode(p *Profile, s string) error {
592 if !norm.NFC.IsNormalString(s) {
593 return &labelError{s, "V1"}
594 }
595 // TODO: detect whether string may have to be normalized in the following
596 // loop.
597 for i := 0; i < len(s); {
598 v, sz := trie.lookupString(s[i:])
599 if sz == 0 {
600 return runeError(utf8.RuneError)
601 }
602 if c := p.simplify(info(v).category()); c != valid && c != deviation {
603 return &labelError{s, "V6"}
604 }
605 i += sz
606 }
607 return nil
608}
609
610const (
611 zwnj = "\u200c"
612 zwj = "\u200d"
613)
614
615type joinState int8
616
617const (
618 stateStart joinState = iota
619 stateVirama
620 stateBefore
621 stateBeforeVirama
622 stateAfter
623 stateFAIL
624)
625
626var joinStates = [][numJoinTypes]joinState{
627 stateStart: {
628 joiningL: stateBefore,
629 joiningD: stateBefore,
630 joinZWNJ: stateFAIL,
631 joinZWJ: stateFAIL,
632 joinVirama: stateVirama,
633 },
634 stateVirama: {
635 joiningL: stateBefore,
636 joiningD: stateBefore,
637 },
638 stateBefore: {
639 joiningL: stateBefore,
640 joiningD: stateBefore,
641 joiningT: stateBefore,
642 joinZWNJ: stateAfter,
643 joinZWJ: stateFAIL,
644 joinVirama: stateBeforeVirama,
645 },
646 stateBeforeVirama: {
647 joiningL: stateBefore,
648 joiningD: stateBefore,
649 joiningT: stateBefore,
650 },
651 stateAfter: {
652 joiningL: stateFAIL,
653 joiningD: stateBefore,
654 joiningT: stateAfter,
655 joiningR: stateStart,
656 joinZWNJ: stateFAIL,
657 joinZWJ: stateFAIL,
658 joinVirama: stateAfter, // no-op as we can't accept joiners here
659 },
660 stateFAIL: {
661 0: stateFAIL,
662 joiningL: stateFAIL,
663 joiningD: stateFAIL,
664 joiningT: stateFAIL,
665 joiningR: stateFAIL,
666 joinZWNJ: stateFAIL,
667 joinZWJ: stateFAIL,
668 joinVirama: stateFAIL,
669 },
670}
671
672// validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are
673// already implicitly satisfied by the overall implementation.
674func (p *Profile) validateLabel(s string) (err error) {
675 if s == "" {
676 if p.verifyDNSLength {
677 return &labelError{s, "A4"}
678 }
679 return nil
680 }
681 if !p.validateLabels {
682 return nil
683 }
684 trie := p.trie // p.validateLabels is only set if trie is set.
685 if len(s) > 4 && s[2] == '-' && s[3] == '-' {
686 return &labelError{s, "V2"}
687 }
688 if s[0] == '-' || s[len(s)-1] == '-' {
689 return &labelError{s, "V3"}
690 }
691 // TODO: merge the use of this in the trie.
692 v, sz := trie.lookupString(s)
693 x := info(v)
694 if x.isModifier() {
695 return &labelError{s, "V5"}
696 }
697 // Quickly return in the absence of zero-width (non) joiners.
698 if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 {
699 return nil
700 }
701 st := stateStart
702 for i := 0; ; {
703 jt := x.joinType()
704 if s[i:i+sz] == zwj {
705 jt = joinZWJ
706 } else if s[i:i+sz] == zwnj {
707 jt = joinZWNJ
708 }
709 st = joinStates[st][jt]
710 if x.isViramaModifier() {
711 st = joinStates[st][joinVirama]
712 }
713 if i += sz; i == len(s) {
714 break
715 }
716 v, sz = trie.lookupString(s[i:])
717 x = info(v)
718 }
719 if st == stateFAIL || st == stateAfter {
720 return &labelError{s, "C"}
721 }
722 return nil
723}
724
// ascii reports whether s consists solely of ASCII bytes, that is, bytes
// below utf8.RuneSelf. The empty string is ASCII.
func ascii(s string) bool {
	for _, c := range []byte(s) {
		if c >= utf8.RuneSelf {
			return false
		}
	}
	return true
}