parse.go 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import (
  6. "errors"
  7. "sort"
  8. "strconv"
  9. "strings"
  10. "golang.org/x/text/internal/language"
  11. )
  12. // ValueError is returned by any of the parsing functions when the
  13. // input is well-formed but the respective subtag is not recognized
  14. // as a valid value.
  15. type ValueError interface {
  16. error
  17. // Subtag returns the subtag for which the error occurred.
  18. Subtag() string
  19. }
  20. // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
  21. // failed it returns an error and any part of the tag that could be parsed.
  22. // If parsing succeeded but an unknown value was found, it returns
  23. // ValueError. The Tag returned in this case is just stripped of the unknown
  24. // value. All other values are preserved. It accepts tags in the BCP 47 format
  25. // and extensions to this standard defined in
  26. // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  27. // The resulting tag is canonicalized using the default canonicalization type.
  28. func Parse(s string) (t Tag, err error) {
  29. return Default.Parse(s)
  30. }
  31. // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
  32. // failed it returns an error and any part of the tag that could be parsed.
  33. // If parsing succeeded but an unknown value was found, it returns
  34. // ValueError. The Tag returned in this case is just stripped of the unknown
  35. // value. All other values are preserved. It accepts tags in the BCP 47 format
  36. // and extensions to this standard defined in
  37. // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  38. // The resulting tag is canonicalized using the canonicalization type c.
  39. func (c CanonType) Parse(s string) (t Tag, err error) {
  40. defer func() {
  41. if recover() != nil {
  42. t = Tag{}
  43. err = language.ErrSyntax
  44. }
  45. }()
  46. tt, err := language.Parse(s)
  47. if err != nil {
  48. return makeTag(tt), err
  49. }
  50. tt, changed := canonicalize(c, tt)
  51. if changed {
  52. tt.RemakeString()
  53. }
  54. return makeTag(tt), err
  55. }
  56. // Compose creates a Tag from individual parts, which may be of type Tag, Base,
  57. // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
  58. // Base, Script or Region or slice of type Variant or Extension is passed more
  59. // than once, the latter will overwrite the former. Variants and Extensions are
  60. // accumulated, but if two extensions of the same type are passed, the latter
  61. // will replace the former. For -u extensions, though, the key-type pairs are
  62. // added, where later values overwrite older ones. A Tag overwrites all former
  63. // values and typically only makes sense as the first argument. The resulting
  64. // tag is returned after canonicalizing using the Default CanonType. If one or
  65. // more errors are encountered, one of the errors is returned.
  66. func Compose(part ...interface{}) (t Tag, err error) {
  67. return Default.Compose(part...)
  68. }
  69. // Compose creates a Tag from individual parts, which may be of type Tag, Base,
  70. // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
  71. // Base, Script or Region or slice of type Variant or Extension is passed more
  72. // than once, the latter will overwrite the former. Variants and Extensions are
  73. // accumulated, but if two extensions of the same type are passed, the latter
  74. // will replace the former. For -u extensions, though, the key-type pairs are
  75. // added, where later values overwrite older ones. A Tag overwrites all former
  76. // values and typically only makes sense as the first argument. The resulting
  77. // tag is returned after canonicalizing using CanonType c. If one or more errors
  78. // are encountered, one of the errors is returned.
  79. func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
  80. defer func() {
  81. if recover() != nil {
  82. t = Tag{}
  83. err = language.ErrSyntax
  84. }
  85. }()
  86. var b language.Builder
  87. if err = update(&b, part...); err != nil {
  88. return und, err
  89. }
  90. b.Tag, _ = canonicalize(c, b.Tag)
  91. return makeTag(b.Make()), err
  92. }
  93. var errInvalidArgument = errors.New("invalid Extension or Variant")
  94. func update(b *language.Builder, part ...interface{}) (err error) {
  95. for _, x := range part {
  96. switch v := x.(type) {
  97. case Tag:
  98. b.SetTag(v.tag())
  99. case Base:
  100. b.Tag.LangID = v.langID
  101. case Script:
  102. b.Tag.ScriptID = v.scriptID
  103. case Region:
  104. b.Tag.RegionID = v.regionID
  105. case Variant:
  106. if v.variant == "" {
  107. err = errInvalidArgument
  108. break
  109. }
  110. b.AddVariant(v.variant)
  111. case Extension:
  112. if v.s == "" {
  113. err = errInvalidArgument
  114. break
  115. }
  116. b.SetExt(v.s)
  117. case []Variant:
  118. b.ClearVariants()
  119. for _, v := range v {
  120. b.AddVariant(v.variant)
  121. }
  122. case []Extension:
  123. b.ClearExtensions()
  124. for _, e := range v {
  125. b.SetExt(e.s)
  126. }
  127. // TODO: support parsing of raw strings based on morphology or just extensions?
  128. case error:
  129. if v != nil {
  130. err = v
  131. }
  132. }
  133. }
  134. return
  135. }
  136. var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
  137. var errTagListTooLarge = errors.New("tag list exceeds max length")
  138. // ParseAcceptLanguage parses the contents of an Accept-Language header as
  139. // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
  140. // a list of corresponding quality weights. It is more permissive than RFC 2616
  141. // and may return non-nil slices even if the input is not valid.
  142. // The Tags will be sorted by highest weight first and then by first occurrence.
  143. // Tags with a weight of zero will be dropped. An error will be returned if the
  144. // input could not be parsed.
  145. func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
  146. defer func() {
  147. if recover() != nil {
  148. tag = nil
  149. q = nil
  150. err = language.ErrSyntax
  151. }
  152. }()
  153. if strings.Count(s, "-") > 1000 {
  154. return nil, nil, errTagListTooLarge
  155. }
  156. var entry string
  157. for s != "" {
  158. if entry, s = split(s, ','); entry == "" {
  159. continue
  160. }
  161. entry, weight := split(entry, ';')
  162. // Scan the language.
  163. t, err := Parse(entry)
  164. if err != nil {
  165. id, ok := acceptFallback[entry]
  166. if !ok {
  167. return nil, nil, err
  168. }
  169. t = makeTag(language.Tag{LangID: id})
  170. }
  171. // Scan the optional weight.
  172. w := 1.0
  173. if weight != "" {
  174. weight = consume(weight, 'q')
  175. weight = consume(weight, '=')
  176. // consume returns the empty string when a token could not be
  177. // consumed, resulting in an error for ParseFloat.
  178. if w, err = strconv.ParseFloat(weight, 32); err != nil {
  179. return nil, nil, errInvalidWeight
  180. }
  181. // Drop tags with a quality weight of 0.
  182. if w <= 0 {
  183. continue
  184. }
  185. }
  186. tag = append(tag, t)
  187. q = append(q, float32(w))
  188. }
  189. sort.Stable(&tagSort{tag, q})
  190. return tag, q, nil
  191. }
  192. // consume removes a leading token c from s and returns the result or the empty
  193. // string if there is no such token.
  194. func consume(s string, c byte) string {
  195. if s == "" || s[0] != c {
  196. return ""
  197. }
  198. return strings.TrimSpace(s[1:])
  199. }
  200. func split(s string, c byte) (head, tail string) {
  201. if i := strings.IndexByte(s, c); i >= 0 {
  202. return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:])
  203. }
  204. return strings.TrimSpace(s), ""
  205. }
  206. // Add hack mapping to deal with a small number of cases that occur
  207. // in Accept-Language (with reasonable frequency).
  208. var acceptFallback = map[string]language.Language{
  209. "english": _en,
  210. "deutsch": _de,
  211. "italian": _it,
  212. "french": _fr,
  213. "*": _mul, // defined in the spec to match all languages.
  214. }
  215. type tagSort struct {
  216. tag []Tag
  217. q []float32
  218. }
  219. func (s *tagSort) Len() int {
  220. return len(s.q)
  221. }
  222. func (s *tagSort) Less(i, j int) bool {
  223. return s.q[i] > s.q[j]
  224. }
  225. func (s *tagSort) Swap(i, j int) {
  226. s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
  227. s.q[i], s.q[j] = s.q[j], s.q[i]
  228. }