charmap.go 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. // Copyright 2015 Garrett D'Amore
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the license at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package encoding
  15. import (
  16. "sync"
  17. "unicode/utf8"
  18. "golang.org/x/text/encoding"
  19. "golang.org/x/text/transform"
  20. )
  21. const (
  22. // RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'.
  23. RuneError = '\uFFFD'
  24. // RuneSelf is the rune below which UTF-8 and the Unicode values are
  25. // identical. Its also the limit for ASCII.
  26. RuneSelf = 0x80
  27. // ASCIISub is the ASCII substitution character.
  28. ASCIISub = '\x1a'
  29. )
  30. // Charmap is a structure for setting up encodings for 8-bit character sets,
  31. // for transforming between UTF8 and that other character set. It has some
  32. // ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
  33. // different implementation. This implementation uses maps, and supports
  34. // user-defined maps.
  35. //
  36. // We do assume that a character map has a reasonable substitution character,
  37. // and that valid encodings are stable (exactly a 1:1 map) and stateless
  38. // (that is there is no shift character or anything like that.) Hence this
  39. // approach will not work for many East Asian character sets.
  40. //
  41. // Measurement shows little or no measurable difference in the performance of
  42. // the two approaches. The difference was down to a couple of nsec/op, and
  43. // no consistent pattern as to which ran faster. With the conversion to
  44. // UTF-8 the code takes about 25 nsec/op. The conversion in the reverse
  45. // direction takes about 100 nsec/op. (The larger cost for conversion
  46. // from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
  47. // to a rune before conversion.
  48. //
  49. type Charmap struct {
  50. transform.NopResetter
  51. bytes map[rune]byte
  52. runes [256][]byte
  53. once sync.Once
  54. // The map between bytes and runes. To indicate that a specific
  55. // byte value is invalid for a charcter set, use the rune
  56. // utf8.RuneError. Values that are absent from this map will
  57. // be assumed to have the identity mapping -- that is the default
  58. // is to assume ISO8859-1, where all 8-bit characters have the same
  59. // numeric value as their Unicode runes. (Not to be confused with
  60. // the UTF-8 values, which *will* be different for non-ASCII runes.)
  61. //
  62. // If no values less than RuneSelf are changed (or have non-identity
  63. // mappings), then the character set is assumed to be an ASCII
  64. // superset, and certain assumptions and optimizations become
  65. // available for ASCII bytes.
  66. Map map[byte]rune
  67. // The ReplacementChar is the byte value to use for substitution.
  68. // It should normally be ASCIISub for ASCII encodings. This may be
  69. // unset (left to zero) for mappings that are strictly ASCII supersets.
  70. // In that case ASCIISub will be assumed instead.
  71. ReplacementChar byte
  72. }
  73. type cmapDecoder struct {
  74. transform.NopResetter
  75. runes [256][]byte
  76. }
  77. type cmapEncoder struct {
  78. transform.NopResetter
  79. bytes map[rune]byte
  80. replace byte
  81. }
  82. // Init initializes internal values of a character map. This should
  83. // be done early, to minimize the cost of allocation of transforms
  84. // later. It is not strictly necessary however, as the allocation
  85. // functions will arrange to call it if it has not already been done.
  86. func (c *Charmap) Init() {
  87. c.once.Do(c.initialize)
  88. }
  89. func (c *Charmap) initialize() {
  90. c.bytes = make(map[rune]byte)
  91. ascii := true
  92. for i := 0; i < 256; i++ {
  93. r, ok := c.Map[byte(i)]
  94. if !ok {
  95. r = rune(i)
  96. }
  97. if r < 128 && r != rune(i) {
  98. ascii = false
  99. }
  100. if r != RuneError {
  101. c.bytes[r] = byte(i)
  102. }
  103. utf := make([]byte, utf8.RuneLen(r))
  104. utf8.EncodeRune(utf, r)
  105. c.runes[i] = utf
  106. }
  107. if ascii && c.ReplacementChar == '\x00' {
  108. c.ReplacementChar = ASCIISub
  109. }
  110. }
  111. // NewDecoder returns a Decoder the converts from the 8-bit
  112. // character set to UTF-8. Unknown mappings, if any, are mapped
  113. // to '\uFFFD'.
  114. func (c *Charmap) NewDecoder() *encoding.Decoder {
  115. c.Init()
  116. return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}}
  117. }
  118. // NewEncoder returns a Transformer that converts from UTF8 to the
  119. // 8-bit character set. Unknown mappings are mapped to 0x1A.
  120. func (c *Charmap) NewEncoder() *encoding.Encoder {
  121. c.Init()
  122. return &encoding.Encoder{
  123. Transformer: &cmapEncoder{
  124. bytes: c.bytes,
  125. replace: c.ReplacementChar,
  126. },
  127. }
  128. }
  129. func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
  130. var e error
  131. var ndst, nsrc int
  132. for _, c := range src {
  133. b := d.runes[c]
  134. l := len(b)
  135. if ndst+l > len(dst) {
  136. e = transform.ErrShortDst
  137. break
  138. }
  139. for i := 0; i < l; i++ {
  140. dst[ndst] = b[i]
  141. ndst++
  142. }
  143. nsrc++
  144. }
  145. return ndst, nsrc, e
  146. }
  147. func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
  148. var e error
  149. var ndst, nsrc int
  150. for nsrc < len(src) {
  151. if ndst >= len(dst) {
  152. e = transform.ErrShortDst
  153. break
  154. }
  155. r, sz := utf8.DecodeRune(src[nsrc:])
  156. if r == utf8.RuneError && sz == 1 {
  157. // If its inconclusive due to insufficient data in
  158. // in the source, report it
  159. if !atEOF && !utf8.FullRune(src[nsrc:]) {
  160. e = transform.ErrShortSrc
  161. break
  162. }
  163. }
  164. if c, ok := d.bytes[r]; ok {
  165. dst[ndst] = c
  166. } else {
  167. dst[ndst] = d.replace
  168. }
  169. nsrc += sz
  170. ndst++
  171. }
  172. return ndst, nsrc, e
  173. }