123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197 |
- // Copyright 2015 Garrett D'Amore
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the license at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- package encoding
- import (
- "sync"
- "unicode/utf8"
- "golang.org/x/text/encoding"
- "golang.org/x/text/transform"
- )
const (
	// RuneError is the Unicode replacement character, '\uFFFD'. It has the
	// same value as utf8.RuneError.
	RuneError = '\uFFFD'

	// RuneSelf is the boundary below which a rune's UTF-8 encoding is a
	// single byte equal to the rune's own value. It is also the upper
	// limit of ASCII.
	RuneSelf = 0x80

	// ASCIISub is the ASCII substitution (SUB) character, 0x1A.
	ASCIISub = '\x1a'
)
- // Charmap is a structure for setting up encodings for 8-bit character sets,
- // for transforming between UTF8 and that other character set. It has some
- // ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
- // different implementation. This implementation uses maps, and supports
- // user-defined maps.
- //
- // We do assume that a character map has a reasonable substitution character,
- // and that valid encodings are stable (exactly a 1:1 map) and stateless
- // (that is there is no shift character or anything like that.) Hence this
- // approach will not work for many East Asian character sets.
- //
- // Measurement shows little or no measurable difference in the performance of
- // the two approaches. The difference was down to a couple of nsec/op, and
- // no consistent pattern as to which ran faster. With the conversion to
- // UTF-8 the code takes about 25 nsec/op. The conversion in the reverse
- // direction takes about 100 nsec/op. (The larger cost for conversion
- // from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
- // to a rune before conversion.
- //
- type Charmap struct {
- transform.NopResetter
- bytes map[rune]byte
- runes [256][]byte
- once sync.Once
- // The map between bytes and runes. To indicate that a specific
- // byte value is invalid for a charcter set, use the rune
- // utf8.RuneError. Values that are absent from this map will
- // be assumed to have the identity mapping -- that is the default
- // is to assume ISO8859-1, where all 8-bit characters have the same
- // numeric value as their Unicode runes. (Not to be confused with
- // the UTF-8 values, which *will* be different for non-ASCII runes.)
- //
- // If no values less than RuneSelf are changed (or have non-identity
- // mappings), then the character set is assumed to be an ASCII
- // superset, and certain assumptions and optimizations become
- // available for ASCII bytes.
- Map map[byte]rune
- // The ReplacementChar is the byte value to use for substitution.
- // It should normally be ASCIISub for ASCII encodings. This may be
- // unset (left to zero) for mappings that are strictly ASCII supersets.
- // In that case ASCIISub will be assumed instead.
- ReplacementChar byte
- }
- type cmapDecoder struct {
- transform.NopResetter
- runes [256][]byte
- }
- type cmapEncoder struct {
- transform.NopResetter
- bytes map[rune]byte
- replace byte
- }
- // Init initializes internal values of a character map. This should
- // be done early, to minimize the cost of allocation of transforms
- // later. It is not strictly necessary however, as the allocation
- // functions will arrange to call it if it has not already been done.
- func (c *Charmap) Init() {
- c.once.Do(c.initialize)
- }
- func (c *Charmap) initialize() {
- c.bytes = make(map[rune]byte)
- ascii := true
- for i := 0; i < 256; i++ {
- r, ok := c.Map[byte(i)]
- if !ok {
- r = rune(i)
- }
- if r < 128 && r != rune(i) {
- ascii = false
- }
- if r != RuneError {
- c.bytes[r] = byte(i)
- }
- utf := make([]byte, utf8.RuneLen(r))
- utf8.EncodeRune(utf, r)
- c.runes[i] = utf
- }
- if ascii && c.ReplacementChar == '\x00' {
- c.ReplacementChar = ASCIISub
- }
- }
- // NewDecoder returns a Decoder the converts from the 8-bit
- // character set to UTF-8. Unknown mappings, if any, are mapped
- // to '\uFFFD'.
- func (c *Charmap) NewDecoder() *encoding.Decoder {
- c.Init()
- return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}}
- }
- // NewEncoder returns a Transformer that converts from UTF8 to the
- // 8-bit character set. Unknown mappings are mapped to 0x1A.
- func (c *Charmap) NewEncoder() *encoding.Encoder {
- c.Init()
- return &encoding.Encoder{
- Transformer: &cmapEncoder{
- bytes: c.bytes,
- replace: c.ReplacementChar,
- },
- }
- }
- func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
- var e error
- var ndst, nsrc int
- for _, c := range src {
- b := d.runes[c]
- l := len(b)
- if ndst+l > len(dst) {
- e = transform.ErrShortDst
- break
- }
- for i := 0; i < l; i++ {
- dst[ndst] = b[i]
- ndst++
- }
- nsrc++
- }
- return ndst, nsrc, e
- }
- func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
- var e error
- var ndst, nsrc int
- for nsrc < len(src) {
- if ndst >= len(dst) {
- e = transform.ErrShortDst
- break
- }
- r, sz := utf8.DecodeRune(src[nsrc:])
- if r == utf8.RuneError && sz == 1 {
- // If its inconclusive due to insufficient data in
- // in the source, report it
- if !atEOF && !utf8.FullRune(src[nsrc:]) {
- e = transform.ErrShortSrc
- break
- }
- }
- if c, ok := d.bytes[r]; ok {
- dst[ndst] = c
- } else {
- dst[ndst] = d.replace
- }
- nsrc += sz
- ndst++
- }
- return ndst, nsrc, e
- }
|