peridot/vendor/golang.org/x/text/encoding/encoding.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
package encoding // import "golang.org/x/text/encoding"

import (
	"errors"
	"io"
	"strconv"
	"unicode/utf8"

	"golang.org/x/text/encoding/internal/identifier"
	"golang.org/x/text/transform"
)

// TODO:
// - There seems to be some inconsistency in when decoders return errors
//   and when not. Also documentation seems to suggest they shouldn't return
//   errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
//   normal form. Perhaps add an example how users could prepare their output.

// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
type Encoding interface {
	// NewDecoder returns a Decoder.
	NewDecoder() *Decoder

	// NewEncoder returns an Encoder.
	NewEncoder() *Encoder
}

// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
	transform.Transformer

	// This forces external creators of Decoders to use names in struct
	// initializers, allowing for future extendibility without having to break
	// code.
	_ struct{}
}

// Bytes converts the given encoded bytes to UTF-8. It returns the converted
// bytes or nil, err if any error occurred.
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
	b, _, err := transform.Bytes(d, b)
	if err != nil {
		return nil, err
	}
	return b, nil
}

// String converts the given encoded string to UTF-8. It returns the converted
// string or "", err if any error occurred.
func (d *Decoder) String(s string) (string, error) {
	s, _, err := transform.String(d, s)
	if err != nil {
		return "", err
	}
	return s, nil
}

// Reader wraps another Reader to decode its bytes.
//
// The Decoder may not be used for any other operation as long as the returned
// Reader is in use.
func (d *Decoder) Reader(r io.Reader) io.Reader {
	return transform.NewReader(r, d)
}

// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
//
// Each rune that cannot be transcoded will result in an error. In this case,
// the transform will consume all source byte up to, not including the offending
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
// `\uFFFD`. To return early with an error instead, use transform.Chain to
// preprocess the data with a UTF8Validator.
type Encoder struct {
	transform.Transformer

	// This forces external creators of Encoders to use names in struct
	// initializers, allowing for future extendibility without having to break
	// code.
	_ struct{}
}

// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
// any error occurred.
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
	b, _, err := transform.Bytes(e, b)
	if err != nil {
		return nil, err
	}
	return b, nil
}

// String converts a string from UTF-8. It returns the converted string or
// "", err if any error occurred.
func (e *Encoder) String(s string) (string, error) {
	s, _, err := transform.String(e, s)
	if err != nil {
		return "", err
	}
	return s, nil
}

// Writer wraps another Writer to encode its UTF-8 output.
//
// The Encoder may not be used for any other operation as long as the returned
// Writer is in use.
func (e *Encoder) Writer(w io.Writer) io.Writer {
	return transform.NewWriter(w, e)
}

// ASCIISub is the ASCII substitute character, as recommended by
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'

// Nop is the nop encoding. Its transformed bytes are the same as the source
// bytes; it does not replace invalid UTF-8 sequences.
var Nop Encoding = nop{}

type nop struct{}

func (nop) NewDecoder() *Decoder {
	return &Decoder{Transformer: transform.Nop}
}
func (nop) NewEncoder() *Encoder {
	return &Encoder{Transformer: transform.Nop}
}

// Replacement is the replacement encoding. Decoding from the replacement
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
// the replacement encoding yields the same as the source bytes except that
// invalid UTF-8 is converted to '\uFFFD'.
//
// It is defined at http://encoding.spec.whatwg.org/#replacement
var Replacement Encoding = replacement{}

type replacement struct{}

func (replacement) NewDecoder() *Decoder {
	return &Decoder{Transformer: replacementDecoder{}}
}

func (replacement) NewEncoder() *Encoder {
	return &Encoder{Transformer: replacementEncoder{}}
}

func (replacement) ID() (mib identifier.MIB, other string) {
	return identifier.Replacement, ""
}

type replacementDecoder struct{ transform.NopResetter }

func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	if len(dst) < 3 {
		return 0, 0, transform.ErrShortDst
	}
	if atEOF {
		const fffd = "\ufffd"
		dst[0] = fffd[0]
		dst[1] = fffd[1]
		dst[2] = fffd[2]
		nDst = 3
	}
	return nDst, len(src), nil
}

type replacementEncoder struct{ transform.NopResetter }

func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	r, size := rune(0), 0

	for ; nSrc < len(src); nSrc += size {
		r = rune(src[nSrc])

		// Decode a 1-byte rune.
		if r < utf8.RuneSelf {
			size = 1

		} else {
			// Decode a multi-byte rune.
			r, size = utf8.DecodeRune(src[nSrc:])
			if size == 1 {
				// All valid runes of size 1 (those below utf8.RuneSelf) were
				// handled above. We have invalid UTF-8 or we haven't seen the
				// full character yet.
				if !atEOF && !utf8.FullRune(src[nSrc:]) {
					err = transform.ErrShortSrc
					break
				}
				r = '\ufffd'
			}
		}

		if nDst+utf8.RuneLen(r) > len(dst) {
			err = transform.ErrShortDst
			break
		}
		nDst += utf8.EncodeRune(dst[nDst:], r)
	}
	return nDst, nSrc, err
}

// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
// repertoire of the destination encoding with HTML escape sequences.
//
// This wrapper exists to comply to URL and HTML forms requiring a
// non-terminating legacy encoder. The produced sequences may lead to data
// loss as they are indistinguishable from legitimate input. To avoid this
// issue, use UTF-8 encodings whenever possible.
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
	return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
}

// ReplaceUnsupported wraps encoders to replace source runes outside the
// repertoire of the destination encoding with an encoding-specific
// replacement.
//
// This wrapper is only provided for backwards compatibility and legacy
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
func ReplaceUnsupported(e *Encoder) *Encoder {
	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
}

type errorHandler struct {
	*Encoder
	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
}

// TODO: consider making this error public in some form.
type repertoireError interface {
	Replacement() byte
}

func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
	for err != nil {
		rerr, ok := err.(repertoireError)
		if !ok {
			return nDst, nSrc, err
		}
		r, sz := utf8.DecodeRune(src[nSrc:])
		n, ok := h.handler(dst[nDst:], r, rerr)
		if !ok {
			return nDst, nSrc, transform.ErrShortDst
		}
		err = nil
		nDst += n
		if nSrc += sz; nSrc < len(src) {
			var dn, sn int
			dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
			nDst += dn
			nSrc += sn
		}
	}
	return nDst, nSrc, err
}

func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
	buf := [8]byte{}
	b := strconv.AppendUint(buf[:0], uint64(r), 10)
	if n = len(b) + len("&#;"); n >= len(dst) {
		return 0, false
	}
	dst[0] = '&'
	dst[1] = '#'
	dst[copy(dst[2:], b)+2] = ';'
	return n, true
}

func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
	if len(dst) == 0 {
		return 0, false
	}
	dst[0] = err.Replacement()
	return 1, true
}

// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")

// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
// input byte that is not valid UTF-8.
var UTF8Validator transform.Transformer = utf8Validator{}

type utf8Validator struct{ transform.NopResetter }

func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	n := len(src)
	if n > len(dst) {
		n = len(dst)
	}
	for i := 0; i < n; {
		if c := src[i]; c < utf8.RuneSelf {
			dst[i] = c
			i++
			continue
		}
		_, size := utf8.DecodeRune(src[i:])
		if size == 1 {
			// All valid runes of size 1 (those below utf8.RuneSelf) were
			// handled above. We have invalid UTF-8 or we haven't seen the
			// full character yet.
			err = ErrInvalidUTF8
			if !atEOF && !utf8.FullRune(src[i:]) {
				err = transform.ErrShortSrc
			}
			return i, i, err
		}
		if i+size > len(dst) {
			return i, i, transform.ErrShortDst
		}
		for ; size > 0; size-- {
			dst[i] = src[i]
			i++
		}
	}
	if len(src) > len(dst) {
		err = transform.ErrShortDst
	}
	return n, n, err
}
Initial commit 2022-07-07 20:11:50 +00:00			`// Copyright 2013 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`// Package encoding defines an interface for character encodings, such as Shift`
			`// JIS and Windows 1252, that can convert to and from UTF-8.`
			`//`
			`// Encoding implementations are provided in other packages, such as`
			`// golang.org/x/text/encoding/charmap and`
			`// golang.org/x/text/encoding/japanese.`
			`package encoding // import "golang.org/x/text/encoding"`

			`import (`
			`"errors"`
			`"io"`
			`"strconv"`
			`"unicode/utf8"`

			`"golang.org/x/text/encoding/internal/identifier"`
			`"golang.org/x/text/transform"`
			`)`

			`// TODO:`
			`// - There seems to be some inconsistency in when decoders return errors`
			`// and when not. Also documentation seems to suggest they shouldn't return`
			`// errors at all (except for UTF-16).`
			`// - Encoders seem to rely on or at least benefit from the input being in NFC`
			`// normal form. Perhaps add an example how users could prepare their output.`

			`// Encoding is a character set encoding that can be transformed to and from`
			`// UTF-8.`
			`type Encoding interface {`
			`// NewDecoder returns a Decoder.`
			`NewDecoder() *Decoder`

			`// NewEncoder returns an Encoder.`
			`NewEncoder() *Encoder`
			`}`

			`// A Decoder converts bytes to UTF-8. It implements transform.Transformer.`
			`//`
			`// Transforming source bytes that are not of that encoding will not result in an`
			`// error per se. Each byte that cannot be transcoded will be represented in the`
			`// output by the UTF-8 encoding of '\uFFFD', the replacement rune.`
			`type Decoder struct {`
			`transform.Transformer`

			`// This forces external creators of Decoders to use names in struct`
			`// initializers, allowing for future extendibility without having to break`
			`// code.`
			`_ struct{}`
			`}`

			`// Bytes converts the given encoded bytes to UTF-8. It returns the converted`
			`// bytes or nil, err if any error occurred.`
			`func (d *Decoder) Bytes(b []byte) ([]byte, error) {`
			`b, _, err := transform.Bytes(d, b)`
			`if err != nil {`
			`return nil, err`
			`}`
			`return b, nil`
			`}`

			`// String converts the given encoded string to UTF-8. It returns the converted`
			`// string or "", err if any error occurred.`
			`func (d *Decoder) String(s string) (string, error) {`
			`s, _, err := transform.String(d, s)`
			`if err != nil {`
			`return "", err`
			`}`
			`return s, nil`
			`}`

			`// Reader wraps another Reader to decode its bytes.`
			`//`
			`// The Decoder may not be used for any other operation as long as the returned`
			`// Reader is in use.`
			`func (d *Decoder) Reader(r io.Reader) io.Reader {`
			`return transform.NewReader(r, d)`
			`}`

			`// An Encoder converts bytes from UTF-8. It implements transform.Transformer.`
			`//`
			`// Each rune that cannot be transcoded will result in an error. In this case,`
			`// the transform will consume all source byte up to, not including the offending`
			`// rune. Transforming source bytes that are not valid UTF-8 will be replaced by`
			// `\uFFFD`. To return early with an error instead, use transform.Chain to
			`// preprocess the data with a UTF8Validator.`
			`type Encoder struct {`
			`transform.Transformer`

			`// This forces external creators of Encoders to use names in struct`
			`// initializers, allowing for future extendibility without having to break`
			`// code.`
			`_ struct{}`
			`}`

			`// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if`
			`// any error occurred.`
			`func (e *Encoder) Bytes(b []byte) ([]byte, error) {`
			`b, _, err := transform.Bytes(e, b)`
			`if err != nil {`
			`return nil, err`
			`}`
			`return b, nil`
			`}`

			`// String converts a string from UTF-8. It returns the converted string or`
			`// "", err if any error occurred.`
			`func (e *Encoder) String(s string) (string, error) {`
			`s, _, err := transform.String(e, s)`
			`if err != nil {`
			`return "", err`
			`}`
			`return s, nil`
			`}`

			`// Writer wraps another Writer to encode its UTF-8 output.`
			`//`
			`// The Encoder may not be used for any other operation as long as the returned`
			`// Writer is in use.`
			`func (e *Encoder) Writer(w io.Writer) io.Writer {`
			`return transform.NewWriter(w, e)`
			`}`

			`// ASCIISub is the ASCII substitute character, as recommended by`
			`// https://unicode.org/reports/tr36/#Text_Comparison`
			`const ASCIISub = '\x1a'`

			`// Nop is the nop encoding. Its transformed bytes are the same as the source`
			`// bytes; it does not replace invalid UTF-8 sequences.`
			`var Nop Encoding = nop{}`

			`type nop struct{}`

			`func (nop) NewDecoder() *Decoder {`
			`return &Decoder{Transformer: transform.Nop}`
			`}`
			`func (nop) NewEncoder() *Encoder {`
			`return &Encoder{Transformer: transform.Nop}`
			`}`

			`// Replacement is the replacement encoding. Decoding from the replacement`
			`// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to`
			`// the replacement encoding yields the same as the source bytes except that`
			`// invalid UTF-8 is converted to '\uFFFD'.`
			`//`
			`// It is defined at http://encoding.spec.whatwg.org/#replacement`
			`var Replacement Encoding = replacement{}`

			`type replacement struct{}`

			`func (replacement) NewDecoder() *Decoder {`
			`return &Decoder{Transformer: replacementDecoder{}}`
			`}`

			`func (replacement) NewEncoder() *Encoder {`
			`return &Encoder{Transformer: replacementEncoder{}}`
			`}`

			`func (replacement) ID() (mib identifier.MIB, other string) {`
			`return identifier.Replacement, ""`
			`}`

			`type replacementDecoder struct{ transform.NopResetter }`

			`func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {`
			`if len(dst) < 3 {`
			`return 0, 0, transform.ErrShortDst`
			`}`
			`if atEOF {`
			`const fffd = "\ufffd"`
			`dst[0] = fffd[0]`
			`dst[1] = fffd[1]`
			`dst[2] = fffd[2]`
			`nDst = 3`
			`}`
			`return nDst, len(src), nil`
			`}`

			`type replacementEncoder struct{ transform.NopResetter }`

			`func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {`
			`r, size := rune(0), 0`

			`for ; nSrc < len(src); nSrc += size {`
			`r = rune(src[nSrc])`

			`// Decode a 1-byte rune.`
			`if r < utf8.RuneSelf {`
			`size = 1`

			`} else {`
			`// Decode a multi-byte rune.`
			`r, size = utf8.DecodeRune(src[nSrc:])`
			`if size == 1 {`
			`// All valid runes of size 1 (those below utf8.RuneSelf) were`
			`// handled above. We have invalid UTF-8 or we haven't seen the`
			`// full character yet.`
			`if !atEOF && !utf8.FullRune(src[nSrc:]) {`
			`err = transform.ErrShortSrc`
			`break`
			`}`
			`r = '\ufffd'`
			`}`
			`}`

			`if nDst+utf8.RuneLen(r) > len(dst) {`
			`err = transform.ErrShortDst`
			`break`
			`}`
			`nDst += utf8.EncodeRune(dst[nDst:], r)`
			`}`
			`return nDst, nSrc, err`
			`}`

			`// HTMLEscapeUnsupported wraps encoders to replace source runes outside the`
			`// repertoire of the destination encoding with HTML escape sequences.`
			`//`
			`// This wrapper exists to comply to URL and HTML forms requiring a`
			`// non-terminating legacy encoder. The produced sequences may lead to data`
			`// loss as they are indistinguishable from legitimate input. To avoid this`
			`// issue, use UTF-8 encodings whenever possible.`
			`func HTMLEscapeUnsupported(e Encoder) Encoder {`
			`return &Encoder{Transformer: &errorHandler{e, errorToHTML}}`
			`}`

			`// ReplaceUnsupported wraps encoders to replace source runes outside the`
			`// repertoire of the destination encoding with an encoding-specific`
			`// replacement.`
			`//`
			`// This wrapper is only provided for backwards compatibility and legacy`
			`// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.`
			`func ReplaceUnsupported(e Encoder) Encoder {`
			`return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}`
			`}`

			`type errorHandler struct {`
			`*Encoder`
			`handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)`
			`}`

			`// TODO: consider making this error public in some form.`
			`type repertoireError interface {`
			`Replacement() byte`
			`}`

			`func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {`
			`nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)`
			`for err != nil {`
			`rerr, ok := err.(repertoireError)`
			`if !ok {`
			`return nDst, nSrc, err`
			`}`
			`r, sz := utf8.DecodeRune(src[nSrc:])`
			`n, ok := h.handler(dst[nDst:], r, rerr)`
			`if !ok {`
			`return nDst, nSrc, transform.ErrShortDst`
			`}`
			`err = nil`
			`nDst += n`
			`if nSrc += sz; nSrc < len(src) {`
			`var dn, sn int`
			`dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)`
			`nDst += dn`
			`nSrc += sn`
			`}`
			`}`
			`return nDst, nSrc, err`
			`}`

			`func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {`
			`buf := [8]byte{}`
			`b := strconv.AppendUint(buf[:0], uint64(r), 10)`
			`if n = len(b) + len("&#;"); n >= len(dst) {`
			`return 0, false`
			`}`
			`dst[0] = '&'`
			`dst[1] = '#'`
			`dst[copy(dst[2:], b)+2] = ';'`
			`return n, true`
			`}`

			`func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {`
			`if len(dst) == 0 {`
			`return 0, false`
			`}`
			`dst[0] = err.Replacement()`
			`return 1, true`
			`}`

			`// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.`
			`var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")`

			`// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first`
			`// input byte that is not valid UTF-8.`
			`var UTF8Validator transform.Transformer = utf8Validator{}`

			`type utf8Validator struct{ transform.NopResetter }`

			`func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {`
			`n := len(src)`
			`if n > len(dst) {`
			`n = len(dst)`
			`}`
			`for i := 0; i < n; {`
			`if c := src[i]; c < utf8.RuneSelf {`
			`dst[i] = c`
			`i++`
			`continue`
			`}`
			`_, size := utf8.DecodeRune(src[i:])`
			`if size == 1 {`
			`// All valid runes of size 1 (those below utf8.RuneSelf) were`
			`// handled above. We have invalid UTF-8 or we haven't seen the`
			`// full character yet.`
			`err = ErrInvalidUTF8`
			`if !atEOF && !utf8.FullRune(src[i:]) {`
			`err = transform.ErrShortSrc`
			`}`
			`return i, i, err`
			`}`
			`if i+size > len(dst) {`
			`return i, i, transform.ErrShortDst`
			`}`
			`for ; size > 0; size-- {`
			`dst[i] = src[i]`
			`i++`
			`}`
			`}`
			`if len(src) > len(dst) {`
			`err = transform.ErrShortDst`
			`}`
			`return n, n, err`
			`}`