mirror of
https://github.com/rocky-linux/peridot.git
synced 2024-10-19 07:55:07 +00:00
382 lines
12 KiB
Go
382 lines
12 KiB
Go
|
// Copyright 2020 The Go Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style
|
||
|
// license that can be found in the LICENSE file.
|
||
|
|
||
|
package json
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
"errors"
|
||
|
"io"
|
||
|
"sort"
|
||
|
"sync"
|
||
|
"unicode/utf16"
|
||
|
"unicode/utf8"
|
||
|
)
|
||
|
|
||
|
// NOTE: RawValue is analogous to v1 json.RawMessage.
|
||
|
|
||
|
// RawValue represents a single raw JSON value, which may be one of the following:
//   - a JSON literal (i.e., null, true, or false)
//   - a JSON string (e.g., "hello, world!")
//   - a JSON number (e.g., 123.456)
//   - an entire JSON object (e.g., {"fizz":"buzz"} )
//   - an entire JSON array (e.g., [1,2,3] )
//
// RawValue can represent entire array or object values, while Token cannot.
// RawValue may contain leading and/or trailing whitespace.
type RawValue []byte
|
||
|
|
||
|
// Clone returns a copy of v.
|
||
|
func (v RawValue) Clone() RawValue {
|
||
|
if v == nil {
|
||
|
return nil
|
||
|
}
|
||
|
return append(RawValue{}, v...)
|
||
|
}
|
||
|
|
||
|
// String returns the string formatting of v.
|
||
|
func (v RawValue) String() string {
|
||
|
if v == nil {
|
||
|
return "null"
|
||
|
}
|
||
|
return string(v)
|
||
|
}
|
||
|
|
||
|
// IsValid reports whether the raw JSON value is syntactically valid
|
||
|
// according to RFC 7493.
|
||
|
//
|
||
|
// It verifies whether the input is properly encoded as UTF-8,
|
||
|
// that escape sequences within strings decode to valid Unicode codepoints, and
|
||
|
// that all names in each object are unique.
|
||
|
// It does not verify whether numbers are representable within the limits
|
||
|
// of any common numeric type (e.g., float64, int64, or uint64).
|
||
|
func (v RawValue) IsValid() bool {
|
||
|
d := getBufferedDecoder(v, DecodeOptions{})
|
||
|
defer putBufferedDecoder(d)
|
||
|
_, errVal := d.ReadValue()
|
||
|
_, errEOF := d.ReadToken()
|
||
|
return errVal == nil && errEOF == io.EOF
|
||
|
}
|
||
|
|
||
|
// Compact removes all whitespace from the raw JSON value.
|
||
|
//
|
||
|
// It does not reformat JSON strings to use any other representation.
|
||
|
// It is guaranteed to succeed if the input is valid.
|
||
|
// If the value is already compacted, then the buffer is not mutated.
|
||
|
func (v *RawValue) Compact() error {
|
||
|
return v.reformat(false, false, "", "")
|
||
|
}
|
||
|
|
||
|
// Indent reformats the whitespace in the raw JSON value so that each element
|
||
|
// in a JSON object or array begins on a new, indented line beginning with
|
||
|
// prefix followed by one or more copies of indent according to the nesting.
|
||
|
// The value does not begin with the prefix nor any indention,
|
||
|
// to make it easier to embed inside other formatted JSON data.
|
||
|
//
|
||
|
// It does not reformat JSON strings to use any other representation.
|
||
|
// It is guaranteed to succeed if the input is valid.
|
||
|
// If the value is already indented properly, then the buffer is not mutated.
|
||
|
func (v *RawValue) Indent(prefix, indent string) error {
|
||
|
return v.reformat(false, true, prefix, indent)
|
||
|
}
|
||
|
|
||
|
// Canonicalize canonicalizes the raw JSON value according to the
|
||
|
// JSON Canonicalization Scheme (JCS) as defined by RFC 8785
|
||
|
// where it produces a stable representation of a JSON value.
|
||
|
//
|
||
|
// The output stability is dependent on the stability of the application data
|
||
|
// (see RFC 8785, Appendix E). It cannot produce stable output from
|
||
|
// fundamentally unstable input. For example, if the JSON value
|
||
|
// contains ephemeral data (e.g., a frequently changing timestamp),
|
||
|
// then the value is still unstable regardless of whether this is called.
|
||
|
//
|
||
|
// Note that JCS treats all JSON numbers as IEEE 754 double precision numbers.
|
||
|
// Any numbers with precision beyond what is representable by that form
|
||
|
// will lose their precision when canonicalized. For example, integer values
|
||
|
// beyond ±2⁵³ will lose their precision. It is recommended that
|
||
|
// int64 and uint64 data types be represented as a JSON string.
|
||
|
//
|
||
|
// It is guaranteed to succeed if the input is valid.
|
||
|
// If the value is already canonicalized, then the buffer is not mutated.
|
||
|
func (v *RawValue) Canonicalize() error {
|
||
|
return v.reformat(true, false, "", "")
|
||
|
}
|
||
|
|
||
|
// TODO: Instead of implementing the v1 Marshaler/Unmarshaler,
|
||
|
// consider implementing the v2 versions instead.
|
||
|
|
||
|
// MarshalJSON returns v as the JSON encoding of v.
|
||
|
// It returns the stored value as the raw JSON output without any validation.
|
||
|
// If v is nil, then this returns a JSON null.
|
||
|
func (v RawValue) MarshalJSON() ([]byte, error) {
|
||
|
// NOTE: This matches the behavior of v1 json.RawMessage.MarshalJSON.
|
||
|
if v == nil {
|
||
|
return []byte("null"), nil
|
||
|
}
|
||
|
return v, nil
|
||
|
}
|
||
|
|
||
|
// UnmarshalJSON sets v as the JSON encoding of b.
|
||
|
// It stores a copy of the provided raw JSON input without any validation.
|
||
|
func (v *RawValue) UnmarshalJSON(b []byte) error {
|
||
|
// NOTE: This matches the behavior of v1 json.RawMessage.UnmarshalJSON.
|
||
|
if v == nil {
|
||
|
return errors.New("json.RawValue: UnmarshalJSON on nil pointer")
|
||
|
}
|
||
|
*v = append((*v)[:0], b...)
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// Kind returns the starting token kind.
|
||
|
// For a valid value, this will never include '}' or ']'.
|
||
|
func (v RawValue) Kind() Kind {
|
||
|
if v := v[consumeWhitespace(v):]; len(v) > 0 {
|
||
|
return Kind(v[0]).normalize()
|
||
|
}
|
||
|
return invalidKind
|
||
|
}
|
||
|
|
||
|
func (v *RawValue) reformat(canonical, multiline bool, prefix, indent string) error {
|
||
|
var eo EncodeOptions
|
||
|
if canonical {
|
||
|
eo.AllowInvalidUTF8 = false // per RFC 8785, section 3.2.4
|
||
|
eo.AllowDuplicateNames = false // per RFC 8785, section 3.1
|
||
|
eo.canonicalizeNumbers = true // per RFC 8785, section 3.2.2.3
|
||
|
eo.EscapeRune = nil // per RFC 8785, section 3.2.2.2
|
||
|
eo.multiline = false // per RFC 8785, section 3.2.1
|
||
|
} else {
|
||
|
if s := trimLeftSpaceTab(prefix); len(s) > 0 {
|
||
|
panic("json: invalid character " + quoteRune([]byte(s)) + " in indent prefix")
|
||
|
}
|
||
|
if s := trimLeftSpaceTab(indent); len(s) > 0 {
|
||
|
panic("json: invalid character " + quoteRune([]byte(s)) + " in indent")
|
||
|
}
|
||
|
eo.AllowInvalidUTF8 = true
|
||
|
eo.AllowDuplicateNames = true
|
||
|
eo.preserveRawStrings = true
|
||
|
eo.multiline = multiline // in case indent is empty
|
||
|
eo.IndentPrefix = prefix
|
||
|
eo.Indent = indent
|
||
|
}
|
||
|
eo.omitTopLevelNewline = true
|
||
|
|
||
|
// Write the entire value to reformat all tokens and whitespace.
|
||
|
e := getBufferedEncoder(eo)
|
||
|
defer putBufferedEncoder(e)
|
||
|
if err := e.WriteValue(*v); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
// For canonical output, we may need to reorder object members.
|
||
|
if canonical {
|
||
|
// Obtain a buffered encoder just to use its internal buffer as
|
||
|
// a scratch buffer in reorderObjects for reordering object members.
|
||
|
e2 := getBufferedEncoder(EncodeOptions{})
|
||
|
defer putBufferedEncoder(e2)
|
||
|
|
||
|
// Disable redundant checks performed earlier during encoding.
|
||
|
d := getBufferedDecoder(e.buf, DecodeOptions{AllowInvalidUTF8: true, AllowDuplicateNames: true})
|
||
|
defer putBufferedDecoder(d)
|
||
|
reorderObjects(d, &e2.buf) // per RFC 8785, section 3.2.3
|
||
|
}
|
||
|
|
||
|
// Store the result back into the value if different.
|
||
|
if !bytes.Equal(*v, e.buf) {
|
||
|
*v = append((*v)[:0], e.buf...)
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// trimLeftSpaceTab returns s with every leading space and tab removed.
// The result is empty if s consists solely of spaces and tabs.
func trimLeftSpaceTab(s string) string {
	for i, r := range s {
		switch r {
		case ' ', '\t':
		default:
			return s[i:]
		}
	}
	return ""
}
|
||
|
|
||
|
// memberName records the name and location of a single JSON object member
// so that members can be sorted and then moved within the decoder buffer.
type memberName struct {
	// name is the unescaped name.
	name []byte
	// before and after are byte offsets into Decoder.buf that represents
	// the entire name/value pair. It may contain leading commas.
	before, after int64
}
|
||
|
|
||
|
// memberNamePool recycles memberNames slices between calls so that
// their backing storage can be reused.
var memberNamePool = sync.Pool{New: func() any { return new(memberNames) }}
|
||
|
|
||
|
func getMemberNames() *memberNames {
|
||
|
ns := memberNamePool.Get().(*memberNames)
|
||
|
*ns = (*ns)[:0]
|
||
|
return ns
|
||
|
}
|
||
|
func putMemberNames(ns *memberNames) {
|
||
|
if cap(*ns) < 1<<10 {
|
||
|
for i := range *ns {
|
||
|
(*ns)[i] = memberName{} // avoid pinning name
|
||
|
}
|
||
|
memberNamePool.Put(ns)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
type memberNames []memberName
|
||
|
|
||
|
func (m *memberNames) Len() int { return len(*m) }
|
||
|
func (m *memberNames) Less(i, j int) bool { return lessUTF16((*m)[i].name, (*m)[j].name) }
|
||
|
func (m *memberNames) Swap(i, j int) { (*m)[i], (*m)[j] = (*m)[j], (*m)[i] }
|
||
|
|
||
|
// reorderObjects recursively reorders all object members in place
|
||
|
// according to the ordering specified in RFC 8785, section 3.2.3.
|
||
|
//
|
||
|
// Pre-conditions:
|
||
|
// - The value is valid (i.e., no decoder errors should ever occur).
|
||
|
// - The value is compact (i.e., no whitespace is present).
|
||
|
// - Initial call is provided a Decoder reading from the start of v.
|
||
|
//
|
||
|
// Post-conditions:
|
||
|
// - Exactly one JSON value is read from the Decoder.
|
||
|
// - All fully-parsed JSON objects are reordered by directly moving
|
||
|
// the members in the value buffer.
|
||
|
//
|
||
|
// The runtime is approximately O(n·log(n)) + O(m·log(m)),
|
||
|
// where n is len(v) and m is the total number of object members.
|
||
|
func reorderObjects(d *Decoder, scratch *[]byte) {
|
||
|
switch tok, _ := d.ReadToken(); tok.Kind() {
|
||
|
case '{':
|
||
|
// Iterate and collect the name and offsets for every object member.
|
||
|
members := getMemberNames()
|
||
|
defer putMemberNames(members)
|
||
|
var prevName []byte
|
||
|
isSorted := true
|
||
|
|
||
|
beforeBody := d.InputOffset() // offset after '{'
|
||
|
for d.PeekKind() != '}' {
|
||
|
beforeName := d.InputOffset()
|
||
|
var flags valueFlags
|
||
|
name, _ := d.readValue(&flags)
|
||
|
name = unescapeStringMayCopy(name, flags.isVerbatim())
|
||
|
reorderObjects(d, scratch)
|
||
|
afterValue := d.InputOffset()
|
||
|
|
||
|
if isSorted && len(*members) > 0 {
|
||
|
isSorted = lessUTF16(prevName, []byte(name))
|
||
|
}
|
||
|
*members = append(*members, memberName{name, beforeName, afterValue})
|
||
|
prevName = name
|
||
|
}
|
||
|
afterBody := d.InputOffset() // offset before '}'
|
||
|
d.ReadToken()
|
||
|
|
||
|
// Sort the members; return early if it's already sorted.
|
||
|
if isSorted {
|
||
|
return
|
||
|
}
|
||
|
// TODO(https://go.dev/issue/47619): Use slices.Sort.
|
||
|
sort.Sort(members)
|
||
|
|
||
|
// Append the reordered members to a new buffer,
|
||
|
// then copy the reordered members back over the original members.
|
||
|
// Avoid swapping in place since each member may be a different size
|
||
|
// where moving a member over a smaller member may corrupt the data
|
||
|
// for subsequent members before they have been moved.
|
||
|
//
|
||
|
// The following invariant must hold:
|
||
|
// sum([m.after-m.before for m in members]) == afterBody-beforeBody
|
||
|
sorted := (*scratch)[:0]
|
||
|
for i, member := range *members {
|
||
|
if d.buf[member.before] == ',' {
|
||
|
member.before++ // trim leading comma
|
||
|
}
|
||
|
sorted = append(sorted, d.buf[member.before:member.after]...)
|
||
|
if i < len(*members)-1 {
|
||
|
sorted = append(sorted, ',') // append trailing comma
|
||
|
}
|
||
|
}
|
||
|
if int(afterBody-beforeBody) != len(sorted) {
|
||
|
panic("BUG: length invariant violated")
|
||
|
}
|
||
|
copy(d.buf[beforeBody:afterBody], sorted)
|
||
|
|
||
|
// Update scratch buffer to the largest amount ever used.
|
||
|
if len(sorted) > len(*scratch) {
|
||
|
*scratch = sorted
|
||
|
}
|
||
|
case '[':
|
||
|
for d.PeekKind() != ']' {
|
||
|
reorderObjects(d, scratch)
|
||
|
}
|
||
|
d.ReadToken()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// lessUTF16 reports whether x is lexicographically less than y according
// to the UTF-16 codepoints of the UTF-8 encoded input strings.
// This implements the ordering specified in RFC 8785, section 3.2.3.
// The inputs must be valid UTF-8, otherwise this may panic.
func lessUTF16[Bytes []byte | string](x, y Bytes) bool {
	// NOTE: This is an optimized, allocation-free implementation
	// of lessUTF16Simple in fuzz_test.go. FuzzLessUTF16 verifies that the
	// two implementations agree on the result of comparing any two strings.

	// isUTF16Self reports whether r is outside the surrogate range and
	// no greater than U+FFFF, in which case it is compared as is.
	isUTF16Self := func(r rune) bool {
		return ('\u0000' <= r && r <= '\uD7FF') || ('\uE000' <= r && r <= '\uFFFF')
	}

	var invalidUTF8 bool
	x0, y0 := x, y
	for {
		if len(x) == 0 || len(y) == 0 {
			// If invalid bytes were decoded along the way and both inputs
			// ended together, fall back to comparing the original bytes.
			if len(x) == len(y) && invalidUTF8 {
				return string(x0) < string(y0)
			}
			return len(x) < len(y)
		}

		// ASCII fast-path.
		if x[0] < utf8.RuneSelf || y[0] < utf8.RuneSelf {
			if x[0] != y[0] {
				return x[0] < y[0]
			}
			x, y = x[1:], y[1:]
			continue
		}

		// Decode next pair of runes as UTF-8.
		// TODO(https://go.dev/issue/56948): Use a generic implementation
		// of utf8.DecodeRune, or rely on a compiler optimization to statically
		// hide the cost of a type switch (https://go.dev/issue/57072).
		var rx, ry rune
		var nx, ny int
		switch any(x).(type) {
		case string:
			rx, nx = utf8.DecodeRuneInString(string(x))
			ry, ny = utf8.DecodeRuneInString(string(y))
		case []byte:
			rx, nx = utf8.DecodeRune([]byte(x))
			ry, ny = utf8.DecodeRune([]byte(y))
		}

		selfx := isUTF16Self(rx)
		selfy := isUTF16Self(ry)
		switch {
		// The x rune is a single UTF-16 codepoint, while
		// the y rune is a surrogate pair of UTF-16 codepoints.
		case selfx && !selfy:
			ry, _ = utf16.EncodeRune(ry)
		// The y rune is a single UTF-16 codepoint, while
		// the x rune is a surrogate pair of UTF-16 codepoints.
		case selfy && !selfx:
			rx, _ = utf16.EncodeRune(rx)
		}
		if rx != ry {
			return rx < ry
		}
		// A decoded width of 1 together with utf8.RuneError marks a byte
		// that could not be decoded, as opposed to an encoded U+FFFD.
		invalidUTF8 = invalidUTF8 || (rx == utf8.RuneError && nx == 1) || (ry == utf8.RuneError && ny == 1)
		x, y = x[nx:], y[ny:]
	}
}
|