mirror of
https://github.com/rocky-linux/peridot.git
synced 2024-12-25 11:58:30 +00:00
200 lines
5.5 KiB
Go
200 lines
5.5 KiB
Go
package characters
|
|
|
|
import (
|
|
"unicode/utf8"
|
|
)
|
|
|
|
type utf8Err struct {
|
|
Index int
|
|
Size int
|
|
}
|
|
|
|
func (u utf8Err) Zero() bool {
|
|
return u.Size == 0
|
|
}
|
|
|
|
// Verified that a given string is only made of valid UTF-8 characters allowed
|
|
// by the TOML spec:
|
|
//
|
|
// Any Unicode character may be used except those that must be escaped:
|
|
// quotation mark, backslash, and the control characters other than tab (U+0000
|
|
// to U+0008, U+000A to U+001F, U+007F).
|
|
//
|
|
// It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
|
|
// when a character is not allowed.
|
|
//
|
|
// The returned utf8Err is Zero() if the string is valid, or contains the byte
|
|
// index and size of the invalid character.
|
|
//
|
|
// quotation mark => already checked
|
|
// backslash => already checked
|
|
// 0-0x8 => invalid
|
|
// 0x9 => tab, ok
|
|
// 0xA - 0x1F => invalid
|
|
// 0x7F => invalid
|
|
func Utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
|
|
// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
|
|
offset := 0
|
|
for len(p) >= 8 {
|
|
// Combining two 32 bit loads allows the same code to be used
|
|
// for 32 and 64 bit platforms.
|
|
// The compiler can generate a 32bit load for first32 and second32
|
|
// on many platforms. See test/codegen/memcombine.go.
|
|
first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
|
|
second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
|
|
if (first32|second32)&0x80808080 != 0 {
|
|
// Found a non ASCII byte (>= RuneSelf).
|
|
break
|
|
}
|
|
|
|
for i, b := range p[:8] {
|
|
if InvalidAscii(b) {
|
|
err.Index = offset + i
|
|
err.Size = 1
|
|
return
|
|
}
|
|
}
|
|
|
|
p = p[8:]
|
|
offset += 8
|
|
}
|
|
n := len(p)
|
|
for i := 0; i < n; {
|
|
pi := p[i]
|
|
if pi < utf8.RuneSelf {
|
|
if InvalidAscii(pi) {
|
|
err.Index = offset + i
|
|
err.Size = 1
|
|
return
|
|
}
|
|
i++
|
|
continue
|
|
}
|
|
x := first[pi]
|
|
if x == xx {
|
|
// Illegal starter byte.
|
|
err.Index = offset + i
|
|
err.Size = 1
|
|
return
|
|
}
|
|
size := int(x & 7)
|
|
if i+size > n {
|
|
// Short or invalid.
|
|
err.Index = offset + i
|
|
err.Size = n - i
|
|
return
|
|
}
|
|
accept := acceptRanges[x>>4]
|
|
if c := p[i+1]; c < accept.lo || accept.hi < c {
|
|
err.Index = offset + i
|
|
err.Size = 2
|
|
return
|
|
} else if size == 2 {
|
|
} else if c := p[i+2]; c < locb || hicb < c {
|
|
err.Index = offset + i
|
|
err.Size = 3
|
|
return
|
|
} else if size == 3 {
|
|
} else if c := p[i+3]; c < locb || hicb < c {
|
|
err.Index = offset + i
|
|
err.Size = 4
|
|
return
|
|
}
|
|
i += size
|
|
}
|
|
return
|
|
}
|
|
|
|
// Return the size of the next rune if valid, 0 otherwise.
|
|
func Utf8ValidNext(p []byte) int {
|
|
c := p[0]
|
|
|
|
if c < utf8.RuneSelf {
|
|
if InvalidAscii(c) {
|
|
return 0
|
|
}
|
|
return 1
|
|
}
|
|
|
|
x := first[c]
|
|
if x == xx {
|
|
// Illegal starter byte.
|
|
return 0
|
|
}
|
|
size := int(x & 7)
|
|
if size > len(p) {
|
|
// Short or invalid.
|
|
return 0
|
|
}
|
|
accept := acceptRanges[x>>4]
|
|
if c := p[1]; c < accept.lo || accept.hi < c {
|
|
return 0
|
|
} else if size == 2 {
|
|
} else if c := p[2]; c < locb || hicb < c {
|
|
return 0
|
|
} else if size == 3 {
|
|
} else if c := p[3]; c < locb || hicb < c {
|
|
return 0
|
|
}
|
|
|
|
return size
|
|
}
|
|
|
|
// acceptRange gives the range of valid values for the second byte in a UTF-8
|
|
// sequence.
|
|
type acceptRange struct {
|
|
lo uint8 // lowest value for second byte.
|
|
hi uint8 // highest value for second byte.
|
|
}
|
|
|
|
// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
|
|
var acceptRanges = [16]acceptRange{
|
|
0: {locb, hicb},
|
|
1: {0xA0, hicb},
|
|
2: {locb, 0x9F},
|
|
3: {0x90, hicb},
|
|
4: {locb, 0x8F},
|
|
}
|
|
|
|
// first is information about the first byte in a UTF-8 sequence.
|
|
var first = [256]uint8{
|
|
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
|
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
|
|
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
|
|
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
|
|
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
|
|
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
|
|
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
|
|
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
|
|
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
|
|
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
|
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
|
|
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
|
|
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
|
|
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
|
|
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
|
|
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
|
|
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
|
|
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
|
|
}
|
|
|
|
const (
|
|
// The default lowest and highest continuation byte.
|
|
locb = 0b10000000
|
|
hicb = 0b10111111
|
|
|
|
// These names of these constants are chosen to give nice alignment in the
|
|
// table below. The first nibble is an index into acceptRanges or F for
|
|
// special one-byte cases. The second nibble is the Rune length or the
|
|
// Status for the special one-byte case.
|
|
xx = 0xF1 // invalid: size 1
|
|
as = 0xF0 // ASCII: size 1
|
|
s1 = 0x02 // accept 0, size 2
|
|
s2 = 0x13 // accept 1, size 3
|
|
s3 = 0x03 // accept 0, size 3
|
|
s4 = 0x23 // accept 2, size 3
|
|
s5 = 0x34 // accept 3, size 4
|
|
s6 = 0x04 // accept 0, size 4
|
|
s7 = 0x44 // accept 4, size 4
|
|
)
|