mirror of
https://github.com/rocky-linux/peridot.git
synced 2024-11-16 10:41:25 +00:00
84 lines
1.6 KiB
Go
84 lines
1.6 KiB
Go
|
package chardet
|
||
|
|
||
|
type recognizer interface {
|
||
|
Match(*recognizerInput) recognizerOutput
|
||
|
}
|
||
|
|
||
|
// recognizerOutput is the value returned by recognizer.Match. It is a
// distinct named type whose underlying type is that of Result.
type recognizerOutput Result
|
||
|
|
||
|
// recognizerInput bundles the data handed to a recognizer's Match method.
// Values are built by newRecognizerInput, which derives every field from the
// caller's raw bytes.
type recognizerInput struct {
	// raw is the caller-supplied byte slice, stored as given (not copied).
	raw []byte
	// input is the first result of mayStripInput: the bytes to analyse,
	// either with markup removed or a plain prefix copy of raw.
	input []byte
	// tagStripped is the second result of mayStripInput and reports whether
	// input is the markup-stripped form.
	tagStripped bool
	// byteStats is the per-byte-value occurrence count of input, as produced
	// by computeByteStats (indexed by byte value).
	byteStats []int
	// hasC1Bytes reports whether byteStats shows any byte in 0x80..0x9F.
	hasC1Bytes bool
}
|
||
|
|
||
|
func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
|
||
|
input, stripped := mayStripInput(raw, stripTag)
|
||
|
byteStats := computeByteStats(input)
|
||
|
return &recognizerInput{
|
||
|
raw: raw,
|
||
|
input: input,
|
||
|
tagStripped: stripped,
|
||
|
byteStats: byteStats,
|
||
|
hasC1Bytes: computeHasC1Bytes(byteStats),
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// mayStripInput returns the bytes that should be analysed for raw, optionally
// with markup removed, together with a flag telling whether the returned
// bytes are the markup-stripped form.
//
// When stripTag is true, every byte from a '<' up to and including the next
// '>' is dropped and the remaining bytes are collected, stopping as soon as
// inputBufferSize bytes have been kept. The stripped text is rejected, and a
// plain prefix of raw is used instead, when any of the following holds:
//
//   - fewer than 5 '<' bytes were seen;
//   - openTags/5 (integer division) is less than the number of "bad" tags,
//     i.e. '<' bytes encountered while already inside markup;
//   - fewer than 100 bytes survived stripping although raw is longer than
//     600 bytes.
//
// When stripTag is false the plain prefix is always used.
//
// The returned slice holds at most inputBufferSize bytes and never aliases
// raw.
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
	const inputBufferSize = 8192

	// Upper bound on the result length in both the stripped and plain cases.
	limit := len(raw)
	if limit > inputBufferSize {
		limit = inputBufferSize
	}

	if stripTag {
		var openTags, badTags int
		inMarkup := false

		// Allocate the scratch buffer only when stripping is requested.
		out = make([]byte, 0, limit)
		for _, c := range raw {
			if c == '<' {
				if inMarkup {
					// A '<' inside an unterminated tag counts as malformed.
					badTags++
				}
				inMarkup = true
				openTags++
			}
			if !inMarkup {
				out = append(out, c)
				if len(out) >= inputBufferSize {
					break
				}
			}
			// Checked after the append so the closing '>' itself is dropped.
			if c == '>' {
				inMarkup = false
			}
		}

		tooFewTags := openTags < 5
		tooManyBadTags := openTags/5 < badTags
		tooLittleText := len(out) < 100 && len(raw) > 600
		if !tooFewTags && !tooManyBadTags && !tooLittleText {
			return out, true
		}
	}

	// Either stripping was not requested or its result looked unreliable:
	// fall back to a copy of the leading bytes of raw.
	out = make([]byte, limit)
	copy(out, raw[:limit])
	return out, false
}
|
||
|
|
||
|
// computeByteStats counts how often each byte value occurs in input.
//
// The result always has 256 entries; entry i holds the number of bytes in
// input equal to i. A nil or empty input yields all zeros.
func computeByteStats(input []byte) []int {
	counts := make([]int, 256)
	for _, b := range input {
		counts[b]++
	}
	return counts
}
|
||
|
|
||
|
// computeHasC1Bytes reports whether byteStats records at least one
// occurrence of a byte value in the range 0x80..0x9F (inclusive).
//
// byteStats is indexed by byte value, as returned by computeByteStats. A
// slice shorter than the full range (including nil) is handled by checking
// only the entries that exist, rather than panicking on the slice bounds.
func computeHasC1Bytes(byteStats []int) bool {
	const (
		c1First = 0x80
		c1Last  = 0x9F
	)

	if len(byteStats) <= c1First {
		return false
	}
	end := c1Last + 1
	if end > len(byteStats) {
		end = len(byteStats)
	}

	for _, count := range byteStats[c1First:end] {
		if count > 0 {
			return true
		}
	}
	return false
}
|