// Mirror of https://github.com/rocky-linux/peridot.git
// (synced 2024-11-14 01:41:23 +00:00; 116 lines, 3.2 KiB, Go).
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
package colly
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"mime"
|
|
"net/http"
|
|
"strings"
|
|
|
|
"github.com/saintfish/chardet"
|
|
"golang.org/x/net/html/charset"
|
|
)
|
|
|
|
// Response is the representation of a HTTP response made by a Collector
|
|
type Response struct {
|
|
// StatusCode is the status code of the Response
|
|
StatusCode int
|
|
// Body is the content of the Response
|
|
Body []byte
|
|
// Ctx is a context between a Request and a Response
|
|
Ctx *Context
|
|
// Request is the Request object of the response
|
|
Request *Request
|
|
// Headers contains the Response's HTTP headers
|
|
Headers *http.Header
|
|
// Trace contains the HTTPTrace for the request. Will only be set by the
|
|
// collector if Collector.TraceHTTP is set to true.
|
|
Trace *HTTPTrace
|
|
}
|
|
|
|
// Save writes response body to disk
|
|
func (r *Response) Save(fileName string) error {
|
|
return ioutil.WriteFile(fileName, r.Body, 0644)
|
|
}
|
|
|
|
// FileName returns the sanitized file name parsed from "Content-Disposition"
|
|
// header or from URL
|
|
func (r *Response) FileName() string {
|
|
_, params, err := mime.ParseMediaType(r.Headers.Get("Content-Disposition"))
|
|
if fName, ok := params["filename"]; ok && err == nil {
|
|
return SanitizeFileName(fName)
|
|
}
|
|
if r.Request.URL.RawQuery != "" {
|
|
return SanitizeFileName(fmt.Sprintf("%s_%s", r.Request.URL.Path, r.Request.URL.RawQuery))
|
|
}
|
|
return SanitizeFileName(strings.TrimPrefix(r.Request.URL.Path, "/"))
|
|
}
|
|
|
|
func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error {
|
|
if len(r.Body) == 0 {
|
|
return nil
|
|
}
|
|
if defaultEncoding != "" {
|
|
tmpBody, err := encodeBytes(r.Body, "text/plain; charset="+defaultEncoding)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
r.Body = tmpBody
|
|
return nil
|
|
}
|
|
contentType := strings.ToLower(r.Headers.Get("Content-Type"))
|
|
|
|
if strings.Contains(contentType, "image/") ||
|
|
strings.Contains(contentType, "video/") ||
|
|
strings.Contains(contentType, "audio/") ||
|
|
strings.Contains(contentType, "font/") {
|
|
// These MIME types should not have textual data.
|
|
|
|
return nil
|
|
}
|
|
|
|
if !strings.Contains(contentType, "charset") {
|
|
if !detectCharset {
|
|
return nil
|
|
}
|
|
d := chardet.NewTextDetector()
|
|
r, err := d.DetectBest(r.Body)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
contentType = "text/plain; charset=" + r.Charset
|
|
}
|
|
if strings.Contains(contentType, "utf-8") || strings.Contains(contentType, "utf8") {
|
|
return nil
|
|
}
|
|
tmpBody, err := encodeBytes(r.Body, contentType)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
r.Body = tmpBody
|
|
return nil
|
|
}
|
|
|
|
func encodeBytes(b []byte, contentType string) ([]byte, error) {
|
|
r, err := charset.NewReader(bytes.NewReader(b), contentType)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return ioutil.ReadAll(r)
|
|
}
|