peridot/vendor/github.com/gocolly/colly/v2/storage/storage.go

129 lines
3.4 KiB
Go

// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package storage
import (
"net/http"
"net/http/cookiejar"
"net/url"
"strings"
"sync"
)
// Storage is the interface through which a Collector keeps its internal
// data, such as visited request IDs and cookies.
// The default Storage of the Collector is the InMemoryStorage.
// A Collector's storage can be replaced by calling the
// Collector.SetStorage() function.
type Storage interface {
// Init initializes the storage. A non-nil error reports that
// initialization failed.
Init() error
// Visited receives and stores a request ID that is visited by the Collector.
Visited(requestID uint64) error
// IsVisited returns true if the request was visited before IsVisited
// is called.
IsVisited(requestID uint64) (bool, error)
// Cookies retrieves the stored cookies for the given URL, serialized
// into a single string.
Cookies(u *url.URL) string
// SetCookies stores the cookies, given in serialized string form,
// for the given URL.
SetCookies(u *url.URL, cookies string)
}
// InMemoryStorage is the default storage backend of colly.
// InMemoryStorage keeps cookies and visited URLs in memory
// without persisting data on the disk.
// All fields start out nil in the zero value; Init fills in whichever of
// them have not been set yet.
type InMemoryStorage struct {
// visitedURLs is keyed by request ID; Visited sets an entry to true
// and IsVisited reads it back.
visitedURLs map[uint64]bool
// lock is held by Visited (write lock) and IsVisited (read lock)
// while they access visitedURLs.
lock *sync.RWMutex
// jar is the cookie jar read by Cookies and written by SetCookies.
jar *cookiejar.Jar
}
// Init prepares the InMemoryStorage for use by allocating every internal
// field that is still nil: the visited-ID map, the lock and the cookie jar.
// Fields that are already set are left untouched, so calling Init again
// keeps existing data. The returned error is the one reported by
// cookiejar.New when a new jar has to be created; otherwise it is nil.
func (s *InMemoryStorage) Init() error {
	if s.visitedURLs == nil {
		s.visitedURLs = make(map[uint64]bool)
	}
	if s.lock == nil {
		s.lock = new(sync.RWMutex)
	}
	// Nothing left to do when a jar is already present.
	if s.jar != nil {
		return nil
	}
	jar, err := cookiejar.New(nil)
	s.jar = jar
	return err
}
// Visited implements Storage.Visited(). It marks requestID as visited in
// the in-memory map while holding the write lock, and always returns nil.
func (s *InMemoryStorage) Visited(requestID uint64) error {
	s.lock.Lock()
	defer s.lock.Unlock()

	s.visitedURLs[requestID] = true
	return nil
}
// IsVisited implements Storage.IsVisited(). It looks requestID up in the
// in-memory map while holding the read lock and reports whether it was
// recorded as visited. The error is always nil.
func (s *InMemoryStorage) IsVisited(requestID uint64) (bool, error) {
	s.lock.RLock()
	defer s.lock.RUnlock()

	return s.visitedURLs[requestID], nil
}
// Cookies implements Storage.Cookies(). It asks the cookie jar for the
// cookies that apply to u and returns them serialized by StringifyCookies.
func (s *InMemoryStorage) Cookies(u *url.URL) string {
	stored := s.jar.Cookies(u)
	return StringifyCookies(stored)
}
// SetCookies implements Storage.SetCookies(). The serialized cookies string
// is decoded with UnstringifyCookies and the resulting cookies are handed to
// the cookie jar for u.
func (s *InMemoryStorage) SetCookies(u *url.URL, cookies string) {
	parsed := UnstringifyCookies(cookies)
	s.jar.SetCookies(u, parsed)
}
// Close does nothing for InMemoryStorage and always returns nil.
// NOTE(review): the Storage interface declared in this file has no Close
// method; presumably this exists for callers that close storage backends
// generically — confirm against the callers.
func (s *InMemoryStorage) Close() error {
return nil
}
// StringifyCookies serializes a list of http.Cookies into a single string.
// Each cookie is rendered with its String method and the results are
// separated by a newline; an empty list yields the empty string.
func StringifyCookies(cookies []*http.Cookie) string {
	var out strings.Builder
	for idx, cookie := range cookies {
		// Separator goes between entries only, never before the first one.
		if idx > 0 {
			out.WriteString("\n")
		}
		out.WriteString(cookie.String())
	}
	return out.String()
}
// UnstringifyCookies deserializes a cookie string into http.Cookies.
// The input is split on newlines, every line is treated as the value of a
// Set-Cookie header, and the cookies are parsed by http.Response.Cookies.
func UnstringifyCookies(s string) []*http.Cookie {
	header := http.Header{
		"Set-Cookie": strings.Split(s, "\n"),
	}
	resp := &http.Response{Header: header}
	return resp.Cookies()
}
// ContainsCookie reports whether any cookie in cookies has the given name.
// The comparison on the Name field is exact; an empty or nil list yields
// false.
func ContainsCookie(cookies []*http.Cookie, name string) bool {
	for i := range cookies {
		if cookies[i].Name != name {
			continue
		}
		return true
	}
	return false
}