peridot/vendor/github.com/temoto/robotstxt/robotstxt.go

// Package robotstxt implements the robots.txt Exclusion Protocol
// as specified in http://www.robotstxt.org/wc/robots.html
// with various extensions.
package robotstxt

// Comments explaining the logic are taken from Google's spec:
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt

import (
	"bytes"
	"errors"
	"io/ioutil"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"time"
)

// RobotsData is the parsed representation of a robots.txt document, or one of
// the fixed allow-all / disallow-all policies this package returns for empty
// documents and for certain HTTP status codes.
type RobotsData struct {
	// private: only the unexported fields below; Host and Sitemaps are exported.
	groups      map[string]*Group // rule groups keyed by user-agent name; "*" is the wildcard group
	allowAll    bool              // when true, TestAgent reports every path as allowed
	disallowAll bool              // when true (and allowAll is false), TestAgent reports every path as disallowed
	Host        string            // host value produced by the parser; NOTE(review): presumably the Host directive — confirm in the parser
	Sitemaps    []string          // sitemap values produced by the parser; NOTE(review): presumably Sitemap directives — confirm in the parser
}

// Group is the set of rules that FindGroup selects for one user-agent.
type Group struct {
	rules      []*rule       // allow/disallow rules searched by findRule
	Agent      string        // NOTE(review): presumably the user-agent name of this group; it is set outside this file — confirm
	CrawlDelay time.Duration // NOTE(review): presumably the Crawl-delay directive; it is set outside this file — confirm
}

// rule is a single allow or disallow entry inside a Group.
type rule struct {
	path    string         // literal path prefix, compared with strings.HasPrefix when pattern is nil
	allow   bool           // verdict returned by Group.Test when this rule is the best match
	pattern *regexp.Regexp // when non-nil, matched against the request path instead of the literal prefix
}

// ParseError aggregates the errors reported while parsing one robots.txt
// document.
type ParseError struct {
	// Errs holds the individual errors; Error reports them in slice order.
	Errs []error
}

// newParseError wraps errs in a freshly allocated ParseError.
func newParseError(errs []error) *ParseError {
	return &ParseError{Errs: errs}
}

// Error implements the error interface. The message is a header line followed
// by the text of every collected error, each terminated by a newline.
func (e ParseError) Error() string {
	msg := bytes.NewBufferString("Parse error(s): \n")
	for _, cause := range e.Errs {
		msg.WriteString(cause.Error())
		msg.WriteByte('\n')
	}
	return msg.String()
}

// Shared sentinel values handed out by the constructors and by FindGroup.
var (
	// allowAll is the policy that permits every path for every agent.
	allowAll = &RobotsData{allowAll: true}
	// disallowAll is the policy that forbids every path for every agent.
	disallowAll = &RobotsData{disallowAll: true}
	// emptyGroup has no rules; FindGroup returns it when no group matches.
	emptyGroup = &Group{}
)

// FromStatusAndBytes picks a crawling policy from the HTTP status of a
// robots.txt fetch and, for successful fetches, from the response body.
//
//   - 2xx: body is parsed with FromBytes.
//   - 4xx: full allow.
//   - 5xx: full disallow.
//   - anything else: nil data and an "Unexpected status" error.
func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) {
	// Integer division collapses a status code to its class (2 for 200-299,
	// 4 for 400-499, 5 for 500-599).
	switch statusCode / 100 {
	case 2:
		return FromBytes(body)

	case 4:
		// From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
		//
		// Google treats all 4xx errors in the same way and assumes that no
		// valid robots.txt file exists. It is assumed that there are no
		// restrictions. This is a "full allow" for crawling. Note: this
		// includes 401 "Unauthorized" and 403 "Forbidden" HTTP result codes.
		return allowAll, nil

	case 5:
		// From Google's spec:
		// Server errors (5xx) are seen as temporary errors that result in a
		// "full disallow" of crawling.
		return disallowAll, nil

	default:
		return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode))
	}
}

// FromStatusAndString is the string-bodied counterpart of FromStatusAndBytes:
// it converts body to bytes and delegates.
func FromStatusAndString(statusCode int, body string) (*RobotsData, error) {
	raw := []byte(body)
	return FromStatusAndBytes(statusCode, raw)
}

// FromResponse reads the whole body of res and passes it, together with the
// response status code, to FromStatusAndBytes.
//
// A nil res yields (nil, nil). A response whose Body is nil (possible for
// responses constructed by hand rather than returned by an http.Client) is
// treated as having an empty body instead of panicking. A read error is
// returned as-is with nil data.
//
// FromResponse never closes res.Body; closing it remains the caller's job.
func FromResponse(res *http.Response) (*RobotsData, error) {
	if res == nil {
		// Edge case, if res is nil, return nil data
		return nil, nil
	}

	var buf []byte
	if res.Body != nil {
		var err error
		if buf, err = ioutil.ReadAll(res.Body); err != nil {
			return nil, err
		}
	}
	return FromStatusAndBytes(res.StatusCode, buf)
}

// FromBytes parses the contents of a robots.txt document.
//
// A body that is empty, contains only whitespace, or produces no tokens
// results in the shared allow-all policy. If the parser reports any errors,
// FromBytes returns nil data and a *ParseError wrapping all of them.
func FromBytes(body []byte) (r *RobotsData, err error) {
	// Nothing but whitespace: no restrictions at all.
	if len(bytes.TrimSpace(body)) == 0 {
		return allowAll, nil
	}

	scanner := newByteScanner("bytes", true)
	scanner.feed(body, true)
	tokens := scanner.scanAll()

	// No tokens means there is nothing to parse, hence no restrictions.
	if len(tokens) == 0 {
		return allowAll, nil
	}

	groups, host, sitemaps, errs := newParser(tokens).parseAll()
	if len(errs) != 0 {
		return nil, newParseError(errs)
	}

	return &RobotsData{groups: groups, Host: host, Sitemaps: sitemaps}, nil
}

// FromString is the string-bodied counterpart of FromBytes: it converts body
// to bytes and delegates.
func FromString(body string) (r *RobotsData, err error) {
	raw := []byte(body)
	return FromBytes(raw)
}

// TestAgent reports whether the given user-agent may fetch path.
//
// The allow-all and disallow-all policies answer immediately (allow-all is
// checked first). Otherwise the group selected by FindGroup for agent decides.
func (r *RobotsData) TestAgent(path, agent string) bool {
	switch {
	case r.allowAll:
		return true
	case r.disallowAll:
		return false
	}

	// Find a group of rules that applies to this agent.
	// From Google's spec:
	// The user-agent is non-case-sensitive.
	return r.FindGroup(agent).Test(path)
}

// FindGroup returns the group of declarations that applies to the given
// user-agent.
//
// From Google's spec:
// Only one group of group-member records is valid for a particular crawler.
// The crawler must determine the correct group of records by finding the group
// with the most specific user-agent that still matches. All other groups of
// records are ignored by the crawler. The user-agent is non-case-sensitive.
// The order of the groups within the robots.txt file is irrelevant.
//
// agent is lower-cased and compared against the group names by prefix; the
// longest matching name wins. The "*" group is used only when no named group
// matches, and a rule-less empty group is returned when there is no "*" group
// either. The result is never nil.
//
// NOTE(review): matching assumes the group names were stored lower-cased by
// the parser — confirm there.
func (r *RobotsData) FindGroup(agent string) (ret *Group) {
	agent = strings.ToLower(agent)

	// The wildcard group is a pure fallback with match length zero, so any
	// named group that matches — even one with a single-character name —
	// takes precedence over it.
	ret = r.groups["*"]
	bestLen := 0

	for name, group := range r.groups {
		if name == "*" || !strings.HasPrefix(agent, name) {
			continue
		}
		if n := len(name); n > bestLen {
			bestLen = n
			ret = group
		}
	}

	if ret == nil {
		return emptyGroup
	}
	return ret
}

// Test reports whether path may be crawled under this group's rules. The
// verdict of the best matching rule is returned; with no matching rule the
// path is allowed.
func (g *Group) Test(path string) bool {
	matched := g.findRule(path)
	if matched == nil {
		// From Google's spec:
		// By default, there are no restrictions for crawling for the designated crawlers.
		return true
	}
	return matched.allow
}

// findRule returns the rule of this group that best matches path, or nil when
// no rule matches.
//
// From Google's spec:
// The path value is used as a basis to determine whether or not a rule applies
// to a specific URL on a site. With the exception of wildcards, the path is
// used to match the beginning of a URL (and any valid URLs that start with the
// same path).
//
// At a group-member level, in particular for allow and disallow directives,
// the most specific rule based on the length of the [path] entry will trump
// the less specific (shorter) rule. The order of precedence for rules with
// wildcards is undefined.
//
// Rules are visited in slice order and a later rule replaces the current best
// only when its length is strictly greater, so among equally long matches the
// earliest rule wins.
func (g *Group) findRule(path string) (ret *rule) {
	bestLen := 0

	for _, candidate := range g.rules {
		switch {
		case candidate.pattern != nil:
			if !candidate.pattern.MatchString(path) {
				continue
			}
			// A pattern match is ranked by the length of the pattern source.
			// From Google's spec:
			// The order of precedence for rules with wildcards is undefined.
			if n := len(candidate.pattern.String()); n > bestLen {
				bestLen, ret = n, candidate
			}

		case candidate.path == "/" && bestLen == 0:
			// The root rule is the weakest possible match and is accepted
			// only while nothing else has matched yet.
			bestLen, ret = 1, candidate

		case strings.HasPrefix(path, candidate.path):
			if n := len(candidate.path); n > bestLen {
				bestLen, ret = n, candidate
			}
		}
	}

	return ret
}
Initial commit 2022-07-07 20:11:50 +00:00			`// Package robotstxt implements the robots.txt Exclusion Protocol`
			`// as specified in http://www.robotstxt.org/wc/robots.html`
			`// with various extensions.`
			`package robotstxt`

			`// Comments explaining the logic are taken from either the Google's spec:`
			`// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt`

			`import (`
			`"bytes"`
			`"errors"`
			`"io/ioutil"`
			`"net/http"`
			`"regexp"`
			`"strconv"`
			`"strings"`
			`"time"`
			`)`

			`type RobotsData struct {`
			`// private`
			`groups map[string]*Group`
			`allowAll bool`
			`disallowAll bool`
			`Host string`
			`Sitemaps []string`
			`}`

			`type Group struct {`
			`rules []*rule`
			`Agent string`
			`CrawlDelay time.Duration`
			`}`

			`type rule struct {`
			`path string`
			`allow bool`
			`pattern *regexp.Regexp`
			`}`

			`type ParseError struct {`
			`Errs []error`
			`}`

			`func newParseError(errs []error) *ParseError {`
			`return &ParseError{errs}`
			`}`

			`func (e ParseError) Error() string {`
			`var b bytes.Buffer`

			`b.WriteString("Parse error(s): " + "\n")`
			`for _, er := range e.Errs {`
			`b.WriteString(er.Error() + "\n")`
			`}`
			`return b.String()`
			`}`

			`var allowAll = &RobotsData{allowAll: true}`
			`var disallowAll = &RobotsData{disallowAll: true}`
			`var emptyGroup = &Group{}`

			`func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) {`
			`switch {`
			`case statusCode >= 200 && statusCode < 300:`
			`return FromBytes(body)`

			`// From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt`
			`//`
			`// Google treats all 4xx errors in the same way and assumes that no valid`
			`// robots.txt file exists. It is assumed that there are no restrictions.`
			`// This is a "full allow" for crawling. Note: this includes 401`
			`// "Unauthorized" and 403 "Forbidden" HTTP result codes.`
			`case statusCode >= 400 && statusCode < 500:`
			`return allowAll, nil`

			`// From Google's spec:`
			`// Server errors (5xx) are seen as temporary errors that result in a "full`
			`// disallow" of crawling.`
			`case statusCode >= 500 && statusCode < 600:`
			`return disallowAll, nil`
			`}`

			`return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode))`
			`}`

			`func FromStatusAndString(statusCode int, body string) (*RobotsData, error) {`
			`return FromStatusAndBytes(statusCode, []byte(body))`
			`}`

			`func FromResponse(res http.Response) (RobotsData, error) {`
			`if res == nil {`
			`// Edge case, if res is nil, return nil data`
			`return nil, nil`
			`}`
			`buf, e := ioutil.ReadAll(res.Body)`
			`if e != nil {`
			`return nil, e`
			`}`
			`return FromStatusAndBytes(res.StatusCode, buf)`
			`}`

			`func FromBytes(body []byte) (r *RobotsData, err error) {`
			`var errs []error`

			`// special case (probably not worth optimization?)`
			`trimmed := bytes.TrimSpace(body)`
			`if len(trimmed) == 0 {`
			`return allowAll, nil`
			`}`

			`sc := newByteScanner("bytes", true)`
			`//sc.Quiet = !print_errors`
			`sc.feed(body, true)`
			`tokens := sc.scanAll()`

			`// special case worth optimization`
			`if len(tokens) == 0 {`
			`return allowAll, nil`
			`}`

			`r = &RobotsData{}`
			`parser := newParser(tokens)`
			`r.groups, r.Host, r.Sitemaps, errs = parser.parseAll()`
			`if len(errs) > 0 {`
			`return nil, newParseError(errs)`
			`}`

			`return r, nil`
			`}`

			`func FromString(body string) (r *RobotsData, err error) {`
			`return FromBytes([]byte(body))`
			`}`

			`func (r *RobotsData) TestAgent(path, agent string) bool {`
			`if r.allowAll {`
			`return true`
			`}`
			`if r.disallowAll {`
			`return false`
			`}`

			`// Find a group of rules that applies to this agent`
			`// From Google's spec:`
			`// The user-agent is non-case-sensitive.`
			`g := r.FindGroup(agent)`
			`return g.Test(path)`
			`}`

			`// FindGroup searches block of declarations for specified user-agent.`
			`// From Google's spec:`
			`// Only one group of group-member records is valid for a particular crawler.`
			`// The crawler must determine the correct group of records by finding the group`
			`// with the most specific user-agent that still matches. All other groups of`
			`// records are ignored by the crawler. The user-agent is non-case-sensitive.`
			`// The order of the groups within the robots.txt file is irrelevant.`
			`func (r RobotsData) FindGroup(agent string) (ret Group) {`
			`var prefixLen int`

			`agent = strings.ToLower(agent)`
			`if ret = r.groups["*"]; ret != nil {`
			`// Weakest match possible`
			`prefixLen = 1`
			`}`
			`for a, g := range r.groups {`
			`if a != "*" && strings.HasPrefix(agent, a) {`
			`if l := len(a); l > prefixLen {`
			`prefixLen = l`
			`ret = g`
			`}`
			`}`
			`}`

			`if ret == nil {`
			`return emptyGroup`
			`}`
			`return`
			`}`

			`func (g *Group) Test(path string) bool {`
			`if r := g.findRule(path); r != nil {`
			`return r.allow`
			`}`

			`// From Google's spec:`
			`// By default, there are no restrictions for crawling for the designated crawlers.`
			`return true`
			`}`

			`// From Google's spec:`
			`// The path value is used as a basis to determine whether or not a rule applies`
			`// to a specific URL on a site. With the exception of wildcards, the path is`
			`// used to match the beginning of a URL (and any valid URLs that start with the`
			`// same path).`
			`//`
			`// At a group-member level, in particular for allow and disallow directives,`
			`// the most specific rule based on the length of the [path] entry will trump`
			`// the less specific (shorter) rule. The order of precedence for rules with`
			`// wildcards is undefined.`
			`func (g Group) findRule(path string) (ret rule) {`
			`var prefixLen int`

			`for _, r := range g.rules {`
			`if r.pattern != nil {`
			`if r.pattern.MatchString(path) {`
			`// Consider this a match equal to the length of the pattern.`
			`// From Google's spec:`
			`// The order of precedence for rules with wildcards is undefined.`
			`if l := len(r.pattern.String()); l > prefixLen {`
			`prefixLen = l`
			`ret = r`
			`}`
			`}`
			`} else if r.path == "/" && prefixLen == 0 {`
			`// Weakest match possible`
			`prefixLen = 1`
			`ret = r`
			`} else if strings.HasPrefix(path, r.path) {`
			`if l := len(r.path); l > prefixLen {`
			`prefixLen = l`
			`ret = r`
			`}`
			`}`
			`}`
			`return`
			`}`