mirror of https://github.com/superseriousbusiness/gotosocial.git
[bug] respect X-Robots-Tag and robots.txt on api/v1/instance and nodeinfo (#3756)

* feat: check X-Robots-Tag when accessing /api/v1/instance or /nodeinfo endpoints
* chore: go fmt ./...
* Check robots.txt as well, add tests

Co-authored-by: tobi <tobi.smethurst@protonmail.com>
parent 2c95fd4115
commit d0de3ad492

20 changed files with 1404 additions and 24 deletions
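The GoToSocial-side change itself is not among the vendored files reproduced below. As a rough illustration of the behaviour the commit message describes, here is a minimal, hypothetical sketch of a client-side check that consults a remote host's robots.txt (parsed with the vendored temoto/robotstxt package) and the X-Robots-Tag response header before using its /api/v1/instance or /nodeinfo data; the helper name, host, and user agent are invented for illustration and this is not the code from this commit.

package main

import (
	"fmt"
	"net/http"
	"strings"

	"github.com/temoto/robotstxt"
)

// crawlPermitted is a hypothetical helper sketching the behaviour the
// commit message describes: honour the remote robots.txt, then honour
// any X-Robots-Tag "noindex" on the resource itself.
func crawlPermitted(host, path, userAgent string) (bool, error) {
	// Fetch and parse robots.txt; per the vendored library, 4xx
	// means "allow all" and 5xx means "disallow all".
	resp, err := http.Get("https://" + host + "/robots.txt")
	if err != nil {
		return false, err
	}
	robots, err := robotstxt.FromResponse(resp)
	resp.Body.Close() // FromResponse does not close the body
	if err != nil {
		return false, err
	}
	if !robots.TestAgent(path, userAgent) {
		return false, nil
	}

	// Fetch the target and respect any X-Robots-Tag header.
	resp2, err := http.Get("https://" + host + path)
	if err != nil {
		return false, err
	}
	defer resp2.Body.Close()
	for _, v := range resp2.Header.Values("X-Robots-Tag") {
		if strings.Contains(strings.ToLower(v), "noindex") {
			return false, nil
		}
	}
	return true, nil
}

func main() {
	ok, err := crawlPermitted("example.org", "/api/v1/instance", "gotosocial")
	fmt.Println(ok, err)
}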
				
			
		
							
								
								
									
vendor/github.com/temoto/robotstxt/.gitignore (generated, vendored, new file, +15 lines)
@@ -0,0 +1,15 @@
*.cgo?.*
*.o
*.so
*.sublime-*
*.zip
.DS_Store
.idea/
.tags*
_cgo_*
_gofuzz/crashers/
_gofuzz/suppressions/
_obj
_test
coverage.txt
robots.txt-check/robots.txt-check
							
								
								
									
vendor/github.com/temoto/robotstxt/.golangci.yml (generated, vendored, new file, +20 lines)
@@ -0,0 +1,20 @@
linters:
  enable:
    - goconst
    - gofmt
    - gosec
    - maligned
    - prealloc
    - staticcheck
  disable:
    - deadcode
    - structcheck
    - varcheck

linters-settings:
  gofmt:
    simplify: true
  govet:
    check-shadowing: true
  maligned:
    suggest-new: true
							
								
								
									
vendor/github.com/temoto/robotstxt/.travis.yml (generated, vendored, new file, +30 lines)
@@ -0,0 +1,30 @@
cache:
  go: true
  directories:
  - $HOME/.cache
  - $HOME/bin
  - $HOME/gopath/pkg/mod
language: go
go:
- 1.11
- 1.12
- 1.13
- 1.14
- 1.x
- master
install: true
script: GO111MODULE=on go test -race

matrix:
  include:
  - go: 1.x
    env: task=coverage
    script: GO111MODULE=on go test -race -covermode=atomic -coverprofile=coverage.txt
    after_success: bash <(curl -s https://codecov.io/bash)
  - go: 1.x
    env: task=bench
    script: GO111MODULE=on ./script/bench
  - go: 1.x
    install: curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | bash -s -- -b $HOME/bin v1.19.1
    env: task=clean
    script: GO111MODULE=on ./script/clean
							
								
								
									
vendor/github.com/temoto/robotstxt/LICENSE (generated, vendored, new file, +21 lines)
@@ -0,0 +1,21 @@
The MIT License

Copyright (c) 2010 Sergey Shepelev <temotor@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
							
								
								
									
vendor/github.com/temoto/robotstxt/README.rst (generated, vendored, new file, +115 lines)
@@ -0,0 +1,115 @@
What
====

This is a robots.txt exclusion protocol implementation for Go language (golang).


Build
=====

To build and run tests run `go test` in source directory.


Contribute
==========

Warm welcome.

* If desired, add your name in README.rst, section Who.
* Run `script/test && script/clean && echo ok`
* You can ignore linter warnings, but everything else must pass.
* Send your change as pull request or just a regular patch to current maintainer (see section Who).

Thank you.


Usage
=====

As usual, no special installation is required, just

    import "github.com/temoto/robotstxt"

run `go get` and you're ready.

1. Parse
^^^^^^^^

First of all, you need to parse robots.txt data. You can do it with
functions `FromBytes(body []byte) (*RobotsData, error)` or same for `string`::

    robots, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow:"))
    robots, err := robotstxt.FromString("User-agent: *\nDisallow:")

As of 2012-10-03, `FromBytes` is the most efficient method, everything else
is a wrapper for this core function.

There are few convenient constructors for various purposes:

* `FromResponse(*http.Response) (*RobotsData, error)` to init robots data
  from HTTP response. It *does not* call `response.Body.Close()`::

    robots, err := robotstxt.FromResponse(resp)
    resp.Body.Close()
    if err != nil {
        log.Println("Error parsing robots.txt:", err.Error())
    }

* `FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error)` or
  `FromStatusAndString` if you prefer to read bytes (string) yourself.
  Passing status code applies following logic in line with Google's interpretation
  of robots.txt files:

    * status 2xx  -> parse body with `FromBytes` and apply rules listed there.
    * status 4xx  -> allow all (even 401/403, as recommended by Google).
    * other (5xx) -> disallow all, consider this a temporary unavailability.

2. Query
^^^^^^^^

Parsing robots.txt content builds a kind of logic database, which you can
query with `(r *RobotsData) TestAgent(url, agent string) (bool)`.

Explicit passing of agent is useful if you want to query for different agents. For
single agent users there is an efficient option: `RobotsData.FindGroup(userAgent string)`
returns a structure with `.Test(path string)` method and `.CrawlDelay time.Duration`.

Simple query with explicit user agent. Each call will scan all rules.

::

    allow := robots.TestAgent("/", "FooBot")

Or query several paths against same user agent for performance.

::

    group := robots.FindGroup("BarBot")
    group.Test("/")
    group.Test("/download.mp3")
    group.Test("/news/article-2012-1")


Who
===

Honorable contributors (in undefined order):

    * Ilya Grigorik (igrigorik)
    * Martin Angers (PuerkitoBio)
    * Micha Gorelick (mynameisfiber)

Initial commit and other: Sergey Shepelev temotor@gmail.com


Flair
=====

.. image:: https://travis-ci.org/temoto/robotstxt.svg?branch=master
    :target: https://travis-ci.org/temoto/robotstxt

.. image:: https://codecov.io/gh/temoto/robotstxt/branch/master/graph/badge.svg
    :target: https://codecov.io/gh/temoto/robotstxt

.. image:: https://goreportcard.com/badge/github.com/temoto/robotstxt
    :target: https://goreportcard.com/report/github.com/temoto/robotstxt
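To make the README's parse-then-query workflow concrete, here is a small self-contained program using only the API documented above (FromStatusAndString, FindGroup, Test, CrawlDelay); the robots.txt content and bot name are invented for illustration.

package main

import (
	"fmt"

	"github.com/temoto/robotstxt"
)

func main() {
	// Status 200: the body is parsed. A 404 would mean "allow all",
	// a 503 "disallow all", per the status logic described above.
	robots, err := robotstxt.FromStatusAndString(200,
		"User-agent: *\nDisallow: /private/\nCrawl-delay: 1.5")
	if err != nil {
		panic(err)
	}

	// One FindGroup call, then cheap Test calls against that group.
	group := robots.FindGroup("FooBot")
	fmt.Println(group.Test("/"))          // true: nothing disallows it
	fmt.Println(group.Test("/private/x")) // false: matches the Disallow prefix
	fmt.Println(group.CrawlDelay)         // 1.5s
}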
							
								
								
									
vendor/github.com/temoto/robotstxt/codecov.yml (generated, vendored, new file, +2 lines)
@@ -0,0 +1,2 @@
codecov:
  token: 6bf9c7eb-69ff-4b74-8464-e2fb452d0f04
							
								
								
									
vendor/github.com/temoto/robotstxt/fuzz.go (generated, vendored, new file, +29 lines)
@@ -0,0 +1,29 @@
// +build gofuzz

package robotstxt

import "testing/quick"

func Fuzz(data []byte) int {
	r, err := FromBytes(data)
	if err != nil {
		if r != nil {
			panic("r != nil on error")
		}
		return 0
	}

	// FindGroup must never return nil
	f1 := func(agent string) bool { return r.FindGroup(agent) != nil }
	if err := quick.Check(f1, nil); err != nil {
		panic(err)
	}

	// just check TestAgent doesn't panic
	f2 := func(path, agent string) bool { r.TestAgent(path, agent); return true }
	if err := quick.Check(f2, nil); err != nil {
		panic(err)
	}

	return 1
}
							
								
								
									
vendor/github.com/temoto/robotstxt/parser.go (generated, vendored, new file, +271 lines)
@@ -0,0 +1,271 @@
package robotstxt

// Comments explaining the logic are taken from either the google's spec:
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
//
// or the Wikipedia's entry on robots.txt:
// http://en.wikipedia.org/wiki/Robots.txt

import (
	"fmt"
	"io"
	"math"
	"regexp"
	"strconv"
	"strings"
	"time"
)

type lineType uint

const (
	lIgnore lineType = iota
	lUnknown
	lUserAgent
	lAllow
	lDisallow
	lCrawlDelay
	lSitemap
	lHost
)

type parser struct {
	tokens []string
	pos    int
}

type lineInfo struct {
	t  lineType       // Type of line key
	k  string         // String representation of the type of key
	vs string         // String value of the key
	vf float64        // Float value of the key
	vr *regexp.Regexp // Regexp value of the key
}

func newParser(tokens []string) *parser {
	return &parser{tokens: tokens}
}

func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) {
	var g *Group
	for _, a := range agents {
		if g = groups[a]; g == nil {
			g = new(Group)
			groups[a] = g
		}
		fun(g)
	}
}

func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) {
	groups = make(map[string]*Group, 16)
	agents := make([]string, 0, 4)
	isEmptyGroup := true

	// Reset internal fields, tokens are assigned at creation time, never change
	p.pos = 0

	for {
		if li, err := p.parseLine(); err != nil {
			if err == io.EOF {
				break
			}
			errs = append(errs, err)
		} else {
			switch li.t {
			case lUserAgent:
				// Two successive user-agent lines are part of the same group.
				if !isEmptyGroup {
					// End previous group
					agents = make([]string, 0, 4)
				}
				if len(agents) == 0 {
					isEmptyGroup = true
				}
				agents = append(agents, li.vs)

			case lDisallow:
				// Error if no current group
				if len(agents) == 0 {
					errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos))
				} else {
					isEmptyGroup = false
					var r *rule
					if li.vr != nil {
						r = &rule{"", false, li.vr}
					} else {
						r = &rule{li.vs, false, nil}
					}
					parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
				}

			case lAllow:
				// Error if no current group
				if len(agents) == 0 {
					errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos))
				} else {
					isEmptyGroup = false
					var r *rule
					if li.vr != nil {
						r = &rule{"", true, li.vr}
					} else {
						r = &rule{li.vs, true, nil}
					}
					parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
				}

			case lHost:
				host = li.vs

			case lSitemap:
				sitemaps = append(sitemaps, li.vs)

			case lCrawlDelay:
				if len(agents) == 0 {
					errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos))
				} else {
					isEmptyGroup = false
					delay := time.Duration(li.vf * float64(time.Second))
					parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay })
				}
			}
		}
	}
	return
}

func (p *parser) parseLine() (li *lineInfo, err error) {
	t1, ok1 := p.popToken()
	if !ok1 {
		// proper EOF
		return nil, io.EOF
	}

	t2, ok2 := p.peekToken()
	if !ok2 {
		// EOF, no value associated with the token, so ignore token and return
		return nil, io.EOF
	}

	// Helper closure for all string-based tokens, common behaviour:
	// - Consume t2 token
	// - If empty, return unknown line info
	// - Otherwise return the specified line info
	returnStringVal := func(t lineType) (*lineInfo, error) {
		p.popToken()
		if t2 != "" {
			return &lineInfo{t: t, k: t1, vs: t2}, nil
		}
		return &lineInfo{t: lIgnore}, nil
	}

	// Helper closure for all path tokens (allow/disallow), common behaviour:
	// - Consume t2 token
	// - If empty, return unknown line info
	// - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*")
	// - Detect if wildcards are present, if so, compile into a regexp
	// - Return the specified line info
	returnPathVal := func(t lineType) (*lineInfo, error) {
		p.popToken()
		if t2 != "" {
			if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") {
				t2 = "/" + t2
			}
			t2 = strings.TrimRightFunc(t2, isAsterisk)
			// From google's spec:
			// Google, Bing, Yahoo, and Ask support a limited form of
			// "wildcards" for path values. These are:
			//   * designates 0 or more instances of any valid character
			//   $ designates the end of the URL
			if strings.ContainsAny(t2, "*$") {
				// Must compile a regexp, this is a pattern.
				// Escape string before compile.
				t2 = regexp.QuoteMeta(t2)
				t2 = strings.Replace(t2, `\*`, `.*`, -1)
				t2 = strings.Replace(t2, `\$`, `$`, -1)
				if r, e := regexp.Compile(t2); e != nil {
					return nil, e
				} else {
					return &lineInfo{t: t, k: t1, vr: r}, nil
				}
			} else {
				// Simple string path
				return &lineInfo{t: t, k: t1, vs: t2}, nil
			}
		}
		return &lineInfo{t: lIgnore}, nil
	}

	switch strings.ToLower(t1) {
	case tokEOL:
		// Don't consume t2 and continue parsing
		return &lineInfo{t: lIgnore}, nil

	case "user-agent", "useragent":
		// From google's spec:
		// Handling of <field> elements with simple errors / typos (eg "useragent"
		// instead of "user-agent") is undefined and may be interpreted as correct
		// directives by some user-agents.
		// The user-agent is non-case-sensitive.
		t2 = strings.ToLower(t2)
		return returnStringVal(lUserAgent)

	case "disallow":
		// From google's spec:
		// When no path is specified, the directive is ignored (so an empty Disallow
		// CAN be an allow, since allow is the default. The actual result depends
		// on the other rules in the group).
		return returnPathVal(lDisallow)

	case "allow":
		// From google's spec:
		// When no path is specified, the directive is ignored.
		return returnPathVal(lAllow)

	case "host":
		// Host directive to specify main site mirror
		// Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host
		return returnStringVal(lHost)

	case "sitemap":
		// Non-group field, applies to the host as a whole, not to a specific user-agent
		return returnStringVal(lSitemap)

	case "crawl-delay", "crawldelay":
		// From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions
		// Several major crawlers support a Crawl-delay parameter, set to the
		// number of seconds to wait between successive requests to the same server.
		p.popToken()
		if cd, e := strconv.ParseFloat(t2, 64); e != nil {
			return nil, e
		} else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) {
			return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2)
		} else {
			return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil
		}
	}

	// Consume t2 token
	p.popToken()
	return &lineInfo{t: lUnknown, k: t1}, nil
}

func (p *parser) popToken() (tok string, ok bool) {
	tok, ok = p.peekToken()
	if !ok {
		return
	}
	p.pos++
	return tok, true
}

func (p *parser) peekToken() (tok string, ok bool) {
	if p.pos >= len(p.tokens) {
		return "", false
	}
	return p.tokens[p.pos], true
}

func isAsterisk(r rune) bool {
	return r == '*'
}
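The only non-obvious step in returnPathVal above is how the `*` and `$` wildcards become a regexp: the whole path is escaped with regexp.QuoteMeta, then the two escaped wildcards are selectively turned back into their regexp equivalents. Here is a standalone sketch of that translation, using the same three calls as the parser on an invented Disallow value:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// An invented Disallow value using both supported wildcards:
	// '*' matches any run of characters, '$' anchors the URL end.
	path := "/private*.html$"

	// Same transformation as returnPathVal in parser.go.
	p := regexp.QuoteMeta(path)            // now `/private\*\.html\$`
	p = strings.Replace(p, `\*`, `.*`, -1) // restore * as "any characters"
	p = strings.Replace(p, `\$`, `$`, -1)  // restore $ as end-of-string anchor
	re := regexp.MustCompile(p)            // `/private.*\.html$`

	fmt.Println(re.MatchString("/private/2024/page.html")) // true
	fmt.Println(re.MatchString("/private/page.html.bak"))  // false: $ anchor
}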
							
								
								
									
vendor/github.com/temoto/robotstxt/robotstxt.go (generated, vendored, new file, +227 lines)
@@ -0,0 +1,227 @@
// Package robotstxt implements the robots.txt Exclusion Protocol
// as specified in http://www.robotstxt.org/wc/robots.html
// with various extensions.
package robotstxt

// Comments explaining the logic are taken from either the Google's spec:
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt

import (
	"bytes"
	"errors"
	"io/ioutil"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"time"
)

type RobotsData struct {
	// private
	groups      map[string]*Group
	allowAll    bool
	disallowAll bool
	Host        string
	Sitemaps    []string
}

type Group struct {
	rules      []*rule
	Agent      string
	CrawlDelay time.Duration
}

type rule struct {
	path    string
	allow   bool
	pattern *regexp.Regexp
}

type ParseError struct {
	Errs []error
}

func newParseError(errs []error) *ParseError {
	return &ParseError{errs}
}

func (e ParseError) Error() string {
	var b bytes.Buffer

	b.WriteString("Parse error(s): " + "\n")
	for _, er := range e.Errs {
		b.WriteString(er.Error() + "\n")
	}
	return b.String()
}

var allowAll = &RobotsData{allowAll: true}
var disallowAll = &RobotsData{disallowAll: true}
var emptyGroup = &Group{}

func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) {
	switch {
	case statusCode >= 200 && statusCode < 300:
		return FromBytes(body)

	// From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
	//
	// Google treats all 4xx errors in the same way and assumes that no valid
	// robots.txt file exists. It is assumed that there are no restrictions.
	// This is a "full allow" for crawling. Note: this includes 401
	// "Unauthorized" and 403 "Forbidden" HTTP result codes.
	case statusCode >= 400 && statusCode < 500:
		return allowAll, nil

	// From Google's spec:
	// Server errors (5xx) are seen as temporary errors that result in a "full
	// disallow" of crawling.
	case statusCode >= 500 && statusCode < 600:
		return disallowAll, nil
	}

	return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode))
}

func FromStatusAndString(statusCode int, body string) (*RobotsData, error) {
	return FromStatusAndBytes(statusCode, []byte(body))
}

func FromResponse(res *http.Response) (*RobotsData, error) {
	if res == nil {
		// Edge case, if res is nil, return nil data
		return nil, nil
	}
	buf, e := ioutil.ReadAll(res.Body)
	if e != nil {
		return nil, e
	}
	return FromStatusAndBytes(res.StatusCode, buf)
}

func FromBytes(body []byte) (r *RobotsData, err error) {
	var errs []error

	// special case (probably not worth optimization?)
	trimmed := bytes.TrimSpace(body)
	if len(trimmed) == 0 {
		return allowAll, nil
	}

	sc := newByteScanner("bytes", true)
	//sc.Quiet = !print_errors
	sc.feed(body, true)
	tokens := sc.scanAll()

	// special case worth optimization
	if len(tokens) == 0 {
		return allowAll, nil
	}

	r = &RobotsData{}
	parser := newParser(tokens)
	r.groups, r.Host, r.Sitemaps, errs = parser.parseAll()
	if len(errs) > 0 {
		return nil, newParseError(errs)
	}

	return r, nil
}

func FromString(body string) (r *RobotsData, err error) {
	return FromBytes([]byte(body))
}

func (r *RobotsData) TestAgent(path, agent string) bool {
	if r.allowAll {
		return true
	}
	if r.disallowAll {
		return false
	}

	// Find a group of rules that applies to this agent
	// From Google's spec:
	// The user-agent is non-case-sensitive.
	g := r.FindGroup(agent)
	return g.Test(path)
}

// FindGroup searches block of declarations for specified user-agent.
// From Google's spec:
// Only one group of group-member records is valid for a particular crawler.
// The crawler must determine the correct group of records by finding the group
// with the most specific user-agent that still matches. All other groups of
// records are ignored by the crawler. The user-agent is non-case-sensitive.
// The order of the groups within the robots.txt file is irrelevant.
func (r *RobotsData) FindGroup(agent string) (ret *Group) {
	var prefixLen int

	agent = strings.ToLower(agent)
	if ret = r.groups["*"]; ret != nil {
		// Weakest match possible
		prefixLen = 1
	}
	for a, g := range r.groups {
		if a != "*" && strings.HasPrefix(agent, a) {
			if l := len(a); l > prefixLen {
				prefixLen = l
				ret = g
			}
		}
	}

	if ret == nil {
		return emptyGroup
	}
	return
}

func (g *Group) Test(path string) bool {
	if r := g.findRule(path); r != nil {
		return r.allow
	}

	// From Google's spec:
	// By default, there are no restrictions for crawling for the designated crawlers.
	return true
}

// From Google's spec:
// The path value is used as a basis to determine whether or not a rule applies
// to a specific URL on a site. With the exception of wildcards, the path is
// used to match the beginning of a URL (and any valid URLs that start with the
// same path).
//
// At a group-member level, in particular for allow and disallow directives,
// the most specific rule based on the length of the [path] entry will trump
// the less specific (shorter) rule. The order of precedence for rules with
// wildcards is undefined.
func (g *Group) findRule(path string) (ret *rule) {
	var prefixLen int

	for _, r := range g.rules {
		if r.pattern != nil {
			if r.pattern.MatchString(path) {
				// Consider this a match equal to the length of the pattern.
				// From Google's spec:
				// The order of precedence for rules with wildcards is undefined.
				if l := len(r.pattern.String()); l > prefixLen {
					prefixLen = l
					ret = r
				}
			}
		} else if r.path == "/" && prefixLen == 0 {
			// Weakest match possible
			prefixLen = 1
			ret = r
		} else if strings.HasPrefix(path, r.path) {
			if l := len(r.path); l > prefixLen {
				prefixLen = l
				ret = r
			}
		}
	}
	return
}
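FindGroup's longest-prefix rule means an agent such as "foobot-images" picks a "foobot" group over the catch-all "*" group, and only falls back to "*" when no longer prefix matches. A short demonstration against an invented robots.txt:

package main

import (
	"fmt"

	"github.com/temoto/robotstxt"
)

func main() {
	robots, err := robotstxt.FromString(
		"User-agent: *\nDisallow: /\n\n" +
			"User-agent: foobot\nAllow: /\n")
	if err != nil {
		panic(err)
	}

	// "foobot-images" (lowercased) matches the "foobot" group by prefix
	// (length 6), which beats the "*" group (length 1): access is allowed.
	fmt.Println(robots.TestAgent("/anything", "FooBot-Images")) // true

	// An unrelated agent only matches "*", where "Disallow: /" blocks it.
	fmt.Println(robots.TestAgent("/anything", "OtherBot")) // false
}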
							
								
								
									
vendor/github.com/temoto/robotstxt/scanner.go (generated, vendored, new file, +185 lines)
@@ -0,0 +1,185 @@
package robotstxt

import (
	"bytes"
	"fmt"
	"go/token"
	"os"
	"sync"
	"unicode/utf8"
)

type byteScanner struct {
	pos           token.Position
	buf           []byte
	ErrorCount    int
	ch            rune
	Quiet         bool
	keyTokenFound bool
	lastChunk     bool
}

const tokEOL = "\n"

var WhitespaceChars = []rune{' ', '\t', '\v'}
var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }}

func newByteScanner(srcname string, quiet bool) *byteScanner {
	return &byteScanner{
		Quiet: quiet,
		ch:    -1,
		pos:   token.Position{Filename: srcname},
	}
}

func (s *byteScanner) feed(input []byte, end bool) {
	s.buf = input
	s.pos.Offset = 0
	s.pos.Line = 1
	s.pos.Column = 1
	s.lastChunk = end

	// Read first char into look-ahead buffer `s.ch`.
	if !s.nextChar() {
		return
	}

	// Skip UTF-8 byte order mark
	if s.ch == 65279 {
		s.nextChar()
		s.pos.Column = 1
	}
}

func (s *byteScanner) GetPosition() token.Position {
	return s.pos
}

func (s *byteScanner) scan() string {
	// Note Offset > len, not >=, so we can scan last character.
	if s.lastChunk && s.pos.Offset > len(s.buf) {
		return ""
	}

	s.skipSpace()

	if s.ch == -1 {
		return ""
	}

	// EOL
	if s.isEol() {
		s.keyTokenFound = false
		// skip subsequent newline chars
		for s.ch != -1 && s.isEol() {
			s.nextChar()
		}
		// emit newline as separate token
		return tokEOL
	}

	// skip comments
	if s.ch == '#' {
		s.keyTokenFound = false
		s.skipUntilEol()
		if s.ch == -1 {
			return ""
		}
		// emit newline as separate token
		return tokEOL
	}

	// else we found something
	tok := tokBuffers.Get().(*bytes.Buffer)
	defer tokBuffers.Put(tok)
	tok.Reset()
	tok.WriteRune(s.ch)
	s.nextChar()
	for s.ch != -1 && !s.isSpace() && !s.isEol() {
		// Do not consider ":" to be a token separator if a first key token
		// has already been found on this line (avoid cutting an absolute URL
		// after the "http:")
		if s.ch == ':' && !s.keyTokenFound {
			s.nextChar()
			s.keyTokenFound = true
			break
		}

		tok.WriteRune(s.ch)
		s.nextChar()
	}
	return tok.String()
}

func (s *byteScanner) scanAll() []string {
	results := make([]string, 0, 64) // random guess of average tokens length
	for {
		token := s.scan()
		if token != "" {
			results = append(results, token)
		} else {
			break
		}
	}
	return results
}

func (s *byteScanner) error(pos token.Position, msg string) {
	s.ErrorCount++
	if !s.Quiet {
		fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg)
	}
}

func (s *byteScanner) isEol() bool {
	return s.ch == '\n' || s.ch == '\r'
}

func (s *byteScanner) isSpace() bool {
	for _, r := range WhitespaceChars {
		if s.ch == r {
			return true
		}
	}
	return false
}

func (s *byteScanner) skipSpace() {
	for s.ch != -1 && s.isSpace() {
		s.nextChar()
	}
}

func (s *byteScanner) skipUntilEol() {
	for s.ch != -1 && !s.isEol() {
		s.nextChar()
	}
	// skip subsequent newline chars
	for s.ch != -1 && s.isEol() {
		s.nextChar()
	}
}

// Reads next Unicode char.
func (s *byteScanner) nextChar() bool {
	if s.pos.Offset >= len(s.buf) {
		s.ch = -1
		return false
	}
	s.pos.Column++
	if s.ch == '\n' {
		s.pos.Line++
		s.pos.Column = 1
	}
	r, w := rune(s.buf[s.pos.Offset]), 1
	if r >= 0x80 {
		r, w = utf8.DecodeRune(s.buf[s.pos.Offset:])
		if r == utf8.RuneError && w == 1 {
			s.error(s.pos, "illegal UTF-8 encoding")
		}
	}
	s.pos.Column++
	s.pos.Offset += w
	s.ch = r
	return true
}
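Since byteScanner is unexported, it can only be exercised from inside the package. The following hypothetical in-package test (not part of this commit) sketches what the scanner emits: keys split at the first ":" on a line, values kept as-is (lowercasing is left to the parser), and newlines emitted as their own "\n" tokens.

package robotstxt

import (
	"reflect"
	"testing"
)

// TestScanAllSketch is an illustrative sketch, not part of the vendored
// library: it shows how scanAll splits "key: value" lines into separate
// key and value tokens with "\n" separators between lines.
func TestScanAllSketch(t *testing.T) {
	sc := newByteScanner("test", true)
	sc.feed([]byte("User-agent: *\nDisallow: /private\n"), true)

	got := sc.scanAll()
	want := []string{"User-agent", "*", "\n", "Disallow", "/private", "\n"}
	if !reflect.DeepEqual(got, want) {
		t.Fatalf("got %q, want %q", got, want)
	}
}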