mirror of https://github.com/superseriousbusiness/gotosocial.git
[bug] respect X-Robots-Tag and robots.txt on api/v1/instance and nodeinfo (#3756)

* feat: check X-Robots-Tag when accessing /api/v1/instance or /nodeinfo endpoints
* chore: go fmt ./...
* Check robots.txt as well, add tests

Co-authored-by: tobi <tobi.smethurst@protonmail.com>
parent 2c95fd4115
commit d0de3ad492

20 changed files with 1404 additions and 24 deletions
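The GoToSocial-side change itself is not among the vendored files reproduced below. As a rough illustration of the behaviour the commit message describes, here is a minimal, hypothetical sketch of a client-side check that consults a remote host's robots.txt (parsed with the vendored temoto/robotstxt package) and the X-Robots-Tag response header before using its /api/v1/instance or /nodeinfo data; the helper name, host, and user agent are invented for illustration and this is not the code from this commit.

package main

import (
	"fmt"
	"net/http"
	"strings"

	"github.com/temoto/robotstxt"
)

// crawlPermitted is a hypothetical helper sketching the behaviour the
// commit message describes: honour the remote robots.txt, then honour
// any X-Robots-Tag "noindex" on the resource itself.
func crawlPermitted(host, path, userAgent string) (bool, error) {
	// Fetch and parse robots.txt; per the vendored library, 4xx
	// means "allow all" and 5xx means "disallow all".
	resp, err := http.Get("https://" + host + "/robots.txt")
	if err != nil {
		return false, err
	}
	robots, err := robotstxt.FromResponse(resp)
	resp.Body.Close() // FromResponse does not close the body
	if err != nil {
		return false, err
	}
	if !robots.TestAgent(path, userAgent) {
		return false, nil
	}

	// Fetch the target and respect any X-Robots-Tag header.
	resp2, err := http.Get("https://" + host + path)
	if err != nil {
		return false, err
	}
	defer resp2.Body.Close()
	for _, v := range resp2.Header.Values("X-Robots-Tag") {
		if strings.Contains(strings.ToLower(v), "noindex") {
			return false, nil
		}
	}
	return true, nil
}

func main() {
	ok, err := crawlPermitted("example.org", "/api/v1/instance", "gotosocial")
	fmt.Println(ok, err)
}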
				
			
		
							
								
								
									
vendor/github.com/temoto/robotstxt/.gitignore (generated, vendored, new file, +15 lines)
@@ -0,0 +1,15 @@
*.cgo?.*
*.o
*.so
*.sublime-*
*.zip
.DS_Store
.idea/
.tags*
_cgo_*
_gofuzz/crashers/
_gofuzz/suppressions/
_obj
_test
coverage.txt
robots.txt-check/robots.txt-check
							
								
								
									
vendor/github.com/temoto/robotstxt/.golangci.yml (generated, vendored, new file, +20 lines)
@@ -0,0 +1,20 @@
linters:
  enable:
    - goconst
    - gofmt
    - gosec
    - maligned
    - prealloc
    - staticcheck
  disable:
    - deadcode
    - structcheck
    - varcheck

linters-settings:
  gofmt:
    simplify: true
  govet:
    check-shadowing: true
  maligned:
    suggest-new: true
							
								
								
									
vendor/github.com/temoto/robotstxt/.travis.yml (generated, vendored, new file, +30 lines)
@@ -0,0 +1,30 @@
cache:
  go: true
  directories:
  - $HOME/.cache
  - $HOME/bin
  - $HOME/gopath/pkg/mod
language: go
go:
- 1.11
- 1.12
- 1.13
- 1.14
- 1.x
- master
install: true
script: GO111MODULE=on go test -race

matrix:
  include:
  - go: 1.x
    env: task=coverage
    script: GO111MODULE=on go test -race -covermode=atomic -coverprofile=coverage.txt
    after_success: bash <(curl -s https://codecov.io/bash)
  - go: 1.x
    env: task=bench
    script: GO111MODULE=on ./script/bench
  - go: 1.x
    install: curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | bash -s -- -b $HOME/bin v1.19.1
    env: task=clean
    script: GO111MODULE=on ./script/clean
							
								
								
									
vendor/github.com/temoto/robotstxt/LICENSE (generated, vendored, new file, +21 lines)
@@ -0,0 +1,21 @@
The MIT License

Copyright (c) 2010 Sergey Shepelev <temotor@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
							
								
								
									
vendor/github.com/temoto/robotstxt/README.rst (generated, vendored, new file, +115 lines)
@@ -0,0 +1,115 @@
What
====

This is a robots.txt exclusion protocol implementation for Go language (golang).


Build
=====

To build and run tests run `go test` in source directory.


Contribute
==========

Warm welcome.

* If desired, add your name in README.rst, section Who.
* Run `script/test && script/clean && echo ok`
* You can ignore linter warnings, but everything else must pass.
* Send your change as pull request or just a regular patch to current maintainer (see section Who).

Thank you.


Usage
=====

As usual, no special installation is required, just

    import "github.com/temoto/robotstxt"

run `go get` and you're ready.

1. Parse
^^^^^^^^

First of all, you need to parse robots.txt data. You can do it with
functions `FromBytes(body []byte) (*RobotsData, error)` or same for `string`::

    robots, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow:"))
    robots, err := robotstxt.FromString("User-agent: *\nDisallow:")

As of 2012-10-03, `FromBytes` is the most efficient method, everything else
is a wrapper for this core function.

There are few convenient constructors for various purposes:

* `FromResponse(*http.Response) (*RobotsData, error)` to init robots data
  from HTTP response. It *does not* call `response.Body.Close()`::

    robots, err := robotstxt.FromResponse(resp)
    resp.Body.Close()
    if err != nil {
        log.Println("Error parsing robots.txt:", err.Error())
    }

* `FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error)` or
  `FromStatusAndString` if you prefer to read bytes (string) yourself.
  Passing status code applies following logic in line with Google's interpretation
  of robots.txt files:

    * status 2xx  -> parse body with `FromBytes` and apply rules listed there.
    * status 4xx  -> allow all (even 401/403, as recommended by Google).
    * other (5xx) -> disallow all, consider this a temporary unavailability.

2. Query
^^^^^^^^

Parsing robots.txt content builds a kind of logic database, which you can
query with `(r *RobotsData) TestAgent(url, agent string) (bool)`.

Explicit passing of agent is useful if you want to query for different agents. For
single agent users there is an efficient option: `RobotsData.FindGroup(userAgent string)`
returns a structure with `.Test(path string)` method and `.CrawlDelay time.Duration`.

Simple query with explicit user agent. Each call will scan all rules.

::

    allow := robots.TestAgent("/", "FooBot")

Or query several paths against same user agent for performance.

::

    group := robots.FindGroup("BarBot")
    group.Test("/")
    group.Test("/download.mp3")
    group.Test("/news/article-2012-1")


Who
===

Honorable contributors (in undefined order):

    * Ilya Grigorik (igrigorik)
    * Martin Angers (PuerkitoBio)
    * Micha Gorelick (mynameisfiber)

Initial commit and other: Sergey Shepelev temotor@gmail.com


Flair
=====

.. image:: https://travis-ci.org/temoto/robotstxt.svg?branch=master
    :target: https://travis-ci.org/temoto/robotstxt

.. image:: https://codecov.io/gh/temoto/robotstxt/branch/master/graph/badge.svg
    :target: https://codecov.io/gh/temoto/robotstxt

.. image:: https://goreportcard.com/badge/github.com/temoto/robotstxt
    :target: https://goreportcard.com/report/github.com/temoto/robotstxt
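To make the README's parse-then-query workflow concrete, here is a small self-contained program using only the API documented above (FromStatusAndString, FindGroup, Test, CrawlDelay); the robots.txt content and bot name are invented for illustration.

package main

import (
	"fmt"

	"github.com/temoto/robotstxt"
)

func main() {
	// Status 200: the body is parsed. A 404 would mean "allow all",
	// a 503 "disallow all", per the status logic described above.
	robots, err := robotstxt.FromStatusAndString(200,
		"User-agent: *\nDisallow: /private/\nCrawl-delay: 1.5")
	if err != nil {
		panic(err)
	}

	// One FindGroup call, then cheap Test calls against that group.
	group := robots.FindGroup("FooBot")
	fmt.Println(group.Test("/"))          // true: nothing disallows it
	fmt.Println(group.Test("/private/x")) // false: matches the Disallow prefix
	fmt.Println(group.CrawlDelay)         // 1.5s
}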
							
								
								
									
vendor/github.com/temoto/robotstxt/codecov.yml (generated, vendored, new file, +2 lines)
@@ -0,0 +1,2 @@
codecov:
  token: 6bf9c7eb-69ff-4b74-8464-e2fb452d0f04
							
								
								
									
vendor/github.com/temoto/robotstxt/fuzz.go (generated, vendored, new file, +29 lines)
@@ -0,0 +1,29 @@
// +build gofuzz

package robotstxt

import "testing/quick"

func Fuzz(data []byte) int {
	r, err := FromBytes(data)
	if err != nil {
		if r != nil {
			panic("r != nil on error")
		}
		return 0
	}

	// FindGroup must never return nil
	f1 := func(agent string) bool { return r.FindGroup(agent) != nil }
	if err := quick.Check(f1, nil); err != nil {
		panic(err)
	}

	// just check TestAgent doesn't panic
	f2 := func(path, agent string) bool { r.TestAgent(path, agent); return true }
	if err := quick.Check(f2, nil); err != nil {
		panic(err)
	}

	return 1
}
							
								
								
									
vendor/github.com/temoto/robotstxt/parser.go (generated, vendored, new file, +271 lines)
@@ -0,0 +1,271 @@
package robotstxt

// Comments explaining the logic are taken from either the google's spec:
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
//
// or the Wikipedia's entry on robots.txt:
// http://en.wikipedia.org/wiki/Robots.txt

import (
	"fmt"
	"io"
	"math"
	"regexp"
	"strconv"
	"strings"
	"time"
)

type lineType uint

const (
	lIgnore lineType = iota
	lUnknown
	lUserAgent
	lAllow
	lDisallow
	lCrawlDelay
	lSitemap
	lHost
)

type parser struct {
	tokens []string
	pos    int
}

type lineInfo struct {
	t  lineType       // Type of line key
	k  string         // String representation of the type of key
	vs string         // String value of the key
	vf float64        // Float value of the key
	vr *regexp.Regexp // Regexp value of the key
}

func newParser(tokens []string) *parser {
	return &parser{tokens: tokens}
}

func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) {
	var g *Group
	for _, a := range agents {
		if g = groups[a]; g == nil {
			g = new(Group)
			groups[a] = g
		}
		fun(g)
	}
}

func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) {
	groups = make(map[string]*Group, 16)
	agents := make([]string, 0, 4)
	isEmptyGroup := true

	// Reset internal fields, tokens are assigned at creation time, never change
	p.pos = 0

	for {
		if li, err := p.parseLine(); err != nil {
			if err == io.EOF {
				break
			}
			errs = append(errs, err)
		} else {
			switch li.t {
			case lUserAgent:
				// Two successive user-agent lines are part of the same group.
				if !isEmptyGroup {
					// End previous group
					agents = make([]string, 0, 4)
				}
				if len(agents) == 0 {
					isEmptyGroup = true
				}
				agents = append(agents, li.vs)

			case lDisallow:
				// Error if no current group
				if len(agents) == 0 {
					errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos))
				} else {
					isEmptyGroup = false
					var r *rule
					if li.vr != nil {
						r = &rule{"", false, li.vr}
					} else {
						r = &rule{li.vs, false, nil}
					}
					parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
				}

			case lAllow:
				// Error if no current group
				if len(agents) == 0 {
					errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos))
				} else {
					isEmptyGroup = false
					var r *rule
					if li.vr != nil {
						r = &rule{"", true, li.vr}
					} else {
						r = &rule{li.vs, true, nil}
					}
					parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
				}

			case lHost:
				host = li.vs

			case lSitemap:
				sitemaps = append(sitemaps, li.vs)

			case lCrawlDelay:
				if len(agents) == 0 {
					errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos))
				} else {
					isEmptyGroup = false
					delay := time.Duration(li.vf * float64(time.Second))
					parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay })
				}
			}
		}
	}
	return
}

func (p *parser) parseLine() (li *lineInfo, err error) {
	t1, ok1 := p.popToken()
	if !ok1 {
		// proper EOF
		return nil, io.EOF
	}

	t2, ok2 := p.peekToken()
	if !ok2 {
		// EOF, no value associated with the token, so ignore token and return
		return nil, io.EOF
	}

	// Helper closure for all string-based tokens, common behaviour:
	// - Consume t2 token
	// - If empty, return unknown line info
	// - Otherwise return the specified line info
	returnStringVal := func(t lineType) (*lineInfo, error) {
		p.popToken()
		if t2 != "" {
			return &lineInfo{t: t, k: t1, vs: t2}, nil
		}
		return &lineInfo{t: lIgnore}, nil
	}

	// Helper closure for all path tokens (allow/disallow), common behaviour:
	// - Consume t2 token
	// - If empty, return unknown line info
	// - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*")
	// - Detect if wildcards are present, if so, compile into a regexp
	// - Return the specified line info
	returnPathVal := func(t lineType) (*lineInfo, error) {
		p.popToken()
		if t2 != "" {
			if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") {
				t2 = "/" + t2
			}
			t2 = strings.TrimRightFunc(t2, isAsterisk)
			// From google's spec:
			// Google, Bing, Yahoo, and Ask support a limited form of
			// "wildcards" for path values. These are:
			//   * designates 0 or more instances of any valid character
			//   $ designates the end of the URL
			if strings.ContainsAny(t2, "*$") {
				// Must compile a regexp, this is a pattern.
				// Escape string before compile.
				t2 = regexp.QuoteMeta(t2)
				t2 = strings.Replace(t2, `\*`, `.*`, -1)
				t2 = strings.Replace(t2, `\$`, `$`, -1)
				if r, e := regexp.Compile(t2); e != nil {
					return nil, e
				} else {
					return &lineInfo{t: t, k: t1, vr: r}, nil
				}
			} else {
				// Simple string path
				return &lineInfo{t: t, k: t1, vs: t2}, nil
			}
		}
		return &lineInfo{t: lIgnore}, nil
	}

	switch strings.ToLower(t1) {
	case tokEOL:
		// Don't consume t2 and continue parsing
		return &lineInfo{t: lIgnore}, nil

	case "user-agent", "useragent":
		// From google's spec:
		// Handling of <field> elements with simple errors / typos (eg "useragent"
		// instead of "user-agent") is undefined and may be interpreted as correct
		// directives by some user-agents.
		// The user-agent is non-case-sensitive.
		t2 = strings.ToLower(t2)
		return returnStringVal(lUserAgent)

	case "disallow":
		// From google's spec:
		// When no path is specified, the directive is ignored (so an empty Disallow
		// CAN be an allow, since allow is the default. The actual result depends
		// on the other rules in the group).
		return returnPathVal(lDisallow)

	case "allow":
		// From google's spec:
		// When no path is specified, the directive is ignored.
		return returnPathVal(lAllow)

	case "host":
		// Host directive to specify main site mirror
		// Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host
		return returnStringVal(lHost)

	case "sitemap":
		// Non-group field, applies to the host as a whole, not to a specific user-agent
		return returnStringVal(lSitemap)

	case "crawl-delay", "crawldelay":
		// From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions
		// Several major crawlers support a Crawl-delay parameter, set to the
		// number of seconds to wait between successive requests to the same server.
		p.popToken()
		if cd, e := strconv.ParseFloat(t2, 64); e != nil {
			return nil, e
		} else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) {
			return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2)
		} else {
			return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil
		}
	}

	// Consume t2 token
	p.popToken()
	return &lineInfo{t: lUnknown, k: t1}, nil
}

func (p *parser) popToken() (tok string, ok bool) {
	tok, ok = p.peekToken()
	if !ok {
		return
	}
	p.pos++
	return tok, true
}

func (p *parser) peekToken() (tok string, ok bool) {
	if p.pos >= len(p.tokens) {
		return "", false
	}
	return p.tokens[p.pos], true
}

func isAsterisk(r rune) bool {
	return r == '*'
}
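The only non-obvious step in returnPathVal above is how the `*` and `$` wildcards become a regexp: the whole path is escaped with regexp.QuoteMeta, then the two escaped wildcards are selectively turned back into their regexp equivalents. Here is a standalone sketch of that translation, using the same three calls as the parser on an invented Disallow value:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// An invented Disallow value using both supported wildcards:
	// '*' matches any run of characters, '$' anchors the URL end.
	path := "/private*.html$"

	// Same transformation as returnPathVal in parser.go.
	p := regexp.QuoteMeta(path)            // now `/private\*\.html\$`
	p = strings.Replace(p, `\*`, `.*`, -1) // restore * as "any characters"
	p = strings.Replace(p, `\$`, `$`, -1)  // restore $ as end-of-string anchor
	re := regexp.MustCompile(p)            // `/private.*\.html$`

	fmt.Println(re.MatchString("/private/2024/page.html")) // true
	fmt.Println(re.MatchString("/private/page.html.bak"))  // false: $ anchor
}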
							
								
								
									
vendor/github.com/temoto/robotstxt/robotstxt.go (generated, vendored, new file, +227 lines)
@@ -0,0 +1,227 @@
// Package robotstxt implements the robots.txt Exclusion Protocol
// as specified in http://www.robotstxt.org/wc/robots.html
// with various extensions.
package robotstxt

// Comments explaining the logic are taken from either the Google's spec:
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt

import (
	"bytes"
	"errors"
	"io/ioutil"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"time"
)

type RobotsData struct {
	// private
	groups      map[string]*Group
	allowAll    bool
	disallowAll bool
	Host        string
	Sitemaps    []string
}

type Group struct {
	rules      []*rule
	Agent      string
	CrawlDelay time.Duration
}

type rule struct {
	path    string
	allow   bool
	pattern *regexp.Regexp
}

type ParseError struct {
	Errs []error
}

func newParseError(errs []error) *ParseError {
	return &ParseError{errs}
}

func (e ParseError) Error() string {
	var b bytes.Buffer

	b.WriteString("Parse error(s): " + "\n")
	for _, er := range e.Errs {
		b.WriteString(er.Error() + "\n")
	}
	return b.String()
}

var allowAll = &RobotsData{allowAll: true}
var disallowAll = &RobotsData{disallowAll: true}
var emptyGroup = &Group{}

func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) {
	switch {
	case statusCode >= 200 && statusCode < 300:
		return FromBytes(body)

	// From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
	//
	// Google treats all 4xx errors in the same way and assumes that no valid
	// robots.txt file exists. It is assumed that there are no restrictions.
	// This is a "full allow" for crawling. Note: this includes 401
	// "Unauthorized" and 403 "Forbidden" HTTP result codes.
	case statusCode >= 400 && statusCode < 500:
		return allowAll, nil

	// From Google's spec:
	// Server errors (5xx) are seen as temporary errors that result in a "full
	// disallow" of crawling.
	case statusCode >= 500 && statusCode < 600:
		return disallowAll, nil
	}

	return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode))
}

func FromStatusAndString(statusCode int, body string) (*RobotsData, error) {
	return FromStatusAndBytes(statusCode, []byte(body))
}

func FromResponse(res *http.Response) (*RobotsData, error) {
	if res == nil {
		// Edge case, if res is nil, return nil data
		return nil, nil
	}
	buf, e := ioutil.ReadAll(res.Body)
	if e != nil {
		return nil, e
	}
	return FromStatusAndBytes(res.StatusCode, buf)
}

func FromBytes(body []byte) (r *RobotsData, err error) {
	var errs []error

	// special case (probably not worth optimization?)
	trimmed := bytes.TrimSpace(body)
	if len(trimmed) == 0 {
		return allowAll, nil
	}

	sc := newByteScanner("bytes", true)
	//sc.Quiet = !print_errors
	sc.feed(body, true)
	tokens := sc.scanAll()

	// special case worth optimization
	if len(tokens) == 0 {
		return allowAll, nil
	}

	r = &RobotsData{}
	parser := newParser(tokens)
	r.groups, r.Host, r.Sitemaps, errs = parser.parseAll()
	if len(errs) > 0 {
		return nil, newParseError(errs)
	}

	return r, nil
}

func FromString(body string) (r *RobotsData, err error) {
	return FromBytes([]byte(body))
}

func (r *RobotsData) TestAgent(path, agent string) bool {
	if r.allowAll {
		return true
	}
	if r.disallowAll {
		return false
	}

	// Find a group of rules that applies to this agent
	// From Google's spec:
	// The user-agent is non-case-sensitive.
	g := r.FindGroup(agent)
	return g.Test(path)
}

// FindGroup searches block of declarations for specified user-agent.
// From Google's spec:
// Only one group of group-member records is valid for a particular crawler.
// The crawler must determine the correct group of records by finding the group
// with the most specific user-agent that still matches. All other groups of
// records are ignored by the crawler. The user-agent is non-case-sensitive.
// The order of the groups within the robots.txt file is irrelevant.
func (r *RobotsData) FindGroup(agent string) (ret *Group) {
	var prefixLen int

	agent = strings.ToLower(agent)
	if ret = r.groups["*"]; ret != nil {
		// Weakest match possible
		prefixLen = 1
	}
	for a, g := range r.groups {
		if a != "*" && strings.HasPrefix(agent, a) {
			if l := len(a); l > prefixLen {
				prefixLen = l
				ret = g
			}
		}
	}

	if ret == nil {
		return emptyGroup
	}
	return
}

func (g *Group) Test(path string) bool {
	if r := g.findRule(path); r != nil {
		return r.allow
	}

	// From Google's spec:
	// By default, there are no restrictions for crawling for the designated crawlers.
	return true
}

// From Google's spec:
// The path value is used as a basis to determine whether or not a rule applies
// to a specific URL on a site. With the exception of wildcards, the path is
// used to match the beginning of a URL (and any valid URLs that start with the
// same path).
//
// At a group-member level, in particular for allow and disallow directives,
// the most specific rule based on the length of the [path] entry will trump
// the less specific (shorter) rule. The order of precedence for rules with
// wildcards is undefined.
func (g *Group) findRule(path string) (ret *rule) {
	var prefixLen int

	for _, r := range g.rules {
		if r.pattern != nil {
			if r.pattern.MatchString(path) {
				// Consider this a match equal to the length of the pattern.
				// From Google's spec:
				// The order of precedence for rules with wildcards is undefined.
				if l := len(r.pattern.String()); l > prefixLen {
					prefixLen = l
					ret = r
				}
			}
		} else if r.path == "/" && prefixLen == 0 {
			// Weakest match possible
			prefixLen = 1
			ret = r
		} else if strings.HasPrefix(path, r.path) {
			if l := len(r.path); l > prefixLen {
				prefixLen = l
				ret = r
			}
		}
	}
	return
}
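FindGroup's longest-prefix rule means an agent such as "foobot-images" picks a "foobot" group over the catch-all "*" group, and only falls back to "*" when no longer prefix matches. A short demonstration against an invented robots.txt:

package main

import (
	"fmt"

	"github.com/temoto/robotstxt"
)

func main() {
	robots, err := robotstxt.FromString(
		"User-agent: *\nDisallow: /\n\n" +
			"User-agent: foobot\nAllow: /\n")
	if err != nil {
		panic(err)
	}

	// "foobot-images" (lowercased) matches the "foobot" group by prefix
	// (length 6), which beats the "*" group (length 1): access is allowed.
	fmt.Println(robots.TestAgent("/anything", "FooBot-Images")) // true

	// An unrelated agent only matches "*", where "Disallow: /" blocks it.
	fmt.Println(robots.TestAgent("/anything", "OtherBot")) // false
}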
							
								
								
									
vendor/github.com/temoto/robotstxt/scanner.go (generated, vendored, new file, +185 lines)
@@ -0,0 +1,185 @@
package robotstxt

import (
	"bytes"
	"fmt"
	"go/token"
	"os"
	"sync"
	"unicode/utf8"
)

type byteScanner struct {
	pos           token.Position
	buf           []byte
	ErrorCount    int
	ch            rune
	Quiet         bool
	keyTokenFound bool
	lastChunk     bool
}

const tokEOL = "\n"

var WhitespaceChars = []rune{' ', '\t', '\v'}
var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }}

func newByteScanner(srcname string, quiet bool) *byteScanner {
	return &byteScanner{
		Quiet: quiet,
		ch:    -1,
		pos:   token.Position{Filename: srcname},
	}
}

func (s *byteScanner) feed(input []byte, end bool) {
	s.buf = input
	s.pos.Offset = 0
	s.pos.Line = 1
	s.pos.Column = 1
	s.lastChunk = end

	// Read first char into look-ahead buffer `s.ch`.
	if !s.nextChar() {
		return
	}

	// Skip UTF-8 byte order mark
	if s.ch == 65279 {
		s.nextChar()
		s.pos.Column = 1
	}
}

func (s *byteScanner) GetPosition() token.Position {
	return s.pos
}

func (s *byteScanner) scan() string {
	// Note Offset > len, not >=, so we can scan last character.
	if s.lastChunk && s.pos.Offset > len(s.buf) {
		return ""
	}

	s.skipSpace()

	if s.ch == -1 {
		return ""
	}

	// EOL
	if s.isEol() {
		s.keyTokenFound = false
		// skip subsequent newline chars
		for s.ch != -1 && s.isEol() {
			s.nextChar()
		}
		// emit newline as separate token
		return tokEOL
	}

	// skip comments
	if s.ch == '#' {
		s.keyTokenFound = false
		s.skipUntilEol()
		if s.ch == -1 {
			return ""
		}
		// emit newline as separate token
		return tokEOL
	}

	// else we found something
	tok := tokBuffers.Get().(*bytes.Buffer)
	defer tokBuffers.Put(tok)
	tok.Reset()
	tok.WriteRune(s.ch)
	s.nextChar()
	for s.ch != -1 && !s.isSpace() && !s.isEol() {
		// Do not consider ":" to be a token separator if a first key token
		// has already been found on this line (avoid cutting an absolute URL
		// after the "http:")
		if s.ch == ':' && !s.keyTokenFound {
			s.nextChar()
			s.keyTokenFound = true
			break
		}

		tok.WriteRune(s.ch)
		s.nextChar()
	}
	return tok.String()
}

func (s *byteScanner) scanAll() []string {
	results := make([]string, 0, 64) // random guess of average tokens length
	for {
		token := s.scan()
		if token != "" {
			results = append(results, token)
		} else {
			break
		}
	}
	return results
}

func (s *byteScanner) error(pos token.Position, msg string) {
	s.ErrorCount++
	if !s.Quiet {
		fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg)
	}
}

func (s *byteScanner) isEol() bool {
	return s.ch == '\n' || s.ch == '\r'
}

func (s *byteScanner) isSpace() bool {
	for _, r := range WhitespaceChars {
		if s.ch == r {
			return true
		}
	}
	return false
}

func (s *byteScanner) skipSpace() {
	for s.ch != -1 && s.isSpace() {
		s.nextChar()
	}
}

func (s *byteScanner) skipUntilEol() {
	for s.ch != -1 && !s.isEol() {
		s.nextChar()
	}
	// skip subsequent newline chars
	for s.ch != -1 && s.isEol() {
		s.nextChar()
	}
}

// Reads next Unicode char.
func (s *byteScanner) nextChar() bool {
	if s.pos.Offset >= len(s.buf) {
		s.ch = -1
		return false
	}
	s.pos.Column++
	if s.ch == '\n' {
		s.pos.Line++
		s.pos.Column = 1
	}
	r, w := rune(s.buf[s.pos.Offset]), 1
	if r >= 0x80 {
		r, w = utf8.DecodeRune(s.buf[s.pos.Offset:])
		if r == utf8.RuneError && w == 1 {
			s.error(s.pos, "illegal UTF-8 encoding")
		}
	}
	s.pos.Column++
	s.pos.Offset += w
	s.ch = r
	return true
}
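Since byteScanner is unexported, it can only be exercised from inside the package. The following hypothetical in-package test (not part of this commit) sketches what the scanner emits: keys split at the first ":" on a line, values kept as-is (lowercasing is left to the parser), and newlines emitted as their own "\n" tokens.

package robotstxt

import (
	"reflect"
	"testing"
)

// TestScanAllSketch is an illustrative sketch, not part of the vendored
// library: it shows how scanAll splits "key: value" lines into separate
// key and value tokens with "\n" separators between lines.
func TestScanAllSketch(t *testing.T) {
	sc := newByteScanner("test", true)
	sc.feed([]byte("User-agent: *\nDisallow: /private\n"), true)

	got := sc.scanAll()
	want := []string{"User-agent", "*", "\n", "Disallow", "/private", "\n"}
	if !reflect.DeepEqual(got, want) {
		t.Fatalf("got %q, want %q", got, want)
	}
}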