// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenType identifies the type of lexical tokens.
type tokenType int

// String returns a string representation of the token type.
func (t tokenType) String() string {
	return tokenNames[t]
}

// Token represents a token and the corresponding string.
type Token struct {
	Type   tokenType
	Value  string
	Line   int
	Column int
}

// String returns a string representation of the token.
func (t *Token) String() string {
	if len(t.Value) > 10 {
		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
			t.Type, t.Line, t.Column, t.Value)
	}
	return fmt.Sprintf("%s (line: %d, column: %d): %q",
		t.Type, t.Line, t.Column, t.Value)
}

// All tokens -----------------------------------------------------------------

// The complete list of tokens in CSS3.
const (
	// Scanner flags.
	TokenError tokenType = iota
	TokenEOF
	// From now on, only tokens from the CSS specification.
	TokenIdent
	TokenAtKeyword
	TokenString
	TokenHash
	TokenNumber
	TokenPercentage
	TokenDimension
	TokenURI
	TokenUnicodeRange
	TokenCDO
	TokenCDC
	TokenS
	TokenComment
	TokenFunction
	TokenIncludes
	TokenDashMatch
	TokenPrefixMatch
	TokenSuffixMatch
	TokenSubstringMatch
	TokenChar
	TokenBOM
)

// tokenNames maps token types to their names. Used for conversion to string.
var tokenNames = map[tokenType]string{
	TokenError:          "error",
	TokenEOF:            "EOF",
	TokenIdent:          "IDENT",
	TokenAtKeyword:      "ATKEYWORD",
	TokenString:         "STRING",
	TokenHash:           "HASH",
	TokenNumber:         "NUMBER",
	TokenPercentage:     "PERCENTAGE",
	TokenDimension:      "DIMENSION",
	TokenURI:            "URI",
	TokenUnicodeRange:   "UNICODE-RANGE",
	TokenCDO:            "CDO",
	TokenCDC:            "CDC",
	TokenS:              "S",
	TokenComment:        "COMMENT",
	TokenFunction:       "FUNCTION",
	TokenIncludes:       "INCLUDES",
	TokenDashMatch:      "DASHMATCH",
	TokenPrefixMatch:    "PREFIXMATCH",
	TokenSuffixMatch:    "SUFFIXMATCH",
	TokenSubstringMatch: "SUBSTRINGMATCH",
	TokenChar:           "CHAR",
	TokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)

// macros maps macro names to patterns to be expanded.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,

	// urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}]
	// ASCII characters range = `[\u0020-\u007e]`
	// Skip space \u0020 = `[\u0021-\u007e]`
	// Skip quotation mark \u0022 = `[\u0021\u0023-\u007e]`
	// Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]`
	// Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e]`
	// Finally, the left square bracket (\u005b) and right (\u005d) need escaping themselves
	"urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}",
 | 
						|
}
 | 
						|
 | 
						|
// productions maps the list of tokens to patterns to be expanded.
 | 
						|
var productions = map[tokenType]string{
 | 
						|
	// Unused regexps (matched using other methods) are commented out.
 | 
						|
	TokenIdent:        `{ident}`,
 | 
						|
	TokenAtKeyword:    `@{ident}`,
 | 
						|
	TokenString:       `{string}`,
 | 
						|
	TokenHash:         `#{name}`,
 | 
						|
	TokenNumber:       `{num}`,
 | 
						|
	TokenPercentage:   `{num}%`,
 | 
						|
	TokenDimension:    `{num}{ident}`,
 | 
						|
	TokenURI:          `url\({w}(?:{string}|{urlchar}*?){w}\)`,
 | 
						|
	TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
 | 
						|
	//TokenCDO:            `<!--`,
 | 
						|
	TokenCDC:      `-->`,
 | 
						|
	TokenS:        `{wc}+`,
 | 
						|
	TokenComment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
 | 
						|
	TokenFunction: `{ident}\(`,
 | 
						|
	//TokenIncludes:       `~=`,
 | 
						|
	//TokenDashMatch:      `\|=`,
 | 
						|
	//TokenPrefixMatch:    `\^=`,
 | 
						|
	//TokenSuffixMatch:    `\$=`,
 | 
						|
	//TokenSubstringMatch: `\*=`,
 | 
						|
	//TokenChar:           `[^"']`,
 | 
						|
	//TokenBOM:            "\uFEFF",
 | 
						|
}
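// (The commented-out productions are handled by the first-byte shortcuts in
// Next: '<', '~', '|', '^', '$' and '*' go through emitPrefixOrChar, the BOM
// is tested once at position zero, and Char is the final fallback.)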
// matchers maps the list of tokens to compiled regular expressions.
//
// The map is filled on init() using the macros and productions defined in
// the CSS specification.
var matchers = map[tokenType]*regexp.Regexp{}

// matchOrder is the order to test regexps when first-char shortcuts
// can't be used.
var matchOrder = []tokenType{
	TokenURI,
	TokenFunction,
	TokenUnicodeRange,
	TokenIdent,
	TokenDimension,
	TokenPercentage,
	TokenNumber,
	TokenCDC,
}
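// The order above is significant because several productions share prefixes:
// `url(` would also match TokenFunction (`{ident}\(`) and `url` alone would
// match TokenIdent, just as `12px` would match TokenNumber (`{num}`), so the
// longer, more specific productions are tested first.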
func init() {
	// replace macros and compile regexps for productions.
	replaceMacro := func(s string) string {
		return "(?:" + macros[s[1:len(s)-1]] + ")"
	}
	for t, s := range productions {
		for macroRegexp.MatchString(s) {
			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
		}
		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
	}
}
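// As a worked illustration of the expansion above: the TokenPercentage
// production `{num}%` becomes `(?:[0-9]*\.[0-9]+|[0-9]+)%` after macro
// substitution, and is stored in matchers compiled as
// `^(?:(?:[0-9]*\.[0-9]+|[0-9]+)%)`, anchored so it only matches at the
// current scanning position.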
// Scanner --------------------------------------------------------------------

// New returns a new CSS scanner for the given input.
func New(input string) *Scanner {
	// Normalize newlines.
	// https://www.w3.org/TR/css-syntax-3/#input-preprocessing
	input = strings.Replace(input, "\r\n", "\n", -1)
	input = strings.Replace(input, "\r", "\n", -1)
	input = strings.Replace(input, "\f", "\n", -1)
	input = strings.Replace(input, "\u0000", "\ufffd", -1)
	return &Scanner{
		input: input,
		row:   1,
		col:   1,
	}
}
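// A minimal usage sketch (illustrative only): scan a stylesheet and print
// each token until the scanner reports EOF or an error. Next never returns
// nil, and once TokenEOF or TokenError has been emitted the scanner keeps
// returning that same token.
//
//	s := New("a { color: #fff }")
//	for {
//		tok := s.Next()
//		if tok.Type == TokenEOF || tok.Type == TokenError {
//			break
//		}
//		fmt.Println(tok)
//	}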
// Scanner scans an input and emits tokens following the CSS3 specification.
type Scanner struct {
	input string // preprocessed input, scanned from left to right
	pos   int    // current offset into input, in bytes
	row   int    // current line, 1-based
	col   int    // current column, in runes, 1-based
	err   *Token // sticky TokenEOF or TokenError token, once set
}

// Next returns the next token from the input.
//
// At the end of the input the token type is TokenEOF.
//
// If the input can't be tokenized the token type is TokenError. This occurs
// in case of unclosed quotation marks or comments.
func (s *Scanner) Next() *Token {
	if s.err != nil {
		return s.err
	}
	if s.pos >= len(s.input) {
		s.err = &Token{TokenEOF, "", s.row, s.col}
		return s.err
	}
	if s.pos == 0 {
		// Test BOM only once, at the beginning of the file.
		if strings.HasPrefix(s.input, "\uFEFF") {
			return s.emitSimple(TokenBOM, "\uFEFF")
		}
	}
	// There's a lot we can guess based on the first byte so we'll take a
	// shortcut before testing multiple regexps.
	input := s.input[s.pos:]
	switch input[0] {
	case '\t', '\n', ' ':
		// Whitespace.
		return s.emitToken(TokenS, matchers[TokenS].FindString(input))
	case '.':
		// Dot is too common to not have a quick check.
		// We'll test if this is a Char; if it is followed by a number it is a
		// dimension/percentage/number, and this will be matched later.
		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
			return s.emitSimple(TokenChar, ".")
		}
	case '#':
		// Another common one: Hash or Char.
		if match := matchers[TokenHash].FindString(input); match != "" {
			return s.emitToken(TokenHash, match)
		}
		return s.emitSimple(TokenChar, "#")
	case '@':
		// Another common one: AtKeyword or Char.
		if match := matchers[TokenAtKeyword].FindString(input); match != "" {
			return s.emitSimple(TokenAtKeyword, match)
		}
		return s.emitSimple(TokenChar, "@")
	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
		// More common chars.
		return s.emitSimple(TokenChar, string(input[0]))
	case '"', '\'':
		// String or error.
		match := matchers[TokenString].FindString(input)
		if match != "" {
			return s.emitToken(TokenString, match)
		}
		s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col}
		return s.err
	case '/':
		// Comment, error or Char.
		if len(input) > 1 && input[1] == '*' {
			match := matchers[TokenComment].FindString(input)
			if match != "" {
				return s.emitToken(TokenComment, match)
			}
			s.err = &Token{TokenError, "unclosed comment", s.row, s.col}
			return s.err
		}
		return s.emitSimple(TokenChar, "/")
	case '~':
		// Includes or Char.
		return s.emitPrefixOrChar(TokenIncludes, "~=")
	case '|':
		// DashMatch or Char.
		return s.emitPrefixOrChar(TokenDashMatch, "|=")
	case '^':
		// PrefixMatch or Char.
		return s.emitPrefixOrChar(TokenPrefixMatch, "^=")
	case '$':
		// SuffixMatch or Char.
		return s.emitPrefixOrChar(TokenSuffixMatch, "$=")
	case '*':
		// SubstringMatch or Char.
		return s.emitPrefixOrChar(TokenSubstringMatch, "*=")
	case '<':
		// CDO or Char.
		return s.emitPrefixOrChar(TokenCDO, "<!--")
	}
	// Test all regexps, in order.
	for _, token := range matchOrder {
		if match := matchers[token].FindString(input); match != "" {
			return s.emitToken(token, match)
		}
	}
	// We already handled unclosed quotation marks and comments,
	// so this can only be a Char.
	r, width := utf8.DecodeRuneInString(input)
	token := &Token{TokenChar, string(r), s.row, s.col}
	s.col += width
	s.pos += width
	return token
}

// updatePosition updates input coordinates based on the consumed text.
func (s *Scanner) updatePosition(text string) {
	width := utf8.RuneCountInString(text)
	lines := strings.Count(text, "\n")
	s.row += lines
	if lines == 0 {
		s.col += width
	} else {
		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
	}
	s.pos += len(text) // while col is a rune index, pos is a byte index
}
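// For example, consuming "a\nbc" advances row by one and sets col to 3, the
// column of the next unread rune: text[strings.LastIndex(text, "\n"):] is
// "\nbc", whose rune count (3) includes the newline itself in place of the
// implicit column-1 offset.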
// emitToken returns a Token for the string v and updates the scanner position.
func (s *Scanner) emitToken(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.updatePosition(v)
	return token
}

// emitSimple returns a Token for the string v and updates the scanner
// position in a simplified manner.
//
// The string is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitSimple(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.col += len(v)
	s.pos += len(v)
	return token
}

// emitPrefixOrChar returns a Token for type t if the current position
// matches the given prefix. Otherwise it returns a Char token using the
// first character from the prefix.
//
// The prefix is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token {
	if strings.HasPrefix(s.input[s.pos:], prefix) {
		return s.emitSimple(t, prefix)
	}
	return s.emitSimple(TokenChar, string(prefix[0]))
}
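// For example, with the input positioned at "~=" the scanner emits
// TokenIncludes for "~=", while "~" followed by anything else yields
// TokenChar for "~".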