Text/status parsing fixes (#141)

* aaaaaa

* vendor minify

* update + test markdown parsing
This commit is contained in:
Tobi Smethurst 2021-08-16 19:17:56 +02:00 committed by GitHub
commit ce190d867c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
56 changed files with 7390 additions and 45 deletions

98
vendor/github.com/tdewolff/parse/v2/html/README.md generated vendored Normal file
View file

@ -0,0 +1,98 @@
# HTML [![API reference](https://img.shields.io/badge/godoc-reference-5272B4)](https://pkg.go.dev/github.com/tdewolff/parse/v2/html?tab=doc)
This package is an HTML5 lexer written in [Go][1]. It follows the specification at [The HTML syntax](http://www.w3.org/TR/html5/syntax.html). The lexer takes an io.Reader and converts it into tokens until the EOF.
## Installation
Run the following command
go get -u github.com/tdewolff/parse/v2/html
or add the following import and run project with `go get`
import "github.com/tdewolff/parse/v2/html"
## Lexer
### Usage
The following initializes a new Lexer with io.Reader `r`:
``` go
l := html.NewLexer(parse.NewInput(r))
```
To tokenize until EOF or an error, use:
``` go
for {
tt, data := l.Next()
switch tt {
case html.ErrorToken:
// error or EOF set in l.Err()
return
case html.StartTagToken:
// ...
for {
ttAttr, dataAttr := l.Next()
if ttAttr != html.AttributeToken {
break
}
// ...
}
// ...
}
}
```
All tokens:
``` go
ErrorToken TokenType = iota // extra token when errors occur
CommentToken
DoctypeToken
StartTagToken
StartTagCloseToken
StartTagVoidToken
EndTagToken
AttributeToken
TextToken
```
### Examples
``` go
package main
import (
"fmt"
"io"
"os"
"github.com/tdewolff/parse/v2"
"github.com/tdewolff/parse/v2/html"
)
// Tokenize HTML from stdin.
func main() {
l := html.NewLexer(parse.NewInput(os.Stdin))
for {
tt, data := l.Next()
switch tt {
case html.ErrorToken:
if l.Err() != io.EOF {
fmt.Println("Error on line", l.Line(), ":", l.Err())
}
return
case html.StartTagToken:
fmt.Println("Tag", string(data))
for {
ttAttr, dataAttr := l.Next()
if ttAttr != html.AttributeToken {
break
}
key := dataAttr
val := l.AttrVal()
fmt.Println("Attribute", string(key), "=", string(val))
}
// ...
}
}
}
```
## License
Released under the [MIT license](https://github.com/tdewolff/parse/blob/master/LICENSE.md).
[1]: http://golang.org/ "Go Language"

81
vendor/github.com/tdewolff/parse/v2/html/hash.go generated vendored Normal file
View file

@ -0,0 +1,81 @@
package html
// generated by hasher -type=Hash -file=hash.go; DO NOT EDIT, except for adding more constants to the list and rerun go generate
// uses github.com/tdewolff/hasher
//go:generate hasher -type=Hash -file=hash.go
// Hash is a perfect hash of one of a fixed set of tag names. The upper bits
// hold the offset of the name inside _Hash_text and the low byte holds its
// length.
type Hash uint32

// Hash values for every known name; use these instead of comparing strings.
const (
	Iframe    Hash = 0x6    // iframe
	Math      Hash = 0x604  // math
	Plaintext Hash = 0x1e09 // plaintext
	Script    Hash = 0xa06  // script
	Style     Hash = 0x1405 // style
	Svg       Hash = 0x1903 // svg
	Textarea  Hash = 0x2308 // textarea
	Title     Hash = 0xf05  // title
	Xmp       Hash = 0x1c03 // xmp
)

// String returns the name that the hash stands for, or the empty string when
// the encoded offset and length do not fit inside the name table.
func (i Hash) String() string {
	off, size := uint32(i>>8), uint32(i&0xff)
	end := off + size
	if end > uint32(len(_Hash_text)) {
		return ""
	}
	return _Hash_text[off:end]
}

// ToHash looks up the hash for the name s. The lookup is case sensitive and
// yields zero when s is not one of the known names.
func ToHash(s []byte) Hash {
	if len(s) == 0 || len(s) > _Hash_maxLen {
		return 0
	}

	// FNV-1a style mixing, seeded with the generated start value.
	const prime = 16777619
	sum := uint32(_Hash_hash0)
	for _, b := range s {
		sum ^= uint32(b)
		sum *= prime
	}

	// Every name lives in one of two candidate slots: one selected by the
	// low bits of the sum, the other by bits 16 and up.
	mask := uint32(len(_Hash_table) - 1)
	for _, slot := range [2]uint32{sum & mask, (sum >> 16) & mask} {
		cand := _Hash_table[slot]
		if int(cand&0xff) != len(s) {
			continue
		}
		off := cand >> 8
		if _Hash_text[off:off+(cand&0xff)] == string(s) {
			return cand
		}
	}
	return 0
}

// Generated lookup data: hash seed, longest name length, the overlapping name
// text, and the slot table indexed by the masked hash sum.
const (
	_Hash_hash0  = 0x9acb0442
	_Hash_maxLen = 9
	_Hash_text   = "iframemathscriptitlestylesvgxmplaintextarea"
)

var _Hash_table = [1 << 4]Hash{
	0x0: 0x2308, // textarea
	0x2: 0x6,    // iframe
	0x4: 0xf05,  // title
	0x5: 0x1e09, // plaintext
	0x7: 0x1405, // style
	0x8: 0x604,  // math
	0x9: 0xa06,  // script
	0xa: 0x1903, // svg
	0xb: 0x1c03, // xmp
}

493
vendor/github.com/tdewolff/parse/v2/html/lex.go generated vendored Normal file
View file

@ -0,0 +1,493 @@
// Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.
package html
import (
"strconv"
"github.com/tdewolff/parse/v2"
)
// TokenType identifies the kind of token the lexer has produced, e.g. a start
// tag or a piece of text.
type TokenType uint32

// The token kinds emitted by the lexer.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	CommentToken
	DoctypeToken
	StartTagToken
	StartTagCloseToken
	StartTagVoidToken
	EndTagToken
	AttributeToken
	TextToken
	SvgToken
	MathToken
)

// String returns a human-readable name for the token type. Values outside the
// declared range are rendered as "Invalid(<number>)".
func (tt TokenType) String() string {
	names := [...]string{
		ErrorToken:         "Error",
		CommentToken:       "Comment",
		DoctypeToken:       "Doctype",
		StartTagToken:      "StartTag",
		StartTagCloseToken: "StartTagClose",
		StartTagVoidToken:  "StartTagVoid",
		EndTagToken:        "EndTag",
		AttributeToken:     "Attribute",
		TextToken:          "Text",
		SvgToken:           "Svg",
		MathToken:          "Math",
	}
	if uint32(tt) < uint32(len(names)) {
		return names[tt]
	}
	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}
////////////////////////////////////////////////////////////////
// Lexer is the state for the lexer.
type Lexer struct {
// r is the input that is being tokenized.
r *parse.Input
// err is an error raised by the lexer itself; Err reports it in preference to the input's error.
err error
// rawTag is the hash of the raw-text element (script, style, ...) whose start tag was just lexed, or zero.
// While it is set, Next reads the element's content as a single text token.
rawTag Hash
// inTag is true while lexing inside a start tag, i.e. until its closing `>` or `/>` has been returned.
inTag bool
// text is the value returned by Text for the current token.
text []byte
// attrVal is the value returned by AttrVal for the current attribute token.
attrVal []byte
}
// NewLexer returns a new Lexer that tokenizes the given parse.Input.
func NewLexer(r *parse.Input) *Lexer {
	lexer := new(Lexer)
	lexer.r = r
	return lexer
}
// Err returns the error encountered during lexing. An error raised by the
// lexer itself takes precedence; otherwise the error of the underlying input
// is returned, which is often io.EOF.
func (l *Lexer) Err() error {
	if l.err == nil {
		return l.r.Err()
	}
	return l.err
}
// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
// The value is reset to nil at the start of every call to Next, so it only describes the most recently returned token.
func (l *Lexer) Text() []byte {
return l.text
}
// AttrVal returns the attribute value when an AttributeToken was returned from Next.
// The value starts at the first byte after `=` and any whitespace, so a quoted value includes its quotes.
// It is nil when the attribute has no `=`.
func (l *Lexer) AttrVal() []byte {
return l.attrVal
}
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
// After a StartTagToken, subsequent calls return one AttributeToken per attribute followed by a
// StartTagCloseToken (`>`) or StartTagVoidToken (`/>`).
func (l *Lexer) Next() (TokenType, []byte) {
l.text = nil
var c byte
if l.inTag {
// Inside a start tag: emit the next attribute, or the token that closes the tag.
l.attrVal = nil
for { // before attribute name state
if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
l.r.Move(1)
continue
}
break
}
if c == 0 && l.r.Err() != nil {
// input ended inside the tag
return ErrorToken, nil
} else if c != '>' && (c != '/' || l.r.Peek(1) != '>') {
return AttributeToken, l.shiftAttribute()
}
// NOTE(review): Skip presumably drops the whitespace consumed above so that it is not part of the returned token — confirm against parse.Input.
l.r.Skip()
l.inTag = false
if c == '/' {
l.r.Move(2)
return StartTagVoidToken, l.r.Shift()
}
l.r.Move(1)
return StartTagCloseToken, l.r.Shift()
}
if l.rawTag != 0 {
// The previous start tag opened a raw-text element: its content is one text token.
// rawTag is cleared either way so that empty content falls through to normal lexing.
if rawText := l.shiftRawText(); len(rawText) > 0 {
l.rawTag = 0
return TextToken, rawText
}
l.rawTag = 0
}
for {
c = l.r.Peek(0)
if c == '<' {
c = l.r.Peek(1)
// `</` counts as an end tag only when it is not directly followed by `>` or by the end of the input
isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
if l.r.Pos() > 0 {
if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
// return currently buffered texttoken so that we can return tag next iteration
l.text = l.r.Shift()
return TextToken, l.text
}
} else if isEndTag {
l.r.Move(2)
// only endtags that are not followed by > or EOF arrive here
if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
return CommentToken, l.shiftBogusComment()
}
return EndTagToken, l.shiftEndTag()
} else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
// start tag: the following calls to Next take the inTag branch above
l.r.Move(1)
l.inTag = true
return l.shiftStartTag()
} else if c == '!' {
// comment, CDATA section, doctype or bogus comment
l.r.Move(2)
return l.readMarkup()
} else if c == '?' {
l.r.Move(1)
return CommentToken, l.shiftBogusComment()
}
// any other `<` is treated as plain text and consumed below
} else if c == 0 && l.r.Err() != nil {
// end of input: flush pending text first, report the error on the next call
if l.r.Pos() > 0 {
l.text = l.r.Shift()
return TextToken, l.text
}
return ErrorToken, nil
}
l.r.Move(1)
}
}
////////////////////////////////////////////////////////////////
// The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html
// shiftRawText consumes the content of the raw-text element identified by l.rawTag and returns it.
// For plaintext everything up to the end of the input is consumed. For the other elements the content
// ends just before the matching end tag (compared case-insensitively) or at the end of the input.
func (l *Lexer) shiftRawText() []byte {
if l.rawTag == Plaintext {
// plaintext has no end tag: consume until the input ends
for {
if l.r.Peek(0) == 0 && l.r.Err() != nil {
return l.r.Shift()
}
l.r.Move(1)
}
} else { // RCDATA, RAWTEXT and SCRIPT
for {
c := l.r.Peek(0)
if c == '<' {
if l.r.Peek(1) == '/' {
// possible end tag: read its name and compare it with the element we are in
mark := l.r.Pos()
l.r.Move(2)
for {
if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
break
}
l.r.Move(1)
}
if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice
// matching end tag: rewind to its `<` so the end tag is lexed as its own token
l.r.Rewind(mark)
return l.r.Shift()
}
} else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
// `<!--` inside a script: scan until `-->`, tracking nested <script>...</script> pairs
l.r.Move(4)
inScript := false
for {
c := l.r.Peek(0)
if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
l.r.Move(3)
break
} else if c == '<' {
isEnd := l.r.Peek(1) == '/'
if isEnd {
l.r.Move(2)
} else {
l.r.Move(1)
}
// mark is the position of the first byte of the tag name
mark := l.r.Pos()
for {
if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
break
}
l.r.Move(1)
}
if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice
if !isEnd {
inScript = true
} else {
if !inScript {
// a </script> without a nested <script> ends the raw text; rewind to the `<` of `</`
l.r.Rewind(mark - 2)
return l.r.Shift()
}
inScript = false
}
}
} else if c == 0 && l.r.Err() != nil {
return l.r.Shift()
} else {
l.r.Move(1)
}
}
} else {
l.r.Move(1)
}
} else if c == 0 && l.r.Err() != nil {
return l.r.Shift()
} else {
l.r.Move(1)
}
}
}
}
// readMarkup lexes markup that starts with `<!`, which Next has already consumed.
// It returns a CommentToken for `<!-- ... -->`, a TextToken for `<![CDATA[ ... ]]>`, a DoctypeToken for
// a case-insensitive `<!doctype ...>`, and otherwise a bogus CommentToken. l.text is set to the content
// that follows the opening delimiter.
func (l *Lexer) readMarkup() (TokenType, []byte) {
if l.at('-', '-') {
l.r.Move(2)
for {
// the comment text starts after the 4 bytes of `<!--`
if l.r.Peek(0) == 0 && l.r.Err() != nil {
l.text = l.r.Lexeme()[4:]
return CommentToken, l.r.Shift()
} else if l.at('-', '-', '>') {
l.text = l.r.Lexeme()[4:]
l.r.Move(3)
return CommentToken, l.r.Shift()
} else if l.at('-', '-', '!', '>') {
l.text = l.r.Lexeme()[4:]
l.r.Move(4)
return CommentToken, l.r.Shift()
}
l.r.Move(1)
}
} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
l.r.Move(7)
for {
// the CDATA text starts after the 9 bytes of `<![CDATA[`
if l.r.Peek(0) == 0 && l.r.Err() != nil {
l.text = l.r.Lexeme()[9:]
return TextToken, l.r.Shift()
} else if l.at(']', ']', '>') {
l.text = l.r.Lexeme()[9:]
l.r.Move(3)
return TextToken, l.r.Shift()
}
l.r.Move(1)
}
} else {
if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') {
l.r.Move(7)
if l.r.Peek(0) == ' ' {
l.r.Move(1)
}
for {
if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil {
// the doctype text is everything after the 9 bytes of `<!doctype`
l.text = l.r.Lexeme()[9:]
if c == '>' {
l.r.Move(1)
}
return DoctypeToken, l.r.Shift()
}
l.r.Move(1)
}
}
}
// anything else after `<!` is a bogus comment
return CommentToken, l.shiftBogusComment()
}
// shiftBogusComment consumes a bogus comment up to and including the next `>`,
// or up to the end of the input, and returns the consumed bytes. l.text is set
// to the lexeme without its first two bytes and without the closing `>`.
func (l *Lexer) shiftBogusComment() []byte {
	for {
		switch c := l.r.Peek(0); {
		case c == '>':
			l.text = l.r.Lexeme()[2:]
			l.r.Move(1)
			return l.r.Shift()
		case c == 0 && l.r.Err() != nil:
			l.text = l.r.Lexeme()[2:]
			return l.r.Shift()
		}
		l.r.Move(1)
	}
}
// shiftStartTag reads the tag name of a start tag whose `<` has already been
// consumed. l.text is set to the lowercased name. For svg and math the whole
// element is consumed and returned as a single SvgToken or MathToken; for the
// raw-text elements l.rawTag is recorded so that the next text token holds the
// element's content.
func (l *Lexer) shiftStartTag() (TokenType, []byte) {
	// The name ends at whitespace, `>`, `/>` or the end of the input.
	for {
		c := l.r.Peek(0)
		if c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
			break
		}
		l.r.Move(1)
	}

	l.text = parse.ToLower(l.r.Lexeme()[1:])
	tag := ToHash(l.text)
	switch tag {
	case Svg, Math:
		data := l.shiftXML(tag)
		if l.err != nil {
			return ErrorToken, nil
		}
		l.inTag = false
		if tag == Svg {
			return SvgToken, data
		}
		return MathToken, data
	case Textarea, Title, Style, Xmp, Iframe, Script, Plaintext:
		l.rawTag = tag
	}
	return StartTagToken, l.r.Shift()
}
// shiftAttribute consumes one attribute (name and optional value) and returns its bytes.
// l.text is set to the lowercased attribute name. l.attrVal is set to the value, including quotes
// when quoted, or to nil when the name is not followed by `=`.
func (l *Lexer) shiftAttribute() []byte {
nameStart := l.r.Pos()
var c byte
for { // attribute name state
if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
break
}
l.r.Move(1)
}
nameEnd := l.r.Pos()
for { // after attribute name state
if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
l.r.Move(1)
continue
}
break
}
if c == '=' {
l.r.Move(1)
for { // before attribute value state
if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
l.r.Move(1)
continue
}
break
}
// attrPos is where the value starts, at the opening quote if there is one
attrPos := l.r.Pos()
delim := c
if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
l.r.Move(1)
for {
c := l.r.Peek(0)
if c == delim {
l.r.Move(1)
break
} else if c == 0 && l.r.Err() != nil {
// unterminated quote: the value runs to the end of the input
break
}
l.r.Move(1)
}
} else { // attribute value unquoted state
for {
if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
break
}
l.r.Move(1)
}
}
l.attrVal = l.r.Lexeme()[attrPos:]
} else {
// no value: give back the whitespace that was skipped after the name
l.r.Rewind(nameEnd)
l.attrVal = nil
}
l.text = parse.ToLower(l.r.Lexeme()[nameStart:nameEnd])
return l.r.Shift()
}
// shiftEndTag consumes an end tag whose leading `</` has already been consumed,
// up to and including the closing `>` or up to the end of the input. It returns
// the lowercased bytes of the whole tag. l.text is set to the content between
// `</` and `>` with trailing HTML whitespace removed.
func (l *Lexer) shiftEndTag() []byte {
	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.text = l.r.Lexeme()[2:] // skip the leading `</`
			l.r.Move(1)
			break
		} else if c == 0 && l.r.Err() != nil {
			l.text = l.r.Lexeme()[2:]
			break
		}
		l.r.Move(1)
	}

	// Trim trailing whitespace from the tag name. Form feed is included so
	// that the set matches the whitespace accepted everywhere else in this
	// lexer (space, tab, line feed, carriage return and form feed).
	end := len(l.text)
	for end > 0 {
		if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
			end--
			continue
		}
		break
	}
	l.text = l.text[:end]

	// NOTE(review): ToLower appears to lowercase in place (see the Copy calls
	// elsewhere in this file), so l.text, which aliases the same bytes, ends
	// up lowercased as well — confirm against parse.ToLower.
	return parse.ToLower(l.r.Shift())
}
// shiftXML parses the content of a svg or math tag according to the XML 1.1 specifications, including the tag itself.
// So far we have already parsed `<svg` or `<math`.
// It consumes up to and including the `>` of the matching end tag (name compared case-insensitively) and
// returns the consumed bytes. When a zero byte is hit the bytes consumed so far are returned; if the input
// reports no error for that zero byte, l.err is set to an "unexpected NULL character" error.
func (l *Lexer) shiftXML(rawTag Hash) []byte {
// only double quotes are tracked; `</` inside a double-quoted string does not end the element
inQuote := false
for {
c := l.r.Peek(0)
if c == '"' {
inQuote = !inQuote
l.r.Move(1)
} else if c == '<' && !inQuote && l.r.Peek(1) == '/' {
// possible end tag: read its name and compare it with the element we are in
mark := l.r.Pos()
l.r.Move(2)
for {
if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
break
}
l.r.Move(1)
}
if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice
break
}
} else if c == 0 {
if l.r.Err() == nil {
l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character")
}
return l.r.Shift()
} else {
l.r.Move(1)
}
}
// consume the rest of the end tag up to and including `>`
for {
c := l.r.Peek(0)
if c == '>' {
l.r.Move(1)
break
} else if c == 0 {
if l.r.Err() == nil {
l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character")
}
return l.r.Shift()
}
l.r.Move(1)
}
return l.r.Shift()
}
////////////////////////////////////////////////////////////////
// at reports whether the upcoming input bytes are exactly b.
func (l *Lexer) at(b ...byte) bool {
	for i := 0; i < len(b); i++ {
		if l.r.Peek(i) != b[i] {
			return false
		}
	}
	return true
}
// atCaseInsensitive reports whether the upcoming input bytes match b, where
// each input byte may either equal the wanted byte or lie 'a'-'A' below it.
// It is meant to be called with lowercase ASCII letters in b.
func (l *Lexer) atCaseInsensitive(b ...byte) bool {
	for i := 0; i < len(b); i++ {
		want := b[i]
		if l.r.Peek(i) == want || l.r.Peek(i)+('a'-'A') == want {
			continue
		}
		return false
	}
	return true
}

103
vendor/github.com/tdewolff/parse/v2/html/util.go generated vendored Normal file
View file

@ -0,0 +1,103 @@
package html
// Character references used to escape the quote character inside a quoted
// attribute value.
var (
	singleQuoteEntityBytes = []byte("&#39;")
	doubleQuoteEntityBytes = []byte("&#34;")
)

// EscapeAttrVal returns the attribute value b in a form that can be written
// after `name=`.
//
//   - If b contains no byte that requires quoting and isXML is false, b is
//     returned as is, without quotes.
//   - If orig is b wrapped in a quote character that does not occur in b, orig
//     is returned unchanged.
//   - Otherwise b is wrapped in quotes and every occurrence of the chosen quote
//     character is replaced by its character reference. Double quotes are used
//     for XML or when b has at least as many single as double quotes; single
//     quotes are used otherwise.
//
// The result may alias b, orig or *buf. *buf is a scratch buffer that is
// reused when large enough and replaced by a bigger one otherwise.
func EscapeAttrVal(buf *[]byte, orig, b []byte, isXML bool) []byte {
	singles := 0
	doubles := 0
	unquoted := true
	for _, c := range b {
		if charTable[c] {
			unquoted = false
			if c == '"' {
				doubles++
			} else if c == '\'' {
				singles++
			}
		}
	}
	if unquoted && !isXML {
		return b
	}
	if len(orig) == len(b)+2 && (singles == 0 && orig[0] == '\'' || doubles == 0 && orig[0] == '"') {
		// the original quoting needs no escaping, keep it
		return orig
	}

	// Two bytes for the surrounding quotes, plus four extra bytes for every
	// quote that grows from one byte into a five-byte character reference.
	n := len(b) + 2
	var quote byte
	var escapedQuote []byte
	if singles >= doubles || isXML {
		n += doubles * 4
		quote = '"'
		escapedQuote = doubleQuoteEntityBytes
	} else {
		n += singles * 4
		quote = '\''
		escapedQuote = singleQuoteEntityBytes
	}
	if n > cap(*buf) {
		*buf = make([]byte, 0, n) // maximum size, not actual size
	}
	t := (*buf)[:n] // maximum size, not actual size
	t[0] = quote
	j := 1
	start := 0
	for i, c := range b {
		if c == quote {
			// copy the run before the quote, then the escaped quote
			j += copy(t[j:], b[start:i])
			j += copy(t[j:], escapedQuote)
			start = i + 1
		}
	}
	j += copy(t[j:], b[start:])
	t[j] = quote
	return t[:j+1]
}

// charTable marks the bytes that force an attribute value to be quoted. All
// other bytes, including every non-ASCII byte, are false.
var charTable = [256]bool{
	'\t': true,
	'\n': true,
	'\f': true,
	'\r': true,
	' ':  true,
	'"':  true,
	'\'': true,
	'<':  true,
	'=':  true,
	'>':  true,
	'`':  true,
}