mirror of
				https://github.com/superseriousbusiness/gotosocial.git
				synced 2025-11-03 17:32:27 -06:00 
			
		
		
		
	* [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * pain
		
			
				
	
	
		
			333 lines
		
	
	
	
		
			8 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			333 lines
		
	
	
	
		
			8 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
package html2text
 | 
						|
 | 
						|
import (
 | 
						|
	"bytes"
 | 
						|
	"regexp"
 | 
						|
	"strconv"
 | 
						|
	"strings"
 | 
						|
)
 | 
						|
 | 
						|
// Line-break sequences the converter can write to its output.
//
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
const (
	// WIN_LBR is the Windows-style line break (carriage return + line feed).
	WIN_LBR = "\r\n"

	// UNIX_LBR is the Unix-style line break (line feed only).
	UNIX_LBR = "\n"
)
 | 
						|
 | 
						|
// legacyLBR is the line break used by the legacy HTML2Text entry point.
// It starts out as WIN_LBR and is switched by SetUnixLbr.
var legacyLBR = WIN_LBR
 | 
						|
// Patterns applied while scanning. The tag patterns are matched against the
// text found between '<' and '>'.
var (
	// badTagnamesRE matches a lowercased opening tag whose content is not
	// copied to the output (head, script, style and a).
	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)

	// linkTagRE matches an <a> tag carrying an href attribute. Group 2 holds a
	// single-quoted value, group 3 a double-quoted value and group 4 an
	// unquoted value.
	linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)

	// badLinkHrefRE matches hrefs that must never be emitted. The href is taken
	// from the tag in its original case, so the match has to be
	// case-insensitive or "JavaScript:" would slip through.
	badLinkHrefRE = regexp.MustCompile(`(?i)javascript:`)

	// headersRE matches a lowercased opening or closing h1..h6 tag.
	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

	// numericEntityRE matches a numeric entity name ("#123" or "#x7b") and
	// captures everything after the '#'.
	numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
)
 | 
						|
 | 
						|
// options holds the settings a single conversion runs with.
type options struct {
	// lbr is the line-break sequence written to the output.
	lbr string

	// linksInnerText keeps the inner text of links and appends the href
	// in angle brackets after it.
	linksInnerText bool

	// listPrefix is written right after the line break that starts each <li>.
	listPrefix string
}
 | 
						|
 | 
						|
func newOptions() *options {
 | 
						|
	// apply defaults
 | 
						|
	return &options{
 | 
						|
		lbr: WIN_LBR,
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// Option is a functional option
 | 
						|
type Option func(*options)
 | 
						|
 | 
						|
// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
 | 
						|
func WithUnixLineBreaks() Option {
 | 
						|
	return func(o *options) {
 | 
						|
		o.lbr = UNIX_LBR
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
 | 
						|
// Example: click news <http://bit.ly/2n4wXRs>
 | 
						|
func WithLinksInnerText() Option {
 | 
						|
	return func(o *options) {
 | 
						|
		o.linksInnerText = true
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix
 | 
						|
func WithListSupportPrefix(prefix string) Option {
 | 
						|
	return func(o *options) {
 | 
						|
		o.listPrefix = prefix
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// WithListSupport formats <ul> and <li> lists with " - " prefix
 | 
						|
func WithListSupport() Option {
 | 
						|
	return WithListSupportPrefix(" - ")
 | 
						|
}
 | 
						|
 | 
						|
func parseHTMLEntity(entName string) (string, bool) {
 | 
						|
	if r, ok := entity[entName]; ok {
 | 
						|
		return string(r), true
 | 
						|
	}
 | 
						|
 | 
						|
	if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
 | 
						|
		var (
 | 
						|
			err    error
 | 
						|
			n      int64
 | 
						|
			digits = match[1]
 | 
						|
		)
 | 
						|
 | 
						|
		if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
 | 
						|
			n, err = strconv.ParseInt(digits[1:], 16, 64)
 | 
						|
		} else {
 | 
						|
			n, err = strconv.ParseInt(digits, 10, 64)
 | 
						|
		}
 | 
						|
 | 
						|
		if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
 | 
						|
			return string(rune(n)), true
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return "", false
 | 
						|
}
 | 
						|
 | 
						|
// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
 | 
						|
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
 | 
						|
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
 | 
						|
func SetUnixLbr(b bool) {
 | 
						|
	if b {
 | 
						|
		legacyLBR = UNIX_LBR
 | 
						|
	} else {
 | 
						|
		legacyLBR = WIN_LBR
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// HTMLEntitiesToText decodes HTML entities inside a provided
 | 
						|
// string and returns decoded text
 | 
						|
func HTMLEntitiesToText(htmlEntsText string) string {
 | 
						|
	outBuf := bytes.NewBufferString("")
 | 
						|
	inEnt := false
 | 
						|
 | 
						|
	for i, r := range htmlEntsText {
 | 
						|
		switch {
 | 
						|
		case r == ';' && inEnt:
 | 
						|
			inEnt = false
 | 
						|
			continue
 | 
						|
 | 
						|
		case r == '&': //possible html entity
 | 
						|
			entName := ""
 | 
						|
			isEnt := false
 | 
						|
 | 
						|
			// parse the entity name - max 10 chars
 | 
						|
			chars := 0
 | 
						|
			for _, er := range htmlEntsText[i+1:] {
 | 
						|
				if er == ';' {
 | 
						|
					isEnt = true
 | 
						|
					break
 | 
						|
				} else {
 | 
						|
					entName += string(er)
 | 
						|
				}
 | 
						|
 | 
						|
				chars++
 | 
						|
				if chars == 10 {
 | 
						|
					break
 | 
						|
				}
 | 
						|
			}
 | 
						|
 | 
						|
			if isEnt {
 | 
						|
				if ent, isEnt := parseHTMLEntity(entName); isEnt {
 | 
						|
					outBuf.WriteString(ent)
 | 
						|
					inEnt = true
 | 
						|
					continue
 | 
						|
				}
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		if !inEnt {
 | 
						|
			outBuf.WriteRune(r)
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return outBuf.String()
 | 
						|
}
 | 
						|
 | 
						|
// writeSpace appends a single space to outBuf, unless the buffer is empty
// or already ends with a space.
func writeSpace(outBuf *bytes.Buffer) {
	if outBuf.Len() == 0 || bytes.HasSuffix(outBuf.Bytes(), []byte{' '}) {
		return
	}
	outBuf.WriteByte(' ')
}
 | 
						|
 | 
						|
// HTML2Text converts html into a text form
 | 
						|
func HTML2Text(html string) string {
 | 
						|
	var opts []Option
 | 
						|
	if legacyLBR == UNIX_LBR {
 | 
						|
		opts = append(opts, WithUnixLineBreaks())
 | 
						|
	}
 | 
						|
	return HTML2TextWithOptions(html, opts...)
 | 
						|
}
 | 
						|
 | 
						|
// HTML2TextWithOptions converts html into a text form with additional options
 | 
						|
func HTML2TextWithOptions(html string, reqOpts ...Option) string {
 | 
						|
	opts := newOptions()
 | 
						|
	for _, opt := range reqOpts {
 | 
						|
		opt(opts)
 | 
						|
	}
 | 
						|
 | 
						|
	inLen := len(html)
 | 
						|
	tagStart := 0
 | 
						|
	inEnt := false
 | 
						|
	badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
 | 
						|
	shouldOutput := true
 | 
						|
	// maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
 | 
						|
	hrefs := []string{}
 | 
						|
	// new line cannot be printed at the beginning or
 | 
						|
	// for <p> after a new line created by previous <p></p>
 | 
						|
	canPrintNewline := false
 | 
						|
 | 
						|
	outBuf := bytes.NewBufferString("")
 | 
						|
 | 
						|
	for i, r := range html {
 | 
						|
		if inLen > 0 && i == inLen-1 {
 | 
						|
			// prevent new line at the end of the document
 | 
						|
			canPrintNewline = false
 | 
						|
		}
 | 
						|
 | 
						|
		switch {
 | 
						|
		// skip new lines and spaces adding a single space if not there yet
 | 
						|
		case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
 | 
						|
			r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
 | 
						|
			if shouldOutput && badTagStackDepth == 0 && !inEnt {
 | 
						|
				//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
 | 
						|
				writeSpace(outBuf)
 | 
						|
			}
 | 
						|
			continue
 | 
						|
 | 
						|
		case r == ';' && inEnt: // end of html entity
 | 
						|
			inEnt = false
 | 
						|
			continue
 | 
						|
 | 
						|
		case r == '&' && shouldOutput: // possible html entity
 | 
						|
			entName := ""
 | 
						|
			isEnt := false
 | 
						|
 | 
						|
			// parse the entity name - max 10 chars
 | 
						|
			chars := 0
 | 
						|
			for _, er := range html[i+1:] {
 | 
						|
				if er == ';' {
 | 
						|
					isEnt = true
 | 
						|
					break
 | 
						|
				} else {
 | 
						|
					entName += string(er)
 | 
						|
				}
 | 
						|
 | 
						|
				chars++
 | 
						|
				if chars == 10 {
 | 
						|
					break
 | 
						|
				}
 | 
						|
			}
 | 
						|
 | 
						|
			if isEnt {
 | 
						|
				if ent, isEnt := parseHTMLEntity(entName); isEnt {
 | 
						|
					outBuf.WriteString(ent)
 | 
						|
					inEnt = true
 | 
						|
					continue
 | 
						|
				}
 | 
						|
			}
 | 
						|
 | 
						|
		case r == '<': // start of a tag
 | 
						|
			tagStart = i + 1
 | 
						|
			shouldOutput = false
 | 
						|
			continue
 | 
						|
 | 
						|
		case r == '>': // end of a tag
 | 
						|
			shouldOutput = true
 | 
						|
			tag := html[tagStart:i]
 | 
						|
			tagNameLowercase := strings.ToLower(tag)
 | 
						|
 | 
						|
			if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
 | 
						|
				outBuf.WriteString(opts.lbr)
 | 
						|
			} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
 | 
						|
				if opts.listPrefix != "" {
 | 
						|
					outBuf.WriteString(opts.lbr + opts.listPrefix)
 | 
						|
				} else {
 | 
						|
					outBuf.WriteString(opts.lbr)
 | 
						|
				}
 | 
						|
			} else if headersRE.MatchString(tagNameLowercase) {
 | 
						|
				if canPrintNewline {
 | 
						|
					outBuf.WriteString(opts.lbr + opts.lbr)
 | 
						|
				}
 | 
						|
				canPrintNewline = false
 | 
						|
			} else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
 | 
						|
				// new line
 | 
						|
				outBuf.WriteString(opts.lbr)
 | 
						|
			} else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
 | 
						|
				if canPrintNewline {
 | 
						|
					outBuf.WriteString(opts.lbr + opts.lbr)
 | 
						|
				}
 | 
						|
				canPrintNewline = false
 | 
						|
			} else if opts.linksInnerText && tagNameLowercase == "/a" {
 | 
						|
				// end of link
 | 
						|
				// links can be empty can happen if the link matches the badLinkHrefRE
 | 
						|
				if len(hrefs) > 0 {
 | 
						|
					outBuf.WriteString(" <")
 | 
						|
					outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
 | 
						|
					outBuf.WriteString(">")
 | 
						|
					hrefs = hrefs[1:]
 | 
						|
				}
 | 
						|
			} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
 | 
						|
				// parse link href
 | 
						|
				// add special handling for a tags
 | 
						|
				m := linkTagRE.FindStringSubmatch(tag)
 | 
						|
				if len(m) == 5 {
 | 
						|
					link := m[2]
 | 
						|
					if len(link) == 0 {
 | 
						|
						link = m[3]
 | 
						|
						if len(link) == 0 {
 | 
						|
							link = m[4]
 | 
						|
						}
 | 
						|
					}
 | 
						|
 | 
						|
					if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
 | 
						|
						hrefs = append(hrefs, link)
 | 
						|
					}
 | 
						|
				}
 | 
						|
			} else if badTagnamesRE.MatchString(tagNameLowercase) {
 | 
						|
				// unwanted block
 | 
						|
				badTagStackDepth++
 | 
						|
 | 
						|
				// if link inner text preservation is not enabled
 | 
						|
				// and the current tag is a link tag, parse its href and output that
 | 
						|
				if !opts.linksInnerText {
 | 
						|
					// parse link href
 | 
						|
					m := linkTagRE.FindStringSubmatch(tag)
 | 
						|
					if len(m) == 5 {
 | 
						|
						link := m[2]
 | 
						|
						if len(link) == 0 {
 | 
						|
							link = m[3]
 | 
						|
							if len(link) == 0 {
 | 
						|
								link = m[4]
 | 
						|
							}
 | 
						|
						}
 | 
						|
 | 
						|
						if !badLinkHrefRE.MatchString(link) {
 | 
						|
							outBuf.WriteString(HTMLEntitiesToText(link))
 | 
						|
						}
 | 
						|
					}
 | 
						|
				}
 | 
						|
			} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
 | 
						|
				badTagnamesRE.MatchString(tagNameLowercase[1:]) {
 | 
						|
				// end of unwanted block
 | 
						|
				badTagStackDepth--
 | 
						|
			}
 | 
						|
			continue
 | 
						|
 | 
						|
		} // switch end
 | 
						|
 | 
						|
		if shouldOutput && badTagStackDepth == 0 && !inEnt {
 | 
						|
			canPrintNewline = true
 | 
						|
			outBuf.WriteRune(r)
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return outBuf.String()
 | 
						|
}
 |