mirror of
				https://github.com/superseriousbusiness/gotosocial.git
				synced 2025-10-30 20:22:25 -05:00 
			
		
		
		
	* [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * pain
		
			
				
	
	
		
			333 lines
		
	
	
	
		
			8 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			333 lines
		
	
	
	
		
			8 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package html2text
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| 	"regexp"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| )
 | |
| 
 | |
// Line break constants.
//
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks()).
const (
	// WIN_LBR is the Windows-style line break.
	WIN_LBR = "\r\n"
	// UNIX_LBR is the Unix-style line break.
	UNIX_LBR = "\n"
)

// legacyLBR is the line break used by HTML2Text. It starts out as WIN_LBR and
// is switched by SetUnixLbr.
var legacyLBR = WIN_LBR
 | |
// Patterns used while scanning tags and entity names.
var (
	// badTagnamesRE matches a lower-cased opening tag whose content must not
	// be copied to the output.
	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)

	// linkTagRE matches the inside of an <a ...> tag carrying an href
	// attribute. Submatch 2, 3 or 4 holds the href value, depending on whether
	// it was single-quoted, double-quoted or unquoted. (?s:.*) lets the
	// attributes before href span several lines.
	linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s)(?s:.*)(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)

	// badLinkHrefRE matches hrefs that must never be emitted. It is applied to
	// the href exactly as written in the document, so it has to ignore case.
	badLinkHrefRE = regexp.MustCompile(`(?i)javascript:`)

	// headersRE matches a lower-cased opening or closing h1..h6 tag.
	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

	// numericEntityRE matches a numeric entity name such as "#8268" or
	// "#x204D"; submatch 1 holds the digits including an optional x prefix.
	numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
)
 | |
| 
 | |
// options collects the settings that control a single conversion.
type options struct {
	// lbr is the line break sequence written to the output.
	lbr string
	// listPrefix, when non-empty, is written right after the line break that
	// starts each <li> item.
	listPrefix string
	// linksInnerText keeps the inner text of <a> elements and appends the href
	// in angle brackets after it.
	linksInnerText bool
}
 | |
| 
 | |
| func newOptions() *options {
 | |
| 	// apply defaults
 | |
| 	return &options{
 | |
| 		lbr: WIN_LBR,
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Option is a functional option
 | |
| type Option func(*options)
 | |
| 
 | |
| // WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
 | |
| func WithUnixLineBreaks() Option {
 | |
| 	return func(o *options) {
 | |
| 		o.lbr = UNIX_LBR
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
 | |
| // Example: click news <http://bit.ly/2n4wXRs>
 | |
| func WithLinksInnerText() Option {
 | |
| 	return func(o *options) {
 | |
| 		o.linksInnerText = true
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // WithListSupportPrefix formats <ul> and <li> lists with the specified prefix
 | |
| func WithListSupportPrefix(prefix string) Option {
 | |
| 	return func(o *options) {
 | |
| 		o.listPrefix = prefix
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // WithListSupport formats <ul> and <li> lists with " - " prefix
 | |
| func WithListSupport() Option {
 | |
| 	return WithListSupportPrefix(" - ")
 | |
| }
 | |
| 
 | |
| func parseHTMLEntity(entName string) (string, bool) {
 | |
| 	if r, ok := entity[entName]; ok {
 | |
| 		return string(r), true
 | |
| 	}
 | |
| 
 | |
| 	if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
 | |
| 		var (
 | |
| 			err    error
 | |
| 			n      int64
 | |
| 			digits = match[1]
 | |
| 		)
 | |
| 
 | |
| 		if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
 | |
| 			n, err = strconv.ParseInt(digits[1:], 16, 64)
 | |
| 		} else {
 | |
| 			n, err = strconv.ParseInt(digits, 10, 64)
 | |
| 		}
 | |
| 
 | |
| 		if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
 | |
| 			return string(rune(n)), true
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return "", false
 | |
| }
 | |
| 
 | |
| // SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
 | |
| // with argument false sets Windows-style line-breaks in output ("\r\n", the default)
 | |
| // Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
 | |
| func SetUnixLbr(b bool) {
 | |
| 	if b {
 | |
| 		legacyLBR = UNIX_LBR
 | |
| 	} else {
 | |
| 		legacyLBR = WIN_LBR
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // HTMLEntitiesToText decodes HTML entities inside a provided
 | |
| // string and returns decoded text
 | |
| func HTMLEntitiesToText(htmlEntsText string) string {
 | |
| 	outBuf := bytes.NewBufferString("")
 | |
| 	inEnt := false
 | |
| 
 | |
| 	for i, r := range htmlEntsText {
 | |
| 		switch {
 | |
| 		case r == ';' && inEnt:
 | |
| 			inEnt = false
 | |
| 			continue
 | |
| 
 | |
| 		case r == '&': //possible html entity
 | |
| 			entName := ""
 | |
| 			isEnt := false
 | |
| 
 | |
| 			// parse the entity name - max 10 chars
 | |
| 			chars := 0
 | |
| 			for _, er := range htmlEntsText[i+1:] {
 | |
| 				if er == ';' {
 | |
| 					isEnt = true
 | |
| 					break
 | |
| 				} else {
 | |
| 					entName += string(er)
 | |
| 				}
 | |
| 
 | |
| 				chars++
 | |
| 				if chars == 10 {
 | |
| 					break
 | |
| 				}
 | |
| 			}
 | |
| 
 | |
| 			if isEnt {
 | |
| 				if ent, isEnt := parseHTMLEntity(entName); isEnt {
 | |
| 					outBuf.WriteString(ent)
 | |
| 					inEnt = true
 | |
| 					continue
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		if !inEnt {
 | |
| 			outBuf.WriteRune(r)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return outBuf.String()
 | |
| }
 | |
| 
 | |
// writeSpace appends a single space to outBuf, unless the buffer is empty or
// already ends with a space.
func writeSpace(outBuf *bytes.Buffer) {
	n := outBuf.Len()
	if n == 0 {
		return
	}
	if outBuf.Bytes()[n-1] == ' ' {
		return
	}
	outBuf.WriteByte(' ')
}
 | |
| 
 | |
| // HTML2Text converts html into a text form
 | |
| func HTML2Text(html string) string {
 | |
| 	var opts []Option
 | |
| 	if legacyLBR == UNIX_LBR {
 | |
| 		opts = append(opts, WithUnixLineBreaks())
 | |
| 	}
 | |
| 	return HTML2TextWithOptions(html, opts...)
 | |
| }
 | |
| 
 | |
| // HTML2TextWithOptions converts html into a text form with additional options
 | |
| func HTML2TextWithOptions(html string, reqOpts ...Option) string {
 | |
| 	opts := newOptions()
 | |
| 	for _, opt := range reqOpts {
 | |
| 		opt(opts)
 | |
| 	}
 | |
| 
 | |
| 	inLen := len(html)
 | |
| 	tagStart := 0
 | |
| 	inEnt := false
 | |
| 	badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
 | |
| 	shouldOutput := true
 | |
| 	// maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
 | |
| 	hrefs := []string{}
 | |
| 	// new line cannot be printed at the beginning or
 | |
| 	// for <p> after a new line created by previous <p></p>
 | |
| 	canPrintNewline := false
 | |
| 
 | |
| 	outBuf := bytes.NewBufferString("")
 | |
| 
 | |
| 	for i, r := range html {
 | |
| 		if inLen > 0 && i == inLen-1 {
 | |
| 			// prevent new line at the end of the document
 | |
| 			canPrintNewline = false
 | |
| 		}
 | |
| 
 | |
| 		switch {
 | |
| 		// skip new lines and spaces adding a single space if not there yet
 | |
| 		case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
 | |
| 			r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
 | |
| 			if shouldOutput && badTagStackDepth == 0 && !inEnt {
 | |
| 				//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
 | |
| 				writeSpace(outBuf)
 | |
| 			}
 | |
| 			continue
 | |
| 
 | |
| 		case r == ';' && inEnt: // end of html entity
 | |
| 			inEnt = false
 | |
| 			continue
 | |
| 
 | |
| 		case r == '&' && shouldOutput: // possible html entity
 | |
| 			entName := ""
 | |
| 			isEnt := false
 | |
| 
 | |
| 			// parse the entity name - max 10 chars
 | |
| 			chars := 0
 | |
| 			for _, er := range html[i+1:] {
 | |
| 				if er == ';' {
 | |
| 					isEnt = true
 | |
| 					break
 | |
| 				} else {
 | |
| 					entName += string(er)
 | |
| 				}
 | |
| 
 | |
| 				chars++
 | |
| 				if chars == 10 {
 | |
| 					break
 | |
| 				}
 | |
| 			}
 | |
| 
 | |
| 			if isEnt {
 | |
| 				if ent, isEnt := parseHTMLEntity(entName); isEnt {
 | |
| 					outBuf.WriteString(ent)
 | |
| 					inEnt = true
 | |
| 					continue
 | |
| 				}
 | |
| 			}
 | |
| 
 | |
| 		case r == '<': // start of a tag
 | |
| 			tagStart = i + 1
 | |
| 			shouldOutput = false
 | |
| 			continue
 | |
| 
 | |
| 		case r == '>': // end of a tag
 | |
| 			shouldOutput = true
 | |
| 			tag := html[tagStart:i]
 | |
| 			tagNameLowercase := strings.ToLower(tag)
 | |
| 
 | |
| 			if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
 | |
| 				outBuf.WriteString(opts.lbr)
 | |
| 			} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
 | |
| 				if opts.listPrefix != "" {
 | |
| 					outBuf.WriteString(opts.lbr + opts.listPrefix)
 | |
| 				} else {
 | |
| 					outBuf.WriteString(opts.lbr)
 | |
| 				}
 | |
| 			} else if headersRE.MatchString(tagNameLowercase) {
 | |
| 				if canPrintNewline {
 | |
| 					outBuf.WriteString(opts.lbr + opts.lbr)
 | |
| 				}
 | |
| 				canPrintNewline = false
 | |
| 			} else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
 | |
| 				// new line
 | |
| 				outBuf.WriteString(opts.lbr)
 | |
| 			} else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
 | |
| 				if canPrintNewline {
 | |
| 					outBuf.WriteString(opts.lbr + opts.lbr)
 | |
| 				}
 | |
| 				canPrintNewline = false
 | |
| 			} else if opts.linksInnerText && tagNameLowercase == "/a" {
 | |
| 				// end of link
 | |
| 				// links can be empty can happen if the link matches the badLinkHrefRE
 | |
| 				if len(hrefs) > 0 {
 | |
| 					outBuf.WriteString(" <")
 | |
| 					outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
 | |
| 					outBuf.WriteString(">")
 | |
| 					hrefs = hrefs[1:]
 | |
| 				}
 | |
| 			} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
 | |
| 				// parse link href
 | |
| 				// add special handling for a tags
 | |
| 				m := linkTagRE.FindStringSubmatch(tag)
 | |
| 				if len(m) == 5 {
 | |
| 					link := m[2]
 | |
| 					if len(link) == 0 {
 | |
| 						link = m[3]
 | |
| 						if len(link) == 0 {
 | |
| 							link = m[4]
 | |
| 						}
 | |
| 					}
 | |
| 
 | |
| 					if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
 | |
| 						hrefs = append(hrefs, link)
 | |
| 					}
 | |
| 				}
 | |
| 			} else if badTagnamesRE.MatchString(tagNameLowercase) {
 | |
| 				// unwanted block
 | |
| 				badTagStackDepth++
 | |
| 
 | |
| 				// if link inner text preservation is not enabled
 | |
| 				// and the current tag is a link tag, parse its href and output that
 | |
| 				if !opts.linksInnerText {
 | |
| 					// parse link href
 | |
| 					m := linkTagRE.FindStringSubmatch(tag)
 | |
| 					if len(m) == 5 {
 | |
| 						link := m[2]
 | |
| 						if len(link) == 0 {
 | |
| 							link = m[3]
 | |
| 							if len(link) == 0 {
 | |
| 								link = m[4]
 | |
| 							}
 | |
| 						}
 | |
| 
 | |
| 						if !badLinkHrefRE.MatchString(link) {
 | |
| 							outBuf.WriteString(HTMLEntitiesToText(link))
 | |
| 						}
 | |
| 					}
 | |
| 				}
 | |
| 			} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
 | |
| 				badTagnamesRE.MatchString(tagNameLowercase[1:]) {
 | |
| 				// end of unwanted block
 | |
| 				badTagStackDepth--
 | |
| 			}
 | |
| 			continue
 | |
| 
 | |
| 		} // switch end
 | |
| 
 | |
| 		if shouldOutput && badTagStackDepth == 0 && !inEnt {
 | |
| 			canPrintNewline = true
 | |
| 			outBuf.WriteRune(r)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return outBuf.String()
 | |
| }
 |