[bugfix] Use better plaintext representation of status for filtering

This commit is contained in:
tobi 2024-09-15 16:12:57 +02:00
commit 03251e311b
25 changed files with 3316 additions and 57 deletions

View file

@ -45,7 +45,7 @@ steps:
go test
-failfast
-timeout=20m
-tags "netgo osusergo static_build kvformat timetzdata"
-tags "netgo osusergo static_build kvformat timetzdata purego"
./...
- ./test/envparsing.sh
- ./test/swagger.sh
@ -207,6 +207,6 @@ steps:
---
kind: signature
hmac: f4008d87e4e5b67251eb89f255c1224e6ab5818828cab24fc319b8f829176058
hmac: 3f3a24557b67760dd0c4091eaaed4842b0545f5aa65f90ce70d5e45da23c5260
...

View file

@ -27,6 +27,7 @@ builds:
- static_build
- kvformat
- timetzdata
- purego
- >-
{{ if and (index .Env "DEBUG") (.Env.DEBUG) }}debugenv{{ end }}
- >-

2
go.mod
View file

@ -28,6 +28,7 @@ require (
github.com/DmitriyVTitov/size v1.5.0
github.com/KimMachineGun/automemlimit v0.6.1
github.com/buckket/go-blurhash v1.1.0
github.com/cespare/xxhash v1.1.0 // NOTE(review): the typeutils package imports github.com/cespare/xxhash/v2; confirm this v1 requirement is actually needed
github.com/coreos/go-oidc/v3 v3.11.0
github.com/gin-contrib/cors v1.7.2
github.com/gin-contrib/gzip v1.0.1
@ -40,6 +41,7 @@ require (
github.com/gorilla/feeds v1.2.0
github.com/gorilla/websocket v1.5.2
github.com/jackc/pgx/v5 v5.6.0
github.com/k3a/html2text v1.2.1
github.com/microcosm-cc/bluemonday v1.0.27
github.com/miekg/dns v1.1.62
github.com/minio/minio-go/v7 v7.0.76

8
go.sum
View file

@ -98,6 +98,8 @@ github.com/Masterminds/semver/v3 v3.2.1 h1:RN9w6+7QoMeJVGyfmbcgs28Br8cvmnucEXnY0
github.com/Masterminds/semver/v3 v3.2.1/go.mod h1:qvl/7zhW3nngYb5+80sSMF+FG2BjYrf8m9wsX0PNOMQ=
github.com/Masterminds/sprig/v3 v3.2.3 h1:eL2fZNezLomi0uOLqjQoN6BfsDD+fyLtgbJMAj9n6YA=
github.com/Masterminds/sprig/v3 v3.2.3/go.mod h1:rXcFaZ2zZbLRJv/xSysmlgIM1u11eBaRMhvYXJNkGuM=
github.com/OneOfOne/xxhash v1.2.2 h1:KMrpdQIwFcEqXDklaen+P1axHaj9BSKzvpUUfnHldSE=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/ajg/form v1.5.1 h1:t9c7v8JUKu/XxOGBU0yjNpaMloxGEJhUkqFRq0ibGeU=
github.com/ajg/form v1.5.1/go.mod h1:uL1WgH+h2mgNtvBq0339dVnzXdBETtL2LeUXaIv25UY=
github.com/andybalholm/brotli v1.0.0/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y=
@ -118,6 +120,8 @@ github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
@ -384,6 +388,8 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88/go.mod h1:3w7q1U84EfirKl04SVQ/s7nPm1ZPhiXd34z40TNz36k=
github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY=
github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/klauspost/compress v1.10.10/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
@ -506,6 +512,8 @@ github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIK
github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo=
github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0=
github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72 h1:qLC7fQah7D6K1B0ujays3HV9gkFtllcxhzImRR7ArPQ=
github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8=
github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY=
github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=

View file

@ -18,26 +18,37 @@
package typeutils
import (
"log"
"sync"
"time"
"codeberg.org/gruf/go-cache/v3"
"github.com/superseriousbusiness/gotosocial/internal/filter/interaction"
"github.com/superseriousbusiness/gotosocial/internal/filter/visibility"
"github.com/superseriousbusiness/gotosocial/internal/state"
)
// Converter carries the state shared by this package's conversion methods.
type Converter struct {
	state          *state.State
	defaultAvatars []string
	randAvatars    sync.Map
	visFilter      *visibility.Filter
	intFilter      *interaction.Filter

	// statusHashesToFilterableText caches the plaintext that status
	// filters are matched against, keyed by the hash of the status
	// (see StatusHash), so the text is not recomputed on every
	// conversion of the same status.
	statusHashesToFilterableText cache.TTLCache[string, string]
}
// NewConverter returns a Converter that uses the given state, with its
// default avatars, visibility / interaction filters, and filterable-text
// cache initialized.
//
// It panics if the filterable-text cache cannot be started.
func NewConverter(state *state.State) *Converter {
	// NOTE(review): arguments are presumably (initial length, capacity, ttl);
	// confirm against the go-cache documentation.
	statusHashesToFilterableText := cache.NewTTL[string, string](0, 512, 0)
	statusHashesToFilterableText.SetTTL(time.Hour, true)
	if !statusHashesToFilterableText.Start(time.Minute) {
		// The standard library log.Panic formats its arguments like
		// fmt.Print, so only the message itself is passed; a leading
		// nil would be rendered as "<nil>" in front of the message.
		log.Panic("failed to start statusHashesToFilterableText cache")
	}

	return &Converter{
		state:                        state,
		defaultAvatars:               populateDefaultAvatars(),
		visFilter:                    visibility.NewFilter(state),
		intFilter:                    interaction.NewFilter(state),
		statusHashesToFilterableText: statusHashesToFilterableText,
	}
}

View file

@ -35,7 +35,6 @@ import (
"github.com/superseriousbusiness/gotosocial/internal/language"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/media"
"github.com/superseriousbusiness/gotosocial/internal/text"
"github.com/superseriousbusiness/gotosocial/internal/uris"
"github.com/superseriousbusiness/gotosocial/internal/util"
)
@ -939,8 +938,18 @@ func (c *Converter) statusToAPIFilterResults(
return nil, nil
}
// Extract text fields from the status that we will match filters against.
fields := filterableTextFields(s)
// Derive a hash of this status.
statusHash := StatusHash(s)
// Check if we have the filterable
// text stored already for this hash.
statusText, stored := c.statusHashesToFilterableText.Get(statusHash)
if !stored {
// We don't have this filterable text
// cached, calculate + cache it now.
statusText = filterableText(s)
c.statusHashesToFilterableText.Set(statusHash, statusText)
}
// Record all matching warn filters and the reasons they matched.
filterResults := make([]apimodel.FilterResult, 0, len(filters))
@ -956,14 +965,7 @@ func (c *Converter) statusToAPIFilterResults(
// List all matching keywords.
keywordMatches := make([]string, 0, len(filter.Keywords))
for _, filterKeyword := range filter.Keywords {
var isMatch bool
for _, field := range fields {
if filterKeyword.Regexp.MatchString(field) {
isMatch = true
break
}
}
if isMatch {
if filterKeyword.Regexp.MatchString(statusText) {
keywordMatches = append(keywordMatches, filterKeyword.Keyword)
}
}
@ -1001,40 +1003,6 @@ func (c *Converter) statusToAPIFilterResults(
return filterResults, nil
}
// filterableTextFields gathers every non-empty piece of text on a status
// that a filter keyword may be matched against:
//   - content (passed through text.SanitizeToPlaintext)
//   - content warning
//   - media descriptions
//   - poll options
func filterableTextFields(s *gtsmodel.Status) []string {
	// Upper bound on the result size: content + content warning,
	// plus one entry per attachment and per poll option.
	capacity := len(s.Attachments) + 2
	if s.Poll != nil {
		capacity += len(s.Poll.Options)
	}
	out := make([]string, 0, capacity)

	if s.Content != "" {
		out = append(out, text.SanitizeToPlaintext(s.Content))
	}

	if cw := s.ContentWarning; cw != "" {
		out = append(out, cw)
	}

	for _, media := range s.Attachments {
		if media.Description == "" {
			continue
		}
		out = append(out, media.Description)
	}

	if s.Poll == nil {
		return out
	}
	for _, opt := range s.Poll.Options {
		if opt == "" {
			continue
		}
		out = append(out, opt)
	}
	return out
}
// filterAppliesInContext returns whether a given filter applies in a given context.
func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool {
switch filterContext {

View file

@ -19,6 +19,7 @@ package typeutils
import (
"context"
"encoding/hex"
"fmt"
"math"
"net/url"
@ -27,6 +28,8 @@ import (
"strconv"
"strings"
"github.com/cespare/xxhash/v2"
"github.com/k3a/html2text"
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
"github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
@ -284,3 +287,77 @@ func ContentToContentLanguage(
return contentStr, langTagStr
}
// StatusHash returns a hex-encoded xxhash digest of the
// parts of a status that feed into its filterable text:
//
//   - content warning
//   - content
//   - raw text
//   - media IDs + descriptions
//   - poll options
//
// The fields are written to the hash back-to-back,
// without any delimiter between them.
func StatusHash(s *gtsmodel.Status) string {
	hash := xxhash.New()

	// Content warning / title.
	hash.WriteString(s.ContentWarning)

	// Status content.
	hash.WriteString(s.Content)

	// Raw status text. filterableText prefers this over the
	// HTML content whenever it is set, so it has to be part of
	// the hash: otherwise two statuses with equal content but
	// different raw text would map to the same cached entry.
	// Writing an empty string adds no bytes, so statuses
	// without raw text hash exactly as they did before.
	hash.WriteString(s.Text)

	// Media IDs + descriptions.
	for _, attachment := range s.Attachments {
		hash.WriteString(attachment.ID)
		hash.WriteString(attachment.Description)
	}

	// Poll options.
	if s.Poll != nil {
		for _, option := range s.Poll.Options {
			hash.WriteString(option)
		}
	}

	sum := hash.Sum(nil)
	return hex.EncodeToString(sum)
}
// filterableText builds the single string that status filters
// are matched against, by joining these parts with one space:
//
//   - content warning
//   - content (the raw text if set, otherwise plaintext derived from the HTML)
//   - media descriptions
//   - poll options
//
// Empty parts are kept, so the result may contain
// leading or repeated spaces.
func filterableText(s *gtsmodel.Status) string {
	// One slot each for content warning and content,
	// plus one per media description and poll option.
	size := 2 + len(s.Attachments)
	if s.Poll != nil {
		size += len(s.Poll.Options)
	}
	parts := make([]string, 0, size)

	// Content warning / title.
	parts = append(parts, s.ContentWarning)

	// Status content: raw text wins; the HTML content
	// is only converted when no raw text is stored.
	body := s.Text
	if body == "" {
		body = html2text.HTML2TextWithOptions(
			s.Content,
			html2text.WithLinksInnerText(),
			html2text.WithUnixLineBreaks(),
		)
	}
	parts = append(parts, body)

	// Media descriptions.
	for _, media := range s.Attachments {
		parts = append(parts, media.Description)
	}

	// Poll options.
	if s.Poll != nil {
		for _, opt := range s.Poll.Options {
			parts = append(parts, opt)
		}
	}

	return strings.Join(parts, " ")
}

View file

@ -158,3 +158,90 @@ func TestContentToContentLanguage(t *testing.T) {
}
}
}
// TestFilterableText checks the exact string produced by
// filterableText for a handful of representative statuses.
func TestFilterableText(t *testing.T) {
	type filterableTextCase struct {
		status       *gtsmodel.Status
		expectedText string
	}

	cases := []filterableTextCase{
		{
			status: &gtsmodel.Status{
				ContentWarning: "This is a test status",
				Content:        `<p>Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> instance.</p>`,
			},
			expectedText: `This is a test status Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> instance.`,
		},
		{
			status: &gtsmodel.Status{
				Content: `<p><span class="h-card"><a href="https://example.org/@zlatko" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>zlatko</span></a></span> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)</p><p><a href="https://github.com/superseriousbusiness/gotosocial/pull/2863" rel="nofollow noreferrer noopener" target="_blank">https://github.com/superseriousbusiness/gotosocial/pull/2863</a></p>`,
			},
			expectedText: ` @zlatko <https://example.org/@zlatko> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)
https://github.com/superseriousbusiness/gotosocial/pull/2863 <https://github.com/superseriousbusiness/gotosocial/pull/2863>`,
		},
		{
			status: &gtsmodel.Status{
				Content: `<p>Latest graphs for <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> on <a href="https://github.com/ncruces/go-sqlite3" rel="nofollow noreferrer noopener" target="_blank">Wasm sqlite3</a> with <a href="https://codeberg.org/gruf/go-ffmpreg" rel="nofollow noreferrer noopener" target="_blank">embedded Wasm ffmpeg</a>, both running on <a href="https://wazero.io/" rel="nofollow noreferrer noopener" target="_blank">Wazero</a>, and configured with a <a href="https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266" rel="nofollow noreferrer noopener" target="_blank">50MiB db cache target</a>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.</p>`,
				Attachments: []*gtsmodel.MediaAttachment{
					{
						Description: `Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.`,
					},
				},
			},
			expectedText: ` Latest graphs for #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> on Wasm sqlite3 <https://github.com/ncruces/go-sqlite3> with embedded Wasm ffmpeg <https://codeberg.org/gruf/go-ffmpreg>, both running on Wazero <https://wazero.io/>, and configured with a 50MiB db cache target <https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything. Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.`,
		},
	}

	for idx, tc := range cases {
		got := filterableText(tc.status)
		if got == tc.expectedText {
			continue
		}
		t.Errorf(
			"test %d expected text '%s' got '%s'",
			idx, tc.expectedText, got,
		)
	}
}
// TestStatusHash pins the hex digest that StatusHash
// returns for a handful of representative statuses.
func TestStatusHash(t *testing.T) {
	type statusHashCase struct {
		status       *gtsmodel.Status
		expectedHash string
	}

	cases := []statusHashCase{
		{
			status: &gtsmodel.Status{
				ContentWarning: "This is a test status",
				Content:        `<p>Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> instance.</p>`,
			},
			expectedHash: `8bbb5439dbe62ae0`,
		},
		{
			status: &gtsmodel.Status{
				Content: `<p><span class="h-card"><a href="https://example.org/@zlatko" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>zlatko</span></a></span> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)</p><p><a href="https://github.com/superseriousbusiness/gotosocial/pull/2863" rel="nofollow noreferrer noopener" target="_blank">https://github.com/superseriousbusiness/gotosocial/pull/2863</a></p>`,
			},
			expectedHash: `d039dfb4d04752d5`,
		},
		{
			status: &gtsmodel.Status{
				Content: `<p>Latest graphs for <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> on <a href="https://github.com/ncruces/go-sqlite3" rel="nofollow noreferrer noopener" target="_blank">Wasm sqlite3</a> with <a href="https://codeberg.org/gruf/go-ffmpreg" rel="nofollow noreferrer noopener" target="_blank">embedded Wasm ffmpeg</a>, both running on <a href="https://wazero.io/" rel="nofollow noreferrer noopener" target="_blank">Wazero</a>, and configured with a <a href="https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266" rel="nofollow noreferrer noopener" target="_blank">50MiB db cache target</a>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.</p>`,
				Attachments: []*gtsmodel.MediaAttachment{
					{
						ID:          "01J7TYSH1V5V4DCTVPASH3K9PQ",
						Description: `Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.`,
					},
				},
			},
			expectedHash: `414d975b2ef9d112`,
		},
	}

	for idx, tc := range cases {
		got := StatusHash(tc.status)
		if got == tc.expectedHash {
			continue
		}
		t.Errorf(
			"test %d expected hash '%s' got '%s'",
			idx, tc.expectedHash, got,
		)
	}
}

View file

@ -6,7 +6,7 @@ set -e
# log_exec prints the given command line prefixed with "$ ", then runs it.
log_exec() { echo "$ ${*}"; "$@"; }
# Grab environment variables and set defaults + requirements.
GO_BUILDTAGS="${GO_BUILDTAGS-} netgo osusergo static_build kvformat timetzdata"
GO_BUILDTAGS="${GO_BUILDTAGS-} netgo osusergo static_build kvformat timetzdata purego"
GO_LDFLAGS="${GO_LDFLAGS-} -s -w -extldflags '-static' -X 'main.Version=${VERSION:-$(git describe --tags --abbrev=0)}'"
GO_GCFLAGS=${GO_GCFLAGS-}
@ -17,6 +17,7 @@ GO_GCFLAGS=${GO_GCFLAGS-}
# Available Go build tags, with explanation, followed by benefits of enabling it:
# - kvformat: enables prettier output of log fields (slightly better performance)
# - timetzdata: embed timezone database inside binary (allow setting local time inside Docker containers, at cost of 450KB)
# - purego: disable amd64/arm64 assembly implementation for xxhash (increase portability at marginal performance cost)
# - notracing: disables compiling-in otel tracing support (reduced binary size, better performance)
# - nometrics: disables compiling-in otel metrics support (reduced binary size, better performance)
# - noerrcaller: disables caller function prefix in errors (slightly better performance, at cost of err readability)

22
vendor/github.com/cespare/xxhash/LICENSE.txt generated vendored Normal file
View file

@ -0,0 +1,22 @@
Copyright (c) 2016 Caleb Spare
MIT License
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

50
vendor/github.com/cespare/xxhash/README.md generated vendored Normal file
View file

@ -0,0 +1,50 @@
# xxhash
[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash)
xxhash is a Go implementation of the 64-bit
[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
high-quality hashing algorithm that is much faster than anything in the Go
standard library.
The API is very small, taking its cue from the other hashing packages in the
standard library:
$ go doc github.com/cespare/xxhash
package xxhash // import "github.com/cespare/xxhash"
Package xxhash implements the 64-bit variant of xxHash (XXH64) as described
at http://cyan4973.github.io/xxHash/.
func New() hash.Hash64
func Sum64(b []byte) uint64
func Sum64String(s string) uint64
This implementation provides a fast pure-Go implementation and an even faster
assembly implementation for amd64.
## Benchmarks
Here are some quick benchmarks comparing the pure-Go and assembly
implementations of Sum64 against another popular Go XXH64 implementation,
[github.com/OneOfOne/xxhash](https://github.com/OneOfOne/xxhash):
| input size | OneOfOne | cespare (purego) | cespare |
| --- | --- | --- | --- |
| 5 B | 416 MB/s | 720 MB/s | 872 MB/s |
| 100 B | 3980 MB/s | 5013 MB/s | 5252 MB/s |
| 4 KB | 12727 MB/s | 12999 MB/s | 13026 MB/s |
| 10 MB | 9879 MB/s | 10775 MB/s | 10913 MB/s |
These numbers were generated with:
```
$ go test -benchtime 10s -bench '/OneOfOne,'
$ go test -tags purego -benchtime 10s -bench '/xxhash,'
$ go test -benchtime 10s -bench '/xxhash,'
```
## Projects using this package
- [InfluxDB](https://github.com/influxdata/influxdb)
- [Prometheus](https://github.com/prometheus/prometheus)

14
vendor/github.com/cespare/xxhash/rotate.go generated vendored Normal file
View file

@ -0,0 +1,14 @@
// +build !go1.9
package xxhash
// TODO(caleb): After Go 1.10 comes out, remove this fallback code.
// rol1 rotates x left by 1 bit.
func rol1(x uint64) uint64 { return x<<1 | x>>63 }

// rol7 rotates x left by 7 bits.
func rol7(x uint64) uint64 { return x<<7 | x>>57 }

// rol11 rotates x left by 11 bits.
func rol11(x uint64) uint64 { return x<<11 | x>>53 }

// rol12 rotates x left by 12 bits.
func rol12(x uint64) uint64 { return x<<12 | x>>52 }

// rol18 rotates x left by 18 bits.
func rol18(x uint64) uint64 { return x<<18 | x>>46 }

// rol23 rotates x left by 23 bits.
func rol23(x uint64) uint64 { return x<<23 | x>>41 }

// rol27 rotates x left by 27 bits.
func rol27(x uint64) uint64 { return x<<27 | x>>37 }

// rol31 rotates x left by 31 bits.
func rol31(x uint64) uint64 { return x<<31 | x>>33 }

14
vendor/github.com/cespare/xxhash/rotate19.go generated vendored Normal file
View file

@ -0,0 +1,14 @@
// +build go1.9
package xxhash
import "math/bits"
// rol1 rotates x left by 1 bit.
func rol1(x uint64) uint64 {
	const shift = 1
	return bits.RotateLeft64(x, shift)
}

// rol7 rotates x left by 7 bits.
func rol7(x uint64) uint64 {
	const shift = 7
	return bits.RotateLeft64(x, shift)
}

// rol11 rotates x left by 11 bits.
func rol11(x uint64) uint64 {
	const shift = 11
	return bits.RotateLeft64(x, shift)
}

// rol12 rotates x left by 12 bits.
func rol12(x uint64) uint64 {
	const shift = 12
	return bits.RotateLeft64(x, shift)
}

// rol18 rotates x left by 18 bits.
func rol18(x uint64) uint64 {
	const shift = 18
	return bits.RotateLeft64(x, shift)
}

// rol23 rotates x left by 23 bits.
func rol23(x uint64) uint64 {
	const shift = 23
	return bits.RotateLeft64(x, shift)
}

// rol27 rotates x left by 27 bits.
func rol27(x uint64) uint64 {
	const shift = 27
	return bits.RotateLeft64(x, shift)
}

// rol31 rotates x left by 31 bits.
func rol31(x uint64) uint64 {
	const shift = 31
	return bits.RotateLeft64(x, shift)
}

168
vendor/github.com/cespare/xxhash/xxhash.go generated vendored Normal file
View file

@ -0,0 +1,168 @@
// Package xxhash implements the 64-bit variant of xxHash (XXH64) as described
// at http://cyan4973.github.io/xxHash/.
package xxhash
import (
"encoding/binary"
"hash"
)
// The five 64-bit multiplier constants used by the
// hash rounds and final mixing steps in this package.
const (
	prime1 uint64 = 11400714785074694791
	prime2 uint64 = 14029467366897019727
	prime3 uint64 = 1609587929392839161
	prime4 uint64 = 9650029242287828579
	prime5 uint64 = 2870177450012600261
)

// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where
// possible in the Go code is worth a small (but measurable) performance boost
// by avoiding some MOVQs. Vars are needed for the asm and also are useful for
// convenience in the Go code in a few places where we need to intentionally
// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the
// result overflows a uint64).
var (
	prime1v = prime1
	prime2v = prime2
	prime3v = prime3
	prime4v = prime4
	prime5v = prime5
)
// xxh is the streaming hash state handed out by New.
type xxh struct {
	// The four running accumulators. The assembly writeBlocks
	// loads them from offsets 0, 8, 16 and 24 of the struct, so
	// they must stay the first fields, in this order.
	v1, v2, v3, v4 uint64

	total int      // total number of bytes passed to Write
	mem   [32]byte // buffered bytes of the current, incomplete block
	n     int      // how much of mem is used
}
// New returns a freshly reset hash.Hash64 that computes
// the 64-bit xxHash of the data written to it.
func New() hash.Hash64 {
	x := new(xxh)
	x.Reset()
	return x
}
// Reset puts x back into its initial state: the four accumulators
// get their starting values and the byte counters are cleared.
// The contents of x.mem are left as they are; x.n = 0 marks them unused.
func (x *xxh) Reset() {
	x.v1, x.v2, x.v3, x.v4 = prime1v+prime2, prime2, 0, -prime1v
	x.total, x.n = 0, 0
}
// Size reports the length in bytes of the digest appended by Sum.
func (x *xxh) Size() int {
	const digestBytes = 8
	return digestBytes
}
// BlockSize reports the number of bytes Write consumes per block.
func (x *xxh) BlockSize() int {
	const blockBytes = 32
	return blockBytes
}
// Write adds more data to x. It always returns len(b), nil.
//
// Input is consumed in 32-byte blocks; bytes that do not yet complete
// a block are kept in x.mem (x.n of them) until a later Write fills the
// block or Sum64 folds them into the digest.
func (x *xxh) Write(b []byte) (n int, err error) {
	n = len(b)
	x.total += len(b)

	if x.n+len(b) < 32 {
		// This new data doesn't even fill the current block.
		copy(x.mem[x.n:], b)
		x.n += len(b)
		return
	}

	if x.n > 0 {
		// Finish off the partial block.
		// copy stops at the end of mem, so exactly 32-x.n bytes of b are used.
		copy(x.mem[x.n:], b)
		x.v1 = round(x.v1, u64(x.mem[0:8]))
		x.v2 = round(x.v2, u64(x.mem[8:16]))
		x.v3 = round(x.v3, u64(x.mem[16:24]))
		x.v4 = round(x.v4, u64(x.mem[24:32]))
		// Drop the bytes of b that went into the completed block.
		b = b[32-x.n:]
		x.n = 0
	}

	if len(b) >= 32 {
		// One or more full blocks left.
		b = writeBlocks(x, b)
	}

	// Store any remaining partial block.
	copy(x.mem[:], b)
	x.n = len(b)

	return
}
// Sum appends the current 64-bit digest to b, most significant
// byte first, and returns the extended slice.
func (x *xxh) Sum(b []byte) []byte {
	var digest [8]byte
	binary.BigEndian.PutUint64(digest[:], x.Sum64())
	return append(b, digest[:]...)
}
// Sum64 returns the digest of everything written to x so far.
// It only reads x, so more data can be written afterwards.
func (x *xxh) Sum64() uint64 {
	var h uint64

	if x.total >= 32 {
		// At least one full block was processed:
		// fold the four accumulators into h.
		v1, v2, v3, v4 := x.v1, x.v2, x.v3, x.v4
		h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
		h = mergeRound(h, v1)
		h = mergeRound(h, v2)
		h = mergeRound(h, v3)
		h = mergeRound(h, v4)
	} else {
		// No full block yet; v3 still holds the value Reset gave it.
		h = x.v3 + prime5
	}

	h += uint64(x.total)

	// Consume the buffered tail in x.mem: first 8 bytes at a time,
	// then at most one 4-byte word, then single bytes.
	i, end := 0, x.n
	for ; i+8 <= end; i += 8 {
		k1 := round(0, u64(x.mem[i:i+8]))
		h ^= k1
		h = rol27(h)*prime1 + prime4
	}
	if i+4 <= end {
		h ^= uint64(u32(x.mem[i:i+4])) * prime1
		h = rol23(h)*prime2 + prime3
		i += 4
	}
	for i < end {
		h ^= uint64(x.mem[i]) * prime5
		h = rol11(h) * prime1
		i++
	}

	// Final mixing of the result bits.
	h ^= h >> 33
	h *= prime2
	h ^= h >> 29
	h *= prime3
	h ^= h >> 32

	return h
}
// u64 decodes the first 8 bytes of b as a little-endian uint64.
func u64(b []byte) uint64 {
	le := binary.LittleEndian
	return le.Uint64(b)
}
// u32 decodes the first 4 bytes of b as a little-endian uint32.
func u32(b []byte) uint32 {
	le := binary.LittleEndian
	return le.Uint32(b)
}
func round(acc, input uint64) uint64 {
acc += input * prime2
acc = rol31(acc)
acc *= prime1
return acc
}
// mergeRound folds the accumulator value val into the
// running hash acc and returns the updated hash.
func mergeRound(acc, val uint64) uint64 {
	mixed := round(0, val)
	return (acc^mixed)*prime1 + prime4
}

12
vendor/github.com/cespare/xxhash/xxhash_amd64.go generated vendored Normal file
View file

@ -0,0 +1,12 @@
// +build !appengine
// +build gc
// +build !purego
package xxhash
// Sum64 computes the 64-bit xxHash digest of b.
// The function body is provided by the assembly in xxhash_amd64.s.
//
//go:noescape
func Sum64(b []byte) uint64
// writeBlocks feeds the full 32-byte blocks of b into the accumulators
// v1..v4 of x and returns the unprocessed remainder of b. The body is
// provided by the assembly in xxhash_amd64.s, which does not check the
// length up front: b must hold at least one full block.
func writeBlocks(x *xxh, b []byte) []byte

233
vendor/github.com/cespare/xxhash/xxhash_amd64.s generated vendored Normal file
View file

@ -0,0 +1,233 @@
// +build !appengine
// +build gc
// +build !purego
#include "textflag.h"
// Register allocation:
// AX h
// CX pointer to advance through b
// DX n
// BX loop end
// R8 v1, k1
// R9 v2
// R10 v3
// R11 v4
// R12 tmp
// R13 prime1v
// R14 prime2v
// R15 prime4v
// round reads from and advances the buffer pointer in CX.
// It assumes that R13 has prime1v and R14 has prime2v.
// Net effect: r = rol31(r + load64(CX)*prime2v) * prime1v, then CX += 8.
// R12 is overwritten as scratch.
#define round(r) \
	MOVQ (CX), R12 \
	ADDQ $8, CX \
	IMULQ R14, R12 \
	ADDQ R12, r \
	ROLQ $31, r \
	IMULQ R13, r
// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and R15 has prime4v.
// Net effect: val = rol31(val*prime2v) * prime1v (val is overwritten),
// then acc = (acc ^ val)*prime1v + prime4v.
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ $31, val \
	IMULQ R13, val \
	XORQ val, acc \
	IMULQ R13, acc \
	ADDQ R15, acc
// func Sum64(b []byte) uint64
//
// One-shot digest of b. Uses no local stack frame; the 32 bytes of
// arguments and result are the slice header of b (base at 0, length at 8)
// and the uint64 result at offset 24.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Load fixed primes.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), R15

	// Load slice.
	MOVQ b_base+0(FP), CX
	MOVQ b_len+8(FP), DX
	LEAQ (CX)(DX*1), BX

	// The first loop limit will be len(b)-32.
	SUBQ $32, BX

	// Check whether we have at least one block.
	CMPQ DX, $32
	JLT noBlocks

	// Set up initial state (v1, v2, v3, v4).
	// v1 = prime1v+prime2v, v2 = prime2v, v3 = 0, v4 = -prime1v.
	MOVQ R13, R8
	ADDQ R14, R8
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11

	// Loop until CX > BX.
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE blockLoop

	// h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each accumulator into h.
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes of input: h starts from prime5v.
	MOVQ ·prime5v(SB), AX

afterBlocks:
	// h += len(b)
	ADDQ DX, AX

	// Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
	ADDQ $24, BX

	CMPQ CX, BX
	JG fourByte

wordLoop:
	// Calculate k1.
	MOVQ (CX), R8
	ADDQ $8, CX
	IMULQ R14, R8
	ROLQ $31, R8
	IMULQ R13, R8

	// h = rol27(h ^ k1)*prime1v + prime4v
	XORQ R8, AX
	ROLQ $27, AX
	IMULQ R13, AX
	ADDQ R15, AX

	CMPQ CX, BX
	JLE wordLoop

fourByte:
	// Move the limit to len(b)-4; consume one 4-byte word if it fits.
	ADDQ $4, BX
	CMPQ CX, BX
	JG singles

	// h = rol23(h ^ load32(CX)*prime1v)*prime2v + prime3v
	MOVL (CX), R8
	ADDQ $4, CX
	IMULQ R13, R8
	XORQ R8, AX

	ROLQ $23, AX
	IMULQ R14, AX
	ADDQ ·prime3v(SB), AX

singles:
	// Move the limit to len(b); consume the remaining bytes one by one.
	ADDQ $4, BX
	CMPQ CX, BX
	JGE finalize

singlesLoop:
	// h = rol11(h ^ byte*prime5v) * prime1v
	MOVBQZX (CX), R12
	ADDQ $1, CX
	IMULQ ·prime5v(SB), R12
	XORQ R12, AX

	ROLQ $11, AX
	IMULQ R13, AX

	CMPQ CX, BX
	JL singlesLoop

finalize:
	// Final mixing:
	// h ^= h>>33; h *= prime2v; h ^= h>>29; h *= prime3v; h ^= h>>32.
	MOVQ AX, R12
	SHRQ $33, R12
	XORQ R12, AX
	IMULQ R14, AX
	MOVQ AX, R12
	SHRQ $29, R12
	XORQ R12, AX
	IMULQ ·prime3v(SB), AX
	MOVQ AX, R12
	SHRQ $32, R12
	XORQ R12, AX

	MOVQ AX, ret+24(FP)
	RET
// writeBlocks uses the same registers as above except that it uses AX to store
// the x pointer.
//
// It runs the block rounds over every full 32-byte block of b, stores the
// updated v1..v4 back into x, and returns the unprocessed tail of b.
// Uses no local stack frame; the 56 bytes of arguments and results are
// x (offset 0), the slice header of b (8..31) and the returned slice
// header (32..55).

// func writeBlocks(x *xxh, b []byte) []byte
TEXT ·writeBlocks(SB), NOSPLIT, $0-56
	// Load fixed primes needed for round.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// Load slice.
	MOVQ b_base+8(FP), CX
	MOVQ CX, ret_base+32(FP) // initialize return base pointer; see NOTE below
	MOVQ b_len+16(FP), DX
	LEAQ (CX)(DX*1), BX
	SUBQ $32, BX

	// Load vN from x.
	MOVQ x+0(FP), AX
	MOVQ 0(AX), R8 // v1
	MOVQ 8(AX), R9 // v2
	MOVQ 16(AX), R10 // v3
	MOVQ 24(AX), R11 // v4

	// We don't need to check the loop condition here; this function is
	// always called with at least one block of data to process.
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE blockLoop

	// Copy vN back to x.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// Construct return slice.
	// NOTE: It's important that we don't construct a slice that has a base
	// pointer off the end of the original slice, as in Go 1.7+ this will
	// cause runtime crashes. (See discussion in, for example,
	// https://github.com/golang/go/issues/16772.)
	// Therefore, we calculate the length/cap first, and if they're zero, we
	// keep the old base. This is what the compiler does as well if you
	// write code like
	// b = b[len(b):]

	// New length is 32 - (CX - BX) -> BX+32 - CX.
	ADDQ $32, BX
	SUBQ CX, BX
	JZ afterSetBase

	// Non-empty remainder: it starts where the block loop stopped.
	MOVQ CX, ret_base+32(FP)

afterSetBase:
	MOVQ BX, ret_len+40(FP)
	MOVQ BX, ret_cap+48(FP) // set cap == len
	RET

75
vendor/github.com/cespare/xxhash/xxhash_other.go generated vendored Normal file
View file

@ -0,0 +1,75 @@
// +build !amd64 appengine !gc purego
package xxhash
// Sum64 computes the 64-bit xxHash digest of b.
func Sum64(b []byte) uint64 {
	// A simpler version would be
	// x := New()
	// x.Write(b)
	// return x.Sum64()
	// but this is faster, particularly for small inputs.

	// n keeps the original length; b itself is re-sliced as blocks are consumed.
	n := len(b)
	var h uint64

	if n >= 32 {
		// Same starting accumulator values as (*xxh).Reset.
		v1 := prime1v + prime2
		v2 := prime2
		v3 := uint64(0)
		v4 := -prime1v
		// Consume all full 32-byte blocks, one 8-byte word per accumulator.
		for len(b) >= 32 {
			v1 = round(v1, u64(b[0:8:len(b)]))
			v2 = round(v2, u64(b[8:16:len(b)]))
			v3 = round(v3, u64(b[16:24:len(b)]))
			v4 = round(v4, u64(b[24:32:len(b)]))
			b = b[32:len(b):len(b)]
		}
		// Fold the four accumulators into h.
		h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
		h = mergeRound(h, v1)
		h = mergeRound(h, v2)
		h = mergeRound(h, v3)
		h = mergeRound(h, v4)
	} else {
		// Fewer than 32 bytes of input: no block rounds are run.
		h = prime5
	}

	h += uint64(n)

	// Consume the tail (fewer than 32 bytes): first 8 bytes at a time,
	// then at most one 4-byte word, then single bytes.
	i, end := 0, len(b)
	for ; i+8 <= end; i += 8 {
		k1 := round(0, u64(b[i:i+8:len(b)]))
		h ^= k1
		h = rol27(h)*prime1 + prime4
	}
	if i+4 <= end {
		h ^= uint64(u32(b[i:i+4:len(b)])) * prime1
		h = rol23(h)*prime2 + prime3
		i += 4
	}
	for ; i < end; i++ {
		h ^= uint64(b[i]) * prime5
		h = rol11(h) * prime1
	}

	// Final mixing of the result bits.
	h ^= h >> 33
	h *= prime2
	h ^= h >> 29
	h *= prime3
	h ^= h >> 32

	return h
}
// writeBlocks runs the block rounds over every full 32-byte block at the
// front of b, stores the updated accumulators back into x, and returns
// the unprocessed remainder of b (fewer than 32 bytes).
func writeBlocks(x *xxh, b []byte) []byte {
	// Work on local copies of the accumulators and write them back once.
	acc1, acc2, acc3, acc4 := x.v1, x.v2, x.v3, x.v4
	for ; len(b) >= 32; b = b[32:len(b):len(b)] {
		acc1 = round(acc1, u64(b[0:8:len(b)]))
		acc2 = round(acc2, u64(b[8:16:len(b)]))
		acc3 = round(acc3, u64(b[16:24:len(b)]))
		acc4 = round(acc4, u64(b[24:32:len(b)]))
	}
	x.v1, x.v2, x.v3, x.v4 = acc1, acc2, acc3, acc4
	return b
}

10
vendor/github.com/cespare/xxhash/xxhash_safe.go generated vendored Normal file
View file

@ -0,0 +1,10 @@
// +build appengine
// This file contains the safe implementations of otherwise unsafe-using code.
package xxhash
// Sum64String computes the 64-bit xxHash digest of s.
// The string is copied into a byte slice and hashed with Sum64, so no unsafe
// code is involved.
func Sum64String(s string) uint64 {
	raw := []byte(s)
	return Sum64(raw)
}

30
vendor/github.com/cespare/xxhash/xxhash_unsafe.go generated vendored Normal file
View file

@ -0,0 +1,30 @@
// +build !appengine
// This file encapsulates usage of unsafe.
// xxhash_safe.go contains the safe implementations.
package xxhash
import (
"reflect"
"unsafe"
)
// Sum64String computes the 64-bit xxHash digest of s.
// It may be faster than Sum64([]byte(s)) by avoiding a copy.
//
// TODO(caleb): Consider removing this if an optimization is ever added to make
// it unnecessary: https://golang.org/issue/2205.
//
// TODO(caleb): We still have a function call; we could instead write Go/asm
// copies of Sum64 for strings to squeeze out a bit more speed.
//
// NOTE(review): reflect.SliceHeader and reflect.StringHeader are marked
// deprecated in the Go standard library (Go 1.20+) in favour of
// unsafe.Slice/unsafe.StringData - worth checking when bumping this
// dependency.
func Sum64String(s string) uint64 {
// See https://groups.google.com/d/msg/golang-nuts/dcjzJy-bSpw/tcZYBzQqAQAJ
// for some discussion about this unsafe conversion.
// b starts as a real (nil) slice; its header is then pointed at the string's
// bytes so that no copy is made. b aliases s's memory and must therefore be
// treated as read-only.
var b []byte
bh := (*reflect.SliceHeader)(unsafe.Pointer(&b))
bh.Data = (*reflect.StringHeader)(unsafe.Pointer(&s)).Data
bh.Len = len(s)
bh.Cap = len(s)
return Sum64(b)
}

10
vendor/github.com/k3a/html2text/.travis.yml generated vendored Normal file
View file

@ -0,0 +1,10 @@
language: go
go:
- master
before_install:
- go get github.com/axw/gocov/gocov
- go get github.com/mattn/goveralls
- if ! go get github.com/golang/tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi
script:
- $HOME/gopath/bin/goveralls -service=travis-ci

21
vendor/github.com/k3a/html2text/LICENSE generated vendored Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2017 Mario K3A Hros (www.k3a.me)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

60
vendor/github.com/k3a/html2text/README.md generated vendored Normal file
View file

@ -0,0 +1,60 @@
[![GoDoc](https://godoc.org/github.com/k3a/html2text?status.svg)](https://godoc.org/github.com/k3a/html2text)
[![Build Status](https://travis-ci.org/k3a/html2text.svg?branch=master)](https://travis-ci.org/k3a/html2text)
[![Coverage Status](https://coveralls.io/repos/github/k3a/html2text/badge.svg?branch=master)](https://coveralls.io/github/k3a/html2text?branch=master)
[![Report Card](https://goreportcard.com/badge/github.com/k3a/html2text)](https://goreportcard.com/report/github.com/k3a/html2text)
# html2text
A simple Golang package to convert HTML to plain text (without non-standard dependencies).
It converts HTML tags to text and also parses HTML entities into characters they represent.
A `<head>` section of the HTML document, as well as most other tags are stripped out but
links are properly converted into their href attribute.
It can be used for converting HTML emails into text.
Some tests are installed as well.
Uses semantic versioning and no breaking changes are planned.
Feel free to publish a pull request if you have suggestions for improvement, but please note that the library can now be considered feature-complete and API stable. If you need more than this basic conversion, please use an alternative mentioned at the bottom.
## Install
```bash
go get github.com/k3a/html2text
```
## Usage
```go
package main
import (
"fmt"
"github.com/k3a/html2text"
)
func main() {
html := `<html><head><title>Good</title></head><body><strong>clean</strong> text</body>`
plain := html2text.HTML2Text(html)
fmt.Println(plain)
}
/* Outputs:
clean text
*/
```
To see all features, please look into `html2text_test.go`.
## Alternatives
- https://github.com/jaytaylor/html2text (heavier, with more features)
- https://git.alexwennerberg.com/nanohtml2text (rewrite of this module in Rust)
## License
MIT

2046
vendor/github.com/k3a/html2text/entity.go generated vendored Normal file

File diff suppressed because it is too large Load diff

333
vendor/github.com/k3a/html2text/html2text.go generated vendored Normal file
View file

@ -0,0 +1,333 @@
package html2text
import (
"bytes"
"regexp"
"strconv"
"strings"
)
// Line break constants.
//
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks()).
const (
	WIN_LBR  = "\r\n"
	UNIX_LBR = "\n"
)

// legacyLBR is the line break used by the legacy HTML2Text entry point.
// It is switched by SetUnixLbr.
var legacyLBR = WIN_LBR

// badTagnamesRE matches the (lowercased) content of a tag that opens a block
// whose inner text is suppressed: head, script, style and a.
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)

// linkTagRE matches the content of an <a> tag carrying an href attribute.
// The href value is captured in group 2 (single-quoted), group 3
// (double-quoted) or group 4 (unquoted).
var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)

// badLinkHrefRE matches href values that must not be emitted. URI schemes are
// case-insensitive, so the match ignores case ("JavaScript:" is rejected just
// like "javascript:").
var badLinkHrefRE = regexp.MustCompile(`(?i)javascript:`)

// headersRE matches the (lowercased) content of an opening or closing
// h1..h6 tag.
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

// numericEntityRE matches the name part of a numeric character reference,
// e.g. "#65" or "#x41"; group 1 holds the digits including an optional
// leading x.
var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
// options holds the conversion settings that the functional Option values
// modify.
type options struct {
// lbr is the line break sequence written to the output.
lbr string
// linksInnerText keeps the inner text of links and appends the href in
// angle brackets after it.
linksInnerText bool
// listPrefix is written after the line break that starts each <li> item;
// empty means no prefix.
listPrefix string
}
// newOptions returns the default settings: Windows line breaks, no link
// inner text and no list prefix.
func newOptions() *options {
	defaults := new(options)
	defaults.lbr = WIN_LBR
	return defaults
}
// Option is a functional option: a function that adjusts the conversion
// settings used by HTML2TextWithOptions.
type Option func(*options)
// WithUnixLineBreaks makes the converter emit unix line breaks ("\n") instead
// of the default "\r\n".
func WithUnixLineBreaks() Option {
	useUnix := func(cfg *options) {
		cfg.lbr = UNIX_LBR
	}
	return useUnix
}
// WithLinksInnerText makes the converter keep the inner text of link tags and
// append the href URL in angle brackets after that text.
// Example: click news <http://bit.ly/2n4wXRs>
func WithLinksInnerText() Option {
	keepInnerText := func(cfg *options) {
		cfg.linksInnerText = true
	}
	return keepInnerText
}
// WithListSupportPrefix formats <ul> and <li> lists, writing the given prefix
// in front of every list item.
func WithListSupportPrefix(prefix string) Option {
	setPrefix := func(cfg *options) {
		cfg.listPrefix = prefix
	}
	return setPrefix
}
// WithListSupport formats <ul> and <li> lists using " - " as the item prefix.
func WithListSupport() Option {
	const dashPrefix = " - "
	return WithListSupportPrefix(dashPrefix)
}
// parseHTMLEntity resolves an entity name (the text between '&' and ';') to
// the text it stands for. Named entities are looked up in the entity table;
// numeric references ("#65", "#x41") are decoded when they denote tab, line
// feed, carriage return or a code point above 31. The boolean result reports
// whether the name was recognised.
func parseHTMLEntity(entName string) (string, bool) {
	if named, found := entity[entName]; found {
		return string(named), true
	}

	groups := numericEntityRE.FindStringSubmatch(entName)
	if len(groups) != 2 {
		return "", false
	}

	// A leading x/X selects hexadecimal, otherwise the digits are decimal.
	literal, base := groups[1], 10
	if literal != "" && (literal[0] == 'x' || literal[0] == 'X') {
		literal, base = literal[1:], 16
	}

	code, err := strconv.ParseInt(literal, base, 64)
	if err != nil {
		return "", false
	}
	if code == 9 || code == 10 || code == 13 || code > 31 {
		return string(rune(code)), true
	}
	return "", false
}
// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
func SetUnixLbr(b bool) {
if b {
legacyLBR = UNIX_LBR
} else {
legacyLBR = WIN_LBR
}
}
// HTMLEntitiesToText decodes the HTML entities found in the provided string
// and returns the decoded text. Anything that is not a recognised entity is
// copied through unchanged.
func HTMLEntitiesToText(htmlEntsText string) string {
	var decoded bytes.Buffer
	// skipping is true while the remainder of an already decoded entity
	// (its name and the closing ';') is being consumed.
	skipping := false

	for pos, ch := range htmlEntsText {
		if ch == ';' && skipping {
			skipping = false
			continue
		}

		if ch == '&' { // possible html entity
			// Collect the candidate name; give up after 10 characters
			// without a terminating ';'.
			name := make([]rune, 0, 10)
			terminated := false
			for _, next := range htmlEntsText[pos+1:] {
				if next == ';' {
					terminated = true
					break
				}
				name = append(name, next)
				if len(name) == 10 {
					break
				}
			}

			if terminated {
				if text, ok := parseHTMLEntity(string(name)); ok {
					decoded.WriteString(text)
					skipping = true
					continue
				}
			}
		}

		if !skipping {
			decoded.WriteRune(ch)
		}
	}

	return decoded.String()
}
// writeSpace appends a single space to outBuf unless the buffer is empty or
// already ends with a space.
func writeSpace(outBuf *bytes.Buffer) {
	if outBuf.Len() == 0 {
		return
	}
	written := outBuf.Bytes()
	if written[len(written)-1] == ' ' {
		return
	}
	outBuf.WriteByte(' ')
}
// HTML2Text converts html into a text form
func HTML2Text(html string) string {
var opts []Option
if legacyLBR == UNIX_LBR {
opts = append(opts, WithUnixLineBreaks())
}
return HTML2TextWithOptions(html, opts...)
}
// HTML2TextWithOptions converts html into a text form with additional options.
//
// The input is scanned rune by rune. Whitespace runs collapse into a single
// space, HTML entities are decoded, tags are dropped, and the inner text of
// <head>, <script> and <style> is suppressed. Links are rendered as their
// href by default; with WithLinksInnerText the inner text is kept and the
// href follows in angle brackets. <br>, <p>, headers and list tags produce
// line breaks.
//
// Suppression bookkeeping is kept balanced:
//   - a stray closing tag (e.g. "</style>" without an opening tag) never
//     drives the depth below zero, which would silence the rest of the
//     document;
//   - entities inside a suppressed block are consumed but not written;
//   - with WithLinksInnerText an <a> without href is not treated as a
//     suppressed block, because its </a> is consumed by the link handling
//     and would never decrement the depth again.
func HTML2TextWithOptions(html string, reqOpts ...Option) string {
	opts := newOptions()
	for _, opt := range reqOpts {
		opt(opts)
	}

	inLen := len(html)
	tagStart := 0
	inEnt := false
	badTagStackDepth := 0 // > 0 while inside a suppressed block such as <head>...</head>
	shouldOutput := true
	// hrefs of open <a> tags, written after the tag's inner text
	// (for opts.linksInnerText only)
	var hrefs []string
	// new line cannot be printed at the beginning or
	// for <p> after a new line created by previous <p></p>
	canPrintNewline := false

	outBuf := bytes.NewBufferString("")

	for i, r := range html {
		if inLen > 0 && i == inLen-1 {
			// prevent new line at the end of the document
			canPrintNewline = false
		}

		switch {
		// skip new lines and spaces adding a single space if not there yet
		case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
			r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
			if shouldOutput && badTagStackDepth == 0 && !inEnt {
				writeSpace(outBuf)
			}
			continue

		case r == ';' && inEnt: // end of html entity
			inEnt = false
			continue

		case r == '&' && shouldOutput: // possible html entity
			entName := ""
			isEnt := false

			// parse the entity name - max 10 chars
			chars := 0
			for _, er := range html[i+1:] {
				if er == ';' {
					isEnt = true
					break
				}
				entName += string(er)

				chars++
				if chars == 10 {
					break
				}
			}

			if isEnt {
				if ent, ok := parseHTMLEntity(entName); ok {
					// Inside a suppressed block the entity is skipped like
					// any other text instead of leaking into the output.
					if badTagStackDepth == 0 {
						outBuf.WriteString(ent)
					}
					inEnt = true
					continue
				}
			}
			// not an entity: fall out of the switch so '&' is written as-is

		case r == '<': // start of a tag
			tagStart = i + 1
			shouldOutput = false
			continue

		case r == '>': // end of a tag
			shouldOutput = true
			tag := html[tagStart:i]
			tagNameLowercase := strings.ToLower(tag)

			switch {
			case tagNameLowercase == "/ul" || tagNameLowercase == "/ol":
				outBuf.WriteString(opts.lbr)

			case tagNameLowercase == "li" || tagNameLowercase == "li/":
				// listPrefix is empty unless list support was requested
				outBuf.WriteString(opts.lbr + opts.listPrefix)

			case headersRE.MatchString(tagNameLowercase),
				tagNameLowercase == "p", tagNameLowercase == "/p":
				if canPrintNewline {
					outBuf.WriteString(opts.lbr + opts.lbr)
				}
				canPrintNewline = false

			case tagNameLowercase == "br" || tagNameLowercase == "br/":
				// new line
				outBuf.WriteString(opts.lbr)

			case opts.linksInnerText && tagNameLowercase == "/a":
				// end of link; there may be no pending href, e.g. when the
				// link matched badLinkHrefRE or had no href at all
				if len(hrefs) > 0 {
					outBuf.WriteString(" <")
					outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
					outBuf.WriteString(">")
					hrefs = hrefs[1:]
				}

			case opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase):
				// remember the href so it can be written after the inner text
				if link, ok := anchorHref(tag); ok && !badLinkHrefRE.MatchString(link) {
					hrefs = append(hrefs, link)
				}

			default:
				if m := badTagnamesRE.FindStringSubmatch(tagNameLowercase); m != nil {
					if opts.linksInnerText && m[1] == "a" {
						// <a> without href: keep its inner text. Its </a> is
						// handled by the link-closing case above, so counting
						// it here would leave the depth unbalanced.
						break
					}

					// unwanted block
					badTagStackDepth++

					// if link inner text preservation is not enabled
					// and the current tag is a link tag, output its href
					if !opts.linksInnerText {
						if link, ok := anchorHref(tag); ok && !badLinkHrefRE.MatchString(link) {
							outBuf.WriteString(HTMLEntitiesToText(link))
						}
					}
				} else if strings.HasPrefix(tagNameLowercase, "/") &&
					badTagnamesRE.MatchString(tagNameLowercase[1:]) {
					// end of unwanted block; ignore closing tags that have
					// no matching opening tag
					if badTagStackDepth > 0 {
						badTagStackDepth--
					}
				}
			}
			continue
		} // switch end

		if shouldOutput && badTagStackDepth == 0 && !inEnt {
			canPrintNewline = true
			outBuf.WriteRune(r)
		}
	}

	return outBuf.String()
}

// anchorHref extracts the href value from the content of an <a> tag. The
// boolean result is false when the tag is not a link with an href attribute;
// the returned value may be empty (e.g. href="").
func anchorHref(tag string) (string, bool) {
	m := linkTagRE.FindStringSubmatch(tag)
	if len(m) != 5 {
		return "", false
	}
	// groups 2..4: single-quoted, double-quoted and unquoted value
	for _, candidate := range m[2:] {
		if candidate != "" {
			return candidate, true
		}
	}
	return "", true
}

6
vendor/modules.txt vendored
View file

@ -140,6 +140,9 @@ github.com/bytedance/sonic/loader/internal/rt
# github.com/cenkalti/backoff/v4 v4.3.0
## explicit; go 1.18
github.com/cenkalti/backoff/v4
# github.com/cespare/xxhash v1.1.0
## explicit
github.com/cespare/xxhash
# github.com/cespare/xxhash/v2 v2.3.0
## explicit; go 1.11
github.com/cespare/xxhash/v2
@ -446,6 +449,9 @@ github.com/josharian/intern
# github.com/json-iterator/go v1.1.12
## explicit; go 1.12
github.com/json-iterator/go
# github.com/k3a/html2text v1.2.1
## explicit; go 1.16
github.com/k3a/html2text
# github.com/klauspost/compress v1.17.9
## explicit; go 1.20
github.com/klauspost/compress