rename some of the text functions for clarity

This commit is contained in:
tobi 2025-03-07 11:40:44 +01:00
commit c8556cfd97
16 changed files with 145 additions and 124 deletions

View file

@ -339,7 +339,7 @@ func NormalizeIncomingName(item WithName, rawJSON map[string]interface{}) {
//
// todo: We probably want to update this to allow
// *escaped* HTML markup, but for now just nuke it.
name = text.RemoveHTML(name)
name = text.StripHTMLFromText(name)
// Set normalized name property from the raw string; this
// will replace any existing name property on the item.

View file

@ -67,7 +67,7 @@ func OGBase(instance *apimodel.InstanceV1) *OGMeta {
}
og := &OGMeta{
Title: text.RemoveHTML(instance.Title) + " - GoToSocial",
Title: text.StripHTMLFromText(instance.Title) + " - GoToSocial",
Type: "website",
Locale: locale,
URL: instance.URI,
@ -161,7 +161,7 @@ func AccountTitle(account *apimodel.WebAccount, accountDomain string) string {
// ParseDescription returns a string description which is
// safe to use as a template.HTMLAttr inside templates.
func ParseDescription(in string) string {
i := text.RemoveHTML(in)
i := text.StripHTMLFromText(in)
i = strings.ReplaceAll(i, "\n", " ")
i = strings.Join(strings.Fields(i), " ")
i = html.EscapeString(i)

View file

@ -97,8 +97,8 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
return nil, gtserror.NewErrorBadRequest(err, err.Error())
}
// Parse new display name (always from plaintext).
account.DisplayName = text.RemoveHTML(displayName)
// HTML tags not allowed in display name.
account.DisplayName = text.StripHTMLFromText(displayName)
acctColumns = append(acctColumns, "display_name")
}
@ -145,7 +145,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
}
if form.AvatarDescription != nil {
desc := text.RemoveHTML(*form.AvatarDescription)
desc := text.StripHTMLFromText(*form.AvatarDescription)
form.AvatarDescription = &desc
}
@ -175,7 +175,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
}
if form.HeaderDescription != nil {
desc := text.RemoveHTML(*form.HeaderDescription)
desc := text.StripHTMLFromText(*form.HeaderDescription)
form.HeaderDescription = util.Ptr(desc)
}
@ -265,7 +265,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
return nil, gtserror.NewErrorBadRequest(err, err.Error())
}
account.Settings.CustomCSS = text.RemoveHTML(customCSS)
account.Settings.CustomCSS = text.StripHTMLFromText(customCSS)
settingsColumns = append(settingsColumns, "custom_css")
}
@ -356,8 +356,8 @@ func (p *Processor) updateFields(
// Sanitize raw field values.
fieldRaw := &gtsmodel.Field{
Name: text.RemoveHTML(name),
Value: text.RemoveHTML(value),
Name: text.StripHTMLFromText(name),
Value: text.StripHTMLFromText(value),
}
fieldsRaw = append(fieldsRaw, fieldRaw)
}

View file

@ -53,8 +53,8 @@ func (p *Processor) createDomainAllow(
ID: id.NewULID(),
Domain: domain,
CreatedByAccountID: adminAcct.ID,
PrivateComment: text.RemoveHTML(privateComment),
PublicComment: text.RemoveHTML(publicComment),
PrivateComment: text.StripHTMLFromText(privateComment),
PublicComment: text.StripHTMLFromText(publicComment),
Obfuscate: &obfuscate,
SubscriptionID: subscriptionID,
}

View file

@ -53,8 +53,8 @@ func (p *Processor) createDomainBlock(
ID: id.NewULID(),
Domain: domain,
CreatedByAccountID: adminAcct.ID,
PrivateComment: text.RemoveHTML(privateComment),
PublicComment: text.RemoveHTML(publicComment),
PrivateComment: text.StripHTMLFromText(privateComment),
PublicComment: text.StripHTMLFromText(publicComment),
Obfuscate: &obfuscate,
SubscriptionID: subscriptionID,
}

View file

@ -165,7 +165,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe
}
// Don't allow html in site title.
instance.Title = text.RemoveHTML(title)
instance.Title = text.StripHTMLFromText(title)
columns = append(columns, "title")
}
@ -235,7 +235,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe
return nil, gtserror.NewErrorBadRequest(err, err.Error())
}
instance.CustomCSS = text.RemoveHTML(customCSS)
instance.CustomCSS = text.StripHTMLFromText(customCSS)
columns = append(columns, []string{"custom_css"}...)
}

View file

@ -87,7 +87,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, media
// processDescription will sanitize and validate the description against server configuration.
func processDescription(description string) (string, gtserror.WithCode) {
description = text.RemoveHTML(description)
description = text.StripHTMLFromText(description)
chars := len([]rune(description))
if min := config.GetMediaDescriptionMinChars(); chars < min {

View file

@ -236,7 +236,7 @@ func (p *Processor) processContent(
// Strip each poll option and format.
//
// For polls just use basic formatting.
option = text.RemoveHTML(option)
option = text.StripHTMLFromText(option)
optionRes := formatInput(p.formatter.FromPlainBasic, option)
// Gather the results of the formatted option.

View file

@ -122,7 +122,7 @@ func (p *Processor) Create(
Username: form.Username,
Email: form.Email,
Password: form.Password,
Reason: text.RemoveHTML(reason),
Reason: text.StripHTMLFromText(reason),
SignUpIP: form.IP,
Locale: form.Locale,
AppID: app.ID,

View file

@ -20,6 +20,7 @@ package text
import (
"bytes"
"context"
gohtml "html"
"strings"
"codeberg.org/gruf/go-byteutil"
@ -193,9 +194,22 @@ func (f *Formatter) fromPlain(
return result
}
// HTMLToPlain parses the given HTML and then outputs
// it to close-as-possible equivalent plaintext.
func HTMLToPlain(html string) string {
// ParseHTMLToPlain parses the given HTML string, then
// outputs it to equivalent plaintext while trying to
// keep as much of the semantic intent of the input
// HTML as possible, ie., titles are placed on separate
// lines, `<br>`s are converted to newlines, text inside
// `<strong>` and `<em>` tags is retained, but without
// emphasis, `<a>` links are unnested and the URL they
// link to is placed in angle brackets next to them,
// lists are replaced with newline-separated indented
// items, etc.
//
// This function is useful when you need to filter on
// HTML and want to avoid catching tags in the filter,
// or when you want to serve something in a plaintext
// format that may contain HTML tags (eg., CWs).
func ParseHTMLToPlain(html string) string {
plain := html2text.HTML2TextWithOptions(
html,
html2text.WithLinksInnerText(),
@ -204,3 +218,21 @@ func HTMLToPlain(html string) string {
)
return strings.TrimSpace(plain)
}
// StripHTMLFromText returns the given text with all HTML
// removed by the strict sanitization policy. No attempt is
// made to keep the semantic meaning of the dropped tags.
//
// Use this wherever neither the input nor the output
// is permitted to contain any HTML whatsoever.
func StripHTMLFromText(text string) string {
	// Decode entities up front so that escaped
	// markup becomes visible to the sanitizer.
	decoded := gohtml.UnescapeString(text)

	// Drop everything the strict policy detects as
	// HTML, then decode once more so that the caller
	// receives plaintext, not entity-escaped output.
	plain := gohtml.UnescapeString(strict.Sanitize(decoded))

	return strings.TrimSpace(plain)
}

View file

@ -184,7 +184,7 @@ func (suite *PlainTestSuite) TestNumbersAreNotHashtags() {
suite.Len(f.Tags, 0)
}
func (suite *PlainTestSuite) TestHTMLToPlain() {
func (suite *PlainTestSuite) TestParseHTMLToPlain() {
for _, t := range []struct {
html string
expectedPlain string
@ -246,11 +246,88 @@ See the domain permission subscription documentation <https://docs.gotosocial.or
Thanks for reading! And seriously back up your database.`,
},
} {
plain := text.HTMLToPlain(t.html)
plain := text.ParseHTMLToPlain(t.html)
suite.Equal(t.expectedPlain, plain)
}
}
// TestStripCaption1 checks that a <script> element and its
// contents are dropped, while the text after it is kept.
func (suite *PlainTestSuite) TestStripCaption1() {
	const (
		input = "<script>console.log('haha!')</script>this is just a normal caption ;)"
		want  = "this is just a normal caption ;)"
	)
	suite.Equal(want, text.StripHTMLFromText(input))
}
// TestStripCaption2 checks that <em> tags are removed
// while the text they wrapped is left untouched.
func (suite *PlainTestSuite) TestStripCaption2() {
	const (
		input = "<em>here's a LOUD caption</em>"
		want  = "here's a LOUD caption"
	)
	suite.Equal(want, text.StripHTMLFromText(input))
}
// TestStripCaption3 checks that stripping
// an empty string yields an empty string.
func (suite *PlainTestSuite) TestStripCaption3() {
	const input = ""
	suite.Equal("", text.StripHTMLFromText(input))
}
// TestStripCaption4 checks that a caption made up of several
// plain lines keeps its inner newlines, and only loses its
// leading and trailing whitespace.
func (suite *PlainTestSuite) TestStripCaption4() {
	const input = `
here is
a multi line
caption
with some newlines
`
	const want = "here is\na multi line\ncaption\nwith some newlines"
	suite.Equal(want, text.StripHTMLFromText(input))
}
// TestStripCaption5 checks that markup hidden behind named
// HTML entities is still detected and removed.
func (suite *PlainTestSuite) TestStripCaption5() {
	// Unescaped, the input below reads:
	// "<script>console.log('aha!')</script> hello world"
	const input = `&lt;script&gt;console.log(&apos;aha!&apos;)&lt;/script&gt; hello world`
	const want = "hello world"
	suite.Equal(want, text.StripHTMLFromText(input))
}
// TestStripCaption6 checks that markup hidden behind a mix of
// numeric and named HTML entities is still detected and removed.
func (suite *PlainTestSuite) TestStripCaption6() {
	// Decoded, the input below reads:
	// "<script>console.log('aha!')</script> hello world"
	const input = `&lt;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#99;&#111;&#110;&#115;&#111;&#108;&#101;&period;&#108;&#111;&#103;&lpar;&apos;&#97;&#104;&#97;&excl;&apos;&rpar;&lt;&sol;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#32;&#104;&#101;&#108;&#108;&#111;&#32;&#119;&#111;&#114;&#108;&#100;`
	const want = "hello world"
	suite.Equal(want, text.StripHTMLFromText(input))
}
// TestStripCustomCSS checks that ordinary CSS, which
// contains no HTML, passes through stripping unchanged.
func (suite *PlainTestSuite) TestStripCustomCSS() {
	const css = `.toot .username {
color: var(--link_fg);
line-height: 2rem;
margin-top: -0.5rem;
align-self: start;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}`
	// The output is expected to be identical to the input.
	suite.Equal(css, text.StripHTMLFromText(css))
}
// TestStripNaughtyCustomCSS1 checks that custom CSS which tries
// to close <style> and inject a <title> is stripped to nothing.
func (suite *PlainTestSuite) TestStripNaughtyCustomCSS1() {
	// Attempted breakout from <style> into <head>,
	// in order to change the document title.
	const css = "</style><title>pee pee poo poo</title><style>"
	suite.Empty(text.StripHTMLFromText(css))
}
// TestStripNaughtyCustomCSS2 checks that the tags of a <style>
// breakout attempt are removed, leaving only the leading text.
func (suite *PlainTestSuite) TestStripNaughtyCustomCSS2() {
	// Attempted breakout from <style> into <head>,
	// in order to change the document title.
	const css = "pee pee poo poo</style><title></title><style>"
	const want = "pee pee poo poo"
	suite.Equal(want, text.StripHTMLFromText(css))
}
func TestPlainTestSuite(t *testing.T) {
suite.Run(t, new(PlainTestSuite))
}

View file

@ -18,9 +18,7 @@
package text
import (
"html"
"regexp"
"strings"
"github.com/microcosm-cc/bluemonday"
)
@ -165,21 +163,8 @@ var strict *bluemonday.Policy = bluemonday.StrictPolicy()
// SanitizeHTML sanitizes only risky html elements
// from the given string, allowing safe ones through.
func SanitizeHTML(in string) string {
return regular.Sanitize(in)
}
// RemoveHTML runs text through strict sanitization.
// This removes any html elements that were in the
// string, and returns pruned plaintext.
func RemoveHTML(in string) string {
// Unescape first to catch any tricky critters.
content := html.UnescapeString(in)
// Remove all detected HTML.
content = strict.Sanitize(content)
// Unescape again to return plaintext.
content = html.UnescapeString(content)
return strings.TrimSpace(content)
//
// It returns an HTML string.
func SanitizeHTML(html string) string {
return regular.Sanitize(html)
}

View file

@ -45,83 +45,6 @@ func (suite *SanitizeTestSuite) TestSanitizeHTML() {
suite.Equal(sanitizedHTML, s)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption1() {
dodgyCaption := "<script>console.log('haha!')</script>this is just a normal caption ;)"
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("this is just a normal caption ;)", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption2() {
dodgyCaption := "<em>here's a LOUD caption</em>"
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("here's a LOUD caption", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption3() {
dodgyCaption := ""
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption4() {
dodgyCaption := `
here is
a multi line
caption
with some newlines
`
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("here is\na multi line\ncaption\nwith some newlines", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption5() {
// html-escaped: "<script>console.log('aha!')</script> hello world"
dodgyCaption := `&lt;script&gt;console.log(&apos;aha!&apos;)&lt;/script&gt; hello world`
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("hello world", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption6() {
// html-encoded: "<script>console.log('aha!')</script> hello world"
dodgyCaption := `&lt;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#99;&#111;&#110;&#115;&#111;&#108;&#101;&period;&#108;&#111;&#103;&lpar;&apos;&#97;&#104;&#97;&excl;&apos;&rpar;&lt;&sol;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#32;&#104;&#101;&#108;&#108;&#111;&#32;&#119;&#111;&#114;&#108;&#100;`
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("hello world", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCustomCSS() {
customCSS := `.toot .username {
color: var(--link_fg);
line-height: 2rem;
margin-top: -0.5rem;
align-self: start;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}`
sanitized := text.RemoveHTML(customCSS)
suite.Equal(customCSS, sanitized) // should be the same as it was before
}
func (suite *SanitizeTestSuite) TestSanitizeNaughtyCustomCSS1() {
// try to break out of <style> into <head> and change the document title
customCSS := "</style><title>pee pee poo poo</title><style>"
sanitized := text.RemoveHTML(customCSS)
suite.Empty(sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeNaughtyCustomCSS2() {
// try to break out of <style> into <head> and change the document title
customCSS := "pee pee poo poo</style><title></title><style>"
sanitized := text.RemoveHTML(customCSS)
suite.Equal("pee pee poo poo", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeInlineImg() {
withInlineImg := "<p>Here's an inline image: <img class=\"fixed-size-img svelte-uci8eb\" aria-hidden=\"false\" alt=\"A black-and-white photo of an Oblique Strategy card. The card reads: 'Define an area as 'safe' and use it as an anchor'.\" title=\"A black-and-white photo of an Oblique Strategy card. The card reads: 'Define an area as 'safe' and use it as an anchor'.\" width=\"0\" height=\"0\" src=\"https://example.org/fileserver/01H7J83147QMCE17C0RS9P10Y9/attachment/small/01H7J8365XXRTCP6CAMGEM49ZE.jpg\" style=\"object-position: 50% 50%;\"></p>"
sanitized := text.SanitizeHTML(withInlineImg)

View file

@ -1376,7 +1376,6 @@ func (c *Converter) baseStatusToFrontend(
InReplyToID: nil, // Set below.
InReplyToAccountID: nil, // Set below.
Sensitive: *s.Sensitive,
SpoilerText: text.HTMLToPlain(s.ContentWarning),
Visibility: c.VisToAPIVis(ctx, s.Visibility),
LocalOnly: s.IsLocalOnly(),
Language: nil, // Set below.
@ -1397,6 +1396,11 @@ func (c *Converter) baseStatusToFrontend(
Text: s.Text,
ContentType: ContentTypeToAPIContentType(s.ContentType),
InteractionPolicy: *apiInteractionPolicy,
// Mastodon API says spoiler_text should be *text*, not HTML, so
// parse any HTML back to plaintext when serializing via the API,
// attempting to preserve semantic intent to keep it readable.
SpoilerText: text.ParseHTMLToPlain(s.ContentWarning),
}
if at := s.EditedAt; !at.IsZero() {

View file

@ -383,7 +383,7 @@ func filterableFields(s *gtsmodel.Status) []string {
// remove markdown-formatting characters
// and ensure more consistent filtering.
if s.Content != "" {
text := text.HTMLToPlain(s.Content)
text := text.ParseHTMLToPlain(s.Content)
if text != "" {
fields = append(fields, text)
}

View file

@ -361,10 +361,10 @@ func formatNotificationBody(apiNotification *apimodel.Notification) string {
if apiNotification.Status.SpoilerText != "" {
body = apiNotification.Status.SpoilerText
} else {
body = text.RemoveHTML(apiNotification.Status.Content)
body = text.StripHTMLFromText(apiNotification.Status.Content)
}
} else {
body = text.RemoveHTML(apiNotification.Account.Note)
body = text.StripHTMLFromText(apiNotification.Account.Note)
}
return firstNBytesTrimSpace(body, bodyMaxLen)
}