rename some of the text functions for clarity

This commit is contained in:
tobi 2025-03-07 11:40:44 +01:00
commit c8556cfd97
16 changed files with 145 additions and 124 deletions

View file

@ -339,7 +339,7 @@ func NormalizeIncomingName(item WithName, rawJSON map[string]interface{}) {
// //
// todo: We probably want to update this to allow // todo: We probably want to update this to allow
// *escaped* HTML markup, but for now just nuke it. // *escaped* HTML markup, but for now just nuke it.
name = text.RemoveHTML(name) name = text.StripHTMLFromText(name)
// Set normalized name property from the raw string; this // Set normalized name property from the raw string; this
// will replace any existing name property on the item. // will replace any existing name property on the item.

View file

@ -67,7 +67,7 @@ func OGBase(instance *apimodel.InstanceV1) *OGMeta {
} }
og := &OGMeta{ og := &OGMeta{
Title: text.RemoveHTML(instance.Title) + " - GoToSocial", Title: text.StripHTMLFromText(instance.Title) + " - GoToSocial",
Type: "website", Type: "website",
Locale: locale, Locale: locale,
URL: instance.URI, URL: instance.URI,
@ -161,7 +161,7 @@ func AccountTitle(account *apimodel.WebAccount, accountDomain string) string {
// ParseDescription returns a string description which is // ParseDescription returns a string description which is
// safe to use as a template.HTMLAttr inside templates. // safe to use as a template.HTMLAttr inside templates.
func ParseDescription(in string) string { func ParseDescription(in string) string {
i := text.RemoveHTML(in) i := text.StripHTMLFromText(in)
i = strings.ReplaceAll(i, "\n", " ") i = strings.ReplaceAll(i, "\n", " ")
i = strings.Join(strings.Fields(i), " ") i = strings.Join(strings.Fields(i), " ")
i = html.EscapeString(i) i = html.EscapeString(i)

View file

@ -97,8 +97,8 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
return nil, gtserror.NewErrorBadRequest(err, err.Error()) return nil, gtserror.NewErrorBadRequest(err, err.Error())
} }
// Parse new display name (always from plaintext). // HTML tags not allowed in display name.
account.DisplayName = text.RemoveHTML(displayName) account.DisplayName = text.StripHTMLFromText(displayName)
acctColumns = append(acctColumns, "display_name") acctColumns = append(acctColumns, "display_name")
} }
@ -145,7 +145,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
} }
if form.AvatarDescription != nil { if form.AvatarDescription != nil {
desc := text.RemoveHTML(*form.AvatarDescription) desc := text.StripHTMLFromText(*form.AvatarDescription)
form.AvatarDescription = &desc form.AvatarDescription = &desc
} }
@ -175,7 +175,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
} }
if form.HeaderDescription != nil { if form.HeaderDescription != nil {
desc := text.RemoveHTML(*form.HeaderDescription) desc := text.StripHTMLFromText(*form.HeaderDescription)
form.HeaderDescription = util.Ptr(desc) form.HeaderDescription = util.Ptr(desc)
} }
@ -265,7 +265,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
return nil, gtserror.NewErrorBadRequest(err, err.Error()) return nil, gtserror.NewErrorBadRequest(err, err.Error())
} }
account.Settings.CustomCSS = text.RemoveHTML(customCSS) account.Settings.CustomCSS = text.StripHTMLFromText(customCSS)
settingsColumns = append(settingsColumns, "custom_css") settingsColumns = append(settingsColumns, "custom_css")
} }
@ -356,8 +356,8 @@ func (p *Processor) updateFields(
// Sanitize raw field values. // Sanitize raw field values.
fieldRaw := &gtsmodel.Field{ fieldRaw := &gtsmodel.Field{
Name: text.RemoveHTML(name), Name: text.StripHTMLFromText(name),
Value: text.RemoveHTML(value), Value: text.StripHTMLFromText(value),
} }
fieldsRaw = append(fieldsRaw, fieldRaw) fieldsRaw = append(fieldsRaw, fieldRaw)
} }

View file

@ -53,8 +53,8 @@ func (p *Processor) createDomainAllow(
ID: id.NewULID(), ID: id.NewULID(),
Domain: domain, Domain: domain,
CreatedByAccountID: adminAcct.ID, CreatedByAccountID: adminAcct.ID,
PrivateComment: text.RemoveHTML(privateComment), PrivateComment: text.StripHTMLFromText(privateComment),
PublicComment: text.RemoveHTML(publicComment), PublicComment: text.StripHTMLFromText(publicComment),
Obfuscate: &obfuscate, Obfuscate: &obfuscate,
SubscriptionID: subscriptionID, SubscriptionID: subscriptionID,
} }

View file

@ -53,8 +53,8 @@ func (p *Processor) createDomainBlock(
ID: id.NewULID(), ID: id.NewULID(),
Domain: domain, Domain: domain,
CreatedByAccountID: adminAcct.ID, CreatedByAccountID: adminAcct.ID,
PrivateComment: text.RemoveHTML(privateComment), PrivateComment: text.StripHTMLFromText(privateComment),
PublicComment: text.RemoveHTML(publicComment), PublicComment: text.StripHTMLFromText(publicComment),
Obfuscate: &obfuscate, Obfuscate: &obfuscate,
SubscriptionID: subscriptionID, SubscriptionID: subscriptionID,
} }

View file

@ -165,7 +165,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe
} }
// Don't allow html in site title. // Don't allow html in site title.
instance.Title = text.RemoveHTML(title) instance.Title = text.StripHTMLFromText(title)
columns = append(columns, "title") columns = append(columns, "title")
} }
@ -235,7 +235,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe
return nil, gtserror.NewErrorBadRequest(err, err.Error()) return nil, gtserror.NewErrorBadRequest(err, err.Error())
} }
instance.CustomCSS = text.RemoveHTML(customCSS) instance.CustomCSS = text.StripHTMLFromText(customCSS)
columns = append(columns, []string{"custom_css"}...) columns = append(columns, []string{"custom_css"}...)
} }

View file

@ -87,7 +87,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, media
// processDescription will sanitize and validate description against server configuration. // processDescription will sanitize and validate description against server configuration.
func processDescription(description string) (string, gtserror.WithCode) { func processDescription(description string) (string, gtserror.WithCode) {
description = text.RemoveHTML(description) description = text.StripHTMLFromText(description)
chars := len([]rune(description)) chars := len([]rune(description))
if min := config.GetMediaDescriptionMinChars(); chars < min { if min := config.GetMediaDescriptionMinChars(); chars < min {

View file

@ -236,7 +236,7 @@ func (p *Processor) processContent(
// Strip each poll option and format. // Strip each poll option and format.
// //
// For polls just use basic formatting. // For polls just use basic formatting.
option = text.RemoveHTML(option) option = text.StripHTMLFromText(option)
optionRes := formatInput(p.formatter.FromPlainBasic, option) optionRes := formatInput(p.formatter.FromPlainBasic, option)
// Gather results of the formatted. // Gather results of the formatted.

View file

@ -122,7 +122,7 @@ func (p *Processor) Create(
Username: form.Username, Username: form.Username,
Email: form.Email, Email: form.Email,
Password: form.Password, Password: form.Password,
Reason: text.RemoveHTML(reason), Reason: text.StripHTMLFromText(reason),
SignUpIP: form.IP, SignUpIP: form.IP,
Locale: form.Locale, Locale: form.Locale,
AppID: app.ID, AppID: app.ID,

View file

@ -20,6 +20,7 @@ package text
import ( import (
"bytes" "bytes"
"context" "context"
gohtml "html"
"strings" "strings"
"codeberg.org/gruf/go-byteutil" "codeberg.org/gruf/go-byteutil"
@ -193,9 +194,22 @@ func (f *Formatter) fromPlain(
return result return result
} }
// HTMLToPlain parses the given HTML and then outputs // ParseHTMLToPlain parses the given HTML string, then
// it to close-as-possible equivalent plaintext. // outputs it to equivalent plaintext while trying to
func HTMLToPlain(html string) string { // keep as much of the semantic intent of the input
// HTML as possible, i.e., titles are placed on separate
// lines, `<br>`s are converted to newlines, text inside
// `<strong>` and `<em>` tags is retained, but without
// emphasis, `<a>` links are unnested and the URL they
// link to is placed in angle brackets next to them,
// lists are replaced with newline-separated indented
// items, etc.
//
// This function is useful when you need to filter on
// HTML and want to avoid catching tags in the filter,
// or when you want to serve something in a plaintext
// format that may contain HTML tags (e.g., CWs).
func ParseHTMLToPlain(html string) string {
plain := html2text.HTML2TextWithOptions( plain := html2text.HTML2TextWithOptions(
html, html,
html2text.WithLinksInnerText(), html2text.WithLinksInnerText(),
@ -204,3 +218,21 @@ func HTMLToPlain(html string) string {
) )
return strings.TrimSpace(plain) return strings.TrimSpace(plain)
} }
// StripHTMLFromText runs text through strict sanitization
// to completely remove any HTML from the input without
// trying to preserve the semantic intent of any HTML tags.
//
// This is useful in cases where the input was not allowed
// to contain HTML at all, and the output isn't either.
func StripHTMLFromText(text string) string {
// Unescape first to catch any tricky critters.
content := gohtml.UnescapeString(text)
// Remove all detected HTML.
content = strict.Sanitize(content)
// Unescape again to return plaintext.
content = gohtml.UnescapeString(content)
return strings.TrimSpace(content)
}

View file

@ -184,7 +184,7 @@ func (suite *PlainTestSuite) TestNumbersAreNotHashtags() {
suite.Len(f.Tags, 0) suite.Len(f.Tags, 0)
} }
func (suite *PlainTestSuite) TestHTMLToPlain() { func (suite *PlainTestSuite) TestParseHTMLToPlain() {
for _, t := range []struct { for _, t := range []struct {
html string html string
expectedPlain string expectedPlain string
@ -246,11 +246,88 @@ See the domain permission subscription documentation <https://docs.gotosocial.or
Thanks for reading! And seriously back up your database.`, Thanks for reading! And seriously back up your database.`,
}, },
} { } {
plain := text.HTMLToPlain(t.html) plain := text.ParseHTMLToPlain(t.html)
suite.Equal(t.expectedPlain, plain) suite.Equal(t.expectedPlain, plain)
} }
} }
func (suite *PlainTestSuite) TestStripCaption1() {
dodgyCaption := "<script>console.log('haha!')</script>this is just a normal caption ;)"
stripped := text.StripHTMLFromText(dodgyCaption)
suite.Equal("this is just a normal caption ;)", stripped)
}
func (suite *PlainTestSuite) TestStripCaption2() {
dodgyCaption := "<em>here's a LOUD caption</em>"
stripped := text.StripHTMLFromText(dodgyCaption)
suite.Equal("here's a LOUD caption", stripped)
}
func (suite *PlainTestSuite) TestStripCaption3() {
dodgyCaption := ""
stripped := text.StripHTMLFromText(dodgyCaption)
suite.Equal("", stripped)
}
func (suite *PlainTestSuite) TestStripCaption4() {
dodgyCaption := `
here is
a multi line
caption
with some newlines
`
stripped := text.StripHTMLFromText(dodgyCaption)
suite.Equal("here is\na multi line\ncaption\nwith some newlines", stripped)
}
func (suite *PlainTestSuite) TestStripCaption5() {
// html-escaped: "<script>console.log('aha!')</script> hello world"
dodgyCaption := `&lt;script&gt;console.log(&apos;aha!&apos;)&lt;/script&gt; hello world`
stripped := text.StripHTMLFromText(dodgyCaption)
suite.Equal("hello world", stripped)
}
func (suite *PlainTestSuite) TestStripCaption6() {
// html-encoded: "<script>console.log('aha!')</script> hello world"
dodgyCaption := `&lt;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#99;&#111;&#110;&#115;&#111;&#108;&#101;&period;&#108;&#111;&#103;&lpar;&apos;&#97;&#104;&#97;&excl;&apos;&rpar;&lt;&sol;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#32;&#104;&#101;&#108;&#108;&#111;&#32;&#119;&#111;&#114;&#108;&#100;`
stripped := text.StripHTMLFromText(dodgyCaption)
suite.Equal("hello world", stripped)
}
func (suite *PlainTestSuite) TestStripCustomCSS() {
customCSS := `.toot .username {
color: var(--link_fg);
line-height: 2rem;
margin-top: -0.5rem;
align-self: start;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}`
stripped := text.StripHTMLFromText(customCSS)
suite.Equal(customCSS, stripped) // should be the same as it was before
}
func (suite *PlainTestSuite) TestStripNaughtyCustomCSS1() {
// try to break out of <style> into <head> and change the document title
customCSS := "</style><title>pee pee poo poo</title><style>"
stripped := text.StripHTMLFromText(customCSS)
suite.Empty(stripped)
}
func (suite *PlainTestSuite) TestStripNaughtyCustomCSS2() {
// try to break out of <style> into <head> and change the document title
customCSS := "pee pee poo poo</style><title></title><style>"
stripped := text.StripHTMLFromText(customCSS)
suite.Equal("pee pee poo poo", stripped)
}
func TestPlainTestSuite(t *testing.T) { func TestPlainTestSuite(t *testing.T) {
suite.Run(t, new(PlainTestSuite)) suite.Run(t, new(PlainTestSuite))
} }

View file

@ -18,9 +18,7 @@
package text package text
import ( import (
"html"
"regexp" "regexp"
"strings"
"github.com/microcosm-cc/bluemonday" "github.com/microcosm-cc/bluemonday"
) )
@ -165,21 +163,8 @@ var strict *bluemonday.Policy = bluemonday.StrictPolicy()
// SanitizeHTML sanitizes only risky html elements // SanitizeHTML sanitizes only risky html elements
// from the given string, allowing safe ones through. // from the given string, allowing safe ones through.
func SanitizeHTML(in string) string { //
return regular.Sanitize(in) // It returns an HTML string.
} func SanitizeHTML(html string) string {
return regular.Sanitize(html)
// RemoveHTML runs text through strict sanitization.
// This removes any html elements that were in the
// string, and returns pruned plaintext.
func RemoveHTML(in string) string {
// Unescape first to catch any tricky critters.
content := html.UnescapeString(in)
// Remove all detected HTML.
content = strict.Sanitize(content)
// Unescape again to return plaintext.
content = html.UnescapeString(content)
return strings.TrimSpace(content)
} }

View file

@ -45,83 +45,6 @@ func (suite *SanitizeTestSuite) TestSanitizeHTML() {
suite.Equal(sanitizedHTML, s) suite.Equal(sanitizedHTML, s)
} }
func (suite *SanitizeTestSuite) TestSanitizeCaption1() {
dodgyCaption := "<script>console.log('haha!')</script>this is just a normal caption ;)"
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("this is just a normal caption ;)", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption2() {
dodgyCaption := "<em>here's a LOUD caption</em>"
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("here's a LOUD caption", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption3() {
dodgyCaption := ""
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption4() {
dodgyCaption := `
here is
a multi line
caption
with some newlines
`
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("here is\na multi line\ncaption\nwith some newlines", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption5() {
// html-escaped: "<script>console.log('aha!')</script> hello world"
dodgyCaption := `&lt;script&gt;console.log(&apos;aha!&apos;)&lt;/script&gt; hello world`
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("hello world", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption6() {
// html-encoded: "<script>console.log('aha!')</script> hello world"
dodgyCaption := `&lt;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#99;&#111;&#110;&#115;&#111;&#108;&#101;&period;&#108;&#111;&#103;&lpar;&apos;&#97;&#104;&#97;&excl;&apos;&rpar;&lt;&sol;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#32;&#104;&#101;&#108;&#108;&#111;&#32;&#119;&#111;&#114;&#108;&#100;`
sanitized := text.RemoveHTML(dodgyCaption)
suite.Equal("hello world", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCustomCSS() {
customCSS := `.toot .username {
color: var(--link_fg);
line-height: 2rem;
margin-top: -0.5rem;
align-self: start;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}`
sanitized := text.RemoveHTML(customCSS)
suite.Equal(customCSS, sanitized) // should be the same as it was before
}
func (suite *SanitizeTestSuite) TestSanitizeNaughtyCustomCSS1() {
// try to break out of <style> into <head> and change the document title
customCSS := "</style><title>pee pee poo poo</title><style>"
sanitized := text.RemoveHTML(customCSS)
suite.Empty(sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeNaughtyCustomCSS2() {
// try to break out of <style> into <head> and change the document title
customCSS := "pee pee poo poo</style><title></title><style>"
sanitized := text.RemoveHTML(customCSS)
suite.Equal("pee pee poo poo", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeInlineImg() { func (suite *SanitizeTestSuite) TestSanitizeInlineImg() {
withInlineImg := "<p>Here's an inline image: <img class=\"fixed-size-img svelte-uci8eb\" aria-hidden=\"false\" alt=\"A black-and-white photo of an Oblique Strategy card. The card reads: 'Define an area as 'safe' and use it as an anchor'.\" title=\"A black-and-white photo of an Oblique Strategy card. The card reads: 'Define an area as 'safe' and use it as an anchor'.\" width=\"0\" height=\"0\" src=\"https://example.org/fileserver/01H7J83147QMCE17C0RS9P10Y9/attachment/small/01H7J8365XXRTCP6CAMGEM49ZE.jpg\" style=\"object-position: 50% 50%;\"></p>" withInlineImg := "<p>Here's an inline image: <img class=\"fixed-size-img svelte-uci8eb\" aria-hidden=\"false\" alt=\"A black-and-white photo of an Oblique Strategy card. The card reads: 'Define an area as 'safe' and use it as an anchor'.\" title=\"A black-and-white photo of an Oblique Strategy card. The card reads: 'Define an area as 'safe' and use it as an anchor'.\" width=\"0\" height=\"0\" src=\"https://example.org/fileserver/01H7J83147QMCE17C0RS9P10Y9/attachment/small/01H7J8365XXRTCP6CAMGEM49ZE.jpg\" style=\"object-position: 50% 50%;\"></p>"
sanitized := text.SanitizeHTML(withInlineImg) sanitized := text.SanitizeHTML(withInlineImg)

View file

@ -1376,7 +1376,6 @@ func (c *Converter) baseStatusToFrontend(
InReplyToID: nil, // Set below. InReplyToID: nil, // Set below.
InReplyToAccountID: nil, // Set below. InReplyToAccountID: nil, // Set below.
Sensitive: *s.Sensitive, Sensitive: *s.Sensitive,
SpoilerText: text.HTMLToPlain(s.ContentWarning),
Visibility: c.VisToAPIVis(ctx, s.Visibility), Visibility: c.VisToAPIVis(ctx, s.Visibility),
LocalOnly: s.IsLocalOnly(), LocalOnly: s.IsLocalOnly(),
Language: nil, // Set below. Language: nil, // Set below.
@ -1397,6 +1396,11 @@ func (c *Converter) baseStatusToFrontend(
Text: s.Text, Text: s.Text,
ContentType: ContentTypeToAPIContentType(s.ContentType), ContentType: ContentTypeToAPIContentType(s.ContentType),
InteractionPolicy: *apiInteractionPolicy, InteractionPolicy: *apiInteractionPolicy,
// Mastodon API says spoiler_text should be *text*, not HTML, so
// parse any HTML back to plaintext when serializing via the API,
// attempting to preserve semantic intent to keep it readable.
SpoilerText: text.ParseHTMLToPlain(s.ContentWarning),
} }
if at := s.EditedAt; !at.IsZero() { if at := s.EditedAt; !at.IsZero() {

View file

@ -383,7 +383,7 @@ func filterableFields(s *gtsmodel.Status) []string {
// remove markdown-formatting characters // remove markdown-formatting characters
// and ensure more consistent filtering. // and ensure more consistent filtering.
if s.Content != "" { if s.Content != "" {
text := text.HTMLToPlain(s.Content) text := text.ParseHTMLToPlain(s.Content)
if text != "" { if text != "" {
fields = append(fields, text) fields = append(fields, text)
} }

View file

@ -361,10 +361,10 @@ func formatNotificationBody(apiNotification *apimodel.Notification) string {
if apiNotification.Status.SpoilerText != "" { if apiNotification.Status.SpoilerText != "" {
body = apiNotification.Status.SpoilerText body = apiNotification.Status.SpoilerText
} else { } else {
body = text.RemoveHTML(apiNotification.Status.Content) body = text.StripHTMLFromText(apiNotification.Status.Content)
} }
} else { } else {
body = text.RemoveHTML(apiNotification.Account.Note) body = text.StripHTMLFromText(apiNotification.Account.Note)
} }
return firstNBytesTrimSpace(body, bodyMaxLen) return firstNBytesTrimSpace(body, bodyMaxLen)
} }