From c8556cfd9758d2e014fe6c7df8b80f7f532d383c Mon Sep 17 00:00:00 2001 From: tobi Date: Fri, 7 Mar 2025 11:40:44 +0100 Subject: [PATCH] rename some of the text functions for clarity --- internal/ap/normalize.go | 2 +- internal/api/util/opengraph.go | 4 +- internal/processing/account/update.go | 14 ++-- internal/processing/admin/domainallow.go | 4 +- internal/processing/admin/domainblock.go | 4 +- internal/processing/instance.go | 4 +- internal/processing/media/update.go | 2 +- internal/processing/status/common.go | 2 +- internal/processing/user/create.go | 2 +- internal/text/plain.go | 38 ++++++++++- internal/text/plain_test.go | 81 +++++++++++++++++++++++- internal/text/sanitize.go | 23 ++----- internal/text/sanitize_test.go | 77 ---------------------- internal/typeutils/internaltofrontend.go | 6 +- internal/typeutils/util.go | 2 +- internal/webpush/realsender.go | 4 +- 16 files changed, 145 insertions(+), 124 deletions(-) diff --git a/internal/ap/normalize.go b/internal/ap/normalize.go index 5b88d9085..ffe90733c 100644 --- a/internal/ap/normalize.go +++ b/internal/ap/normalize.go @@ -339,7 +339,7 @@ func NormalizeIncomingName(item WithName, rawJSON map[string]interface{}) { // // todo: We probably want to update this to allow // *escaped* HTML markup, but for now just nuke it. - name = text.RemoveHTML(name) + name = text.StripHTMLFromText(name) // Set normalized name property from the raw string; this // will replace any existing name property on the item. 
diff --git a/internal/api/util/opengraph.go b/internal/api/util/opengraph.go index 121f29595..770bada83 100644 --- a/internal/api/util/opengraph.go +++ b/internal/api/util/opengraph.go @@ -67,7 +67,7 @@ func OGBase(instance *apimodel.InstanceV1) *OGMeta { } og := &OGMeta{ - Title: text.RemoveHTML(instance.Title) + " - GoToSocial", + Title: text.StripHTMLFromText(instance.Title) + " - GoToSocial", Type: "website", Locale: locale, URL: instance.URI, @@ -161,7 +161,7 @@ func AccountTitle(account *apimodel.WebAccount, accountDomain string) string { // ParseDescription returns a string description which is // safe to use as a template.HTMLAttr inside templates. func ParseDescription(in string) string { - i := text.RemoveHTML(in) + i := text.StripHTMLFromText(in) i = strings.ReplaceAll(i, "\n", " ") i = strings.Join(strings.Fields(i), " ") i = html.EscapeString(i) diff --git a/internal/processing/account/update.go b/internal/processing/account/update.go index e65589fc5..3a59dbdf3 100644 --- a/internal/processing/account/update.go +++ b/internal/processing/account/update.go @@ -97,8 +97,8 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form return nil, gtserror.NewErrorBadRequest(err, err.Error()) } - // Parse new display name (always from plaintext). - account.DisplayName = text.RemoveHTML(displayName) + // HTML tags not allowed in display name. 
+ account.DisplayName = text.StripHTMLFromText(displayName) acctColumns = append(acctColumns, "display_name") } @@ -145,7 +145,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form } if form.AvatarDescription != nil { - desc := text.RemoveHTML(*form.AvatarDescription) + desc := text.StripHTMLFromText(*form.AvatarDescription) form.AvatarDescription = &desc } @@ -175,7 +175,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form } if form.HeaderDescription != nil { - desc := text.RemoveHTML(*form.HeaderDescription) + desc := text.StripHTMLFromText(*form.HeaderDescription) form.HeaderDescription = util.Ptr(desc) } @@ -265,7 +265,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form return nil, gtserror.NewErrorBadRequest(err, err.Error()) } - account.Settings.CustomCSS = text.RemoveHTML(customCSS) + account.Settings.CustomCSS = text.StripHTMLFromText(customCSS) settingsColumns = append(settingsColumns, "custom_css") } @@ -356,8 +356,8 @@ func (p *Processor) updateFields( // Sanitize raw field values. 
 fieldRaw := &gtsmodel.Field{ - Name: text.RemoveHTML(name), - Value: text.RemoveHTML(value), + Name: text.StripHTMLFromText(name), + Value: text.StripHTMLFromText(value), } fieldsRaw = append(fieldsRaw, fieldRaw) } diff --git a/internal/processing/admin/domainallow.go b/internal/processing/admin/domainallow.go index d752ef202..02101ccff 100644 --- a/internal/processing/admin/domainallow.go +++ b/internal/processing/admin/domainallow.go @@ -53,8 +53,8 @@ func (p *Processor) createDomainAllow( ID: id.NewULID(), Domain: domain, CreatedByAccountID: adminAcct.ID, - PrivateComment: text.RemoveHTML(privateComment), - PublicComment: text.RemoveHTML(publicComment), + PrivateComment: text.StripHTMLFromText(privateComment), + PublicComment: text.StripHTMLFromText(publicComment), Obfuscate: &obfuscate, SubscriptionID: subscriptionID, } diff --git a/internal/processing/admin/domainblock.go b/internal/processing/admin/domainblock.go index 62a6d5fea..249df744c 100644 --- a/internal/processing/admin/domainblock.go +++ b/internal/processing/admin/domainblock.go @@ -53,8 +53,8 @@ func (p *Processor) createDomainBlock( ID: id.NewULID(), Domain: domain, CreatedByAccountID: adminAcct.ID, - PrivateComment: text.RemoveHTML(privateComment), - PublicComment: text.RemoveHTML(publicComment), + PrivateComment: text.StripHTMLFromText(privateComment), + PublicComment: text.StripHTMLFromText(publicComment), Obfuscate: &obfuscate, SubscriptionID: subscriptionID, } diff --git a/internal/processing/instance.go b/internal/processing/instance.go index 62a1685a0..4cbbb742a 100644 --- a/internal/processing/instance.go +++ b/internal/processing/instance.go @@ -165,7 +165,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe } // Don't allow html in site title. 
- instance.Title = text.RemoveHTML(title) + instance.Title = text.StripHTMLFromText(title) columns = append(columns, "title") } @@ -235,7 +235,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe return nil, gtserror.NewErrorBadRequest(err, err.Error()) } - instance.CustomCSS = text.RemoveHTML(customCSS) + instance.CustomCSS = text.StripHTMLFromText(customCSS) columns = append(columns, []string{"custom_css"}...) } diff --git a/internal/processing/media/update.go b/internal/processing/media/update.go index 1eaa74764..5afa5d63c 100644 --- a/internal/processing/media/update.go +++ b/internal/processing/media/update.go @@ -87,7 +87,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, media // processDescription will sanitize and valid description against server configuration. func processDescription(description string) (string, gtserror.WithCode) { - description = text.RemoveHTML(description) + description = text.StripHTMLFromText(description) chars := len([]rune(description)) if min := config.GetMediaDescriptionMinChars(); chars < min { diff --git a/internal/processing/status/common.go b/internal/processing/status/common.go index 56adcc4a8..77058ed10 100644 --- a/internal/processing/status/common.go +++ b/internal/processing/status/common.go @@ -236,7 +236,7 @@ func (p *Processor) processContent( // Strip each poll option and format. // // For polls just use basic formatting. - option = text.RemoveHTML(option) + option = text.StripHTMLFromText(option) optionRes := formatInput(p.formatter.FromPlainBasic, option) // Gather results of the formatted. 
diff --git a/internal/processing/user/create.go b/internal/processing/user/create.go index fb7188ab9..dde69a6ef 100644 --- a/internal/processing/user/create.go +++ b/internal/processing/user/create.go @@ -122,7 +122,7 @@ func (p *Processor) Create( Username: form.Username, Email: form.Email, Password: form.Password, - Reason: text.RemoveHTML(reason), + Reason: text.StripHTMLFromText(reason), SignUpIP: form.IP, Locale: form.Locale, AppID: app.ID, diff --git a/internal/text/plain.go b/internal/text/plain.go index 246d0001c..ee4947bf7 100644 --- a/internal/text/plain.go +++ b/internal/text/plain.go @@ -20,6 +20,7 @@ package text import ( "bytes" "context" + gohtml "html" "strings" "codeberg.org/gruf/go-byteutil" @@ -193,9 +194,22 @@ func (f *Formatter) fromPlain( return result } -// HTMLToPlain parses the given HTML and then outputs -// it to close-as-possible equivalent plaintext. -func HTMLToPlain(html string) string { +// ParseHTMLToPlain parses the given HTML string, then +// outputs it to equivalent plaintext while trying to +// keep as much of the semantic intent of the input +// HTML as possible, i.e., titles are placed on separate +// lines, `<br>`s are converted to newlines, text inside +// `<strong>` and `<em>` tags is retained, but without +// emphasis, `<a>` links are unnested and the URL they +// link to is placed in angle brackets next to them, +// lists are replaced with newline-separated indented +// items, etc. +// +// This function is useful when you need to filter on +// HTML and want to avoid catching tags in the filter, +// or when you want to serve something in a plaintext +// format that may contain HTML tags (e.g., CWs). +func ParseHTMLToPlain(html string) string { plain := html2text.HTML2TextWithOptions( html, html2text.WithLinksInnerText(), @@ -204,3 +218,21 @@ func HTMLToPlain(html string) string { ) return strings.TrimSpace(plain) } + +// StripHTMLFromText runs text through strict sanitization +// to completely remove any HTML from the input without +// trying to preserve the semantic intent of any HTML tags. +// +// This is useful in cases where the input was not allowed +// to contain HTML at all, and the output isn't either. +func StripHTMLFromText(text string) string { + // Unescape first to catch any tricky critters. + content := gohtml.UnescapeString(text) + + // Remove all detected HTML. + content = strict.Sanitize(content) + + // Unescape again to return plaintext. 
+ content = gohtml.UnescapeString(content) + return strings.TrimSpace(content) +} diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go index 3f43d987a..cb8f4677e 100644 --- a/internal/text/plain_test.go +++ b/internal/text/plain_test.go @@ -184,7 +184,7 @@ func (suite *PlainTestSuite) TestNumbersAreNotHashtags() { suite.Len(f.Tags, 0) } -func (suite *PlainTestSuite) TestHTMLToPlain() { +func (suite *PlainTestSuite) TestParseHTMLToPlain() { for _, t := range []struct { html string expectedPlain string @@ -246,11 +246,88 @@ See the domain permission subscription documentation console.log('aha!') hello world" + dodgyCaption := `<script>console.log('aha!')</script> hello world` + stripped := text.StripHTMLFromText(dodgyCaption) + suite.Equal("hello world", stripped) +} + +func (suite *PlainTestSuite) TestStripCaption6() { + // html-encoded: " hello world" + dodgyCaption := `<script>console.log('aha!')</script> hello world` + stripped := text.StripHTMLFromText(dodgyCaption) + suite.Equal("hello world", stripped) +} + +func (suite *PlainTestSuite) TestStripCustomCSS() { + customCSS := `.toot .username { + color: var(--link_fg); + line-height: 2rem; + margin-top: -0.5rem; + align-self: start; + + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +}` + stripped := text.StripHTMLFromText(customCSS) + suite.Equal(customCSS, stripped) // should be the same as it was before +} + +func (suite *PlainTestSuite) TestStripNaughtyCustomCSS1() { + // try to break out of pee pee poo poopee pee poo poo