diff --git a/internal/ap/normalize.go b/internal/ap/normalize.go
index 5b88d9085..ffe90733c 100644
--- a/internal/ap/normalize.go
+++ b/internal/ap/normalize.go
@@ -339,7 +339,7 @@ func NormalizeIncomingName(item WithName, rawJSON map[string]interface{}) {
//
// todo: We probably want to update this to allow
// *escaped* HTML markup, but for now just nuke it.
- name = text.RemoveHTML(name)
+ name = text.StripHTMLFromText(name)
// Set normalized name property from the raw string; this
// will replace any existing name property on the item.
diff --git a/internal/api/util/opengraph.go b/internal/api/util/opengraph.go
index 121f29595..770bada83 100644
--- a/internal/api/util/opengraph.go
+++ b/internal/api/util/opengraph.go
@@ -67,7 +67,7 @@ func OGBase(instance *apimodel.InstanceV1) *OGMeta {
}
og := &OGMeta{
- Title: text.RemoveHTML(instance.Title) + " - GoToSocial",
+ Title: text.StripHTMLFromText(instance.Title) + " - GoToSocial",
Type: "website",
Locale: locale,
URL: instance.URI,
@@ -161,7 +161,7 @@ func AccountTitle(account *apimodel.WebAccount, accountDomain string) string {
// ParseDescription returns a string description which is
// safe to use as a template.HTMLAttr inside templates.
func ParseDescription(in string) string {
- i := text.RemoveHTML(in)
+ i := text.StripHTMLFromText(in)
i = strings.ReplaceAll(i, "\n", " ")
i = strings.Join(strings.Fields(i), " ")
i = html.EscapeString(i)
diff --git a/internal/processing/account/update.go b/internal/processing/account/update.go
index e65589fc5..3a59dbdf3 100644
--- a/internal/processing/account/update.go
+++ b/internal/processing/account/update.go
@@ -97,8 +97,8 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
return nil, gtserror.NewErrorBadRequest(err, err.Error())
}
- // Parse new display name (always from plaintext).
- account.DisplayName = text.RemoveHTML(displayName)
+ // HTML tags not allowed in display name.
+ account.DisplayName = text.StripHTMLFromText(displayName)
acctColumns = append(acctColumns, "display_name")
}
@@ -145,7 +145,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
}
if form.AvatarDescription != nil {
- desc := text.RemoveHTML(*form.AvatarDescription)
+ desc := text.StripHTMLFromText(*form.AvatarDescription)
form.AvatarDescription = &desc
}
@@ -175,7 +175,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
}
if form.HeaderDescription != nil {
- desc := text.RemoveHTML(*form.HeaderDescription)
+ desc := text.StripHTMLFromText(*form.HeaderDescription)
form.HeaderDescription = util.Ptr(desc)
}
@@ -265,7 +265,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form
return nil, gtserror.NewErrorBadRequest(err, err.Error())
}
- account.Settings.CustomCSS = text.RemoveHTML(customCSS)
+ account.Settings.CustomCSS = text.StripHTMLFromText(customCSS)
settingsColumns = append(settingsColumns, "custom_css")
}
@@ -356,8 +356,8 @@ func (p *Processor) updateFields(
// Sanitize raw field values.
fieldRaw := &gtsmodel.Field{
- Name: text.RemoveHTML(name),
- Value: text.RemoveHTML(value),
+ Name: text.StripHTMLFromText(name),
+ Value: text.StripHTMLFromText(value),
}
fieldsRaw = append(fieldsRaw, fieldRaw)
}
diff --git a/internal/processing/admin/domainallow.go b/internal/processing/admin/domainallow.go
index d752ef202..02101ccff 100644
--- a/internal/processing/admin/domainallow.go
+++ b/internal/processing/admin/domainallow.go
@@ -53,8 +53,8 @@ func (p *Processor) createDomainAllow(
ID: id.NewULID(),
Domain: domain,
CreatedByAccountID: adminAcct.ID,
- PrivateComment: text.RemoveHTML(privateComment),
- PublicComment: text.RemoveHTML(publicComment),
+ PrivateComment: text.StripHTMLFromText(privateComment),
+ PublicComment: text.StripHTMLFromText(publicComment),
Obfuscate: &obfuscate,
SubscriptionID: subscriptionID,
}
diff --git a/internal/processing/admin/domainblock.go b/internal/processing/admin/domainblock.go
index 62a6d5fea..249df744c 100644
--- a/internal/processing/admin/domainblock.go
+++ b/internal/processing/admin/domainblock.go
@@ -53,8 +53,8 @@ func (p *Processor) createDomainBlock(
ID: id.NewULID(),
Domain: domain,
CreatedByAccountID: adminAcct.ID,
- PrivateComment: text.RemoveHTML(privateComment),
- PublicComment: text.RemoveHTML(publicComment),
+ PrivateComment: text.StripHTMLFromText(privateComment),
+ PublicComment: text.StripHTMLFromText(publicComment),
Obfuscate: &obfuscate,
SubscriptionID: subscriptionID,
}
diff --git a/internal/processing/instance.go b/internal/processing/instance.go
index 62a1685a0..4cbbb742a 100644
--- a/internal/processing/instance.go
+++ b/internal/processing/instance.go
@@ -165,7 +165,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe
}
// Don't allow html in site title.
- instance.Title = text.RemoveHTML(title)
+ instance.Title = text.StripHTMLFromText(title)
columns = append(columns, "title")
}
@@ -235,7 +235,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe
return nil, gtserror.NewErrorBadRequest(err, err.Error())
}
- instance.CustomCSS = text.RemoveHTML(customCSS)
+ instance.CustomCSS = text.StripHTMLFromText(customCSS)
columns = append(columns, []string{"custom_css"}...)
}
diff --git a/internal/processing/media/update.go b/internal/processing/media/update.go
index 1eaa74764..5afa5d63c 100644
--- a/internal/processing/media/update.go
+++ b/internal/processing/media/update.go
@@ -87,7 +87,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, media
// processDescription will sanitize and valid description against server configuration.
func processDescription(description string) (string, gtserror.WithCode) {
- description = text.RemoveHTML(description)
+ description = text.StripHTMLFromText(description)
chars := len([]rune(description))
if min := config.GetMediaDescriptionMinChars(); chars < min {
diff --git a/internal/processing/status/common.go b/internal/processing/status/common.go
index 56adcc4a8..77058ed10 100644
--- a/internal/processing/status/common.go
+++ b/internal/processing/status/common.go
@@ -236,7 +236,7 @@ func (p *Processor) processContent(
// Strip each poll option and format.
//
// For polls just use basic formatting.
- option = text.RemoveHTML(option)
+ option = text.StripHTMLFromText(option)
optionRes := formatInput(p.formatter.FromPlainBasic, option)
// Gather results of the formatted.
diff --git a/internal/processing/user/create.go b/internal/processing/user/create.go
index fb7188ab9..dde69a6ef 100644
--- a/internal/processing/user/create.go
+++ b/internal/processing/user/create.go
@@ -122,7 +122,7 @@ func (p *Processor) Create(
Username: form.Username,
Email: form.Email,
Password: form.Password,
- Reason: text.RemoveHTML(reason),
+ Reason: text.StripHTMLFromText(reason),
SignUpIP: form.IP,
Locale: form.Locale,
AppID: app.ID,
diff --git a/internal/text/plain.go b/internal/text/plain.go
index 246d0001c..ee4947bf7 100644
--- a/internal/text/plain.go
+++ b/internal/text/plain.go
@@ -20,6 +20,7 @@ package text
import (
"bytes"
"context"
+ gohtml "html"
"strings"
"codeberg.org/gruf/go-byteutil"
@@ -193,9 +194,22 @@ func (f *Formatter) fromPlain(
return result
}
-// HTMLToPlain parses the given HTML and then outputs
-// it to close-as-possible equivalent plaintext.
-func HTMLToPlain(html string) string {
+// ParseHTMLToPlain parses the given HTML string, then
+// outputs it to equivalent plaintext while trying to
+// keep as much of the semantic intent of the input
+// HTML as possible, ie., titles are placed on separate
+// lines, `<br>`s are converted to newlines, text inside
+// `<strong>` and `<em>` tags is retained, but without
+// emphasis, `<a>` links are unnested and the URL they
+// link to is placed in angle brackets next to them,
+// lists are replaced with newline-separated indented
+// items, etc.
+//
+// This function is useful when you need to filter on
+// HTML and want to avoid catching tags in the filter,
+// or when you want to serve something in a plaintext
+// format that may contain HTML tags (eg., CWs).
+func ParseHTMLToPlain(html string) string {
plain := html2text.HTML2TextWithOptions(
html,
html2text.WithLinksInnerText(),
@@ -204,3 +218,21 @@ func HTMLToPlain(html string) string {
)
return strings.TrimSpace(plain)
}
+
+// StripHTMLFromText runs text through strict sanitization
+// to completely remove any HTML (including entity-escaped
+// HTML) from the input, without trying to preserve the
+// semantic intent of any tags. The result is returned
+// unescaped, with surrounding whitespace trimmed. Use it
+// when neither input nor output may contain HTML at all.
+func StripHTMLFromText(text string) string {
+ // Unescape first so entity-encoded markup is also sanitized.
+ content := gohtml.UnescapeString(text)
+
+ // Remove all detected HTML.
+ content = strict.Sanitize(content)
+
+ // Unescape again so plaintext (not entities) is returned.
+ content = gohtml.UnescapeString(content)
+ return strings.TrimSpace(content)
+}
diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go
index 3f43d987a..cb8f4677e 100644
--- a/internal/text/plain_test.go
+++ b/internal/text/plain_test.go
@@ -184,7 +184,7 @@ func (suite *PlainTestSuite) TestNumbersAreNotHashtags() {
suite.Len(f.Tags, 0)
}
-func (suite *PlainTestSuite) TestHTMLToPlain() {
+func (suite *PlainTestSuite) TestParseHTMLToPlain() {
for _, t := range []struct {
html string
expectedPlain string
@@ -246,11 +246,88 @@ See the domain permission subscription documentation console.log('aha!') hello world"
+ dodgyCaption := `<script>console.log('aha!')</script> hello world`
+ stripped := text.StripHTMLFromText(dodgyCaption)
+ suite.Equal("hello world", stripped)
+}
+
+func (suite *PlainTestSuite) TestStripCaption6() {
+ // html-encoded: "<script>console.log('aha!')</script> hello world"
+ dodgyCaption := `<script>console.log('aha!')</script> hello world`
+ stripped := text.StripHTMLFromText(dodgyCaption)
+ suite.Equal("hello world", stripped)
+}
+
+func (suite *PlainTestSuite) TestStripCustomCSS() {
+ customCSS := `.toot .username {
+ color: var(--link_fg);
+ line-height: 2rem;
+ margin-top: -0.5rem;
+ align-self: start;
+
+ white-space: nowrap;
+ overflow: hidden;
+ text-overflow: ellipsis;
+}`
+ stripped := text.StripHTMLFromText(customCSS)
+ suite.Equal(customCSS, stripped) // should be the same as it was before
+}
+
+func (suite *PlainTestSuite) TestStripNaughtyCustomCSS1() {
+ // try to break out of pee pee poo poopee pee poo poo