[feature] Parse content warning to HTML, serialize via client API as plaintext (#3876)

* [feature] Parse content warning as HTML, serialize via API to plaintext

* tidy up some cruft

* whoops

* oops

* i'm da joker baybee

* clemency muy lorde

* rename some of the text functions for clarity

* jiggle the opts

* fiddle de deee

* hopefully the last test fix i ever have to do in my beautiful life
This commit is contained in:
tobi 2025-03-07 15:04:34 +01:00 committed by GitHub
commit d8113c11e4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
48 changed files with 985 additions and 635 deletions

View file

@ -93,6 +93,16 @@ func (suite *TextStandardTestSuite) FromMarkdown(input string) *text.FormatResul
)
}
// FromMarkdownBasic is a test helper that passes input to the
// suite formatter's FromMarkdownBasic, using a background context,
// the suite's mention parser, the ID of test account
// "local_account_1" as author, and a dummy status ID.
func (suite *TextStandardTestSuite) FromMarkdownBasic(input string) *text.FormatResult {
	const statusID = "dummy_status_ID"

	var (
		ctx      = context.Background()
		mentionF = suite.parseMention
		authorID = suite.testAccounts["local_account_1"].ID
	)

	return suite.formatter.FromMarkdownBasic(ctx, mentionF, authorID, statusID, input)
}
func (suite *TextStandardTestSuite) FromPlain(input string) *text.FormatResult {
return suite.formatter.FromPlain(
context.Background(),

View file

@ -20,6 +20,8 @@ package text
import (
"bytes"
"context"
"regexp"
"strings"
"codeberg.org/gruf/go-byteutil"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
@ -27,11 +29,15 @@ import (
"github.com/superseriousbusiness/gotosocial/internal/regexes"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark/extension"
"github.com/yuin/goldmark/renderer"
"github.com/yuin/goldmark/renderer/html"
)
// FromMarkdown fulfils FormatFunc by parsing
// the given markdown input into a FormatResult.
//
// Inline (aka unsafe) HTML elements are allowed,
// as they should be sanitized afterwards anyway.
func (f *Formatter) FromMarkdown(
ctx context.Context,
parseMention gtsmodel.ParseMentionFunc,
@ -39,18 +45,85 @@ func (f *Formatter) FromMarkdown(
statusID string,
input string,
) *FormatResult {
result := new(FormatResult)
return f.fromMarkdown(
ctx,
false, // basic = false
parseMention,
authorID,
statusID,
input,
)
}
// FromMarkdownBasic fulfils FormatFunc by parsing the
// given markdown input into a FormatResult in "basic" mode.
//
// In contrast to FromMarkdown, the custom renderer is
// used for emojis only; mentions and tags are left aside.
//
// Inline (aka unsafe) HTML elements are not allowed.
//
// When the resulting HTML consists of just a single
// paragraph, the enclosing <p> tags are removed.
func (f *Formatter) FromMarkdownBasic(
	ctx context.Context,
	parseMention gtsmodel.ParseMentionFunc,
	authorID string,
	statusID string,
	input string,
) *FormatResult {
	const basic = true

	result := f.fromMarkdown(ctx, basic, parseMention, authorID, statusID, input)

	// Drop the wrapping <p></p> pair
	// if output is a lone paragraph.
	result.HTML = unwrapParagraph(result.HTML)

	return result
}
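// fromMarkdown parses the given markdown input into a FormatResult.
//
// If basic is true, raw HTML tags are not allowed and the custom
// renderer is run with emojiOnly = true (mentions and tags are
// left aside). Otherwise raw HTML tags are allowed (the output is
// sanitized at the end anyway) and emojiOnly = false.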
func (f *Formatter) fromMarkdown(
ctx context.Context,
basic bool,
parseMention gtsmodel.ParseMentionFunc,
authorID string,
statusID string,
input string,
) *FormatResult {
var (
result = new(FormatResult)
opts []renderer.Option
)
if basic {
// Don't allow raw HTML tags,
// markdown syntax only.
opts = []renderer.Option{
html.WithXHTML(),
html.WithHardWraps(),
}
} else {
opts = []renderer.Option{
html.WithXHTML(),
html.WithHardWraps(),
// Allow raw HTML tags, we
// sanitize at the end anyway.
html.WithUnsafe(),
}
}
// Instantiate goldmark parser for
// markdown, using custom renderer
// to add hashtag/mention links.
md := goldmark.New(
goldmark.WithRendererOptions(
html.WithXHTML(),
html.WithHardWraps(),
// Allows raw HTML. We sanitize
// at the end so this is OK.
html.WithUnsafe(),
opts...,
),
goldmark.WithExtensions(
&customRenderer{
@ -59,7 +132,9 @@ func (f *Formatter) FromMarkdown(
parseMention,
authorID,
statusID,
false, // emojiOnly = false.
// If basic, pass
// emojiOnly = true.
basic,
result,
},
// Turns URLs into links.
@ -85,8 +160,36 @@ func (f *Formatter) FromMarkdown(
// Clean and shrink HTML.
result.HTML = byteutil.B2S(htmlBytes.Bytes())
result.HTML = SanitizeToHTML(result.HTML)
result.HTML = SanitizeHTML(result.HTML)
result.HTML = MinifyHTML(result.HTML)
return result
}
// parasRegexp matches a bare opening or
// closing paragraph tag, ie., <p> or </p>.
var parasRegexp = regexp.MustCompile(`</?p>`)

// unwrapParagraph returns the content between the outer
// <p> and </p> tags of the given HTML, but only when the
// input starts with <p>, ends with </p>, and has no further
// <p> or </p> tags in between. In every other case the
// input is returned exactly as it was passed in.
func unwrapParagraph(html string) string {
	const (
		openTag  = "<p>"
		closeTag = "</p>"
	)

	// Input must be fully enclosed in
	// paragraph tags to be considered.
	enclosed := strings.HasPrefix(html, openTag) &&
		strings.HasSuffix(html, closeTag)
	if !enclosed {
		return html
	}

	// Cut away the outermost tags.
	inner := html[len(openTag) : len(html)-len(closeTag)]

	// Any paragraph tag remaining inside means
	// there's more than one paragraph: bail.
	if parasRegexp.MatchString(inner) {
		return html
	}

	return inner
}

View file

@ -41,43 +41,45 @@ that was some JSON :)
`
const (
simpleMarkdown = "# Title\n\nHere's a simple text in markdown.\n\nHere's a [link](https://example.org)."
simpleMarkdownExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p>"
withCodeBlockExpected = "<h1>Title</h1><p>Below is some JSON.</p><pre><code class=\"language-json\">{\n &#34;key&#34;: &#34;value&#34;,\n &#34;another_key&#34;: [\n &#34;value1&#34;,\n &#34;value2&#34;\n ]\n}\n</code></pre><p>that was some JSON :)</p>"
withInlineCode = "`Nobody tells you about the <code><del>SECRET CODE</del></code>, do they?`"
withInlineCodeExpected = "<p><code>Nobody tells you about the &lt;code>&lt;del>SECRET CODE&lt;/del>&lt;/code>, do they?</code></p>"
withInlineCode2 = "`Nobody tells you about the </code><del>SECRET CODE</del><code>, do they?`"
withInlineCode2Expected = "<p><code>Nobody tells you about the &lt;/code>&lt;del>SECRET CODE&lt;/del>&lt;code>, do they?</code></p>"
withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"
withHashtagExpected = "<h1>Title</h1><p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>"
withTamilHashtag = "here's a simple status that uses a hashtag in Tamil #தமிழ்"
withTamilHashtagExpected = "<p>here's a simple status that uses a hashtag in Tamil <a href=\"http://localhost:8080/tags/%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>தமிழ்</span></a></p>"
mdWithHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a <a href=\"https://example.org\">link</a>.\n\nHere's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\">"
mdWithHTMLExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p><p>Here's an image:</p>"
mdWithCheekyHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: <script>alert(ahhhh)</script>"
mdWithCheekyHTMLExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a cheeky little script:</p>"
mdWithHashtagInitial = "#welcome #Hashtag"
mdWithHashtagInitialExpected = "<p><a href=\"http://localhost:8080/tags/welcome\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>welcome</span></a> <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a></p>"
mdCodeBlockWithNewlines = "some code coming up\n\n```\n\n\n\n```\nthat was some code"
mdCodeBlockWithNewlinesExpected = "<p>some code coming up</p><pre><code>\n\n\n</code></pre><p>that was some code</p>"
mdWithFootnote = "fox mulder,fbi.[^1]\n\n[^1]: federated bureau of investigation"
mdWithFootnoteExpected = "<p>fox mulder,fbi.[^1]</p><p>[^1]: federated bureau of investigation</p>"
mdWithBlockQuote = "get ready, there's a block quote coming:\n\n>line1\n>line2\n>\n>line3\n\n"
mdWithBlockQuoteExpected = "<p>get ready, there's a block quote coming:</p><blockquote><p>line1<br>line2</p><p>line3</p></blockquote>"
mdHashtagAndCodeBlock = "#Hashtag\n\n```\n#Hashtag\n```"
mdHashtagAndCodeBlockExpected = "<p><a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a></p><pre><code>#Hashtag\n</code></pre>"
mdMentionAndCodeBlock = "@the_mighty_zork\n\n```\n@the_mighty_zork\n```"
mdMentionAndCodeBlockExpected = "<p><span class=\"h-card\"><a href=\"http://localhost:8080/@the_mighty_zork\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>the_mighty_zork</span></a></span></p><pre><code>@the_mighty_zork\n</code></pre>"
mdWithSmartypants = "\"you have to quargle the bleepflorp\" they said with 1/2 of nominal speed and 1/3 of the usual glumping"
mdWithSmartypantsExpected = "<p>\"you have to quargle the bleepflorp\" they said with 1/2 of nominal speed and 1/3 of the usual glumping</p>"
mdWithAsciiHeart = "hello <3 old friend <3 i loved u </3 :(( you stole my heart"
mdWithAsciiHeartExpected = "<p>hello &lt;3 old friend &lt;3 i loved u &lt;/3 :(( you stole my heart</p>"
mdWithStrikethrough = "I have ~~mdae~~ made an error"
mdWithStrikethroughExpected = "<p>I have <del>mdae</del> made an error</p>"
mdWithLink = "Check out this code, i heard it was written by a sloth https://github.com/superseriousbusiness/gotosocial"
mdWithLinkExpected = "<p>Check out this code, i heard it was written by a sloth <a href=\"https://github.com/superseriousbusiness/gotosocial\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">https://github.com/superseriousbusiness/gotosocial</a></p>"
mdObjectInCodeBlock = "@foss_satan@fossbros-anonymous.io this is how to mention a user\n```\n@the_mighty_zork hey bud! nice #ObjectOrientedProgramming software you've been writing lately! :rainbow:\n```\nhope that helps"
mdObjectInCodeBlockExpected = "<p><span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span> this is how to mention a user</p><pre><code>@the_mighty_zork hey bud! nice #ObjectOrientedProgramming software you&#39;ve been writing lately! :rainbow:\n</code></pre><p>hope that helps</p>"
simpleMarkdown = "# Title\n\nHere's a simple text in markdown.\n\nHere's a [link](https://example.org)."
simpleMarkdownExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p>"
withCodeBlockExpected = "<h1>Title</h1><p>Below is some JSON.</p><pre><code class=\"language-json\">{\n &#34;key&#34;: &#34;value&#34;,\n &#34;another_key&#34;: [\n &#34;value1&#34;,\n &#34;value2&#34;\n ]\n}\n</code></pre><p>that was some JSON :)</p>"
withInlineCode = "`Nobody tells you about the <code><del>SECRET CODE</del></code>, do they?`"
withInlineCodeExpected = "<p><code>Nobody tells you about the &lt;code>&lt;del>SECRET CODE&lt;/del>&lt;/code>, do they?</code></p>"
withInlineCode2 = "`Nobody tells you about the </code><del>SECRET CODE</del><code>, do they?`"
withInlineCode2Expected = "<p><code>Nobody tells you about the &lt;/code>&lt;del>SECRET CODE&lt;/del>&lt;code>, do they?</code></p>"
withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"
withHashtagExpected = "<h1>Title</h1><p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>"
withTamilHashtag = "here's a simple status that uses a hashtag in Tamil #தமிழ்"
withTamilHashtagExpected = "<p>here's a simple status that uses a hashtag in Tamil <a href=\"http://localhost:8080/tags/%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>தமிழ்</span></a></p>"
mdWithHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a <a href=\"https://example.org\">link</a>.\n\nHere's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\">"
mdWithHTMLExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p><p>Here's an image:</p>"
mdWithCheekyHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: <script>alert(ahhhh)</script>"
mdWithCheekyHTMLExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a cheeky little script:</p>"
mdWithHashtagInitial = "#welcome #Hashtag"
mdWithHashtagInitialExpected = "<p><a href=\"http://localhost:8080/tags/welcome\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>welcome</span></a> <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a></p>"
mdCodeBlockWithNewlines = "some code coming up\n\n```\n\n\n\n```\nthat was some code"
mdCodeBlockWithNewlinesExpected = "<p>some code coming up</p><pre><code>\n\n\n</code></pre><p>that was some code</p>"
mdWithFootnote = "fox mulder,fbi.[^1]\n\n[^1]: federated bureau of investigation"
mdWithFootnoteExpected = "<p>fox mulder,fbi.[^1]</p><p>[^1]: federated bureau of investigation</p>"
mdWithBlockQuote = "get ready, there's a block quote coming:\n\n>line1\n>line2\n>\n>line3\n\n"
mdWithBlockQuoteExpected = "<p>get ready, there's a block quote coming:</p><blockquote><p>line1<br>line2</p><p>line3</p></blockquote>"
mdHashtagAndCodeBlock = "#Hashtag\n\n```\n#Hashtag\n```"
mdHashtagAndCodeBlockExpected = "<p><a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a></p><pre><code>#Hashtag\n</code></pre>"
mdMentionAndCodeBlock = "@the_mighty_zork\n\n```\n@the_mighty_zork\n```"
mdMentionAndCodeBlockExpected = "<p><span class=\"h-card\"><a href=\"http://localhost:8080/@the_mighty_zork\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>the_mighty_zork</span></a></span></p><pre><code>@the_mighty_zork\n</code></pre>"
mdMentionAndCodeBlockBasicExpected = "<p>@the_mighty_zork</p><pre><code>@the_mighty_zork\n</code></pre>"
mdWithSmartypants = "\"you have to quargle the bleepflorp\" they said with 1/2 of nominal speed and 1/3 of the usual glumping"
mdWithSmartypantsExpected = "<p>\"you have to quargle the bleepflorp\" they said with 1/2 of nominal speed and 1/3 of the usual glumping</p>"
mdWithAsciiHeart = "hello <3 old friend <3 i loved u </3 :(( you stole my heart"
mdWithAsciiHeartExpected = "<p>hello &lt;3 old friend &lt;3 i loved u &lt;/3 :(( you stole my heart</p>"
mdWithStrikethrough = "I have ~~mdae~~ made an error"
mdWithStrikethroughExpected = "<p>I have <del>mdae</del> made an error</p>"
mdWithLink = "Check out this code, i heard it was written by a sloth https://github.com/superseriousbusiness/gotosocial"
mdWithLinkExpected = "<p>Check out this code, i heard it was written by a sloth <a href=\"https://github.com/superseriousbusiness/gotosocial\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">https://github.com/superseriousbusiness/gotosocial</a></p>"
mdWithLinkBasicExpected = "Check out this code, i heard it was written by a sloth <a href=\"https://github.com/superseriousbusiness/gotosocial\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">https://github.com/superseriousbusiness/gotosocial</a>"
mdObjectInCodeBlock = "@foss_satan@fossbros-anonymous.io this is how to mention a user\n```\n@the_mighty_zork hey bud! nice #ObjectOrientedProgramming software you've been writing lately! :rainbow:\n```\nhope that helps"
mdObjectInCodeBlockExpected = "<p><span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span> this is how to mention a user</p><pre><code>@the_mighty_zork hey bud! nice #ObjectOrientedProgramming software you&#39;ve been writing lately! :rainbow:\n</code></pre><p>hope that helps</p>"
// Hashtags can be italicized but only with *, not _.
mdItalicHashtag = "*#hashtag*"
mdItalicHashtagExpected = "<p><em><a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a></em></p>"
@ -169,6 +171,11 @@ func (suite *MarkdownTestSuite) TestParseMentionWithCodeBlock() {
suite.Equal(mdMentionAndCodeBlockExpected, formatted.HTML)
}
// TestParseMentionWithCodeBlockBasic checks that basic markdown
// parsing leaves a mention as plain text, both in the leading
// paragraph and inside the fenced code block.
func (suite *MarkdownTestSuite) TestParseMentionWithCodeBlockBasic() {
	result := suite.FromMarkdownBasic(mdMentionAndCodeBlock)
	html := result.HTML
	suite.Equal(mdMentionAndCodeBlockBasicExpected, html)
}
func (suite *MarkdownTestSuite) TestParseSmartypants() {
formatted := suite.FromMarkdown(mdWithSmartypants)
suite.Equal(mdWithSmartypantsExpected, formatted.HTML)
@ -189,6 +196,11 @@ func (suite *MarkdownTestSuite) TestParseLink() {
suite.Equal(mdWithLinkExpected, formatted.HTML)
}
// TestParseLinkBasic checks that basic markdown parsing still
// turns a URL into a link, and that the single resulting
// paragraph is not wrapped in <p> tags.
func (suite *MarkdownTestSuite) TestParseLinkBasic() {
	result := suite.FromMarkdownBasic(mdWithLink)
	html := result.HTML
	suite.Equal(mdWithLinkBasicExpected, html)
}
func (suite *MarkdownTestSuite) TestParseObjectInCodeBlock() {
formatted := suite.FromMarkdown(mdObjectInCodeBlock)
suite.Equal(mdObjectInCodeBlockExpected, formatted.HTML)

View file

@ -20,8 +20,11 @@ package text
import (
"bytes"
"context"
gohtml "html"
"strings"
"codeberg.org/gruf/go-byteutil"
"github.com/k3a/html2text"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/regexes"
@ -52,7 +55,7 @@ func (f *Formatter) FromPlain(
return f.fromPlain(
ctx,
plainTextParser,
false, // emojiOnly = false
false, // basic = false
parseMention,
authorID,
statusID,
@ -85,7 +88,7 @@ func (f *Formatter) FromPlainNoParagraph(
return f.fromPlain(
ctx,
plainTextParser,
false, // emojiOnly = false
false, // basic = false
parseMention,
authorID,
statusID,
@ -93,12 +96,14 @@ func (f *Formatter) FromPlainNoParagraph(
)
}
// FromPlainEmojiOnly fulfils FormatFunc by parsing
// FromPlainBasic fulfils FormatFunc by parsing
// the given plaintext input into a FormatResult.
//
// Unlike FromPlain, it will only parse emojis with
// the custom renderer, leaving aside mentions and tags.
func (f *Formatter) FromPlainEmojiOnly(
//
// Resulting HTML will also NOT be wrapped in <p> tags.
func (f *Formatter) FromPlainBasic(
ctx context.Context,
parseMention gtsmodel.ParseMentionFunc,
authorID string,
@ -116,7 +121,7 @@ func (f *Formatter) FromPlainEmojiOnly(
return f.fromPlain(
ctx,
plainTextParser,
true, // emojiOnly = true
true, // basic = true
parseMention,
authorID,
statusID,
@ -130,7 +135,7 @@ func (f *Formatter) FromPlainEmojiOnly(
func (f *Formatter) fromPlain(
ctx context.Context,
plainTextParser parser.Parser,
emojiOnly bool,
basic bool,
parseMention gtsmodel.ParseMentionFunc,
authorID string,
statusID string,
@ -156,7 +161,9 @@ func (f *Formatter) fromPlain(
parseMention,
authorID,
statusID,
emojiOnly,
// If basic, pass
// emojiOnly = true.
basic,
result,
},
// Turns URLs into links.
@ -181,8 +188,51 @@ func (f *Formatter) fromPlain(
// Clean and shrink HTML.
result.HTML = byteutil.B2S(htmlBytes.Bytes())
result.HTML = SanitizeToHTML(result.HTML)
result.HTML = SanitizeHTML(result.HTML)
result.HTML = MinifyHTML(result.HTML)
return result
}
// ParseHTMLToPlain converts the given HTML string to
// plaintext, trying to keep as much of the semantic
// intent of the input HTML as possible, ie., `<br>`s
// become newlines, text inside `<strong>` and `<em>`
// tags is kept but without emphasis, `<a>` links keep
// their inner text with the linked URL placed in angle
// brackets next to it, lists become newline-separated
// items, etc.
//
// Leading and trailing whitespace is
// trimmed from the returned string.
//
// This function is useful when you need to filter on
// HTML and want to avoid catching tags in the filter,
// or when you want to serve something in a plaintext
// format that may contain HTML tags (eg., CWs).
func ParseHTMLToPlain(html string) string {
	return strings.TrimSpace(
		html2text.HTML2TextWithOptions(
			html,
			html2text.WithLinksInnerText(),
			html2text.WithUnixLineBreaks(),
			html2text.WithListSupport(),
		),
	)
}
// StripHTMLFromText passes text through the strict
// sanitizer to remove any HTML from it entirely, making
// no attempt to preserve the semantic intent of HTML tags.
//
// Use this when the input was never allowed to contain
// HTML in the first place, and neither is the output.
//
// Leading and trailing whitespace is
// trimmed from the returned string.
func StripHTMLFromText(text string) string {
	// Decode entities up front so that
	// escaped tags can't sneak past.
	unescaped := gohtml.UnescapeString(text)

	// Strip out all detected HTML.
	stripped := strict.Sanitize(unescaped)

	// Decode once more so that
	// the result is plaintext.
	plain := gohtml.UnescapeString(stripped)

	return strings.TrimSpace(plain)
}

View file

@ -21,6 +21,7 @@ import (
"testing"
"github.com/stretchr/testify/suite"
"github.com/superseriousbusiness/gotosocial/internal/text"
)
const (
@ -183,6 +184,150 @@ func (suite *PlainTestSuite) TestNumbersAreNotHashtags() {
suite.Len(f.Tags, 0)
}
func (suite *PlainTestSuite) TestParseHTMLToPlain() {
for _, t := range []struct {
html string
expectedPlain string
}{
{
// Check newlines between paras preserved.
html: "<p>butting into a serious discussion about programming languages*: \"elixir? I barely know 'er! honk honk!\"</p><p><small>*insofar as any discussion about programming languages can truly be considered \"serious\" since programmers are fucking clowns</small></p>",
expectedPlain: `butting into a serious discussion about programming languages*: "elixir? I barely know 'er! honk honk!"
*insofar as any discussion about programming languages can truly be considered "serious" since programmers are fucking clowns`,
},
{
// This one looks a bit wacky but nobody should
// be putting definition lists in summaries *really*.
html: "<dl class=\"status-stats\"><div class=\"stats-grouping\"><div class=\"stats-item published-at text-cutoff\"><dt class=\"sr-only\">Published</dt><dd><time class=\"dt-published\" datetime=\"2025-01-15T23:49:59.299Z\">Jan 16, 2025, 00:49</time></dd></div><div class=\"stats-grouping\"><div class=\"stats-item\" title=\"Replies\"><dt><span class=\"sr-only\">Replies</span><i class=\"fa fa-reply-all\" aria-hidden=\"true\"></i></dt><dd>0</dd></div><div class=\"stats-item\" title=\"Faves\"><dt><span class=\"sr-only\">Favourites</span><i class=\"fa fa-star\" aria-hidden=\"true\"></i></dt><dd>4</dd></div><div class=\"stats-item\" title=\"Boosts\"><dt><span class=\"sr-only\">Reblogs</span><i class=\"fa fa-retweet\" aria-hidden=\"true\"></i></dt><dd>0</dd></div></div></div><div class=\"stats-item language\" title=\"English\"><dt class=\"sr-only\">Language</dt><dd><span class=\"sr-only\">English</span><span aria-hidden=\"true\">en</span></dd></div></dl>",
expectedPlain: `PublishedJan 16, 2025, 00:49Replies0Favourites4Reblogs0LanguageEnglishen`,
},
{
// Check <br> converted to newlines and leading / trailing space removed.
html: " <p>i'm a milf,<br>i'm a lover,<br>do your mom,<br>do your brother</p><p>i'm a sinner,<br>i'm a saint,<br>i will not be ashamed!</p><br> <br>",
expectedPlain: `i'm a milf,
i'm a lover,
do your mom,
do your brother
i'm a sinner,
i'm a saint,
i will not be ashamed!`,
},
{
// Check newlines, links, lists still more or less readable as such.
html: "<p>Hello everyone, after a week or two down the release candidate mines, we've emerged blinking into the light carrying with us <a href=\"https://gts.superseriousbusiness.org/tags/gotosocial\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>GoToSocial</span></a> <strong>v0.18.0 Scroingly Sloth</strong>!</p><p><a href=\"https://github.com/superseriousbusiness/gotosocial/releases/tag/v0.18.0\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">https://github.com/superseriousbusiness/gotosocial/releases/tag/v0.18.0</a></p><p>Please read the migration notes carefully for instructions on how to upgrade to this version. <strong>This version contains several very long migrations so you will need to be patient when upgrading, and backup your database first!!</strong></p><p><strong>Release highlights</strong></p><ul><li><strong>Status edit support</strong>: one of our most-requested features! You can now edit your own statuses, and see instance edit history from other accounts too (if your instance has them stored).</li><li><strong>Push notifications</strong>: probably the second most-requested feature! GoToSocial can now send push notifications to clients via their configured push providers.<br>You may need to uninstall / reinstall client applications, or log out and back in again, for this feature to work. 
(And if you're using Tusky, <a href=\"https://tusky.app/faq/#why-are-notifications-less-frequent-with-tusky\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">make sure you've got ntfy installed</a>).</li><li><strong>Global instance css customization</strong>: admins can now apply custom CSS across their entire instance via the settings panel.</li><li><strong>Domain permission subscriptions</strong>: it's now possible to configure your instance to subscribe to CSV, JSON, or plaintext lists of domain permissions.<br>Each night, your instance will fetch and automatically create domain permissions (or permission drafts) based on what it finds in a subscribed list.<br>See the <a href=\"https://docs.gotosocial.org/en/latest/admin/domain_permission_subscriptions/\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">domain permission subscription documentation</a> for more information.</li><li><strong>Trusted-proxies helper</strong>: instances with improperly configured trusted-proxies settings will now show a warning on the homepage, so admins can make sure their instance is configured correctly. Check your own instance homepage after updating to see if you need to do anything.</li><li><strong>Better outbox sorting</strong>: messages from GoToSocial are now delivered more quickly to people you mention, so conversations across instances should feel a bit snappier.</li><li><strong>Log in button</strong>: there's now a login button in the top right of the instance homepage, which leads to a helpful page about clients, with a link to the settings panel. Should make things less confusing for new users!</li><li><strong>Granular stats controls</strong>: with the <code>instance-stats-mode</code> setting, admins can now choose if and how their instance serves stats via the nodeinfo endpoints. Existing behavior from v0.17.0 is the default.</li><li><strong>Post backdating</strong>: via the API you can now backdate posts (if enabled in config.yaml). 
This is our first step towards making it possible to import your post history from elsewhere into your GoToSocial instance. While there's no way to do this in the settings panel yet, you can already use third-party tools like Slurp to import posts from a Mastodon export (see <a href=\"https://github.com/VyrCossont/slurp\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">Slurp</a>).</li><li><strong>Configurable sign-up limits</strong>: you can now configure your sign-up backlog length and sign-up throttling (defaults remain the same).</li><li><strong>NetBSD and FreeBSD builds</strong>: yep!</li><li><strong>Respect users <code>prefers-color-scheme</code> preference</strong>: there's now a light mode default theme to complement our trusty dark mode theme, and the theme will switch based on a visitor's <code>prefers-color-scheme</code> configuration. This applies to all page and profiles, with the exception of some custom themes. Works in the settings panel too!</li></ul><p>Thanks for reading! And seriously back up your database.</p>",
expectedPlain: `Hello everyone, after a week or two down the release candidate mines, we've emerged blinking into the light carrying with us #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> v0.18.0 Scroingly Sloth!
https://github.com/superseriousbusiness/gotosocial/releases/tag/v0.18.0 <https://github.com/superseriousbusiness/gotosocial/releases/tag/v0.18.0>
Please read the migration notes carefully for instructions on how to upgrade to this version. This version contains several very long migrations so you will need to be patient when upgrading, and backup your database first!!
Release highlights
- Status edit support: one of our most-requested features! You can now edit your own statuses, and see instance edit history from other accounts too (if your instance has them stored).
- Push notifications: probably the second most-requested feature! GoToSocial can now send push notifications to clients via their configured push providers.
You may need to uninstall / reinstall client applications, or log out and back in again, for this feature to work. (And if you're using Tusky, make sure you've got ntfy installed <https://tusky.app/faq/#why-are-notifications-less-frequent-with-tusky>).
- Global instance css customization: admins can now apply custom CSS across their entire instance via the settings panel.
- Domain permission subscriptions: it's now possible to configure your instance to subscribe to CSV, JSON, or plaintext lists of domain permissions.
Each night, your instance will fetch and automatically create domain permissions (or permission drafts) based on what it finds in a subscribed list.
See the domain permission subscription documentation <https://docs.gotosocial.org/en/latest/admin/domain_permission_subscriptions/> for more information.
- Trusted-proxies helper: instances with improperly configured trusted-proxies settings will now show a warning on the homepage, so admins can make sure their instance is configured correctly. Check your own instance homepage after updating to see if you need to do anything.
- Better outbox sorting: messages from GoToSocial are now delivered more quickly to people you mention, so conversations across instances should feel a bit snappier.
- Log in button: there's now a login button in the top right of the instance homepage, which leads to a helpful page about clients, with a link to the settings panel. Should make things less confusing for new users!
- Granular stats controls: with the instance-stats-mode setting, admins can now choose if and how their instance serves stats via the nodeinfo endpoints. Existing behavior from v0.17.0 is the default.
- Post backdating: via the API you can now backdate posts (if enabled in config.yaml). This is our first step towards making it possible to import your post history from elsewhere into your GoToSocial instance. While there's no way to do this in the settings panel yet, you can already use third-party tools like Slurp to import posts from a Mastodon export (see Slurp <https://github.com/VyrCossont/slurp>).
- Configurable sign-up limits: you can now configure your sign-up backlog length and sign-up throttling (defaults remain the same).
- NetBSD and FreeBSD builds: yep!
- Respect users prefers-color-scheme preference: there's now a light mode default theme to complement our trusty dark mode theme, and the theme will switch based on a visitor's prefers-color-scheme configuration. This applies to all page and profiles, with the exception of some custom themes. Works in the settings panel too!
Thanks for reading! And seriously back up your database.`,
},
} {
plain := text.ParseHTMLToPlain(t.html)
suite.Equal(t.expectedPlain, plain)
}
}
// TestStripCaption1 expects a <script> element and its
// contents to be dropped, leaving only the caption text.
func (suite *PlainTestSuite) TestStripCaption1() {
	const caption = "<script>console.log('haha!')</script>this is just a normal caption ;)"
	suite.Equal(
		"this is just a normal caption ;)",
		text.StripHTMLFromText(caption),
	)
}
// TestStripCaption2 expects <em> tags to be removed
// while the text they wrap is kept as-is.
func (suite *PlainTestSuite) TestStripCaption2() {
	const caption = "<em>here's a LOUD caption</em>"
	suite.Equal(
		"here's a LOUD caption",
		text.StripHTMLFromText(caption),
	)
}
// TestStripCaption3 expects an empty
// caption to come back empty.
func (suite *PlainTestSuite) TestStripCaption3() {
	const caption = ""
	suite.Equal("", text.StripHTMLFromText(caption))
}
// TestStripCaption4 expects the newlines surrounding a
// multi-line caption to be trimmed, while the newlines
// between its lines are preserved.
func (suite *PlainTestSuite) TestStripCaption4() {
	const caption = `
here is
a multi line
caption
with some newlines
`
	suite.Equal(
		"here is\na multi line\ncaption\nwith some newlines",
		text.StripHTMLFromText(caption),
	)
}
// TestStripCaption5 expects a <script> element written with
// named HTML entities to be removed just like a literal one.
func (suite *PlainTestSuite) TestStripCaption5() {
	// html-escaped: "<script>console.log('aha!')</script> hello world"
	const caption = `&lt;script&gt;console.log(&apos;aha!&apos;)&lt;/script&gt; hello world`
	suite.Equal(
		"hello world",
		text.StripHTMLFromText(caption),
	)
}
// TestStripCaption6 expects a <script> element written almost
// entirely as numeric/named character references to be removed,
// leaving only the decoded trailing text.
func (suite *PlainTestSuite) TestStripCaption6() {
	// html-encoded: "<script>console.log('aha!')</script> hello world"
	const caption = `&lt;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#99;&#111;&#110;&#115;&#111;&#108;&#101;&period;&#108;&#111;&#103;&lpar;&apos;&#97;&#104;&#97;&excl;&apos;&rpar;&lt;&sol;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#32;&#104;&#101;&#108;&#108;&#111;&#32;&#119;&#111;&#114;&#108;&#100;`
	suite.Equal(
		"hello world",
		text.StripHTMLFromText(caption),
	)
}
// TestStripCustomCSS expects a plain CSS rule,
// which contains no HTML, to pass through unchanged.
func (suite *PlainTestSuite) TestStripCustomCSS() {
	const css = `.toot .username {
color: var(--link_fg);
line-height: 2rem;
margin-top: -0.5rem;
align-self: start;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}`
	// Output must match the input exactly.
	suite.Equal(css, text.StripHTMLFromText(css))
}
// TestStripNaughtyCustomCSS1 expects an input made up solely
// of a </style> breakout plus a <title> element to be reduced
// to nothing.
func (suite *PlainTestSuite) TestStripNaughtyCustomCSS1() {
	// Attempt to escape the enclosing <style> tag
	// and overwrite the document title in <head>.
	const css = "</style><title>pee pee poo poo</title><style>"
	suite.Empty(text.StripHTMLFromText(css))
}
// TestStripNaughtyCustomCSS2 expects the text preceding a
// </style> breakout attempt to survive while the tags that
// follow it are removed.
func (suite *PlainTestSuite) TestStripNaughtyCustomCSS2() {
	// Attempt to escape the enclosing <style> tag
	// and overwrite the document title in <head>.
	const css = "pee pee poo poo</style><title></title><style>"
	suite.Equal(
		"pee pee poo poo",
		text.StripHTMLFromText(css),
	)
}
// TestPlainTestSuite is the go test entry point
// that runs every test in PlainTestSuite.
func TestPlainTestSuite(t *testing.T) {
	suite.Run(t, &PlainTestSuite{})
}

View file

@ -1,56 +0,0 @@
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package text
import (
"testing"
"github.com/stretchr/testify/suite"
)
// Fixtures for the removeHTML tests below.
const (
// HTML input containing a mention link, a
// hashtag link, and several <br/> line breaks.
test_removeHTML = `<p>Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>foss_satan</span></a></span><br/><br/><a href="http://localhost:8080/tags/Hashtag" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>Hashtag</span></a><br/><br/>Text</p>`
// Expected output for test_removeHTML: only the text
// remains, with nothing inserted where tags used to be.
test_removedHTML = `Another test @foss_satan#HashtagText`
// Raw (backtick) string, so `\u0026` is NOT interpreted
// and stays as those six literal characters.
test_withEscapedLiteral = `it\u0026amp;#39;s its it is`
// Expected output is identical to the raw input.
test_withEscapedLiteralExpected = `it\u0026amp;#39;s its it is`
// Interpreted (double-quoted) string, so \u0026 becomes
// '&' and the value starts with "it&amp;#39;s".
test_withEscaped = "it\u0026amp;#39;s its it is"
// Expected output has the same value as test_withEscaped.
test_withEscapedExpected = "it&amp;#39;s its it is"
)
// RemoveHTMLTestSuite groups the tests for removeHTML.
// It embeds suite.Suite and carries no state of its own.
type RemoveHTMLTestSuite struct {
suite.Suite
}
// TestSanitizeWithEscapedLiteral runs removeHTML over the
// raw-string fixture and compares against its expected value.
func (suite *RemoveHTMLTestSuite) TestSanitizeWithEscapedLiteral() {
	suite.Equal(
		test_withEscapedLiteralExpected,
		removeHTML(test_withEscapedLiteral),
	)
}
// TestSanitizeWithEscaped runs removeHTML over the
// interpreted-string fixture and compares against
// its expected value.
func (suite *RemoveHTMLTestSuite) TestSanitizeWithEscaped() {
	suite.Equal(
		test_withEscapedExpected,
		removeHTML(test_withEscaped),
	)
}
// TestRemoveHTML runs removeHTML over the HTML fixture
// and expects only its text content to remain.
func (suite *RemoveHTMLTestSuite) TestRemoveHTML() {
	suite.Equal(
		test_removedHTML,
		removeHTML(test_removeHTML),
	)
}
// TestRemoveHTMLTestSuite is the go test entry point
// that runs every test in RemoveHTMLTestSuite.
func TestRemoveHTMLTestSuite(t *testing.T) {
	suite.Run(t, new(RemoveHTMLTestSuite))
}

View file

@ -18,9 +18,7 @@
package text
import (
"html"
"regexp"
"strings"
"github.com/microcosm-cc/bluemonday"
)
@ -163,29 +161,10 @@ var regular *bluemonday.Policy = func() *bluemonday.Policy {
// Source: https://github.com/microcosm-cc/bluemonday#usage
var strict = bluemonday.StrictPolicy()
// removeHTML passes the given string through the
// strict policy's Sanitize method and returns the
// sanitized result.
func removeHTML(in string) string {
	stripped := strict.Sanitize(in)
	return stripped
}
// SanitizeToHTML sanitizes only risky html elements
// SanitizeHTML sanitizes only risky html elements
// from the given string, allowing safe ones through.
func SanitizeToHTML(in string) string {
return regular.Sanitize(in)
}
// SanitizeToPlaintext runs text through basic sanitization.
// This removes any html elements that were in the string,
// and returns clean plaintext.
func SanitizeToPlaintext(in string) string {
// Unescape first to catch any tricky critters.
content := html.UnescapeString(in)
// Remove all detected HTML.
content = removeHTML(content)
// Unescape again to return plaintext.
content = html.UnescapeString(content)
return strings.TrimSpace(content)
//
// It returns an HTML string.
func SanitizeHTML(html string) string {
return regular.Sanitize(html)
}

View file

@ -36,95 +36,18 @@ type SanitizeTestSuite struct {
}
// TestSanitizeOutgoing checks that sanitizing the
// sanitizeOutgoing fixture yields sanitizedOutgoing.
//
// NOTE(review): the two `s :=` lines look like the removed and
// added sides of a rename (SanitizeToHTML -> SanitizeHTML) shown
// together; presumably only one exists per revision -- confirm.
func (suite *SanitizeTestSuite) TestSanitizeOutgoing() {
s := text.SanitizeToHTML(sanitizeOutgoing)
s := text.SanitizeHTML(sanitizeOutgoing)
suite.Equal(sanitizedOutgoing, s)
}
// TestSanitizeHTML checks that sanitizing the
// sanitizeHTML fixture yields sanitizedHTML.
//
// NOTE(review): the two `s :=` lines look like the removed and
// added sides of a rename (SanitizeToHTML -> SanitizeHTML) shown
// together; presumably only one exists per revision -- confirm.
func (suite *SanitizeTestSuite) TestSanitizeHTML() {
s := text.SanitizeToHTML(sanitizeHTML)
s := text.SanitizeHTML(sanitizeHTML)
suite.Equal(sanitizedHTML, s)
}
// TestSanitizeCaption1 expects a <script> element and its
// contents to be dropped, leaving only the caption text.
func (suite *SanitizeTestSuite) TestSanitizeCaption1() {
	const caption = "<script>console.log('haha!')</script>this is just a normal caption ;)"
	suite.Equal(
		"this is just a normal caption ;)",
		text.SanitizeToPlaintext(caption),
	)
}
// TestSanitizeCaption2 expects <em> tags to be removed
// while the text they wrap is kept as-is.
func (suite *SanitizeTestSuite) TestSanitizeCaption2() {
	const caption = "<em>here's a LOUD caption</em>"
	suite.Equal(
		"here's a LOUD caption",
		text.SanitizeToPlaintext(caption),
	)
}
// TestSanitizeCaption3 expects an empty
// caption to come back empty.
func (suite *SanitizeTestSuite) TestSanitizeCaption3() {
	const caption = ""
	suite.Equal("", text.SanitizeToPlaintext(caption))
}
// TestSanitizeCaption4 expects the newlines surrounding a
// multi-line caption to be trimmed, while the newlines
// between its lines are preserved.
func (suite *SanitizeTestSuite) TestSanitizeCaption4() {
	const caption = `
here is
a multi line
caption
with some newlines
`
	suite.Equal(
		"here is\na multi line\ncaption\nwith some newlines",
		text.SanitizeToPlaintext(caption),
	)
}
// TestSanitizeCaption5 expects a <script> element written with
// named HTML entities to be removed just like a literal one.
func (suite *SanitizeTestSuite) TestSanitizeCaption5() {
	// html-escaped: "<script>console.log('aha!')</script> hello world"
	const caption = `&lt;script&gt;console.log(&apos;aha!&apos;)&lt;/script&gt; hello world`
	suite.Equal(
		"hello world",
		text.SanitizeToPlaintext(caption),
	)
}
// TestSanitizeCaption6 expects a <script> element written almost
// entirely as numeric/named character references to be removed,
// leaving only the decoded trailing text.
func (suite *SanitizeTestSuite) TestSanitizeCaption6() {
	// html-encoded: "<script>console.log('aha!')</script> hello world"
	const caption = `&lt;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#99;&#111;&#110;&#115;&#111;&#108;&#101;&period;&#108;&#111;&#103;&lpar;&apos;&#97;&#104;&#97;&excl;&apos;&rpar;&lt;&sol;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#32;&#104;&#101;&#108;&#108;&#111;&#32;&#119;&#111;&#114;&#108;&#100;`
	suite.Equal(
		"hello world",
		text.SanitizeToPlaintext(caption),
	)
}
// TestSanitizeCustomCSS expects a plain CSS rule,
// which contains no HTML, to pass through unchanged.
func (suite *SanitizeTestSuite) TestSanitizeCustomCSS() {
	const css = `.toot .username {
color: var(--link_fg);
line-height: 2rem;
margin-top: -0.5rem;
align-self: start;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}`
	// Output must match the input exactly.
	suite.Equal(css, text.SanitizeToPlaintext(css))
}
// TestSanitizeNaughtyCustomCSS1 expects an input made up
// solely of a </style> breakout plus a <title> element to
// be reduced to nothing.
func (suite *SanitizeTestSuite) TestSanitizeNaughtyCustomCSS1() {
	// Attempt to escape the enclosing <style> tag
	// and overwrite the document title in <head>.
	const css = "</style><title>pee pee poo poo</title><style>"
	suite.Empty(text.SanitizeToPlaintext(css))
}
// TestSanitizeNaughtyCustomCSS2 expects the text preceding
// a </style> breakout attempt to survive while the tags
// that follow it are removed.
func (suite *SanitizeTestSuite) TestSanitizeNaughtyCustomCSS2() {
	// Attempt to escape the enclosing <style> tag
	// and overwrite the document title in <head>.
	const css = "pee pee poo poo</style><title></title><style>"
	suite.Equal(
		"pee pee poo poo",
		text.SanitizeToPlaintext(css),
	)
}
// TestSanitizeInlineImg checks that an <img> element inside a
// paragraph is removed by sanitization, while the enclosing <p>
// and its text are kept (with the apostrophe escaped as &#39;).
//
// NOTE(review): the two `sanitized :=` lines look like the removed
// and added sides of a rename (SanitizeToHTML -> SanitizeHTML) shown
// together; presumably only one exists per revision -- confirm.
func (suite *SanitizeTestSuite) TestSanitizeInlineImg() {
withInlineImg := "<p>Here's an inline image: <img class=\"fixed-size-img svelte-uci8eb\" aria-hidden=\"false\" alt=\"A black-and-white photo of an Oblique Strategy card. The card reads: 'Define an area as 'safe' and use it as an anchor'.\" title=\"A black-and-white photo of an Oblique Strategy card. The card reads: 'Define an area as 'safe' and use it as an anchor'.\" width=\"0\" height=\"0\" src=\"https://example.org/fileserver/01H7J83147QMCE17C0RS9P10Y9/attachment/small/01H7J8365XXRTCP6CAMGEM49ZE.jpg\" style=\"object-position: 50% 50%;\"></p>"
sanitized := text.SanitizeToHTML(withInlineImg)
sanitized := text.SanitizeHTML(withInlineImg)
suite.Equal(`<p>Here&#39;s an inline image: </p>`, sanitized)
}