[bugfix] html escape special characters in text instead of totally removing them (#719)

* remove minify dependency

* tidy up some tests

* remove pre + postformat funcs

* rework sanitization + formatting

* update tests

* add some more markdown tests
This commit is contained in:
tobi 2022-07-19 15:21:17 +02:00 committed by GitHub
commit c84384e660
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
51 changed files with 129 additions and 7419 deletions

View file

@ -21,7 +21,6 @@ package text
import (
"bytes"
"context"
"html"
"strings"
"unicode"
@ -30,38 +29,6 @@ import (
"github.com/superseriousbusiness/gotosocial/internal/regexes"
)
// preformat contains some common logic for making a string ready for formatting, which should be used for all user-input text.
func preformat(in string) string {
// do some preformatting of the text
// 1. unescape everything that might be html escaped
s := html.UnescapeString(in)
// 2. trim leading or trailing whitespace
s = strings.TrimSpace(s)
return s
}
// postformat contains some common logic for html sanitization of text, wrapping elements, and trimming newlines and whitespace
func postformat(in string) string {
// do some postformatting of the text
// 1. sanitize html to remove potentially dangerous elements
s := SanitizeHTML(in)
// 2. the sanitize step tends to escape characters inside codeblocks, which is behavior we don't want, so unescape everything again
s = html.UnescapeString(s)
// 3. minify html to remove any trailing newlines, spaces, unnecessary elements, etc etc
mini, err := MinifyHTML(s)
if err != nil {
// if the minify failed, just return what we have
return s
}
// return minified version of the html
return mini
}
func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string {
return regexes.ReplaceAllStringFunc(regexes.HashtagFinder, in, func(match string, buf *bytes.Buffer) string {
// we have a match

View file

@ -28,44 +28,14 @@ import (
)
const (
replaceMentionsString = `Another test @foss_satan@fossbros-anonymous.io
#Hashtag
Text`
replaceMentionsExpected = `Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention">@<span>foss_satan</span></a></span>
#Hashtag
Text`
replaceHashtagsExpected = `Another test @foss_satan@fossbros-anonymous.io
<a href="http://localhost:8080/tags/Hashtag" class="mention hashtag" rel="tag">#<span>Hashtag</span></a>
Text`
replaceHashtagsAfterMentionsExpected = `Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention">@<span>foss_satan</span></a></span>
<a href="http://localhost:8080/tags/Hashtag" class="mention hashtag" rel="tag">#<span>Hashtag</span></a>
Text`
replaceMentionsWithLinkString = `Another test @foss_satan@fossbros-anonymous.io
http://fossbros-anonymous.io/@foss_satan/statuses/6675ee73-fccc-4562-a46a-3e8cd9798060`
replaceMentionsWithLinkStringExpected = `Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention">@<span>foss_satan</span></a></span>
http://fossbros-anonymous.io/@foss_satan/statuses/6675ee73-fccc-4562-a46a-3e8cd9798060`
replaceMentionsWithLinkSelfString = `Mentioning myself: @the_mighty_zork
and linking to my own status: https://localhost:8080/@the_mighty_zork/statuses/01FGXKJRX2PMERJQ9EQF8Y6HCR`
replaceMemtionsWithLinkSelfExpected = `Mentioning myself: <span class="h-card"><a href="http://localhost:8080/@the_mighty_zork" class="u-url mention">@<span>the_mighty_zork</span></a></span>
and linking to my own status: https://localhost:8080/@the_mighty_zork/statuses/01FGXKJRX2PMERJQ9EQF8Y6HCR`
replaceMentionsString = "Another test @foss_satan@fossbros-anonymous.io\n\n#Hashtag\n\nText"
replaceMentionsExpected = "Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\">@<span>foss_satan</span></a></span>\n\n#Hashtag\n\nText"
replaceHashtagsExpected = "Another test @foss_satan@fossbros-anonymous.io\n\n<a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>Hashtag</span></a>\n\nText"
replaceHashtagsAfterMentionsExpected = "Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\">@<span>foss_satan</span></a></span>\n\n<a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>Hashtag</span></a>\n\nText"
replaceMentionsWithLinkString = "Another test @foss_satan@fossbros-anonymous.io\n\nhttp://fossbros-anonymous.io/@foss_satan/statuses/6675ee73-fccc-4562-a46a-3e8cd9798060"
replaceMentionsWithLinkStringExpected = "Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\">@<span>foss_satan</span></a></span>\n\nhttp://fossbros-anonymous.io/@foss_satan/statuses/6675ee73-fccc-4562-a46a-3e8cd9798060"
replaceMentionsWithLinkSelfString = "Mentioning myself: @the_mighty_zork\n\nand linking to my own status: https://localhost:8080/@the_mighty_zork/statuses/01FGXKJRX2PMERJQ9EQF8Y6HCR"
replaceMemtionsWithLinkSelfExpected = "Mentioning myself: <span class=\"h-card\"><a href=\"http://localhost:8080/@the_mighty_zork\" class=\"u-url mention\">@<span>the_mighty_zork</span></a></span>\n\nand linking to my own status: https://localhost:8080/@the_mighty_zork/statuses/01FGXKJRX2PMERJQ9EQF8Y6HCR"
)
type CommonTestSuite struct {

View file

@ -71,16 +71,16 @@ type LinkTestSuite struct {
func (suite *LinkTestSuite) TestParseSimple() {
f := suite.formatter.FromPlain(context.Background(), simple, nil, nil)
assert.Equal(suite.T(), simpleExpected, f)
suite.Equal(simpleExpected, f)
}
func (suite *LinkTestSuite) TestParseURLsFromText1() {
urls := text.FindLinks(text1)
assert.Equal(suite.T(), "https://example.org/link/to/something#fragment", urls[0].String())
assert.Equal(suite.T(), "http://test.example.org?q=bahhhhhhhhhhhh", urls[1].String())
assert.Equal(suite.T(), "https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it", urls[2].String())
assert.Equal(suite.T(), "https://example.orghttps://google.com", urls[3].String())
suite.Equal("https://example.org/link/to/something#fragment", urls[0].String())
suite.Equal("http://test.example.org?q=bahhhhhhhhhhhh", urls[1].String())
suite.Equal("https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it", urls[2].String())
suite.Equal("https://example.orghttps://google.com", urls[3].String())
}
func (suite *LinkTestSuite) TestParseURLsFromText2() {
@ -99,7 +99,7 @@ func (suite *LinkTestSuite) TestParseURLsFromText3() {
func (suite *LinkTestSuite) TestReplaceLinksFromText1() {
replaced := suite.formatter.ReplaceLinks(context.Background(), text1)
assert.Equal(suite.T(), `
suite.Equal(`
This is a text with some links in it. Here's link number one: <a href="https://example.org/link/to/something#fragment" rel="noopener">example.org/link/to/something#fragment</a>
Here's link number two: <a href="http://test.example.org?q=bahhhhhhhhhhhh" rel="noopener">test.example.org?q=bahhhhhhhhhhhh</a>
@ -114,7 +114,7 @@ really.cool.website <-- this one shouldn't be parsed as a link because it doesn'
func (suite *LinkTestSuite) TestReplaceLinksFromText2() {
replaced := suite.formatter.ReplaceLinks(context.Background(), text2)
assert.Equal(suite.T(), `
suite.Equal(`
this is one link: <a href="https://example.org" rel="noopener">example.org</a>
this is the same link again: <a href="https://example.org" rel="noopener">example.org</a>
@ -126,14 +126,14 @@ these should be deduplicated
func (suite *LinkTestSuite) TestReplaceLinksFromText3() {
// we know mailto links won't be replaced with hrefs -- we only accept https and http
replaced := suite.formatter.ReplaceLinks(context.Background(), text3)
assert.Equal(suite.T(), `
suite.Equal(`
here's a mailto link: mailto:whatever@test.org
`, replaced)
}
func (suite *LinkTestSuite) TestReplaceLinksFromText4() {
replaced := suite.formatter.ReplaceLinks(context.Background(), text4)
assert.Equal(suite.T(), `
suite.Equal(`
two similar links:
<a href="https://example.org" rel="noopener">example.org</a>
@ -145,7 +145,7 @@ two similar links:
func (suite *LinkTestSuite) TestReplaceLinksFromText5() {
// we know this one doesn't work properly, which is why html should always be sanitized before being passed into the ReplaceLinks function
replaced := suite.formatter.ReplaceLinks(context.Background(), text5)
assert.Equal(suite.T(), `
suite.Equal(`
what happens when we already have a link within an href?
<a href="<a href="https://example.org" rel="noopener">example.org</a>"><a href="https://example.org" rel="noopener">example.org</a></a>

View file

@ -26,13 +26,11 @@ import (
)
func (f *formatter) FromMarkdown(ctx context.Context, md string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string {
content := preformat(md)
// do the markdown parsing *first*
contentBytes := blackfriday.Run([]byte(content))
contentBytes := blackfriday.Run([]byte(md))
// format tags nicely
content = f.ReplaceTags(ctx, string(contentBytes), tags)
content := f.ReplaceTags(ctx, string(contentBytes), tags)
// format mentions nicely
content = f.ReplaceMentions(ctx, content, mentions)

View file

@ -44,15 +44,19 @@ that was some JSON :)
`
const (
simpleMarkdown = "# Title\n\nHere's a simple text in markdown.\n\nHere's a [link](https://example.org)."
simpleMarkdownExpected = "<h1>Title</h1>\n\n<p>Heres a simple text in markdown.</p>\n\n<p>Heres a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p>\n"
withCodeBlockExpected = "<h1>Title</h1>\n\n<p>Below is some JSON.</p>\n\n<pre><code class=\"language-json\">{\n &#34;key&#34;: &#34;value&#34;,\n &#34;another_key&#34;: [\n &#34;value1&#34;,\n &#34;value2&#34;\n ]\n}\n</code></pre>\n\n<p>that was some JSON :)</p>\n"
withInlineCode = "`Nobody tells you about the <code><del>SECRET CODE</del></code>, do they?`"
withInlineCodeExpected = "<p><code>Nobody tells you about the &lt;code&gt;&lt;del&gt;SECRET CODE&lt;/del&gt;&lt;/code&gt;, do they?</code></p>\n"
withInlineCode2 = "`Nobody tells you about the </code><del>SECRET CODE</del><code>, do they?`"
withInlineCode2Expected = "<p><code>Nobody tells you about the &lt;/code&gt;&lt;del&gt;SECRET CODE&lt;/del&gt;&lt;code&gt;, do they?</code></p>\n"
withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"
withHashtagExpected = "<h1>Title</h1>\n\n<p>heres a simple status that uses hashtag <a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>\n"
simpleMarkdown = "# Title\n\nHere's a simple text in markdown.\n\nHere's a [link](https://example.org)."
simpleMarkdownExpected = "<h1>Title</h1>\n\n<p>Heres a simple text in markdown.</p>\n\n<p>Heres a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p>\n"
withCodeBlockExpected = "<h1>Title</h1>\n\n<p>Below is some JSON.</p>\n\n<pre><code class=\"language-json\">{\n &#34;key&#34;: &#34;value&#34;,\n &#34;another_key&#34;: [\n &#34;value1&#34;,\n &#34;value2&#34;\n ]\n}\n</code></pre>\n\n<p>that was some JSON :)</p>\n"
withInlineCode = "`Nobody tells you about the <code><del>SECRET CODE</del></code>, do they?`"
withInlineCodeExpected = "<p><code>Nobody tells you about the &lt;code&gt;&lt;del&gt;SECRET CODE&lt;/del&gt;&lt;/code&gt;, do they?</code></p>\n"
withInlineCode2 = "`Nobody tells you about the </code><del>SECRET CODE</del><code>, do they?`"
withInlineCode2Expected = "<p><code>Nobody tells you about the &lt;/code&gt;&lt;del&gt;SECRET CODE&lt;/del&gt;&lt;code&gt;, do they?</code></p>\n"
withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"
withHashtagExpected = "<h1>Title</h1>\n\n<p>heres a simple status that uses hashtag <a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>\n"
mdWithHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a <a href=\"https://example.org\">link</a>.\n\nHere's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\">"
mdWithHTMLExpected = "<h1>Title</h1>\n\n<p>Heres a simple text in markdown.</p>\n\n<p>Heres a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p>\n\n<p>Heres an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\" crossorigin=\"anonymous\"></p>\n"
mdWithCheekyHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: <script>alert(ahhhh)</script>"
mdWithCheekyHTMLExpected = "<h1>Title</h1>\n\n<p>Heres a simple text in markdown.</p>\n\n<p>Heres a cheeky little script: </p>\n"
)
type MarkdownTestSuite struct {
@ -88,6 +92,16 @@ func (suite *MarkdownTestSuite) TestParseWithHashtag() {
suite.Equal(withHashtagExpected, s)
}
func (suite *MarkdownTestSuite) TestParseWithHTML() {
s := suite.formatter.FromMarkdown(context.Background(), mdWithHTML, nil, nil)
suite.Equal(mdWithHTMLExpected, s)
}
func (suite *MarkdownTestSuite) TestParseWithCheekyHTML() {
s := suite.formatter.FromMarkdown(context.Background(), mdWithCheekyHTML, nil, nil)
suite.Equal(mdWithCheekyHTMLExpected, s)
}
func TestMarkdownTestSuite(t *testing.T) {
suite.Run(t, new(MarkdownTestSuite))
}

View file

@ -1,39 +0,0 @@
/*
GoToSocial
Copyright (C) 2021-2022 GoToSocial Authors admin@gotosocial.org
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package text
import (
"github.com/tdewolff/minify/v2"
"github.com/tdewolff/minify/v2/html"
)
var m *minify.M
// MinifyHTML runs html through a minifier, reducing it in size.
func MinifyHTML(in string) (string, error) {
if m == nil {
m = minify.New()
m.Add("text/html", &html.Minifier{
KeepQuotes: true,
KeepEndTags: true,
KeepDocumentTags: true,
})
}
return m.String("text/html", in)
}

View file

@ -20,6 +20,7 @@ package text
import (
"context"
"html"
"strings"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
@ -32,10 +33,11 @@ var breakReplacer = strings.NewReplacer(
)
func (f *formatter) FromPlain(ctx context.Context, plain string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string {
content := preformat(plain)
// trim any crap
content := strings.TrimSpace(plain)
// sanitize any html elements
content = removeHTML(content)
// clean 'er up
content = html.EscapeString(content)
// format links nicely
content = f.ReplaceLinks(ctx, content)
@ -52,5 +54,5 @@ func (f *formatter) FromPlain(ctx context.Context, plain string, mentions []*gts
// wrap the whole thing in a pee
content = `<p>` + content + `</p>`
return postformat(content)
return SanitizeHTML(content)
}

View file

@ -20,27 +20,21 @@ package text_test
import (
"context"
"fmt"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
)
const (
simple = "this is a plain and simple status"
simpleExpected = "<p>this is a plain and simple status</p>"
withTag = "here's a simple status that uses hashtag #welcome!"
withTagExpected = "<p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/welcome\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>welcome</span></a>!</p>"
moreComplex = `Another test @foss_satan@fossbros-anonymous.io
#Hashtag
Text`
moreComplexFull = "<p>Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span><br><br><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a><br><br>Text</p>"
simple = "this is a plain and simple status"
simpleExpected = "<p>this is a plain and simple status</p>"
withTag = "here's a simple status that uses hashtag #welcome!"
withTagExpected = "<p>here&#39;s a simple status that uses hashtag <a href=\"http://localhost:8080/tags/welcome\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>welcome</span></a>!</p>"
withHTML = "<div>blah this should just be html escaped blah</div>"
withHTMLExpected = "<p>&lt;div&gt;blah this should just be html escaped blah&lt;/div&gt;</p>"
moreComplex = "Another test @foss_satan@fossbros-anonymous.io\n\n#Hashtag\n\nText"
moreComplexFull = "<p>Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span><br/><br/><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a><br/><br/>Text</p>"
)
type PlainTestSuite struct {
@ -49,7 +43,7 @@ type PlainTestSuite struct {
func (suite *PlainTestSuite) TestParseSimple() {
f := suite.formatter.FromPlain(context.Background(), simple, nil, nil)
assert.Equal(suite.T(), simpleExpected, f)
suite.Equal(simpleExpected, f)
}
func (suite *PlainTestSuite) TestParseWithTag() {
@ -58,7 +52,12 @@ func (suite *PlainTestSuite) TestParseWithTag() {
}
f := suite.formatter.FromPlain(context.Background(), withTag, nil, foundTags)
assert.Equal(suite.T(), withTagExpected, f)
suite.Equal(withTagExpected, f)
}
func (suite *PlainTestSuite) TestParseWithHTML() {
f := suite.formatter.FromPlain(context.Background(), withHTML, nil, nil)
suite.Equal(withHTMLExpected, f)
}
func (suite *PlainTestSuite) TestParseMoreComplex() {
@ -71,10 +70,7 @@ func (suite *PlainTestSuite) TestParseMoreComplex() {
}
f := suite.formatter.FromPlain(context.Background(), moreComplex, foundMentions, foundTags)
fmt.Println(f)
assert.Equal(suite.T(), moreComplexFull, f)
suite.Equal(moreComplexFull, f)
}
func TestPlainTestSuite(t *testing.T) {

View file

@ -19,7 +19,9 @@
package text
import (
"html"
"regexp"
"strings"
"github.com/microcosm-cc/bluemonday"
)
@ -59,7 +61,8 @@ func SanitizeHTML(in string) string {
// SanitizePlaintext runs text through basic sanitization. This removes
// any html elements that were in the string, and returns clean plaintext.
func SanitizePlaintext(in string) string {
content := preformat(in)
content := html.UnescapeString(in)
content = removeHTML(content)
return postformat(content)
content = html.UnescapeString(content)
return strings.TrimSpace(content)
}