Text/status parsing fixes (#141)

* aaaaaa

* vendor minify

* update + test markdown parsing
This commit is contained in:
Tobi Smethurst 2021-08-16 19:17:56 +02:00 committed by GitHub
commit ce190d867c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
56 changed files with 7390 additions and 45 deletions

View file

@ -165,7 +165,7 @@ func (suite *StatusCreateTestSuite) TestPostAnotherNewStatus() {
err = json.Unmarshal(b, statusReply)
assert.NoError(suite.T(), err)
assert.Equal(suite.T(), "\u003cp\u003e\u003ca href=\"http://localhost:8080/tags/test\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\"\u003e#\u003cspan\u003etest\u003c/span\u003e\u003c/a\u003e alright, should be able to post \u003ca href=\"http://localhost:8080/tags/links\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\"\u003e#\u003cspan\u003elinks\u003c/span\u003e\u003c/a\u003e with fragments in them now, let\u0026#39;s see........\u003cbr/\u003e\u003cbr/\u003e\u003ca href=\"https://docs.gotosocial.org/en/latest/user_guide/posts/#links\" rel=\"noopener nofollow noreferrer\" target=\"_blank\"\u003edocs.gotosocial.org/en/latest/user_guide/posts/#links\u003c/a\u003e\u003cbr/\u003e\u003cbr/\u003e\u003ca href=\"http://localhost:8080/tags/gotosocial\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\"\u003e#\u003cspan\u003egotosocial\u003c/span\u003e\u003c/a\u003e\u003cbr/\u003e\u003cbr/\u003e(tobi remember to pull the docker image challenge)\u003c/p\u003e", statusReply.Content)
assert.Equal(suite.T(), "<p><a href=\"http://localhost:8080/tags/test\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>test</span></a> alright, should be able to post <a href=\"http://localhost:8080/tags/links\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>links</span></a> with fragments in them now, let's see........<br><br><a href=\"https://docs.gotosocial.org/en/latest/user_guide/posts/#links\" rel=\"noopener nofollow noreferrer\" target=\"_blank\">docs.gotosocial.org/en/latest/user_guide/posts/#links</a><br><br><a href=\"http://localhost:8080/tags/gotosocial\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>gotosocial</span></a><br><br>(tobi remember to pull the docker image challenge)</p>", statusReply.Content)
}
func (suite *StatusCreateTestSuite) TestPostNewStatusWithEmoji() {
@ -198,7 +198,7 @@ func (suite *StatusCreateTestSuite) TestPostNewStatusWithEmoji() {
assert.NoError(suite.T(), err)
assert.Equal(suite.T(), "", statusReply.SpoilerText)
assert.Equal(suite.T(), "<p>here is a rainbow emoji a few times! :rainbow: :rainbow: :rainbow: <br/> here&#39;s an emoji that isn&#39;t in the db: :test_emoji:</p>", statusReply.Content)
assert.Equal(suite.T(), "<p>here is a rainbow emoji a few times! :rainbow: :rainbow: :rainbow:<br>here's an emoji that isn't in the db: :test_emoji:</p>", statusReply.Content)
assert.Len(suite.T(), statusReply.Emojis, 1)
mastoEmoji := statusReply.Emojis[0]
@ -314,7 +314,7 @@ func (suite *StatusCreateTestSuite) TestAttachNewMediaSuccess() {
assert.NoError(suite.T(), err)
assert.Equal(suite.T(), "", statusResponse.SpoilerText)
assert.Equal(suite.T(), "<p>here&#39;s an image attachment</p>", statusResponse.Content)
assert.Equal(suite.T(), "<p>here's an image attachment</p>", statusResponse.Content)
assert.False(suite.T(), statusResponse.Sensitive)
assert.Equal(suite.T(), model.VisibilityPublic, statusResponse.Visibility)

View file

@ -17,8 +17,8 @@ const statusText1 = `Another test @foss_satan@fossbros-anonymous.io
#Hashtag
Text`
const statusText1ExpectedFull = `<p>Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>foss_satan</span></a></span><br/><br/><a href="http://localhost:8080/tags/Hashtag" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>Hashtag</span></a><br/><br/>Text</p>`
const statusText1ExpectedPartial = `<p>Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>foss_satan</span></a></span><br/><br/>#Hashtag<br/><br/>Text</p>`
const statusText1ExpectedFull = "<p>Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span><br><br><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a><br><br>Text</p>"
const statusText1ExpectedPartial = "<p>Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span><br><br>#Hashtag<br><br>Text</p>"
const statusText2 = `Another test @foss_satan@fossbros-anonymous.io
@ -26,7 +26,7 @@ const statusText2 = `Another test @foss_satan@fossbros-anonymous.io
#hashTAG`
const status2TextExpectedFull = `<p>Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>foss_satan</span></a></span><br/><br/><a href="http://localhost:8080/tags/Hashtag" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>Hashtag</span></a><br/><br/><a href="http://localhost:8080/tags/Hashtag" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>hashTAG</span></a></p>`
const status2TextExpectedFull = "<p>Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span><br><br><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a><br><br><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashTAG</span></a></p>"
type UtilTestSuite struct {
StatusStandardTestSuite

View file

@ -20,6 +20,7 @@ package text
import (
"fmt"
"html"
"strings"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
@ -29,23 +30,33 @@ import (
// preformat contains some common logic for making a string ready for formatting, which should be used for all user-input text.
func preformat(in string) string {
// do some preformatting of the text
// 1. Trim all the whitespace
s := strings.TrimSpace(in)
// 1. unescape everything that might be html escaped
s := html.UnescapeString(in)
// 2. trim leading or trailing whitespace
s = strings.TrimSpace(s)
return s
}
// postformat contains some common logic for html sanitization of text, wrapping elements, and trimming newlines and whitespace
func postformat(in string) string {
// do some postformatting of the text
// 1. sanitize html to remove any dodgy scripts or other disallowed elements
s := SanitizeOutgoing(in)
// 2. wrap the whole thing in a paragraph
s = fmt.Sprintf(`<p>%s</p>`, s)
// 3. remove any cheeky newlines
s = strings.ReplaceAll(s, "\n", "")
// 4. remove any whitespace added as a result of the formatting
s = strings.TrimSpace(s)
return s
// 1. sanitize html to remove potentially dangerous elements
s := SanitizeHTML(in)
// 2. the sanitize step tends to escape characters inside codeblocks, which is behavior we don't want, so unescape everything again
s = html.UnescapeString(s)
// 3. minify html to remove any trailing newlines, spaces, unnecessary elements, etc etc
mini, err := minifyHTML(s)
if err != nil {
// if the minify failed, just return what we have
return s
}
// return minified version of the html
return mini
}
func (f *formatter) ReplaceTags(in string, tags []*gtsmodel.Tag) string {

View file

@ -23,21 +23,14 @@ import (
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
)
var bfExtensions = blackfriday.NoIntraEmphasis |
blackfriday.FencedCode |
blackfriday.Autolink |
blackfriday.Strikethrough |
blackfriday.SpaceHeadings |
blackfriday.BackslashLineBreak
func (f *formatter) FromMarkdown(md string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string {
content := preformat(md)
// do the markdown parsing *first*
content = string(blackfriday.Run([]byte(content), blackfriday.WithExtensions(bfExtensions)))
contentBytes := blackfriday.Run([]byte(content))
// format tags nicely
content = f.ReplaceTags(content, tags)
content = f.ReplaceTags(string(contentBytes), tags)
// format mentions nicely
content = f.ReplaceMentions(content, mentions)

View file

@ -0,0 +1,116 @@
/*
GoToSocial
Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package text_test
import (
"fmt"
"testing"
"github.com/stretchr/testify/suite"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
"github.com/superseriousbusiness/gotosocial/internal/text"
"github.com/superseriousbusiness/gotosocial/testrig"
)
// Markdown inputs and the exact HTML the formatter is expected to produce
// for them; consumed by the MarkdownTestSuite test methods in this file.
const (
// simpleMarkdown is the input for TestParseSimple: a heading, a paragraph
// and a paragraph containing an inline link.
simpleMarkdown = `# Title
Here's a simple text in markdown.
Here's a [link](https://example.org).`
// simpleMarkdownExpected is the HTML TestParseSimple expects back from
// FromMarkdown for simpleMarkdown.
simpleMarkdownExpected = "<h1>Title</h1><p>Heres a simple text in markdown.</p><p>Heres a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p>"
// withCodeBlockExpected is the HTML TestParseWithCodeBlock expects for the
// withCodeBlock input; note the newlines that are expected to survive inside
// the <pre><code> element.
withCodeBlockExpected = "<h1>Title</h1><p>Below is some JSON.</p><pre><code class=\"language-json\">{\n \"key\": \"value\",\n \"another_key\": [\n \"value1\",\n \"value2\"\n ]\n}\n</code></pre><p>that was some JSON :)</p>"
// withHashtag is the input for TestParseWithHashtag: markdown text that
// contains the hashtag #Hashtag.
withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"
// withHashtagExpected is the HTML TestParseWithHashtag expects, with the
// hashtag rendered as a link.
withHashtagExpected = "<h1>Title</h1><p>heres a simple status that uses hashtag <a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>"
)
var (
// withCodeBlock is the input for TestParseWithCodeBlock: markdown containing
// a fenced JSON code block. A Go raw string literal cannot contain a
// backtick, so the ``` fences are spliced in as interpreted strings; that
// concatenation is why this lives in a var block next to the const fixtures.
withCodeBlock = `# Title
Below is some JSON.
` + "```" + `json
{
"key": "value",
"another_key": [
"value1",
"value2"
]
}
` + "```" + `
that was some JSON :)
`
)
// MarkdownTestSuite holds the tests for the formatter's FromMarkdown method.
// It declares no fields of its own; everything the methods below read and
// write (fixtures, config, db, log, formatter) is promoted from the embedded
// TextStandardTestSuite.
type MarkdownTestSuite struct {
TextStandardTestSuite
}
// SetupSuite fills the suite's fixture fields from the testrig package.
// NOTE(review): presumably invoked by testify once before any test in the
// suite runs — confirm against the testify suite documentation.
func (suite *MarkdownTestSuite) SetupSuite() {
suite.testTokens = testrig.NewTestTokens()
suite.testClients = testrig.NewTestClients()
suite.testApplications = testrig.NewTestApplications()
suite.testUsers = testrig.NewTestUsers()
suite.testAccounts = testrig.NewTestAccounts()
suite.testAttachments = testrig.NewTestAttachments()
suite.testStatuses = testrig.NewTestStatuses()
// testTags is the fixture set TestParseWithHashtag looks "Hashtag" up in
suite.testTags = testrig.NewTestTags()
suite.testMentions = testrig.NewTestMentions()
}
// SetupTest builds a fresh config, database, logger and formatter, then
// seeds the database via testrig.StandardDBSetup.
// NOTE(review): presumably invoked by testify before each test method —
// confirm against the testify suite documentation.
func (suite *MarkdownTestSuite) SetupTest() {
suite.config = testrig.NewTestConfig()
suite.db = testrig.NewTestDB()
suite.log = testrig.NewTestLog()
// the formatter is constructed from the config, db and log created just above
suite.formatter = text.NewFormatter(suite.config, suite.db, suite.log)
testrig.StandardDBSetup(suite.db, suite.testAccounts)
}
// TearDownTest hands the suite's database to testrig.StandardDBTeardown,
// the counterpart of the StandardDBSetup call made in SetupTest.
func (suite *MarkdownTestSuite) TearDownTest() {
testrig.StandardDBTeardown(suite.db)
}
// TestParseSimple feeds simpleMarkdown through FromMarkdown with no mentions
// and no tags, and requires the output to equal simpleMarkdownExpected.
func (suite *MarkdownTestSuite) TestParseSimple() {
	suite.Equal(
		simpleMarkdownExpected,
		suite.formatter.FromMarkdown(simpleMarkdown, nil, nil),
	)
}
// TestParseWithCodeBlock feeds markdown containing a fenced JSON block through
// FromMarkdown (no mentions, no tags) and requires the output to equal
// withCodeBlockExpected.
func (suite *MarkdownTestSuite) TestParseWithCodeBlock() {
	// print the raw markdown first so it shows up alongside the test output
	fmt.Println(withCodeBlock)

	rendered := suite.formatter.FromMarkdown(withCodeBlock, nil, nil)
	suite.Equal(withCodeBlockExpected, rendered)
}
// TestParseWithHashtag passes the "Hashtag" fixture tag alongside the
// withHashtag markdown (no mentions) and requires the output to equal
// withHashtagExpected.
func (suite *MarkdownTestSuite) TestParseWithHashtag() {
	rendered := suite.formatter.FromMarkdown(
		withHashtag,
		nil,
		[]*gtsmodel.Tag{suite.testTags["Hashtag"]},
	)
	suite.Equal(withHashtagExpected, rendered)
}
func TestMarkdownTestSuite(t *testing.T) {
suite.Run(t, new(MarkdownTestSuite))
}

39
internal/text/minify.go Normal file
View file

@ -0,0 +1,39 @@
/*
GoToSocial
Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package text
import (
"github.com/tdewolff/minify/v2"
"github.com/tdewolff/minify/v2/html"
)
// m is the shared HTML minifier used by minifyHTML.
//
// It is fully built during package initialisation rather than lazily on first
// use. The lazy form checked and assigned m without any synchronisation, so
// concurrent callers raced on the variable, and a second goroutine could
// observe a non-nil m before the "text/html" handler had been registered,
// receive an error from m.String, and cause the caller to fall back to
// unminified output.
var m = newHTMLMinifier()

// newHTMLMinifier returns a minifier with a single handler registered for the
// "text/html" media type. The handler is configured to keep attribute quotes,
// end tags and document tags.
func newHTMLMinifier() *minify.M {
	mini := minify.New()
	mini.Add("text/html", &html.Minifier{
		KeepQuotes:       true,
		KeepEndTags:      true,
		KeepDocumentTags: true,
	})
	return mini
}

// minifyHTML runs html through a minifier, reducing it in size.
//
// It returns the minified markup, or the error reported by the minifier.
// NOTE(review): assumes *minify.M may be used from several goroutines once it
// has been configured (the original code relied on the same property) —
// confirm against the minify documentation.
func minifyHTML(in string) (string, error) {
	return m.String("text/html", in)
}

View file

@ -19,6 +19,7 @@
package text
import (
"fmt"
"strings"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
@ -27,6 +28,9 @@ import (
func (f *formatter) FromPlain(plain string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string {
content := preformat(plain)
// sanitize any html elements
content = RemoveHTML(content)
// format links nicely
content = f.ReplaceLinks(content)
@ -39,5 +43,8 @@ func (f *formatter) FromPlain(plain string, mentions []*gtsmodel.Mention, tags [
// replace newlines with breaks
content = strings.ReplaceAll(content, "\n", "<br />")
// wrap the whole thing in a pee
content = fmt.Sprintf(`<p>%s</p>`, content)
return postformat(content)
}

View file

@ -33,15 +33,15 @@ const (
simple = "this is a plain and simple status"
simpleExpected = "<p>this is a plain and simple status</p>"
withTag = "this is a simple status that uses hashtag #welcome!"
withTagExpected = "<p>this is a simple status that uses hashtag <a href=\"http://localhost:8080/tags/welcome\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>welcome</span></a>!</p>"
withTag = "here's a simple status that uses hashtag #welcome!"
withTagExpected = "<p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/welcome\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>welcome</span></a>!</p>"
moreComplex = `Another test @foss_satan@fossbros-anonymous.io
#Hashtag
Text`
moreComplexExpected = `<p>Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>foss_satan</span></a></span><br/><br/><a href="http://localhost:8080/tags/Hashtag" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>Hashtag</span></a><br/><br/>Text</p>`
moreComplexFull = "<p>Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span><br><br><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a><br><br>Text</p>"
)
type PlainTestSuite struct {
@ -102,7 +102,7 @@ func (suite *PlainTestSuite) TestParseMoreComplex() {
fmt.Println(f)
assert.Equal(suite.T(), moreComplexExpected, f)
assert.Equal(suite.T(), moreComplexFull, f)
}
func TestPlainTestSuite(t *testing.T) {

View file

@ -19,6 +19,8 @@
package text
import (
"regexp"
"github.com/microcosm-cc/bluemonday"
)
@ -31,12 +33,11 @@ var regular *bluemonday.Policy = bluemonday.UGCPolicy().
RequireNoReferrerOnLinks(true).
RequireNoFollowOnLinks(true).
RequireCrossOriginAnonymous(true).
AddTargetBlankToFullyQualifiedLinks(true)
// outgoing policy should be used on statuses we've already parsed and added our own elements etc to. It is less strict than regular.
var outgoing *bluemonday.Policy = regular.
AddTargetBlankToFullyQualifiedLinks(true).
AllowAttrs("class", "href", "rel").OnElements("a").
AllowAttrs("class").OnElements("span")
AllowAttrs("class").OnElements("span").
AllowAttrs("class").Matching(regexp.MustCompile("^language-[a-zA-Z0-9]+$")).OnElements("code").
SkipElementsContent("code", "pre")
// '[C]an be thought of as equivalent to stripping all HTML elements and their attributes as it has nothing on its allowlist.
// An example usage scenario would be blog post titles where HTML tags are not expected at all
@ -54,9 +55,3 @@ func SanitizeHTML(in string) string {
func RemoveHTML(in string) string {
return strict.Sanitize(in)
}
// SanitizeOutgoing cleans up HTML in the given string, allowing through only safe elements and elements that were added during the parsing process.
// This should be used on text that we've already converted into HTML, just to catch any weirdness.
func SanitizeOutgoing(in string) string {
return outgoing.Sanitize(in)
}

View file

@ -0,0 +1,75 @@
/*
GoToSocial
Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package text_test
import (
"testing"
"github.com/stretchr/testify/suite"
"github.com/superseriousbusiness/gotosocial/internal/text"
)
// Inputs and expected outputs for the SanitizeTestSuite cases below.
const (
// removeHTML / removedHTML: markup handed to text.RemoveHTML in
// TestRemoveHTML, and the tag-free text expected back.
removeHTML = `<p>Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>foss_satan</span></a></span><br/><br/><a href="http://localhost:8080/tags/Hashtag" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>Hashtag</span></a><br/><br/>Text</p>`
removedHTML = `Another test @foss_satan#HashtagText`
// sanitizeHTML / sanitizedHTML: text containing a <script> element handed to
// text.SanitizeHTML in TestSanitizeHTML, and the expected result with the
// element gone and the apostrophe entity-escaped.
sanitizeHTML = `here's some naughty html: <script>alert(ahhhh)</script> !!!`
sanitizedHTML = `here&#39;s some naughty html: !!!`
// withEscapedLiteral is a raw string, so `\u0026` here is the six literal
// characters backslash-u-0-0-2-6 rather than an ampersand. The expected
// value is identical to the input.
withEscapedLiteral = `it\u0026amp;#39;s its it is`
withEscapedLiteralExpected = `it\u0026amp;#39;s its it is`
// withEscaped is an interpreted string, so \u0026 is decoded by the compiler
// to '&' and the value starts "it&amp;#39;s". The expected value is the same
// text written with a plain ampersand.
withEscaped = "it\u0026amp;#39;s its it is"
withEscapedExpected = "it&amp;#39;s its it is"
// sanitizeOutgoing / sanitizedOutgoing: a paragraph full of &#39; entities
// handed to text.SanitizeHTML in TestSanitizeOutgoing; the expected value is
// identical to the input.
sanitizeOutgoing = `<p>gotta test some fucking &#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39; marks</p>`
sanitizedOutgoing = `<p>gotta test some fucking &#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39; marks</p>`
)
// SanitizeTestSuite holds the tests for the text package's RemoveHTML and
// SanitizeHTML functions. It embeds testify's suite.Suite and declares no
// fields of its own.
type SanitizeTestSuite struct {
suite.Suite
}
// TestRemoveHTML requires text.RemoveHTML to reduce the removeHTML markup to
// the tag-free text in removedHTML.
func (suite *SanitizeTestSuite) TestRemoveHTML() {
	suite.Equal(removedHTML, text.RemoveHTML(removeHTML))
}
// TestSanitizeOutgoing runs the sanitizeOutgoing paragraph through
// text.SanitizeHTML and requires the result to equal sanitizedOutgoing.
func (suite *SanitizeTestSuite) TestSanitizeOutgoing() {
	suite.Equal(sanitizedOutgoing, text.SanitizeHTML(sanitizeOutgoing))
}
// TestSanitizeHTML runs text containing a <script> element through
// text.SanitizeHTML and requires the result to equal sanitizedHTML.
func (suite *SanitizeTestSuite) TestSanitizeHTML() {
	suite.Equal(sanitizedHTML, text.SanitizeHTML(sanitizeHTML))
}
// TestSanitizeWithEscapedLiteral runs the raw-string fixture (which contains a
// literal backslash-u sequence) through text.RemoveHTML and requires the
// result to equal withEscapedLiteralExpected.
func (suite *SanitizeTestSuite) TestSanitizeWithEscapedLiteral() {
	suite.Equal(withEscapedLiteralExpected, text.RemoveHTML(withEscapedLiteral))
}
// TestSanitizeWithEscaped runs the interpreted-string fixture (which contains
// an already entity-escaped apostrophe) through text.RemoveHTML and requires
// the result to equal withEscapedExpected.
func (suite *SanitizeTestSuite) TestSanitizeWithEscaped() {
	suite.Equal(withEscapedExpected, text.RemoveHTML(withEscaped))
}
func TestSanitizeTestSuite(t *testing.T) {
suite.Run(t, new(SanitizeTestSuite))
}