From 04b080303fde88e94a829ee5b4d6c8daa47e64f2 Mon Sep 17 00:00:00 2001 From: tsmethurst Date: Mon, 16 Aug 2021 19:05:49 +0200 Subject: [PATCH] update + test markdown parsing --- CONTRIBUTING.md | 4 +- README.md | 1 + go.mod | 1 + go.sum | 10 +++++ .../api/client/status/statuscreate_test.go | 6 +-- internal/processing/status/util_test.go | 6 +-- internal/text/common.go | 31 ++++++++++----- internal/text/markdown.go | 2 +- internal/text/markdown_test.go | 24 +++++++++++- internal/text/minify.go | 39 +++++++++++++++++++ internal/text/plain.go | 2 - internal/text/plain_test.go | 4 +- internal/text/sanitize.go | 3 +- 13 files changed, 108 insertions(+), 25 deletions(-) create mode 100644 internal/text/minify.go diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 97220f221..8ccd7e7dc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -139,7 +139,7 @@ We use [golangci-lint](https://golangci-lint.run/) for linting. To run this loca Then, you can run the linter with: ```bash -golangci-lint run +golangci-lint run --tests=false ``` Note that this linter also runs as a job on the Github repo, so if you make a PR that doesn't pass the linter, it will be rejected. As such, it's good practice to run the linter locally before pushing or opening a PR. @@ -155,7 +155,7 @@ go get -u github.com/golang/lint/golint To run the linter, use: ```bash -golint ./... +golint ./internal/... ``` Then make sure to run `go fmt ./...` to update whitespace and other opinionated formatting. diff --git a/README.md b/README.md index f336b440a..5a64dfa4f 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,7 @@ The following libraries and frameworks are used by GoToSocial, with gratitude * [superseriousbusiness/exifremove](https://github.com/superseriousbusiness/exifremove) forked from [scottleedavis/go-exif-remove](https://github.com/scottleedavis/go-exif-remove); EXIF data removal. [MIT License](https://spdx.org/licenses/MIT.html). * [superseriousbusiness/oauth2](https://github.com/superseriousbusiness/oauth2) forked from [go-oauth2/oauth2](https://github.com/go-oauth2/oauth2); oauth server framework and token handling. [MIT License](https://spdx.org/licenses/MIT.html). * [go-swagger/go-swagger](https://github.com/go-swagger/go-swagger); Swagger OpenAPI spec generation. [Apache-2.0 License](https://spdx.org/licenses/Apache-2.0.html). +* [tdewolff/minify](https://github.com/tdewolff/minify); HTML minification. [MIT License](https://spdx.org/licenses/MIT.html). * [urfave/cli](https://github.com/urfave/cli); command-line interface framework. [MIT License](https://spdx.org/licenses/MIT.html). * [wagslane/go-password-validator](https://github.com/wagslane/go-password-validator); password strength validation. [MIT License](https://spdx.org/licenses/MIT.html). diff --git a/go.mod b/go.mod index 48febd4a6..10597a06b 100644 --- a/go.mod +++ b/go.mod @@ -44,6 +44,7 @@ require ( github.com/stretchr/testify v1.7.0 github.com/superseriousbusiness/exifremove v0.0.0-20210330092427-6acd27eac203 github.com/superseriousbusiness/oauth2/v4 v4.3.0-SSB + github.com/tdewolff/minify/v2 v2.9.21 github.com/tidwall/buntdb v1.2.4 // indirect github.com/urfave/cli/v2 v2.3.0 github.com/vmihailenco/msgpack/v5 v5.3.4 // indirect diff --git a/go.sum b/go.sum index e3599fa9d..4d6968ada 100644 --- a/go.sum +++ b/go.sum @@ -46,6 +46,7 @@ github.com/bradleypeabody/gorilla-sessions-memcache v0.0.0-20181103040241-659414 github.com/buckket/go-blurhash v1.1.0 h1:X5M6r0LIvwdvKiUtiNcRL2YlmOfMzYobI3VCKCZc9Do= github.com/buckket/go-blurhash v1.1.0/go.mod h1:aT2iqo5W9vu9GpyoLErKfTHwgODsZp3bQfXjXJUxNb8= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/cheekybits/is v0.0.0-20150225183255-68e9c0620927/go.mod h1:h/aW8ynjgkuj+NQRlZcDbAbM1ORAbXjXX77sX7T289U= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= @@ -92,6 +93,7 @@ github.com/dsoprea/go-utility v0.0.0-20200711062821-fab8125e9bdf/go.mod h1:95+K3 github.com/dsoprea/go-utility v0.0.0-20200717064901-2fccff4aa15e h1:ojqYA1mU6LuRm8XzrVOvyfb000y59cbUcu6Wt8sFSAs= github.com/dsoprea/go-utility v0.0.0-20200717064901-2fccff4aa15e/go.mod h1:KVK+/Hul09ujXAGq+42UBgCTnXkiJZRnLYdURGjQUwo= github.com/dsoprea/go-utility/v2 v2.0.0-20200717064901-2fccff4aa15e/go.mod h1:uAzdkPTub5Y9yQwXe8W4m2XuP0tK4a9Q/dantD0+uaU= +github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= @@ -266,6 +268,7 @@ github.com/leodido/go-urn v1.1.0/go.mod h1:+cyI34gQWZcE1eQU7NVgKkkzdXDQHr1dBMtdA github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= github.com/leodido/go-urn v1.2.1 h1:BqpAaACuzVSgi/VLzGZIobT2z4v53pjosyNd9Yv6n/w= github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY= +github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2/go.mod h1:0KeJpeMD6o+O4hW7qJOT7vyQPKrWmj26uf5wMc/IiIs= github.com/mattn/go-colorable v0.1.7/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2yME+cCiQ= github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= @@ -323,6 +326,7 @@ github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykE github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= @@ -335,6 +339,12 @@ github.com/superseriousbusiness/exifremove v0.0.0-20210330092427-6acd27eac203 h1 github.com/superseriousbusiness/exifremove v0.0.0-20210330092427-6acd27eac203/go.mod h1:0Xw5cYMOYpgaWs+OOSx41ugycl2qvKTi9tlMMcZhFyY= github.com/superseriousbusiness/oauth2/v4 v4.3.0-SSB h1:dzMVC+oPTxFL5cv29egBrftlqIWPXQ6/VzkuoySwgm4= github.com/superseriousbusiness/oauth2/v4 v4.3.0-SSB/go.mod h1:8p0a/BEN9hhsGzE3tPaFFlIZgxAaLyLN5KY0bPg9ZBc= +github.com/tdewolff/minify/v2 v2.9.21 h1:nO4s1PEMy7aRjlIlbr3Jgr+bJby8QYuifa2Vs2f9lh4= +github.com/tdewolff/minify/v2 v2.9.21/go.mod h1:PoDBts2L7sCwUT28vTAlozGeD6qxjrrihtin4bR/RMM= +github.com/tdewolff/parse/v2 v2.5.19 h1:Kjaj3KQOx/4elIxlBSglus4E2oMfdROphvbq2b+OBZ0= +github.com/tdewolff/parse/v2 v2.5.19/go.mod h1:WzaJpRSbwq++EIQHYIRTpbYKNA3gn9it1Ik++q4zyho= +github.com/tdewolff/test v1.0.6 h1:76mzYJQ83Op284kMT+63iCNCI7NEERsIN8dLM+RiKr4= +github.com/tdewolff/test v1.0.6/go.mod h1:6DAvZliBAAnD7rhVgwaM7DE5/d9NMOAJ09SqYqeK4QE= github.com/tidwall/btree v0.0.0-20191029221954-400434d76274/go.mod h1:huei1BkDWJ3/sLXmO+bsCNELL+Bp2Kks9OLyQFkzvA8= github.com/tidwall/btree v0.5.0 h1:IBfCtOj4uOMQcodv3wzYVo0zPqSJObm71mE039/dlXY= github.com/tidwall/btree v0.5.0/go.mod h1:TzIRzen6yHbibdSfK6t8QimqbUnoxUSrZfeW7Uob0q4= diff --git a/internal/api/client/status/statuscreate_test.go b/internal/api/client/status/statuscreate_test.go index c175a54ec..33912397e 100644 --- a/internal/api/client/status/statuscreate_test.go +++ b/internal/api/client/status/statuscreate_test.go @@ -165,7 +165,7 @@ func (suite *StatusCreateTestSuite) TestPostAnotherNewStatus() { err = json.Unmarshal(b, statusReply) assert.NoError(suite.T(), err) - assert.Equal(suite.T(), "\u003cp\u003e\u003ca href=\"http://localhost:8080/tags/test\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\"\u003e#\u003cspan\u003etest\u003c/span\u003e\u003c/a\u003e alright, should be able to post \u003ca href=\"http://localhost:8080/tags/links\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\"\u003e#\u003cspan\u003elinks\u003c/span\u003e\u003c/a\u003e with fragments in them now, let\u0026#39;s see........\u003cbr/\u003e\u003cbr/\u003e\u003ca href=\"https://docs.gotosocial.org/en/latest/user_guide/posts/#links\" rel=\"noopener nofollow noreferrer\" target=\"_blank\"\u003edocs.gotosocial.org/en/latest/user_guide/posts/#links\u003c/a\u003e\u003cbr/\u003e\u003cbr/\u003e\u003ca href=\"http://localhost:8080/tags/gotosocial\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\"\u003e#\u003cspan\u003egotosocial\u003c/span\u003e\u003c/a\u003e\u003cbr/\u003e\u003cbr/\u003e(tobi remember to pull the docker image challenge)\u003c/p\u003e", statusReply.Content) + assert.Equal(suite.T(), "

#test alright, should be able to post #links with fragments in them now, let's see........

docs.gotosocial.org/en/latest/user_guide/posts/#links

#gotosocial

(tobi remember to pull the docker image challenge)

", statusReply.Content) } func (suite *StatusCreateTestSuite) TestPostNewStatusWithEmoji() { @@ -198,7 +198,7 @@ func (suite *StatusCreateTestSuite) TestPostNewStatusWithEmoji() { assert.NoError(suite.T(), err) assert.Equal(suite.T(), "", statusReply.SpoilerText) - assert.Equal(suite.T(), "

here is a rainbow emoji a few times! :rainbow: :rainbow: :rainbow:
here's an emoji that isn't in the db: :test_emoji:

", statusReply.Content) + assert.Equal(suite.T(), "

here is a rainbow emoji a few times! :rainbow: :rainbow: :rainbow:
here's an emoji that isn't in the db: :test_emoji:

", statusReply.Content) assert.Len(suite.T(), statusReply.Emojis, 1) mastoEmoji := statusReply.Emojis[0] @@ -314,7 +314,7 @@ func (suite *StatusCreateTestSuite) TestAttachNewMediaSuccess() { assert.NoError(suite.T(), err) assert.Equal(suite.T(), "", statusResponse.SpoilerText) - assert.Equal(suite.T(), "

here's an image attachment

", statusResponse.Content) + assert.Equal(suite.T(), "

here's an image attachment

", statusResponse.Content) assert.False(suite.T(), statusResponse.Sensitive) assert.Equal(suite.T(), model.VisibilityPublic, statusResponse.Visibility) diff --git a/internal/processing/status/util_test.go b/internal/processing/status/util_test.go index 9a4bd6515..4bf508848 100644 --- a/internal/processing/status/util_test.go +++ b/internal/processing/status/util_test.go @@ -17,8 +17,8 @@ const statusText1 = `Another test @foss_satan@fossbros-anonymous.io #Hashtag Text` -const statusText1ExpectedFull = `

Another test @foss_satan

#Hashtag

Text

` -const statusText1ExpectedPartial = `

Another test @foss_satan

#Hashtag

Text

` +const statusText1ExpectedFull = "

Another test @foss_satan

#Hashtag

Text

" +const statusText1ExpectedPartial = "

Another test @foss_satan

#Hashtag

Text

" const statusText2 = `Another test @foss_satan@fossbros-anonymous.io @@ -26,7 +26,7 @@ const statusText2 = `Another test @foss_satan@fossbros-anonymous.io #hashTAG` -const status2TextExpectedFull = `

Another test @foss_satan

#Hashtag

#hashTAG

` +const status2TextExpectedFull = "

Another test @foss_satan

#Hashtag

#hashTAG

" type UtilTestSuite struct { StatusStandardTestSuite diff --git a/internal/text/common.go b/internal/text/common.go index 073a07264..f6a5ca5f5 100644 --- a/internal/text/common.go +++ b/internal/text/common.go @@ -20,6 +20,7 @@ package text import ( "fmt" + "html" "strings" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" @@ -29,21 +30,33 @@ import ( // preformat contains some common logic for making a string ready for formatting, which should be used for all user-input text. func preformat(in string) string { // do some preformatting of the text - // 1. Trim all the whitespace - s := strings.TrimSpace(in) + + // 1. unescape everything that might be html escaped + s := html.UnescapeString(in) + + // 2. trim leading or trailing whitespace + s = strings.TrimSpace(s) return s } // postformat contains some common logic for html sanitization of text, wrapping elements, and trimming newlines and whitespace func postformat(in string) string { // do some postformatting of the text - // 1. remove any cheeky newlines - s := strings.ReplaceAll(in, "\n", "") - // 2. remove any whitespace added as a result of the formatting - s = strings.TrimSpace(s) - // 3. sanitize - s = regular.Sanitize(s) - return s + + // 1. sanitize html to remove potentially dangerous elements + s := SanitizeHTML(in) + + // 2. the sanitize step tends to escape characters inside codeblocks, which is behavior we don't want, so unescape everything again + s = html.UnescapeString(s) + + // 3. minify html to remove any trailing newlines, spaces, unnecessary elements, etc etc + mini, err := minifyHTML(s) + if err != nil { + // if the minify failed, just return what we have + return s + } + // return minified version of the html + return mini } func (f *formatter) ReplaceTags(in string, tags []*gtsmodel.Tag) string { diff --git a/internal/text/markdown.go b/internal/text/markdown.go index 720f8f570..5a7603615 100644 --- a/internal/text/markdown.go +++ b/internal/text/markdown.go @@ -27,7 +27,7 @@ func (f *formatter) FromMarkdown(md string, mentions []*gtsmodel.Mention, tags [ content := preformat(md) // do the markdown parsing *first* - contentBytes := blackfriday.Run([]byte(md)) + contentBytes := blackfriday.Run([]byte(content)) // format tags nicely content = f.ReplaceTags(string(contentBytes), tags) diff --git a/internal/text/markdown_test.go b/internal/text/markdown_test.go index d0645dcad..432e9a4ec 100644 --- a/internal/text/markdown_test.go +++ b/internal/text/markdown_test.go @@ -19,6 +19,7 @@ package text_test import ( + "fmt" "testing" "github.com/stretchr/testify/suite" @@ -36,13 +37,31 @@ Here's a [link](https://example.org).` simpleMarkdownExpected = "

Title

Here’s a simple text in markdown.

Here’s a link.

" - withCodeBlock = "# Title\n\n``` text\nhere's some code!\n```\n\nthat was some code :)" - withCodeBlockExpected = "

Title

here's some code!

that was some code :)

" + withCodeBlockExpected = "

Title

Below is some JSON.

{\n  \"key\": \"value\",\n  \"another_key\": [\n    \"value1\",\n    \"value2\"\n  ]\n}\n

that was some JSON :)

" withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!" withHashtagExpected = "

Title

here’s a simple status that uses hashtag #Hashtag!

" ) +var ( + withCodeBlock = `# Title + +Below is some JSON. + +` + "```" + `json +{ + "key": "value", + "another_key": [ + "value1", + "value2" + ] +} +` + "```" + ` + +that was some JSON :) +` +) + type MarkdownTestSuite struct { TextStandardTestSuite } @@ -78,6 +97,7 @@ func (suite *MarkdownTestSuite) TestParseSimple() { } func (suite *MarkdownTestSuite) TestParseWithCodeBlock() { + fmt.Println(withCodeBlock) s := suite.formatter.FromMarkdown(withCodeBlock, nil, nil) suite.Equal(withCodeBlockExpected, s) } diff --git a/internal/text/minify.go b/internal/text/minify.go new file mode 100644 index 000000000..c6d7b9bc1 --- /dev/null +++ b/internal/text/minify.go @@ -0,0 +1,39 @@ +/* + GoToSocial + Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . +*/ + +package text + +import ( + "github.com/tdewolff/minify/v2" + "github.com/tdewolff/minify/v2/html" +) + +var m *minify.M + +// minifyHTML runs html through a minifier, reducing it in size. +func minifyHTML(in string) (string, error) { + if m == nil { + m = minify.New() + m.Add("text/html", &html.Minifier{ + KeepQuotes: true, + KeepEndTags: true, + KeepDocumentTags: true, + }) + } + return m.String("text/html", in) +} diff --git a/internal/text/plain.go b/internal/text/plain.go index 4d467a351..a44e02c80 100644 --- a/internal/text/plain.go +++ b/internal/text/plain.go @@ -46,7 +46,5 @@ func (f *formatter) FromPlain(plain string, mentions []*gtsmodel.Mention, tags [ // wrap the whole thing in a pee content = fmt.Sprintf(`

%s

`, content) - content = SanitizeHTML(content) - return postformat(content) } diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go index b41279a2e..33c95234c 100644 --- a/internal/text/plain_test.go +++ b/internal/text/plain_test.go @@ -34,14 +34,14 @@ const ( simpleExpected = "

this is a plain and simple status

" withTag = "here's a simple status that uses hashtag #welcome!" - withTagExpected = "

here's a simple status that uses hashtag #welcome!

" + withTagExpected = "

here's a simple status that uses hashtag #welcome!

" moreComplex = `Another test @foss_satan@fossbros-anonymous.io #Hashtag Text` - moreComplexFull = `

Another test @foss_satan

#Hashtag

Text

` + moreComplexFull = "

Another test @foss_satan

#Hashtag

Text

" ) type PlainTestSuite struct { diff --git a/internal/text/sanitize.go b/internal/text/sanitize.go index a7a274e2f..e1bc73559 100644 --- a/internal/text/sanitize.go +++ b/internal/text/sanitize.go @@ -36,7 +36,8 @@ var regular *bluemonday.Policy = bluemonday.UGCPolicy(). AddTargetBlankToFullyQualifiedLinks(true). AllowAttrs("class", "href", "rel").OnElements("a"). AllowAttrs("class").OnElements("span"). - AllowAttrs("class").Matching(regexp.MustCompile("^language-[a-zA-Z0-9]+$")).OnElements("code") + AllowAttrs("class").Matching(regexp.MustCompile("^language-[a-zA-Z0-9]+$")).OnElements("code"). + SkipElementsContent("code", "pre") // '[C]an be thought of as equivalent to stripping all HTML elements and their attributes as it has nothing on its allowlist. // An example usage scenario would be blog post titles where HTML tags are not expected at all