mirror of
https://github.com/superseriousbusiness/gotosocial.git
synced 2025-10-29 04:52:24 -05:00
Link parsing (#120)
* add link parsing + formatting functionality * refinement + docs * add missing test * credit url library
This commit is contained in:
parent
fddacdd1a8
commit
ea8ad8b346
9 changed files with 434 additions and 51 deletions
115
internal/text/link.go
Normal file
115
internal/text/link.go
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
/*
|
||||
GoToSocial
|
||||
Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package text
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
|
||||
"mvdan.cc/xurls/v2"
|
||||
)
|
||||
|
||||
// schemes is the scheme pattern handed to xurls.StrictMatchingScheme by both FindLinks and ReplaceLinks.
|
||||
// It matches only the prefixes "http://" and "https://", so text using any other scheme (mailto: for example) is not picked up as a link.
|
||||
var schemes = `(((http|https))://)`
|
||||
|
||||
// FindLinks parses the given string looking for recognizable URLs (including scheme).
|
||||
// It returns a list of those URLs, without changing the string, or an error if something goes wrong.
|
||||
// If no URLs are found within the given string, an empty slice and nil will be returned.
|
||||
func FindLinks(in string) ([]*url.URL, error) {
|
||||
rxStrict, err := xurls.StrictMatchingScheme(schemes)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
urls := []*url.URL{}
|
||||
|
||||
// bail already if we don't find anything
|
||||
found := rxStrict.FindAllString(in, -1)
|
||||
if len(found) == 0 {
|
||||
return urls, nil
|
||||
}
|
||||
|
||||
// for each string we find, we want to parse it into a URL if we can
|
||||
// if we fail to parse it, just ignore this match and continue
|
||||
for _, f := range found {
|
||||
u, err := url.Parse(f)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
urls = append(urls, u)
|
||||
}
|
||||
|
||||
// deduplicate the URLs
|
||||
urlsDeduped := []*url.URL{}
|
||||
|
||||
for _, u := range urls {
|
||||
if !contains(urlsDeduped, u) {
|
||||
urlsDeduped = append(urlsDeduped, u)
|
||||
}
|
||||
}
|
||||
|
||||
return urlsDeduped, nil
|
||||
}
|
||||
|
||||
// contains reports whether urls already holds an entry whose String() form
// is equal to that of the given url.
func contains(urls []*url.URL, url *url.URL) bool {
	for i := 0; i < len(urls); i++ {
		if candidate := urls[i]; candidate.String() == url.String() {
			return true
		}
	}
	return false
}
|
||||
|
||||
// ReplaceLinks replaces all detected links in a piece of text with their HTML (href) equivalents.
|
||||
// Note: because Go doesn't allow negative lookbehinds in regex, it's possible that an already-formatted
|
||||
// href will end up double-formatted, if the text you pass here contains one or more hrefs already.
|
||||
// To avoid this, you should sanitize any HTML out of text before you pass it into this function.
|
||||
func ReplaceLinks(in string) string {
|
||||
rxStrict, err := xurls.StrictMatchingScheme(schemes)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
replaced := rxStrict.ReplaceAllStringFunc(in, func(urlString string) string {
|
||||
thisURL, err := url.Parse(urlString)
|
||||
if err != nil {
|
||||
return urlString // we can't parse it as a URL so don't replace it
|
||||
}
|
||||
|
||||
shortString := thisURL.Hostname()
|
||||
|
||||
if thisURL.Path != "" {
|
||||
shortString = shortString + thisURL.Path
|
||||
}
|
||||
|
||||
if thisURL.Fragment != "" {
|
||||
shortString = shortString + "#" + thisURL.Fragment
|
||||
}
|
||||
|
||||
if thisURL.RawQuery != "" {
|
||||
shortString = shortString + "?" + thisURL.RawQuery
|
||||
}
|
||||
|
||||
replacement := fmt.Sprintf(`<a href="%s" rel="noopener">%s</a>`, urlString, shortString)
|
||||
return replacement
|
||||
})
|
||||
return replaced
|
||||
}
|
||||
155
internal/text/link_test.go
Normal file
155
internal/text/link_test.go
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
/*
|
||||
GoToSocial
|
||||
Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package text_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/suite"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/text"
|
||||
)
|
||||
|
||||
// Input fixtures shared by the FindLinks and ReplaceLinks tests.
const (
	// text1 mixes three well-formed http(s) links, a scheme-less hostname,
	// and two links glued together without a separator.
	text1 = `
This is a text with some links in it. Here's link number one: https://example.org/link/to/something#fragment

Here's link number two: http://test.example.org?q=bahhhhhhhhhhhh

https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it

really.cool.website <-- this one shouldn't be parsed as a link because it doesn't contain the scheme

https://example.orghttps://google.com <-- this shouldn't work either, but it does?! OK
`

	// text2 contains the same link twice.
	text2 = `
this is one link: https://example.org

this is the same link again: https://example.org

these should be deduplicated
`

	// text3 contains only a mailto link.
	text3 = `
here's a mailto link: mailto:whatever@test.org
`

	// text4 contains two links where one is a prefix of the other.
	text4 = `
two similar links:

https://example.org

https://example.org/test
`

	// text5 contains a link that is already wrapped in an anchor tag.
	text5 = `
what happens when we already have a link within an href?

<a href="https://example.org">https://example.org</a>
`
)
|
||||
|
||||
// TextTestSuite is the testify suite holding the FindLinks and ReplaceLinks tests in this file.
// It embeds suite.Suite and declares no fields of its own.
type TextTestSuite struct {
|
||||
suite.Suite
|
||||
}
|
||||
|
||||
func (suite *TextTestSuite) TestParseURLsFromText1() {
|
||||
urls, err := text.FindLinks(text1)
|
||||
|
||||
assert.NoError(suite.T(), err)
|
||||
|
||||
assert.Equal(suite.T(), "https://example.org/link/to/something#fragment", urls[0].String())
|
||||
assert.Equal(suite.T(), "http://test.example.org?q=bahhhhhhhhhhhh", urls[1].String())
|
||||
assert.Equal(suite.T(), "https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it", urls[2].String())
|
||||
assert.Equal(suite.T(), "https://example.orghttps://google.com", urls[3].String())
|
||||
}
|
||||
|
||||
func (suite *TextTestSuite) TestParseURLsFromText2() {
|
||||
urls, err := text.FindLinks(text2)
|
||||
assert.NoError(suite.T(), err)
|
||||
|
||||
// assert length 1 because the found links will be deduplicated
|
||||
assert.Len(suite.T(), urls, 1)
|
||||
}
|
||||
|
||||
func (suite *TextTestSuite) TestParseURLsFromText3() {
|
||||
urls, err := text.FindLinks(text3)
|
||||
assert.NoError(suite.T(), err)
|
||||
|
||||
// assert length 0 because `mailto:` isn't accepted
|
||||
assert.Len(suite.T(), urls, 0)
|
||||
}
|
||||
|
||||
func (suite *TextTestSuite) TestReplaceLinksFromText1() {
|
||||
replaced := text.ReplaceLinks(text1)
|
||||
assert.Equal(suite.T(), `
|
||||
This is a text with some links in it. Here's link number one: <a href="https://example.org/link/to/something#fragment" rel="noopener">example.org/link/to/something#fragment</a>
|
||||
|
||||
Here's link number two: <a href="http://test.example.org?q=bahhhhhhhhhhhh" rel="noopener">test.example.org?q=bahhhhhhhhhhhh</a>
|
||||
|
||||
<a href="https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it" rel="noopener">another.link.example.org/with/a/pretty/long/path/at/the/end/of/it</a>
|
||||
|
||||
really.cool.website <-- this one shouldn't be parsed as a link because it doesn't contain the scheme
|
||||
|
||||
<a href="https://example.orghttps://google.com" rel="noopener">example.orghttps//google.com</a> <-- this shouldn't work either, but it does?! OK
|
||||
`, replaced)
|
||||
}
|
||||
|
||||
func (suite *TextTestSuite) TestReplaceLinksFromText2() {
|
||||
replaced := text.ReplaceLinks(text2)
|
||||
assert.Equal(suite.T(), `
|
||||
this is one link: <a href="https://example.org" rel="noopener">example.org</a>
|
||||
|
||||
this is the same link again: <a href="https://example.org" rel="noopener">example.org</a>
|
||||
|
||||
these should be deduplicated
|
||||
`, replaced)
|
||||
}
|
||||
|
||||
func (suite *TextTestSuite) TestReplaceLinksFromText3() {
|
||||
// we know mailto links won't be replaced with hrefs -- we only accept https and http
|
||||
replaced := text.ReplaceLinks(text3)
|
||||
assert.Equal(suite.T(), `
|
||||
here's a mailto link: mailto:whatever@test.org
|
||||
`, replaced)
|
||||
}
|
||||
|
||||
func (suite *TextTestSuite) TestReplaceLinksFromText4() {
|
||||
replaced := text.ReplaceLinks(text4)
|
||||
assert.Equal(suite.T(), `
|
||||
two similar links:
|
||||
|
||||
<a href="https://example.org" rel="noopener">example.org</a>
|
||||
|
||||
<a href="https://example.org/test" rel="noopener">example.org/test</a>
|
||||
`, replaced)
|
||||
}
|
||||
|
||||
func (suite *TextTestSuite) TestReplaceLinksFromText5() {
|
||||
// we know this one doesn't work properly, which is why html should always be sanitized before being passed into the ReplaceLinks function
|
||||
replaced := text.ReplaceLinks(text5)
|
||||
assert.Equal(suite.T(), `
|
||||
what happens when we already have a link within an href?
|
||||
|
||||
<a href="<a href="https://example.org" rel="noopener">example.org</a>"><a href="https://example.org" rel="noopener">example.org</a></a>
|
||||
`, replaced)
|
||||
}
|
||||
|
||||
func TestTextTestSuite(t *testing.T) {
|
||||
suite.Run(t, new(TextTestSuite))
|
||||
}
|
||||
|
|
@ -28,6 +28,9 @@ import (
|
|||
func (f *formatter) FromPlain(plain string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string {
|
||||
content := preformat(plain)
|
||||
|
||||
// format links nicely
|
||||
content = ReplaceLinks(content)
|
||||
|
||||
// format mentions nicely
|
||||
for _, menchie := range mentions {
|
||||
targetAccount := >smodel.Account{}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue