From 5155718bcbfc52b2dcd822e0c80e9e2bdd70d052 Mon Sep 17 00:00:00 2001 From: Daenney Date: Thu, 20 Mar 2025 15:28:28 +0100 Subject: [PATCH] feat: Relax URL matching Instead of only linkifying text with an explicit http or https scheme, use xurls.Relaxed, which also matches links with known TLDs. This means that text like 'banana.com' will also be matched, despite the missing http/https scheme. This also linkifies email addresses, which is handy. It should also ensure that links without a scheme are caught for the purpose of spam checking. --- internal/filter/spam/statusable.go | 2 +- internal/regexes/regexes.go | 12 +++--------- internal/text/markdown.go | 2 +- internal/text/plain.go | 2 +- 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/internal/filter/spam/statusable.go b/internal/filter/spam/statusable.go index 60598f920..819d03265 100644 --- a/internal/filter/spam/statusable.go +++ b/internal/filter/spam/statusable.go @@ -375,7 +375,7 @@ func (f *Filter) errantLinks( } // Find + parse every http/https link in the status. - rawLinks := regexes.LinkScheme.FindAllString(concat, -1) + rawLinks := regexes.URL.FindAllString(concat, -1) links := make([]preppedLink, 0, len(rawLinks)) for _, rawLink := range rawLinks { linkURI, err := url.Parse(rawLink) diff --git a/internal/regexes/regexes.go b/internal/regexes/regexes.go index 515f69a12..a8a190645 100644 --- a/internal/regexes/regexes.go +++ b/internal/regexes/regexes.go @@ -40,7 +40,6 @@ const ( reports = "reports" accepts = "accepts" - schemes = `(http|https)://` // Allowed URI protocols for parsing links in text. alphaNumeric = `\p{L}\p{M}*|\p{N}` // A single number or script character in any language, including chars with accents. usernameGrp = `(?:` + alphaNumeric + `|\.|\-|\_)` // Non-capturing group that matches against a single valid username character. domainGrp = `(?:` + alphaNumeric + `|\.|\-|\:)` // Non-capturing group that matches against a single valid domain character. 
@@ -79,14 +78,9 @@ const ( ) var ( - // LinkScheme captures http/https schemes in URLs. - LinkScheme = func() *regexp.Regexp { - rgx, err := xurls.StrictMatchingScheme(schemes) - if err != nil { - panic(err) - } - return rgx - }() + // URL captures anything that looks like a URL. This includes + // URLs without a scheme, based on a built-in list of TLDs. + URL = xurls.Relaxed() // MentionName captures the username and domain part from // a mention string such as @whatever_user@example.org, diff --git a/internal/text/markdown.go b/internal/text/markdown.go index 7e75f2898..96da9e57d 100644 --- a/internal/text/markdown.go +++ b/internal/text/markdown.go @@ -139,7 +139,7 @@ func (f *Formatter) fromMarkdown( }, // Turns URLs into links. extension.NewLinkify( - extension.WithLinkifyURLRegexp(regexes.LinkScheme), + extension.WithLinkifyURLRegexp(regexes.URL), ), extension.Strikethrough, ), diff --git a/internal/text/plain.go b/internal/text/plain.go index ee4947bf7..7176dfec7 100644 --- a/internal/text/plain.go +++ b/internal/text/plain.go @@ -168,7 +168,7 @@ func (f *Formatter) fromPlain( }, // Turns URLs into links. extension.NewLinkify( - extension.WithLinkifyURLRegexp(regexes.LinkScheme), + extension.WithLinkifyURLRegexp(regexes.URL), ), ), )