[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049)

* [bugfix] Fix unicode-unaware word boundary check in hashtag regex

Go's regexp `\b` only matches ASCII word boundaries (it is not Unicode-aware), and
because Go's regexp syntax has no lookahead, the workarounds got very ugly. So I replaced the regex with a parser.

The parser runs in O(n) time and performance should not be affected.

* [bugfix] Add back hashtag max length and add tests for it
This commit is contained in:
ugla 2022-11-15 16:05:34 +01:00 committed by GitHub
commit 52109776f6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 146 additions and 45 deletions

View file

@ -47,7 +47,6 @@ const (
// Length limits interpolated into the validation/extraction patterns of this file.
const (
// maximumUsernameLength is the username length limit.
// NOTE(review): its point of use is not visible in this excerpt — confirm against the username pattern.
maximumUsernameLength = 64
// maximumEmojiShortcodeLength is the upper repetition bound of the
// emojiShortcode pattern (`\w{2,%d}`), i.e. the longest shortcode accepted.
maximumEmojiShortcodeLength = 30
// maximumHashtagLength is the upper repetition bound for the letter/number
// run that follows '#' in the hashtagFinder pattern.
maximumHashtagLength = 30
)
var (
@ -66,17 +65,11 @@ var (
// such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols)
MentionName = regexp.MustCompile(mentionName)
// mention regex can be played around with here: https://regex101.com/r/G1oGR0/1
// mention regex can be played around with here: https://regex101.com/r/P0vpYG/1
mentionFinder = `(?:^|\s)(@\w+(?:@[a-zA-Z0-9_\-\.]+)?)`
// MentionFinder extracts mentions from a piece of text.
MentionFinder = regexp.MustCompile(mentionFinder)
// hashtag regex can be played with here: https://regex101.com/r/bpyGlj/1
hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[\p{L}\p{N}]{1,%d})(?:#|\b)`, maximumHashtagLength)
// HashtagFinder finds possible hashtags in a string.
// It returns just the string part of the hashtag, not the # symbol.
HashtagFinder = regexp.MustCompile(hashtagFinder)
emojiShortcode = fmt.Sprintf(`\w{2,%d}`, maximumEmojiShortcodeLength)
// EmojiShortcode validates an emoji name.
EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode))