[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049)

* [bugfix] Fix unicode-unaware word boundary check in hashtag regex

  Go's `\b` is not Unicode-aware, and without lookahead support the regex workarounds got very ugly, so I replaced the regex with a parser. The parser runs in O(n) time, so performance should not be affected.

* [bugfix] Add back hashtag max length and add tests for it
2025-10-29 09:52:26 -05:00 · 2022-11-15 16:05:34 +01:00 · 2022-11-15 16:05:34 +01:00 · 52109776f6
commit 52109776f6
parent fece7fa706
4 changed files with 146 additions and 45 deletions
--- a/internal/regexes/regexes.go
+++ b/internal/regexes/regexes.go
@@ -47,7 +47,6 @@ const (
 const (
 	maximumUsernameLength       = 64
 	maximumEmojiShortcodeLength = 30
-	maximumHashtagLength        = 30
 )

 var (
@@ -66,17 +65,11 @@ var (
 	// such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols)
 	MentionName = regexp.MustCompile(mentionName)

-	// mention regex can be played around with here: https://regex101.com/r/G1oGR0/1
+	// mention regex can be played around with here: https://regex101.com/r/P0vpYG/1
 	mentionFinder = `(?:^|\s)(@\w+(?:@[a-zA-Z0-9_\-\.]+)?)`
 	// MentionFinder extracts mentions from a piece of text.
 	MentionFinder = regexp.MustCompile(mentionFinder)

-	// hashtag regex can be played with here: https://regex101.com/r/bpyGlj/1
-	hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[\p{L}\p{N}]{1,%d})(?:#|\b)`, maximumHashtagLength)
-	// HashtagFinder finds possible hashtags in a string.
-	// It returns just the string part of the hashtag, not the # symbol.
-	HashtagFinder = regexp.MustCompile(hashtagFinder)
-
 	emojiShortcode = fmt.Sprintf(`\w{2,%d}`, maximumEmojiShortcodeLength)
 	// EmojiShortcode validates an emoji name.
 	EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode))