mirror of
				https://github.com/superseriousbusiness/gotosocial.git
				synced 2025-10-31 09:02:25 -05:00 
			
		
		
		
	[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049)
* [bugfix] Fix unicode-unaware word boundary check in hashtag regex

  Go's `\b` is not Unicode-aware, and without lookahead the workarounds got very ugly, so I replaced the regex with a parser. The parser runs in O(n) time, so performance should not be affected.

* [bugfix] Add back hashtag max length and add tests for it
This commit is contained in:
		
					parent
					
						
							
								fece7fa706
							
						
					
				
			
			
				commit
				
					
						52109776f6
					
				
			
		
					 4 changed files with 146 additions and 45 deletions
				
			
		|  | @ -47,7 +47,6 @@ const ( | ||||||
| const ( | const ( | ||||||
| 	maximumUsernameLength       = 64 | 	maximumUsernameLength       = 64 | ||||||
| 	maximumEmojiShortcodeLength = 30 | 	maximumEmojiShortcodeLength = 30 | ||||||
| 	maximumHashtagLength        = 30 |  | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| var ( | var ( | ||||||
|  | @ -66,17 +65,11 @@ var ( | ||||||
| 	// such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols) | 	// such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols) | ||||||
| 	MentionName = regexp.MustCompile(mentionName) | 	MentionName = regexp.MustCompile(mentionName) | ||||||
| 
 | 
 | ||||||
| 	// mention regex can be played around with here: https://regex101.com/r/G1oGR0/1 | 	// mention regex can be played around with here: https://regex101.com/r/P0vpYG/1 | ||||||
| 	mentionFinder = `(?:^|\s)(@\w+(?:@[a-zA-Z0-9_\-\.]+)?)` | 	mentionFinder = `(?:^|\s)(@\w+(?:@[a-zA-Z0-9_\-\.]+)?)` | ||||||
| 	// MentionFinder extracts mentions from a piece of text. | 	// MentionFinder extracts mentions from a piece of text. | ||||||
| 	MentionFinder = regexp.MustCompile(mentionFinder) | 	MentionFinder = regexp.MustCompile(mentionFinder) | ||||||
| 
 | 
 | ||||||
| 	// hashtag regex can be played with here: https://regex101.com/r/bpyGlj/1 |  | ||||||
| 	hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[\p{L}\p{N}]{1,%d})(?:#|\b)`, maximumHashtagLength) |  | ||||||
| 	// HashtagFinder finds possible hashtags in a string. |  | ||||||
| 	// It returns just the string part of the hashtag, not the # symbol. |  | ||||||
| 	HashtagFinder = regexp.MustCompile(hashtagFinder) |  | ||||||
| 
 |  | ||||||
| 	emojiShortcode = fmt.Sprintf(`\w{2,%d}`, maximumEmojiShortcodeLength) | 	emojiShortcode = fmt.Sprintf(`\w{2,%d}`, maximumEmojiShortcodeLength) | ||||||
| 	// EmojiShortcode validates an emoji name. | 	// EmojiShortcode validates an emoji name. | ||||||
| 	EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode)) | 	EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode)) | ||||||
|  |  | ||||||
|  | @ -27,36 +27,46 @@ import ( | ||||||
| 	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel" | 	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel" | ||||||
| 	"github.com/superseriousbusiness/gotosocial/internal/log" | 	"github.com/superseriousbusiness/gotosocial/internal/log" | ||||||
| 	"github.com/superseriousbusiness/gotosocial/internal/regexes" | 	"github.com/superseriousbusiness/gotosocial/internal/regexes" | ||||||
|  | 	"github.com/superseriousbusiness/gotosocial/internal/util" | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string { | func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string { | ||||||
| 	return regexes.ReplaceAllStringFunc(regexes.HashtagFinder, in, func(match string, buf *bytes.Buffer) string { | 	spans := util.FindHashtagSpansInText(in) | ||||||
| 		// we have a match | 
 | ||||||
| 		matchTrimmed := strings.TrimSpace(match) | 	if len(spans) == 0 { | ||||||
| 		tagAsEntered := matchTrimmed[1:] | 		return in | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	var b strings.Builder | ||||||
|  | 	i := 0 | ||||||
|  | 
 | ||||||
|  | spans: | ||||||
|  | 	for _, t := range spans { | ||||||
|  | 		b.WriteString(in[i:t.First]) | ||||||
|  | 		i = t.Second | ||||||
|  | 		tagAsEntered := in[t.First+1 : t.Second] | ||||||
| 
 | 
 | ||||||
| 		// check through the tags to find what we're matching |  | ||||||
| 		for _, tag := range tags { | 		for _, tag := range tags { | ||||||
| 			if strings.EqualFold(tagAsEntered, tag.Name) { | 			if strings.EqualFold(tagAsEntered, tag.Name) { | ||||||
| 				// Add any dropped space from match |  | ||||||
| 				if unicode.IsSpace(rune(match[0])) { |  | ||||||
| 					buf.WriteByte(match[0]) |  | ||||||
| 				} |  | ||||||
| 
 |  | ||||||
| 				// replace the #tag with the formatted tag content | 				// replace the #tag with the formatted tag content | ||||||
| 				// `<a href="tag.URL" class="mention hashtag" rel="tag">#<span>tagAsEntered</span></a> | 				// `<a href="tag.URL" class="mention hashtag" rel="tag">#<span>tagAsEntered</span></a> | ||||||
| 				buf.WriteString(`<a href="`) | 				b.WriteString(`<a href="`) | ||||||
| 				buf.WriteString(tag.URL) | 				b.WriteString(tag.URL) | ||||||
| 				buf.WriteString(`" class="mention hashtag" rel="tag">#<span>`) | 				b.WriteString(`" class="mention hashtag" rel="tag">#<span>`) | ||||||
| 				buf.WriteString(tagAsEntered) | 				b.WriteString(tagAsEntered) | ||||||
| 				buf.WriteString(`</span></a>`) | 				b.WriteString(`</span></a>`) | ||||||
| 				return buf.String() | 				continue spans | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		// the match wasn't in the list of tags for whatever reason, so just return the match as we found it so nothing changes | 		b.WriteString(in[t.First:t.Second]) | ||||||
| 		return match | 	} | ||||||
| 	}) | 
 | ||||||
|  | 	// Get the last bits. | ||||||
|  | 	i = spans[len(spans)-1].Second | ||||||
|  | 	b.WriteString(in[i:]) | ||||||
|  | 
 | ||||||
|  | 	return b.String() | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string { | func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string { | ||||||
|  |  | ||||||
|  | @ -19,11 +19,16 @@ | ||||||
| package util | package util | ||||||
| 
 | 
 | ||||||
| import ( | import ( | ||||||
| 	"strings" | 	"unicode" | ||||||
|  | 	"unicode/utf8" | ||||||
| 
 | 
 | ||||||
| 	"github.com/superseriousbusiness/gotosocial/internal/regexes" | 	"github.com/superseriousbusiness/gotosocial/internal/regexes" | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
|  | const ( | ||||||
|  | 	maximumHashtagLength = 30 | ||||||
|  | ) | ||||||
|  | 
 | ||||||
| // DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text, | // DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text, | ||||||
| // and applies a regex to it to return a deduplicated list of account names | // and applies a regex to it to return a deduplicated list of account names | ||||||
| // mentioned in that text, in the format "@user@example.org" or "@username" for | // mentioned in that text, in the format "@user@example.org" or "@username" for | ||||||
|  | @ -36,16 +41,71 @@ func DeriveMentionNamesFromText(text string) []string { | ||||||
| 	return UniqueStrings(mentionedAccounts) | 	return UniqueStrings(mentionedAccounts) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // DeriveHashtagsFromText takes a plaintext (ie., not html-formatted) text, | type Pair[A, B any] struct { | ||||||
| // and applies a regex to it to return a deduplicated list of hashtags | 	First  A | ||||||
| // used in that text, without the leading #. The case of the returned | 	Second B | ||||||
| // tags will be lowered, for consistency. | } | ||||||
|  | 
 | ||||||
|  | // Byte index in original string | ||||||
|  | // `First` includes `#`. | ||||||
|  | type Span = Pair[int, int] | ||||||
|  | 
 | ||||||
|  | // Takes a plaintext (ie., not HTML-formatted) text, | ||||||
|  | // and returns a slice of unique hashtags. | ||||||
| func DeriveHashtagsFromText(text string) []string { | func DeriveHashtagsFromText(text string) []string { | ||||||
|  | 	tagsMap := make(map[string]bool) | ||||||
| 	tags := []string{} | 	tags := []string{} | ||||||
| 	for _, m := range regexes.HashtagFinder.FindAllStringSubmatch(text, -1) { | 
 | ||||||
| 		tags = append(tags, strings.TrimPrefix(m[1], "#")) | 	for _, v := range FindHashtagSpansInText(text) { | ||||||
|  | 		t := text[v.First+1 : v.Second] | ||||||
|  | 		if _, value := tagsMap[t]; !value { | ||||||
|  | 			tagsMap[t] = true | ||||||
|  | 			tags = append(tags, t) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return tags | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Takes a plaintext (ie., not HTML-formatted) text, | ||||||
|  | // and returns a list of pairs of indices into the original string, where | ||||||
|  | // hashtags are located. | ||||||
|  | func FindHashtagSpansInText(text string) []Span { | ||||||
|  | 	tags := []Span{} | ||||||
|  | 	start := 0 | ||||||
|  | 	// Keep one rune of lookbehind. | ||||||
|  | 	prev := ' ' | ||||||
|  | 	inTag := false | ||||||
|  | 
 | ||||||
|  | 	for i, r := range text { | ||||||
|  | 		if r == '#' && isHashtagBoundary(prev) { | ||||||
|  | 			// Start of hashtag. | ||||||
|  | 			inTag = true | ||||||
|  | 			start = i | ||||||
|  | 		} else if inTag && !isPermittedInHashtag(r) && !isHashtagBoundary(r) { | ||||||
|  | 			// Inside the hashtag, but it was a phoney, gottem. | ||||||
|  | 			inTag = false | ||||||
|  | 		} else if inTag && isHashtagBoundary(r) { | ||||||
|  | 			// End of hashtag. | ||||||
|  | 			inTag = false | ||||||
|  | 			appendTag(&tags, text, start, i) | ||||||
|  | 		} else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) { | ||||||
|  | 			// End of text. | ||||||
|  | 			appendTag(&tags, text, start, irl) | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		prev = r | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return tags | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | func appendTag(tags *[]Span, text string, start int, end int) { | ||||||
|  | 	l := end - start - 1 | ||||||
|  | 	// This check could be moved out into the parsing loop if necessary! | ||||||
|  | 	if 0 < l && l <= maximumHashtagLength { | ||||||
|  | 		*tags = append(*tags, Span{First: start, Second: end}) | ||||||
| 	} | 	} | ||||||
| 	return UniqueStrings(tags) |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text, | // DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text, | ||||||
|  | @ -58,3 +118,17 @@ func DeriveEmojisFromText(text string) []string { | ||||||
| 	} | 	} | ||||||
| 	return UniqueStrings(emojis) | 	return UniqueStrings(emojis) | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | func isPermittedInHashtag(r rune) bool { | ||||||
|  | 	return unicode.IsLetter(r) || unicode.IsNumber(r) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Decides where to break before or after a hashtag. | ||||||
|  | func isHashtagBoundary(r rune) bool { | ||||||
|  | 	return r == '#' || // `###lol` should work | ||||||
|  | 		unicode.IsSpace(r) || // All kinds of Unicode whitespace. | ||||||
|  | 		unicode.IsControl(r) || // All kinds of control characters, like tab. | ||||||
|  | 		// Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`). | ||||||
|  | 		// But `someurl/#fragment` should not match, neither should HTML entities like `&#35;`. | ||||||
|  | 		('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r)) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | @ -77,26 +77,50 @@ func (suite *StatusTestSuite) TestDeriveHashtagsOK() { | ||||||
| 
 | 
 | ||||||
| # testing this one shouldn't work | # testing this one shouldn't work | ||||||
| 
 | 
 | ||||||
| 			#thisshouldwork | 			#thisshouldwork #dupe #dupe!! #dupe | ||||||
| 
 | 
 | ||||||
| 	here's a link with a fragment: https://example.org/whatever#ahhh | 	here's a link with a fragment: https://example.org/whatever#ahhh | ||||||
|  | 	here's another link with a fragment: https://example.org/whatever/#ahhh | ||||||
| 
 | 
 | ||||||
| #ThisShouldAlsoWork #not_this_though | (#ThisShouldAlsoWork) #not_this_though | ||||||
| 
 | 
 | ||||||
| #111111 thisalsoshouldn'twork#### ## | #111111 thisalsoshouldn'twork#### ## | ||||||
| 
 | 
 | ||||||
| #alimentación, #saúde | #alimentación, #saúde, #lävistää, #ö, #네 | ||||||
|  | #ThisOneIsThirtyOneCharactersLon...  ...ng | ||||||
|  | #ThisOneIsThirteyCharactersLong | ||||||
| ` | ` | ||||||
| 
 | 
 | ||||||
| 	tags := util.DeriveHashtagsFromText(statusText) | 	tags := util.DeriveHashtagsFromText(statusText) | ||||||
| 	assert.Len(suite.T(), tags, 7) | 	assert.Len(suite.T(), tags, 12) | ||||||
| 	assert.Equal(suite.T(), "testing123", tags[0]) | 	assert.Equal(suite.T(), "testing123", tags[0]) | ||||||
| 	assert.Equal(suite.T(), "also", tags[1]) | 	assert.Equal(suite.T(), "also", tags[1]) | ||||||
| 	assert.Equal(suite.T(), "thisshouldwork", tags[2]) | 	assert.Equal(suite.T(), "thisshouldwork", tags[2]) | ||||||
| 	assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[3]) | 	assert.Equal(suite.T(), "dupe", tags[3]) | ||||||
| 	assert.Equal(suite.T(), "111111", tags[4]) | 	assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[4]) | ||||||
| 	assert.Equal(suite.T(), "alimentación", tags[5]) | 	assert.Equal(suite.T(), "111111", tags[5]) | ||||||
| 	assert.Equal(suite.T(), "saúde", tags[6]) | 	assert.Equal(suite.T(), "alimentación", tags[6]) | ||||||
|  | 	assert.Equal(suite.T(), "saúde", tags[7]) | ||||||
|  | 	assert.Equal(suite.T(), "lävistää", tags[8]) | ||||||
|  | 	assert.Equal(suite.T(), "ö", tags[9]) | ||||||
|  | 	assert.Equal(suite.T(), "네", tags[10]) | ||||||
|  | 	assert.Equal(suite.T(), "ThisOneIsThirteyCharactersLong", tags[11]) | ||||||
|  | 
 | ||||||
|  | 	statusText = `#올빼미 hej` | ||||||
|  | 	tags = util.DeriveHashtagsFromText(statusText) | ||||||
|  | 	assert.Equal(suite.T(), "올빼미", tags[0]) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | func (suite *StatusTestSuite) TestHashtagSpansOK() { | ||||||
|  | 	statusText := `#0 #3   #8aa` | ||||||
|  | 
 | ||||||
|  | 	spans := util.FindHashtagSpansInText(statusText) | ||||||
|  | 	assert.Equal(suite.T(), 0, spans[0].First) | ||||||
|  | 	assert.Equal(suite.T(), 2, spans[0].Second) | ||||||
|  | 	assert.Equal(suite.T(), 3, spans[1].First) | ||||||
|  | 	assert.Equal(suite.T(), 5, spans[1].Second) | ||||||
|  | 	assert.Equal(suite.T(), 8, spans[2].First) | ||||||
|  | 	assert.Equal(suite.T(), 12, spans[2].Second) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func (suite *StatusTestSuite) TestDeriveEmojiOK() { | func (suite *StatusTestSuite) TestDeriveEmojiOK() { | ||||||
|  | @ -127,7 +151,7 @@ Here's some normal text with an :emoji: at the end | ||||||
| func (suite *StatusTestSuite) TestDeriveMultiple() { | func (suite *StatusTestSuite) TestDeriveMultiple() { | ||||||
| 	statusText := `Another test @foss_satan@fossbros-anonymous.io | 	statusText := `Another test @foss_satan@fossbros-anonymous.io | ||||||
| 
 | 
 | ||||||
| 	#Hashtag | 	#HashTag | ||||||
| 
 | 
 | ||||||
| 	Text` | 	Text` | ||||||
| 
 | 
 | ||||||
|  | @ -139,7 +163,7 @@ func (suite *StatusTestSuite) TestDeriveMultiple() { | ||||||
| 	assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", ms[0]) | 	assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", ms[0]) | ||||||
| 
 | 
 | ||||||
| 	assert.Len(suite.T(), hs, 1) | 	assert.Len(suite.T(), hs, 1) | ||||||
| 	assert.Equal(suite.T(), "Hashtag", hs[0]) | 	assert.Contains(suite.T(), hs, "HashTag") | ||||||
| 
 | 
 | ||||||
| 	assert.Len(suite.T(), es, 0) | 	assert.Len(suite.T(), es, 0) | ||||||
| } | } | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue