[chore] Text formatting overhaul (#1406)

* Implement goldmark debug print for hashtags and mentions
* Minify HTML in FromPlain
* Convert plaintext status parser to goldmark
* Move mention/tag/emoji finding logic into formatter
* Combine mention and hashtag boundary characters
* Normalize Unicode when rendering hashtags
2025-12-16 12:33:00 -06:00 · 2023-02-03 10:58:58 +00:00 · 2023-02-03 10:58:58 +00:00 · 49beb17a8f
commit 49beb17a8f
parent 271da016b9
26 changed files with 826 additions and 1314 deletions
--- a/internal/util/statustools.go
+++ b/internal/util/statustools.go
@@ -20,115 +20,19 @@ package util

 import (
 	"unicode"
-	"unicode/utf8"
-
-	"github.com/superseriousbusiness/gotosocial/internal/regexes"
 )

-const (
-	maximumHashtagLength = 30
-)
-
-// DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text,
-// and applies a regex to it to return a deduplicated list of account names
-// mentioned in that text, in the format "@user@example.org" or "@username" for
-// local users.
-func DeriveMentionNamesFromText(text string) []string {
-	mentionedAccounts := []string{}
-	for _, m := range regexes.MentionFinder.FindAllStringSubmatch(text, -1) {
-		mentionedAccounts = append(mentionedAccounts, m[1])
-	}
-	return UniqueStrings(mentionedAccounts)
-}
-
-type Pair[A, B any] struct {
-	First  A
-	Second B
-}
-
-// Byte index in original string
-// `First` includes `#`.
-type Span = Pair[int, int]
-
-// Takes a plaintext (ie., not HTML-formatted) text,
-// and returns a slice of unique hashtags.
-func DeriveHashtagsFromText(text string) []string {
-	tagsMap := make(map[string]bool)
-	tags := []string{}
-
-	for _, v := range FindHashtagSpansInText(text) {
-		t := text[v.First+1 : v.Second]
-		if _, value := tagsMap[t]; !value {
-			tagsMap[t] = true
-			tags = append(tags, t)
-		}
-	}
-
-	return tags
-}
-
-// Takes a plaintext (ie., not HTML-formatted) text,
-// and returns a list of pairs of indices into the original string, where
-// hashtags are located.
-func FindHashtagSpansInText(text string) []Span {
-	tags := []Span{}
-	start := 0
-	// Keep one rune of lookbehind.
-	prev := ' '
-	inTag := false
-
-	for i, r := range text {
-		if r == '#' && IsHashtagBoundary(prev) {
-			// Start of hashtag.
-			inTag = true
-			start = i
-		} else if inTag && !IsPermittedInHashtag(r) && !IsHashtagBoundary(r) {
-			// Inside the hashtag, but it was a phoney, gottem.
-			inTag = false
-		} else if inTag && IsHashtagBoundary(r) {
-			// End of hashtag.
-			inTag = false
-			appendTag(&tags, text, start, i)
-		} else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) {
-			// End of text.
-			appendTag(&tags, text, start, irl)
-		}
-
-		prev = r
-	}
-
-	return tags
-}
-
-func appendTag(tags *[]Span, text string, start int, end int) {
-	l := end - start - 1
-	// This check could be moved out into the parsing loop if necessary!
-	if 0 < l && l <= maximumHashtagLength {
-		*tags = append(*tags, Span{First: start, Second: end})
-	}
-}
-
-// DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text,
-// and applies a regex to it to return a deduplicated list of emojis
-// used in that text, without the surrounding `::`
-func DeriveEmojisFromText(text string) []string {
-	emojis := []string{}
-	for _, m := range regexes.EmojiFinder.FindAllStringSubmatch(text, -1) {
-		emojis = append(emojis, m[1])
-	}
-	return UniqueStrings(emojis)
+func IsPlausiblyInHashtag(r rune) bool {
+	// Letters, numbers and marks. Marks are accepted only while parsing, before
+	// normalization may combine them into letters; IsPermittedInHashtag rejects them.
+	return unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r)
 }
 
 func IsPermittedInHashtag(r rune) bool {
 	return unicode.IsLetter(r) || unicode.IsNumber(r)
 }

-// Decides where to break before or after a hashtag.
-func IsHashtagBoundary(r rune) bool {
-	return r == '#' || // `###lol` should work
-		unicode.IsSpace(r) || // All kinds of Unicode whitespace.
-		unicode.IsControl(r) || // All kinds of control characters, like tab.
-		// Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`).
-		// But `someurl/#fragment` should not match, neither should HTML entities like `&#35;`.
-		('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r))
+// IsMentionOrHashtagBoundary reports whether r (a space or punctuation rune) may break before or after a #hashtag or @mention.
+func IsMentionOrHashtagBoundary(r rune) bool {
+	return unicode.IsSpace(r) || unicode.IsPunct(r)
 }