Allow marks after NFC normalization

Includes regression test for the Tamil example from #3618
2026-01-05 20:33:17 -06:00 · 2025-01-27 09:41:10 -08:00 · 2025-01-27 09:41:10 -08:00 · 13a2573ca2
commit 13a2573ca2
parent 726d2ba483
5 changed files with 29 additions and 22 deletions
--- a/internal/text/util.go
+++ b/internal/text/util.go
@@ -19,19 +19,16 @@ package text

 import "unicode"

-func isPlausiblyInHashtag(r rune) bool {
-	// Marks are allowed during parsing
-	// prior to normalization, but not after,
-	// since they may be combined into letters
-	// during normalization.
-	return unicode.IsMark(r) ||
-		isPermittedInHashtag(r)
-}
-
 func isPermittedInHashtag(r rune) bool {
 	return unicode.IsLetter(r) ||
 		unicode.IsNumber(r) ||
-		r == '_'
+		isPermittedIfNotEntireHashtag(r)
+}
+
+// isPermittedIfNotEntireHashtag is true for characters that may be in a hashtag
+// but are not allowed to be the only characters making up the hashtag.
+func isPermittedIfNotEntireHashtag(r rune) bool {
+	return unicode.IsMark(r) || r == '_'
 }

 // isHashtagBoundary returns true if rune r