mirror of
https://github.com/superseriousbusiness/gotosocial.git
synced 2025-10-29 19:32:26 -05:00
[bugfix] Extend parser to handle more non-Latin hashtags (#3700)
* Allow marks after NFC normalization. Includes a regression test for the Tamil example from #3618. * Disallow hashtags that consist only of numbers, marks, and underscores.
This commit is contained in:
parent
ab758cc233
commit
b9e0689359
5 changed files with 48 additions and 37 deletions
|
|
@ -19,19 +19,14 @@ package text
|
|||
|
||||
import "unicode"
|
||||
|
||||
func isPlausiblyInHashtag(r rune) bool {
|
||||
// Marks are allowed during parsing
|
||||
// prior to normalization, but not after,
|
||||
// since they may be combined into letters
|
||||
// during normalization.
|
||||
return unicode.IsMark(r) ||
|
||||
isPermittedInHashtag(r)
|
||||
func isPermittedInHashtag(r rune) bool {
|
||||
return unicode.IsLetter(r) || isPermittedIfNotEntireHashtag(r)
|
||||
}
|
||||
|
||||
func isPermittedInHashtag(r rune) bool {
|
||||
return unicode.IsLetter(r) ||
|
||||
unicode.IsNumber(r) ||
|
||||
r == '_'
|
||||
// isPermittedIfNotEntireHashtag is true for characters that may be in a hashtag
// but are not allowed to be the only characters making up the hashtag.
//
// It accepts Unicode numbers, Unicode marks (combining characters),
// and the underscore. It classifies a single rune only; rejecting a
// hashtag made up solely of such runes is presumably done by the
// caller -- verify against the hashtag parser.
func isPermittedIfNotEntireHashtag(r rune) bool {
	switch {
	case r == '_':
		return true
	case unicode.IsNumber(r):
		return true
	case unicode.IsMark(r):
		return true
	default:
		return false
	}
}
|
||||
|
||||
// isHashtagBoundary returns true if rune r
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue