mirror of
				https://github.com/superseriousbusiness/gotosocial.git
				synced 2025-10-31 14:42:26 -05:00 
			
		
		
		
	[chore] Text formatting overhaul (#1406)
* Implement goldmark debug print for hashtags and mentions * Minify HTML in FromPlain * Convert plaintext status parser to goldmark * Move mention/tag/emoji finding logic into formatter * Combine mention and hashtag boundary characters * Normalize unicode when rendering hashtags
This commit is contained in:
		
					parent
					
						
							
								271da016b9
							
						
					
				
			
			
				commit
				
					
						49beb17a8f
					
				
			
		
					 26 changed files with 826 additions and 1314 deletions
				
			
		|  | @ -20,115 +20,19 @@ package util | |||
| 
 | ||||
| import ( | ||||
| 	"unicode" | ||||
| 	"unicode/utf8" | ||||
| 
 | ||||
| 	"github.com/superseriousbusiness/gotosocial/internal/regexes" | ||||
| ) | ||||
| 
 | ||||
| const ( | ||||
| 	maximumHashtagLength = 30 | ||||
| ) | ||||
| 
 | ||||
| // DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text, | ||||
| // and applies a regex to it to return a deduplicated list of account names | ||||
| // mentioned in that text, in the format "@user@example.org" or "@username" for | ||||
| // local users. | ||||
| func DeriveMentionNamesFromText(text string) []string { | ||||
| 	mentionedAccounts := []string{} | ||||
| 	for _, m := range regexes.MentionFinder.FindAllStringSubmatch(text, -1) { | ||||
| 		mentionedAccounts = append(mentionedAccounts, m[1]) | ||||
| 	} | ||||
| 	return UniqueStrings(mentionedAccounts) | ||||
| } | ||||
| 
 | ||||
| type Pair[A, B any] struct { | ||||
| 	First  A | ||||
| 	Second B | ||||
| } | ||||
| 
 | ||||
| // Byte index in original string | ||||
| // `First` includes `#`. | ||||
| type Span = Pair[int, int] | ||||
| 
 | ||||
| // Takes a plaintext (ie., not HTML-formatted) text, | ||||
| // and returns a slice of unique hashtags. | ||||
| func DeriveHashtagsFromText(text string) []string { | ||||
| 	tagsMap := make(map[string]bool) | ||||
| 	tags := []string{} | ||||
| 
 | ||||
| 	for _, v := range FindHashtagSpansInText(text) { | ||||
| 		t := text[v.First+1 : v.Second] | ||||
| 		if _, value := tagsMap[t]; !value { | ||||
| 			tagsMap[t] = true | ||||
| 			tags = append(tags, t) | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return tags | ||||
| } | ||||
| 
 | ||||
| // Takes a plaintext (ie., not HTML-formatted) text, | ||||
| // and returns a list of pairs of indices into the original string, where | ||||
| // hashtags are located. | ||||
| func FindHashtagSpansInText(text string) []Span { | ||||
| 	tags := []Span{} | ||||
| 	start := 0 | ||||
| 	// Keep one rune of lookbehind. | ||||
| 	prev := ' ' | ||||
| 	inTag := false | ||||
| 
 | ||||
| 	for i, r := range text { | ||||
| 		if r == '#' && IsHashtagBoundary(prev) { | ||||
| 			// Start of hashtag. | ||||
| 			inTag = true | ||||
| 			start = i | ||||
| 		} else if inTag && !IsPermittedInHashtag(r) && !IsHashtagBoundary(r) { | ||||
| 			// Inside the hashtag, but it was a phoney, gottem. | ||||
| 			inTag = false | ||||
| 		} else if inTag && IsHashtagBoundary(r) { | ||||
| 			// End of hashtag. | ||||
| 			inTag = false | ||||
| 			appendTag(&tags, text, start, i) | ||||
| 		} else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) { | ||||
| 			// End of text. | ||||
| 			appendTag(&tags, text, start, irl) | ||||
| 		} | ||||
| 
 | ||||
| 		prev = r | ||||
| 	} | ||||
| 
 | ||||
| 	return tags | ||||
| } | ||||
| 
 | ||||
| func appendTag(tags *[]Span, text string, start int, end int) { | ||||
| 	l := end - start - 1 | ||||
| 	// This check could be moved out into the parsing loop if necessary! | ||||
| 	if 0 < l && l <= maximumHashtagLength { | ||||
| 		*tags = append(*tags, Span{First: start, Second: end}) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| // DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text, | ||||
| // and applies a regex to it to return a deduplicated list of emojis | ||||
| // used in that text, without the surrounding `::` | ||||
| func DeriveEmojisFromText(text string) []string { | ||||
| 	emojis := []string{} | ||||
| 	for _, m := range regexes.EmojiFinder.FindAllStringSubmatch(text, -1) { | ||||
| 		emojis = append(emojis, m[1]) | ||||
| 	} | ||||
| 	return UniqueStrings(emojis) | ||||
| func IsPlausiblyInHashtag(r rune) bool { | ||||
| 	// Marks are allowed during parsing, prior to normalization, but not after, | ||||
| 	// since they may be combined into letters during normalization. | ||||
| 	return unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r) | ||||
| } | ||||
| 
 | ||||
| func IsPermittedInHashtag(r rune) bool { | ||||
| 	return unicode.IsLetter(r) || unicode.IsNumber(r) | ||||
| } | ||||
| 
 | ||||
| // Decides where to break before or after a hashtag. | ||||
| func IsHashtagBoundary(r rune) bool { | ||||
| 	return r == '#' || // `###lol` should work | ||||
| 		unicode.IsSpace(r) || // All kinds of Unicode whitespace. | ||||
| 		unicode.IsControl(r) || // All kinds of control characters, like tab. | ||||
| 		// Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`). | ||||
| 		// But `someurl/#fragment` should not match, neither should HTML entities like `#`. | ||||
| 		('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r)) | ||||
| // Decides where to break before or after a #hashtag or @mention | ||||
| func IsMentionOrHashtagBoundary(r rune) bool { | ||||
| 	return unicode.IsSpace(r) || unicode.IsPunct(r) | ||||
| } | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue