mirror of
				https://github.com/superseriousbusiness/gotosocial.git
				synced 2025-10-30 23:22:26 -05:00 
			
		
		
		
	
		
			
				
	
	
		
			134 lines
		
	
	
	
		
			4 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			134 lines
		
	
	
	
		
			4 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| /*
 | |
|    GoToSocial
 | |
|    Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org
 | |
| 
 | |
|    This program is free software: you can redistribute it and/or modify
 | |
|    it under the terms of the GNU Affero General Public License as published by
 | |
|    the Free Software Foundation, either version 3 of the License, or
 | |
|    (at your option) any later version.
 | |
| 
 | |
|    This program is distributed in the hope that it will be useful,
 | |
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|    GNU Affero General Public License for more details.
 | |
| 
 | |
|    You should have received a copy of the GNU Affero General Public License
 | |
|    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 | |
| */
 | |
| 
 | |
| package util
 | |
| 
 | |
| import (
 | |
| 	"unicode"
 | |
| 	"unicode/utf8"
 | |
| 
 | |
| 	"github.com/superseriousbusiness/gotosocial/internal/regexes"
 | |
| )
 | |
| 
 | |
| const (
 | |
| 	maximumHashtagLength = 30
 | |
| )
 | |
| 
 | |
| // DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text,
 | |
| // and applies a regex to it to return a deduplicated list of account names
 | |
| // mentioned in that text, in the format "@user@example.org" or "@username" for
 | |
| // local users.
 | |
| func DeriveMentionNamesFromText(text string) []string {
 | |
| 	mentionedAccounts := []string{}
 | |
| 	for _, m := range regexes.MentionFinder.FindAllStringSubmatch(text, -1) {
 | |
| 		mentionedAccounts = append(mentionedAccounts, m[1])
 | |
| 	}
 | |
| 	return UniqueStrings(mentionedAccounts)
 | |
| }
 | |
| 
 | |
| type Pair[A, B any] struct {
 | |
| 	First  A
 | |
| 	Second B
 | |
| }
 | |
| 
 | |
| // Byte index in original string
 | |
| // `First` includes `#`.
 | |
| type Span = Pair[int, int]
 | |
| 
 | |
| // Takes a plaintext (ie., not HTML-formatted) text,
 | |
| // and returns a slice of unique hashtags.
 | |
| func DeriveHashtagsFromText(text string) []string {
 | |
| 	tagsMap := make(map[string]bool)
 | |
| 	tags := []string{}
 | |
| 
 | |
| 	for _, v := range FindHashtagSpansInText(text) {
 | |
| 		t := text[v.First+1 : v.Second]
 | |
| 		if _, value := tagsMap[t]; !value {
 | |
| 			tagsMap[t] = true
 | |
| 			tags = append(tags, t)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return tags
 | |
| }
 | |
| 
 | |
| // Takes a plaintext (ie., not HTML-formatted) text,
 | |
| // and returns a list of pairs of indices into the original string, where
 | |
| // hashtags are located.
 | |
| func FindHashtagSpansInText(text string) []Span {
 | |
| 	tags := []Span{}
 | |
| 	start := 0
 | |
| 	// Keep one rune of lookbehind.
 | |
| 	prev := ' '
 | |
| 	inTag := false
 | |
| 
 | |
| 	for i, r := range text {
 | |
| 		if r == '#' && IsHashtagBoundary(prev) {
 | |
| 			// Start of hashtag.
 | |
| 			inTag = true
 | |
| 			start = i
 | |
| 		} else if inTag && !IsPermittedInHashtag(r) && !IsHashtagBoundary(r) {
 | |
| 			// Inside the hashtag, but it was a phoney, gottem.
 | |
| 			inTag = false
 | |
| 		} else if inTag && IsHashtagBoundary(r) {
 | |
| 			// End of hashtag.
 | |
| 			inTag = false
 | |
| 			appendTag(&tags, text, start, i)
 | |
| 		} else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) {
 | |
| 			// End of text.
 | |
| 			appendTag(&tags, text, start, irl)
 | |
| 		}
 | |
| 
 | |
| 		prev = r
 | |
| 	}
 | |
| 
 | |
| 	return tags
 | |
| }
 | |
| 
 | |
| func appendTag(tags *[]Span, text string, start int, end int) {
 | |
| 	l := end - start - 1
 | |
| 	// This check could be moved out into the parsing loop if necessary!
 | |
| 	if 0 < l && l <= maximumHashtagLength {
 | |
| 		*tags = append(*tags, Span{First: start, Second: end})
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text,
 | |
| // and applies a regex to it to return a deduplicated list of emojis
 | |
| // used in that text, without the surrounding `::`
 | |
| func DeriveEmojisFromText(text string) []string {
 | |
| 	emojis := []string{}
 | |
| 	for _, m := range regexes.EmojiFinder.FindAllStringSubmatch(text, -1) {
 | |
| 		emojis = append(emojis, m[1])
 | |
| 	}
 | |
| 	return UniqueStrings(emojis)
 | |
| }
 | |
| 
 | |
| func IsPermittedInHashtag(r rune) bool {
 | |
| 	return unicode.IsLetter(r) || unicode.IsNumber(r)
 | |
| }
 | |
| 
 | |
| // Decides where to break before or after a hashtag.
 | |
| func IsHashtagBoundary(r rune) bool {
 | |
| 	return r == '#' || // `###lol` should work
 | |
| 		unicode.IsSpace(r) || // All kinds of Unicode whitespace.
 | |
| 		unicode.IsControl(r) || // All kinds of control characters, like tab.
 | |
| 		// Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`).
 | |
| 		// But `someurl/#fragment` should not match, neither should HTML entities like `#`.
 | |
| 		('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r))
 | |
| }
 |