| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | /* | 
					
						
							|  |  |  |    GoToSocial | 
					
						
							| 
									
										
										
										
											2021-12-20 18:42:19 +01:00
										 |  |  |    Copyright (C) 2021-2022 GoToSocial Authors admin@gotosocial.org | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |    This program is free software: you can redistribute it and/or modify | 
					
						
							|  |  |  |    it under the terms of the GNU Affero General Public License as published by | 
					
						
							|  |  |  |    the Free Software Foundation, either version 3 of the License, or | 
					
						
							|  |  |  |    (at your option) any later version. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |    This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  |    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
					
						
							|  |  |  |    GNU Affero General Public License for more details. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |    You should have received a copy of the GNU Affero General Public License | 
					
						
							|  |  |  |    along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
					
						
							|  |  |  | */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | package util | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import ( | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
										 |  |  | 	"unicode" | 
					
						
							|  |  |  | 	"unicode/utf8" | 
					
						
							| 
									
										
										
										
											2021-09-01 18:29:25 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	"github.com/superseriousbusiness/gotosocial/internal/regexes" | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
const (
	// maximumHashtagLength is the upper bound that appendTag applies to the
	// length of a hashtag's text (the part after the leading `#`). Candidates
	// exceeding it are not recorded as hashtags.
	maximumHashtagLength = 30
)
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-29 11:54:56 +02:00
										 |  |  | // DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text, | 
					
						
							|  |  |  | // and applies a regex to it to return a deduplicated list of account names | 
					
						
							|  |  |  | // mentioned in that text, in the format "@user@example.org" or "@username" for | 
					
						
							|  |  |  | // local users. | 
					
						
							|  |  |  | func DeriveMentionNamesFromText(text string) []string { | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | 	mentionedAccounts := []string{} | 
					
						
							| 
									
										
										
										
											2021-09-11 13:19:06 +02:00
										 |  |  | 	for _, m := range regexes.MentionFinder.FindAllStringSubmatch(text, -1) { | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | 		mentionedAccounts = append(mentionedAccounts, m[1]) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2021-08-20 12:26:56 +02:00
										 |  |  | 	return UniqueStrings(mentionedAccounts) | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
// Pair is a generic container holding two values whose types may differ.
type Pair[A, B any] struct {
	// First is the first element of the pair.
	First A
	// Second is the second element of the pair.
	Second B
}
					
						
							|  |  |  | 
 | 
					
						
// Span is a pair of byte indices into the original string.
// `First` is the index of the leading `#` (so it includes the `#`), and
// `Second` is used as the exclusive end index when the hashtag is sliced
// out of the text.
type Span = Pair[int, int]
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Takes a plaintext (ie., not HTML-formatted) text, | 
					
						
							|  |  |  | // and returns a slice of unique hashtags. | 
					
						
							| 
									
										
										
										
											2021-09-11 13:19:06 +02:00
										 |  |  | func DeriveHashtagsFromText(text string) []string { | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
										 |  |  | 	tagsMap := make(map[string]bool) | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | 	tags := []string{} | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	for _, v := range FindHashtagSpansInText(text) { | 
					
						
							|  |  |  | 		t := text[v.First+1 : v.Second] | 
					
						
							|  |  |  | 		if _, value := tagsMap[t]; !value { | 
					
						
							|  |  |  | 			tagsMap[t] = true | 
					
						
							|  |  |  | 			tags = append(tags, t) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return tags | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Takes a plaintext (ie., not HTML-formatted) text, | 
					
						
							|  |  |  | // and returns a list of pairs of indices into the original string, where | 
					
						
							|  |  |  | // hashtags are located. | 
					
						
							|  |  |  | func FindHashtagSpansInText(text string) []Span { | 
					
						
							|  |  |  | 	tags := []Span{} | 
					
						
							|  |  |  | 	start := 0 | 
					
						
							|  |  |  | 	// Keep one rune of lookbehind. | 
					
						
							|  |  |  | 	prev := ' ' | 
					
						
							|  |  |  | 	inTag := false | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	for i, r := range text { | 
					
						
							| 
									
										
										
										
											2022-12-16 11:20:22 +00:00
										 |  |  | 		if r == '#' && IsHashtagBoundary(prev) { | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
										 |  |  | 			// Start of hashtag. | 
					
						
							|  |  |  | 			inTag = true | 
					
						
							|  |  |  | 			start = i | 
					
						
							| 
									
										
										
										
											2022-12-16 11:20:22 +00:00
										 |  |  | 		} else if inTag && !IsPermittedInHashtag(r) && !IsHashtagBoundary(r) { | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
										 |  |  | 			// Inside the hashtag, but it was a phoney, gottem. | 
					
						
							|  |  |  | 			inTag = false | 
					
						
							| 
									
										
										
										
											2022-12-16 11:20:22 +00:00
										 |  |  | 		} else if inTag && IsHashtagBoundary(r) { | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
										 |  |  | 			// End of hashtag. | 
					
						
							|  |  |  | 			inTag = false | 
					
						
							|  |  |  | 			appendTag(&tags, text, start, i) | 
					
						
							|  |  |  | 		} else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) { | 
					
						
							|  |  |  | 			// End of text. | 
					
						
							|  |  |  | 			appendTag(&tags, text, start, irl) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		prev = r | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return tags | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func appendTag(tags *[]Span, text string, start int, end int) { | 
					
						
							|  |  |  | 	l := end - start - 1 | 
					
						
							|  |  |  | 	// This check could be moved out into the parsing loop if necessary! | 
					
						
							|  |  |  | 	if 0 < l && l <= maximumHashtagLength { | 
					
						
							|  |  |  | 		*tags = append(*tags, Span{First: start, Second: end}) | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-11 13:19:06 +02:00
										 |  |  | // DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text, | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | // and applies a regex to it to return a deduplicated list of emojis | 
					
						
							| 
									
										
										
										
											2021-09-11 13:19:06 +02:00
										 |  |  | // used in that text, without the surrounding `::` | 
					
						
							|  |  |  | func DeriveEmojisFromText(text string) []string { | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | 	emojis := []string{} | 
					
						
							| 
									
										
										
										
											2021-09-11 13:19:06 +02:00
										 |  |  | 	for _, m := range regexes.EmojiFinder.FindAllStringSubmatch(text, -1) { | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | 		emojis = append(emojis, m[1]) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2021-08-20 12:26:56 +02:00
										 |  |  | 	return UniqueStrings(emojis) | 
					
						
							| 
									
										
										
										
											2021-04-19 19:42:19 +02:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-12-16 11:20:22 +00:00
										 |  |  | func IsPermittedInHashtag(r rune) bool { | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
										 |  |  | 	return unicode.IsLetter(r) || unicode.IsNumber(r) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Decides where to break before or after a hashtag. | 
					
						
							| 
									
										
										
										
											2022-12-16 11:20:22 +00:00
										 |  |  | func IsHashtagBoundary(r rune) bool { | 
					
						
							| 
									
										
										
										
											2022-11-15 16:05:34 +01:00
										 |  |  | 	return r == '#' || // `###lol` should work | 
					
						
							|  |  |  | 		unicode.IsSpace(r) || // All kinds of Unicode whitespace. | 
					
						
							|  |  |  | 		unicode.IsControl(r) || // All kinds of control characters, like tab. | 
					
						
							|  |  |  | 		// Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`). | 
					
						
							|  |  |  | 		// But `someurl/#fragment` should not match, neither should HTML entities like `#`. | 
					
						
							|  |  |  | 		('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r)) | 
					
						
							|  |  |  | } |