| 
									
										
										
										
											2023-07-31 15:47:35 +02:00
										 |  |  | // GoToSocial | 
					
						
							|  |  |  | // Copyright (C) GoToSocial Authors admin@gotosocial.org | 
					
						
							|  |  |  | // SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This program is free software: you can redistribute it and/or modify | 
					
						
							|  |  |  | // it under the terms of the GNU Affero General Public License as published by | 
					
						
							|  |  |  | // the Free Software Foundation, either version 3 of the License, or | 
					
						
							|  |  |  | // (at your option) any later version. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | // but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
					
						
							|  |  |  | // GNU Affero General Public License for more details. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // You should have received a copy of the GNU Affero General Public License | 
					
						
							|  |  |  | // along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | package text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import ( | 
					
						
							|  |  |  | 	"strings" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	"golang.org/x/text/unicode/norm" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | const ( | 
					
						
							|  |  |  | 	maximumHashtagLength = 100 | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // NormalizeHashtag normalizes the given hashtag text by | 
					
						
							|  |  |  | // removing the initial '#' symbol, and then decomposing | 
					
						
							|  |  |  | // and canonically recomposing chars + combining diacritics | 
					
						
							|  |  |  | // in the text to single unicode characters, following | 
					
						
							|  |  |  | // Normalization Form C (https://unicode.org/reports/tr15/). | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // Finally, it will do a check on the normalized string to | 
					
						
							|  |  |  | // ensure that it's below maximumHashtagLength chars, and | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | // contains only letters, numbers, and underscores (and not | 
					
						
							|  |  |  | // *JUST* underscores). | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // If all this passes, returned bool will be true. | 
					
						
							| 
									
										
										
										
											2023-07-31 15:47:35 +02:00
										 |  |  | func NormalizeHashtag(text string) (string, bool) { | 
					
						
							|  |  |  | 	// This normalization is specifically to avoid cases | 
					
						
							|  |  |  | 	// where visually-identical hashtags are stored with | 
					
						
							|  |  |  | 	// different unicode representations (e.g. with combining | 
					
						
							|  |  |  | 	// diacritics). It allows a tasteful number of combining | 
					
						
							|  |  |  | 	// diacritics to be used, as long as they can be combined | 
					
						
							|  |  |  | 	// with parent characters to form regular letter symbols. | 
					
						
							|  |  |  | 	normalized := norm.NFC.String(strings.TrimPrefix(text, "#")) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	// Validate normalized result. | 
					
						
							|  |  |  | 	var ( | 
					
						
							|  |  |  | 		notJustUnderscores = false | 
					
						
							|  |  |  | 		onlyPermittedChars = true | 
					
						
							|  |  |  | 		lengthOK           = true | 
					
						
							|  |  |  | 	) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 15:47:35 +02:00
										 |  |  | 	for i, r := range normalized { | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 		if r != '_' { | 
					
						
							|  |  |  | 			// This isn't an underscore, | 
					
						
							|  |  |  | 			// so the whole hashtag isn't | 
					
						
							|  |  |  | 			// just underscores. | 
					
						
							|  |  |  | 			notJustUnderscores = true | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if i >= maximumHashtagLength { | 
					
						
							|  |  |  | 			lengthOK = false | 
					
						
							|  |  |  | 			break | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if !isPermittedInHashtag(r) { | 
					
						
							|  |  |  | 			onlyPermittedChars = false | 
					
						
							| 
									
										
										
										
											2023-07-31 15:47:35 +02:00
										 |  |  | 			break | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	return normalized, (lengthOK && onlyPermittedChars && notJustUnderscores) | 
					
						
							| 
									
										
										
										
											2023-07-31 15:47:35 +02:00
										 |  |  | } |