| 
									
										
										
										
											2023-03-12 16:00:57 +01:00
										 |  |  | // GoToSocial | 
					
						
							|  |  |  | // Copyright (C) GoToSocial Authors admin@gotosocial.org | 
					
						
							|  |  |  | // SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This program is free software: you can redistribute it and/or modify | 
					
						
							|  |  |  | // it under the terms of the GNU Affero General Public License as published by | 
					
						
							|  |  |  | // the Free Software Foundation, either version 3 of the License, or | 
					
						
							|  |  |  | // (at your option) any later version. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | // but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
					
						
							|  |  |  | // GNU Affero General Public License for more details. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // You should have received a copy of the GNU Affero General Public License | 
					
						
							|  |  |  | // along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
					
						
							| 
									
										
										
										
											2021-07-26 20:25:54 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | package text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import ( | 
					
						
							| 
									
										
										
										
											2023-02-03 10:58:58 +00:00
										 |  |  | 	"bytes" | 
					
						
							| 
									
										
										
										
											2021-08-25 15:34:33 +02:00
										 |  |  | 	"context" | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | 	gohtml "html" | 
					
						
							|  |  |  | 	"strings" | 
					
						
							| 
									
										
										
										
											2021-07-26 20:25:54 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-26 15:34:10 +02:00
										 |  |  | 	"code.superseriousbusiness.org/gotosocial/internal/gtsmodel" | 
					
						
							|  |  |  | 	"code.superseriousbusiness.org/gotosocial/internal/log" | 
					
						
							|  |  |  | 	"code.superseriousbusiness.org/gotosocial/internal/regexes" | 
					
						
							| 
									
										
										
										
											2023-10-04 13:09:42 +01:00
										 |  |  | 	"codeberg.org/gruf/go-byteutil" | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | 	"github.com/k3a/html2text" | 
					
						
							| 
									
										
										
										
											2023-02-03 10:58:58 +00:00
										 |  |  | 	"github.com/yuin/goldmark" | 
					
						
							|  |  |  | 	"github.com/yuin/goldmark/extension" | 
					
						
							|  |  |  | 	"github.com/yuin/goldmark/parser" | 
					
						
							|  |  |  | 	"github.com/yuin/goldmark/renderer/html" | 
					
						
							|  |  |  | 	"github.com/yuin/goldmark/util" | 
					
						
							| 
									
										
										
										
											2021-07-26 20:25:54 +02:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | // FromPlain fulfils FormatFunc by parsing | 
					
						
							|  |  |  | // the given plaintext input into a FormatResult. | 
					
						
							|  |  |  | func (f *Formatter) FromPlain( | 
					
						
							| 
									
										
										
										
											2023-05-09 12:16:10 +02:00
										 |  |  | 	ctx context.Context, | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	parseMention gtsmodel.ParseMentionFunc, | 
					
						
							| 
									
										
										
										
											2023-05-09 12:16:10 +02:00
										 |  |  | 	authorID string, | 
					
						
							|  |  |  | 	statusID string, | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	input string, | 
					
						
							| 
									
										
										
										
											2023-05-09 12:16:10 +02:00
										 |  |  | ) *FormatResult { | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	// Initialize standard block parser | 
					
						
							|  |  |  | 	// that wraps result in <p> tags. | 
					
						
							|  |  |  | 	plainTextParser := parser.NewParser( | 
					
						
							|  |  |  | 		parser.WithBlockParsers( | 
					
						
							|  |  |  | 			util.Prioritized(newPlaintextParser(), 500), | 
					
						
							| 
									
										
										
										
											2023-02-03 10:58:58 +00:00
										 |  |  | 		), | 
					
						
							|  |  |  | 	) | 
					
						
							| 
									
										
										
										
											2021-07-29 13:18:22 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	return f.fromPlain( | 
					
						
							|  |  |  | 		ctx, | 
					
						
							|  |  |  | 		plainTextParser, | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | 		false, // basic = false | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 		parseMention, | 
					
						
							|  |  |  | 		authorID, | 
					
						
							|  |  |  | 		statusID, | 
					
						
							|  |  |  | 		input, | 
					
						
							|  |  |  | 	) | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2021-07-26 20:25:54 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | // FromPlainNoParagraph fulfils FormatFunc by parsing | 
					
						
							|  |  |  | // the given plaintext input into a FormatResult. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // Unlike FromPlain, it will not wrap the resulting | 
					
						
							|  |  |  | // HTML in <p> tags, making it useful for parsing | 
					
						
							|  |  |  | // short fragments of text that oughtn't be formally | 
					
						
							|  |  |  | // wrapped as a paragraph. | 
					
						
							|  |  |  | func (f *Formatter) FromPlainNoParagraph( | 
					
						
							|  |  |  | 	ctx context.Context, | 
					
						
							|  |  |  | 	parseMention gtsmodel.ParseMentionFunc, | 
					
						
							|  |  |  | 	authorID string, | 
					
						
							|  |  |  | 	statusID string, | 
					
						
							|  |  |  | 	input string, | 
					
						
							|  |  |  | ) *FormatResult { | 
					
						
							|  |  |  | 	// Initialize block parser that | 
					
						
							|  |  |  | 	// doesn't wrap result in <p> tags. | 
					
						
							|  |  |  | 	plainTextParser := parser.NewParser( | 
					
						
							|  |  |  | 		parser.WithBlockParsers( | 
					
						
							|  |  |  | 			util.Prioritized(newPlaintextParserNoParagraph(), 500), | 
					
						
							|  |  |  | 		), | 
					
						
							|  |  |  | 	) | 
					
						
							| 
									
										
										
										
											2021-08-16 19:17:56 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	return f.fromPlain( | 
					
						
							|  |  |  | 		ctx, | 
					
						
							|  |  |  | 		plainTextParser, | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | 		false, // basic = false | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 		parseMention, | 
					
						
							|  |  |  | 		authorID, | 
					
						
							|  |  |  | 		statusID, | 
					
						
							|  |  |  | 		input, | 
					
						
							|  |  |  | 	) | 
					
						
							| 
									
										
										
										
											2021-07-26 20:25:54 +02:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2023-05-09 12:16:10 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | // FromPlainBasic fulfils FormatFunc by parsing | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | // the given plaintext input into a FormatResult. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // Unlike FromPlain, it will only parse emojis with | 
					
						
							|  |  |  | // the custom renderer, leaving aside mentions and tags. | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | // | 
					
						
							|  |  |  | // Resulting HTML will also NOT be wrapped in <p> tags. | 
					
						
							|  |  |  | func (f *Formatter) FromPlainBasic( | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	ctx context.Context, | 
					
						
							|  |  |  | 	parseMention gtsmodel.ParseMentionFunc, | 
					
						
							|  |  |  | 	authorID string, | 
					
						
							|  |  |  | 	statusID string, | 
					
						
							|  |  |  | 	input string, | 
					
						
							|  |  |  | ) *FormatResult { | 
					
						
							| 
									
										
										
										
											2023-10-04 13:09:42 +01:00
										 |  |  | 	// Initialize block parser that | 
					
						
							|  |  |  | 	// doesn't wrap result in <p> tags. | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	plainTextParser := parser.NewParser( | 
					
						
							| 
									
										
										
										
											2023-05-09 12:16:10 +02:00
										 |  |  | 		parser.WithBlockParsers( | 
					
						
							| 
									
										
										
										
											2023-10-04 13:09:42 +01:00
										 |  |  | 			util.Prioritized(newPlaintextParserNoParagraph(), 500), | 
					
						
							| 
									
										
										
										
											2023-05-09 12:16:10 +02:00
										 |  |  | 		), | 
					
						
							|  |  |  | 	) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	return f.fromPlain( | 
					
						
							|  |  |  | 		ctx, | 
					
						
							|  |  |  | 		plainTextParser, | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | 		true, // basic = true | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 		parseMention, | 
					
						
							|  |  |  | 		authorID, | 
					
						
							|  |  |  | 		statusID, | 
					
						
							|  |  |  | 		input, | 
					
						
							|  |  |  | 	) | 
					
						
							| 
									
										
										
										
											2023-05-09 12:16:10 +02:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | // fromPlain parses the given input text | 
					
						
							|  |  |  | // using the given plainTextParser, and | 
					
						
							|  |  |  | // returns the result. | 
					
						
							|  |  |  | func (f *Formatter) fromPlain( | 
					
						
							|  |  |  | 	ctx context.Context, | 
					
						
							|  |  |  | 	plainTextParser parser.Parser, | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | 	basic bool, | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	parseMention gtsmodel.ParseMentionFunc, | 
					
						
							|  |  |  | 	authorID string, | 
					
						
							|  |  |  | 	statusID string, | 
					
						
							|  |  |  | 	input string, | 
					
						
							|  |  |  | ) *FormatResult { | 
					
						
							|  |  |  | 	result := new(FormatResult) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// Instantiate goldmark parser for | 
					
						
							|  |  |  | 	// plaintext, using custom renderer | 
					
						
							|  |  |  | 	// to add hashtag/mention links. | 
					
						
							|  |  |  | 	md := goldmark.New( | 
					
						
							|  |  |  | 		goldmark.WithRendererOptions( | 
					
						
							|  |  |  | 			html.WithXHTML(), | 
					
						
							|  |  |  | 			html.WithHardWraps(), | 
					
						
							|  |  |  | 		), | 
					
						
							|  |  |  | 		// Use whichever plaintext | 
					
						
							|  |  |  | 		// parser we were passed. | 
					
						
							|  |  |  | 		goldmark.WithParser(plainTextParser), | 
					
						
							|  |  |  | 		goldmark.WithExtensions( | 
					
						
							|  |  |  | 			&customRenderer{ | 
					
						
							|  |  |  | 				ctx, | 
					
						
							|  |  |  | 				f.db, | 
					
						
							|  |  |  | 				parseMention, | 
					
						
							|  |  |  | 				authorID, | 
					
						
							|  |  |  | 				statusID, | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | 				// If basic, pass | 
					
						
							|  |  |  | 				// emojiOnly = true. | 
					
						
							|  |  |  | 				basic, | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 				result, | 
					
						
							|  |  |  | 			}, | 
					
						
							| 
									
										
										
										
											2024-03-15 18:26:53 +01:00
										 |  |  | 			// Turns URLs into links. | 
					
						
							|  |  |  | 			extension.NewLinkify( | 
					
						
							| 
									
										
										
										
											2025-03-24 14:13:32 +01:00
										 |  |  | 				extension.WithLinkifyURLRegexp(regexes.URLLike), | 
					
						
							| 
									
										
										
										
											2024-03-15 18:26:53 +01:00
										 |  |  | 			), | 
					
						
							| 
									
										
										
										
											2023-05-09 12:16:10 +02:00
										 |  |  | 		), | 
					
						
							|  |  |  | 	) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-04 13:09:42 +01:00
										 |  |  | 	// Convert input string to bytes | 
					
						
							|  |  |  | 	// without performing any allocs. | 
					
						
							|  |  |  | 	bInput := byteutil.S2B(input) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	// Parse input into HTML. | 
					
						
							|  |  |  | 	var htmlBytes bytes.Buffer | 
					
						
							|  |  |  | 	if err := md.Convert( | 
					
						
							| 
									
										
										
										
											2023-10-04 13:09:42 +01:00
										 |  |  | 		bInput, | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 		&htmlBytes, | 
					
						
							|  |  |  | 	); err != nil { | 
					
						
							|  |  |  | 		log.Errorf(ctx, "error formatting plaintext input to HTML: %s", err) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// Clean and shrink HTML. | 
					
						
							| 
									
										
										
										
											2023-10-04 13:09:42 +01:00
										 |  |  | 	result.HTML = byteutil.B2S(htmlBytes.Bytes()) | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | 	result.HTML = SanitizeHTML(result.HTML) | 
					
						
							| 
									
										
										
										
											2023-09-29 10:39:56 +02:00
										 |  |  | 	result.HTML = MinifyHTML(result.HTML) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return result | 
					
						
							| 
									
										
										
										
											2023-05-09 12:16:10 +02:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2025-03-07 15:04:34 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | // ParseHTMLToPlain parses the given HTML string, then | 
					
						
							|  |  |  | // outputs it to equivalent plaintext while trying to | 
					
						
							|  |  |  | // keep as much of the smenantic intent of the input | 
					
						
							|  |  |  | // HTML as possible, ie., titles are placed on separate | 
					
						
							|  |  |  | // lines, `<br>`s are converted to newlines, text inside | 
					
						
							|  |  |  | // `<strong>` and `<em>` tags is retained, but without | 
					
						
							|  |  |  | // emphasis, `<a>` links are unnested and the URL they | 
					
						
							|  |  |  | // link to is placed in angle brackets next to them, | 
					
						
							|  |  |  | // lists are replaced with newline-separated indented | 
					
						
							|  |  |  | // items, etc. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This function is useful when you need to filter on | 
					
						
							|  |  |  | // HTML and want to avoid catching tags in the filter, | 
					
						
							|  |  |  | // or when you want to serve something in a plaintext | 
					
						
							|  |  |  | // format that may contain HTML tags (eg., CWs). | 
					
						
							|  |  |  | func ParseHTMLToPlain(html string) string { | 
					
						
							|  |  |  | 	plain := html2text.HTML2TextWithOptions( | 
					
						
							|  |  |  | 		html, | 
					
						
							|  |  |  | 		html2text.WithLinksInnerText(), | 
					
						
							|  |  |  | 		html2text.WithUnixLineBreaks(), | 
					
						
							|  |  |  | 		html2text.WithListSupport(), | 
					
						
							|  |  |  | 	) | 
					
						
							|  |  |  | 	return strings.TrimSpace(plain) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // StripHTMLFromText runs text through strict sanitization | 
					
						
							|  |  |  | // to completely remove any HTML from the input without | 
					
						
							|  |  |  | // trying to preserve the semantic intent of any HTML tags. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This is useful in cases where the input was not allowed | 
					
						
							|  |  |  | // to contain HTML at all, and the output isn't either. | 
					
						
							|  |  |  | func StripHTMLFromText(text string) string { | 
					
						
							|  |  |  | 	// Unescape first to catch any tricky critters. | 
					
						
							|  |  |  | 	content := gohtml.UnescapeString(text) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// Remove all detected HTML. | 
					
						
							|  |  |  | 	content = strict.Sanitize(content) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// Unescape again to return plaintext. | 
					
						
							|  |  |  | 	content = gohtml.UnescapeString(content) | 
					
						
							|  |  |  | 	return strings.TrimSpace(content) | 
					
						
							|  |  |  | } |