mirror of
				https://github.com/superseriousbusiness/gotosocial.git
				synced 2025-10-31 02:22:26 -05:00 
			
		
		
		
	[bugfix] Extend parser to handle more non-Latin hashtags (#3700)
* Allow marks after NFC normalization. Includes regression test for the Tamil example from #3618.
* Disallow just numbers + marks + underscore as hashtag.
This commit is contained in:
		
					parent
					
						
							
								ab758cc233
							
						
					
				
			
			
				commit
				
					
						b9e0689359
					
				
			
		
					 5 changed files with 48 additions and 37 deletions
				
			
		|  | @ -177,7 +177,7 @@ func (p *hashtagParser) Parse( | ||||||
| 			// Ignore initial '#'. | 			// Ignore initial '#'. | ||||||
| 			continue | 			continue | ||||||
| 
 | 
 | ||||||
| 		case !isPlausiblyInHashtag(r) && | 		case !isPermittedInHashtag(r) && | ||||||
| 			!isHashtagBoundary(r): | 			!isHashtagBoundary(r): | ||||||
| 			// Weird non-boundary character | 			// Weird non-boundary character | ||||||
| 			// in the hashtag. Don't trust it. | 			// in the hashtag. Don't trust it. | ||||||
|  |  | ||||||
|  | @ -50,6 +50,8 @@ const ( | ||||||
| 	withInlineCode2Expected         = "<p><code>Nobody tells you about the </code><del>SECRET CODE</del><code>, do they?</code></p>" | 	withInlineCode2Expected         = "<p><code>Nobody tells you about the </code><del>SECRET CODE</del><code>, do they?</code></p>" | ||||||
| 	withHashtag                     = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!" | 	withHashtag                     = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!" | ||||||
| 	withHashtagExpected             = "<h1>Title</h1><p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>" | 	withHashtagExpected             = "<h1>Title</h1><p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>" | ||||||
|  | 	withTamilHashtag                = "here's a simple status that uses a hashtag in Tamil #தமிழ்" | ||||||
|  | 	withTamilHashtagExpected        = "<p>here's a simple status that uses a hashtag in Tamil <a href=\"http://localhost:8080/tags/%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>தமிழ்</span></a></p>" | ||||||
| 	mdWithHTML                      = "# Title\n\nHere's a simple text in markdown.\n\nHere's a <a href=\"https://example.org\">link</a>.\n\nHere's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\">" | 	mdWithHTML                      = "# Title\n\nHere's a simple text in markdown.\n\nHere's a <a href=\"https://example.org\">link</a>.\n\nHere's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\">" | ||||||
| 	mdWithHTMLExpected              = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p><p>Here's an image:</p>" | 	mdWithHTMLExpected              = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p><p>Here's an image:</p>" | ||||||
| 	mdWithCheekyHTML                = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: <script>alert(ahhhh)</script>" | 	mdWithCheekyHTML                = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: <script>alert(ahhhh)</script>" | ||||||
|  | @ -121,6 +123,12 @@ func (suite *MarkdownTestSuite) TestParseWithHashtag() { | ||||||
| 	suite.Equal(withHashtagExpected, formatted.HTML) | 	suite.Equal(withHashtagExpected, formatted.HTML) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | // Regression test for https://github.com/superseriousbusiness/gotosocial/issues/3618 | ||||||
|  | func (suite *MarkdownTestSuite) TestParseWithTamilHashtag() { | ||||||
|  | 	formatted := suite.FromMarkdown(withTamilHashtag) | ||||||
|  | 	suite.Equal(withTamilHashtagExpected, formatted.HTML) | ||||||
|  | } | ||||||
|  | 
 | ||||||
| func (suite *MarkdownTestSuite) TestParseWithHTML() { | func (suite *MarkdownTestSuite) TestParseWithHTML() { | ||||||
| 	formatted := suite.FromMarkdown(mdWithHTML) | 	formatted := suite.FromMarkdown(mdWithHTML) | ||||||
| 	suite.Equal(mdWithHTMLExpected, formatted.HTML) | 	suite.Equal(mdWithHTMLExpected, formatted.HTML) | ||||||
|  |  | ||||||
|  | @ -50,17 +50,16 @@ func NormalizeHashtag(text string) (string, bool) { | ||||||
| 
 | 
 | ||||||
| 	// Validate normalized result. | 	// Validate normalized result. | ||||||
| 	var ( | 	var ( | ||||||
| 		notJustUnderscores = false | 		atLeastOneRequiredChar = false | ||||||
| 		onlyPermittedChars     = true | 		onlyPermittedChars     = true | ||||||
| 		lengthOK               = true | 		lengthOK               = true | ||||||
| 	) | 	) | ||||||
| 
 | 
 | ||||||
| 	for i, r := range normalized { | 	for i, r := range normalized { | ||||||
| 		if r != '_' { | 		if !isPermittedIfNotEntireHashtag(r) { | ||||||
| 			// This isn't an underscore, | 			// This isn't an underscore, mark, etc, | ||||||
| 			// so the whole hashtag isn't | 			// so the hashtag contains at least one | ||||||
| 			// just underscores. | 			atLeastOneRequiredChar = true | ||||||
| 			notJustUnderscores = true |  | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		if i >= maximumHashtagLength { | 		if i >= maximumHashtagLength { | ||||||
|  | @ -74,5 +73,5 @@ func NormalizeHashtag(text string) (string, bool) { | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return normalized, (lengthOK && onlyPermittedChars && notJustUnderscores) | 	return normalized, lengthOK && onlyPermittedChars && atLeastOneRequiredChar | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -118,20 +118,20 @@ func (suite *PlainTestSuite) TestDeriveHashtagsOK() { | ||||||
| ` | ` | ||||||
| 
 | 
 | ||||||
| 	tags := suite.FromPlain(statusText).Tags | 	tags := suite.FromPlain(statusText).Tags | ||||||
| 	suite.Len(tags, 13) | 	if suite.Len(tags, 12) { | ||||||
| 		suite.Equal("testing123", tags[0].Name) | 		suite.Equal("testing123", tags[0].Name) | ||||||
| 		suite.Equal("also", tags[1].Name) | 		suite.Equal("also", tags[1].Name) | ||||||
| 		suite.Equal("thisshouldwork", tags[2].Name) | 		suite.Equal("thisshouldwork", tags[2].Name) | ||||||
| 		suite.Equal("dupe", tags[3].Name) | 		suite.Equal("dupe", tags[3].Name) | ||||||
| 		suite.Equal("ThisShouldAlsoWork", tags[4].Name) | 		suite.Equal("ThisShouldAlsoWork", tags[4].Name) | ||||||
| 		suite.Equal("this_should_not_be_split", tags[5].Name) | 		suite.Equal("this_should_not_be_split", tags[5].Name) | ||||||
| 	suite.Equal("111111", tags[6].Name) | 		suite.Equal("alimentación", tags[6].Name) | ||||||
| 	suite.Equal("alimentación", tags[7].Name) | 		suite.Equal("saúde", tags[7].Name) | ||||||
| 	suite.Equal("saúde", tags[8].Name) | 		suite.Equal("lävistää", tags[8].Name) | ||||||
| 	suite.Equal("lävistää", tags[9].Name) | 		suite.Equal("ö", tags[9].Name) | ||||||
| 	suite.Equal("ö", tags[10].Name) | 		suite.Equal("네", tags[10].Name) | ||||||
| 	suite.Equal("네", tags[11].Name) | 		suite.Equal("ThisOneIsThirteyCharactersLong", tags[11].Name) | ||||||
| 	suite.Equal("ThisOneIsThirteyCharactersLong", tags[12].Name) | 	} | ||||||
| 
 | 
 | ||||||
| 	statusText = `#올빼미 hej` | 	statusText = `#올빼미 hej` | ||||||
| 	tags = suite.FromPlain(statusText).Tags | 	tags = suite.FromPlain(statusText).Tags | ||||||
|  | @ -170,8 +170,17 @@ func (suite *PlainTestSuite) TestDeriveMultiple() { | ||||||
| func (suite *PlainTestSuite) TestZalgoHashtag() { | func (suite *PlainTestSuite) TestZalgoHashtag() { | ||||||
| 	statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?` | 	statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?` | ||||||
| 	f := suite.FromPlain(statusText) | 	f := suite.FromPlain(statusText) | ||||||
| 	suite.Len(f.Tags, 1) | 	if suite.Len(f.Tags, 2) { | ||||||
| 		suite.Equal("praying", f.Tags[0].Name) | 		suite.Equal("praying", f.Tags[0].Name) | ||||||
|  | 		// NFC doesn't do much for Zalgo text, but it's difficult to strip marks without affecting non-Latin text. | ||||||
|  | 		suite.Equal("z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪", f.Tags[1].Name) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | func (suite *PlainTestSuite) TestNumbersAreNotHashtags() { | ||||||
|  | 	statusText := `yo who else thinks #19_98 is #1?` | ||||||
|  | 	f := suite.FromPlain(statusText) | ||||||
|  | 	suite.Len(f.Tags, 0) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func TestPlainTestSuite(t *testing.T) { | func TestPlainTestSuite(t *testing.T) { | ||||||
|  |  | ||||||
|  | @ -19,19 +19,14 @@ package text | ||||||
| 
 | 
 | ||||||
| import "unicode" | import "unicode" | ||||||
| 
 | 
 | ||||||
| func isPlausiblyInHashtag(r rune) bool { | func isPermittedInHashtag(r rune) bool { | ||||||
| 	// Marks are allowed during parsing | 	return unicode.IsLetter(r) || isPermittedIfNotEntireHashtag(r) | ||||||
| 	// prior to normalization, but not after, |  | ||||||
| 	// since they may be combined into letters |  | ||||||
| 	// during normalization. |  | ||||||
| 	return unicode.IsMark(r) || |  | ||||||
| 		isPermittedInHashtag(r) |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func isPermittedInHashtag(r rune) bool { | // isPermittedIfNotEntireHashtag is true for characters that may be in a hashtag | ||||||
| 	return unicode.IsLetter(r) || | // but are not allowed to be the only characters making up the hashtag. | ||||||
| 		unicode.IsNumber(r) || | func isPermittedIfNotEntireHashtag(r rune) bool { | ||||||
| 		r == '_' | 	return unicode.IsNumber(r) || unicode.IsMark(r) || r == '_' | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // isHashtagBoundary returns true if rune r | // isHashtagBoundary returns true if rune r | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue