mirror of
				https://github.com/superseriousbusiness/gotosocial.git
				synced 2025-10-31 12:12:25 -05:00 
			
		
		
		
	[feature] Block a bunch of "AI" crawlers (#2239)
* [feature] Block Google Bard/AI crawlers * [feature] Block the other OpenAI crawler * [feature] Block Common Crawl crawler This is used in research, but also gleefully advertises itself as the training source used in all LLMs and GPT-3. Fixes: #2240 * [feature] Block Omgilibot Used by some shady big web data engine company. * [feature] Block Meta's language model crawler * [feature] Block well-known.dev crawler
This commit is contained in:
		
					parent
					
						
							
								2b6b9cdf83
							
						
					
				
			
			
				commit
				
					
						0cce2c0838
					
				
			
		
					 1 changed files with 30 additions and 0 deletions
				
			
		|  | @ -34,6 +34,36 @@ const ( | ||||||
| User-agent: GPTBot | User-agent: GPTBot | ||||||
| Disallow: / | Disallow: / | ||||||
| 
 | 
 | ||||||
|  | # As of September 2023, GPTBot and ChatGPT-User are equivalent. But there's no telling | ||||||
|  | # when OpenAI might decide to change that, so block this one too. | ||||||
|  | User-agent: ChatGPT-User | ||||||
|  | Disallow: / | ||||||
|  | 
 | ||||||
|  | # And a giant fuck you to Google Bard and their other generative AI ventures too. | ||||||
|  | # https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers | ||||||
|  | User-agent: Google-Extended | ||||||
|  | Disallow: / | ||||||
|  | 
 | ||||||
|  | # Block CommonCrawl. Used in training LLMs and specifically GPT-3. | ||||||
|  | # https://commoncrawl.org/faq | ||||||
|  | User-agent: CCBot | ||||||
|  | Disallow: / | ||||||
|  | 
 | ||||||
|  | # Block Omgili/Webz.io, a "Big Web Data" engine. | ||||||
|  | # https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/ | ||||||
|  | User-agent: Omgilibot | ||||||
|  | Disallow: / | ||||||
|  | 
 | ||||||
|  | # Block FacebookBot, because Meta. | ||||||
|  | # https://developers.facebook.com/docs/sharing/bot | ||||||
|  | User-agent: FacebookBot | ||||||
|  | Disallow: / | ||||||
|  | 
 | ||||||
|  | # Well-known.dev crawler. Indexes stuff under /.well-known. | ||||||
|  | # https://well-known.dev/about/ | ||||||
|  | User-agent: WellKnownBot | ||||||
|  | Disallow: / | ||||||
|  | 
 | ||||||
| # Rules for everything else. | # Rules for everything else. | ||||||
| User-agent: * | User-agent: * | ||||||
| Crawl-delay: 500 | Crawl-delay: 500 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue