| 
									
										
										
										
											2023-03-12 16:00:57 +01:00
										 |  |  | // GoToSocial | 
					
						
							|  |  |  | // Copyright (C) GoToSocial Authors admin@gotosocial.org | 
					
						
							|  |  |  | // SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This program is free software: you can redistribute it and/or modify | 
					
						
							|  |  |  | // it under the terms of the GNU Affero General Public License as published by | 
					
						
							|  |  |  | // the Free Software Foundation, either version 3 of the License, or | 
					
						
							|  |  |  | // (at your option) any later version. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | // but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
					
						
							|  |  |  | // GNU Affero General Public License for more details. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // You should have received a copy of the GNU Affero General Public License | 
					
						
							|  |  |  | // along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
					
						
							| 
									
										
										
										
											2022-09-29 12:03:17 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-05 12:47:13 +01:00
										 |  |  | package util | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-05 12:47:13 +01:00
										 |  |  | // Robots directive values and robots.txt bodies defined in this package. See: | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | //   - https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta | 
					
						
							|  |  |  | //   - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag | 
					
						
							|  |  |  | //   - https://www.rfc-editor.org/rfc/rfc9309.html | 
					
						
							| 
									
										
										
										
											2022-09-29 12:03:17 +02:00
										 |  |  | const ( | 
					
						
							| 
									
										
										
										
											2025-02-05 12:47:13 +01:00
										 |  |  | 	RobotsDirectivesDisallow  = "noindex, nofollow" | 
					
						
							|  |  |  | 	RobotsDirectivesAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" | 
					
						
							|  |  |  | 	RobotsTxt                 = `# GoToSocial robots.txt -- to edit, see internal/api/util/robots.go | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | # More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | # AI scrapers and the like. | 
					
						
							|  |  |  | # https://github.com/ai-robots-txt/ai.robots.txt/ | 
					
						
							| 
									
										
										
										
											2024-08-29 17:42:48 +02:00
										 |  |  | User-agent: AI2Bot | 
					
						
							|  |  |  | User-agent: Ai2Bot-Dolma | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: Amazonbot | 
					
						
							|  |  |  | User-agent: anthropic-ai | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: Applebot | 
					
						
							| 
									
										
										
										
											2024-06-23 15:34:21 +02:00
										 |  |  | User-agent: Applebot-Extended | 
					
						
							| 
									
										
										
										
											2025-04-19 13:34:20 +02:00
										 |  |  | User-agent: Brightbot 1.0 | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: Bytespider | 
					
						
							| 
									
										
										
										
											2023-09-30 21:44:57 +02:00
										 |  |  | User-agent: CCBot | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: ChatGPT-User | 
					
						
							|  |  |  | User-agent: Claude-Web | 
					
						
							| 
									
										
										
										
											2025-04-19 13:34:20 +02:00
										 |  |  | User-agent: ClaudeBot | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: cohere-ai | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: cohere-training-data-crawler | 
					
						
							| 
									
										
										
										
											2025-04-19 13:34:20 +02:00
										 |  |  | User-agent: Crawlspace | 
					
						
							| 
									
										
										
										
											2024-06-23 15:34:21 +02:00
										 |  |  | User-agent: Diffbot | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: DuckAssistBot | 
					
						
							| 
									
										
										
										
											2023-09-30 21:44:57 +02:00
										 |  |  | User-agent: FacebookBot | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: FriendlyCrawler | 
					
						
							|  |  |  | User-agent: Google-Extended | 
					
						
							|  |  |  | User-agent: GoogleOther | 
					
						
							| 
									
										
										
										
											2024-08-02 18:22:39 +02:00
										 |  |  | User-agent: GoogleOther-Image | 
					
						
							|  |  |  | User-agent: GoogleOther-Video | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: GPTBot | 
					
						
							| 
									
										
										
										
											2024-09-07 17:21:36 +02:00
										 |  |  | User-agent: iaskspider/2.0 | 
					
						
							| 
									
										
										
										
											2024-08-29 17:42:48 +02:00
										 |  |  | User-agent: ICC-Crawler | 
					
						
							| 
									
										
										
										
											2024-07-10 15:10:34 +02:00
										 |  |  | User-agent: ImagesiftBot | 
					
						
							| 
									
										
										
										
											2024-06-23 15:34:21 +02:00
										 |  |  | User-agent: img2dataset | 
					
						
							| 
									
										
										
										
											2025-04-19 13:34:20 +02:00
										 |  |  | User-agent: imgproxy | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: ISSCyberRiskCrawler | 
					
						
							|  |  |  | User-agent: Kangaroo Bot | 
					
						
							| 
									
										
										
										
											2024-08-02 18:22:39 +02:00
										 |  |  | User-agent: Meta-ExternalAgent | 
					
						
							| 
									
										
										
										
											2024-08-29 17:42:48 +02:00
										 |  |  | User-agent: Meta-ExternalFetcher | 
					
						
							| 
									
										
										
										
											2025-04-19 13:34:20 +02:00
										 |  |  | User-agent: NovaAct | 
					
						
							| 
									
										
										
										
											2024-08-02 18:22:39 +02:00
										 |  |  | User-agent: OAI-SearchBot | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: omgili | 
					
						
							|  |  |  | User-agent: omgilibot | 
					
						
							| 
									
										
										
										
											2025-04-19 13:34:20 +02:00
										 |  |  | User-agent: Operator | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: PanguBot | 
					
						
							| 
									
										
										
										
											2025-04-19 13:34:20 +02:00
										 |  |  | User-agent: Perplexity-User | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: PerplexityBot | 
					
						
							| 
									
										
										
										
											2024-08-02 18:22:39 +02:00
										 |  |  | User-agent: PetalBot | 
					
						
							|  |  |  | User-agent: Scrapy | 
					
						
							| 
									
										
										
										
											2025-04-19 13:34:20 +02:00
										 |  |  | User-agent: SemrushBot-OCOB | 
					
						
							|  |  |  | User-agent: SemrushBot-SWA | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: Sidetrade indexer bot | 
					
						
							| 
									
										
										
										
											2024-08-02 18:22:39 +02:00
										 |  |  | User-agent: Timpibot | 
					
						
							|  |  |  | User-agent: VelenPublicWebCrawler | 
					
						
							| 
									
										
										
										
											2024-08-29 17:42:48 +02:00
										 |  |  | User-agent: Webzio-Extended | 
					
						
							| 
									
										
										
										
											2024-06-23 15:34:21 +02:00
										 |  |  | User-agent: YouBot | 
					
						
							|  |  |  | Disallow: / | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Marketing/SEO "intelligence" data scrapers | 
					
						
							|  |  |  | User-agent: AwarioRssBot | 
					
						
							|  |  |  | User-agent: AwarioSmartBot | 
					
						
							|  |  |  | User-agent: DataForSeoBot | 
					
						
							|  |  |  | User-agent: magpie-crawler | 
					
						
							|  |  |  | User-agent: Meltwater | 
					
						
							| 
									
										
										
										
											2024-07-10 15:10:34 +02:00
										 |  |  | User-agent: peer39_crawler | 
					
						
							|  |  |  | User-agent: peer39_crawler/1.0 | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: PiplBot | 
					
						
							| 
									
										
										
										
											2024-06-23 15:34:21 +02:00
										 |  |  | User-agent: scoop.it | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: Seekr | 
					
						
							| 
									
										
										
										
											2023-09-30 21:44:57 +02:00
										 |  |  | Disallow: / | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Well-known.dev crawler. Indexes stuff under /.well-known. | 
					
						
							|  |  |  | # https://well-known.dev/about/ | 
					
						
							| 
									
										
										
										
											2025-02-04 16:52:42 +01:00
										 |  |  | User-agent: WellKnownBot | 
					
						
							|  |  |  | Disallow: / | 
					
						
							| 
									
										
										
										
											2024-02-27 14:25:08 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | # Rules for everything else. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | User-agent: * | 
					
						
							|  |  |  | Crawl-delay: 500 | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # API endpoints. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /api/ | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-27 11:23:52 +01:00
										 |  |  | # Auth/Sign in endpoints. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /auth/ | 
					
						
							|  |  |  | Disallow: /oauth/ | 
					
						
							|  |  |  | Disallow: /check_your_email | 
					
						
							|  |  |  | Disallow: /wait_for_approval | 
					
						
							|  |  |  | Disallow: /account_disabled | 
					
						
							| 
									
										
										
										
											2024-04-11 11:45:53 +02:00
										 |  |  | Disallow: /signup | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Fileserver/media. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /fileserver/ | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Fedi S2S API endpoints. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /users/ | 
					
						
							|  |  |  | Disallow: /emoji/ | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Settings panels. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /admin | 
					
						
							|  |  |  | Disallow: /user | 
					
						
							| 
									
										
										
										
											2023-01-25 18:06:41 +01:00
										 |  |  | Disallow: /settings/ | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Domain blocklist. | 
					
						
							| 
									
										
										
										
											2025-02-04 16:52:42 +01:00
										 |  |  | Disallow: /about/suspended | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Webfinger endpoint. | 
					
						
							|  |  |  | Disallow: /.well-known/webfinger | 
					
						
							|  |  |  | ` | 
					
						
							| 
									
										
										
										
											2025-02-05 12:47:13 +01:00
										 |  |  | 	RobotsTxtDisallowNodeInfo = RobotsTxt + ` | 
					
						
							| 
									
										
										
										
											2025-02-04 16:52:42 +01:00
										 |  |  | # Disallow nodeinfo | 
					
						
							|  |  |  | Disallow: /.well-known/nodeinfo | 
					
						
							|  |  |  | Disallow: /nodeinfo/ | 
					
						
							|  |  |  | ` | 
					
						
							| 
									
										
										
										
											2025-02-24 11:17:18 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	// MD5 hash of the basic robots.txt (RobotsTxt). Hard-coded: recompute whenever RobotsTxt is edited. | 
					
						
							| 
									
										
										
										
											2025-04-19 13:34:20 +02:00
										 |  |  | 	RobotsTxtETag = `7b6b498f7381ac33cb3efb34c68f662d` | 
					
						
							| 
									
										
										
										
											2025-02-24 11:17:18 +01:00
										 |  |  | 	// MD5 hash of robots.txt with NodeInfo disallowed (RobotsTxtDisallowNodeInfo). Hard-coded: recompute whenever either robots.txt body is edited. | 
					
						
							| 
									
										
										
										
											2025-04-19 13:34:20 +02:00
										 |  |  | 	RobotsTxtDisallowNodeInfoETag = `6d21be573d502581a3bf7271b7e63fc8` | 
					
						
							| 
									
										
										
										
											2022-09-29 12:03:17 +02:00
										 |  |  | ) |