| 
									
										
										
										
											2023-03-12 16:00:57 +01:00
										 |  |  | // GoToSocial | 
					
						
							|  |  |  | // Copyright (C) GoToSocial Authors admin@gotosocial.org | 
					
						
							|  |  |  | // SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This program is free software: you can redistribute it and/or modify | 
					
						
							|  |  |  | // it under the terms of the GNU Affero General Public License as published by | 
					
						
							|  |  |  | // the Free Software Foundation, either version 3 of the License, or | 
					
						
							|  |  |  | // (at your option) any later version. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | // but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
					
						
							|  |  |  | // GNU Affero General Public License for more details. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // You should have received a copy of the GNU Affero General Public License | 
					
						
							|  |  |  | // along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
					
						
							| 
									
										
										
										
											2022-09-29 12:03:17 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | package web | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | import ( | 
					
						
							|  |  |  | 	"net/http" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	"github.com/gin-gonic/gin" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-29 12:03:17 +02:00
										 |  |  | const ( | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | 	robotsPath          = "/robots.txt" | 
					
						
							|  |  |  | 	robotsMetaAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" // https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta | 
					
						
							|  |  |  | 	robotsTxt           = `# GoToSocial robots.txt -- to edit, see internal/web/robots.go | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | # More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | # AI scrapers and the like. | 
					
						
							|  |  |  | # https://github.com/ai-robots-txt/ai.robots.txt/ | 
					
						
							| 
									
										
										
										
											2024-08-29 17:42:48 +02:00
										 |  |  | User-agent: AI2Bot | 
					
						
							|  |  |  | User-agent: Ai2Bot-Dolma | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: Amazonbot | 
					
						
							|  |  |  | User-agent: anthropic-ai | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: Applebot | 
					
						
							| 
									
										
										
										
											2024-06-23 15:34:21 +02:00
										 |  |  | User-agent: Applebot-Extended | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: Bytespider | 
					
						
							| 
									
										
										
										
											2023-09-30 21:44:57 +02:00
										 |  |  | User-agent: CCBot | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: ChatGPT-User | 
					
						
							|  |  |  | User-agent: ClaudeBot | 
					
						
							|  |  |  | User-agent: Claude-Web | 
					
						
							|  |  |  | User-agent: cohere-ai | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: cohere-training-data-crawler | 
					
						
							| 
									
										
										
										
											2024-06-23 15:34:21 +02:00
										 |  |  | User-agent: Diffbot | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: DuckAssistBot | 
					
						
							| 
									
										
										
										
											2023-09-30 21:44:57 +02:00
										 |  |  | User-agent: FacebookBot | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: FriendlyCrawler | 
					
						
							|  |  |  | User-agent: Google-Extended | 
					
						
							|  |  |  | User-agent: GoogleOther | 
					
						
							| 
									
										
										
										
											2024-08-02 18:22:39 +02:00
										 |  |  | User-agent: GoogleOther-Image | 
					
						
							|  |  |  | User-agent: GoogleOther-Video | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: GPTBot | 
					
						
							| 
									
										
										
										
											2024-09-07 17:21:36 +02:00
										 |  |  | User-agent: iaskspider/2.0 | 
					
						
							| 
									
										
										
										
											2024-08-29 17:42:48 +02:00
										 |  |  | User-agent: ICC-Crawler | 
					
						
							| 
									
										
										
										
											2024-07-10 15:10:34 +02:00
										 |  |  | User-agent: ImagesiftBot | 
					
						
							| 
									
										
										
										
											2024-06-23 15:34:21 +02:00
										 |  |  | User-agent: img2dataset | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: ISSCyberRiskCrawler | 
					
						
							|  |  |  | User-agent: Kangaroo Bot | 
					
						
							| 
									
										
										
										
											2024-08-02 18:22:39 +02:00
										 |  |  | User-agent: Meta-ExternalAgent | 
					
						
							| 
									
										
										
										
											2024-08-29 17:42:48 +02:00
										 |  |  | User-agent: Meta-ExternalFetcher | 
					
						
							| 
									
										
										
										
											2024-08-02 18:22:39 +02:00
										 |  |  | User-agent: OAI-SearchBot | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: omgili | 
					
						
							|  |  |  | User-agent: omgilibot | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: PanguBot | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: PerplexityBot | 
					
						
							| 
									
										
										
										
											2024-08-02 18:22:39 +02:00
										 |  |  | User-agent: PetalBot | 
					
						
							|  |  |  | User-agent: Scrapy | 
					
						
							| 
									
										
										
										
											2025-01-03 12:46:59 +01:00
										 |  |  | User-agent: Sidetrade indexer bot | 
					
						
							| 
									
										
										
										
											2024-08-02 18:22:39 +02:00
										 |  |  | User-agent: Timpibot | 
					
						
							|  |  |  | User-agent: VelenPublicWebCrawler | 
					
						
							| 
									
										
										
										
											2024-08-29 17:42:48 +02:00
										 |  |  | User-agent: Webzio-Extended | 
					
						
							| 
									
										
										
										
											2024-06-23 15:34:21 +02:00
										 |  |  | User-agent: YouBot | 
					
						
							|  |  |  | Disallow: / | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Marketing/SEO "intelligence" data scrapers | 
					
						
							|  |  |  | User-agent: AwarioRssBot | 
					
						
							|  |  |  | User-agent: AwarioSmartBot | 
					
						
							|  |  |  | User-agent: DataForSeoBot | 
					
						
							|  |  |  | User-agent: magpie-crawler | 
					
						
							|  |  |  | User-agent: Meltwater | 
					
						
							| 
									
										
										
										
											2024-07-10 15:10:34 +02:00
										 |  |  | User-agent: peer39_crawler | 
					
						
							|  |  |  | User-agent: peer39_crawler/1.0 | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: PiplBot | 
					
						
							| 
									
										
										
										
											2024-06-23 15:34:21 +02:00
										 |  |  | User-agent: scoop.it | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: Seekr | 
					
						
							| 
									
										
										
										
											2023-09-30 21:44:57 +02:00
										 |  |  | Disallow: / | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Well-known.dev crawler. Indexes stuff under /.well-known. | 
					
						
							|  |  |  | # https://well-known.dev/about/ | 
					
						
							| 
									
										
										
										
											2024-04-22 11:01:37 +02:00
										 |  |  | User-agent: WellKnownBot      | 
					
						
							|  |  |  | Disallow: /    | 
					
						
							| 
									
										
										
										
											2024-02-27 14:25:08 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | # Rules for everything else. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | User-agent: * | 
					
						
							|  |  |  | Crawl-delay: 500 | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # API endpoints. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /api/ | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-27 11:23:52 +01:00
										 |  |  | # Auth/Sign in endpoints. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /auth/ | 
					
						
							|  |  |  | Disallow: /oauth/ | 
					
						
							|  |  |  | Disallow: /check_your_email | 
					
						
							|  |  |  | Disallow: /wait_for_approval | 
					
						
							|  |  |  | Disallow: /account_disabled | 
					
						
							| 
									
										
										
										
											2024-04-11 11:45:53 +02:00
										 |  |  | Disallow: /signup | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Well-known endpoints. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /.well-known/ | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Fileserver/media. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /fileserver/ | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Fedi S2S API endpoints. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /users/ | 
					
						
							|  |  |  | Disallow: /emoji/ | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Settings panels. | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | Disallow: /admin | 
					
						
							|  |  |  | Disallow: /user | 
					
						
							| 
									
										
										
										
											2023-01-25 18:06:41 +01:00
										 |  |  | Disallow: /settings/ | 
					
						
							| 
									
										
										
										
											2023-08-08 13:16:34 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Domain blocklist. | 
					
						
							| 
									
										
										
										
											2023-01-25 18:06:41 +01:00
										 |  |  | Disallow: /about/suspended` | 
					
						
							| 
									
										
										
										
											2022-09-29 12:03:17 +02:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-01-02 13:10:50 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | // robotsGETHandler returns a decent robots.txt that prevents crawling | 
					
						
							|  |  |  | // the api, auth pages, settings pages, etc. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // More granular robots meta tags are then applied for web pages | 
					
						
							|  |  |  | // depending on user preferences (see internal/web). | 
					
						
							|  |  |  | func (m *Module) robotsGETHandler(c *gin.Context) { | 
					
						
							|  |  |  | 	c.String(http.StatusOK, robotsTxt) | 
					
						
							|  |  |  | } |