[feature] Use X-Robots-Tag headers to instruct scrapers/crawlers (#3737)

* [feature] Use `X-Robots-Tag` headers to instruct scrapers/crawlers

* use switch for RobotsHeaders
commit baed591a1d
tobi 2025-02-05 12:47:13 +01:00, committed by GitHub
15 changed files with 311 additions and 142 deletions
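
For background (not part of the diff): `X-Robots-Tag` carries the same directives as a robots `<meta>` tag, but as a response header, so it also covers non-HTML resources. HTTP treats repeated list-valued field lines as equivalent to one comma-joined line, so the following two responses mean the same thing; the repeated form is what the middleware below emits via `Set` + `Add`:

```
X-Robots-Tag: noindex, nofollow
X-Robots-Tag: noai, noimageai
```

```
X-Robots-Tag: noindex, nofollow, noai, noimageai
```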

View file

@@ -44,12 +44,5 @@ func ExtraHeaders() gin.HandlerFunc {
 		//
 		// See: https://github.com/patcg-individual-drafts/topics
 		c.Header("Permissions-Policy", "browsing-topics=()")
-
-		// Some AI scrapers respect the following tags to opt-out
-		// of their crawling and datasets.
-		c.Header("X-Robots-Tag", "noimageai")
-		// c.Header calls .Set(), but we want to emit the header
-		// twice, not override it.
-		c.Writer.Header().Add("X-Robots-Tag", "noai")
 	}
 }
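
The comment in the deleted block notes a gin quirk worth spelling out: `gin.Context.Header` delegates to `Header().Set`, which replaces any existing value for the key, while `Header().Add` appends another field line. A standalone sketch (not part of the commit) demonstrating the difference with Go's standard library:

```go
package main

import (
	"fmt"
	"net/http/httptest"
)

func main() {
	rec := httptest.NewRecorder()

	// Set replaces any existing values for the key,
	// so only the second value survives here.
	rec.Header().Set("X-Robots-Tag", "noimageai")
	rec.Header().Set("X-Robots-Tag", "noai")
	fmt.Println(rec.Header().Values("X-Robots-Tag")) // [noai]

	// Add appends instead, so the response emits the
	// header twice, one line per value.
	rec.Header().Set("X-Robots-Tag", "noimageai")
	rec.Header().Add("X-Robots-Tag", "noai")
	fmt.Println(rec.Header().Values("X-Robots-Tag")) // [noimageai noai]
}
```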

View file

@@ -0,0 +1,67 @@
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package middleware

import (
	"github.com/gin-gonic/gin"

	apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
)

// RobotsHeaders adds robots directives to the X-Robots-Tag HTTP header.
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag
//
// If mode == "aiOnly" then only the noai and noimageai values will be set,
// and other headers will be left alone (for route groups / handlers to set).
//
// If mode == "allowSome" then noai, noimageai, and directives permitting
// some limited indexing will be set.
//
// If mode == "" then noai, noimageai, noindex, and nofollow will be set
// (i.e., as restrictive as possible).
func RobotsHeaders(mode string) gin.HandlerFunc {
	const (
		key = "X-Robots-Tag"

		// Some AI scrapers respect the following tags
		// to opt-out of their crawling and datasets.
		// We add them regardless of allowSome.
		noai = "noai, noimageai"
	)

	switch mode {

	// Just set ai headers and
	// leave the other headers be.
	case "aiOnly":
		return func(c *gin.Context) {
			c.Writer.Header().Set(key, noai)
		}

	// Allow some limited indexing.
	case "allowSome":
		return func(c *gin.Context) {
			c.Writer.Header().Set(key, apiutil.RobotsDirectivesAllowSome)
			c.Writer.Header().Add(key, noai)
		}

	// Disallow indexing via noindex, nofollow.
	default:
		return func(c *gin.Context) {
			c.Writer.Header().Set(key, apiutil.RobotsDirectivesDisallow)
			c.Writer.Header().Add(key, noai)
		}
	}
}
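
For context, a sketch of how route groups might opt in to the different modes. The paths and wiring here are hypothetical (the actual commit wires this up elsewhere in the repo), and the caller must live inside the same module, since `internal/` packages aren't importable from outside:

```go
package router // hypothetical file inside the gotosocial module

import (
	"github.com/gin-gonic/gin"
	"github.com/superseriousbusiness/gotosocial/internal/middleware"
)

// attachRobots is a hypothetical helper showing the three modes.
func attachRobots(r *gin.Engine) {
	// Most restrictive by default:
	// noindex, nofollow + noai, noimageai on every route.
	r.Use(middleware.RobotsHeaders(""))

	// A group that permits limited indexing. Its Set() call
	// replaces the stricter directives from the global
	// middleware, and Add() re-appends the AI opt-outs.
	web := r.Group("/indexable", middleware.RobotsHeaders("allowSome"))
	web.GET("", func(c *gin.Context) { c.Status(200) })

	// A group that only sets the AI opt-outs, leaving
	// indexing directives to individual handlers.
	api := r.Group("/api", middleware.RobotsHeaders("aiOnly"))
	api.GET("/ping", func(c *gin.Context) { c.String(200, "pong") })
}
```

Note the ordering subtlety this relies on: because every mode begins with a `Set` on the same key, the last `RobotsHeaders` middleware to run wins, so stacking a group-level mode over the global default behaves as an override rather than an accumulation.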