[bug] respect X-Robots-Tag and robots.txt on api/v1/instance and nodeinfo (#3756)

* feat: check X-Robots-Tag

when accessing /api/v1/instance or /nodeinfo endpoints respect
X-Robots-Tag

* chore: go fmt ./...

* Check robots.txt as well, add tests

---------

Co-authored-by: tobi <tobi.smethurst@protonmail.com>
This commit is contained in:
alemi.dev 2025-02-11 13:16:14 +01:00 committed by GitHub
commit d0de3ad492
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 1404 additions and 24 deletions

View file

@ -25,6 +25,7 @@ import (
"io"
"net/http"
"net/url"
"slices"
"strings"
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
@ -35,18 +36,29 @@ import (
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/util"
"github.com/superseriousbusiness/gotosocial/internal/validate"
"github.com/temoto/robotstxt"
)
func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gtsmodel.Instance, error) {
// Try to fetch robots.txt to check
// if we're allowed to try endpoints:
//
// - /api/v1/instance
// - /.well-known/nodeinfo
// - /nodeinfo/2.0|2.1 endpoints
robotsTxt, err := t.DereferenceRobots(ctx, iri.Scheme, iri.Host)
if err != nil {
log.Debugf(ctx, "couldn't fetch robots.txt from %s: %v", iri.Host, err)
}
var i *gtsmodel.Instance
var err error
// First try to dereference using /api/v1/instance.
// This will provide the most complete picture of an instance, and avoid unnecessary api calls.
//
// This will only work with Mastodon-api compatible instances: Mastodon, some Pleroma instances, GoToSocial.
log.Debugf(ctx, "trying to dereference instance %s by /api/v1/instance", iri.Host)
i, err = dereferenceByAPIV1Instance(ctx, t, iri)
i, err = t.dereferenceByAPIV1Instance(ctx, iri, robotsTxt)
if err == nil {
log.Debugf(ctx, "successfully dereferenced instance using /api/v1/instance")
return i, nil
@ -56,7 +68,7 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts
// If that doesn't work, try to dereference using /.well-known/nodeinfo.
// This will involve two API calls and return less info overall, but should be more widely compatible.
log.Debugf(ctx, "trying to dereference instance %s by /.well-known/nodeinfo", iri.Host)
i, err = dereferenceByNodeInfo(ctx, t, iri)
i, err = t.dereferenceByNodeInfo(ctx, iri, robotsTxt)
if err == nil {
log.Debugf(ctx, "successfully dereferenced instance using /.well-known/nodeinfo")
return i, nil
@ -77,11 +89,23 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts
}, nil
}
func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) {
func (t *transport) dereferenceByAPIV1Instance(
ctx context.Context,
iri *url.URL,
robotsTxt *robotstxt.RobotsData,
) (*gtsmodel.Instance, error) {
const path = "api/v1/instance"
// Bail if we're not allowed to fetch this endpoint.
if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) {
err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path)
return nil, gtserror.SetNotPermitted(err)
}
cleanIRI := &url.URL{
Scheme: iri.Scheme,
Host: iri.Host,
Path: "api/v1/instance",
Path: path,
}
// Build IRI just once
@ -105,6 +129,18 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
return nil, gtserror.NewFromResponse(resp)
}
// Ensure that we can use data returned from this endpoint.
robots := resp.Header.Values("X-Robots-Tag")
if slices.ContainsFunc(
robots,
func(key string) bool {
return strings.Contains(key, "noindex")
},
) {
err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path)
return nil, gtserror.SetNotPermitted(err)
}
// Ensure that the incoming request content-type is expected.
if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) {
err := gtserror.Newf("non json response type: %s", ct)
@ -118,7 +154,8 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
return nil, errors.New("response bytes was len 0")
}
// try to parse the returned bytes directly into an Instance model
// Try to parse the returned bytes
// directly into an Instance model.
apiResp := &apimodel.InstanceV1{}
if err := json.Unmarshal(b, apiResp); err != nil {
return nil, err
@ -149,24 +186,32 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
return i, nil
}
func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) {
niIRI, err := callNodeInfoWellKnown(c, t, iri)
func (t *transport) dereferenceByNodeInfo(
ctx context.Context,
iri *url.URL,
robotsTxt *robotstxt.RobotsData,
) (*gtsmodel.Instance, error) {
// Retrieve the nodeinfo IRI from .well-known/nodeinfo.
niIRI, err := t.callNodeInfoWellKnown(ctx, iri, robotsTxt)
if err != nil {
return nil, fmt.Errorf("dereferenceByNodeInfo: error during initial call to well-known nodeinfo: %s", err)
return nil, gtserror.Newf("error during initial call to .well-known: %w", err)
}
ni, err := callNodeInfo(c, t, niIRI)
// Use the returned nodeinfo IRI to make a followup call.
ni, err := t.callNodeInfo(ctx, niIRI, robotsTxt)
if err != nil {
return nil, fmt.Errorf("dereferenceByNodeInfo: error doing second call to nodeinfo uri %s: %s", niIRI.String(), err)
return nil, gtserror.Newf("error during call to %s: %w", niIRI.String(), err)
}
// we got a response of some kind! take what we can from it...
// We got a response of some kind!
//
// Start building out the bare minimum
// instance model, we'll add to it if we can.
id, err := id.NewRandomULID()
if err != nil {
return nil, fmt.Errorf("dereferenceByNodeInfo: error creating new id for instance %s: %s", iri.Host, err)
return nil, gtserror.Newf("error creating new id for instance %s: %w", iri.Host, err)
}
// this is the bare minimum instance we'll return, and we'll add more stuff to it if we can
i := &gtsmodel.Instance{
ID: id,
Domain: iri.Host,
@ -234,11 +279,23 @@ func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsm
return i, nil
}
func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*url.URL, error) {
func (t *transport) callNodeInfoWellKnown(
ctx context.Context,
iri *url.URL,
robotsTxt *robotstxt.RobotsData,
) (*url.URL, error) {
const path = ".well-known/nodeinfo"
// Bail if we're not allowed to fetch this endpoint.
if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) {
err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path)
return nil, gtserror.SetNotPermitted(err)
}
cleanIRI := &url.URL{
Scheme: iri.Scheme,
Host: iri.Host,
Path: ".well-known/nodeinfo",
Path: path,
}
// Build IRI just once
@ -261,7 +318,19 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
return nil, gtserror.NewFromResponse(resp)
}
// Ensure that the incoming request content-type is expected.
// Ensure that we can use data returned from this endpoint.
robots := resp.Header.Values("X-Robots-Tag")
if slices.ContainsFunc(
robots,
func(key string) bool {
return strings.Contains(key, "noindex")
},
) {
err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path)
return nil, gtserror.SetNotPermitted(err)
}
// Ensure that the returned content-type is expected.
if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) {
err := gtserror.Newf("non json response type: %s", ct)
return nil, gtserror.SetMalformed(err)
@ -279,7 +348,8 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
return nil, gtserror.Newf("could not unmarshal server response as WellKnownResponse: %w", err)
}
// look through the links for the first one that matches the nodeinfo schema, this is what we need
// Look through the links for the first one that
// matches nodeinfo schema, this is what we need.
var nodeinfoHref *url.URL
for _, l := range wellKnownResp.Links {
if l.Href == "" || !strings.HasPrefix(l.Rel, "http://nodeinfo.diaspora.software/ns/schema/2") {
@ -297,7 +367,23 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
return nodeinfoHref, nil
}
func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.Nodeinfo, error) {
func (t *transport) callNodeInfo(
ctx context.Context,
iri *url.URL,
robotsTxt *robotstxt.RobotsData,
) (*apimodel.Nodeinfo, error) {
// Normalize robots.txt test path.
testPath := iri.Path
if !strings.HasPrefix(testPath, "/") {
testPath = "/" + testPath
}
// Bail if we're not allowed to fetch this endpoint.
if robotsTxt != nil && !robotsTxt.TestAgent(testPath, t.controller.userAgent) {
err := gtserror.Newf("can't fetch %s: robots.txt disallows it", testPath)
return nil, gtserror.SetNotPermitted(err)
}
// Build IRI just once
iriStr := iri.String()
@ -324,6 +410,18 @@ func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.No
return nil, gtserror.SetMalformed(err)
}
// Ensure that we can use data returned from this endpoint.
robots := resp.Header.Values("X-Robots-Tag")
if slices.ContainsFunc(
robots,
func(key string) bool {
return strings.Contains(key, "noindex")
},
) {
err := gtserror.Newf("can't use fetched %s: robots tags disallows it", iri.Path)
return nil, gtserror.SetNotPermitted(err)
}
b, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err

View file

@ -0,0 +1,91 @@
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package transport
import (
"context"
"net/http"
"net/url"
"codeberg.org/gruf/go-bytesize"
"codeberg.org/gruf/go-iotools"
apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
"github.com/superseriousbusiness/gotosocial/internal/gtserror"
"github.com/temoto/robotstxt"
)
// DereferenceRobots fetches and parses the robots.txt file for the
// given protocol + host, returning the parsed robots data on success.
// A non-200 response, an unexpected content-type, or an oversized body
// all result in an error and no robots data.
func (t *transport) DereferenceRobots(ctx context.Context, protocol string, host string) (*robotstxt.RobotsData, error) {
	// robots.txt lives at one fixed, well-known
	// location, so build the target IRI directly.
	iriStr := (&url.URL{
		Scheme: protocol,
		Host:   host,
		Path:   "robots.txt",
	}).String()

	// Prepare new HTTP request to endpoint.
	req, err := http.NewRequestWithContext(ctx, "GET", iriStr, nil)
	if err != nil {
		return nil, err
	}

	// Ask for text/plain in utf-8, as the
	// robots exclusion protocol expects.
	//
	// https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method
	req.Header.Add("Accept", apiutil.TextPlain)
	req.Header.Add("Accept-Charset", apiutil.UTF8)

	// Perform the HTTP request.
	rsp, err := t.GET(req)
	if err != nil {
		return nil, err
	}

	// Anything other than 200 OK is unusable.
	if rsp.StatusCode != http.StatusOK {
		err := gtserror.NewFromResponse(rsp)
		_ = rsp.Body.Close() // close early.
		return nil, err
	}

	// Bail on unexpected content-type in the response.
	if ct := rsp.Header.Get("Content-Type"); !apiutil.TextPlainContentType(ct) {
		err := gtserror.Newf("non text/plain response: %s", ct)
		_ = rsp.Body.Close() // close early.
		return nil, gtserror.SetMalformed(err)
	}

	// Limit the robots.txt size to 500KiB
	//
	// https://www.rfc-editor.org/rfc/rfc9309.html#name-limits
	const maxsz = int64(500 * bytesize.KiB)

	// Reject bodies that declare themselves over the limit.
	if rsp.ContentLength > maxsz {
		_ = rsp.Body.Close() // close early.
		sz := bytesize.Size(maxsz) //nolint:gosec
		return nil, gtserror.Newf("robots.txt body exceeds max size %s", sz)
	}

	// Enforce the size cap on the actual read as well,
	// regardless of what Content-Length claimed.
	rsp.Body, _, _ = iotools.UpdateReadCloserLimit(rsp.Body, maxsz)
	defer rsp.Body.Close()

	return robotstxt.FromResponse(rsp)
}