[chore] Use bulk updates + fewer loops in status rethreading migration (#4459)

This pull request optimizes our status rethreading migration by using bulk updates and avoiding unnecessary writes, doing the migration in one top-level loop and one stragglers loop, without the extra loop to copy thread_id over.
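To illustrate the idea, here's a minimal sketch of the bulk-update pattern: instead of writing rows one at a time, select one batch of IDs and rewrite them all in a single UPDATE. This is *not* the actual migration code; the `statuses` table, `thread_id` column, and parameter names are assumptions for the example:

```go
package migrations

import (
	"context"

	"github.com/uptrace/bun"
)

// bulkUpdateBatch is an illustrative sketch, NOT the actual migration code:
// it selects one batch of up to 5000 IDs below a high-water mark and rewrites
// them all in a single UPDATE statement, returning the rows affected.
func bulkUpdateBatch(
	ctx context.Context,
	tx bun.Tx,
	highest string, // current high-water mark ID
	newThreadID string,
) (int64, error) {
	// Select the next batch of IDs below the high-water mark.
	batchQ := tx.NewSelect().
		Table("statuses").
		Column("id").
		Where("? < ?", bun.Ident("id"), highest).
		OrderExpr("? DESC", bun.Ident("id")).
		Limit(5000)

	// One bulk UPDATE covering the whole batch,
	// instead of 5000 separate single-row writes.
	res, err := tx.NewRaw(
		"UPDATE ? SET ? = ? WHERE ? IN (?)",
		bun.Ident("statuses"), bun.Ident("thread_id"), newThreadID,
		bun.Ident("id"), batchQ,
	).Exec(ctx)
	if err != nil {
		return 0, err
	}
	return res.RowsAffected()
}
```

Each round trip writes a whole batch, so the per-row overhead of issuing individual UPDATE statements disappears.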

On my machine it now runs at about 2400 rows per second on Postgres, and about 9000 rows per second on SQLite.

I tried *many* different ways of doing this, with and without temporary indexes, with different batch and transaction sizes, etc., and this seems to be just about the most performant way of getting it done.

With these changes, a few minutes are shaved off the migration time when testing on my development machine. *Hopefully* this will translate to even more time saved when running on a VPS with slower read/write speeds and less processor power.

SQLite before:

```
real	20m58,446s
user	16m26,635s
sys	5m53,648s
```

SQLite after:

```
real	14m25,435s
user	12m47,449s
sys	2m27,898s
```

Postgres before:

```
real	28m25,307s
user	3m40,005s
sys	4m45,018s
```

Postgres after:

```
real	22m31,999s
user	3m46,674s
sys	4m39,592s
```

Reviewed-on: https://codeberg.org/superseriousbusiness/gotosocial/pulls/4459
Co-authored-by: tobi <tobi.smethurst@protonmail.com>
Co-committed-by: tobi <tobi.smethurst@protonmail.com>

@@ -66,98 +66,6 @@ func doWALCheckpoint(ctx context.Context, db *bun.DB) error {

```go
	return nil
}

// batchUpdateByID performs the given updateQuery with updateArgs
// over the entire given table, batching by the ID of batchByCol.
func batchUpdateByID(
	ctx context.Context,
	tx bun.Tx,
	table string,
	batchByCol string,
	updateQuery string,
	updateArgs []any,
) error {
	// Get a count of all in table.
	total, err := tx.NewSelect().
		Table(table).
		Count(ctx)
	if err != nil {
		return gtserror.Newf("error selecting total count: %w", err)
	}

	// Query batch size
	// in number of rows.
	const batchsz = 5000

	// Stores highest batch value
	// used in iterate queries,
	// starting at highest possible.
	highest := id.Highest

	// Total updated rows.
	var updated int

	for {
		// Limit to batchsz
		// items at once.
		batchQ := tx.
			NewSelect().
			Table(table).
			Column(batchByCol).
			Where("? < ?", bun.Ident(batchByCol), highest).
			OrderExpr("? DESC", bun.Ident(batchByCol)).
			Limit(batchsz)

		// Finalize UPDATE to act only on batch.
		qStr := updateQuery + " WHERE ? IN (?)"
		args := append(slices.Clone(updateArgs),
			bun.Ident(batchByCol),
			batchQ,
		)

		// Execute the prepared raw query with arguments.
		res, err := tx.NewRaw(qStr, args...).Exec(ctx)
		if err != nil {
			return gtserror.Newf("error updating old column values: %w", err)
		}

		// Check how many items we updated.
		thisUpdated, err := res.RowsAffected()
		if err != nil {
			return gtserror.Newf("error counting affected rows: %w", err)
		}

		if thisUpdated == 0 {
			// Nothing updated
			// means we're done.
			break
		}

		// Update the overall count.
		updated += int(thisUpdated)

		// Log helpful message to admin.
		log.Infof(ctx, "migrated %d of %d %s (up to %s)",
			updated, total, table, highest)

		// Get next highest
		// id for next batch.
		if err := tx.
			NewSelect().
			With("batch_query", batchQ).
			ColumnExpr("min(?) FROM ?", bun.Ident(batchByCol), bun.Ident("batch_query")).
			Scan(ctx, &highest); err != nil {
			return gtserror.Newf("error selecting next highest: %w", err)
		}
	}

	if total != int(updated) {
		// Return error here in order to rollback the whole transaction.
		return fmt.Errorf("total=%d does not match updated=%d", total, updated)
	}

	return nil
}
```
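For context, a call site for this helper might look something like the following; the table, column, and value here are hypothetical, not the actual migration's UPDATE:

```go
// Hypothetical usage of batchUpdateByID: rewrite one column across the
// whole "statuses" table, 5000 rows per UPDATE, keyed on "id". The helper
// appends " WHERE ? IN (?)" to updateQuery for each batch.
if err := batchUpdateByID(ctx, tx,
	"statuses",           // table
	"id",                 // batchByCol
	"UPDATE ? SET ? = ?", // updateQuery
	[]any{
		bun.Ident("statuses"),
		bun.Ident("thread_id"),
		id.Lowest, // placeholder value for the example
	},
); err != nil {
	return err
}
```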
```go
// convertEnums performs a transaction that converts
// a table's column of our old-style enums (strings) to
// more performant and space-saving integer types.
```
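The body of convertEnums falls outside this hunk, but the general shape of such a string-to-integer enum conversion might look like the following sketch; the column names and mapping are assumptions for illustration, not the actual implementation:

```go
// Sketch only: bulk-convert a string enum column to integers,
// issuing one bulk UPDATE per enum value rather than per row.
// Column names and the mapping here are assumptions.
func convertEnumSketch(ctx context.Context, tx bun.Tx) error {
	mapping := map[string]int16{
		"public":         1,
		"unlocked":       2,
		"followers_only": 3,
	}
	for oldVal, newVal := range mapping {
		if _, err := tx.NewUpdate().
			Table("statuses").
			Set("? = ?", bun.Ident("visibility_new"), newVal).
			Where("? = ?", bun.Ident("visibility"), oldVal).
			Exec(ctx); err != nil {
			return err
		}
	}
	return nil
}
```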