mirror of
				https://github.com/superseriousbusiness/gotosocial.git
				synced 2025-11-04 04:02:25 -06:00 
			
		
		
		
	
		
			
				
	
	
		
			508 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			508 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
// Copyright 2015 go-swagger maintainers
 | 
						|
//
 | 
						|
// Licensed under the Apache License, Version 2.0 (the "License");
 | 
						|
// you may not use this file except in compliance with the License.
 | 
						|
// You may obtain a copy of the License at
 | 
						|
//
 | 
						|
//    http://www.apache.org/licenses/LICENSE-2.0
 | 
						|
//
 | 
						|
// Unless required by applicable law or agreed to in writing, software
 | 
						|
// distributed under the License is distributed on an "AS IS" BASIS,
 | 
						|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
						|
// See the License for the specific language governing permissions and
 | 
						|
// limitations under the License.
 | 
						|
 | 
						|
package swag
 | 
						|
 | 
						|
import (
 | 
						|
	"bytes"
 | 
						|
	"sync"
 | 
						|
	"unicode"
 | 
						|
	"unicode/utf8"
 | 
						|
)
 | 
						|
 | 
						|
// splitter breaks a name into lexems, recognizing a configured
// list of initialisms along the way.
type splitter struct {
	initialisms              []string
	initialismsRunes         [][]rune
	initialismsUpperCased    [][]rune // initialisms cached in their trimmed, upper-cased version
	postSplitInitialismCheck bool
}

// splitterOption tunes a splitter in place.
type splitterOption func(*splitter)

// initialismMatch tracks one candidate initialism found in the input.
//
// body holds the runes of the initialism being matched, start and end are
// rune positions in the input, and complete tells whether the whole body
// has been matched.
type initialismMatch struct {
	body       []rune
	start, end int
	complete   bool
}

// initialismMatches is a list of candidate matches.
type initialismMatches []initialismMatch
 | 
						|
 | 
						|
// Memory pools of temporary objects.
//
// Each wrapper embeds a *sync.Pool so that typed Borrow/Redeem helpers can
// be attached to it. The pools recycle short-lived allocations and keep
// pressure off the garbage collector.

// matchesPool recycles *initialismMatches slices.
type matchesPool struct {
	*sync.Pool
}

// buffersPool recycles *bytes.Buffer values.
type buffersPool struct {
	*sync.Pool
}

// lexemsPool recycles *[]nameLexem slices.
type lexemsPool struct {
	*sync.Pool
}

// splittersPool recycles *splitter values.
type splittersPool struct {
	*sync.Pool
}
 | 
						|
 | 
						|
var (
 | 
						|
	// poolOfMatches holds temporary slices for recycling during the initialism match process
 | 
						|
	poolOfMatches = matchesPool{
 | 
						|
		Pool: &sync.Pool{
 | 
						|
			New: func() any {
 | 
						|
				s := make(initialismMatches, 0, maxAllocMatches)
 | 
						|
 | 
						|
				return &s
 | 
						|
			},
 | 
						|
		},
 | 
						|
	}
 | 
						|
 | 
						|
	poolOfBuffers = buffersPool{
 | 
						|
		Pool: &sync.Pool{
 | 
						|
			New: func() any {
 | 
						|
				return new(bytes.Buffer)
 | 
						|
			},
 | 
						|
		},
 | 
						|
	}
 | 
						|
 | 
						|
	poolOfLexems = lexemsPool{
 | 
						|
		Pool: &sync.Pool{
 | 
						|
			New: func() any {
 | 
						|
				s := make([]nameLexem, 0, maxAllocMatches)
 | 
						|
 | 
						|
				return &s
 | 
						|
			},
 | 
						|
		},
 | 
						|
	}
 | 
						|
 | 
						|
	poolOfSplitters = splittersPool{
 | 
						|
		Pool: &sync.Pool{
 | 
						|
			New: func() any {
 | 
						|
				s := newSplitter()
 | 
						|
 | 
						|
				return &s
 | 
						|
			},
 | 
						|
		},
 | 
						|
	}
 | 
						|
)
 | 
						|
 | 
						|
// nameReplaceTable maps a special character to the word that stands for it.
//
// The boolean result tells whether r is a known special character. The
// separators '-' and '_' are known but map to the empty string.
func nameReplaceTable(r rune) (string, bool) {
	var word string

	switch r {
	case '@':
		word = "At "
	case '&':
		word = "And "
	case '|':
		word = "Pipe "
	case '$':
		word = "Dollar "
	case '!':
		word = "Bang "
	case '-', '_':
		// plain separators: recognized, but nothing to substitute
	default:
		return "", false
	}

	return word, true
}
 | 
						|
 | 
						|
// split calls the splitter.
 | 
						|
//
 | 
						|
// Use newSplitter for more control and options
 | 
						|
func split(str string) []string {
 | 
						|
	s := poolOfSplitters.BorrowSplitter()
 | 
						|
	lexems := s.split(str)
 | 
						|
	result := make([]string, 0, len(*lexems))
 | 
						|
 | 
						|
	for _, lexem := range *lexems {
 | 
						|
		result = append(result, lexem.GetOriginal())
 | 
						|
	}
 | 
						|
	poolOfLexems.RedeemLexems(lexems)
 | 
						|
	poolOfSplitters.RedeemSplitter(s)
 | 
						|
 | 
						|
	return result
 | 
						|
 | 
						|
}
 | 
						|
 | 
						|
func newSplitter(options ...splitterOption) splitter {
 | 
						|
	s := splitter{
 | 
						|
		postSplitInitialismCheck: false,
 | 
						|
		initialisms:              initialisms,
 | 
						|
		initialismsRunes:         initialismsRunes,
 | 
						|
		initialismsUpperCased:    initialismsUpperCased,
 | 
						|
	}
 | 
						|
 | 
						|
	for _, option := range options {
 | 
						|
		option(&s)
 | 
						|
	}
 | 
						|
 | 
						|
	return s
 | 
						|
}
 | 
						|
 | 
						|
// withPostSplitInitialismCheck allows to catch initialisms after main split process
 | 
						|
func withPostSplitInitialismCheck(s *splitter) {
 | 
						|
	s.postSplitInitialismCheck = true
 | 
						|
}
 | 
						|
 | 
						|
func (p matchesPool) BorrowMatches() *initialismMatches {
 | 
						|
	s := p.Get().(*initialismMatches)
 | 
						|
	*s = (*s)[:0] // reset slice, keep allocated capacity
 | 
						|
 | 
						|
	return s
 | 
						|
}
 | 
						|
 | 
						|
func (p buffersPool) BorrowBuffer(size int) *bytes.Buffer {
 | 
						|
	s := p.Get().(*bytes.Buffer)
 | 
						|
	s.Reset()
 | 
						|
 | 
						|
	if s.Cap() < size {
 | 
						|
		s.Grow(size)
 | 
						|
	}
 | 
						|
 | 
						|
	return s
 | 
						|
}
 | 
						|
 | 
						|
func (p lexemsPool) BorrowLexems() *[]nameLexem {
 | 
						|
	s := p.Get().(*[]nameLexem)
 | 
						|
	*s = (*s)[:0] // reset slice, keep allocated capacity
 | 
						|
 | 
						|
	return s
 | 
						|
}
 | 
						|
 | 
						|
func (p splittersPool) BorrowSplitter(options ...splitterOption) *splitter {
 | 
						|
	s := p.Get().(*splitter)
 | 
						|
	s.postSplitInitialismCheck = false // reset options
 | 
						|
	for _, apply := range options {
 | 
						|
		apply(s)
 | 
						|
	}
 | 
						|
 | 
						|
	return s
 | 
						|
}
 | 
						|
 | 
						|
func (p matchesPool) RedeemMatches(s *initialismMatches) {
 | 
						|
	p.Put(s)
 | 
						|
}
 | 
						|
 | 
						|
func (p buffersPool) RedeemBuffer(s *bytes.Buffer) {
 | 
						|
	p.Put(s)
 | 
						|
}
 | 
						|
 | 
						|
func (p lexemsPool) RedeemLexems(s *[]nameLexem) {
 | 
						|
	p.Put(s)
 | 
						|
}
 | 
						|
 | 
						|
func (p splittersPool) RedeemSplitter(s *splitter) {
 | 
						|
	p.Put(s)
 | 
						|
}
 | 
						|
 | 
						|
func (m initialismMatch) isZero() bool {
 | 
						|
	return m.start == 0 && m.end == 0
 | 
						|
}
 | 
						|
 | 
						|
func (s splitter) split(name string) *[]nameLexem {
 | 
						|
	nameRunes := []rune(name)
 | 
						|
	matches := s.gatherInitialismMatches(nameRunes)
 | 
						|
	if matches == nil {
 | 
						|
		return poolOfLexems.BorrowLexems()
 | 
						|
	}
 | 
						|
 | 
						|
	return s.mapMatchesToNameLexems(nameRunes, matches)
 | 
						|
}
 | 
						|
 | 
						|
func (s splitter) gatherInitialismMatches(nameRunes []rune) *initialismMatches {
 | 
						|
	var matches *initialismMatches
 | 
						|
 | 
						|
	for currentRunePosition, currentRune := range nameRunes {
 | 
						|
		// recycle these allocations as we loop over runes
 | 
						|
		// with such recycling, only 2 slices should be allocated per call
 | 
						|
		// instead of o(n).
 | 
						|
		newMatches := poolOfMatches.BorrowMatches()
 | 
						|
 | 
						|
		// check current initialism matches
 | 
						|
		if matches != nil { // skip first iteration
 | 
						|
			for _, match := range *matches {
 | 
						|
				if keepCompleteMatch := match.complete; keepCompleteMatch {
 | 
						|
					*newMatches = append(*newMatches, match)
 | 
						|
					continue
 | 
						|
				}
 | 
						|
 | 
						|
				// drop failed match
 | 
						|
				currentMatchRune := match.body[currentRunePosition-match.start]
 | 
						|
				if currentMatchRune != currentRune {
 | 
						|
					continue
 | 
						|
				}
 | 
						|
 | 
						|
				// try to complete ongoing match
 | 
						|
				if currentRunePosition-match.start == len(match.body)-1 {
 | 
						|
					// we are close; the next step is to check the symbol ahead
 | 
						|
					// if it is a small letter, then it is not the end of match
 | 
						|
					// but beginning of the next word
 | 
						|
 | 
						|
					if currentRunePosition < len(nameRunes)-1 {
 | 
						|
						nextRune := nameRunes[currentRunePosition+1]
 | 
						|
						if newWord := unicode.IsLower(nextRune); newWord {
 | 
						|
							// oh ok, it was the start of a new word
 | 
						|
							continue
 | 
						|
						}
 | 
						|
					}
 | 
						|
 | 
						|
					match.complete = true
 | 
						|
					match.end = currentRunePosition
 | 
						|
				}
 | 
						|
 | 
						|
				*newMatches = append(*newMatches, match)
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		// check for new initialism matches
 | 
						|
		for i := range s.initialisms {
 | 
						|
			initialismRunes := s.initialismsRunes[i]
 | 
						|
			if initialismRunes[0] == currentRune {
 | 
						|
				*newMatches = append(*newMatches, initialismMatch{
 | 
						|
					start:    currentRunePosition,
 | 
						|
					body:     initialismRunes,
 | 
						|
					complete: false,
 | 
						|
				})
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		if matches != nil {
 | 
						|
			poolOfMatches.RedeemMatches(matches)
 | 
						|
		}
 | 
						|
		matches = newMatches
 | 
						|
	}
 | 
						|
 | 
						|
	// up to the caller to redeem this last slice
 | 
						|
	return matches
 | 
						|
}
 | 
						|
 | 
						|
func (s splitter) mapMatchesToNameLexems(nameRunes []rune, matches *initialismMatches) *[]nameLexem {
 | 
						|
	nameLexems := poolOfLexems.BorrowLexems()
 | 
						|
 | 
						|
	var lastAcceptedMatch initialismMatch
 | 
						|
	for _, match := range *matches {
 | 
						|
		if !match.complete {
 | 
						|
			continue
 | 
						|
		}
 | 
						|
 | 
						|
		if firstMatch := lastAcceptedMatch.isZero(); firstMatch {
 | 
						|
			s.appendBrokenDownCasualString(nameLexems, nameRunes[:match.start])
 | 
						|
			*nameLexems = append(*nameLexems, s.breakInitialism(string(match.body)))
 | 
						|
 | 
						|
			lastAcceptedMatch = match
 | 
						|
 | 
						|
			continue
 | 
						|
		}
 | 
						|
 | 
						|
		if overlappedMatch := match.start <= lastAcceptedMatch.end; overlappedMatch {
 | 
						|
			continue
 | 
						|
		}
 | 
						|
 | 
						|
		middle := nameRunes[lastAcceptedMatch.end+1 : match.start]
 | 
						|
		s.appendBrokenDownCasualString(nameLexems, middle)
 | 
						|
		*nameLexems = append(*nameLexems, s.breakInitialism(string(match.body)))
 | 
						|
 | 
						|
		lastAcceptedMatch = match
 | 
						|
	}
 | 
						|
 | 
						|
	// we have not found any accepted matches
 | 
						|
	if lastAcceptedMatch.isZero() {
 | 
						|
		*nameLexems = (*nameLexems)[:0]
 | 
						|
		s.appendBrokenDownCasualString(nameLexems, nameRunes)
 | 
						|
	} else if lastAcceptedMatch.end+1 != len(nameRunes) {
 | 
						|
		rest := nameRunes[lastAcceptedMatch.end+1:]
 | 
						|
		s.appendBrokenDownCasualString(nameLexems, rest)
 | 
						|
	}
 | 
						|
 | 
						|
	poolOfMatches.RedeemMatches(matches)
 | 
						|
 | 
						|
	return nameLexems
 | 
						|
}
 | 
						|
 | 
						|
func (s splitter) breakInitialism(original string) nameLexem {
 | 
						|
	return newInitialismNameLexem(original, original)
 | 
						|
}
 | 
						|
 | 
						|
func (s splitter) appendBrokenDownCasualString(segments *[]nameLexem, str []rune) {
 | 
						|
	currentSegment := poolOfBuffers.BorrowBuffer(len(str)) // unlike strings.Builder, bytes.Buffer initial storage can reused
 | 
						|
	defer func() {
 | 
						|
		poolOfBuffers.RedeemBuffer(currentSegment)
 | 
						|
	}()
 | 
						|
 | 
						|
	addCasualNameLexem := func(original string) {
 | 
						|
		*segments = append(*segments, newCasualNameLexem(original))
 | 
						|
	}
 | 
						|
 | 
						|
	addInitialismNameLexem := func(original, match string) {
 | 
						|
		*segments = append(*segments, newInitialismNameLexem(original, match))
 | 
						|
	}
 | 
						|
 | 
						|
	var addNameLexem func(string)
 | 
						|
	if s.postSplitInitialismCheck {
 | 
						|
		addNameLexem = func(original string) {
 | 
						|
			for i := range s.initialisms {
 | 
						|
				if isEqualFoldIgnoreSpace(s.initialismsUpperCased[i], original) {
 | 
						|
					addInitialismNameLexem(original, s.initialisms[i])
 | 
						|
 | 
						|
					return
 | 
						|
				}
 | 
						|
			}
 | 
						|
 | 
						|
			addCasualNameLexem(original)
 | 
						|
		}
 | 
						|
	} else {
 | 
						|
		addNameLexem = addCasualNameLexem
 | 
						|
	}
 | 
						|
 | 
						|
	for _, rn := range str {
 | 
						|
		if replace, found := nameReplaceTable(rn); found {
 | 
						|
			if currentSegment.Len() > 0 {
 | 
						|
				addNameLexem(currentSegment.String())
 | 
						|
				currentSegment.Reset()
 | 
						|
			}
 | 
						|
 | 
						|
			if replace != "" {
 | 
						|
				addNameLexem(replace)
 | 
						|
			}
 | 
						|
 | 
						|
			continue
 | 
						|
		}
 | 
						|
 | 
						|
		if !unicode.In(rn, unicode.L, unicode.M, unicode.N, unicode.Pc) {
 | 
						|
			if currentSegment.Len() > 0 {
 | 
						|
				addNameLexem(currentSegment.String())
 | 
						|
				currentSegment.Reset()
 | 
						|
			}
 | 
						|
 | 
						|
			continue
 | 
						|
		}
 | 
						|
 | 
						|
		if unicode.IsUpper(rn) {
 | 
						|
			if currentSegment.Len() > 0 {
 | 
						|
				addNameLexem(currentSegment.String())
 | 
						|
			}
 | 
						|
			currentSegment.Reset()
 | 
						|
		}
 | 
						|
 | 
						|
		currentSegment.WriteRune(rn)
 | 
						|
	}
 | 
						|
 | 
						|
	if currentSegment.Len() > 0 {
 | 
						|
		addNameLexem(currentSegment.String())
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// isEqualFoldIgnoreSpace is the same as strings.EqualFold, but
// it ignores leading and trailing blank spaces in the compared
// string.
//
// base is assumed to be composed of upper-cased runes, and be already
// trimmed.
//
// str is read in place: indexing a string and utf8.DecodeRuneInString do
// not copy data, so no string-to-bytes conversion is needed.
//
// This code is heavily inspired from strings.EqualFold.
func isEqualFoldIgnoreSpace(base []rune, str string) bool {
	i := indexAfterBlanks(str, 0)
	if i >= len(str) {
		// str is empty or all blanks: it only equals an empty base
		return len(base) == 0
	}

	var matched int
	for _, baseRune := range base {
		if i >= len(str) {
			break
		}

		if c := str[i]; c < utf8.RuneSelf {
			// single byte rune case (ASCII)
			if baseRune >= utf8.RuneSelf {
				return false
			}

			baseChar := byte(baseRune)
			if c != baseChar &&
				!('a' <= c && c <= 'z' && c-'a'+'A' == baseChar) {
				return false
			}

			matched++
			i++

			continue
		}

		// unicode case
		r, size := utf8.DecodeRuneInString(str[i:])
		if unicode.ToUpper(r) != baseRune {
			return false
		}

		matched++
		i += size
	}

	if matched != len(base) {
		// str ran out before base was fully matched
		return false
	}

	// all passed: now we should only have blanks
	return indexAfterBlanks(str, i) == len(str)
}

// indexAfterBlanks returns the index of the first byte of str, at or after
// i, that does not start a blank. It returns len(str) when only blanks
// remain.
//
// Among ASCII bytes, only the space and the tab are blanks. Multi-byte
// runes are blanks when unicode.IsSpace says so.
func indexAfterBlanks(str string, i int) int {
	for i < len(str) {
		if c := str[i]; c < utf8.RuneSelf {
			// fast path for ASCII
			if c != ' ' && c != '\t' {
				return i
			}
			i++

			continue
		}

		// unicode case
		r, size := utf8.DecodeRuneInString(str[i:])
		if !unicode.IsSpace(r) {
			return i
		}
		i += size
	}

	return i
}
 |