sample: temporarily use grammars for constrained generation in new engine (#9586)

This commit is contained in:
Jeffrey Morgan 2025-03-10 16:17:39 +01:00 committed by GitHub
parent a1cda80bcb
commit e093db92c4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 301 additions and 213 deletions

View file

@ -2,43 +2,88 @@ package sample
import (
"errors"
"math"
"math/rand/v2"
"slices"
"sync"
"github.com/ollama/ollama/llama"
)
// Sampler is not thread-safe. Each goroutine should have its own instance
type Sampler interface {
Sample([]float32) (int32, error)
}
// logit represents information about a single token during sampling
type logit struct {
// token represents information about a single token during sampling
type token struct {
	id    int32   // The token's unique identifier
	value float32 // The raw logit or probability from the model
}
type weighted struct {
// Sampler selects a token id from a model's output logits, optionally
// constrained by a grammar. It is not thread-safe; each goroutine
// should have its own instance.
type Sampler struct {
	rng         *rand.Rand // randomness source for weighted sampling; nil when no explicit seed was given — TODO confirm the seed==-1 path in NewSampler
	topK        int        // keep only the topK highest-valued tokens; <= 0 disables the cutoff (see sample)
	topP        float32    // nucleus (top-p) threshold passed to topP — presumably cumulative-probability cutoff; verify against the topP helper
	minP        float32    // threshold passed to minP — presumably minimum probability relative to the max; verify against the minP helper
	temperature float32    // logit scaling factor; 0 selects greedy (argmax) sampling
	grammar     *Grammar   // optional grammar constraint; nil disables constrained generation
}
func (s *weighted) Sample(logits []float32) (int32, error) {
if len(s.tokens) < len(logits) {
s.tokens = make([]logit, len(logits))
}
tokens := s.tokens[:len(logits)]
for i, v := range logits {
// Sample converts raw logits into token structs (id = index into
// logits), runs the configured sampling pipeline via sample, and
// returns the chosen token id. When a grammar is configured, the
// unconstrained pick is validated first; if the grammar rejects it,
// the grammar is applied to all logits and sampling is repeated.
// Returns -1 with a non-nil error on failure.
func (s *Sampler) Sample(logits []float32) (int32, error) {
	tokens := make([]token, len(logits))
	for i := range logits {
		tokens[i].id = int32(i)
		tokens[i].value = logits[i]
	}
	t, err := s.sample(tokens)
	if err != nil {
		return -1, err
	}
	if s.grammar != nil {
		// optimization: first check if the max logit is accepted by the grammar
		// if the max logit is rejected, apply the grammar to all logits (slower)
		top := []token{t}
		s.grammar.Apply(top)
		// the grammar is expected to mark a rejected token by forcing its
		// logit to -Inf (that is what the check below relies on)
		if !math.IsInf(float64(top[0].value), -1) {
			s.grammar.Accept(top[0].id)
			return top[0].id, nil
		}
		// since .sample has side effects of modifying the tokens
		// we need to reset them before applying the grammar and
		// sampling again
		for i := range logits {
			tokens[i].id = int32(i)
			tokens[i].value = logits[i]
		}
		s.grammar.Apply(tokens)
		t, err = s.sample(tokens)
		if err != nil {
			return -1, err
		}
		s.grammar.Accept(t.id)
	}
	return t.id, nil
}
// greedy returns the highest probability token from the tokens.
// Ties are resolved in favor of the earliest token.
func greedy(tokens []token) token {
	best := tokens[0]
	for _, tok := range tokens[1:] {
		if tok.value > best.value {
			best = tok
		}
	}
	return best
}
// sample returns the highest probability token from the tokens
// given sampler parameters. It also has side effects of modifying the tokens
func (s *Sampler) sample(tokens []token) (token, error) {
if s.temperature == 0 {
return greedy(tokens), nil
}
// Tokens are sorted by logits in TopK or SortTokens
if s.topK > 0 {
tokens = topK(tokens, s.topK)
} else {
@ -47,12 +92,14 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
tokens = temperature(tokens, s.temperature)
tokens = softmax(tokens)
tokens = topP(tokens, s.topP)
tokens = minP(tokens, s.minP)
// TODO: this should fall back to greedy sampling
// or topP, topK values etc should be such that
// there are always tokens to sample from
if len(tokens) == 0 {
return -1, errors.New("no valid logits found for weighted sampling")
return token{}, errors.New("no tokens to sample from")
}
var r float32
@ -70,48 +117,18 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
}
r *= tokens[len(tokens)-1].value
idx, _ := slices.BinarySearchFunc(tokens, r, func(token logit, target float32) int {
// Compare cumulative probabilities
idx, _ := slices.BinarySearchFunc(tokens, r, func(token token, target float32) int {
if token.value < target {
return -1
}
// First token that exceeds target
return 1
})
if idx >= len(tokens) {
idx = len(tokens) - 1
}
return tokens[idx].id, nil
}
type greedy struct{}
// Greedy sample returns the index of the maximum value in logits.
func (s greedy) Sample(logits []float32) (int32, error) {
if len(logits) == 0 {
return -1, errors.New("no logits provided for greedy sampling")
}
maxIdx := 0
maxVal := logits[0]
for i := 1; i < len(logits); i++ {
if logits[i] > maxVal {
maxVal = logits[i]
maxIdx = i
}
}
return int32(maxIdx), nil
return tokens[idx], nil
}
// TODO(parthsareen): update sampler interface to use json unmarshal https://github.com/ollama/ollama/issues/9278
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int) Sampler {
if temperature == 0 {
return &greedy{}
}
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int, grammar *Grammar) Sampler {
var rng *rand.Rand
if seed != -1 {
// PCG requires two parameters: sequence and stream
@ -120,7 +137,9 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
// Use golden ratio hash to generate statistically independent seeds
rng = rand.New(rand.NewPCG(sequence, sequence^0x9E3779B9))
}
temperature = max(temperature, 1)
if temperature < 0.0 {
temperature = 0.0
}
if topP < 0.0 {
topP = 0.0
@ -136,11 +155,73 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
minP = 1.0
}
return &weighted{
return Sampler{
rng: rng,
topK: topK,
topP: topP,
minP: minP,
temperature: temperature,
grammar: grammar,
}
}
// Grammar constrains token sampling using a llama.cpp grammar sampler:
// Apply masks logits of tokens the grammar rejects, and Accept advances
// the grammar state after a token is chosen.
type Grammar struct {
	vocab   *Vocab         // vocabulary the sampler was built against
	grammar string         // the grammar definition text supplied to NewGrammar
	sampler *llama.Sampler // underlying llama.cpp grammar sampler
}
// NewGrammar loads the vocabulary and constructs a Grammar that
// constrains sampling according to the given grammar definition.
// It returns an error if the vocabulary cannot be loaded.
func NewGrammar(vocab *Vocab, grammar string) (*Grammar, error) {
	loaded, err := vocab.Load()
	if err != nil {
		return nil, err
	}

	g := &Grammar{
		vocab:   vocab,
		grammar: grammar,
		sampler: llama.NewGrammarSampler(loaded, grammar),
	}
	return g, nil
}
// Apply runs the grammar sampler over tokens in place, replacing each
// token's value with the grammar-adjusted logit. Token ids are left
// untouched.
func (g *Grammar) Apply(tokens []token) {
	data := make([]llama.TokenData, len(tokens))
	for i := range tokens {
		data[i] = llama.TokenData{
			Id:    tokens[i].id,
			Logit: tokens[i].value,
		}
	}

	g.sampler.Apply(data)

	for i, td := range data {
		tokens[i].value = td.Logit
	}
}
// Accept notifies the underlying grammar sampler that token was chosen,
// so subsequent Apply calls reflect the updated grammar state.
func (g *Grammar) Accept(token int32) {
	g.sampler.Accept(token)
}
// Vocab lazily loads a llama.cpp vocabulary from a file path. The load
// is performed at most once (guarded by once); later Load calls return
// the cached result. Contains a sync.Once, so it must not be copied.
type Vocab struct {
	once  sync.Once    // guards the one-time load in Load
	vocab *llama.Vocab // loaded vocabulary; nil until Load succeeds
	err   error        // error recorded by the load attempt, if any
	path  string       // filesystem path the vocabulary is loaded from
}
// NewVocab returns a Vocab that will load from path on first use.
// No I/O happens until Load is called.
func NewVocab(path string) *Vocab {
	return &Vocab{path: path}
}
// Load returns the lazily-loaded vocabulary, reading it from disk on
// the first call and caching the result (or the error) for every
// subsequent call.
func (v *Vocab) Load() (*llama.Vocab, error) {
	v.once.Do(func() {
		loaded, loadErr := llama.LoadVocabFromFile(v.path)
		if loadErr != nil {
			v.err = loadErr
			return
		}
		v.vocab = loaded
	})
	return v.vocab, v.err
}