sample: temporarily use grammars for constrained generation in new engine (#9586)

This commit is contained in:
Jeffrey Morgan 2025-03-10 16:17:39 +01:00 committed by GitHub
parent a1cda80bcb
commit e093db92c4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 301 additions and 213 deletions

View file

@ -2,43 +2,88 @@ package sample
import (
"errors"
"math"
"math/rand/v2"
"slices"
"sync"
"github.com/ollama/ollama/llama"
)
// Sampler is not thread-safe. Each goroutine should have its own instance
type Sampler interface {
Sample([]float32) (int32, error)
}
// logit represents information about a single token during sampling
type logit struct {
// token represents information about a single token during sampling
type token struct {
	id    int32   // The token's unique identifier
	value float32 // The raw logit or probability from the model
}
type weighted struct {
// Sampler selects a token id from a model's output logits, optionally
// constrained by a grammar. It is not thread-safe; each goroutine
// should have its own instance.
type Sampler struct {
	rng         *rand.Rand // randomness source for weighted sampling; nil when no explicit seed was given — TODO confirm the seed==-1 path in NewSampler
	topK        int        // keep only the topK highest-valued tokens; <= 0 disables the cutoff (see sample)
	topP        float32    // nucleus (top-p) threshold passed to topP — presumably cumulative-probability cutoff; verify against the topP helper
	minP        float32    // threshold passed to minP — presumably minimum probability relative to the max; verify against the minP helper
	temperature float32    // logit scaling factor; 0 selects greedy (argmax) sampling
	grammar     *Grammar   // optional grammar constraint; nil disables constrained generation
}
func (s *weighted) Sample(logits []float32) (int32, error) {
if len(s.tokens) < len(logits) {
s.tokens = make([]logit, len(logits))
}
tokens := s.tokens[:len(logits)]
for i, v := range logits {
// Sample converts raw logits into token structs (id = index into
// logits), runs the configured sampling pipeline via sample, and
// returns the chosen token id. When a grammar is configured, the
// unconstrained pick is validated first; if the grammar rejects it,
// the grammar is applied to all logits and sampling is repeated.
// Returns -1 with a non-nil error on failure.
func (s *Sampler) Sample(logits []float32) (int32, error) {
	tokens := make([]token, len(logits))
	for i := range logits {
		tokens[i].id = int32(i)
		tokens[i].value = logits[i]
	}
	t, err := s.sample(tokens)
	if err != nil {
		return -1, err
	}
	if s.grammar != nil {
		// optimization: first check if the max logit is accepted by the grammar
		// if the max logit is rejected, apply the grammar to all logits (slower)
		top := []token{t}
		s.grammar.Apply(top)
		// the grammar is expected to mark a rejected token by forcing its
		// logit to -Inf (that is what the check below relies on)
		if !math.IsInf(float64(top[0].value), -1) {
			s.grammar.Accept(top[0].id)
			return top[0].id, nil
		}
		// since .sample has side effects of modifying the tokens
		// we need to reset them before applying the grammar and
		// sampling again
		for i := range logits {
			tokens[i].id = int32(i)
			tokens[i].value = logits[i]
		}
		s.grammar.Apply(tokens)
		t, err = s.sample(tokens)
		if err != nil {
			return -1, err
		}
		s.grammar.Accept(t.id)
	}
	return t.id, nil
}
// greedy returns the highest probability token from the tokens.
// Ties are resolved in favor of the earliest token.
func greedy(tokens []token) token {
	best := tokens[0]
	for _, tok := range tokens[1:] {
		if tok.value > best.value {
			best = tok
		}
	}
	return best
}
// sample returns the highest probability token from the tokens
// given sampler parameters. It also has side effects of modifying the tokens
func (s *Sampler) sample(tokens []token) (token, error) {
if s.temperature == 0 {
return greedy(tokens), nil
}
// Tokens are sorted by logits in TopK or SortTokens
if s.topK > 0 {
tokens = topK(tokens, s.topK)
} else {
@ -47,12 +92,14 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
tokens = temperature(tokens, s.temperature)
tokens = softmax(tokens)
tokens = topP(tokens, s.topP)
tokens = minP(tokens, s.minP)
// TODO: this should fall back to greedy sampling
// or topP, topK values etc should be such that
// there are always tokens to sample from
if len(tokens) == 0 {
return -1, errors.New("no valid logits found for weighted sampling")
return token{}, errors.New("no tokens to sample from")
}
var r float32
@ -70,48 +117,18 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
}
r *= tokens[len(tokens)-1].value
idx, _ := slices.BinarySearchFunc(tokens, r, func(token logit, target float32) int {
// Compare cumulative probabilities
idx, _ := slices.BinarySearchFunc(tokens, r, func(token token, target float32) int {
if token.value < target {
return -1
}
// First token that exceeds target
return 1
})
if idx >= len(tokens) {
idx = len(tokens) - 1
}
return tokens[idx].id, nil
}
type greedy struct{}
// Greedy sample returns the index of the maximum value in logits.
func (s greedy) Sample(logits []float32) (int32, error) {
if len(logits) == 0 {
return -1, errors.New("no logits provided for greedy sampling")
}
maxIdx := 0
maxVal := logits[0]
for i := 1; i < len(logits); i++ {
if logits[i] > maxVal {
maxVal = logits[i]
maxIdx = i
}
}
return int32(maxIdx), nil
return tokens[idx], nil
}
// TODO(parthsareen): update sampler interface to use json unmarshal https://github.com/ollama/ollama/issues/9278
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int) Sampler {
if temperature == 0 {
return &greedy{}
}
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int, grammar *Grammar) Sampler {
var rng *rand.Rand
if seed != -1 {
// PCG requires two parameters: sequence and stream
@ -120,7 +137,9 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
// Use golden ratio hash to generate statistically independent seeds
rng = rand.New(rand.NewPCG(sequence, sequence^0x9E3779B9))
}
temperature = max(temperature, 1)
if temperature < 0.0 {
temperature = 0.0
}
if topP < 0.0 {
topP = 0.0
@ -136,11 +155,73 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
minP = 1.0
}
return &weighted{
return Sampler{
rng: rng,
topK: topK,
topP: topP,
minP: minP,
temperature: temperature,
grammar: grammar,
}
}
// Grammar constrains token sampling using a llama.cpp grammar sampler:
// Apply masks logits of tokens the grammar rejects, and Accept advances
// the grammar state after a token is chosen.
type Grammar struct {
	vocab   *Vocab         // vocabulary the sampler was built against
	grammar string         // the grammar definition text supplied to NewGrammar
	sampler *llama.Sampler // underlying llama.cpp grammar sampler
}
// NewGrammar loads the vocabulary and constructs a Grammar that
// constrains sampling according to the given grammar definition.
// It returns an error if the vocabulary cannot be loaded.
func NewGrammar(vocab *Vocab, grammar string) (*Grammar, error) {
	loaded, err := vocab.Load()
	if err != nil {
		return nil, err
	}

	g := &Grammar{
		vocab:   vocab,
		grammar: grammar,
		sampler: llama.NewGrammarSampler(loaded, grammar),
	}
	return g, nil
}
// Apply runs the grammar sampler over tokens in place, replacing each
// token's value with the grammar-adjusted logit. Token ids are left
// untouched.
func (g *Grammar) Apply(tokens []token) {
	data := make([]llama.TokenData, len(tokens))
	for i := range tokens {
		data[i] = llama.TokenData{
			Id:    tokens[i].id,
			Logit: tokens[i].value,
		}
	}

	g.sampler.Apply(data)

	for i, td := range data {
		tokens[i].value = td.Logit
	}
}
// Accept notifies the underlying grammar sampler that token was chosen,
// so subsequent Apply calls reflect the updated grammar state.
func (g *Grammar) Accept(token int32) {
	g.sampler.Accept(token)
}
// Vocab lazily loads a llama.cpp vocabulary from a file path. The load
// is performed at most once (guarded by once); later Load calls return
// the cached result. Contains a sync.Once, so it must not be copied.
type Vocab struct {
	once  sync.Once    // guards the one-time load in Load
	vocab *llama.Vocab // loaded vocabulary; nil until Load succeeds
	err   error        // error recorded by the load attempt, if any
	path  string       // filesystem path the vocabulary is loaded from
}
// NewVocab returns a Vocab that will load from path on first use.
// No I/O happens until Load is called.
func NewVocab(path string) *Vocab {
	return &Vocab{path: path}
}
// Load returns the lazily-loaded vocabulary, reading it from disk on
// the first call and caching the result (or the error) for every
// subsequent call.
func (v *Vocab) Load() (*llama.Vocab, error) {
	v.once.Do(func() {
		loaded, loadErr := llama.LoadVocabFromFile(v.path)
		if loadErr != nil {
			v.err = loadErr
			return
		}
		v.vocab = loaded
	})
	return v.vocab, v.err
}