diff --git a/convert/convert.go b/convert/convert.go index ffcc2b8ab..3c2d0b707 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -53,8 +53,11 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV { } for _, sv := range t.SpecialVocabulary { - kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID) kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken + kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID) + if len(sv.IDs) > 0 { + kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs + } } return kv diff --git a/convert/tokenizer.go b/convert/tokenizer.go index 74e2efed0..768b9fdb8 100644 --- a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -110,6 +110,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error) } if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) { + // noop } else if err != nil { return nil, err } else { @@ -171,6 +172,34 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error) } } + if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) { + } else if err != nil { + return nil, err + } else { + defer f.Close() + + var p map[string]json.RawMessage + if err := json.NewDecoder(f).Decode(&p); err != nil { + return nil, err + } + + for _, st := range specialTokenTypes { + if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok { + var ids []int32 + if err := json.Unmarshal(bts, &ids); err != nil { + // value is not a list so the existing ID is used + continue + } + + if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool { + return sv.Type == st + }); i >= 0 { + t.SpecialVocabulary[i].IDs = ids + } + } + } + } + return t, nil } @@ -278,6 +307,7 @@ func parseVocabulary(fsys fs.FS) (*Vocabulary, error) { type SpecialVocabulary struct { Type string ID int + IDs []int32 Content string AddToken bool } diff --git a/convert/tokenizer_test.go 
b/convert/tokenizer_test.go index c6ef9732f..813096fd9 100644 --- a/convert/tokenizer_test.go +++ b/convert/tokenizer_test.go @@ -247,6 +247,67 @@ func TestParseTokenizer(t *testing.T) { Pre: "default", }, }, + { + name: "generation config eos token ids", + fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ + "tokenizer.json": strings.NewReader(`{ + "added_tokens": [ + { + "id": 0, + "content": "", + "special": true + }, + { + "id": 1, + "content": "", + "special": true + }, + { + "id": 2, + "content": "", + "special": true + }, + { + "id": 3, + "content": "", + "special": true + } + ], + "model": { + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3 + } + } + }`), + "tokenizer_config.json": strings.NewReader(`{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": "", + "eos_token": "" + }`), + "generation_config.json": strings.NewReader(`{ + "bos_token_id": 0, + "eos_token_id": [1, 2, 3] + }`), + }), + specialTokenTypes: []string{"pad", "eos", "bos", "unk"}, + want: &Tokenizer{ + Vocabulary: &Vocabulary{ + Model: "gpt2", + Tokens: []string{"", "", "", ""}, + Scores: []float32{0, 1, 2, 3}, + Types: []int32{3, 3, 3, 3}, + }, + SpecialVocabulary: []*SpecialVocabulary{ + {Type: "eos", Content: "", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false}, + {Type: "bos", Content: "", ID: 0, AddToken: true}, + }, + Pre: "default", + }, + }, } for _, tt := range cases { diff --git a/llama/llama.go b/llama/llama.go index ccd63b5a4..9cae1df2d 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -684,7 +684,7 @@ type Grammar struct { mu sync.Mutex } -func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar { +func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar { cGrammar := C.CString(grammar) defer C.free(unsafe.Pointer(cGrammar)) @@ -704,7 +704,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke cEogTokens[i] = C.uint32_t(token) } - g := 
C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens))) + g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens))) if g == nil { return nil } diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index d418f6827..a9ec285bc 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -43,8 +43,14 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + c.Ints("tokenizer.ggml.eos_token_ids"), + int32(c.Uint("tokenizer.ggml.eos_token_id")), + int32(c.Uint("tokenizer.ggml.eot_token_id")), + ), }, ), Layers: make([]Layer, c.Uint("block_count")), diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go index bf396b6a0..9485e10c4 100644 --- a/model/models/gemma3/model.go +++ b/model/models/gemma3/model.go @@ -60,12 +60,14 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(1), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - EOT: int32(106), - AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false), + EOS: append( + 
c.Ints("tokenizer.ggml.eos_token_ids"), + int32(c.Uint("tokenizer.ggml.eos_token_id")), + int32(c.Uint("tokenizer.ggml.eot_token_id")), + ), }, ), ImageProcessor: newImageProcessor(c), diff --git a/model/models/gemma3/model_text.go b/model/models/gemma3/model_text.go index c1e843d8f..b372ae934 100644 --- a/model/models/gemma3/model_text.go +++ b/model/models/gemma3/model_text.go @@ -21,7 +21,6 @@ type TextConfig struct { type TextModel struct { model.Base - model.SentencePieceModel TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []TextLayer `gguf:"blk"` @@ -45,15 +44,6 @@ func newTextModel(c fs.Config) *TextModel { numBlocks := int(c.Uint("block_count")) m := TextModel{ - SentencePieceModel: model.NewSentencePieceModel( - &model.Vocabulary{ - Values: c.Strings("tokenizer.ggml.tokens"), - Scores: c.Floats("tokenizer.ggml.scores"), - Types: c.Ints("tokenizer.ggml.token_type"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), - }, - ), Layers: make([]TextLayer, numBlocks), TextConfig: &TextConfig{ hiddenSize: int(c.Uint("embedding_length")), diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 3e5a54278..84843b1dd 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -43,10 +43,14 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + c.Ints("tokenizer.ggml.eos_token_ids"), + int32(c.Uint("tokenizer.ggml.eos_token_id")), + int32(c.Uint("tokenizer.ggml.eot_token_id")), + ), }, ), Layers: make([]Layer, c.Uint("block_count")), diff --git 
a/model/models/llama4/model.go b/model/models/llama4/model.go index 632d313ec..d3aee52d0 100644 --- a/model/models/llama4/model.go +++ b/model/models/llama4/model.go @@ -41,10 +41,14 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + c.Ints("tokenizer.ggml.eos_token_ids"), + int32(c.Uint("tokenizer.ggml.eos_token_id")), + int32(c.Uint("tokenizer.ggml.eot_token_id")), + ), }, ), ImageProcessor: newImageProcessor(c), diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go index f749fdcd2..e6571a563 100644 --- a/model/models/mistral3/model.go +++ b/model/models/mistral3/model.go @@ -16,6 +16,8 @@ import ( type Model struct { model.Base + model.BytePairEncoding + *TextModel *VisionModel `gguf:"v,vision"` *MultiModalProjector `gguf:"mm"` @@ -36,6 +38,22 @@ func New(c fs.Config) (model.Model, error) { } m := &Model{ + BytePairEncoding: model.NewBytePairEncoding( + c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), + &model.Vocabulary{ + Values: c.Strings("tokenizer.ggml.tokens"), + Types: c.Ints("tokenizer.ggml.token_type"), + Merges: c.Strings("tokenizer.ggml.merges"), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + c.Ints("tokenizer.ggml.eos_token_ids"), + 
int32(c.Uint("tokenizer.ggml.eos_token_id")), + int32(c.Uint("tokenizer.ggml.eot_token_id")), + ), + }, + ), TextModel: textModel, VisionModel: newVisionModel(c), ImageProcessor: newImageProcessor(c), diff --git a/model/models/mistral3/model_text.go b/model/models/mistral3/model_text.go index 1bf72acd8..565b001a7 100644 --- a/model/models/mistral3/model_text.go +++ b/model/models/mistral3/model_text.go @@ -21,7 +21,6 @@ type TextOptions struct { type TextModel struct { model.Base - model.BytePairEncoding TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []Layer `gguf:"blk"` @@ -148,18 +147,6 @@ func NewTextModel(c fs.Config) (*TextModel, error) { } textModel := &TextModel{ - BytePairEncoding: model.NewBytePairEncoding( - c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), - &model.Vocabulary{ - Values: c.Strings("tokenizer.ggml.tokens"), - Types: c.Ints("tokenizer.ggml.token_type"), - Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)), - AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)), - AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - }, - ), Layers: make([]Layer, c.Uint("block_count")), TextOptions: &TextOptions{ hiddenSize: int(c.Uint("embedding_length")), diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 149876c9c..00e79f3d6 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -45,10 +45,14 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: 
int32(c.Uint("tokenizer.ggml.eos_token_id")), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + c.Ints("tokenizer.ggml.eos_token_ids"), + int32(c.Uint("tokenizer.ggml.eos_token_id")), + int32(c.Uint("tokenizer.ggml.eot_token_id")), + ), }, ), ImageProcessor: newImageProcessor(c), diff --git a/model/process_text.go b/model/process_text.go index 90b220a2e..fd5fa79fd 100644 --- a/model/process_text.go +++ b/model/process_text.go @@ -41,8 +41,8 @@ type Vocabulary struct { Scores []float32 Merges []string - BOS, EOS, EOT int32 - AddBOS, AddEOS, AddEOT bool + BOS, EOS []int32 + AddBOS, AddEOS bool specialOnce sync.Once special []string @@ -57,14 +57,36 @@ type Vocabulary struct { func (v *Vocabulary) Is(id int32, special Special) bool { switch special { case SpecialBOS: - return id == v.BOS + return slices.Contains(v.BOS, id) case SpecialEOS: - return id == v.EOS || id == v.EOT + return slices.Contains(v.EOS, id) default: return false } } +func (v *Vocabulary) addSpecials(ids []int32) []int32 { + if v.AddBOS && len(v.BOS) > 0 { + if slices.Contains(v.BOS, ids[0]) { + slog.Warn("adding bos token to prompt which already has it", "id", v.BOS) + } + + slog.Debug("adding bos token to prompt", "id", v.BOS) + ids = append([]int32{v.BOS[0]}, ids...) 
+ } + + if v.AddEOS && len(v.EOS) > 0 { + if slices.Contains(v.EOS, ids[len(ids)-1]) { + slog.Warn("adding eos token to prompt which already has it", "id", v.EOS) + } + + slog.Debug("adding eos token to prompt", "id", v.EOS) + ids = append(ids, v.EOS[0]) + } + + return ids +} + func (v *Vocabulary) Encode(s string) int32 { v.valuesOnce.Do(func() { v.values = make(map[string]int32, len(v.Values)) @@ -303,23 +325,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) { } if addSpecial && len(ids) > 0 { - if bpe.vocab.AddBOS { - if ids[0] == bpe.vocab.BOS { - slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS) - } - - slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS) - ids = append([]int32{bpe.vocab.BOS}, ids...) - } - - if bpe.vocab.AddEOS { - if ids[len(ids)-1] == bpe.vocab.EOS { - slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS) - } - - slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS) - ids = append(ids, bpe.vocab.EOS) - } + ids = bpe.vocab.addSpecials(ids) } return ids, nil diff --git a/model/process_text_spm.go b/model/process_text_spm.go index 446d5d604..f8e623864 100644 --- a/model/process_text_spm.go +++ b/model/process_text_spm.go @@ -180,23 +180,7 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error) { } if addSpecial && len(ids) > 0 { - if spm.vocab.AddBOS { - if ids[0] == spm.vocab.BOS { - slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS) - } - - slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS) - ids = append([]int32{spm.vocab.BOS}, ids...) 
- } - - if spm.vocab.AddEOS { - if ids[len(ids)-1] == spm.vocab.EOS { - slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS) - } - - slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS) - ids = append(ids, spm.vocab.EOS) - } + ids = spm.vocab.addSpecials(ids) } return ids, nil diff --git a/sample/samplers.go b/sample/samplers.go index f0846c8dd..d395650d9 100644 --- a/sample/samplers.go +++ b/sample/samplers.go @@ -176,7 +176,7 @@ func NewGrammarSampler(model model.TextProcessor, grammarStr string) (*GrammarSa vocabIds[i] = uint32(i) } - grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, []uint32{uint32(model.Vocabulary().EOS), uint32(model.Vocabulary().EOT)}) + grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, model.Vocabulary().EOS) if grammar == nil { return nil, errors.New("sample: failed to initialize grammar") }