mirror of https://github.com/ollama/ollama.git (synced 2025-05-10 18:06:33 +02:00)

get eos_token_id from generation_config.json

parent 3b2d2c8326
commit 855de683ca

15 changed files with 176 additions and 77 deletions
@@ -53,8 +53,11 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
     }
 
     for _, sv := range t.SpecialVocabulary {
-        kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
         kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
+        kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
+        if len(sv.IDs) > 0 {
+            kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
+        }
     }
 
     return kv
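As an aside (not taken from the diff), a minimal standalone sketch of the KV entries the loop above would emit for an "eos" entry whose ID is 1 and whose IDs are [1, 2, 3]; the values are made up for illustration:

package main

import "fmt"

func main() {
    // Hypothetical special-vocabulary entry; real values come from the tokenizer configs.
    key, id, ids, addToken := "eos", uint32(1), []int32{1, 2, 3}, false

    kv := map[string]any{}
    kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", key)] = addToken
    kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", key)] = id
    if len(ids) > 0 {
        kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", key)] = ids
    }

    // Prints add_eos_token, eos_token_id, and the new eos_token_ids list.
    fmt.Println(kv)
}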
@@ -110,6 +110,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
     }
 
     if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
+        // noop
     } else if err != nil {
         return nil, err
     } else {
@@ -171,6 +172,34 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
         }
     }
 
+    if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
+    } else if err != nil {
+        return nil, err
+    } else {
+        defer f.Close()
+
+        var p map[string]json.RawMessage
+        if err := json.NewDecoder(f).Decode(&p); err != nil {
+            return nil, err
+        }
+
+        for _, st := range specialTokenTypes {
+            if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
+                var ids []int32
+                if err := json.Unmarshal(bts, &ids); err != nil {
+                    // value is not a list so the existing ID is used
+                    continue
+                }
+
+                if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
+                    return sv.Type == st
+                }); i >= 0 {
+                    t.SpecialVocabulary[i].IDs = ids
+                }
+            }
+        }
+    }
+
     return t, nil
 }
 
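For reference (not part of the commit), a minimal standalone sketch of why the []int32 unmarshal probe above distinguishes the two shapes eos_token_id can take in generation_config.json; the documents and values here are illustrative:

package main

import (
    "encoding/json"
    "fmt"
)

func main() {
    // eos_token_id may be a single integer or a list of integers.
    for _, doc := range []string{`{"eos_token_id": 2}`, `{"eos_token_id": [1, 2, 3]}`} {
        var p map[string]json.RawMessage
        if err := json.Unmarshal([]byte(doc), &p); err != nil {
            panic(err)
        }

        var ids []int32
        if err := json.Unmarshal(p["eos_token_id"], &ids); err != nil {
            // Not a list: the ID already parsed from tokenizer_config.json is kept.
            fmt.Println("scalar value, keeping existing ID")
            continue
        }
        fmt.Println("eos token ids:", ids)
    }
}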
@@ -278,6 +307,7 @@ func parseVocabulary(fsys fs.FS) (*Vocabulary, error) {
 type SpecialVocabulary struct {
     Type     string
     ID       int
+    IDs      []int32
     Content  string
     AddToken bool
 }
@@ -247,6 +247,67 @@ func TestParseTokenizer(t *testing.T) {
                 Pre: "default",
             },
         },
+        {
+            name: "generation config eos token ids",
+            fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+                "tokenizer.json": strings.NewReader(`{
+                    "added_tokens": [
+                        {
+                            "id": 0,
+                            "content": "<bos>",
+                            "special": true
+                        },
+                        {
+                            "id": 1,
+                            "content": "<eos>",
+                            "special": true
+                        },
+                        {
+                            "id": 2,
+                            "content": "<eot>",
+                            "special": true
+                        },
+                        {
+                            "id": 3,
+                            "content": "<eom>",
+                            "special": true
+                        }
+                    ],
+                    "model": {
+                        "vocab": {
+                            "<bos>": 0,
+                            "<eos>": 1,
+                            "<eot>": 2,
+                            "<eom>": 3
+                        }
+                    }
+                }`),
+                "tokenizer_config.json": strings.NewReader(`{
+                    "add_bos_token": true,
+                    "add_eos_token": false,
+                    "bos_token": "<bos>",
+                    "eos_token": "<eos>"
+                }`),
+                "generation_config.json": strings.NewReader(`{
+                    "bos_token_id": 0,
+                    "eos_token_id": [1, 2, 3]
+                }`),
+            }),
+            specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
+            want: &Tokenizer{
+                Vocabulary: &Vocabulary{
+                    Model:  "gpt2",
+                    Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
+                    Scores: []float32{0, 1, 2, 3},
+                    Types:  []int32{3, 3, 3, 3},
+                },
+                SpecialVocabulary: []*SpecialVocabulary{
+                    {Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
+                    {Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
+                },
+                Pre: "default",
+            },
+        },
     }
 
     for _, tt := range cases {
@@ -684,7 +684,7 @@ type Grammar struct {
     mu sync.Mutex
 }
 
-func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
+func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar {
     cGrammar := C.CString(grammar)
     defer C.free(unsafe.Pointer(cGrammar))
 
@@ -704,7 +704,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke
         cEogTokens[i] = C.uint32_t(token)
     }
 
-    g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
+    g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens)))
     if g == nil {
         return nil
     }
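A side note on the rewritten grammar_init call above: unsafe.SliceData (Go 1.20+) yields a pointer to a slice's backing array without the &s[0] expression, which would panic on an empty slice. A small plain-Go sketch, independent of the cgo types used in the diff:

package main

import (
    "fmt"
    "unsafe"
)

func main() {
    tokens := []uint32{0, 1, 2}
    p := unsafe.SliceData(tokens) // *uint32 pointing at tokens[0]
    fmt.Println(*p == tokens[0])  // true

    var empty []uint32
    fmt.Println(unsafe.SliceData(empty) == nil) // true for a nil slice, no panic
}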
@@ -43,8 +43,14 @@ func New(c fs.Config) (model.Model, error) {
                 Values: c.Strings("tokenizer.ggml.tokens"),
                 Scores: c.Floats("tokenizer.ggml.scores"),
                 Types:  c.Ints("tokenizer.ggml.token_type"),
-                BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-                EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+                AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+                BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+                AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+                EOS: append(
+                    c.Ints("tokenizer.ggml.eos_token_ids"),
+                    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+                    int32(c.Uint("tokenizer.ggml.eot_token_id")),
+                ),
             },
         ),
         Layers: make([]Layer, c.Uint("block_count")),
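For context on the EOS: append(...) pattern used here and in the model constructors below: it merges the optional tokenizer.ggml.eos_token_ids list with the single eos/eot ids into one slice. A rough sketch with hard-coded stand-ins for the fs.Config lookups (the ids are illustrative, not from any real model):

package main

import "fmt"

func main() {
    // Stand-ins for c.Ints("tokenizer.ggml.eos_token_ids"),
    // c.Uint("tokenizer.ggml.eos_token_id") and c.Uint("tokenizer.ggml.eot_token_id").
    eosTokenIDs := []int32{128001, 128008}
    eosTokenID := int32(128001)
    eotTokenID := int32(128009)

    eos := append(eosTokenIDs, eosTokenID, eotTokenID)
    fmt.Println(eos) // [128001 128008 128001 128009]
}

Duplicate ids can appear in the merged slice; the membership checks via slices.Contains in Vocabulary.Is (further down in this diff) are unaffected by that.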
@@ -60,12 +60,14 @@ func New(c fs.Config) (model.Model, error) {
                 Values: c.Strings("tokenizer.ggml.tokens"),
                 Scores: c.Floats("tokenizer.ggml.scores"),
                 Types:  c.Ints("tokenizer.ggml.token_type"),
-                BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
                 AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-                EOS:    int32(1),
+                BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
                 AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-                EOT:    int32(106),
-                AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false),
+                EOS: append(
+                    c.Ints("tokenizer.ggml.eos_token_ids"),
+                    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+                    int32(c.Uint("tokenizer.ggml.eot_token_id")),
+                ),
             },
         ),
         ImageProcessor: newImageProcessor(c),
@@ -21,7 +21,6 @@ type TextConfig struct {
 
 type TextModel struct {
     model.Base
-    model.SentencePieceModel
 
     TokenEmbedding *nn.Embedding `gguf:"token_embd"`
     Layers         []TextLayer   `gguf:"blk"`
@@ -45,15 +44,6 @@ func newTextModel(c fs.Config) *TextModel {
     numBlocks := int(c.Uint("block_count"))
 
     m := TextModel{
-        SentencePieceModel: model.NewSentencePieceModel(
-            &model.Vocabulary{
-                Values: c.Strings("tokenizer.ggml.tokens"),
-                Scores: c.Floats("tokenizer.ggml.scores"),
-                Types:  c.Ints("tokenizer.ggml.token_type"),
-                BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-                EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-            },
-        ),
         Layers: make([]TextLayer, numBlocks),
         TextConfig: &TextConfig{
             hiddenSize: int(c.Uint("embedding_length")),
@@ -43,10 +43,14 @@ func New(c fs.Config) (model.Model, error) {
                 Values: c.Strings("tokenizer.ggml.tokens"),
                 Types:  c.Ints("tokenizer.ggml.token_type"),
                 Merges: c.Strings("tokenizer.ggml.merges"),
-                BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
                 AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-                EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+                BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
                 AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+                EOS: append(
+                    c.Ints("tokenizer.ggml.eos_token_ids"),
+                    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+                    int32(c.Uint("tokenizer.ggml.eot_token_id")),
+                ),
             },
         ),
         Layers: make([]Layer, c.Uint("block_count")),
@@ -41,10 +41,14 @@ func New(c fs.Config) (model.Model, error) {
                 Values: c.Strings("tokenizer.ggml.tokens"),
                 Types:  c.Ints("tokenizer.ggml.token_type"),
                 Merges: c.Strings("tokenizer.ggml.merges"),
-                BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
                 AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-                EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+                BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
                 AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+                EOS: append(
+                    c.Ints("tokenizer.ggml.eos_token_ids"),
+                    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+                    int32(c.Uint("tokenizer.ggml.eot_token_id")),
+                ),
             },
         ),
         ImageProcessor: newImageProcessor(c),
@@ -16,6 +16,8 @@ import (
 
 type Model struct {
     model.Base
+    model.BytePairEncoding
+
     *TextModel
     *VisionModel         `gguf:"v,vision"`
     *MultiModalProjector `gguf:"mm"`
@@ -36,6 +38,22 @@ func New(c fs.Config) (model.Model, error) {
     }
 
     m := &Model{
+        BytePairEncoding: model.NewBytePairEncoding(
+            c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+            &model.Vocabulary{
+                Values: c.Strings("tokenizer.ggml.tokens"),
+                Types:  c.Ints("tokenizer.ggml.token_type"),
+                Merges: c.Strings("tokenizer.ggml.merges"),
+                AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+                BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+                AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+                EOS: append(
+                    c.Ints("tokenizer.ggml.eos_token_ids"),
+                    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+                    int32(c.Uint("tokenizer.ggml.eot_token_id")),
+                ),
+            },
+        ),
         TextModel:      textModel,
         VisionModel:    newVisionModel(c),
         ImageProcessor: newImageProcessor(c),
@@ -21,7 +21,6 @@ type TextOptions struct {
 
 type TextModel struct {
     model.Base
-    model.BytePairEncoding
 
     TokenEmbedding *nn.Embedding `gguf:"token_embd"`
     Layers         []Layer       `gguf:"blk"`
@@ -148,18 +147,6 @@ func NewTextModel(c fs.Config) (*TextModel, error) {
     }
 
     textModel := &TextModel{
-        BytePairEncoding: model.NewBytePairEncoding(
-            c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-            &model.Vocabulary{
-                Values: c.Strings("tokenizer.ggml.tokens"),
-                Types:  c.Ints("tokenizer.ggml.token_type"),
-                Merges: c.Strings("tokenizer.ggml.merges"),
-                BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
-                AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-                EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
-                AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-            },
-        ),
         Layers: make([]Layer, c.Uint("block_count")),
         TextOptions: &TextOptions{
             hiddenSize: int(c.Uint("embedding_length")),
@@ -45,10 +45,14 @@ func New(c fs.Config) (model.Model, error) {
                 Values: c.Strings("tokenizer.ggml.tokens"),
                 Types:  c.Ints("tokenizer.ggml.token_type"),
                 Merges: c.Strings("tokenizer.ggml.merges"),
-                BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
                 AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-                EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+                BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
                 AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+                EOS: append(
+                    c.Ints("tokenizer.ggml.eos_token_ids"),
+                    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+                    int32(c.Uint("tokenizer.ggml.eot_token_id")),
+                ),
             },
         ),
         ImageProcessor: newImageProcessor(c),
@@ -41,8 +41,8 @@ type Vocabulary struct {
     Scores []float32
     Merges []string
 
-    BOS, EOS, EOT          int32
-    AddBOS, AddEOS, AddEOT bool
+    BOS, EOS       []int32
+    AddBOS, AddEOS bool
 
     specialOnce sync.Once
     special     []string
@@ -57,14 +57,36 @@ type Vocabulary struct {
 func (v *Vocabulary) Is(id int32, special Special) bool {
     switch special {
     case SpecialBOS:
-        return id == v.BOS
+        return slices.Contains(v.BOS, id)
     case SpecialEOS:
-        return id == v.EOS || id == v.EOT
+        return slices.Contains(v.EOS, id)
     default:
         return false
     }
 }
 
+func (v *Vocabulary) addSpecials(ids []int32) []int32 {
+    if v.AddBOS && len(v.BOS) > 0 {
+        if slices.Contains(v.BOS, ids[0]) {
+            slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
+        }
+
+        slog.Debug("adding bos token to prompt", "id", v.BOS)
+        ids = append([]int32{v.BOS[0]}, ids...)
+    }
+
+    if v.AddEOS && len(v.EOS) > 0 {
+        if slices.Contains(v.BOS, ids[len(ids)-1]) {
+            slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
+        }
+
+        slog.Debug("adding eos token to prompt", "id", v.EOS)
+        ids = append(ids, v.EOS[0])
+    }
+
+    return ids
+}
+
 func (v *Vocabulary) Encode(s string) int32 {
     v.valuesOnce.Do(func() {
         v.values = make(map[string]int32, len(v.Values))
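A small self-contained sketch (simplified stand-in type, illustrative ids, logging omitted) of how the slice-valued BOS/EOS fields behave in the two helpers changed above:

package main

import (
    "fmt"
    "slices"
)

// Simplified stand-in for model.Vocabulary with the new slice-valued fields.
type vocab struct {
    BOS, EOS       []int32
    AddBOS, AddEOS bool
}

func (v *vocab) isEOS(id int32) bool { return slices.Contains(v.EOS, id) }

func (v *vocab) addSpecials(ids []int32) []int32 {
    if v.AddBOS && len(v.BOS) > 0 {
        ids = append([]int32{v.BOS[0]}, ids...) // only the first BOS id is prepended
    }
    if v.AddEOS && len(v.EOS) > 0 {
        ids = append(ids, v.EOS[0]) // only the first EOS id is appended
    }
    return ids
}

func main() {
    v := &vocab{BOS: []int32{0}, EOS: []int32{1, 2, 3}, AddBOS: true}
    fmt.Println(v.addSpecials([]int32{10, 11})) // [0 10 11]
    fmt.Println(v.isEOS(2), v.isEOS(9))         // true false
}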
@@ -303,23 +325,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
     }
 
     if addSpecial && len(ids) > 0 {
-        if bpe.vocab.AddBOS {
-            if ids[0] == bpe.vocab.BOS {
-                slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS)
-            }
-
-            slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS)
-            ids = append([]int32{bpe.vocab.BOS}, ids...)
-        }
-
-        if bpe.vocab.AddEOS {
-            if ids[len(ids)-1] == bpe.vocab.EOS {
-                slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS)
-            }
-
-            slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS)
-            ids = append(ids, bpe.vocab.EOS)
-        }
+        ids = bpe.vocab.addSpecials(ids)
     }
 
     return ids, nil
@@ -180,23 +180,7 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
     }
 
     if addSpecial && len(ids) > 0 {
-        if spm.vocab.AddBOS {
-            if ids[0] == spm.vocab.BOS {
-                slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS)
-            }
-
-            slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS)
-            ids = append([]int32{spm.vocab.BOS}, ids...)
-        }
-
-        if spm.vocab.AddEOS {
-            if ids[len(ids)-1] == spm.vocab.EOS {
-                slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS)
-            }
-
-            slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS)
-            ids = append(ids, spm.vocab.EOS)
-        }
+        ids = spm.vocab.addSpecials(ids)
     }
 
     return ids, nil
@@ -176,7 +176,7 @@ func NewGrammarSampler(model model.TextProcessor, grammarStr string) (*GrammarSa
         vocabIds[i] = uint32(i)
     }
 
-    grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, []uint32{uint32(model.Vocabulary().EOS), uint32(model.Vocabulary().EOT)})
+    grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, model.Vocabulary().EOS)
     if grammar == nil {
         return nil, errors.New("sample: failed to initialize grammar")
     }