diff --git a/cmd/cmd.go b/cmd/cmd.go index 58c6dbf22..0f8072f06 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -162,7 +162,11 @@ func CreateHandler(cmd *cobra.Command, args []string) error { if resp.Digest != "" { bar, ok := bars[resp.Digest] if !ok { - bar = progress.NewBar(fmt.Sprintf("pulling %s...", resp.Digest[7:19]), resp.Total, resp.Completed) + msg := resp.Status + if msg == "" { + msg = fmt.Sprintf("pulling %s...", resp.Digest[7:19]) + } + bar = progress.NewBar(msg, resp.Total, resp.Completed) bars[resp.Digest] = bar p.Add(resp.Digest, bar) } diff --git a/convert/convert.go b/convert/convert.go index ffcc2b8ab..249ec8077 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -4,9 +4,9 @@ import ( "encoding/json" "errors" "fmt" - "io" "io/fs" "log/slog" + "os" "slices" "strings" @@ -89,7 +89,7 @@ type ModelConverter interface { // KV maps parameters to LLM key-values KV(*Tokenizer) ggml.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. - Tensors([]Tensor) []ggml.Tensor + Tensors([]Tensor) []*ggml.Tensor // Replacements returns a list of string pairs to replace in tensor names. // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details Replacements() []string @@ -106,13 +106,13 @@ type AdapterConverter interface { // KV maps parameters to LLM key-values KV(ggml.KV) ggml.KV // Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here. - Tensors([]Tensor) []ggml.Tensor + Tensors([]Tensor) []*ggml.Tensor // Replacements returns a list of string pairs to replace in tensor names. // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details Replacements() []string } -func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error { +func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error { bts, err := fs.ReadFile(fsys, "adapter_config.json") if err != nil { return err @@ -147,14 +147,14 @@ func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error { return err } - return writeFile(ws, conv.KV(baseKV), conv.Tensors(ts)) + return writeFile(f, conv.KV(baseKV), conv.Tensors(ts)) } // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model. 
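The `Tensors` signature change from `[]ggml.Tensor` to `[]*ggml.Tensor` repeats across every converter in this patch. As a hedged sketch of what an implementation now looks like (the `exampleModel` type and its skip rule are illustrative, not part of the patch; it assumes the convert package's `Tensor` interface and the `fs/ggml` import):

```go
func (p *exampleModel) Tensors(ts []Tensor) []*ggml.Tensor {
	var out []*ggml.Tensor
	for _, t := range ts {
		// converters may drop tensors that should not reach the GGUF output
		if strings.HasSuffix(t.Name(), ".position_ids") {
			continue
		}

		// entries are pointers now, so WriteGGUF can set Offset on them in place
		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
			WriterTo: t,
		})
	}
	return out
}
```

`writeFile` still clones and reverses each shape before handing the slice to `ggml.WriteGGUF`, so converters keep returning shapes in source order.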
-func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { +func ConvertModel(fsys fs.FS, f *os.File) error { bts, err := fs.ReadFile(fsys, "config.json") if err != nil { return err @@ -239,13 +239,13 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { return err } - return writeFile(ws, conv.KV(t), conv.Tensors(ts)) + return writeFile(f, conv.KV(t), conv.Tensors(ts)) } -func writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error { +func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error { for i := range ts { ts[i].Shape = slices.Clone(ts[i].Shape) slices.Reverse(ts[i].Shape) } - return ggml.WriteGGUF(ws, kv, ts) + return ggml.WriteGGUF(f, kv, ts) } diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 8575652aa..a9f4b8a77 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV { return kv } -func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { if slices.Contains([]string{ "embeddings.position_ids", @@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor { continue } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_commandr.go b/convert/convert_commandr.go index 738a2cf3b..a909515bd 100644 --- a/convert/convert_commandr.go +++ b/convert/convert_commandr.go @@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) ggml.KV { return kv } -func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *commandrModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 2f329943e..26698d6a6 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) ggml.KV { return kv } -func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *gemmaModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { if !strings.HasPrefix(t.Name(), "v.") && strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_gemma2_adapter.go b/convert/convert_gemma2_adapter.go index 3494aa3f9..6299cd9e0 100644 --- a/convert/convert_gemma2_adapter.go +++ b/convert/convert_gemma2_adapter.go @@ -21,8 +21,8 @@ func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV { return kv } -func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { shape := t.Shape() if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || @@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor { t.SetRepacker(p.repack) } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 0caaa1949..e491a9d8d 100644 --- a/convert/convert_llama.go +++ 
b/convert/convert_llama.go @@ -126,11 +126,11 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV { return kv } -func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor if p.RopeScaling.factors != nil { - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: "rope_freqs.weight", Kind: 0, Shape: []uint64{uint64(len(p.RopeScaling.factors))}, @@ -145,7 +145,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor { } } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_llama4.go b/convert/convert_llama4.go index 26a230b33..3e3792339 100644 --- a/convert/convert_llama4.go +++ b/convert/convert_llama4.go @@ -88,13 +88,13 @@ func (p *llama4Model) Replacements() []string { } // Tensors implements ModelConverter. -func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor var textTensors []Tensor for _, t := range ts { if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") { - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), @@ -112,7 +112,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor { // clone tensor since we need separate repackers tt := t.Clone() tt.SetRepacker(p.repack(nil, nil, tensor.S(i*halfDim, (i+1)*halfDim))) - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: strings.ReplaceAll(tt.Name(), "ffn_gate_up_exps", name), Kind: tt.Kind(), Shape: newShape, @@ -125,7 +125,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor { t.SetRepacker(p.repack()) newShape := slices.Clone(t.Shape()) newShape[1], newShape[2] = newShape[2], newShape[1] - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: newShape, diff --git a/convert/convert_llama_adapter.go b/convert/convert_llama_adapter.go index 718ef047e..4cc451153 100644 --- a/convert/convert_llama_adapter.go +++ b/convert/convert_llama_adapter.go @@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV { return kv } -func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { shape := t.Shape() if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || @@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor { t.SetRepacker(p.repack) } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: shape, diff --git a/convert/convert_mistral.go b/convert/convert_mistral.go index 6c224ae4f..a6fd4c41a 100644 --- a/convert/convert_mistral.go +++ b/convert/convert_mistral.go @@ -89,8 +89,8 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV { return kv } -func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { if !strings.HasPrefix(t.Name(), "v.") { @@ -100,7 +100,7 @@ func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor { } } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_mixtral.go 
b/convert/convert_mixtral.go index 95a289f76..17580ff8f 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV { return kv } -func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor { +func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor { oldnew := []string{ "model.layers", "blk", "w1", "ffn_gate_exps", @@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor { return true }) - var out []ggml.Tensor + var out []*ggml.Tensor for n, e := range experts { // TODO(mxyng): sanity check experts - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: n, Kind: e[0].Kind(), Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...), diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index d1c13795a..5a6756053 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) ggml.KV { return kv } -func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor { +func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor { var addRopeFactors sync.Once - out := make([]ggml.Tensor, 0, len(ts)+2) + out := make([]*ggml.Tensor, 0, len(ts)+2) for _, t := range ts { if strings.HasPrefix(t.Name(), "blk.0.") { addRopeFactors.Do(func() { - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: "rope_factors_long.weight", Kind: 0, Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))}, WriterTo: p.RopeScaling.LongFactor, - }, ggml.Tensor{ + }, &ggml.Tensor{ Name: "rope_factors_short.weight", Kind: 0, Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))}, @@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor { }) } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_qwen2.go b/convert/convert_qwen2.go index 18278802e..edcb82e29 100644 --- a/convert/convert_qwen2.go +++ b/convert/convert_qwen2.go @@ -45,10 +45,10 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV { return kv } -func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (q *qwen2Model) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 1acf552f5..735d41fa5 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -36,12 +36,12 @@ func (kv KV) ParameterCount() uint64 { return keyValue(kv, "general.parameter_count", uint64(0)) } -func (kv KV) FileType() fileType { +func (kv KV) FileType() FileType { if t := kv.Uint("general.file_type"); t > 0 { - return fileType(t) + return FileType(t) } - return fileTypeUnknown + return FileTypeUnknown } func (kv KV) BlockCount() uint64 { @@ -226,7 +226,11 @@ func (t Tensor) block() (n int) { } func (t Tensor) blockSize() uint64 { - switch t.Kind { + return (TensorType)(t.Kind).BlockSize() +} + +func (t TensorType) BlockSize() uint64 { + switch t { case 0, // F32 1, // F16 @@ -252,73 +256,77 @@ func (t Tensor) blockSize() uint64 { } func (t Tensor) typeSize() uint64 { - blockSize := t.blockSize() + return TensorType(t.Kind).TypeSize() +} - switch t.Kind { - case 0: // FP32 +func (t TensorType) TypeSize() uint64 { + blockSize := t.BlockSize() + + switch t { + case TensorTypeF32: return 4 - case 1: // FP16 + case TensorTypeF16: return 2 - case 2: // Q4_0 + case TensorTypeQ4_0: 
+ case TensorTypeQ4_0:
return 2 + blockSize/2 - case 3: // Q4_1 + case TensorTypeQ4_1: return 2 + 2 + blockSize/2 - case 6: // Q5_0 + case TensorTypeQ5_0: return 2 + 4 + blockSize/2 - case 7: // Q5_1 + case TensorTypeQ5_1: return 2 + 2 + 4 + blockSize/2 - case 8: // Q8_0 + case TensorTypeQ8_0: return 2 + blockSize - case 9: // Q8_1 + case TensorTypeQ8_1: return 2 + 2 + blockSize - case 10: // Q2_K + case TensorTypeQ2_K: return blockSize/16 + blockSize/4 + 2 + 2 - case 11: // Q3_K + case TensorTypeQ3_K: return blockSize/8 + blockSize/4 + 12 + 2 - case 12: // Q4_K + case TensorTypeQ4_K: return 2 + 2 + 12 + blockSize/2 - case 13: // Q5_K + case TensorTypeQ5_K: return 2 + 2 + 12 + blockSize/8 + blockSize/2 - case 14: // Q6_K + case TensorTypeQ6_K: return blockSize/2 + blockSize/4 + blockSize/16 + 2 - case 15: // Q8_K + case TensorTypeQ8_K: return 4 + blockSize + 2*blockSize/16 - case 16: // IQ2_XXS + case tensorTypeIQ2_XXS: return 2 + 2*blockSize/8 - case 17: // IQ2_XS + case tensorTypeIQ2_XS: return 2 + 2*blockSize/8 + blockSize/32 - case 18: // IQ3_XXS + case tensorTypeIQ3_XXS: return 2 + blockSize/4 + blockSize/8 - case 19: // IQ1_S + case tensorTypeIQ1_S: return 2 + blockSize/8 + blockSize/16 - case 20: // IQ4_NL + case tensorTypeIQ4_NL: return 2 + blockSize/2 - case 21: // IQ3_S + case tensorTypeIQ3_S: return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4 - case 22: // IQ2_S + case tensorTypeIQ2_S: return 2 + blockSize/4 + blockSize/16 - case 23: // IQ4_XS + case tensorTypeIQ4_XS: return 2 + 2 + blockSize/2 + blockSize/64 - case 24: // I8 + case TensorTypeI8: return 1 - case 25: // I16 + case TensorTypeI16: return 2 - case 26: // I32 + case TensorTypeI32: return 4 - case 27: // I64 + case TensorTypeI64: return 8 - case 28: // F64 + case TensorTypeF64: return 8 - case 29: // IQ1_M + case tensorTypeIQ1_M: return blockSize/8 + blockSize/16 + blockSize/32 - case 30: // BF16 + case TensorTypeBF16: return 2 default: return 0 } } -func (t Tensor) parameters() uint64 { +func (t Tensor) Elements() uint64 { var count uint64 = 1 for _, n := range t.Shape { count *= n @@ -327,11 +335,11 @@ func (t Tensor) parameters() uint64 { } func (t Tensor) Size() uint64 { - return t.parameters() * t.typeSize() / t.blockSize() + return t.Elements() * t.typeSize() / t.blockSize() } func (t Tensor) Type() string { - return fileType(t.Kind).String() + return TensorType(t.Kind).String() } type container interface { @@ -480,7 +488,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri var ropeFreqsCount uint64 if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok { if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok { - ropeFreqsCount = ropeFreqsWeights.parameters() + ropeFreqsCount = ropeFreqsWeights.Elements() } } diff --git a/fs/ggml/gguf.go b/fs/ggml/gguf.go index b7029bc38..b3120820d 100644 --- a/fs/ggml/gguf.go +++ b/fs/ggml/gguf.go @@ -9,8 +9,12 @@ import ( "io" "log/slog" "maps" + "os" + "runtime" "slices" "strings" + + "golang.org/x/sync/errgroup" ) type containerGGUF struct { @@ -225,7 +229,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { } llm.tensors = append(llm.tensors, &tensor) - llm.parameters += tensor.parameters() + llm.parameters += tensor.Elements() } // patch KV with parameter count @@ -488,25 +492,38 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error { return err } + if t == ggufTypeString { + for _, e := range any(s).([]string) { + if err := binary.Write(w, binary.LittleEndian, uint64(len(e))); err != nil { + return err + } + + if err := 
binary.Write(w, binary.LittleEndian, []byte(e)); err != nil { + return err + } + } + return nil + } + return binary.Write(w, binary.LittleEndian, s) } -func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { +func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error { alignment := kv.Uint("general.alignment", 32) - if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil { + if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil { return err } - if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil { + if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil { return err } - if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil { + if err := binary.Write(f, binary.LittleEndian, uint64(len(ts))); err != nil { return err } - if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil { + if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil { return err } @@ -514,12 +531,12 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { slices.Sort(keys) for _, key := range keys { - if err := ggufWriteKV(ws, key, kv[key]); err != nil { + if err := ggufWriteKV(f, key, kv[key]); err != nil { return err } } - slices.SortStableFunc(ts, func(a, b Tensor) int { + slices.SortStableFunc(ts, func(a, b *Tensor) int { if i, j := a.block(), b.block(); i < 0 && j > 0 { return 1 } else if i > 0 && j < 0 { @@ -530,22 +547,34 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { }) var s uint64 - for _, t := range ts { - t.Offset = s - if err := ggufWriteTensorInfo(ws, t); err != nil { + for i := range ts { + ts[i].Offset = s + if err := ggufWriteTensorInfo(f, ts[i]); err != nil { return err } - s += t.Size() + s += ts[i].Size() s += uint64(ggufPadding(int64(s), int64(alignment))) } + offset, err := f.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + offset += ggufPadding(offset, int64(alignment)) + + var g errgroup.Group + g.SetLimit(runtime.GOMAXPROCS(0)) + // TODO consider reducing if tensors size * gomaxprocs is larger than free memory for _, t := range ts { - if err := ggufWriteTensor(ws, t, int64(alignment)); err != nil { + t := t + w := io.NewOffsetWriter(f, offset+int64(t.Offset)) + g.Go(func() error { + _, err = t.WriteTo(w) return err - } + }) } - return nil + return g.Wait() } func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { @@ -560,8 +589,10 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { var err error switch v := v.(type) { - case uint32: + case uint32, FileType: err = writeGGUF(ws, ggufTypeUint32, v) + case uint64: + err = writeGGUF(ws, ggufTypeUint64, v) case float32: err = writeGGUF(ws, ggufTypeFloat32, v) case bool: @@ -570,32 +601,20 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { err = writeGGUFString(ws, v) case []int32: err = writeGGUFArray(ws, ggufTypeInt32, v) + case *array[int32]: + err = writeGGUFArray(ws, ggufTypeInt32, v.values) case []uint32: err = writeGGUFArray(ws, ggufTypeUint32, v) + case *array[uint32]: + err = writeGGUFArray(ws, ggufTypeUint32, v.values) case []float32: err = writeGGUFArray(ws, ggufTypeFloat32, v) + case *array[float32]: + err = writeGGUFArray(ws, ggufTypeFloat32, v.values) case []string: - if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil { 
- return err - } - - for _, e := range v { - if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil { - return err - } - } + err = writeGGUFArray(ws, ggufTypeString, v) + case *array[string]: + err = writeGGUFArray(ws, ggufTypeString, v.values) default: return fmt.Errorf("improper type for '%s'", k) } @@ -603,7 +622,7 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { return err } -func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error { +func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error { slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset) if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil { return err @@ -630,20 +649,6 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error { return binary.Write(ws, binary.LittleEndian, t.Offset) } -func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error { - offset, err := ws.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil { - return err - } - - _, err = t.WriteTo(ws) - return err -} - func ggufPadding(offset, align int64) int64 { return (align - offset%align) % align } diff --git a/fs/ggml/gguf_test.go b/fs/ggml/gguf_test.go index 22e7a5514..10d3b6849 100644 --- a/fs/ggml/gguf_test.go +++ b/fs/ggml/gguf_test.go @@ -18,7 +18,7 @@ func TestWriteGGUF(t *testing.T) { if err := WriteGGUF(w, KV{ "general.alignment": uint32(16), - }, []Tensor{ + }, []*Tensor{ {Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, {Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, {Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, diff --git a/fs/ggml/type.go b/fs/ggml/type.go index 7265afbcd..8172c46d9 100644 --- a/fs/ggml/type.go +++ b/fs/ggml/type.go @@ -1,185 +1,341 @@ package ggml -import "fmt" - -type fileType uint32 - -const ( - fileTypeF32 fileType = iota - fileTypeF16 - fileTypeQ4_0 - fileTypeQ4_1 - fileTypeQ4_1_F16 - fileTypeQ4_2 // unused - fileTypeQ4_3 // unused - fileTypeQ8_0 - fileTypeQ5_0 - fileTypeQ5_1 - fileTypeQ2_K - fileTypeQ3_K_S - fileTypeQ3_K_M - fileTypeQ3_K_L - fileTypeQ4_K_S - fileTypeQ4_K_M - fileTypeQ5_K_S - fileTypeQ5_K_M - fileTypeQ6_K - fileTypeIQ2_XXS - fileTypeIQ2_XS - fileTypeQ2_K_S - fileTypeIQ3_XS - fileTypeIQ3_XXS - fileTypeIQ1_S - fileTypeIQ4_NL - fileTypeIQ3_S - fileTypeIQ3_M - fileTypeIQ2_S - fileTypeIQ2_M - fileTypeIQ4_XS - fileTypeIQ1_M - fileTypeBF16 - - fileTypeUnknown +import ( + "fmt" + "log/slog" + "strings" ) -func ParseFileType(s string) (fileType, error) { +// FileType is the Go equivalent to llama_ftype used for gguf file typing +type FileType uint32 + +const ( + FileTypeF32 FileType = iota + FileTypeF16 + FileTypeQ4_0 + FileTypeQ4_1 + fileTypeQ4_1_F16 // unused by GGML + fileTypeQ4_2 // unused by GGML + fileTypeQ4_3 // unused by GGML + FileTypeQ8_0 + FileTypeQ5_0 + FileTypeQ5_1 + FileTypeQ2_K + FileTypeQ3_K_S + FileTypeQ3_K_M + FileTypeQ3_K_L + FileTypeQ4_K_S + FileTypeQ4_K_M + FileTypeQ5_K_S + FileTypeQ5_K_M + FileTypeQ6_K + fileTypeIQ2_XXS // not supported by ollama + fileTypeIQ2_XS // not supported by ollama + FileTypeQ2_K_S + fileTypeIQ3_XS // not supported by ollama + fileTypeIQ3_XXS // not supported by ollama + fileTypeIQ1_S // 
not supported by ollama + fileTypeIQ4_NL // not supported by ollama + fileTypeIQ3_S // not supported by ollama + fileTypeIQ3_M // not supported by ollama + fileTypeIQ2_S // not supported by ollama + fileTypeIQ2_M // not supported by ollama + fileTypeIQ4_XS // not supported by ollama + fileTypeIQ1_M // not supported by ollama + FileTypeBF16 + fileTypeQ4_0_4_4 // unused by GGML + fileTypeQ4_0_4_8 // unused by GGML + fileTypeQ4_0_8_8 // unused by GGML + fileTypeTQ1_0 // not supported by ollama + fileTypeTQ2_0 // not supported by ollama + + FileTypeUnknown = 1024 +) + +// ParseFileType parses the provided GGUF file type +// Only Ollama supported types are considered valid +func ParseFileType(s string) (FileType, error) { switch s { case "F32": - return fileTypeF32, nil + return FileTypeF32, nil case "F16": - return fileTypeF16, nil + return FileTypeF16, nil case "Q4_0": - return fileTypeQ4_0, nil + return FileTypeQ4_0, nil case "Q4_1": - return fileTypeQ4_1, nil - case "Q4_1_F16": - return fileTypeQ4_1_F16, nil + return FileTypeQ4_1, nil case "Q8_0": - return fileTypeQ8_0, nil + return FileTypeQ8_0, nil case "Q5_0": - return fileTypeQ5_0, nil + return FileTypeQ5_0, nil case "Q5_1": - return fileTypeQ5_1, nil + return FileTypeQ5_1, nil case "Q2_K": - return fileTypeQ2_K, nil + return FileTypeQ2_K, nil case "Q3_K_S": - return fileTypeQ3_K_S, nil + return FileTypeQ3_K_S, nil case "Q3_K_M": - return fileTypeQ3_K_M, nil + return FileTypeQ3_K_M, nil case "Q3_K_L": - return fileTypeQ3_K_L, nil + return FileTypeQ3_K_L, nil case "Q4_K_S": - return fileTypeQ4_K_S, nil - case "Q4_K_M": - return fileTypeQ4_K_M, nil + return FileTypeQ4_K_S, nil + case "Q4_K_M", "Q4_K": + return FileTypeQ4_K_M, nil case "Q5_K_S": - return fileTypeQ5_K_S, nil - case "Q5_K_M": - return fileTypeQ5_K_M, nil + return FileTypeQ5_K_S, nil + case "Q5_K_M", "Q5_K": + return FileTypeQ5_K_M, nil case "Q6_K": - return fileTypeQ6_K, nil - case "IQ2_XXS": - return fileTypeIQ2_XXS, nil - case "IQ2_XS": - return fileTypeIQ2_XS, nil + return FileTypeQ6_K, nil case "Q2_K_S": - return fileTypeQ2_K_S, nil - case "IQ3_XS": - return fileTypeIQ3_XS, nil - case "IQ3_XXS": - return fileTypeIQ3_XXS, nil - case "IQ1_S": - return fileTypeIQ1_S, nil - case "IQ4_NL": - return fileTypeIQ4_NL, nil - case "IQ3_S": - return fileTypeIQ3_S, nil - case "IQ3_M": - return fileTypeIQ3_M, nil - case "IQ2_S": - return fileTypeIQ2_S, nil - case "IQ2_M": - return fileTypeIQ2_M, nil - case "IQ4_XS": - return fileTypeIQ4_XS, nil - case "IQ1_M": - return fileTypeIQ1_M, nil + return FileTypeQ2_K_S, nil case "BF16": - return fileTypeBF16, nil + return FileTypeBF16, nil default: - return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s) + supportedFileTypes := []FileType{ + FileTypeF32, + FileTypeF16, + FileTypeQ4_K_S, + FileTypeQ4_K_M, + FileTypeQ8_0, + // fsggml.FileTypeBF16, // TODO + } + strs := make([]string, len(supportedFileTypes)) + for i := range supportedFileTypes { + strs[i] = supportedFileTypes[i].String() + } + + return FileTypeUnknown, fmt.Errorf("unsupported quantization type %s - supported types are %s", s, strings.Join(strs, ", ")) } } -func (t fileType) String() string { +func (t FileType) String() string { switch t { - case fileTypeF32: + case FileTypeF32: return "F32" - case fileTypeF16: + case FileTypeF16: return "F16" - case fileTypeQ4_0: + case FileTypeQ4_0: return "Q4_0" - case fileTypeQ4_1: + case FileTypeQ4_1: return "Q4_1" - case fileTypeQ4_1_F16: - return "Q4_1_F16" - case fileTypeQ8_0: + case FileTypeQ8_0: return "Q8_0" - case 
fileTypeQ5_0: + case FileTypeQ5_0: return "Q5_0" - case fileTypeQ5_1: + case FileTypeQ5_1: return "Q5_1" - case fileTypeQ2_K: + case FileTypeQ2_K: return "Q2_K" - case fileTypeQ3_K_S: + case FileTypeQ3_K_S: return "Q3_K_S" - case fileTypeQ3_K_M: + case FileTypeQ3_K_M: return "Q3_K_M" - case fileTypeQ3_K_L: + case FileTypeQ3_K_L: return "Q3_K_L" - case fileTypeQ4_K_S: + case FileTypeQ4_K_S: return "Q4_K_S" - case fileTypeQ4_K_M: + case FileTypeQ4_K_M: return "Q4_K_M" - case fileTypeQ5_K_S: + case FileTypeQ5_K_S: return "Q5_K_S" - case fileTypeQ5_K_M: + case FileTypeQ5_K_M: return "Q5_K_M" - case fileTypeQ6_K: + case FileTypeQ6_K: return "Q6_K" - case fileTypeIQ2_XXS: - return "IQ2_XXS" - case fileTypeIQ2_XS: - return "IQ2_XS" - case fileTypeQ2_K_S: + case FileTypeQ2_K_S: return "Q2_K_S" - case fileTypeIQ3_XS: - return "IQ3_XS" - case fileTypeIQ3_XXS: - return "IQ3_XXS" - case fileTypeIQ1_S: - return "IQ1_S" - case fileTypeIQ4_NL: - return "IQ4_NL" - case fileTypeIQ3_S: - return "IQ3_S" - case fileTypeIQ3_M: - return "IQ3_M" - case fileTypeIQ2_S: - return "IQ2_S" - case fileTypeIQ4_XS: - return "IQ4_XS" - case fileTypeIQ2_M: - return "IQ2_M" - case fileTypeIQ1_M: - return "IQ1_M" - case fileTypeBF16: + case FileTypeBF16: return "BF16" default: return "unknown" } } -func (t fileType) Value() uint32 { +func (t FileType) Value() uint32 { return uint32(t) } + +func (ftype FileType) ToTensorType() TensorType { + switch ftype { + case FileTypeF32: + return TensorTypeF32 + case FileTypeF16: + return TensorTypeF16 + case FileTypeQ4_0: + return TensorTypeQ4_0 + case FileTypeQ4_1: + return TensorTypeQ4_1 + case FileTypeQ8_0: + return TensorTypeQ8_0 + case FileTypeQ5_0: + return TensorTypeQ5_0 + case FileTypeQ5_1: + return TensorTypeQ5_1 + case FileTypeQ2_K: + return TensorTypeQ2_K + case FileTypeQ3_K_S: + return TensorTypeQ3_K + case FileTypeQ3_K_M: + return TensorTypeQ3_K + case FileTypeQ3_K_L: + return TensorTypeQ3_K + case FileTypeQ4_K_S: + return TensorTypeQ4_K + case FileTypeQ4_K_M: + return TensorTypeQ4_K + case FileTypeQ5_K_S: + return TensorTypeQ5_K + case FileTypeQ5_K_M: + return TensorTypeQ5_K + case FileTypeQ6_K: + return TensorTypeQ6_K + case FileTypeQ2_K_S: + return TensorTypeQ2_K + case FileTypeBF16: + return TensorTypeBF16 + default: + slog.Warn("unsupported file type", "type", ftype) + return 0 // F32 + } +} + +// TensorType is equivalent to ggml_type for individual tensor types +// Note: these are not the same as FileType +type TensorType uint32 + +const ( + TensorTypeF32 TensorType = iota + TensorTypeF16 + TensorTypeQ4_0 + TensorTypeQ4_1 + tensorTypeQ4_2 // unused by GGML + tensorTypeQ4_3 // unused by GGML + TensorTypeQ5_0 + TensorTypeQ5_1 + TensorTypeQ8_0 + TensorTypeQ8_1 + TensorTypeQ2_K + TensorTypeQ3_K + TensorTypeQ4_K + TensorTypeQ5_K + TensorTypeQ6_K + TensorTypeQ8_K + tensorTypeIQ2_XXS // not supported by ollama + tensorTypeIQ2_XS // not supported by ollama + tensorTypeIQ3_XXS // not supported by ollama + tensorTypeIQ1_S // not supported by ollama + tensorTypeIQ4_NL // not supported by ollama + tensorTypeIQ3_S // not supported by ollama + tensorTypeIQ2_S // not supported by ollama + tensorTypeIQ4_XS // not supported by ollama + TensorTypeI8 + TensorTypeI16 + TensorTypeI32 + TensorTypeI64 + TensorTypeF64 + tensorTypeIQ1_M // not supported by ollama + TensorTypeBF16 + tensorTypeQ4_0_4_4 // unused by GGML + tensorTypeQ4_0_4_8 // unused by GGML + tensorTypeQ4_0_8_8 // unused by GGML + tensorTypeTQ1_0 // not supported by ollama + tensorTypeTQ2_0 // not supported by ollama + 
tensorTypeIQ4_NL_4_4 // unused by GGML + tensorTypeIQ4_NL_4_8 // unused by GGML + tensorTypeIQ4_NL_8_8 // unused by GGML +) + +// ParseFileType parses the provided GGUF file type +// Only Ollama supported types are considered valid +func ParseTensorType(s string) (TensorType, error) { + switch s { + case "F32": + return TensorTypeF32, nil + case "F16": + return TensorTypeF16, nil + case "Q4_0": + return TensorTypeQ4_0, nil + case "Q4_1": + return TensorTypeQ4_1, nil + case "Q5_0": + return TensorTypeQ5_0, nil + case "Q5_1": + return TensorTypeQ5_1, nil + case "Q8_0": + return TensorTypeQ8_0, nil + case "Q8_1": + return TensorTypeQ8_1, nil + case "Q2_K": + return TensorTypeQ2_K, nil + case "Q3_K": + return TensorTypeQ3_K, nil + case "Q4_K": + return TensorTypeQ4_K, nil + case "Q5_K": + return TensorTypeQ5_K, nil + case "Q6_K": + return TensorTypeQ6_K, nil + case "Q8_K": + return TensorTypeQ8_K, nil + case "F64": + return TensorTypeF64, nil + case "BF16": + return TensorTypeBF16, nil + default: + return 0, fmt.Errorf("unsupported quantization type %s", s) + } +} + +func (t TensorType) IsQuantized() bool { + switch t { + case TensorTypeF32, TensorTypeF16, TensorTypeBF16: + return false + default: + return true + } +} + +func (t TensorType) RowSize(ne uint64) uint64 { + return t.TypeSize() * ne / t.BlockSize() +} + +func (t TensorType) String() string { + switch t { + case TensorTypeF32: + return "F32" + case TensorTypeF16: + return "F16" + case TensorTypeQ4_0: + return "Q4_0" + case TensorTypeQ4_1: + return "Q4_1" + case TensorTypeQ5_0: + return "Q5_0" + case TensorTypeQ5_1: + return "Q5_1" + case TensorTypeQ8_0: + return "Q8_0" + case TensorTypeQ8_1: + return "Q8_1" + case TensorTypeQ2_K: + return "Q2_K" + case TensorTypeQ3_K: + return "Q3_K" + case TensorTypeQ4_K: + return "Q4_K" + case TensorTypeQ5_K: + return "Q5_K" + case TensorTypeQ6_K: + return "Q6_K" + case TensorTypeQ8_K: + return "Q8_K" + case TensorTypeF64: + return "F64" + case TensorTypeBF16: + return "BF16" + default: + return "unknown" + } +} diff --git a/integration/model_arch_test.go b/integration/model_arch_test.go index e094d3cea..6ce183d79 100644 --- a/integration/model_arch_test.go +++ b/integration/model_arch_test.go @@ -48,17 +48,6 @@ var ( } ) -func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) { - deadline, hasDeadline := t.Deadline() - if !hasDeadline { - return 8 * time.Minute, 10 * time.Minute - } else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 { - t.Skip("too little time") - return time.Duration(0), time.Duration(0) - } - return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second)) -} - func TestModelsGenerate(t *testing.T) { softTimeout, hardTimeout := getTimeouts(t) slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout) diff --git a/integration/quantization_test.go b/integration/quantization_test.go new file mode 100644 index 000000000..af9da0b62 --- /dev/null +++ b/integration/quantization_test.go @@ -0,0 +1,130 @@ +//go:build integration && models + +package integration + +import ( + "bytes" + "context" + "fmt" + "log/slog" + "strings" + "testing" + "time" + + "github.com/ollama/ollama/api" +) + +func TestQuantization(t *testing.T) { + sourceModels := []string{ + "qwen2.5:0.5b-instruct-fp16", + } + quantizations := []string{ + "Q8_0", + "Q4_K_S", + "Q4_K_M", + "Q4_K", + } + softTimeout, hardTimeout := getTimeouts(t) + started := time.Now() + slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout) + ctx, 
cancel := context.WithTimeout(context.Background(), hardTimeout) + defer cancel() + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + + for _, base := range sourceModels { + if err := PullIfMissing(ctx, client, base); err != nil { + t.Fatalf("pull failed %s", err) + } + for _, quant := range quantizations { + newName := fmt.Sprintf("%s__%s", base, quant) + t.Run(newName, func(t *testing.T) { + if time.Now().Sub(started) > softTimeout { + t.Skip("skipping remaining tests to avoid excessive runtime") + } + req := &api.CreateRequest{ + Model: newName, + Quantization: quant, + From: base, + } + fn := func(resp api.ProgressResponse) error { + // fmt.Print(".") + return nil + } + t.Logf("quantizing: %s -> %s", base, quant) + if err := client.Create(ctx, req, fn); err != nil { + t.Fatalf("create failed %s", err) + } + defer func() { + req := &api.DeleteRequest{ + Model: newName, + } + t.Logf("deleting: %s -> %s", base, quant) + if err := client.Delete(ctx, req); err != nil { + t.Logf("failed to clean up %s: %s", req.Model, err) + } + }() + // Check metadata on the model + resp, err := client.Show(ctx, &api.ShowRequest{Name: newName}) + if err != nil { + t.Fatalf("unable to show model: %s", err) + } + if !strings.Contains(resp.Details.QuantizationLevel, quant) { + t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel) + } + + stream := true + genReq := api.GenerateRequest{ + Model: newName, + Prompt: "why is the sky blue?", + KeepAlive: &api.Duration{Duration: 3 * time.Second}, + Options: map[string]any{ + "seed": 42, + "temperature": 0.0, + }, + Stream: &stream, + } + t.Logf("verifying: %s -> %s", base, quant) + + // Some smaller quantizations can cause models to have poor quality + // or get stuck in repetition loops, so we stop as soon as we have any matches + anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"} + reqCtx, reqCancel := context.WithCancel(ctx) + atLeastOne := false + var buf bytes.Buffer + genfn := func(response api.GenerateResponse) error { + buf.Write([]byte(response.Response)) + fullResp := strings.ToLower(buf.String()) + for _, resp := range anyResp { + if strings.Contains(fullResp, resp) { + atLeastOne = true + t.Log(fullResp) + reqCancel() + break + } + } + return nil + } + + done := make(chan int) + var genErr error + go func() { + genErr = client.Generate(reqCtx, &genReq, genfn) + done <- 0 + }() + + select { + case <-done: + if genErr != nil && !atLeastOne { + t.Fatalf("failed with %s request prompt %s ", genReq.Model, genReq.Prompt) + } + case <-ctx.Done(): + t.Error("outer test context done while waiting for generate") + } + + t.Logf("passed") + + }) + } + } +} diff --git a/integration/utils_test.go b/integration/utils_test.go index 5d44157bf..19f4d1bf8 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -359,3 +359,14 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) { } } } + +func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) { + deadline, hasDeadline := t.Deadline() + if !hasDeadline { + return 8 * time.Minute, 10 * time.Minute + } else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 { + t.Skip("too little time") + return time.Duration(0), time.Duration(0) + } + return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second)) +} diff --git a/llama/llama.cpp/src/llama-arch.cpp b/llama/llama.cpp/src/llama-arch.cpp index df42d1a57..eb7b5325e 100644 --- 
a/llama/llama.cpp/src/llama-arch.cpp +++ b/llama/llama.cpp/src/llama-arch.cpp @@ -74,7 +74,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_PLM, "plm" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" }, - { LLM_ARCH_MISTRAL3, "mistral3" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1607,22 +1606,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, - { - LLM_ARCH_MISTRAL3, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - } - }, { LLM_ARCH_UNKNOWN, { diff --git a/llama/llama.cpp/src/llama-arch.h b/llama/llama.cpp/src/llama-arch.h index bda9d0714..bc8a4f0bb 100644 --- a/llama/llama.cpp/src/llama-arch.h +++ b/llama/llama.cpp/src/llama-arch.h @@ -76,7 +76,6 @@ enum llm_arch { LLM_ARCH_CHAMELEON, LLM_ARCH_SOLAR, LLM_ARCH_WAVTOKENIZER_DEC, - LLM_ARCH_MISTRAL3, LLM_ARCH_PLM, LLM_ARCH_BAILINGMOE, LLM_ARCH_UNKNOWN, diff --git a/llama/llama.cpp/src/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp index ef70486d0..9d099f117 100644 --- a/llama/llama.cpp/src/llama-model.cpp +++ b/llama/llama.cpp/src/llama-model.cpp @@ -1437,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; - case LLM_ARCH_MISTRAL3: break; default: throw std::runtime_error("unsupported model architecture"); } @@ -13752,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_CHAMELEON: case LLM_ARCH_SOLAR: case LLM_ARCH_BAILINGMOE: - case LLM_ARCH_MISTRAL3: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 diff --git a/llama/llama.cpp/src/llama-quant.cpp b/llama/llama.cpp/src/llama-quant.cpp index 8ae6dde87..223e1f3f9 100644 --- a/llama/llama.cpp/src/llama-quant.cpp +++ b/llama/llama.cpp/src/llama-quant.cpp @@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // This used to be a regex, but has an extreme cost to compile times. bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? 
- // don't quantize vision stuff - quantize &= name.find("v.") == std::string::npos; - quantize &= name.find("mm.") == std::string::npos; - // quantize only 2D and 3D tensors (experts) quantize &= (ggml_n_dims(tensor) >= 2); diff --git a/llama/llama.go b/llama/llama.go index ccd63b5a4..40598f518 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -460,24 +460,6 @@ func (m *Model) NEmbd() int { return int(C.llama_model_n_embd(m.c)) } -func Quantize(infile, outfile string, ftype uint32) error { - cinfile := C.CString(infile) - defer C.free(unsafe.Pointer(cinfile)) - - coutfile := C.CString(outfile) - defer C.free(unsafe.Pointer(coutfile)) - - params := C.llama_model_quantize_default_params() - params.nthread = -1 - params.ftype = ftype - - if rc := C.llama_model_quantize(cinfile, coutfile, ¶ms); rc != 0 { - return fmt.Errorf("llama_model_quantize: %d", rc) - } - - return nil -} - // vision processing type ClipContext struct { c *C.struct_clip_ctx diff --git a/llama/patches/0016-add-model-quantizations.patch b/llama/patches/0016-add-model-quantizations.patch deleted file mode 100644 index 2e3be0c68..000000000 --- a/llama/patches/0016-add-model-quantizations.patch +++ /dev/null @@ -1,96 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Tue, 8 Apr 2025 20:39:32 -0700 -Subject: [PATCH] add model quantizations - -a temporary patch to add model quantization for -models not supported in llama.cpp ---- - src/llama-arch.cpp | 17 +++++++++++++++++ - src/llama-arch.h | 1 + - src/llama-model.cpp | 2 ++ - src/llama-quant.cpp | 4 ++++ - 4 files changed, 24 insertions(+) - -diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index eb7b5325..df42d1a5 100644 ---- a/src/llama-arch.cpp -+++ b/src/llama-arch.cpp -@@ -74,6 +74,7 @@ static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, - { LLM_ARCH_PLM, "plm" }, - { LLM_ARCH_BAILINGMOE, "bailingmoe" }, -+ { LLM_ARCH_MISTRAL3, "mistral3" }, - { LLM_ARCH_UNKNOWN, "(unknown)" }, - }; - -@@ -1606,6 +1607,22 @@ static const std::map> LLM_TENSOR_N - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, -+ { -+ LLM_ARCH_MISTRAL3, -+ { -+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, -+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, -+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, -+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, -+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, -+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, -+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, -+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, -+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, -+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, -+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, -+ } -+ }, - { - LLM_ARCH_UNKNOWN, - { -diff --git a/src/llama-arch.h b/src/llama-arch.h -index bc8a4f0b..bda9d071 100644 ---- a/src/llama-arch.h -+++ b/src/llama-arch.h -@@ -76,6 +76,7 @@ enum llm_arch { - LLM_ARCH_CHAMELEON, - LLM_ARCH_SOLAR, - LLM_ARCH_WAVTOKENIZER_DEC, -+ LLM_ARCH_MISTRAL3, - LLM_ARCH_PLM, - LLM_ARCH_BAILINGMOE, - LLM_ARCH_UNKNOWN, -diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 9d099f11..ef70486d 100644 ---- a/src/llama-model.cpp -+++ b/src/llama-model.cpp -@@ -1437,6 +1437,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - default: type = LLM_TYPE_UNKNOWN; - } - } break; -+ case LLM_ARCH_MISTRAL3: break; - default: throw std::runtime_error("unsupported model architecture"); - } - -@@ -13751,6 +13752,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { - case 
LLM_ARCH_CHAMELEON: - case LLM_ARCH_SOLAR: - case LLM_ARCH_BAILINGMOE: -+ case LLM_ARCH_MISTRAL3: - return LLAMA_ROPE_TYPE_NORM; - - // the pairs of head values are offset by n_rot/2 -diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp -index 223e1f3f..8ae6dde8 100644 ---- a/src/llama-quant.cpp -+++ b/src/llama-quant.cpp -@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? - -+ // don't quantize vision stuff -+ quantize &= name.find("v.") == std::string::npos; -+ quantize &= name.find("mm.") == std::string::npos; -+ - // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); - diff --git a/llama/patches/0017-add-ollama-vocab-for-grammar-support.patch b/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch similarity index 100% rename from llama/patches/0017-add-ollama-vocab-for-grammar-support.patch rename to llama/patches/0016-add-ollama-vocab-for-grammar-support.patch diff --git a/llama/patches/0018-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch b/llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch similarity index 100% rename from llama/patches/0018-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch rename to llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch diff --git a/llm/memory_test.go b/llm/memory_test.go index 213784a02..1d4f7a98c 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -25,7 +25,7 @@ func TestEstimateGPULayers(t *testing.T) { defer f.Close() inputLayerCount := 5 - tensors := []ggml.Tensor{ + tensors := []*ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index cd760643c..aace1335d 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -312,6 +312,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, g, ctx := errgroup.WithContext(ctx) g.SetLimit(runtime.GOMAXPROCS(0)) for _, t := range meta.Tensors().Items() { + t := t g.Go(func() error { tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name]))) for i := range tts { diff --git a/ml/backend/ggml/quantization.go b/ml/backend/ggml/quantization.go new file mode 100644 index 000000000..bb31e455d --- /dev/null +++ b/ml/backend/ggml/quantization.go @@ -0,0 +1,83 @@ +package ggml + +// #cgo CPPFLAGS: -I${SRCDIR}/ggml/src +// #include +// #include +// #include "ggml.h" +// #include "ggml-cpu.h" +// #include "ggml-backend.h" +// #include "ggml-quants.h" +import "C" + +import ( + "unsafe" + + fsggml "github.com/ollama/ollama/fs/ggml" +) + +// convertToF32 converts (dequantizes) the raw data to F32 so we can then quantize it +func ConvertToF32(data []byte, dtype uint32, nelements uint64) []float32 { + f32s := make([]float32, nelements) + elems := C.int64_t(nelements) + switch dtype { + case C.GGML_TYPE_F16: + C.ggml_fp16_to_fp32_row((*C.uint16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case 
C.GGML_TYPE_Q4_0: + C.dequantize_row_q4_0((*C.block_q4_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q4_1: + C.dequantize_row_q4_1((*C.block_q4_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q5_0: + C.dequantize_row_q5_0((*C.block_q5_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q5_1: + C.dequantize_row_q5_1((*C.block_q5_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q8_0: + C.dequantize_row_q8_0((*C.block_q8_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q2_K: + C.dequantize_row_q2_K((*C.block_q2_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q3_K: + C.dequantize_row_q3_K((*C.block_q3_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q4_K: + C.dequantize_row_q4_K((*C.block_q4_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q5_K: + C.dequantize_row_q5_K((*C.block_q5_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q6_K: + C.dequantize_row_q6_K((*C.block_q6_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_BF16: + C.ggml_bf16_to_fp32_row((*C.ggml_bf16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + default: + panic("unsupported quantization format") + } + return f32s +} + +func Quantize(newType fsggml.TensorType, f32s []float32, shape []uint64) []byte { + buf := make([]byte, len(f32s)*4) // upper bound on size + nPerRow := C.int64_t(shape[0]) + nrows := C.int64_t(1) + if len(shape) > 1 { + nrows = C.int64_t(shape[1]) + } + shape2 := C.int64_t(1) + if len(shape) > 2 { + shape2 = C.int64_t(shape[2]) + } + nelements_matrix := nPerRow * nrows + newSize := C.size_t(0) + for i03 := C.int64_t(0); i03 < shape2; i03++ { + f32s_03 := i03 * nelements_matrix + buf_03 := C.int64_t(C.ggml_row_size(uint32(newType), nPerRow)) * i03 * nrows + newSize += C.ggml_quantize_chunk( + uint32(newType), + (*C.float)(&f32s[f32s_03]), + unsafe.Pointer((uintptr)(unsafe.Pointer(&buf[0]))+uintptr(buf_03)), + 0, + nrows, + nPerRow, + nil) + } + return buf[:newSize] +} + +func QuantizationVersion() uint32 { + return uint32(C.GGML_QNT_VERSION) +} diff --git a/parser/parser_test.go b/parser/parser_test.go index f2aa5ab79..c848479c6 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -765,7 +765,7 @@ func getSHA256Digest(t *testing.T, r io.Reader) (string, int64) { return fmt.Sprintf("sha256:%x", h.Sum(nil)), n } -func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) { +func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) { t.Helper() f, err := os.CreateTemp(t.TempDir(), "testbin.*.gguf") diff --git a/server/create.go b/server/create.go index 50e669db0..7ffa60a22 100644 --- a/server/create.go +++ b/server/create.go @@ -15,6 +15,7 @@ import ( "path/filepath" "slices" "strings" + "sync/atomic" "github.com/gin-gonic/gin" @@ -23,7 +24,6 @@ import ( "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" - "github.com/ollama/ollama/llama" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" @@ -425,9 +425,14 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML, func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.ProgressResponse)) (*layerGGML, error) { ft := 
layer.GGML.KV().FileType() - fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType)}) - - want, err := ggml.ParseFileType(quantizeType) + var doneBytes atomic.Uint64 + totalBytes := uint64(layer.Size) - layer.GGML.Tensors().Offset + fnWrap := func(n uint64) { + done := doneBytes.Add(n) + progress := float32(done) / float32(totalBytes) + fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))}) + } + ftype, err := ggml.ParseFileType(quantizeType) if err != nil { return nil, err } @@ -436,6 +441,11 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr if err != nil { return nil, err } + fp, err := os.Open(blob) + if err != nil { + return nil, err + } + defer fp.Close() temp, err := os.CreateTemp(filepath.Dir(blob), quantizeType) if err != nil { @@ -444,15 +454,15 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr defer temp.Close() defer os.Remove(temp.Name()) - if err := llama.Quantize(blob, temp.Name(), uint32(want)); err != nil { + if err := quantize(fp, temp, layer.GGML, ftype, fnWrap); err != nil { return nil, err } - + temp.Seek(0, io.SeekStart) + fn(api.ProgressResponse{Status: "verifying conversion"}) newLayer, err := NewLayer(temp, layer.MediaType) if err != nil { return nil, err } - if _, err := temp.Seek(0, io.SeekStart); err != nil { return nil, err } @@ -462,7 +472,6 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err)) return nil, err } - return &layerGGML{newLayer, f}, nil } diff --git a/server/model.go b/server/model.go index e733fbdb1..2149ff855 100644 --- a/server/model.go +++ b/server/model.go @@ -64,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe } defer blob.Close() - f, _, err := ggml.Decode(blob, 1024) + f, _, err := ggml.Decode(blob, -1) if err != nil { return nil, err } diff --git a/server/quantization.go b/server/quantization.go new file mode 100644 index 000000000..80bc093db --- /dev/null +++ b/server/quantization.go @@ -0,0 +1,274 @@ +package server + +import ( + "fmt" + "io" + "log/slog" + "maps" + "os" + "strings" + "unsafe" + + fsggml "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/ml/backend/ggml" +) + +type quantizer struct { + *os.File + offset uint64 + from, to *fsggml.Tensor + progressFn func(n uint64) +} + +func (q quantizer) WriteTo(w io.Writer) (int64, error) { + quantize := q.from.Kind != q.to.Kind + sr := io.NewSectionReader(q, int64(q.offset), int64(q.from.Size())) + if !quantize { + n, err := io.Copy(w, sr) + q.progressFn(q.from.Size()) + return n, err + } + data, err := io.ReadAll(sr) + if err != nil { + slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err) + return 0, fmt.Errorf("unable to read tensor %s from %s: %s", q.from.Name, q.Name(), err) + } + var f32s []float32 + newType := fsggml.TensorType(q.to.Kind) + if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 { + f32s = unsafe.Slice((*float32)(unsafe.Pointer(&data[0])), q.from.Elements()) + } else { + f32s = ggml.ConvertToF32(data, q.from.Kind, q.from.Elements()) + } + data = ggml.Quantize(newType, f32s, q.from.Shape) + n, err := w.Write(data) + q.progressFn(q.from.Size()) + return int64(n), err +} + +type quantizeState struct { + nAttnV int // Number of attn_*v* weight tensors + nFfnDown int // 
Number of ffn_down tensors
+	iAttnV    int  // Running counter of number of attn_v tensors that have been processed
+	iFfnDown  int  // Running counter of number of ffn_down tensors that have been processed
+	hasOutput bool // used to figure out if a model shares tok_embd with the output weight
+}
+
+func useMoreBits(iLayer, nLayers int) bool {
+	return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
+}
+
+func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType {
+	// Ported from llama_tensor_get_type, removed unsupported quantization types
+	nExperts := max(1, kv.Uint("expert_count", 0))
+	if name == "output.weight" || name == "output_norm.weight" || (!qs.hasOutput && name == "token_embd.weight") {
+		nx := shape[0]
+		qk_k := newType.BlockSize()
+		if nx%qk_k != 0 {
+			newType = fsggml.TensorTypeQ8_0
+		} else if newType != fsggml.TensorTypeQ8_0 {
+			newType = fsggml.TensorTypeQ6_K
+		}
+	} else if strings.Contains(name, "attn_v.weight") {
+		if ftype == fsggml.FileTypeQ2_K {
+			if kv.GQA() >= 4 {
+				newType = fsggml.TensorTypeQ4_K
+			} else {
+				newType = fsggml.TensorTypeQ3_K
+			}
+		} else if ftype == fsggml.FileTypeQ2_K_S && kv.GQA() >= 4 {
+			newType = fsggml.TensorTypeQ4_K
+		} else if ftype == fsggml.FileTypeQ3_K_M {
+			if qs.iAttnV < 2 {
+				newType = fsggml.TensorTypeQ5_K
+			} else {
+				newType = fsggml.TensorTypeQ4_K
+			}
+		} else if ftype == fsggml.FileTypeQ3_K_L {
+			newType = fsggml.TensorTypeQ5_K
+		} else if (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ5_K_M) &&
+			useMoreBits(qs.iAttnV, qs.nAttnV) {
+			newType = fsggml.TensorTypeQ6_K
+		} else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
+			newType = fsggml.TensorTypeQ5_K
+		}
+
+		// TODO
+		// if (qs.model.type == LLM_TYPE_70B) {
+		//	// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+		//	// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+		//	// nearly negligible increase in model size by quantizing this tensor with more bits:
+		//	if (newType == GGML_TYPE_Q3_K || newType == GGML_TYPE_Q4_K) newType = GGML_TYPE_Q5_K;
+		// }
+
+		if nExperts == 8 {
+			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+			newType = fsggml.TensorTypeQ8_0
+		}
+		qs.iAttnV++
+	} else if strings.Contains(name, "attn_k.weight") {
+		if nExperts == 8 {
+			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+			newType = fsggml.TensorTypeQ8_0
+		}
+	} else if strings.Contains(name, "ffn_down") {
+		iLayer := qs.iFfnDown
+		n_layer := qs.nFfnDown
+		if ftype == fsggml.FileTypeQ2_K {
+			newType = fsggml.TensorTypeQ3_K
+		} else if ftype == fsggml.FileTypeQ2_K_S {
+			if iLayer < n_layer/8 {
+				newType = fsggml.TensorTypeQ4_K
+			}
+		} else if ftype == fsggml.FileTypeQ3_K_M {
+			if iLayer < n_layer/16 {
+				newType = fsggml.TensorTypeQ5_K
+			} else if useMoreBits(iLayer, n_layer) {
+				newType = fsggml.TensorTypeQ4_K
+			} else {
+				newType = fsggml.TensorTypeQ3_K
+			}
+		} else if ftype == fsggml.FileTypeQ3_K_L {
+			newType = fsggml.TensorTypeQ5_K
+		} else if ftype == fsggml.FileTypeQ4_K_M {
+			if useMoreBits(iLayer, n_layer) {
+				newType = fsggml.TensorTypeQ6_K
+			}
+		} else if ftype == fsggml.FileTypeQ5_K_M && useMoreBits(iLayer, n_layer) {
+			newType = fsggml.TensorTypeQ6_K
+		} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
+			newType = fsggml.TensorTypeQ5_K
+		}
+		qs.iFfnDown++
+	} else if strings.Contains(name, "attn_output.weight") {
+		if nExperts == 8 {
+			if ftype == fsggml.FileTypeQ2_K || ftype == fsggml.FileTypeQ3_K_S || ftype == fsggml.FileTypeQ3_K_M ||
+				ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
+				newType = fsggml.TensorTypeQ5_K
+			}
+		} else {
+			if ftype == fsggml.FileTypeQ2_K {
+				newType = fsggml.TensorTypeQ3_K
+			} else if ftype == fsggml.FileTypeQ3_K_M {
+				newType = fsggml.TensorTypeQ4_K
+			} else if ftype == fsggml.FileTypeQ3_K_L {
+				newType = fsggml.TensorTypeQ5_K
+			}
+		}
+	} else if strings.Contains(name, "attn_qkv.weight") {
+		if ftype == fsggml.FileTypeQ3_K_M || ftype == fsggml.FileTypeQ3_K_L {
+			newType = fsggml.TensorTypeQ4_K
+		} else if ftype == fsggml.FileTypeQ4_K_M {
+			newType = fsggml.TensorTypeQ5_K
+		} else if ftype == fsggml.FileTypeQ5_K_M {
+			newType = fsggml.TensorTypeQ6_K
+		}
+	}
+
+	if newType.IsQuantized() {
+		nx := shape[0]
+		ny := uint64(1)
+		if len(shape) > 1 {
+			ny = shape[1]
+		}
+		qk_k := newType.BlockSize()
+		if nx%qk_k != 0 {
+			slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String()))
+			newType = fsggml.TensorTypeF16
+		}
+	}
+	return newType
+}
+
+func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error {
+	kv := maps.Clone(orig.KV())
+	kv["general.file_type"] = newFileType
+	// kv["general.quantization_version"] = ggml.QuantizationVersion()
+	qs := &quantizeState{}
+	// Build up the quantize state so newType can adjust types
+	layerCount := 0
+	for k, l := range orig.Tensors().GroupLayers() {
+		if strings.HasPrefix(k, "blk.") {
+			layerCount++
+		}
+		for _, tensor := range l {
+			if strings.Contains(tensor.Name, "attn_v.weight") ||
+				strings.Contains(tensor.Name, "attn_qkv.weight") ||
+				strings.Contains(tensor.Name, "attn_kv_b.weight") {
+				qs.nAttnV++
+			} else if tensor.Name == "output.weight" {
+				qs.hasOutput = true
+			}
+		}
+	}
+	qs.nFfnDown = layerCount
+
+	origTensors := orig.Tensors().Items()
+	outputTensors := make([]*fsggml.Tensor, len(origTensors))
+	for i, tensor := range origTensors {
+		tensor := tensor
+		newType := newType(tensor, kv, qs, newFileType)
+		newTensor := &fsggml.Tensor{
+			Name:  tensor.Name,
+			Shape: tensor.Shape,
+			Kind:  uint32(newType),
+		}
+		outputTensors[i] = newTensor
+		outputTensors[i].WriterTo = quantizer{
+			File:       in,
+			offset:     orig.Tensors().Offset + tensor.Offset,
+			from:       tensor,
+			to:         newTensor,
+			progressFn: progressFn,
+		}
+	}
+	return fsggml.WriteGGUF(out, kv, outputTensors)
+}
+
+func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.FileType) fsggml.TensorType {
+	defaultType := ftype.ToTensorType()
+	name := t.Name
+	quantize := strings.HasSuffix(name, "weight")
+
+	// don't quantize vision stuff
+	quantize = quantize && (!strings.Contains(name, "v.") || strings.Contains(name, "_v."))
+	quantize = quantize && !strings.Contains(name, "mm.")
+
+	// quantize only 2D and 3D tensors (experts)
+	quantize = quantize && (len(t.Shape) >= 2)
+
+	// do not quantize norm tensors
+	quantize = quantize && !strings.Contains(name, "_norm.weight")
+
+	// do not quantize expert gating tensors
+	quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight")
+
+	// do not quantize positional embeddings and token types (BERT)
+	quantize = quantize && (name != "position_embd.weight")
+	quantize = quantize && (name != "token_types.weight")
+
+	// do not quantize Mamba's small yet 2D weights
+	// NOTE: can't use LLM_TN here because the layer number is not known
+	quantize = quantize && !strings.Contains(name, "ssm_conv1d.weight")
+
+	// do not quantize RWKV's time_mix_first tensors
+	quantize = quantize && !strings.Contains(name, "time_mix_first.weight")
+	quantize = quantize && !strings.Contains(name, "time_mix_w1.weight")
+	quantize = quantize && !strings.Contains(name, "time_mix_w2.weight")
+	quantize = quantize && !strings.Contains(name, "time_mix_decay_w1.weight")
+	quantize = quantize && !strings.Contains(name, "time_mix_decay_w2.weight")
+	quantize = quantize && !strings.Contains(name, "time_mix_lerp_fused.weight")
+
+	// do not quantize relative position bias (T5)
+	quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")
+
+	newType := fsggml.TensorType(t.Kind)
+	if quantize {
+		// get more optimal quantization type based on the tensor shape, layer, etc.
+ newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype) + if newType != defaultType { + slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", newType) + } + } + return newType +} diff --git a/server/quantization_test.go b/server/quantization_test.go new file mode 100644 index 000000000..b7e133507 --- /dev/null +++ b/server/quantization_test.go @@ -0,0 +1,882 @@ +package server + +import ( + "bytes" + "fmt" + "math" + "os" + "strings" + "testing" + + fsggml "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/ml/backend/ggml" +) + +func TestGetTensorNewType(t *testing.T) { + cases := []struct { + name string + kv map[string]any + qs quantizeState + newType fsggml.TensorType + tensor_name string + shape []uint64 + ftype fsggml.FileType + expected fsggml.TensorType + expectedPanic string + }{ + { + name: "output_unsupported", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "output.weight", + shape: []uint64{100, 100}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeF16, + }, + { + name: "output_Q8", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "output.weight", + shape: []uint64{1024, 1024}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "attn_v.weight_q4_k", + kv: map[string]any{ + "general.architecture": "foo", + "foo.attention.head_count": uint32(4), + "foo.attention.head_count_kv": uint32(1), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "attn_v.weight_q3_k", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K, + expected: fsggml.TensorTypeQ3_K, + }, + { + name: "attn_v.weight_q2_k_s_q4_k", + kv: map[string]any{ + "general.architecture": "foo", + "foo.attention.head_count": uint32(4), + "foo.attention.head_count_kv": uint32(1), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K_S, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "attn_v.weight_q3_k_m", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_v.weight_q3_k_m_i", + qs: quantizeState{ + iAttnV: 2, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "attn_v.weight_q3_k_l", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_L, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_v.weight_q4_k_m", + qs: quantizeState{ + iAttnV: 2, + nAttnV: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "attn_v.weight_q4_k_s", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_S, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_v.weight_8_expert", + 
qs: quantizeState{}, + kv: map[string]any{ + "general.architecture": "foo", + "foo.expert_count": uint32(8), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeQ8_0, + }, + { + name: "attn_k.weight_8_expert", + qs: quantizeState{}, + kv: map[string]any{ + "general.architecture": "foo", + "foo.expert_count": uint32(8), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_k.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeQ8_0, + }, + { + name: "ffn_down_q2_k", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K, + expected: fsggml.TensorTypeQ3_K, + }, + { + name: "ffn_down_q2_k_s", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K_S, + expected: fsggml.TensorTypeQ4_0, + }, + { + name: "ffn_down_q2_k_s_layers", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K_S, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "ffn_down_q3_k_m_base", + qs: quantizeState{ + iFfnDown: 1, + nFfnDown: 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ3_K, + }, + { + name: "ffn_down_q3_k_m_16", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 16, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "ffn_down_q3_k_m_8", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "ffn_down_q3_k_l", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_L, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "ffn_down_q4_k_m", + qs: quantizeState{ + iFfnDown: 1, + nFfnDown: 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ4_0, + }, + { + name: "ffn_down_q4_k_m_6", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "ffn_down_q5_k_m", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ5_K_M, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "ffn_down_q4_k_s", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_S, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_output.weight_8_expert", + qs: quantizeState{}, + 
kv: map[string]any{ + "general.architecture": "foo", + "foo.expert_count": uint32(8), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_output.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_output.weight_q2", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_output.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K, + expected: fsggml.TensorTypeQ3_K, + }, + { + name: "attn_output.weight_q3_k_m", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_output.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "attn_output.weight_q3_k_l", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_output.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_L, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_qkv.weight_q3_k_m", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_qkv.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "attn_qkv.weight_q4_k_m", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_qkv.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_qkv.weight_q5_k_m", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_qkv.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ5_K_M, + expected: fsggml.TensorTypeQ6_K, + }, + } + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + if tt.expectedPanic != "" { + defer func() { + e := recover() + if !strings.Contains(fmt.Sprintf("%v", e), tt.expectedPanic) { + t.Fatalf("incorrect panic\ngot: %v\nexpected: %s", e, tt.expectedPanic) + } + }() + } else { + defer func() { + e := recover() + if e != nil { + t.Fatalf("hit unexpected panic %v", e) + } + }() + } + ret := getTensorNewType(tt.kv, &tt.qs, tt.newType, tt.tensor_name, tt.shape, tt.ftype) + if ret != tt.expected { + t.Fatalf("incorrect type returned\ngot: %d\nexpected: %d", ret, tt.expected) + } + }) + } +} + +func TestQuantizeModel(t *testing.T) { + cases := []struct { + name string + kv map[string]any + tensors []*fsggml.Tensor + newType string + expectedTensorTypes map[string]fsggml.TensorType + }{ + { + name: "f16_q4_k", + kv: map[string]any{ + "general.architecture": "foo", + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{512, 2}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + { + Name: "output.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{256, 4}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + }, + newType: "Q4_K", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.0.attn.weight": fsggml.TensorTypeQ4_K, + "output.weight": fsggml.TensorTypeQ6_K, + }, + }, + { + 
name: "f32_q4_k", + kv: map[string]any{ + "general.architecture": "foo", + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.0.attn_v.weight", Kind: uint32(fsggml.TensorTypeF32), + Offset: uint64(0), Shape: []uint64{512, 2}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF32], quantBytes[fsggml.TensorTypeF32]...), quantBytes[fsggml.TensorTypeF32]...), quantBytes[fsggml.TensorTypeF32]...), + ), + }, + { + Name: "output.weight", Kind: uint32(fsggml.TensorTypeF32), + Offset: uint64(0), Shape: []uint64{512}, + WriterTo: bytes.NewReader(append(quantBytes[fsggml.TensorTypeF32], quantBytes[fsggml.TensorTypeF32]...)), + }, + }, + newType: "Q4_K", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.0.attn_v.weight": fsggml.TensorTypeQ6_K, + "output.weight": fsggml.TensorTypeF32, + }, + }, + { + name: "f16_q8_0", + kv: map[string]any{ + "general.architecture": "foo", + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{32, 16, 2}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + { + Name: "output.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{256, 4}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + }, + newType: "Q8_0", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.0.attn.weight": fsggml.TensorTypeQ8_0, + "output.weight": fsggml.TensorTypeQ8_0, + }, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + f, err := os.CreateTemp(t.TempDir(), tt.name) + if err != nil { + t.Fatal(err.Error()) + } + defer f.Close() + err = fsggml.WriteGGUF(f, tt.kv, tt.tensors) + if err != nil { + t.Fatalf("failed to create initial model: %s", err) + } + fp, err := os.Open(f.Name()) + if err != nil { + t.Fatal(err.Error()) + } + defer fp.Close() + meta, _, err := fsggml.Decode(fp, -1) + if err != nil { + t.Fatal(err.Error()) + } + progressCalled := false + progress := func(n uint64) { + // fmt.Fprintf(os.Stderr, "progress: %f\n", p) + progressCalled = true + } + tmp, err := os.CreateTemp(t.TempDir(), tt.name+".out") + if err != nil { + t.Fatal(err.Error()) + } + defer tmp.Close() + ftype, err := fsggml.ParseFileType(tt.newType) + if err != nil { + t.Fatal(err.Error()) + } + + err = quantize(fp, tmp, meta, ftype, progress) + if err != nil { + t.Fatalf("error during quantize: %s", err) + } + if !progressCalled { + t.Fatalf("progress was not reported") + } + // Now attempt to load it back and make sure types match expected + fpNew, err := os.Open(tmp.Name()) + if err != nil { + t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err) + } + defer fpNew.Close() + newMeta, _, err := fsggml.Decode(fpNew, -1) + if err != nil { + t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err) + } + tensors := newMeta.Tensors() + for _, l := range tensors.GroupLayers() { + for _, tensor := range l { + if fsggml.TensorType(tensor.Kind) != tt.expectedTensorTypes[tensor.Name] { + t.Fatalf("incorrect output type for %s\ngot:%s\nexpected:%s", tensor.Name, fsggml.TensorType(tensor.Kind), tt.expectedTensorTypes[tensor.Name]) + } + } + } + }) + } +} + +func TestConvertToF32(t *testing.T) { + expected 
:= make([]float32, 256) + for i := range expected { + expected[i] = float32(i) + } + for dtype, data := range quantBytes { + // Skip the no-op + if dtype == fsggml.TensorTypeF32 { + continue + } + t.Run(dtype.String(), func(t *testing.T) { + fp32 := ggml.ConvertToF32(data, uint32(dtype), 256) + similarity := cosineSimilarity(expected, fp32) + if similarity < 0.999 { + t.Fatalf("Results not similar enough: %s %f", dtype.String(), similarity) + } + }) + } +} + +func dotProduct[V float32 | float64](v1, v2 []V) V { + var result V = 0 + for i := range v1 { + result += v1[i] * v2[i] + } + return result +} + +func magnitude[V float32 | float64](v []V) V { + var result V = 0 + for _, val := range v { + result += val * val + } + return V(math.Sqrt(float64(result))) +} + +func cosineSimilarity[V float32 | float64](v1, v2 []V) V { + return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2)) +} + +// Precomputed quantized data - arange 256 +// # For gguf-py supported types +// import gguf +// import numpy as np +// print(repr(gguf.quantize(np.arange(256, dtype=np.float16), gguf.GGMLQuantizationType.Q4_0))) +// +// For types not supported by gguf-py converted via ggml_fp32_to_fp16_row and quantize_XXX +// +// data := make([]byte, 256*2) +// fp32 := make([]float32, 256) +// for i := range 256 { +// fp32[i] = float32(i) +// } +// l := C.quantize_q6_K((*C.float)(&fp32[0]), unsafe.Pointer(&data[0]), 1, 256, nil) +// for i := range data[:int(l)] { +// fmt.Printf("%d, ", data[i]) +// } +var ( + quantBytes = map[fsggml.TensorType][]byte{ + fsggml.TensorTypeQ4_0: { + 192, 195, 72, 72, 55, 55, 55, 55, 38, 38, 38, 38, 21, + 21, 21, 21, 4, 4, 224, 199, 36, 36, 36, 36, 19, 19, + 19, 19, 19, 19, 19, 19, 2, 2, 2, 2, 240, 201, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 18, 2, 2, 2, 2, + 1, 1, 240, 203, 18, 18, 18, 18, 18, 18, 18, 18, 1, + 1, 1, 1, 1, 1, 1, 1, 248, 204, 18, 18, 17, 17, + 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 248, + 205, 17, 17, 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 248, 206, 17, 17, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 248, 207, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, + }, + fsggml.TensorTypeQ4_1: { + 34, 64, 0, 0, 128, 128, 145, 145, 162, 162, 179, 179, 196, + 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 80, 128, 128, + 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, 230, 247, + 247, 34, 64, 0, 84, 128, 128, 145, 145, 162, 162, 179, 179, + 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 86, 128, + 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, 230, + 247, 247, 34, 64, 0, 88, 128, 128, 145, 145, 162, 162, 179, + 179, 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 89, + 128, 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, + 230, 247, 247, 34, 64, 0, 90, 128, 128, 145, 145, 162, 162, + 179, 179, 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, + 91, 128, 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, + 230, 230, 247, 247, + }, + fsggml.TensorTypeQ5_0: { + 192, 191, 1, 0, 0, 0, 128, 127, 127, 110, 110, 93, 93, + 76, 76, 59, 59, 42, 42, 25, 25, 8, 224, 195, 0, 0, + 0, 0, 72, 72, 55, 55, 55, 55, 38, 38, 38, 38, 21, + 21, 21, 21, 4, 4, 240, 197, 0, 0, 0, 0, 53, 37, + 37, 37, 37, 36, 36, 20, 20, 20, 20, 19, 19, 3, 3, + 3, 240, 199, 0, 0, 0, 0, 36, 36, 36, 36, 19, 19, + 19, 19, 19, 19, 19, 19, 2, 2, 2, 2, 248, 200, 0, + 0, 0, 0, 35, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 2, 2, 2, 2, 2, 248, 201, 0, 0, 0, 0, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 18, 2, 2, 2, 2, + 1, 1, 248, 202, 0, 0, 0, 0, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 2, 2, 
1, 1, 1, 1, 1, 248, 203, + 0, 0, 0, 0, 18, 18, 18, 18, 18, 18, 18, 18, 1, + 1, 1, 1, 1, 1, 1, 1, + }, + fsggml.TensorTypeQ5_1: { + 0, 60, 0, 0, 0, 0, 255, 255, 0, 17, 34, 51, 68, + 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, 60, + 0, 80, 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, 102, + 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, 60, 0, 84, + 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, 102, 119, 136, + 153, 170, 187, 204, 221, 238, 255, 0, 60, 0, 86, 0, 0, + 255, 255, 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, + 187, 204, 221, 238, 255, 0, 60, 0, 88, 0, 0, 255, 255, + 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, + 221, 238, 255, 0, 60, 0, 89, 0, 0, 255, 255, 0, 17, + 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, + 255, 0, 60, 0, 90, 0, 0, 255, 255, 0, 17, 34, 51, + 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, + 60, 0, 91, 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, + 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, + }, + fsggml.TensorTypeQ8_0: { + 208, 51, 0, 4, 8, 12, 16, 20, 25, 29, 33, 37, 41, + 45, 49, 53, 57, 61, 66, 70, 74, 78, 82, 86, 90, 94, + 98, 102, 107, 111, 115, 119, 123, 127, 240, 55, 65, 67, 69, + 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, + 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, + 123, 125, 127, 252, 57, 86, 87, 88, 90, 91, 92, 94, 95, + 96, 98, 99, 100, 102, 103, 104, 106, 107, 108, 110, 111, 112, + 114, 115, 116, 118, 119, 120, 122, 123, 124, 126, 127, 0, 60, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, + 122, 123, 124, 125, 126, 127, 2, 61, 102, 103, 104, 105, 105, + 106, 107, 108, 109, 109, 110, 111, 112, 113, 113, 114, 115, 116, + 117, 117, 118, 119, 120, 121, 121, 122, 123, 124, 125, 125, 126, + 127, 4, 62, 106, 107, 108, 108, 109, 110, 110, 111, 112, 112, + 113, 114, 114, 115, 116, 116, 117, 118, 118, 119, 120, 120, 121, + 122, 122, 123, 124, 124, 125, 126, 126, 127, 6, 63, 109, 110, + 110, 111, 112, 112, 113, 113, 114, 114, 115, 116, 116, 117, 117, + 118, 118, 119, 120, 120, 121, 121, 122, 122, 123, 124, 124, 125, + 125, 126, 126, 127, 4, 64, 112, 112, 113, 113, 114, 114, 115, + 115, 116, 116, 117, 117, 118, 118, 119, 119, 120, 120, 121, 121, + 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127, + }, + fsggml.TensorTypeBF16: { + 0, 0, 128, 63, 0, 64, 64, 64, 128, 64, 160, 64, 192, + 64, 224, 64, 0, 65, 16, 65, 32, 65, 48, 65, 64, 65, + 80, 65, 96, 65, 112, 65, 128, 65, 136, 65, 144, 65, 152, + 65, 160, 65, 168, 65, 176, 65, 184, 65, 192, 65, 200, 65, + 208, 65, 216, 65, 224, 65, 232, 65, 240, 65, 248, 65, 0, + 66, 4, 66, 8, 66, 12, 66, 16, 66, 20, 66, 24, 66, + 28, 66, 32, 66, 36, 66, 40, 66, 44, 66, 48, 66, 52, + 66, 56, 66, 60, 66, 64, 66, 68, 66, 72, 66, 76, 66, + 80, 66, 84, 66, 88, 66, 92, 66, 96, 66, 100, 66, 104, + 66, 108, 66, 112, 66, 116, 66, 120, 66, 124, 66, 128, 66, + 130, 66, 132, 66, 134, 66, 136, 66, 138, 66, 140, 66, 142, + 66, 144, 66, 146, 66, 148, 66, 150, 66, 152, 66, 154, 66, + 156, 66, 158, 66, 160, 66, 162, 66, 164, 66, 166, 66, 168, + 66, 170, 66, 172, 66, 174, 66, 176, 66, 178, 66, 180, 66, + 182, 66, 184, 66, 186, 66, 188, 66, 190, 66, 192, 66, 194, + 66, 196, 66, 198, 66, 200, 66, 202, 66, 204, 66, 206, 66, + 208, 66, 210, 66, 212, 66, 214, 66, 216, 66, 218, 66, 220, + 66, 222, 66, 224, 66, 226, 66, 228, 66, 230, 66, 232, 66, + 234, 66, 236, 66, 238, 66, 240, 66, 242, 66, 244, 66, 246, + 66, 248, 66, 250, 66, 252, 66, 254, 66, 0, 67, 1, 67, + 2, 67, 3, 67, 4, 
67, 5, 67, 6, 67, 7, 67, 8, + 67, 9, 67, 10, 67, 11, 67, 12, 67, 13, 67, 14, 67, + 15, 67, 16, 67, 17, 67, 18, 67, 19, 67, 20, 67, 21, + 67, 22, 67, 23, 67, 24, 67, 25, 67, 26, 67, 27, 67, + 28, 67, 29, 67, 30, 67, 31, 67, 32, 67, 33, 67, 34, + 67, 35, 67, 36, 67, 37, 67, 38, 67, 39, 67, 40, 67, + 41, 67, 42, 67, 43, 67, 44, 67, 45, 67, 46, 67, 47, + 67, 48, 67, 49, 67, 50, 67, 51, 67, 52, 67, 53, 67, + 54, 67, 55, 67, 56, 67, 57, 67, 58, 67, 59, 67, 60, + 67, 61, 67, 62, 67, 63, 67, 64, 67, 65, 67, 66, 67, + 67, 67, 68, 67, 69, 67, 70, 67, 71, 67, 72, 67, 73, + 67, 74, 67, 75, 67, 76, 67, 77, 67, 78, 67, 79, 67, + 80, 67, 81, 67, 82, 67, 83, 67, 84, 67, 85, 67, 86, + 67, 87, 67, 88, 67, 89, 67, 90, 67, 91, 67, 92, 67, + 93, 67, 94, 67, 95, 67, 96, 67, 97, 67, 98, 67, 99, + 67, 100, 67, 101, 67, 102, 67, 103, 67, 104, 67, 105, 67, + 106, 67, 107, 67, 108, 67, 109, 67, 110, 67, 111, 67, 112, + 67, 113, 67, 114, 67, 115, 67, 116, 67, 117, 67, 118, 67, + 119, 67, 120, 67, 121, 67, 122, 67, 123, 67, 124, 67, 125, + 67, 126, 67, 127, 67, + }, + fsggml.TensorTypeF16: { + 0, 0, 0, 60, 0, 64, 0, 66, 0, 68, 0, 69, 0, 70, 0, 71, 0, + 72, 128, 72, 0, 73, 128, 73, 0, 74, 128, 74, 0, 75, 128, 75, + 0, 76, 64, 76, 128, 76, 192, 76, 0, 77, 64, 77, 128, 77, 192, + 77, 0, 78, 64, 78, 128, 78, 192, 78, 0, 79, 64, 79, 128, 79, + 192, 79, 0, 80, 32, 80, 64, 80, 96, 80, 128, 80, 160, 80, + 192, 80, 224, 80, 0, 81, 32, 81, 64, 81, 96, 81, 128, 81, + 160, 81, 192, 81, 224, 81, 0, 82, 32, 82, 64, 82, 96, 82, + 128, 82, 160, 82, 192, 82, 224, 82, 0, 83, 32, 83, 64, 83, + 96, 83, 128, 83, 160, 83, 192, 83, 224, 83, 0, 84, 16, 84, + 32, 84, 48, 84, 64, 84, 80, 84, 96, 84, 112, 84, 128, 84, + 144, 84, 160, 84, 176, 84, 192, 84, 208, 84, 224, 84, 240, + 84, 0, 85, 16, 85, 32, 85, 48, 85, 64, 85, 80, 85, 96, 85, + 112, 85, 128, 85, 144, 85, 160, 85, 176, 85, 192, 85, 208, + 85, 224, 85, 240, 85, 0, 86, 16, 86, 32, 86, 48, 86, 64, + 86, 80, 86, 96, 86, 112, 86, 128, 86, 144, 86, 160, 86, + 176, 86, 192, 86, 208, 86, 224, 86, 240, 86, 0, 87, 16, + 87, 32, 87, 48, 87, 64, 87, 80, 87, 96, 87, 112, 87, 128, + 87, 144, 87, 160, 87, 176, 87, 192, 87, 208, 87, 224, 87, + 240, 87, 0, 88, 8, 88, 16, 88, 24, 88, 32, 88, 40, 88, + 48, 88, 56, 88, 64, 88, 72, 88, 80, 88, 88, 88, 96, 88, + 104, 88, 112, 88, 120, 88, 128, 88, 136, 88, 144, 88, 152, + 88, 160, 88, 168, 88, 176, 88, 184, 88, 192, 88, 200, 88, + 208, 88, 216, 88, 224, 88, 232, 88, 240, 88, 248, 88, 0, + 89, 8, 89, 16, 89, 24, 89, 32, 89, 40, 89, 48, 89, 56, 89, + 64, 89, 72, 89, 80, 89, 88, 89, 96, 89, 104, 89, 112, 89, + 120, 89, 128, 89, 136, 89, 144, 89, 152, 89, 160, 89, 168, + 89, 176, 89, 184, 89, 192, 89, 200, 89, 208, 89, 216, 89, + 224, 89, 232, 89, 240, 89, 248, 89, 0, 90, 8, 90, 16, 90, + 24, 90, 32, 90, 40, 90, 48, 90, 56, 90, 64, 90, 72, 90, 80, + 90, 88, 90, 96, 90, 104, 90, 112, 90, 120, 90, 128, 90, + 136, 90, 144, 90, 152, 90, 160, 90, 168, 90, 176, 90, 184, + 90, 192, 90, 200, 90, 208, 90, 216, 90, 224, 90, 232, 90, + 240, 90, 248, 90, 0, 91, 8, 91, 16, 91, 24, 91, 32, 91, 40, + 91, 48, 91, 56, 91, 64, 91, 72, 91, 80, 91, 88, 91, 96, 91, + 104, 91, 112, 91, 120, 91, 128, 91, 136, 91, 144, 91, 152, + 91, 160, 91, 168, 91, 176, 91, 184, 91, 192, 91, 200, 91, + 208, 91, 216, 91, 224, 91, 232, 91, 240, 91, 248, 91, + }, + fsggml.TensorTypeF32: { + 0, 0, 0, 0, 0, 0, 128, 63, 0, 0, 0, 64, 0, 0, 64, 64, 0, 0, 128, + 64, 0, 0, 160, 64, 0, 0, 192, 64, 0, 0, 224, 64, 0, 0, 0, 65, 0, + 0, 16, 65, 0, 0, 32, 65, 0, 0, 48, 65, 0, 0, 64, 65, 0, 0, 80, 65, + 0, 0, 96, 
65, 0, 0, 112, 65, 0, 0, 128, 65, 0, 0, 136, 65, 0, 0, + 144, 65, 0, 0, 152, 65, 0, 0, 160, 65, 0, 0, 168, 65, 0, 0, 176, + 65, 0, 0, 184, 65, 0, 0, 192, 65, 0, 0, 200, 65, 0, 0, 208, 65, 0, + 0, 216, 65, 0, 0, 224, 65, 0, 0, 232, 65, 0, 0, 240, 65, 0, 0, 248, + 65, 0, 0, 0, 66, 0, 0, 4, 66, 0, 0, 8, 66, 0, 0, 12, 66, 0, 0, 16, + 66, 0, 0, 20, 66, 0, 0, 24, 66, 0, 0, 28, 66, 0, 0, 32, 66, 0, 0, + 36, 66, 0, 0, 40, 66, 0, 0, 44, 66, 0, 0, 48, 66, 0, 0, 52, 66, 0, + 0, 56, 66, 0, 0, 60, 66, 0, 0, 64, 66, 0, 0, 68, 66, 0, 0, 72, 66, + 0, 0, 76, 66, 0, 0, 80, 66, 0, 0, 84, 66, 0, 0, 88, 66, 0, 0, 92, 66, + 0, 0, 96, 66, 0, 0, 100, 66, 0, 0, 104, 66, 0, 0, 108, 66, 0, 0, 112, + 66, 0, 0, 116, 66, 0, 0, 120, 66, 0, 0, 124, 66, 0, 0, 128, 66, 0, 0, + 130, 66, 0, 0, 132, 66, 0, 0, 134, 66, 0, 0, 136, 66, 0, 0, 138, 66, + 0, 0, 140, 66, 0, 0, 142, 66, 0, 0, 144, 66, 0, 0, 146, 66, 0, 0, 148, + 66, 0, 0, 150, 66, 0, 0, 152, 66, 0, 0, 154, 66, 0, 0, 156, 66, 0, 0, + 158, 66, 0, 0, 160, 66, 0, 0, 162, 66, 0, 0, 164, 66, 0, 0, 166, 66, + 0, 0, 168, 66, 0, 0, 170, 66, 0, 0, 172, 66, 0, 0, 174, 66, 0, 0, 176, + 66, 0, 0, 178, 66, 0, 0, 180, 66, 0, 0, 182, 66, 0, 0, 184, 66, 0, 0, + 186, 66, 0, 0, 188, 66, 0, 0, 190, 66, 0, 0, 192, 66, 0, 0, 194, 66, 0, + 0, 196, 66, 0, 0, 198, 66, 0, 0, 200, 66, 0, 0, 202, 66, 0, 0, 204, 66, + 0, 0, 206, 66, 0, 0, 208, 66, 0, 0, 210, 66, 0, 0, 212, 66, 0, 0, 214, 66, + 0, 0, 216, 66, 0, 0, 218, 66, 0, 0, 220, 66, 0, 0, 222, 66, 0, 0, 224, 66, + 0, 0, 226, 66, 0, 0, 228, 66, 0, 0, 230, 66, 0, 0, 232, 66, 0, 0, 234, 66, + 0, 0, 236, 66, 0, 0, 238, 66, 0, 0, 240, 66, 0, 0, 242, 66, 0, 0, 244, 66, + 0, 0, 246, 66, 0, 0, 248, 66, 0, 0, 250, 66, 0, 0, 252, 66, 0, 0, 254, 66, + 0, 0, 0, 67, 0, 0, 1, 67, 0, 0, 2, 67, 0, 0, 3, 67, 0, 0, 4, 67, 0, 0, 5, 67, + 0, 0, 6, 67, 0, 0, 7, 67, 0, 0, 8, 67, 0, 0, 9, 67, 0, 0, 10, 67, 0, 0, 11, + 67, 0, 0, 12, 67, 0, 0, 13, 67, 0, 0, 14, 67, 0, 0, 15, 67, 0, 0, 16, 67, + 0, 0, 17, 67, 0, 0, 18, 67, 0, 0, 19, 67, 0, 0, 20, 67, 0, 0, 21, 67, 0, 0, + 22, 67, 0, 0, 23, 67, 0, 0, 24, 67, 0, 0, 25, 67, 0, 0, 26, 67, 0, 0, 27, + 67, 0, 0, 28, 67, 0, 0, 29, 67, 0, 0, 30, 67, 0, 0, 31, 67, 0, 0, 32, 67, + 0, 0, 33, 67, 0, 0, 34, 67, 0, 0, 35, 67, 0, 0, 36, 67, 0, 0, 37, 67, 0, 0, + 38, 67, 0, 0, 39, 67, 0, 0, 40, 67, 0, 0, 41, 67, 0, 0, 42, 67, 0, 0, 43, 67, + 0, 0, 44, 67, 0, 0, 45, 67, 0, 0, 46, 67, 0, 0, 47, 67, 0, 0, 48, 67, 0, 0, + 49, 67, 0, 0, 50, 67, 0, 0, 51, 67, 0, 0, 52, 67, 0, 0, 53, 67, 0, 0, 54, 67, + 0, 0, 55, 67, 0, 0, 56, 67, 0, 0, 57, 67, 0, 0, 58, 67, 0, 0, 59, 67, 0, 0, + 60, 67, 0, 0, 61, 67, 0, 0, 62, 67, 0, 0, 63, 67, 0, 0, 64, 67, 0, 0, 65, 67, + 0, 0, 66, 67, 0, 0, 67, 67, 0, 0, 68, 67, 0, 0, 69, 67, 0, 0, 70, 67, 0, 0, 71, + 67, 0, 0, 72, 67, 0, 0, 73, 67, 0, 0, 74, 67, 0, 0, 75, 67, 0, 0, 76, 67, 0, + 0, 77, 67, 0, 0, 78, 67, 0, 0, 79, 67, 0, 0, 80, 67, 0, 0, 81, 67, 0, 0, 82, + 67, 0, 0, 83, 67, 0, 0, 84, 67, 0, 0, 85, 67, 0, 0, 86, 67, 0, 0, 87, 67, 0, + 0, 88, 67, 0, 0, 89, 67, 0, 0, 90, 67, 0, 0, 91, 67, 0, 0, 92, 67, 0, 0, 93, + 67, 0, 0, 94, 67, 0, 0, 95, 67, 0, 0, 96, 67, 0, 0, 97, 67, 0, 0, 98, 67, 0, + 0, 99, 67, 0, 0, 100, 67, 0, 0, 101, 67, 0, 0, 102, 67, 0, 0, 103, 67, 0, 0, + 104, 67, 0, 0, 105, 67, 0, 0, 106, 67, 0, 0, 107, 67, 0, 0, 108, 67, 0, 0, 109, + 67, 0, 0, 110, 67, 0, 0, 111, 67, 0, 0, 112, 67, 0, 0, 113, 67, 0, 0, 114, 67, + 0, 0, 115, 67, 0, 0, 116, 67, 0, 0, 117, 67, 0, 0, 118, 67, 0, 0, 119, 67, 0, + 0, 120, 67, 0, 0, 121, 67, 0, 0, 122, 67, 0, 0, 123, 67, 0, 0, 124, 67, 0, 0, + 125, 67, 0, 0, 
126, 67, 0, 0, 127, 67, + }, + fsggml.TensorTypeQ4_K: { + 52, 52, 0, 0, 136, 208, 216, 223, 0, 0, 0, 0, 8, 0, 8, 15, 128, + 128, 129, 129, 146, 146, 147, 147, 164, 164, 165, 165, 166, 182, + 183, 183, 184, 200, 201, 201, 202, 218, 218, 219, 219, 236, 236, + 237, 237, 254, 254, 255, 202, 202, 202, 203, 203, 203, 219, 219, + 219, 220, 220, 220, 220, 220, 236, 237, 237, 237, 237, 237, + 237, 237, 238, 254, 254, 254, 254, 254, 255, 255, 255, 255, 220, + 220, 220, 220, 221, 221, 221, 221, 221, 221, 221, 237, 237, 237, + 238, 238, 238, 238, 238, 238, 238, 238, 238, 254, 254, 255, 255, + 255, 255, 255, 255, 255, 237, 237, 237, 237, 237, 237, 237, 238, + 238, 238, 238, 238, 238, 238, 238, 238, 254, 254, 254, 254, 254, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + }, + fsggml.TensorTypeQ2_K: { + 1, 2, 3, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 184, 184, + 184, 185, 249, 249, 249, 249, 249, 250, 250, 254, 254, 254, 254, + 255, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 171, 69, 0, 0, + }, + fsggml.TensorTypeQ5_K: { + 32, 48, 0, 0, 136, 208, 216, 223, 0, 0, 0, 0, 8, 0, 7, 15, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0, 1, 2, 19, 20, 37, 38, 55, 56, 73, 74, + 91, 92, 109, 110, 127, 112, 128, 129, 146, 147, 164, 165, 182, 183, + 200, 201, 218, 219, 236, 237, 254, 133, 133, 149, 150, 150, 150, + 167, 167, 167, 168, 184, 184, 185, 185, 201, 202, 202, 202, 219, + 219, 219, 219, 236, 236, 236, 237, 253, 253, 254, 254, 254, 255, + 169, 169, 169, 169, 186, 186, 186, 186, 186, 187, 187, 203, 203, + 203, 204, 204, 204, 220, 220, 221, 221, 221, 221, 237, 237, 238, + 238, 238, 238, 254, 255, 255, 203, 203, 203, 204, 204, 204, 204, + 204, 220, 220, 220, 221, 221, 221, 221, 221, 237, 237, 238, 238, + 238, 238, 238, 238, 254, 255, 255, 255, 255, 255, 255, 255, + }, + fsggml.TensorTypeQ6_K: { + 96, 110, 92, 90, 88, 70, 68, 50, 48, 46, 44, 42, 24, 22, 4, 2, 80, + 95, 78, 77, 76, 59, 58, 57, 40, 39, 38, 21, 20, 19, 2, 1, 75, 75, + 74, 57, 57, 56, 55, 39, 38, 37, 21, 20, 20, 19, 2, 2, 72, 55, 55, + 54, 54, 37, 37, 36, 36, 19, 19, 18, 18, 1, 1, 0, 35, 35, 35, 35, + 34, 18, 18, 18, 17, 17, 17, 1, 1, 0, 0, 0, 35, 35, 34, 34, 18, + 18, 18, 17, 17, 17, 17, 1, 0, 0, 0, 0, 35, 35, 35, 19, 19, 18, 18, + 18, 18, 18, 1, 1, 1, 1, 1, 1, 34, 34, 18, 18, 18, 18, 17, 17, 17, + 17, 1, 1, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 248, 240, 231, 224, 216, 208, 200, 192, 184, 176, + 166, 160, 152, 144, 136, 128, 235, 43, + }, + fsggml.TensorTypeQ3_K: { + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 20, 23, 23, 7, 7, 6, 6, 6, 2, + 1, 1, 1, 1, 0, 0, 22, 22, 6, 6, 5, 5, 5, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 238, 204, 170, 136, 102, 68, + 34, 1, 5, 5, 5, 5, 189, 63, + }, + } +) diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 466dc04f1..3b3d99100 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -24,7 +24,7 @@ import ( var stream 
bool = false
-func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
+func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) {
 	t.Helper()
 	t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index 56121d41b..363d4057a 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -99,7 +99,7 @@ func TestGenerateChat(t *testing.T) {
 		"tokenizer.ggml.tokens": []string{""},
 		"tokenizer.ggml.scores": []float32{0},
 		"tokenizer.ggml.token_type": []int32{0},
-	}, []ggml.Tensor{
+	}, []*ggml.Tensor{
 		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -158,7 +158,7 @@ func TestGenerateChat(t *testing.T) {
 	_, digest := createBinFile(t, ggml.KV{
 		"general.architecture": "bert",
 		"bert.pooling_type": uint32(0),
-	}, []ggml.Tensor{})
+	}, []*ggml.Tensor{})
 	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model: "bert",
 		Files: map[string]string{"bert.gguf": digest},
@@ -643,7 +643,7 @@ func TestGenerate(t *testing.T) {
 		"tokenizer.ggml.tokens": []string{""},
 		"tokenizer.ggml.scores": []float32{0},
 		"tokenizer.ggml.token_type": []int32{0},
-	}, []ggml.Tensor{
+	}, []*ggml.Tensor{
 		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -698,7 +698,7 @@ func TestGenerate(t *testing.T) {
 	_, digest := createBinFile(t, ggml.KV{
 		"general.architecture": "bert",
 		"bert.pooling_type": uint32(0),
-	}, []ggml.Tensor{})
+	}, []*ggml.Tensor{})
 	w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		Model: "bert",
diff --git a/server/sched_test.go b/server/sched_test.go
index 1e8e11372..32ff6a8b9 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -126,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		"tokenizer.ggml.tokens": []string{" "},
 		"tokenizer.ggml.scores": []float32{0},
 		"tokenizer.ggml.token_type": []int32{0},
-	}, []ggml.Tensor{
+	}, []*ggml.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}))
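
For reference, the useMoreBits heuristic above bumps the first eighth of layers, the last eighth, and every third layer in between to a higher-bit quant type. A minimal standalone sketch (the helper is copied verbatim here only for illustration, since it is unexported in package server; the 32-layer count is a made-up example) that prints which layers of such a model would get the bump:

package main

import "fmt"

// Copied from server/quantization.go for illustration: layers in the first or
// last eighth, plus every third layer in between, return true.
func useMoreBits(iLayer, nLayers int) bool {
	return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}

func main() {
	const nLayers = 32 // hypothetical layer count
	for i := 0; i < nLayers; i++ {
		if useMoreBits(i, nLayers) {
			fmt.Printf("layer %d: use more bits\n", i)
		}
	}
}

For nLayers = 32 this selects layers 0-3, 28-31, and 6, 9, 12, ..., 27, which is why the ffn_down and attn_v test cases above distinguish "early" counters from mid-range ones.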
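
A condensed sketch of the flow TestQuantizeModel exercises: decode the source GGUF, parse the requested file type, then rewrite the model with per-tensor types chosen by newType/getTensorNewType. This assumes it runs inside package server (quantize is unexported); quantizeFile and the file paths are hypothetical names used only for this sketch, not part of the change.

package server

import (
	"os"

	fsggml "github.com/ollama/ollama/fs/ggml"
)

// quantizeFile is a hypothetical helper showing how the new quantize entry
// point is driven end to end.
func quantizeFile(srcPath, dstPath, fileType string) error {
	in, err := os.Open(srcPath)
	if err != nil {
		return err
	}
	defer in.Close()

	// Parse the GGUF metadata (KV pairs and tensor layout) of the source model.
	meta, _, err := fsggml.Decode(in, -1)
	if err != nil {
		return err
	}

	// e.g. "Q4_K_M"; the per-tensor types may still deviate from this default.
	ftype, err := fsggml.ParseFileType(fileType)
	if err != nil {
		return err
	}

	out, err := os.Create(dstPath)
	if err != nil {
		return err
	}
	defer out.Close()

	// The callback is invoked as tensor data is quantized and written out.
	return quantize(in, out, meta, ftype, func(n uint64) {})
}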