Move quantization to new backend (#10363)

* Move quantization logic to GGML via new backend

This moves the model-aware logic to Go code and calls GGML's quantization code for model creation.

* Remove "add model quantizations"

This is no longer needed now that quantization is implemented in Go+GGML code directly.
This commit is contained in:
Daniel Hiltgen 2025-05-06 11:20:48 -07:00 committed by GitHub
parent 95e744beeb
commit 424810450f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
39 changed files with 1854 additions and 440 deletions

View file

@@ -99,7 +99,7 @@ func TestGenerateChat(t *testing.T) {
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []ggml.Tensor{
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -158,7 +158,7 @@ func TestGenerateChat(t *testing.T) {
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "bert",
"bert.pooling_type": uint32(0),
}, []ggml.Tensor{})
}, []*ggml.Tensor{})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
Files: map[string]string{"bert.gguf": digest},
@@ -643,7 +643,7 @@ func TestGenerate(t *testing.T) {
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []ggml.Tensor{
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -698,7 +698,7 @@ func TestGenerate(t *testing.T) {
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "bert",
"bert.pooling_type": uint32(0),
}, []ggml.Tensor{})
}, []*ggml.Tensor{})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",