Move quantization to new backend (#10363)

* Move quantization logic to GGML via new backend. This moves the model-aware logic to Go code and calls GGML's quantization code for model creation. * Remove "add model quantizations". This is no longer needed now that quantization is implemented in Go+GGML code directly.
2025-05-11 10:26:53 +02:00 · 2025-05-06 11:20:48 -07:00 · 2025-05-06 11:20:48 -07:00 · 424810450f
commit 424810450f
parent 95e744beeb
39 changed files with 1854 additions and 440 deletions
--- a/integration/quantization_test.go
+++ b/integration/quantization_test.go
@ -0,0 +1,130 @@
+//go:build integration && models
+
+package integration
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"log/slog"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/ollama/ollama/api"
+)
+
+// TestQuantization creates a quantized copy of every source model for each
+// listed quantization, checks that the quantization level reported by Show
+// contains the requested one, and then streams a short generation from the new
+// model, looking for at least one expected keyword in the output.
+//
+// Each model/quantization pair runs as its own subtest. Subtests that start
+// after the soft timeout has elapsed are skipped; the hard timeout bounds the
+// context shared by every request.
+func TestQuantization(t *testing.T) {
+	sourceModels := []string{
+		"qwen2.5:0.5b-instruct-fp16",
+	}
+	quantizations := []string{
+		"Q8_0",
+		"Q4_K_S",
+		"Q4_K_M",
+		"Q4_K",
+	}
+	softTimeout, hardTimeout := getTimeouts(t)
+	started := time.Now()
+	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for _, base := range sourceModels {
+		if err := PullIfMissing(ctx, client, base); err != nil {
+			t.Fatalf("pull failed %s", err)
+		}
+		for _, quant := range quantizations {
+			newName := fmt.Sprintf("%s__%s", base, quant)
+			t.Run(newName, func(t *testing.T) {
+				if time.Since(started) > softTimeout {
+					t.Skip("skipping remaining tests to avoid excessive runtime")
+				}
+				req := &api.CreateRequest{
+					Model:        newName,
+					Quantization: quant,
+					From:         base,
+				}
+				// Progress updates are not needed here; only the final result matters.
+				fn := func(resp api.ProgressResponse) error {
+					return nil
+				}
+				t.Logf("quantizing: %s -> %s", base, quant)
+				if err := client.Create(ctx, req, fn); err != nil {
+					t.Fatalf("create failed %s", err)
+				}
+				// Best-effort cleanup of the model created above; a failure is
+				// only logged so it does not mask the real test result.
+				defer func() {
+					req := &api.DeleteRequest{
+						Model: newName,
+					}
+					t.Logf("deleting: %s -> %s", base, quant)
+					if err := client.Delete(ctx, req); err != nil {
+						t.Logf("failed to clean up %s: %s", req.Model, err)
+					}
+				}()
+				// Check metadata on the model
+				resp, err := client.Show(ctx, &api.ShowRequest{Name: newName})
+				if err != nil {
+					t.Fatalf("unable to show model: %s", err)
+				}
+				if !strings.Contains(resp.Details.QuantizationLevel, quant) {
+					t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel)
+				}
+
+				stream := true
+				genReq := api.GenerateRequest{
+					Model:     newName,
+					Prompt:    "why is the sky blue?",
+					KeepAlive: &api.Duration{Duration: 3 * time.Second},
+					Options: map[string]any{
+						"seed":        42,
+						"temperature": 0.0,
+					},
+					Stream: &stream,
+				}
+				t.Logf("verifying: %s -> %s", base, quant)
+
+				// Some smaller quantizations can cause models to have poor quality
+				// or get stuck in repetition loops, so we stop as soon as we have any matches
+				anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"}
+				reqCtx, reqCancel := context.WithCancel(ctx)
+				// Release the derived context on every path, not only when a
+				// keyword is matched.
+				defer reqCancel()
+				atLeastOne := false
+				var buf bytes.Buffer
+				genfn := func(response api.GenerateResponse) error {
+					buf.WriteString(response.Response)
+					fullResp := strings.ToLower(buf.String())
+					for _, resp := range anyResp {
+						if strings.Contains(fullResp, resp) {
+							atLeastOne = true
+							t.Log(fullResp)
+							// Enough evidence; abort the rest of the stream.
+							reqCancel()
+							break
+						}
+					}
+					return nil
+				}
+
+				// Buffered so the goroutine can always hand over its result and
+				// exit, even when the select below has already given up.
+				done := make(chan error, 1)
+				go func() {
+					done <- client.Generate(reqCtx, &genReq, genfn)
+				}()
+
+				select {
+				case genErr := <-done:
+					// After a match the request is cancelled on purpose, so an
+					// error only counts as a failure when nothing matched.
+					if genErr != nil && !atLeastOne {
+						t.Fatalf("failed with %s request prompt %s: %s", genReq.Model, genReq.Prompt, genErr)
+					}
+					if !atLeastOne {
+						t.Fatalf("none of %v found in response from %s: %s", anyResp, genReq.Model, buf.String())
+					}
+				case <-ctx.Done():
+					t.Error("outer test context done while waiting for generate")
+					return
+				}
+
+				t.Logf("passed")
+			})
+		}
+	}
+}