next ollama runner (#7913)

feat: add new Ollama engine using ggml through cgo This change introduces a new way to run pretrained models. It introduces 3 high level interfaces and a bunch of smaller helper interfaces to facilitate this. - `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, can implement the model's forward propagation in the `Forward` method. This method will be called to generate completions. This interface can be found in `model/model.go` - `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc) and providing an interface for Models to access loaded tensors. This interface can be found in `ml/backend.go` - `ml.Tensor` defines the interface for a tensor and tensor operations This is the first implementation of the new engine. Follow up PRs will implement more features: - non-greedy sampling (#8410) - integration with Ollama and KV caching (#8301) - more model support (#9080) with more coming soon Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2025-05-11 02:16:36 +02:00 · 2025-02-14 00:31:21 +00:00 · 2025-02-14 00:31:21 +00:00 · 58245413f4
commit 58245413f4
parent 8cf16063a5
57 changed files with 475427 additions and 494 deletions
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@ -19,12 +19,12 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 var stream bool = false

-func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, string) {
+func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
 	t.Helper()
 	t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))

@ -36,7 +36,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, st
 	}
 	defer f.Close()

-	if err := llm.WriteGGUF(f, kv, ti); err != nil {
+	if err := ggml.WriteGGUF(f, kv, ti); err != nil {
 		t.Fatal(err)
 	}
 	// Calculate sha256 of file
@ -672,7 +672,7 @@ func TestCreateDetectTemplate(t *testing.T) {
 	var s Server

 	t.Run("matched", func(t *testing.T) {
-		_, digest := createBinFile(t, llm.KV{
+		_, digest := createBinFile(t, ggml.KV{
 			"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
 		}, nil)
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{