Mirror of https://github.com/ollama/ollama.git (synced 2025-05-11 18:36:41 +02:00)
next ollama runner (#7913)
feat: add new Ollama engine using ggml through cgo

This change introduces a new way to run pretrained models. It introduces three high-level interfaces and a number of smaller helper interfaces to facilitate this.

- `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, can implement the model's forward propagation in the `Forward` method. This method will be called to generate completions. This interface can be found in `model/model.go`.
- `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc.) and providing an interface for Models to access loaded tensors. This interface can be found in `ml/backend.go`.
- `ml.Tensor` defines the interface for a tensor and tensor operations.

This is the first implementation of the new engine. Follow-up PRs will implement more features:

- non-greedy sampling (#8410)
- integration with Ollama and KV caching (#8301)
- more model support (#9080), with more coming soon

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
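For orientation, here is a minimal sketch of how the three interfaces might fit together. It is illustrative only: apart from the names the message above already uses (`model.Model`, `ml.Backend`, `ml.Tensor`, and the `Forward` method), every method and signature below is an assumption, not the actual definition in `model/model.go` or `ml/backend.go`.

```go
// Illustrative sketch only. The real interfaces live in model/model.go and
// ml/backend.go; all method sets and signatures here, other than the Forward
// method named in the commit message, are assumptions made for this example.
package sketch

// Tensor stands in for ml.Tensor: a tensor plus the operations defined on it.
type Tensor interface {
	Shape() []int64             // hypothetical: dimensions of the tensor
	Mulmat(other Tensor) Tensor // hypothetical: one example tensor operation
}

// Backend stands in for ml.Backend: it loads a pretrained model into
// hardware (GPU, CPU, etc.) and gives Models access to the loaded tensors.
type Backend interface {
	Get(name string) Tensor // hypothetical lookup of a loaded tensor by name
}

// Model stands in for model.Model: one implementation per architecture
// (e.g. llama, mllama). Forward runs the forward propagation and is called
// to generate completions; its parameter list here is an assumption.
type Model interface {
	Forward(b Backend, inputs []int32) (Tensor, error)
}
```

In this picture a concrete architecture satisfies `Model`, while the ggml-through-cgo backend satisfies `Backend`; the engine wires the two together when it loads a model.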
parent 8cf16063a5
commit 58245413f4

57 changed files with 475427 additions and 494 deletions
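Only one of the 57 changed files is shown below: the scheduler tests, which pick up the new `github.com/ollama/ollama/fs/ggml` import and replace the `*llm.GGML` values and parameters (previously named `ggml`) with `*ggml.GGML` values named `f`.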
@@ -15,6 +15,7 @@ import (
 	"github.com/ollama/ollama/app/lifecycle"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -37,7 +38,7 @@ func TestLoad(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
-	var ggml *llm.GGML // value not used in tests
+	var f *ggml.GGML // value not used in tests
 	req := &LlmRequest{
 		ctx:   ctx,
 		model: &Model{ModelPath: "foo"},
@@ -47,11 +48,11 @@ func TestLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
 	gpus := discover.GpuInfoList{}
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
 	s.loadedMu.Lock()
@@ -61,10 +62,10 @@ func TestLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 	select {
 	case err := <-req.errCh:
 		require.NoError(t, err)
@@ -78,7 +79,7 @@ func TestLoad(t *testing.T) {
 
 	req.model.ModelPath = "dummy_model_path"
 	server.waitResp = errors.New("wait failure")
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 	select {
 	case err := <-req.errCh:
 		require.Contains(t, err.Error(), "wait failure")
@@ -99,10 +100,10 @@ type reqBundle struct {
 	ctxDone func()
 	srv     *mockLlm
 	req     *LlmRequest
-	ggml    *llm.GGML
+	f       *ggml.GGML
 }
 
-func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	return scenario.srv, nil
 }
 
@@ -115,7 +116,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	require.NoError(t, err)
 	defer f.Close()
 
-	require.NoError(t, llm.WriteGGUF(f, llm.KV{
+	require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
 		"general.architecture": "llama",
 		"llama.context_length": uint32(32),
 		"llama.embedding_length": uint32(4096),
@@ -125,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		"tokenizer.ggml.tokens": []string{" "},
 		"tokenizer.ggml.scores": []float32{0},
 		"tokenizer.ggml.token_type": []int32{0},
-	}, []llm.Tensor{
+	}, []ggml.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}))
@@ -133,7 +134,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 
 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
-	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
+	b.f, err = llm.LoadModel(model.ModelPath, 0)
 	require.NoError(t, err)
 
 	if duration == nil {
@@ -174,7 +175,7 @@ func TestRequestsSameModelSameRequest(t *testing.T) {
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
 	b.req.model = a.req.model
-	b.ggml = a.ggml
+	b.f = a.f
 
 	s.newServerFn = a.newServer
 	slog.Info("a")
@@ -218,7 +219,7 @@ func TestRequestsSimpleReloadSameModel(t *testing.T) {
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
 	tmpModel := *a.req.model
 	b.req.model = &tmpModel
-	b.ggml = a.ggml
+	b.f = a.f
 
 	s.newServerFn = a.newServer
 	slog.Info("a")
@@ -419,13 +420,13 @@ func TestExpireRunner(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
 	}
 
-	var ggml *llm.GGML
+	var f *ggml.GGML
 	gpus := discover.GpuInfoList{}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 
 	select {
 	case err := <-req.errCh:
@@ -729,9 +730,9 @@ func TestHomogeneousGPUs(t *testing.T) {
 	}
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		require.Len(t, gpus, 1)
-		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
+		return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
 	}
 	slog.Info("a")
 	s.pendingReqCh <- a.req