next ollama runner (#7913)

feat: add new Ollama engine using ggml through cgo

This change introduces a new way to run pretrained models. It introduces three high-level interfaces and a number of smaller helper interfaces to facilitate this.

- `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, can implement the model's forward propagation in the `Forward` method. This method will be called to generate completions. This interface can be found in `model/model.go`.
- `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc.) and providing an interface for Models to access loaded tensors. This interface can be found in `ml/backend.go`.
- `ml.Tensor` defines the interface for a tensor and tensor operations.

This is the first implementation of the new engine. Follow-up PRs will implement more features:

- non-greedy sampling (#8410)
- integration with Ollama and KV caching (#8301)
- more model support (#9080) with more coming soon

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
This commit is contained in:
Michael Yang 2025-02-14 00:31:21 +00:00 committed by GitHub
parent 8cf16063a5
commit 58245413f4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
57 changed files with 475427 additions and 494 deletions

View file

@ -28,6 +28,7 @@ import (
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llama"
)
@ -71,7 +72,7 @@ type llmServer struct {
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func LoadModel(model string, maxArraySize int) (*GGML, error) {
func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@ -82,21 +83,17 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
}
defer f.Close()
ggml, _, err := DecodeGGML(f, maxArraySize)
ggml, _, err := ggml.Decode(f, maxArraySize)
return ggml, err
}
// NewLlamaServer will run a server for the given GPUs
// The gpu list must be a single family.
func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var systemTotalMemory uint64
var systemFreeMemory uint64
var systemSwapFreeMemory uint64
func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
systemInfo := discover.GetSystemInfo()
systemTotalMemory = systemInfo.System.TotalMemory
systemFreeMemory = systemInfo.System.FreeMemory
systemSwapFreeMemory = systemInfo.System.FreeSwap
systemTotalMemory := systemInfo.System.TotalMemory
systemFreeMemory := systemInfo.System.FreeMemory
systemSwapFreeMemory := systemInfo.System.FreeSwap
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
@ -104,7 +101,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
gpus = discover.GetCPUInfo()
}
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
estimate := EstimateGPULayers(gpus, f, projectors, opts)
if len(gpus) > 1 || gpus[0].Library != "cpu" {
switch {
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
@ -130,7 +127,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
}
}
estimate.log()
slog.Info("offload", "", estimate)
params := []string{
"--model", model,
@ -174,7 +171,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
fa = false
}
if fa && !ggml.SupportsFlashAttention() {
if fa && !f.SupportsFlashAttention() {
slog.Warn("flash attention enabled but not supported by model")
fa = false
}
@ -187,7 +184,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
// Flash Attention also supports kv cache quantization
// Enable if the requested and kv cache type is supported by the model
if kvct != "" && ggml.SupportsKVCacheType(kvct) {
if kvct != "" && f.SupportsKVCacheType(kvct) {
params = append(params, "--kv-cache-type", kvct)
} else {
slog.Warn("kv cache type not supported by model", "type", kvct)
@ -200,7 +197,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
for _, g := range gpus {
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
uint64(opts.NumGPU) < f.KV().BlockCount()+1 {
opts.UseMMap = new(bool)
*opts.UseMMap = false
}
@ -335,7 +332,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
estimate: estimate,
numParallel: numParallel,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: ggml.KV().BlockCount() + 1,
totalLayers: f.KV().BlockCount() + 1,
gpus: gpus,
done: make(chan error, 1),
}