Mirror of https://github.com/ollama/ollama.git (synced 2025-05-11 18:36:41 +02:00)
Adjust mmap logic for CUDA on Windows for faster model load
On Windows, recent llama.cpp changes make mmap slower in most cases, so default it to off. This also implements a tri-state for use_mmap so we can detect the difference between a user-provided value of true/false and an unspecified value.
This commit is contained in:
parent 8ed51cac37
commit 171796791f
3 changed files with 96 additions and 15 deletions
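
The tri-state mentioned in the commit message replaces the plain boolean use_mmap. What follows is a minimal sketch of that pattern, not the upstream api package: only api.TriStateFalse is confirmed by the diff below; the names TriStateUndefined and TriStateTrue and the concrete values are assumptions for illustration.

type TriState int

const (
	TriStateUndefined TriState = iota // option left unset by the user (assumed name)
	TriStateFalse                     // user explicitly set use_mmap = false (appears in the diff)
	TriStateTrue                      // user explicitly set use_mmap = true (assumed name)
)

Making "undefined" the zero value means a freshly decoded options struct reports no user preference, which is what lets the server apply a platform default without clobbering an explicit choice.
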
@@ -200,7 +200,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if g.Library == "metal" &&
 			uint64(opts.NumGPU) > 0 &&
 			uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-			opts.UseMMap = false
+			opts.UseMMap = api.TriStateFalse
 		}
 	}
 
@@ -208,7 +208,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--flash-attn")
 	}
 
-	if !opts.UseMMap {
+	// Windows CUDA should not use mmap for best performance
+	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse {
 		params = append(params, "--no-mmap")
 	}
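
Taken together, the two hunks mean the server now passes --no-mmap to the runner whenever it is running on Windows with a CUDA GPU, or whenever the user explicitly opted out of mmap. Below is a small self-contained sketch of that decision, reusing the assumed TriState names from above; noMMap and its gpuLibrary parameter are illustrative helpers, not upstream API.

package main

import (
	"fmt"
	"runtime"
)

type TriState int

const (
	TriStateUndefined TriState = iota
	TriStateFalse
	TriStateTrue
)

// noMMap mirrors the condition in the second hunk: disable mmap on
// Windows with a CUDA GPU, or when the user explicitly set use_mmap=false.
func noMMap(gpuLibrary string, useMMap TriState) bool {
	return (runtime.GOOS == "windows" && gpuLibrary == "cuda") || useMMap == TriStateFalse
}

func main() {
	for _, s := range []TriState{TriStateUndefined, TriStateFalse, TriStateTrue} {
		fmt.Printf("cuda, use_mmap state %d -> --no-mmap: %v\n", s, noMMap("cuda", s))
	}
}

Note one consequence of the condition as written: on Windows with CUDA, even an explicit use_mmap = true still results in --no-mmap, since the platform check is OR-ed with the explicit-false check rather than gated on the value being unspecified.
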