Adjust mmap logic for CUDA on Windows for faster model load

On Windows, recent llama.cpp changes make mmap slower in most
cases, so default to off. This also implements a tri-state for
use_mmap so we can distinguish a user-provided value of true or
false from an unspecified value.
This commit is contained in:
Daniel Hiltgen 2024-06-17 12:14:42 -07:00
parent 8ed51cac37
commit 171796791f
3 changed files with 96 additions and 15 deletions

View file

@ -200,7 +200,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
opts.UseMMap = false
opts.UseMMap = api.TriStateFalse
}
}
@ -208,7 +208,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--flash-attn")
}
if !opts.UseMMap {
// Windows CUDA should not use mmap for best performance
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse {
params = append(params, "--no-mmap")
}