Merge pull request #4218 from dhiltgen/auto_parallel

Enable concurrency by default
2025-05-11 18:36:41 +02:00 · 2024-07-01 08:32:29 -07:00 · 2024-07-01 08:32:29 -07:00 · 3518aaef33
commit 3518aaef33
parent 1963c00201 642cee1342
7 changed files with 175 additions and 73 deletions
--- a/llm/server.go
+++ b/llm/server.go
@ -82,7 +82,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {

 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
+func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
 	var estimate MemoryEstimate
@ -218,8 +218,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 	// Windows CUDA should not use mmap for best performance
 	// Linux  with a model larger than free space, mmap leads to thrashing
+	// For CPU loads we want the memory to be allocated, not FS cache
 	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
 		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
+		(gpus[0].Library == "cpu" && opts.UseMMap == api.TriStateUndefined) ||
 		opts.UseMMap == api.TriStateFalse {
 		params = append(params, "--no-mmap")
 	}
@ -232,15 +234,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}

-	numParallel := envconfig.NumParallel
-
-	// TODO (jmorganca): multimodal models don't support parallel yet
-	// see https://github.com/ollama/ollama/issues/4165
-	if len(projectors) > 0 {
-		numParallel = 1
-		slog.Warn("multimodal models don't support parallel requests yet")
-	}
-
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))

 	if estimate.TensorSplit != "" {