Mirror of https://github.com/ollama/ollama.git
Disable concurrency for AMD + Windows
Until ROCm v6.2 ships, we won't be able to get accurate free memory reporting on Windows, which makes automatic concurrency too risky. Users can still opt in, but they will need to pay attention to model sizes; otherwise they may thrash/page VRAM or cause OOM crashes. All other platforms and GPUs now have accurate VRAM reporting wired up, so we can turn on concurrency by default.
This commit is contained in:
parent 17b7186cd7
commit 9929751cc8
4 changed files with 44 additions and 10 deletions
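The opt-in path the message refers to is the user setting the concurrency limit explicitly. A minimal sketch, assuming the limit arrives through the OLLAMA_MAX_LOADED_MODELS environment variable that the diff below logs under that name; the helper is illustrative, not the actual envconfig code:

// Sketch of the explicit opt-in described above: a user-provided
// OLLAMA_MAX_LOADED_MODELS wins over the automatic policy, which only
// runs when the value is unset (MaxRunners <= 0 in the diff below).
package main

import (
	"fmt"
	"os"
	"strconv"
)

// maxRunnersFromEnv is a hypothetical stand-in for ollama's envconfig parsing.
func maxRunnersFromEnv() int {
	if v := os.Getenv("OLLAMA_MAX_LOADED_MODELS"); v != "" {
		if n, err := strconv.Atoi(v); err == nil && n > 0 {
			return n // explicit opt-in: the user accepts the VRAM risk
		}
	}
	return 0 // unset: let the scheduler derive a default from GPU reliability
}

func main() {
	fmt.Println("max loaded models:", maxRunnersFromEnv())
}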
server/sched.go
@@ -46,6 +46,16 @@ type Scheduler struct {
 	reschedDelay time.Duration
 }
 
+// Default automatic value for number of models we allow per GPU
+// Model will still need to fit in VRAM, but loading many small models
+// on a large GPU can cause stalling
+var defaultModelsPerGPU = 3
+
+// Default automatic value for parallel setting
+// Model will still need to fit in VRAM. If this setting won't fit
+// we'll back off down to 1 to try to get it to fit
+var defaultParallel = 4
+
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
 
 func InitScheduler(ctx context.Context) *Scheduler {
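Together these two defaults bound the automatic behavior: up to defaultModelsPerGPU loaded models per reliable GPU, each sized for up to defaultParallel concurrent sequences. A quick worked sketch; the GPU count is illustrative:

package main

import "fmt"

// Mirrors the two defaults introduced in the hunk above.
const (
	defaultModelsPerGPU = 3
	defaultParallel     = 4
)

func main() {
	gpuCount := 2 // illustrative
	// With reliable free-memory reporting on every GPU, the scheduler may
	// keep up to defaultModelsPerGPU * gpuCount models loaded at once.
	fmt.Printf("up to %d loaded models, each serving up to %d parallel requests\n",
		defaultModelsPerGPU*gpuCount, defaultParallel)
}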
@@ -100,7 +110,6 @@ func (s *Scheduler) Run(ctx context.Context) {
 }
 
 func (s *Scheduler) processPending(ctx context.Context) {
-	maxRunnerFactor := 1 // number of GPUs or 1
 	for {
 		select {
 		case <-ctx.Done():
@@ -143,7 +152,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					pending.useLoadedRunner(runner, s.finishedReqCh)
 					break
 				}
-			} else if envconfig.MaxRunners > 0 && loadedCount >= (maxRunnerFactor*envconfig.MaxRunners) {
+			} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
 				slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 				runnerToExpire = s.findRunnerToUnload()
 			} else {
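The multiplier disappears because MaxRunners is now pre-scaled by the GPU count when it is derived automatically (next hunk), so the eviction check reduces to a direct comparison. A minimal sketch of that check, with hypothetical counts:

package main

import "fmt"

// needRoom mirrors the simplified condition: once the configured runner cap
// is reached, one loaded runner must be unloaded before another can load.
func needRoom(loadedCount, maxRunners int) bool {
	return maxRunners > 0 && loadedCount >= maxRunners
}

func main() {
	fmt.Println(needRoom(6, 6)) // true: at the cap, unload one first
	fmt.Println(needRoom(2, 6)) // false: room for another model
}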
@@ -155,7 +164,26 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				} else {
 					gpus = s.getGpuFn()
 				}
-				maxRunnerFactor = max(len(gpus), 1)
+
+				if envconfig.MaxRunners <= 0 {
+					// No user specified MaxRunners, so figure out what automatic setting to use
+					// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
+					// if any GPU has unreliable free memory reporting, 1x the number of GPUs
+					allReliable := true
+					for _, gpu := range gpus {
+						if gpu.UnreliableFreeMemory {
+							allReliable = false
+							break
+						}
+					}
+					if allReliable {
+						envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
+						slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
+					} else {
+						slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
+						envconfig.MaxRunners = len(gpus)
+					}
+				}
 
 				// Load model for fitting
 				ggml, err := llm.LoadModel(pending.model.ModelPath)
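This block is the heart of the commit. A self-contained sketch of the same policy, using a reduced stand-in for ollama's gpu.GpuInfo that carries only the field the scan reads:

package main

import "fmt"

const defaultModelsPerGPU = 3

// gpuInfo is a stand-in for gpu.GpuInfo, reduced to the one field read here.
type gpuInfo struct {
	UnreliableFreeMemory bool
}

// autoMaxRunners mirrors the added block: scale by defaultModelsPerGPU only
// when every GPU reports free memory reliably; otherwise one model per GPU.
func autoMaxRunners(gpus []gpuInfo) int {
	for _, g := range gpus {
		if g.UnreliableFreeMemory {
			return len(gpus) // e.g. AMD on Windows until ROCm v6.2
		}
	}
	return defaultModelsPerGPU * len(gpus)
}

func main() {
	fmt.Println(autoMaxRunners([]gpuInfo{{false}, {false}})) // 6: concurrency on
	fmt.Println(autoMaxRunners([]gpuInfo{{true}}))           // 1: concurrency off
}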
@@ -647,7 +675,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	var numParallelToTry []int
 	if *numParallel <= 0 {
 		// If no specific parallel setting was provided, try larger then smaller, always end with 1
-		numParallelToTry = append(numParallelToTry, 4, 1)
+		numParallelToTry = append(numParallelToTry, defaultParallel, 1)
 	} else {
 		numParallelToTry = []int{*numParallel}
 	}
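A sketch of how the fallback list is consumed: candidates are tried in order and the first parallel value whose memory estimate fits wins, so the automatic path degrades from defaultParallel down to 1. The fits predicate here is a placeholder for the real VRAM-fitting estimate, not ollama's actual API:

package main

import "fmt"

const defaultParallel = 4

// pickParallel sketches the fallback loop above: try the automatic value
// first, then 1, so tight-VRAM systems can still load the model.
func pickParallel(numParallel int, fits func(parallel int) bool) (int, bool) {
	candidates := []int{numParallel}
	if numParallel <= 0 {
		candidates = []int{defaultParallel, 1}
	}
	for _, p := range candidates {
		if fits(p) {
			return p, true
		}
	}
	return 0, false
}

func main() {
	// Hypothetical: only a single sequence fits in the remaining VRAM.
	fits := func(p int) bool { return p <= 1 }
	fmt.Println(pickParallel(0, fits)) // 1 true
}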