Mirror of https://github.com/ollama/ollama.git
Disable concurrency for AMD + Windows
Until ROCm v6.2 ships, we won't be able to get accurate free memory reporting on Windows, which makes automatic concurrency too risky. Users can still opt in, but they will need to pay attention to model sizes; otherwise they may thrash/page VRAM or cause OOM crashes. All other platforms and GPUs now have accurate VRAM reporting wired up, so we can turn on concurrency by default.
This commit is contained in:
parent 17b7186cd7
commit 9929751cc8
4 changed files with 44 additions and 10 deletions
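The opt-in path the message refers to is the user setting the concurrency limit explicitly. A minimal sketch, assuming the limit arrives through the OLLAMA_MAX_LOADED_MODELS environment variable that the diff below logs under that name; the helper is illustrative, not the actual envconfig code:

// Sketch of the explicit opt-in described above: a user-provided
// OLLAMA_MAX_LOADED_MODELS wins over the automatic policy, which only
// runs when the value is unset (MaxRunners <= 0 in the diff below).
package main

import (
	"fmt"
	"os"
	"strconv"
)

// maxRunnersFromEnv is a hypothetical stand-in for ollama's envconfig parsing.
func maxRunnersFromEnv() int {
	if v := os.Getenv("OLLAMA_MAX_LOADED_MODELS"); v != "" {
		if n, err := strconv.Atoi(v); err == nil && n > 0 {
			return n // explicit opt-in: the user accepts the VRAM risk
		}
	}
	return 0 // unset: let the scheduler derive a default from GPU reliability
}

func main() {
	fmt.Println("max loaded models:", maxRunnersFromEnv())
}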
server/sched.go
@@ -46,6 +46,16 @@ type Scheduler struct {
 	reschedDelay time.Duration
 }
 
+// Default automatic value for number of models we allow per GPU
+// Model will still need to fit in VRAM, but loading many small models
+// on a large GPU can cause stalling
+var defaultModelsPerGPU = 3
+
+// Default automatic value for parallel setting
+// Model will still need to fit in VRAM. If this setting won't fit
+// we'll back off down to 1 to try to get it to fit
+var defaultParallel = 4
+
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
 
 func InitScheduler(ctx context.Context) *Scheduler {
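Together these two defaults bound the automatic behavior: up to defaultModelsPerGPU loaded models per reliable GPU, each sized for up to defaultParallel concurrent sequences. A quick worked sketch; the GPU count is illustrative:

package main

import "fmt"

// Mirrors the two defaults introduced in the hunk above.
const (
	defaultModelsPerGPU = 3
	defaultParallel     = 4
)

func main() {
	gpuCount := 2 // illustrative
	// With reliable free-memory reporting on every GPU, the scheduler may
	// keep up to defaultModelsPerGPU * gpuCount models loaded at once.
	fmt.Printf("up to %d loaded models, each serving up to %d parallel requests\n",
		defaultModelsPerGPU*gpuCount, defaultParallel)
}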
@@ -100,7 +110,6 @@ func (s *Scheduler) Run(ctx context.Context) {
 }
 
 func (s *Scheduler) processPending(ctx context.Context) {
-	maxRunnerFactor := 1 // number of GPUs or 1
 	for {
 		select {
 		case <-ctx.Done():
@@ -143,7 +152,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					pending.useLoadedRunner(runner, s.finishedReqCh)
 					break
 				}
-			} else if envconfig.MaxRunners > 0 && loadedCount >= (maxRunnerFactor*envconfig.MaxRunners) {
+			} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
 				slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 				runnerToExpire = s.findRunnerToUnload()
 			} else {
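The multiplier disappears because MaxRunners is now pre-scaled by the GPU count when it is derived automatically (next hunk), so the eviction check reduces to a direct comparison. A minimal sketch of that check, with hypothetical counts:

package main

import "fmt"

// needRoom mirrors the simplified condition: once the configured runner cap
// is reached, one loaded runner must be unloaded before another can load.
func needRoom(loadedCount, maxRunners int) bool {
	return maxRunners > 0 && loadedCount >= maxRunners
}

func main() {
	fmt.Println(needRoom(6, 6)) // true: at the cap, unload one first
	fmt.Println(needRoom(2, 6)) // false: room for another model
}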
@@ -155,7 +164,26 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				} else {
 					gpus = s.getGpuFn()
 				}
-				maxRunnerFactor = max(len(gpus), 1)
+
+				if envconfig.MaxRunners <= 0 {
+					// No user specified MaxRunners, so figure out what automatic setting to use
+					// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
+					// if any GPU has unreliable free memory reporting, 1x the number of GPUs
+					allReliable := true
+					for _, gpu := range gpus {
+						if gpu.UnreliableFreeMemory {
+							allReliable = false
+							break
+						}
+					}
+					if allReliable {
+						envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
+						slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
+					} else {
+						slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
+						envconfig.MaxRunners = len(gpus)
+					}
+				}
 
 				// Load model for fitting
 				ggml, err := llm.LoadModel(pending.model.ModelPath)
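This block is the heart of the commit. A self-contained sketch of the same policy, using a reduced stand-in for ollama's gpu.GpuInfo that carries only the field the scan reads:

package main

import "fmt"

const defaultModelsPerGPU = 3

// gpuInfo is a stand-in for gpu.GpuInfo, reduced to the one field read here.
type gpuInfo struct {
	UnreliableFreeMemory bool
}

// autoMaxRunners mirrors the added block: scale by defaultModelsPerGPU only
// when every GPU reports free memory reliably; otherwise one model per GPU.
func autoMaxRunners(gpus []gpuInfo) int {
	for _, g := range gpus {
		if g.UnreliableFreeMemory {
			return len(gpus) // e.g. AMD on Windows until ROCm v6.2
		}
	}
	return defaultModelsPerGPU * len(gpus)
}

func main() {
	fmt.Println(autoMaxRunners([]gpuInfo{{false}, {false}})) // 6: concurrency on
	fmt.Println(autoMaxRunners([]gpuInfo{{true}}))           // 1: concurrency off
}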
@@ -647,7 +675,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	var numParallelToTry []int
 	if *numParallel <= 0 {
 		// If no specific parallel setting was provided, try larger then smaller, always end with 1
-		numParallelToTry = append(numParallelToTry, 4, 1)
+		numParallelToTry = append(numParallelToTry, defaultParallel, 1)
 	} else {
 		numParallelToTry = []int{*numParallel}
 	}
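A sketch of how the fallback list is consumed: candidates are tried in order and the first parallel value whose memory estimate fits wins, so the automatic path degrades from defaultParallel down to 1. The fits predicate here is a placeholder for the real VRAM-fitting estimate, not ollama's actual API:

package main

import "fmt"

const defaultParallel = 4

// pickParallel sketches the fallback loop above: try the automatic value
// first, then 1, so tight-VRAM systems can still load the model.
func pickParallel(numParallel int, fits func(parallel int) bool) (int, bool) {
	candidates := []int{numParallel}
	if numParallel <= 0 {
		candidates = []int{defaultParallel, 1}
	}
	for _, p := range candidates {
		if fits(p) {
			return p, true
		}
	}
	return 0, false
}

func main() {
	// Hypothetical: only a single sequence fits in the remaining VRAM.
	fits := func(p int) bool { return p <= 1 }
	fmt.Println(pickParallel(0, fits)) // 1 true
}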