mirror of
https://github.com/ollama/ollama.git
synced 2025-05-11 18:36:41 +02:00
Fix concurrency for CPU mode
Prior refactoring passes accidentally removed the logic to bypass VRAM checks for CPU loads. This adds that back, along with test coverage. This also fixes loaded map access in the unit test to be behind the mutex which was likely the cause of various flakes in the tests.
This commit is contained in:
parent
114c932a8e
commit
d6e3b64582
2 changed files with 88 additions and 13 deletions
|
@ -149,6 +149,14 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
|||
break
|
||||
}
|
||||
|
||||
// If we're CPU only mode, just limit by loadedMax above
|
||||
// TODO handle system memory exhaustion
|
||||
if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
|
||||
slog.Debug("cpu mode with existing models, loading")
|
||||
s.loadFn(pending, ggml, gpus)
|
||||
break
|
||||
}
|
||||
|
||||
// No models loaded. Load the model but prefer the best fit.
|
||||
if loadedCount == 0 {
|
||||
slog.Debug("loading first model", "model", pending.model.ModelPath)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue