From a27462b7085c7ba794f3b8da1553f4f1caa08ed0 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 29 Apr 2025 10:48:39 -0700
Subject: [PATCH] ollamarunner: Temporarily disable worst case graph
 preallocation

When we later have a large batch running purely on a CPU, this results
in the error:

GGML_ASSERT(talloc->buffer_id >= 0)

Disabling this means that we will incrementally reallocate memory as the
graph grows.

Fixes #10410
---
 runner/ollamarunner/runner.go | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go
index 0ac543888..7ca6dc8c1 100644
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -723,7 +723,9 @@ func (m *multiLPath) String() string {
 	return strings.Join(*m, ", ")
 }
 
-func (s *Server) reserveWorstCaseGraph() error {
+// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
+// to the GPU
+/*func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()
 
@@ -766,7 +768,7 @@ func (s *Server) reserveWorstCaseGraph() error {
 	}
 
 	return nil
-}
+}*/
 
 func (s *Server) loadModel(
 	ctx context.Context,
@@ -803,10 +805,10 @@ func (s *Server) loadModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
 
-	err = s.reserveWorstCaseGraph()
+	/*err = s.reserveWorstCaseGraph()
 	if err != nil {
 		panic(err)
-	}
+	}*/
 
 	s.status = llm.ServerStatusReady
 	s.ready.Done()
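
For background, the technique this patch disables follows a common pattern: at
load time, construct the compute graph for the largest batch the server is
configured to accept, so the backend sizes its buffers once up front. With it
disabled, buffers instead grow lazily as larger graphs arrive. The Go sketch
below illustrates both paths; every name in it (Context, buildGraph, reserve,
and the 512-bytes-per-token figure) is a hypothetical stand-in for
illustration, not ollama's actual API.

package main

import "fmt"

// Context is a hypothetical stand-in for a backend compute context.
// It tracks only the high-water mark of reserved buffer space.
type Context struct{ reserved int }

func (c *Context) Close() {}

// buildGraph pretends to construct a compute graph for n tokens and
// returns the buffer space that graph would need. The 512-bytes-per-token
// figure is invented for the example.
func buildGraph(c *Context, nTokens int) int { return nTokens * 512 }

// reserve grows the context's buffers to at least size bytes, mimicking a
// backend that reallocates to fit the largest graph seen so far.
func (c *Context) reserve(size int) {
	if size > c.reserved {
		c.reserved = size
	}
}

// reserveWorstCase builds a graph for the maximum batch once at load time,
// so all allocation happens before serving begins. This is the step the
// patch comments out.
func reserveWorstCase(c *Context, maxBatch int) {
	c.reserve(buildGraph(c, maxBatch))
}

func main() {
	c := &Context{}
	defer c.Close()

	// Worst-case path: one allocation up front, sized for the largest batch.
	reserveWorstCase(c, 2048)
	fmt.Println("preallocated bytes:", c.reserved)

	// Incremental path (what the patch falls back to): reserve per batch,
	// growing only when a larger graph arrives.
	c2 := &Context{}
	for _, n := range []int{8, 512, 2048} {
		c2.reserve(buildGraph(c2, n))
		fmt.Println("after batch of", n, "tokens:", c2.reserved, "bytes reserved")
	}
}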