Mirror of https://github.com/ollama/ollama.git, synced 2025-05-11 02:16:36 +02:00
ollamarunner: Re-enable worst case graph preallocation.
Worst case graph preallocation was disabled by a27462b
"ollamarunner: Temporarily disable worst case graph preallocation"
since it caused crashes with large batches when not using the GPU.
This backports upstream llama.cpp commit f057808
"ggml: Don't assert fail when tensor data changes (#13222)", which
fixes the underlying bug and allows reverting the previous workaround.
commit c2f5d6662b
parent 57fb759f3c
3 changed files with 46 additions and 7 deletions
New patch file (backport of upstream llama.cpp commit f057808):

@@ -0,0 +1,38 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)

The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
  pointer
- Build and allocate a new graph where that data is NULL

Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed

This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
 ggml/src/ggml-alloc.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
     size_t node_size = 0;
     if (!node->data && !node->view_src) {
-        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+        // If we previously had data but don't now then reallocate
+        if (talloc->buffer_id < 0) {
+            return false;
+        }
         node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     }
     return talloc->size_max >= node_size;
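The scenario described in the patch message above is easier to follow with a concrete example. Below is a minimal, self-contained C sketch of the revalidation decision, not the real ggml-alloc code: tensor_alloc_model, tensor_model, and node_alloc_still_valid are simplified stand-ins for ggml's tensor_alloc, ggml_tensor, and ggml_gallocr_node_needs_realloc. It models a tensor that carried a non-NULL data pointer when the first graph was reserved (so no buffer was ever recorded and buffer_id stays -1) and then reappears with data == NULL in a later graph. The old code tripped the GGML_ASSERT at that point; with the fix, the check reports the reservation as invalid and the caller falls back to a full reallocation pass.

// Simplified model of the revalidation path in ggml-alloc (illustrative only;
// these struct fields and names are stand-ins, not the real ggml API).
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct tensor_alloc_model {
    int    buffer_id; // -1: the allocator never assigned a buffer (the tensor had data at reserve time)
    size_t size_max;  // bytes reserved for this node during the last reserve pass
};

struct tensor_model {
    void  *data;      // non-NULL means the tensor already owns or borrows memory
    size_t nbytes;    // size the allocator would need if it has to place this tensor
};

// Same polarity as ggml_gallocr_node_needs_realloc: despite the name, the real
// function returns true when the existing reservation is still usable.
static bool node_alloc_still_valid(const struct tensor_model *node,
                                   const struct tensor_alloc_model *talloc) {
    size_t node_size = 0;
    if (node->data == NULL) { // the allocator has to place this tensor
        // Old behaviour: assert(talloc->buffer_id >= 0), which aborts in this scenario.
        // Fixed behaviour: a previous graph supplied data here, so nothing was reserved;
        // report the reservation as invalid so the caller reallocates from scratch.
        if (talloc->buffer_id < 0) {
            return false;
        }
        node_size = node->nbytes;
    }
    return talloc->size_max >= node_size;
}

int main(void) {
    // Graph 1: the tensor arrived with a data pointer, so reserve recorded no buffer.
    struct tensor_alloc_model talloc = { .buffer_id = -1, .size_max = 0 };

    // Graph 2: the "same" node now has data == NULL and must be allocated.
    struct tensor_model node = { .data = NULL, .nbytes = 4096 };

    if (!node_alloc_still_valid(&node, &talloc)) {
        printf("reservation invalid -> run a full reallocation pass\n");
    }
    return 0;
}

Returning false here is what lets the worst-case graph be re-enabled safely: instead of asserting, the allocator treats the stale reservation as invalid and performs the full reallocation pass that the patch message describes.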
ml/backend/ggml/ggml/src/ggml-alloc.c (vendored): 5 changes

@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
     size_t node_size = 0;
     if (!node->data && !node->view_src) {
-        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+        // If we previously had data but don't now then reallocate
+        if (talloc->buffer_id < 0) {
+            return false;
+        }
         node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     }
     return talloc->size_max >= node_size;
ollamarunner (Go):

@@ -715,9 +715,7 @@ func (m *multiLPath) String() string {
 	return strings.Join(*m, ", ")
 }
 
-// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
-// to the GPU
-/*func (s *Server) reserveWorstCaseGraph() error {
+func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()
 
@@ -760,7 +758,7 @@ func (m *multiLPath) String() string {
 	}
 
 	return nil
-}*/
+}
 
 func (s *Server) loadModel(
 	ctx context.Context,
@@ -797,10 +795,10 @@ func (s *Server) loadModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
 
-	/*err = s.reserveWorstCaseGraph()
+	err = s.reserveWorstCaseGraph()
 	if err != nil {
 		panic(err)
-	}*/
+	}
 
 	s.status = llm.ServerStatusReady
 	s.ready.Done()