diff --git a/llama/patches/0018-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch b/llama/patches/0018-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch
new file mode 100644
index 000000000..b3424c9ef
--- /dev/null
+++ b/llama/patches/0018-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch
@@ -0,0 +1,38 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Thu, 1 May 2025 13:46:10 -0700
+Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
+
+The following scenario will cause an assertion failure in the graph
+allocator:
+ - Build and allocate a graph containing a tensor with a non-NULL data
+   pointer
+ - Build and allocate a new graph where that data is NULL
+
+Result:
+ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
+
+This happens during revalidation because we think that memory should
+have been previously allocated based on the current graph but in
+reality the previous graph was different. In this situation, we
+should do a full reallocation pass.
+---
+ ggml/src/ggml-alloc.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
+index a3d3f690..5fd379f6 100644
+--- a/ggml/src/ggml-alloc.c
++++ b/ggml/src/ggml-alloc.c
+@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+     size_t node_size = 0;
+     if (!node->data && !node->view_src) {
+-        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
++        // If we previously had data but don't now then reallocate
++        if (talloc->buffer_id < 0) {
++            return false;
++        }
+         node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
+     }
+     return talloc->size_max >= node_size;
diff --git a/ml/backend/ggml/ggml/src/ggml-alloc.c b/ml/backend/ggml/ggml/src/ggml-alloc.c
index a3d3f6901..5fd379f6a 100644
--- a/ml/backend/ggml/ggml/src/ggml-alloc.c
+++ b/ml/backend/ggml/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
     size_t node_size = 0;
     if (!node->data && !node->view_src) {
-        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+        // If we previously had data but don't now then reallocate
+        if (talloc->buffer_id < 0) {
+            return false;
+        }
         node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     }
     return talloc->size_max >= node_size;
diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go
index 9597f7cdd..3e0bb34ec 100644
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -715,9 +715,7 @@ func (m *multiLPath) String() string {
 	return strings.Join(*m, ", ")
 }
 
-// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
-// to the GPU
-/*func (s *Server) reserveWorstCaseGraph() error {
+func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()
 
@@ -760,7 +758,7 @@ func (m *multiLPath) String() string {
 	}
 
 	return nil
-}*/
+}
 
 func (s *Server) loadModel(
 	ctx context.Context,
@@ -797,10 +795,10 @@ func (s *Server) loadModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
 
-	/*err = s.reserveWorstCaseGraph()
+	err = s.reserveWorstCaseGraph()
 	if err != nil {
 		panic(err)
-	}*/
+	}
 
 	s.status = llm.ServerStatusReady
 	s.ready.Done()
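
For context (not part of the change itself): a minimal C sketch of the failing sequence described in the commit message, assuming the CPU backend. The helper build() and the constants here are illustrative; header names such as ggml-cpu.h vary between ggml versions.

// Repro sketch: allocate a graph whose tensors already have data, then a
// topologically identical graph whose tensors have NULL data.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-cpu.h"

// Builds a one-node graph (scale) over a single F32 tensor.
static struct ggml_cgraph * build(struct ggml_context * ctx) {
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_scale(ctx, t, 2.0f));
    return gf;
}

int main(void) {
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    // Graph 1: no_alloc = false, so tensors carry non-NULL data pointers and
    // the allocator records them as pre-allocated (buffer_id == -1).
    struct ggml_init_params p1 = { 1024 * 1024, NULL, false };
    struct ggml_context * ctx1 = ggml_init(p1);
    ggml_gallocr_alloc_graph(galloc, build(ctx1));

    // Graph 2: same topology, but no_alloc = true, so data is NULL. During
    // revalidation the recorded buffer_id is still -1; before the patch this
    // tripped GGML_ASSERT(talloc->buffer_id >= 0), after it the allocator
    // reports that a full reallocation pass is needed and recovers.
    struct ggml_init_params p2 = { 1024 * 1024, NULL, true };
    struct ggml_context * ctx2 = ggml_init(p2);
    ggml_gallocr_alloc_graph(galloc, build(ctx2));

    ggml_free(ctx2);
    ggml_free(ctx1);
    ggml_gallocr_free(galloc);
    return 0;
}

This is also why the runner.go change is safe to re-enable: reserveWorstCaseGraph() pre-reserves memory against a worst-case graph, and later real graphs may differ in exactly the way the repro shows, which the patched ggml_gallocr_node_needs_realloc now handles by signaling a reallocation instead of aborting.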