Mirror of https://github.com/ollama/ollama.git, synced 2025-05-11 02:16:36 +02:00
ollamarunner: Re-enable worst case graph preallocation.
Worst case graph preallocation was disabled by a27462b
"ollamarunner: Temporarily disable worst case graph preallocation"
since it caused crashes with large batches when not using the GPU.
This backports upstream llama.cpp commit f057808
"ggml: Don't assert fail when tensor data changes (#13222)", which
fixes the underlying bug and allows reverting the previous workaround.
commit c2f5d6662b
parent 57fb759f3c
3 changed files with 46 additions and 7 deletions
New patch file (backport of upstream llama.cpp commit f057808):

@@ -0,0 +1,38 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)

The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
  pointer
- Build and allocate a new graph where that data is NULL

Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed

This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
 ggml/src/ggml-alloc.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
     size_t node_size = 0;
     if (!node->data && !node->view_src) {
-        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+        // If we previously had data but don't now then reallocate
+        if (talloc->buffer_id < 0) {
+            return false;
+        }
         node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     }
     return talloc->size_max >= node_size;
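The scenario described in the patch message above is easier to follow with a concrete example. Below is a minimal, self-contained C sketch of the revalidation decision, not the real ggml-alloc code: tensor_alloc_model, tensor_model, and node_alloc_still_valid are simplified stand-ins for ggml's tensor_alloc, ggml_tensor, and ggml_gallocr_node_needs_realloc. It models a tensor that carried a non-NULL data pointer when the first graph was reserved (so no buffer was ever recorded and buffer_id stays -1) and then reappears with data == NULL in a later graph. The old code tripped the GGML_ASSERT at that point; with the fix, the check reports the reservation as invalid and the caller falls back to a full reallocation pass.

// Simplified model of the revalidation path in ggml-alloc (illustrative only;
// these struct fields and names are stand-ins, not the real ggml API).
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct tensor_alloc_model {
    int    buffer_id; // -1: the allocator never assigned a buffer (the tensor had data at reserve time)
    size_t size_max;  // bytes reserved for this node during the last reserve pass
};

struct tensor_model {
    void  *data;      // non-NULL means the tensor already owns or borrows memory
    size_t nbytes;    // size the allocator would need if it has to place this tensor
};

// Same polarity as ggml_gallocr_node_needs_realloc: despite the name, the real
// function returns true when the existing reservation is still usable.
static bool node_alloc_still_valid(const struct tensor_model *node,
                                   const struct tensor_alloc_model *talloc) {
    size_t node_size = 0;
    if (node->data == NULL) { // the allocator has to place this tensor
        // Old behaviour: assert(talloc->buffer_id >= 0), which aborts in this scenario.
        // Fixed behaviour: a previous graph supplied data here, so nothing was reserved;
        // report the reservation as invalid so the caller reallocates from scratch.
        if (talloc->buffer_id < 0) {
            return false;
        }
        node_size = node->nbytes;
    }
    return talloc->size_max >= node_size;
}

int main(void) {
    // Graph 1: the tensor arrived with a data pointer, so reserve recorded no buffer.
    struct tensor_alloc_model talloc = { .buffer_id = -1, .size_max = 0 };

    // Graph 2: the "same" node now has data == NULL and must be allocated.
    struct tensor_model node = { .data = NULL, .nbytes = 4096 };

    if (!node_alloc_still_valid(&node, &talloc)) {
        printf("reservation invalid -> run a full reallocation pass\n");
    }
    return 0;
}

Returning false here is what lets the worst-case graph be re-enabled safely: instead of asserting, the allocator treats the stale reservation as invalid and performs the full reallocation pass that the patch message describes.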
ml/backend/ggml/ggml/src/ggml-alloc.c (vendored): 5 changes

@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
     size_t node_size = 0;
     if (!node->data && !node->view_src) {
-        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+        // If we previously had data but don't now then reallocate
+        if (talloc->buffer_id < 0) {
+            return false;
+        }
         node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     }
     return talloc->size_max >= node_size;
ollamarunner (Go):

@@ -715,9 +715,7 @@ func (m *multiLPath) String() string {
 	return strings.Join(*m, ", ")
 }
 
-// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
-// to the GPU
-/*func (s *Server) reserveWorstCaseGraph() error {
+func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()
 
@@ -760,7 +758,7 @@ func (m *multiLPath) String() string {
 	}
 
 	return nil
-}*/
+}
 
 func (s *Server) loadModel(
 	ctx context.Context,
@@ -797,10 +795,10 @@ func (s *Server) loadModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
 
-	/*err = s.reserveWorstCaseGraph()
+	err = s.reserveWorstCaseGraph()
 	if err != nil {
 		panic(err)
-	}*/
+	}
 
 	s.status = llm.ServerStatusReady
 	s.ready.Done()