mirror of
https://github.com/ollama/ollama.git
synced 2025-05-10 18:06:33 +02:00
ollamarunner: Temporarily disable worst case graph preallocation
When a large batch later runs purely on the CPU, this results in the error: GGML_ASSERT(talloc->buffer_id >= 0) Disabling this means that we will incrementally reallocate memory as the graph grows. Fixes #10410
This commit is contained in:
parent
6bf0b8193a
commit
a27462b708
1 changed files with 6 additions and 4 deletions
|
@ -723,7 +723,9 @@ func (m *multiLPath) String() string {
|
|||
return strings.Join(*m, ", ")
|
||||
}
|
||||
|
||||
func (s *Server) reserveWorstCaseGraph() error {
|
||||
// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
|
||||
// to the GPU
|
||||
/*func (s *Server) reserveWorstCaseGraph() error {
|
||||
ctx := s.model.Backend().NewContext()
|
||||
defer ctx.Close()
|
||||
|
||||
|
@ -766,7 +768,7 @@ func (s *Server) reserveWorstCaseGraph() error {
|
|||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
}*/
|
||||
|
||||
func (s *Server) loadModel(
|
||||
ctx context.Context,
|
||||
|
@ -803,10 +805,10 @@ func (s *Server) loadModel(
|
|||
s.seqs = make([]*Sequence, s.parallel)
|
||||
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
|
||||
|
||||
err = s.reserveWorstCaseGraph()
|
||||
/*err = s.reserveWorstCaseGraph()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}*/
|
||||
|
||||
s.status = llm.ServerStatusReady
|
||||
s.ready.Done()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue