ollamarunner: Preallocate worst case graph at startup
Currently, the KV cache and graph are lazily allocated as needed. The cache is fully allocated on first use of the corresponding layer, whereas the graph grows with the size of the context.

This can be an issue if another application allocates more VRAM after we do our calculations: Ollama will crash in the middle of inference. If we instead allocate the maximum needed memory at startup of the runner, we will either succeed or fail at that point rather than at some surprising time in the future.

Currently, this only generates a worst-case batch for text, which means that vision models may get a partial allocation and continue to lazily allocate the rest.
This commit is contained in:
parent a807985e59
commit dbb149e6f7

10 changed files with 156 additions and 55 deletions
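Conceptually, the change moves allocation from "first real request" to "runner startup". A minimal sketch of the idea in Go, assuming the reserve flag on StartForward and the Context.Reserve method introduced in the diff below; the preallocate helper, its parameters, and the import paths are illustrative, not the actual runner code:

	package kvcache

	import (
		"github.com/ollama/ollama/ml"
		"github.com/ollama/ollama/model/input"
	)

	// preallocate is a hypothetical helper: it runs one worst-case forward
	// pass in "reserve" mode so the cache and graph are fully allocated at
	// startup. If VRAM is insufficient, we fail here rather than mid-inference.
	func preallocate(backend ml.Backend, cache Cache, numCtx, batchSize int) error {
		ctx := backend.NewContext()
		defer ctx.Close()

		// Worst-case text batch: a full batch of tokens at the largest
		// positions the context window allows, all on one sequence.
		batch := input.Batch{
			Positions: make([]int32, batchSize),
			Sequences: make([]int, batchSize),
		}
		for i := range batch.Positions {
			batch.Positions[i] = int32(numCtx - batchSize + i)
		}

		// reserve=true: size the cache for this batch without storing real data.
		if err := cache.StartForward(ctx, batch, true); err != nil {
			return err
		}

		// Allocate the worst-case graph now; success here means later
		// inference cannot fail on this allocation.
		return ctx.Reserve()
	}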
@@ -281,7 +281,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
 	context := backend.NewContext()
 	defer context.Close()
 
-	err := cache.StartForward(context, input.Batch{Positions: test.pos, Sequences: test.seqs})
+	err := cache.StartForward(context, input.Batch{Positions: test.pos, Sequences: test.seqs}, false)
 	if err != nil {
 		panic(err)
 	}

@@ -315,7 +315,7 @@ func TestCanResume(t *testing.T) {
 	err := cache.StartForward(context, input.Batch{
 		Positions: []int32{0, 1, 2, 3},
 		Sequences: []int{0, 0, 0, 0},
-	})
+	}, false)
 	if err != nil {
 		t.Fatalf("StartForward failed: %v", err)
 	}

@@ -342,7 +342,7 @@ func TestCanResume(t *testing.T) {
 	err = cache.StartForward(context, input.Batch{
 		Positions: []int32{4, 5},
 		Sequences: []int{0, 0},
-	})
+	}, false)
 	if err != nil {
 		t.Fatalf("StartForward failed: %v", err)
 	}

@@ -440,6 +440,8 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
 
 func (c *testContext) Compute(...ml.Tensor) {}
 
+func (c *testContext) Reserve() error { return nil }
+
 func (c *testContext) MaxGraphNodes() int {
 	return 10
 }
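For existing call sites, the visible change is the new reserve argument: the updated tests pass false to keep the old behavior. From the testContext stubs above, the ml.Context interface presumably gains a Reserve method; a sketch of the affected surface, inferred only from what the hunks show (other methods elided):

	// Assumed shape after this commit, inferred from the testContext stubs;
	// only methods visible in the diff are shown.
	type Context interface {
		Forward(...Tensor) Context
		Compute(...Tensor)
		Reserve() error // new: allocate the worst-case graph up front
		MaxGraphNodes() int
		Close()
	}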