Mirror of https://github.com/ollama/ollama.git (synced 2025-05-11 18:36:41 +02:00)
kvcache: Pass granular cache size into implementations
Currently the runner computes the KV cache size needed and creates a cache of that size: the context size times the number of parallel sequences. Cache implementations can make better decisions about their memory usage, so instead pass in the required capacity, the number of sequences, and the maximum batch size. For now, the causal cache just uses these to compute the size in the same way as before.
parent 6d1103048e
commit 3ed7ad3ab3
7 changed files with 44 additions and 32 deletions
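The diff below covers only the test-file updates, but the new call sites imply the shape of the revised API: Init now takes three sizes instead of one precomputed KV size. A minimal sketch of what the implementation side might look like, assuming the parameter names maxSequences, capacity, and maxBatch (they are not shown in this view), with a stand-in cacheCell type:

package kvcache

import "github.com/ollama/ollama/ml"

// Stand-in for the causal cache; only the fields this sketch needs.
type Causal struct {
	dtype    ml.DType
	maxBatch int
	cells    []cacheCell
}

type cacheCell struct{}

// Init receives the granular sizes instead of a single precomputed KV size.
// Parameter names here are assumptions; the commit view only shows the calls.
func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
	c.dtype = dtype
	c.maxBatch = maxBatch

	// Per the commit message, the causal cache currently computes the total
	// size the same way the runner used to: per-sequence capacity times the
	// number of parallel sequences.
	cacheSize := maxSequences * capacity
	c.cells = make([]cacheCell, cacheSize)
}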
@@ -25,7 +25,7 @@ func TestStore(t *testing.T) {
 	cache := NewCausalCache(nil)
 	defer cache.Close()
 
-	cache.Init(backend, ml.DTypeF16, 16)
+	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
 
 	tests := []testCase{
 		{
@@ -58,7 +58,7 @@ func TestSWA(t *testing.T) {
 	cache := NewSWACache(1, nil)
 	defer cache.Close()
 
-	cache.Init(backend, ml.DTypeF32, 16)
+	cache.Init(backend, ml.DTypeF32, 1, 16, 16)
 
 	tests := []testCase{
 		{
@@ -81,7 +81,7 @@ func TestSequences(t *testing.T) {
 	cache := NewCausalCache(nil)
 	defer cache.Close()
 
-	cache.Init(backend, ml.DTypeF16, 16)
+	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
 
 	tests := []testCase{
 		{
@@ -116,7 +116,7 @@ func TestRemove(t *testing.T) {
 	})
 	defer cache.Close()
 
-	cache.Init(backend, ml.DTypeF16, 16)
+	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
 
 	tests := []testCase{
 		{
@@ -181,7 +181,7 @@ func TestDefrag(t *testing.T) {
 	})
 	defer cache.Close()
 
-	cache.Init(backend, ml.DTypeF16, 16)
+	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
 
 	tests := []testCase{
 		{
@@ -229,7 +229,7 @@ func TestCopy(t *testing.T) {
 	cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { return key, nil })
 	defer cache.Close()
 
-	cache.Init(backend, ml.DTypeF16, 16)
+	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
 
 	tests := []testCase{
 		{
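Every hunk changes in the same mechanical way: the single size argument 16 becomes (1, 16, 16), which under the assumed parameter order reads as one sequence, a per-sequence capacity of 16, and a maximum batch of 16, so the resulting cache size is unchanged from the old calls.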