Mirror of https://github.com/ollama/ollama.git (synced 2025-05-11 10:26:53 +02:00)

Merge pull request #9703 from ollama/mxyng/gemma3-memory

count gemma3 vision tensors

Commit: 4ea4d2b189 (2 changed files with 39 additions and 26 deletions)
fs/ggml/ggml.go

@@ -583,39 +583,52 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO
 }
 
 func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
+    if llm.KV().Uint("vision.block_count") == 0 {
+        return
+    }
+
+    for name, layer := range llm.Tensors().GroupLayers() {
+        if name == "v" || strings.HasPrefix(name, "v.") {
+            for _, tensor := range layer {
+                weights += tensor.Size()
+            }
+        }
+    }
+
+    imageSize := uint64(llm.KV().Uint("vision.image_size"))
+    patchSize := uint64(llm.KV().Uint("vision.patch_size"))
+    if patchSize == 0 {
+        slog.Warn("unknown patch size for vision model")
+        return
+    }
+
+    numChannels := uint64(llm.KV().Uint("vision.num_channels"))
+
+    numPatches := (imageSize / patchSize) * (imageSize / patchSize)
+    if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
+        numPatches++
+    }
+
+    headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
+    embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
+
     switch llm.KV().Architecture() {
     case "mllama":
-        for _, layer := range llm.Tensors().GroupLayers()["v"] {
-            weights += layer.Size()
-        }
-
-        kv := func(n string) uint64 {
-            if v, ok := llm.KV()["mllama.vision."+n].(uint32); ok {
-                return uint64(v)
-            }
-
-            return 0
-        }
-
-        imageSize := kv("image_size")
-
-        maxNumTiles := kv("max_num_tiles")
-        embeddingLength := kv("embedding_length")
-        headCount := kv("attention.head_count")
-
-        numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
-        if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
-            numPatches++
-        }
 
         numPaddedPatches := numPatches + 8 - (numPatches%8)%8
 
+        maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))
+
         graphSize = 4 * (8 +
-            imageSize*imageSize*kv("num_channels")*maxNumTiles +
+            imageSize*imageSize*numChannels*maxNumTiles +
             embeddingLength*numPatches*maxNumTiles +
             9*embeddingLength*numPaddedPatches*maxNumTiles +
             numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
+    case "gemma3":
+        graphSize = 4 * (imageSize*imageSize*numChannels +
+            embeddingLength*patchSize +
+            numPatches*numPatches*headCount)
     }
 
     return weights, graphSize
 }
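The new gemma3 branch estimates the compute graph in bytes directly from the vision hyperparameters. A minimal standalone sketch of that arithmetic, using illustrative values in the style of Gemma 3's SigLIP-derived encoder (image size 896, patch size 14, and so on; the real numbers come from the model's GGUF KV metadata such as vision.image_size, not from this sketch):

package main

import "fmt"

func main() {
    // Illustrative hyperparameters only; VisionGraphSize reads the real
    // values from GGUF keys like vision.image_size and vision.patch_size.
    const (
        imageSize       uint64 = 896
        patchSize       uint64 = 14
        numChannels     uint64 = 3
        embeddingLength uint64 = 1152
        headCount       uint64 = 16
    )

    // 896/14 = 64 patches per side, so 4096 patches per image.
    numPatches := (imageSize / patchSize) * (imageSize / patchSize)

    // Same expression as the gemma3 case above: 4 bytes per f32 element
    // for the input image, the patch embeddings, and the attention scores.
    graphSize := 4 * (imageSize*imageSize*numChannels +
        embeddingLength*patchSize +
        numPatches*numPatches*headCount)

    fmt.Printf("estimated gemma3 vision graph: %d bytes (~%.2f GiB)\n",
        graphSize, float64(graphSize)/(1<<30))
}

With these values the attention-score term (numPatches² × headCount) dominates, which is why an unbudgeted vision graph of roughly a gibibyte matters when scheduling layers onto GPUs.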
llm/memory.go

@@ -218,8 +218,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
         if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
             layerSize = blk.Size()
             layerSize += kv / f.KV().BlockCount()
+
+            memoryWeights += blk.Size()
         }
-        memoryWeights += layerSize
-
         if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
             // Stop allocating on GPU(s) once we hit the users target NumGPU
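In this hunk, layerSize is the per-layer footprint the scheduler has to place on a GPU (weights plus that layer's share of the KV cache), while memoryWeights is meant to count weights alone; the old memoryWeights += layerSize therefore also counted each layer's KV-cache share as weights. A toy re-run of the accounting, with entirely hypothetical sizes:

package main

import "fmt"

func main() {
    // Hypothetical sizes, not taken from any real model.
    const (
        blockCount uint64 = 32        // repeating transformer layers
        blkSize    uint64 = 512 << 20 // weights per layer: 512 MiB
        kvTotal    uint64 = 8 << 30   // KV cache for the whole model: 8 GiB
    )

    var memoryWeights, footprint uint64
    for i := uint64(0); i < blockCount; i++ {
        layerSize := blkSize + kvTotal/blockCount // what must fit per layer
        footprint += layerSize
        memoryWeights += blkSize // weights only, as in the added line above
    }

    // 16384 MiB of weights vs. a 24576 MiB placement footprint; the old
    // memoryWeights += layerSize would have reported the latter as weights.
    fmt.Printf("weights: %d MiB, placement footprint: %d MiB\n",
        memoryWeights>>20, footprint>>20)
}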
@@ -376,7 +376,7 @@ func (m MemoryEstimate) LogValue() slog.Value {
             // memory of the weights
             "total", format.HumanBytes2(m.memoryWeights),
             // memory of repeating layers
-            "repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
+            "repeating", format.HumanBytes2(m.memoryWeights),
             // memory of non-repeating layers
             "nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
         ),
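A minimal sketch of how this log grouping reads after the change, using a pared-down stand-in for MemoryEstimate (struct name, field values, and the log message here are hypothetical; only the field names and grouping come from the diff). Since the memory.go hunk above makes memoryWeights count only the repeating blk.* layers, "repeating" no longer needs to subtract the output layer:

package main

import (
    "log/slog"
    "os"
)

// estimate is a pared-down stand-in for ollama's MemoryEstimate, keeping
// only the two fields this commit touches.
type estimate struct {
    memoryWeights     uint64 // repeating-layer weights, per the hunk above
    memoryLayerOutput uint64 // non-repeating output layer
}

// LogValue mirrors the grouping in the hunk above: "repeating" is now
// memoryWeights itself rather than memoryWeights-memoryLayerOutput.
func (e estimate) LogValue() slog.Value {
    return slog.GroupValue(
        slog.Group("weights",
            "total", e.memoryWeights,
            "repeating", e.memoryWeights,
            "nonrepeating", e.memoryLayerOutput,
        ),
    )
}

func main() {
    logger := slog.New(slog.NewTextHandler(os.Stderr, nil))
    logger.Info("offload to gpu", "estimate", estimate{
        memoryWeights:     16 << 30, // hypothetical: 16 GiB across blk.* layers
        memoryLayerOutput: 1 << 30,  // hypothetical: 1 GiB output layer
    })
}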