mirror of
https://github.com/ollama/ollama.git
synced 2025-05-10 18:06:33 +02:00
memory
This commit is contained in:
parent
7ba9fa9c7d
commit
f0ad49ea17
1 changed file with 5 additions and 2 deletions
|
@@ -430,7 +430,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||
}
|
||||
|
||||
switch f.KV().Architecture() {
|
||||
case "llama":
|
||||
case "llama", "llama4":
|
||||
fullOffload = max(
|
||||
4*batch*(1+4*embedding+context*(1+heads)),
|
||||
4*batch*(embedding+vocab),
|
||||
|
@@ -444,7 +444,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||
|
||||
if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
|
||||
// mixtral 8x22b
|
||||
ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
|
||||
ff := uint64(f.KV().Uint("feed_forward_length"))
|
||||
partialOffload = max(
|
||||
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
|
||||
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
|
||||
|
@@ -640,6 +640,9 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
|||
graphSize = 4 * (imageSize*imageSize*numChannels +
|
||||
embeddingLength*patchSize +
|
||||
numPatches*numPatches*headCount)
|
||||
case "llama4":
|
||||
// vision graph is computed independently in the same schedule
|
||||
// and is negligible compared to the worst case text graph
|
||||
}
|
||||
|
||||
return weights, graphSize
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue