Mirror of https://github.com/ollama/ollama.git (synced 2025-05-10 18:06:33 +02:00).
* Move quantization logic to GGML via a new backend. This moves the model-aware logic into Go code and calls GGML's quantization code for model creation. * Remove "add model quantizations"; this is no longer needed now that quantization is implemented directly in the Go+GGML code.
169 lines · 5.5 KiB · Go
package convert
|
|
|
|
import (
|
|
"slices"
|
|
"strings"
|
|
|
|
"github.com/pdevine/tensor"
|
|
"github.com/pdevine/tensor/native"
|
|
|
|
"github.com/ollama/ollama/fs/ggml"
|
|
)
|
|
|
|
// llama4Model converts a Llama 4 checkpoint to GGUF. The JSON tags
// mirror the keys of the upstream config file; the struct embeds
// ModelParameters for the settings shared by all converters.
type llama4Model struct {
	ModelParameters
	// TextModel is the language-model section ("text_config"). It embeds
	// llamaModel and extends it with Llama 4's mixture-of-experts and
	// attention settings.
	TextModel struct {
		llamaModel
		NumExpertsPerToken     uint32 `json:"num_experts_per_tok"`
		NumLocalExperts        uint32 `json:"num_local_experts"`
		InterleaveMOELayerStep uint32 `json:"interleave_moe_layer_step"`
		UseQKNorm              bool   `json:"use_qk_norm"`
		IntermediateSizeMLP    uint32 `json:"intermediate_size_mlp"`
		AttentionChunkSize     uint32 `json:"attention_chunk_size"`
	} `json:"text_config"`
	// VisionModel is the vision-tower section ("vision_config").
	VisionModel struct {
		NumHiddenLayers   uint32  `json:"num_hidden_layers"`
		HiddenSize        uint32  `json:"hidden_size"`
		IntermediateSize  uint32  `json:"intermediate_size"`
		NumAttentionHeads uint32  `json:"num_attention_heads"`
		ImageSize         uint32  `json:"image_size"`
		PatchSize         uint32  `json:"patch_size"`
		RopeTheta         float32 `json:"rope_theta"`
		NormEpsilon       float32 `json:"norm_eps"`
		PixelShuffleRatio float32 `json:"pixel_shuffle_ratio"`
	} `json:"vision_config"`
}
// KV implements ModelConverter.
|
|
func (p *llama4Model) KV(t *Tokenizer) ggml.KV {
|
|
kv := p.ModelParameters.KV(t)
|
|
kv["general.architecture"] = "llama4"
|
|
|
|
for k, v := range p.TextModel.KV(t) {
|
|
if strings.HasPrefix(k, "llama.") {
|
|
kv[strings.ReplaceAll(k, "llama.", "llama4.")] = v
|
|
}
|
|
}
|
|
|
|
kv["llama4.feed_forward_length"] = p.TextModel.IntermediateSizeMLP
|
|
kv["llama4.expert_feed_forward_length"] = p.TextModel.IntermediateSize
|
|
|
|
kv["llama4.expert_count"] = p.TextModel.NumLocalExperts
|
|
kv["llama4.expert_used_count"] = p.TextModel.NumExpertsPerToken
|
|
kv["llama4.interleave_moe_layer_step"] = p.TextModel.InterleaveMOELayerStep
|
|
kv["llama4.use_qk_norm"] = p.TextModel.UseQKNorm
|
|
kv["llama4.attention.chunk_size"] = p.TextModel.AttentionChunkSize
|
|
|
|
kv["llama4.vision.block_count"] = p.VisionModel.NumHiddenLayers
|
|
kv["llama4.vision.embedding_length"] = p.VisionModel.HiddenSize
|
|
kv["llama4.vision.feed_forward_length"] = p.VisionModel.IntermediateSize
|
|
kv["llama4.vision.attention.head_count"] = p.VisionModel.NumAttentionHeads
|
|
kv["llama4.vision.image_size"] = p.VisionModel.ImageSize
|
|
kv["llama4.vision.patch_size"] = p.VisionModel.PatchSize
|
|
kv["llama4.vision.rope.freq_base"] = p.VisionModel.RopeTheta
|
|
kv["llama4.vision.layer_norm_epsilon"] = p.VisionModel.NormEpsilon
|
|
kv["llama4.vision.pixel_shuffle_ratio"] = p.VisionModel.PixelShuffleRatio
|
|
return kv
|
|
}
|
|
|
|
// Replacements implements ModelConverter.
|
|
func (p *llama4Model) Replacements() []string {
|
|
return append(
|
|
p.TextModel.Replacements(),
|
|
"language_model.", "",
|
|
"vision_model", "v",
|
|
"multi_modal_projector", "mm",
|
|
"feed_forward.down_proj", "ffn_down",
|
|
"feed_forward.up_proj", "ffn_up",
|
|
"feed_forward.gate_proj", "ffn_gate",
|
|
"feed_forward.", "ffn_",
|
|
"shared_expert.down_proj", "down_shexp",
|
|
"shared_expert.gate_proj", "gate_shexp",
|
|
"shared_expert.up_proj", "up_shexp",
|
|
"experts.down_proj", "down_exps.weight",
|
|
"experts.gate_up_proj", "gate_up_exps.weight",
|
|
"router", "gate_inp",
|
|
"patch_embedding.linear", "patch_embedding",
|
|
)
|
|
}
|
|
|
|
// Tensors implements ModelConverter. Vision ("v.") and projector ("mm.")
// tensors pass through unchanged; the fused expert gate/up projector is
// split into separate gate and up tensors; expert down projectors get
// their trailing dimensions transposed; everything else is delegated to
// the embedded text model converter.
func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor {
	var out []*ggml.Tensor

	var textTensors []Tensor
	for _, t := range ts {
		if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
			// Vision and multimodal-projector tensors are copied as-is.
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    t.Shape(),
				WriterTo: t,
			})
		} else if strings.Contains(t.Name(), "ffn_gate_up_exps") {
			// gate and up projectors are fused
			// dims[1], dims[2] must be swapped
			// [experts, hidden_size, intermediate_size * 2] --> [experts, intermediate_size, hidden_size]
			halfDim := int(t.Shape()[2]) / 2

			newShape := slices.Clone(t.Shape())
			newShape[1], newShape[2] = newShape[2]/2, newShape[1]
			// Emit two tensors from the one fused source: the first half
			// of dim 2 becomes ffn_gate_exps, the second half ffn_up_exps.
			for i, name := range []string{"ffn_gate_exps", "ffn_up_exps"} {
				// clone tensor since we need separate repackers
				tt := t.Clone()
				tt.SetRepacker(p.repack(nil, nil, tensor.S(i*halfDim, (i+1)*halfDim)))
				out = append(out, &ggml.Tensor{
					Name:     strings.ReplaceAll(tt.Name(), "ffn_gate_up_exps", name),
					Kind:     tt.Kind(),
					Shape:    newShape,
					WriterTo: tt,
				})
			}
		} else if strings.Contains(t.Name(), "ffn_down_exps") {
			// dims[1], dims[2] must be swapped
			// [experts, intermediate_size, hidden_size] --> [experts, hidden_size, intermediate_size]
			t.SetRepacker(p.repack())
			newShape := slices.Clone(t.Shape())
			newShape[1], newShape[2] = newShape[2], newShape[1]
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    newShape,
				WriterTo: t,
			})
		} else {
			// Everything else belongs to the text model.
			textTensors = append(textTensors, t)
		}
	}

	// The expert tensors were already repacked above; tell the text
	// converter not to repack again.
	p.TextModel.skipRepack = true
	out = append(out, p.TextModel.Tensors(textTensors)...)
	return out
}
func (p *llama4Model) repack(slice ...tensor.Slice) Repacker {
|
|
return func(name string, data []float32, shape []uint64) ([]float32, error) {
|
|
dims := make([]int, len(shape))
|
|
for i, dim := range shape {
|
|
dims[i] = int(dim)
|
|
}
|
|
|
|
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
|
t, err := t.Slice(slice...)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err := t.T(0, 2, 1); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
t = tensor.Materialize(t)
|
|
// flatten tensor so it can be return as a vector
|
|
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return native.VectorF32(t.(*tensor.Dense))
|
|
}
|
|
}
|