Mistral is a popular research lab that builds open source models. This patch updates the forward pass of llama-architecture models to support both Llama and Mistral models by accounting for the additional metadata present in Mistral models and finding the correct dimensions for the output projection.
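
The tensor tables this patch adds map each tensor kind to a printf-style name template such as "blk.%d.attn_q"; the loader expands the "%d" with the block (layer) index when looking tensors up in the GGUF file. Below is a minimal sketch of that expansion using simplified stand-in types (the enum, table, and tensor_name helper here are illustrative, not llama.cpp's actual definitions):

#include <cstdio>
#include <map>
#include <string>

// Simplified stand-ins for llama.cpp's llm_tensor enum and the
// per-architecture tensor-name tables in src/llama-arch.cpp.
enum llm_tensor { TOKEN_EMBD, ATTN_Q, FFN_UP };

static const std::map<llm_tensor, const char *> MISTRAL3_TENSOR_NAMES = {
    { TOKEN_EMBD, "token_embd" },    // no layer index
    { ATTN_Q,     "blk.%d.attn_q" }, // %d is filled with the block number
    { FFN_UP,     "blk.%d.ffn_up" },
};

// Expand a template for a given layer and suffix, e.g. "blk.3.attn_q.weight".
static std::string tensor_name(llm_tensor t, int layer, const char * suffix) {
    char buf[256];
    snprintf(buf, sizeof(buf), MISTRAL3_TENSOR_NAMES.at(t), layer);
    return std::string(buf) + "." + suffix;
}

int main() {
    // prints "blk.3.attn_q.weight"
    printf("%s\n", tensor_name(ATTN_Q, 3, "weight").c_str());
    return 0;
}
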
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Patrick Devine <patrick@infrahq.com>
Date: Fri, 14 Mar 2025 16:33:23 -0700
Subject: [PATCH] add model quantizations

- gemma3
- mistral3
---
 src/llama-arch.cpp  | 36 ++++++++++++++++++++++++++++++++++++
 src/llama-arch.h    |  2 ++
 src/llama-model.cpp | 10 ++++++++++
 src/llama-quant.cpp |  4 ++++
 4 files changed, 52 insertions(+)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b6f20286..13a0a988 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -37,6 +37,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MINICPM3, "minicpm3" },
     { LLM_ARCH_GEMMA, "gemma" },
     { LLM_ARCH_GEMMA2, "gemma2" },
+    { LLM_ARCH_GEMMA3, "gemma3" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_XVERSE, "xverse" },
@@ -64,6 +65,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_CHAMELEON, "chameleon" },
     { LLM_ARCH_SOLAR, "solar" },
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_MISTRAL3, "mistral3" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -804,6 +806,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GEMMA3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
@@ -1352,6 +1372,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
         },
     },
+    {
+        LLM_ARCH_MISTRAL3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        }
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
diff --git a/src/llama-arch.h b/src/llama-arch.h
index ec742224..8476ae0a 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -41,6 +41,7 @@ enum llm_arch {
     LLM_ARCH_MINICPM3,
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
+    LLM_ARCH_GEMMA3,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -68,6 +69,7 @@ enum llm_arch {
     LLM_ARCH_CHAMELEON,
     LLM_ARCH_SOLAR,
     LLM_ARCH_WAVTOKENIZER_DEC,
+    LLM_ARCH_MISTRAL3,
     LLM_ARCH_UNKNOWN,
 };
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ab1a07d1..db4f2685 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -878,6 +878,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1274,6 +1277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
             } break;
+        case LLM_ARCH_MISTRAL3: break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
@@ -2537,6 +2541,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                 }
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3531,6 +3538,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
                 output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
             } break;
+        case LLM_ARCH_MISTRAL3: break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -4009,6 +4017,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_SOLAR:
+       case LLM_ARCH_MISTRAL3:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
@@ -4029,6 +4038,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
+       case LLM_ARCH_GEMMA3:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 6eb1da08..ebcbafa1 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -737,6 +737,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
+        // don't quantize vision stuff
+        quantize &= name.find("v.") == std::string::npos;
+        quantize &= name.find("mm.") == std::string::npos;
+
         // quantize only 2D and 3D tensors (experts)
         quantize &= (ggml_n_dims(tensor) >= 2);
 
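
The llama-quant.cpp hunk above is what actually changes quantization behavior: tensor names matching the vision ("v.") and multimodal-projector ("mm.") patterns are left unquantized. A self-contained sketch of that predicate follows; the should_quantize helper is hypothetical, extracted here only for illustration:

#include <cstdio>
#include <string>

// A sketch of the filter built up in llama_model_quantize_impl above.
static bool should_quantize(const std::string & name, int n_dims) {
    // ends with "weight"?
    bool quantize = name.size() >= 6 && name.rfind("weight") == name.size() - 6;

    // don't quantize vision stuff
    quantize &= name.find("v.")  == std::string::npos;
    quantize &= name.find("mm.") == std::string::npos;

    // quantize only 2D and 3D tensors (experts)
    quantize &= n_dims >= 2;
    return quantize;
}

int main() {
    printf("%d\n", should_quantize("blk.0.ffn_up.weight", 2));   // 1: quantized
    printf("%d\n", should_quantize("v.blk.0.attn_q.weight", 2)); // 0: vision tensor
    printf("%d\n", should_quantize("output_norm.weight", 1));    // 0: 1D tensor
    return 0;
}

Since these are plain substring tests rather than anchored prefix checks, any tensor whose name contains "v." or "mm." anywhere is skipped.
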