remove mllama patch

This commit is contained in:
Michael Yang 2025-04-25 15:54:42 -07:00
parent 0d6e35d3c6
commit f8586c6b2b
35 changed files with 37 additions and 2444 deletions

View file

@ -30,12 +30,13 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
rsync -arvzc -f "merge $@/.rsync-filter" $< $@
PATCHES=$(wildcard llama/patches/*.patch)
PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))
.PHONY: apply-patches
.NOTPARALLEL:
apply-patches: $(addsuffix ed, $(PATCHES))
apply-patches: $(PATCHED)
%.patched: %.patch
llama/patches/.%.patched: llama/patches/%.patch
@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
.PHONY: checkout
@ -57,4 +58,4 @@ format-patches: llama/patches
.PHONE: clean
clean: checkout
$(RM) $(addsuffix ed, $(PATCHES))
$(RM) $(PATCHED)

View file

@ -125,6 +125,7 @@ func (kv KV) OllamaEngineRequired() bool {
"gemma3",
"mistral3",
"llama4",
"mllama",
}, kv.Architecture())
}

View file

@ -457,7 +457,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@ -469,7 +469,6 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@ -493,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;

View file

@ -256,7 +256,6 @@ extern "C" {
llama_token * token;
float * embd;
int32_t n_embd;
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@ -359,7 +358,6 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
bool cross_attn; // whether to use cross attention
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@ -461,10 +459,6 @@ extern "C" {
struct llama_context_params params),
"use llama_init_from_model instead");
// TODO (jmorganca): this should most likely be passed in as part of a batch
// and not set on the context for all batches.
LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);

View file

@ -6,7 +6,6 @@
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_MLLAMA, "mllama" },
{ LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
@ -145,7 +144,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@ -275,40 +273,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
{
LLM_ARCH_MLLAMA,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" },
{ LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" },
{ LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" },
{ LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" },
{ LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" },
{ LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" },
{ LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
{ LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" },
},
},
{
LLM_ARCH_DECI,
{
@ -1737,14 +1701,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},

View file

@ -11,7 +11,6 @@
enum llm_arch {
LLM_ARCH_LLAMA,
LLM_ARCH_LLAMA4,
LLM_ARCH_MLLAMA,
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
@ -149,7 +148,6 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@ -351,14 +349,6 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
LLM_TENSOR_CROSS_ATTN_K_NORM,
LLM_TENSOR_CROSS_ATTN_K_PROJ,
LLM_TENSOR_CROSS_ATTN_O_PROJ,
LLM_TENSOR_CROSS_ATTN_Q_NORM,
LLM_TENSOR_CROSS_ATTN_Q_PROJ,
LLM_TENSOR_CROSS_ATTN_V_PROJ,
LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
LLM_TENSOR_CROSS_ATTN_MLP_GATE,
LLM_TENSOR_CONV1D,
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,

View file

@ -316,7 +316,6 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
/*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@ -329,7 +328,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
/*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@ -338,7 +336,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
batch.n_embd = embd;
} else {
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}

View file

@ -809,7 +809,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}
return logits + j*model.hparams.n_vocab;
return logits + j*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@ -930,10 +930,6 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}
void llama_context::set_cross_attn(bool value) {
cparams.cross_attn = value;
}
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@ -1009,7 +1005,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd;
sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@ -1149,9 +1145,10 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch;
const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
const int32_t n_vocab = hparams.n_vocab;
const int32_t n_vocab = vocab.n_tokens();
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@ -1199,7 +1196,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all;
sbatch.from_batch(batch, batch.n_embd,
sbatch.from_batch(batch, n_embd,
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);
@ -1436,11 +1433,12 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
const auto & vocab = model.vocab;
const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
const auto n_batch = cparams.n_batch;
const auto n_vocab = hparams.n_vocab;
const auto n_vocab = vocab.n_tokens();
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@ -1508,7 +1506,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
const uint32_t n_vocab = model.hparams.n_vocab;
const uint32_t n_vocab = model.vocab.n_tokens();
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@ -2015,7 +2013,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab);
const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
io.write(&logits_size, sizeof(logits_size));
@ -2198,7 +2196,6 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.cross_attn =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@ -2326,10 +2323,6 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
}
void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
ctx->set_cross_attn(cross_attention);
}
void llama_synchronize(llama_context * ctx) {
ctx->synchronize();
}

View file

@ -66,7 +66,6 @@ struct llama_context {
void set_embeddings (bool value);
void set_causal_attn(bool value);
void set_warmup(bool value);
void set_cross_attn(bool value);
void set_adapter_lora(
llama_adapter_lora * adapter,

View file

@ -29,7 +29,6 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
bool cross_attn;
bool warmup;
enum llama_pooling_type pooling_type;

View file

@ -560,12 +560,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
if (ubatch->embd) {
ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
}
}
//
// llm_graph_context
//
@ -1538,25 +1532,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}
ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
const int64_t n_embd = hparams.n_embd;
auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
ggml_tensor * cur = nullptr;
inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
ggml_set_input(inp->cross_attn_state);
cur = inp->cross_attn_state;
cb(cur, "inp_cross_attn_state", -1);
res->add_input(std::move(inp));
return cur;
}
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,

View file

@ -86,7 +86,6 @@ public:
ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
};
class llm_graph_input_pos : public llm_graph_input_i {
@ -284,16 +283,6 @@ public:
const llama_cross * cross = nullptr;
};
class llm_graph_input_cross_attn_state : public llm_graph_input_i {
public:
llm_graph_input_cross_attn_state() = default;
virtual ~llm_graph_input_cross_attn_state() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
};
//
// llm_graph_result
//
@ -502,7 +491,6 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const;
ggml_tensor * build_inp_cross_attn_state() const;
ggml_tensor * build_inp_cross_embd() const;
ggml_tensor * build_inp_pos_bucket_enc() const;

View file

@ -85,7 +85,3 @@ bool llama_hparams::is_swa(uint32_t il) const {
GGML_ABORT("fatal error");
}
bool llama_hparams::cross_attention_layers(uint32_t il) const {
return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
}

View file

@ -2,8 +2,6 @@
#include "llama.h"
#include <algorithm>
#include <array>
// bump if necessary
@ -44,7 +42,6 @@ struct llama_hparams {
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
uint32_t n_vocab = 0;
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla = 0;
@ -59,7 +56,6 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@ -163,9 +159,6 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
// cross attention layers
bool cross_attention_layers(uint32_t il) const;
bool is_swa(uint32_t il) const;
};

View file

@ -95,16 +95,8 @@ bool llama_kv_cache_unified::init(
return false;
}
ggml_tensor * k, *v;
// for cross attention layers
if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
} else {
k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
}
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
k_l.push_back(k);

View file

@ -315,8 +315,6 @@ namespace GGUFMeta {
return true;
}
template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
template<typename T, size_t N_MAX>
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());

View file

@ -423,7 +423,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
// everything past this point is not vocab-related
if (hparams.vocab_only) {
@ -435,7 +434,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@ -459,11 +457,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@ -516,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@ -579,16 +575,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false;
}
} break;
case LLM_ARCH_MLLAMA:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 40: type = LLM_TYPE_11B; break;
case 100: type = LLM_TYPE_90B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -1576,7 +1562,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
const int64_t n_vocab = hparams.n_vocab;
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@ -1829,52 +1815,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
case LLM_ARCH_MLLAMA:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
// output
{
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
}
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
if (hparams.cross_attention_layers(i)) {
layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
} else {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
}
} break;
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -4767,246 +4707,6 @@ struct llm_build_llama : public llm_graph_context {
}
};
struct llm_build_mllama: public llm_graph_context {
llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inpCAS;
inpL = build_inp_embd(model.tok_embd);
inpCAS = build_inp_cross_attn_state();
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
if (hparams.cross_attention_layers(il)) {
if (!ubatch.embd && !cparams.cross_attn) {
continue;
}
// cross attention layer
ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
cb(Qcur, "Qcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cb(Qcur, "Qcur", il);
Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
cb(Qcur, "Qcur", il);
Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur, * Vcur;
if (ubatch.embd) {
Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
cb(Kcur, "Kcur", il);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
cb(Kcur, "Kcur", il);
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
cb(Kcur, "Kcur", il);
Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
cb(Vcur, "Vcur", il);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
cb(Vcur, "Vcur", il);
Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
cb(Vcur, "Vcur", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
} else {
Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
cb(Kcur, "Kcur (view)", il);
Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
cb(Vcur, "Vcur (view)", il);
}
struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
cb(kq, "kq", il);
// TODO: apply causal masks
struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
cb(kq_soft_max, "kq_soft_max", il);
Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
cb(Vcur, "Vcur", il);
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
cb(kqv, "kqv", il);
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il);
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
cb(cur, "kqv_merged_cont", il);
cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
cb(cur, "cur", il);
// TODO: do this in place once?
cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
// TODO: do this inplace once?
cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
} else {
// self attention layer
// rope freq factors for llama3; may return nullptr for llama2 and other models
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
n_tokens = n_outputs;
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
};
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@ -13363,10 +13063,6 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
case LLM_ARCH_MLLAMA:
{
llm = std::make_unique<llm_build_mllama>(*this, params, gf);
} break;
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@ -13728,7 +13424,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:

View file

@ -11,7 +11,6 @@
#include <string>
#include <unordered_map>
#include <vector>
#include <stdexcept>
struct llama_cparams;
struct llama_ubatch;
@ -74,7 +73,6 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B,
@ -316,16 +314,6 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
// cross attention
struct ggml_tensor * cross_attn_k_norm = nullptr;
struct ggml_tensor * cross_attn_k_proj = nullptr;
struct ggml_tensor * cross_attn_o_proj = nullptr;
struct ggml_tensor * cross_attn_q_norm = nullptr;
struct ggml_tensor * cross_attn_q_proj = nullptr;
struct ggml_tensor * cross_attn_v_proj = nullptr;
struct ggml_tensor * cross_attn_attn_gate = nullptr;
struct ggml_tensor * cross_attn_mlp_gate = nullptr;
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;

View file

@ -639,9 +639,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
if (qs.n_attention_wv != n_attn_layer) {
LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
}
GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
}
size_t total_size_org = 0;

View file

@ -17,7 +17,6 @@ package llama
#include "llava.h"
#include "gguf.h"
#include "mllama.h"
#include "sampling_ext.h"
extern bool llamaProgressCallback(float progress, void *user_data);

887
llama/mllama.cpp vendored
View file

@ -1,887 +0,0 @@
// NOTE: This is modified from clip.cpp for Mllama only
#include "mllama.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"
#include "gguf.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#include <algorithm>
#include <cmath>
#include <cstdarg>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <vector>
#define REQUIRE(x) \
do { \
if (!(x)) { \
throw std::runtime_error("REQUIRE failed: " #x); \
} \
} while (0)
#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__)
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#if __GLIBCXX__
#include <cstdio>
#include <ext/stdio_filebuf.h>
#include <fcntl.h>
#endif
#endif
struct mllama_image {
int width;
int height;
int num_channels = 3;
int num_tiles = 4;
int aspect_ratio_id;
std::vector<float> data;
};
static std::string format(const char *fmt, ...) {
va_list args;
va_start(args, fmt);
std::vector<char> b(128);
int n = vsnprintf(b.data(), b.size(), fmt, args);
REQUIRE(n >= 0 && n < b.size());
va_end(args);
return std::string(b.data(), b.size());
}
//
// utilities to get data from a gguf file
//
static int get_key_index(const gguf_context *ctx, const char *key) {
int key_index = gguf_find_key(ctx, key);
REQUIRE(key_index != -1);
return key_index;
}
static std::vector<uint32_t> get_u32_array(const gguf_context *ctx, const std::string &key) {
const int i = get_key_index(ctx, key.c_str());
const int n = gguf_get_arr_n(ctx, i);
const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i);
std::vector<uint32_t> s(n);
for (size_t j = 0; j < s.size(); j++) {
s[j] = data[j];
}
return s;
}
static uint32_t get_u32(const gguf_context *ctx, const std::string &key) {
return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str()));
}
static float get_f32(const gguf_context *ctx, const std::string &key) {
return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str()));
}
static std::string get_ftype(int ftype) {
return ggml_type_name(static_cast<ggml_type>(ftype));
}
//
// mllama layers
//
struct mllama_hparams {
uint32_t image_size;
uint32_t patch_size;
uint32_t hidden_size;
uint32_t n_intermediate;
uint32_t projection_dim;
uint32_t n_head;
uint32_t n_layer;
uint32_t n_global_layer;
uint32_t n_tiles;
float eps;
std::vector<bool> intermediate_layers;
};
struct mllama_layer {
// attention
struct ggml_tensor *k_w;
struct ggml_tensor *k_b;
struct ggml_tensor *q_w;
struct ggml_tensor *q_b;
struct ggml_tensor *v_w;
struct ggml_tensor *v_b;
struct ggml_tensor *o_w;
struct ggml_tensor *o_b;
struct ggml_tensor *attn_gate;
// layernorm 1
struct ggml_tensor *ln_1_w;
struct ggml_tensor *ln_1_b;
// ff
struct ggml_tensor *ff_i_w;
struct ggml_tensor *ff_i_b;
struct ggml_tensor *ff_o_w;
struct ggml_tensor *ff_o_b;
struct ggml_tensor *ff_gate;
// layernorm 2
struct ggml_tensor *ln_2_w;
struct ggml_tensor *ln_2_b;
};
struct mllama_vision_model {
struct mllama_hparams hparams;
// embeddings
struct ggml_tensor *class_embedding;
struct ggml_tensor *patch_embeddings;
struct ggml_tensor *position_embeddings;
struct ggml_tensor *position_embeddings_gate;
struct ggml_tensor *tile_position_embeddings;
struct ggml_tensor *tile_position_embeddings_gate;
struct ggml_tensor *pre_tile_position_embeddings;
struct ggml_tensor *pre_tile_position_embeddings_gate;
struct ggml_tensor *post_tile_position_embeddings;
struct ggml_tensor *post_tile_position_embeddings_gate;
struct ggml_tensor *pre_ln_w;
struct ggml_tensor *pre_ln_b;
std::vector<mllama_layer> layers;
std::vector<mllama_layer> global_layers;
struct ggml_tensor *post_ln_w;
struct ggml_tensor *post_ln_b;
struct ggml_tensor *mm_0_w;
struct ggml_tensor *mm_0_b;
};
struct mllama_ctx {
struct mllama_vision_model vision_model;
uint32_t ftype = 1;
struct gguf_context *ctx_gguf;
struct ggml_context *ctx_data;
std::vector<uint8_t> buf_compute_meta;
// memory buffers to evaluate the model
ggml_backend_buffer_t params_buffer = nullptr;
ggml_backend_t backend = nullptr;
ggml_gallocr_t compute_alloc = nullptr;
};
static ggml_tensor *mllama_image_build_encoder_layer(
struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings,
const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {
struct ggml_tensor *cur = embeddings;
{
// layernorm1
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
ggml_set_name(cur, format("%d pre layernorm", il).c_str());
}
{
// self-attention
struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
if (layer.q_b != nullptr) {
Q = ggml_add(ctx0, Q, layer.q_b);
}
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
ggml_set_name(Q, format("%d query", il).c_str());
struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
if (layer.k_b != nullptr) {
K = ggml_add(ctx0, K, layer.k_b);
}
K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
ggml_set_name(K, format("%d key", il).c_str());
struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
if (layer.v_b != nullptr) {
V = ggml_add(ctx0, V, layer.v_b);
}
V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
ggml_set_name(V, format("%d value", il).c_str());
struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
KQ = ggml_soft_max_inplace(ctx0, KQ);
ggml_set_name(KQ, format("%d KQ", il).c_str());
struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
ggml_set_name(KQV, format("%d KQV", il).c_str());
cur = ggml_mul_mat(ctx0, layer.o_w, KQV);
if (layer.o_b != nullptr) {
cur = ggml_add(ctx0, cur, layer.o_b);
}
ggml_set_name(cur, format("%d self attention", il).c_str());
if (layer.attn_gate != nullptr) {
cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate);
ggml_set_name(cur, format("%d self attention gate", il).c_str());
}
}
cur = ggml_add(ctx0, cur, embeddings);
ggml_set_name(cur, format("%d residual", il).c_str());
embeddings = cur;
{
// layernorm2
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
ggml_set_name(cur, format("%d post layernorm", il).c_str());
}
{
// feed forward
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
cur = ggml_gelu_inplace(ctx0, cur);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
ggml_set_name(cur, format("%d feed forward", il).c_str());
if (layer.ff_gate != nullptr) {
cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate);
ggml_set_name(cur, format("%d feed forward gate", il).c_str());
}
}
// residual 2
cur = ggml_add(ctx0, cur, embeddings);
ggml_set_name(cur, format("%d residual", il).c_str());
embeddings = cur;
return embeddings;
}
static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) {
const auto &model = ctx->vision_model;
const auto &hparams = model.hparams;
const int image_size = hparams.image_size;
const int image_size_width = image_size;
const int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int batch_size = imgs->size;
REQUIRE(batch_size == 1);
int num_tiles = 4;
int num_channels = 3;
if (imgs->data != nullptr) {
num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles;
num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels;
}
struct ggml_init_params params = {
ctx->buf_compute_meta.size(), // mem_size
ctx->buf_compute_meta.data(), // mem_buffer
true, // no_alloc
};
struct ggml_context *ctx0 = ggml_init(params);
struct ggml_cgraph *gf = ggml_new_graph(ctx0);
struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size);
ggml_set_name(aspect_ratios, "aspect_ratios");
ggml_set_input(aspect_ratios);
if (model.pre_tile_position_embeddings != nullptr) {
struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);
ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings");
pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles);
if (model.pre_tile_position_embeddings_gate != nullptr) {
pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
}
inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
}
struct ggml_tensor *embeddings = inp;
if (model.class_embedding != nullptr) {
// concat class_embeddings and patch_embeddings
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
for (int i = 0; i < num_tiles; ++i) {
// repeat class embeddings for each tile
embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
}
embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
if (model.position_embeddings_gate != nullptr) {
position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate);
}
embeddings = ggml_add(ctx0, embeddings, position_embd);
if (model.tile_position_embeddings != nullptr) {
struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
ggml_set_name(tile_position_embeddings, "tile_position_embeddings");
tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles);
if (model.tile_position_embeddings_gate != nullptr) {
tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
}
embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
}
// pre-layernorm
if (model.pre_ln_w != nullptr) {
embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w);
if (model.pre_ln_b != nullptr) {
embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b);
}
ggml_set_name(embeddings, "pre layernorm");
}
const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8;
embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);
std::vector<struct ggml_tensor *> intermediate_embeddings;
// encoder
for (size_t il = 0; il < model.layers.size(); il++) {
if (hparams.intermediate_layers[il]) {
intermediate_embeddings.push_back(embeddings);
}
embeddings = mllama_image_build_encoder_layer(
ctx0, il, model.layers[il], embeddings,
hparams.eps, hidden_size, batch_size, n_head, d_head);
}
// post-layernorm
if (model.post_ln_w != nullptr) {
embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w);
if (model.post_ln_b != nullptr) {
embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
}
ggml_set_name(embeddings, "post layernorm");
}
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
if (model.post_tile_position_embeddings != nullptr) {
struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);
ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings");
post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles);
if (model.post_tile_position_embeddings_gate != nullptr) {
post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
}
embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
}
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1);
// global encoder
for (size_t il = 0; il < model.global_layers.size(); il++) {
embeddings = mllama_image_build_encoder_layer(
ctx0, il, model.global_layers[il], embeddings,
hparams.eps, hidden_size, batch_size, n_head, d_head);
}
struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles);
for (size_t i = 0; i < intermediate_embeddings.size(); ++i) {
stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0);
}
stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size);
stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0);
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0);
// mllama projector
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b);
ggml_set_name(embeddings, "multi modal projector");
// build the graph
ggml_build_forward_expand(gf, embeddings);
ggml_free(ctx0);
return gf;
}
static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) {
struct ggml_tensor *cur = ggml_get_tensor(ctx, name);
REQUIRE(cur != nullptr || optional);
return cur;
}
static std::vector<struct mllama_layer> mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) {
std::vector<struct mllama_layer> layers(n);
for (size_t i = 0; i < layers.size(); i++) {
auto &layer = layers[i];
layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false);
layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false);
layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false);
layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false);
layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false);
layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true);
layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false);
layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true);
layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false);
layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true);
layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false);
layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true);
layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false);
layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false);
layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false);
layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false);
layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true);
layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true);
}
return layers;
}
// read and create ggml_context containing the tensors and their data
struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) {
struct ggml_context *meta = nullptr;
struct gguf_init_params params = {
true, // no_alloc
&meta, // ctx
};
struct gguf_context *ctx = gguf_init_from_file(fname, params);
REQUIRE(ctx != nullptr);
if (verbosity >= 1) {
const int n_tensors = gguf_get_n_tensors(ctx);
const int n_kv = gguf_get_n_kv(ctx);
const std::string ftype = get_ftype(get_u32(ctx, "general.file_type"));
const int idx_desc = get_key_index(ctx, "general.description");
const std::string description = gguf_get_val_str(ctx, idx_desc);
const int idx_name = gguf_find_key(ctx, "general.name");
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name);
LOG("model name: %s", name.c_str());
}
LOG("description: %s", description.c_str());
LOG("GGUF version: %d", gguf_get_version(ctx));
LOG("alignment: %zu", gguf_get_alignment(ctx));
LOG("n_tensors: %d", n_tensors);
LOG("n_kv: %d", n_kv);
LOG("ftype: %s", ftype.c_str());
LOG("");
}
const int n_tensors = gguf_get_n_tensors(ctx);
mllama_ctx *new_mllama = new mllama_ctx{};
ggml_backend_t backend = ggml_backend_init_best();
if (backend == nullptr) {
LOG("%s: failed to initialize backend\n", __func__);
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
new_mllama->backend = backend;
// load tensors
{
std::vector<uint8_t> read_buf;
struct ggml_init_params params = {
(n_tensors + 1) * ggml_tensor_overhead(), // mem_size
nullptr, // mem_buffer
true, // no_alloc
};
new_mllama->ctx_data = ggml_init(params);
if (!new_mllama->ctx_data) {
LOG("ggml_init() failed");
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
#ifdef _WIN32
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
if (!wlen) {
return NULL;
}
wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
if (!wlen) {
free(wbuf);
return NULL;
}
#if __GLIBCXX__
int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
__gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
std::istream fin(&buffer);
#else // MSVC
// unused in our current build
auto fin = std::ifstream(wbuf, std::ios::binary);
#endif
free(wbuf);
#else
auto fin = std::ifstream(fname, std::ios::binary);
#endif
if (!fin) {
LOG("cannot open model file for loading tensors\n");
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
// add tensors to context
for (int i = 0; i < n_tensors; ++i) {
const char *name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor *t = ggml_get_tensor(meta, name);
struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t);
ggml_set_name(cur, name);
}
// alloc memory and offload data
new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend);
for (int i = 0; i < n_tensors; ++i) {
const char *name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name);
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg);
if (!fin) {
LOG("failed to seek for tensor %s\n", name);
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
int num_bytes = ggml_nbytes(cur);
if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) {
// for the CPU and Metal backend, we can read directly into the tensor
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
} else {
// read into a temporary buffer first, then copy to device memory
read_buf.resize(num_bytes);
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
#if defined(_WIN32) && defined(__GLIBCXX__)
close(fd);
#else
fin.close();
#endif
}
// vision model
// load vision model
auto &vision_model = new_mllama->vision_model;
auto &hparams = vision_model.hparams;
hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length");
hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count");
hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length");
hparams.n_layer = get_u32(ctx, "mllama.vision.block_count");
hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count");
hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles");
hparams.image_size = get_u32(ctx, "mllama.vision.image_size");
hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size");
hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim");
hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon");
std::vector<uint32_t> intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices");
hparams.intermediate_layers.resize(hparams.n_layer);
for (size_t i = 0; i < intermediate_layers_indices.size(); i++) {
hparams.intermediate_layers[intermediate_layers_indices[i]] = true;
}
if (verbosity >= 2) {
LOG("");
LOG("vision model hparams");
LOG("image_size %d", hparams.image_size);
LOG("patch_size %d", hparams.patch_size);
LOG("v_hidden_size %d", hparams.hidden_size);
LOG("v_n_intermediate %d", hparams.n_intermediate);
LOG("v_projection_dim %d", hparams.projection_dim);
LOG("v_n_head %d", hparams.n_head);
LOG("v_n_layer %d", hparams.n_layer);
LOG("v_n_global_layer %d", hparams.n_global_layer);
LOG("v_eps %f", hparams.eps);
}
vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true);
vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true);
vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true);
vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true);
vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true);
vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true);
vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true);
vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true);
vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true);
vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true);
vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true);
vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true);
vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true);
vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true);
vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false);
vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false);
vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer);
vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer);
ggml_free(meta);
new_mllama->ctx_gguf = ctx;
{
// measure mem requirement and allocate
new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend));
struct mllama_image_batch batch;
batch.size = 1;
ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch);
ggml_gallocr_reserve(new_mllama->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0);
LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
}
return new_mllama;
}
struct mllama_image *mllama_image_init() {
return new mllama_image();
}
void mllama_image_free(struct mllama_image *img) { delete img; }
void mllama_image_batch_free(struct mllama_image_batch *batch) {
if (batch->size > 0) {
delete[] batch->data;
batch->size = 0;
}
}
bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) {
img->width = width;
img->height = height;
img->num_channels = num_channels;
img->num_tiles = num_tiles;
img->aspect_ratio_id = aspect_ratio_id;
img->data.resize(n);
memcpy(img->data.data(), data, n);
return true;
}
inline int mllama(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
void mllama_free(mllama_ctx *ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
ggml_backend_buffer_free(ctx->params_buffer);
ggml_backend_free(ctx->backend);
ggml_gallocr_free(ctx->compute_alloc);
delete ctx;
}
bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) {
mllama_image_batch imgs{};
imgs.size = 1;
imgs.data = img;
return mllama_image_batch_encode(ctx, n_threads, &imgs, vec);
}
bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) {
int batch_size = imgs->size;
REQUIRE(batch_size == 1);
// build the inference graph
ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs);
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
// set inputs
const auto &model = ctx->vision_model;
const auto &hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
{
struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw));
}
{
struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings");
if (embeddings != nullptr) {
void *zeros = malloc(ggml_nbytes(embeddings));
memset(zeros, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings));
free(zeros);
}
}
{
struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions");
if (positions != nullptr) {
int *positions_data = (int *)malloc(ggml_nbytes(positions));
for (int i = 0; i < num_positions; i++) {
positions_data[i] = i;
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
}
{
struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
if (aspect_ratios != nullptr) {
int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios));
aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id;
ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios));
free(aspect_ratios_data);
}
}
if (ggml_backend_is_cpu(ctx->backend)) {
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
}
ggml_backend_graph_compute(ctx->backend, gf);
// the last node is the embedding tensor
struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
return true;
}
int32_t mllama_image_size(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.image_size;
}
int32_t mllama_patch_size(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.patch_size;
}
int32_t mllama_hidden_size(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.hidden_size;
}
int mllama_n_patches(const struct mllama_ctx *ctx) {
const auto &hparams = ctx->vision_model.hparams;
return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);
}
int mllama_n_positions(const struct mllama_ctx *ctx) {
return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 0 : 1);
}
int mllama_n_tiles(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.n_tiles;
}
int mllama_n_embd(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.projection_dim;
}
size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) {
return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float);
}

61
llama/mllama.h vendored
View file

@ -1,61 +0,0 @@
#ifndef MLLAMA_H
#define MLLAMA_H
#include <stddef.h>
#include <stdint.h>
#ifdef LLAMA_SHARED
#if defined(_WIN32) && !defined(__MINGW32__)
#ifdef LLAMA_BUILD
#define MLLAMA_API __declspec(dllexport)
#else
#define MLLAMA_API __declspec(dllimport)
#endif
#else
#define MLLAMA_API __attribute__((visibility("default")))
#endif
#else
#define MLLAMA_API
#endif
#ifdef __cplusplus
extern "C" {
#endif
struct mllama_ctx;
struct mllama_image_batch {
struct mllama_image *data;
size_t size;
};
MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity);
MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity);
MLLAMA_API void mllama_free(struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx);
MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx);
MLLAMA_API struct mllama_image *mllama_image_init();
MLLAMA_API void mllama_image_free(struct mllama_image *img);
MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch);
MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img);
MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec);
MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec);
#ifdef __cplusplus
}
#endif
#endif // MLLAMA_H

File diff suppressed because it is too large Load diff

View file

@ -236,7 +236,7 @@ diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 425524d0..112abef6 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
@@ -341,6 +341,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
@ -244,7 +244,7 @@ index 425524d0..112abef6 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
@@ -1277,6 +1278,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);

View file

@ -22,7 +22,7 @@ multiple batches of processing until everything is complete.
4 files changed, 51 insertions(+), 106 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index cd06ad91..77177c5e 100644
index 9c1fe93f..773c63fe 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
@ -202,7 +202,7 @@ index cd06ad91..77177c5e 100644
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1291,9 +1249,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
@ -219,7 +219,7 @@ index cd06ad91..77177c5e 100644
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index a50c4afa..30f84bfd 100644
index 5457f077..299fbd52 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
@ -230,7 +230,7 @@ index a50c4afa..30f84bfd 100644
#include "ggml-cpp.h"
@@ -179,7 +180,8 @@ private:
@@ -178,7 +179,8 @@ private:
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
@ -241,10 +241,10 @@ index a50c4afa..30f84bfd 100644
// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 69f8d35a..35a750d3 100644
index 7c9d46d8..a38416d8 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -773,17 +773,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
assert(n_used <= n_kv);
@ -263,7 +263,7 @@ index 69f8d35a..35a750d3 100644
// determine which KV cells to move where
//
@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -791,10 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
@ -275,7 +275,7 @@ index 69f8d35a..35a750d3 100644
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0];
@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -843,19 +830,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory?
bool cont = false;
@ -295,7 +295,7 @@ index 69f8d35a..35a750d3 100644
cont = false;
continue;
}
@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -871,8 +850,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used;
if (!cont) {
@ -307,7 +307,7 @@ index 69f8d35a..35a750d3 100644
}
nf++;
@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -882,22 +863,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
}
}

View file

@ -11,7 +11,7 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 82ae1b5b..1487f322 100644
index 405d8e31..4e67d243 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {

View file

@ -178,9 +178,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
// #ifdef GGML_USE_BLAS
// register_backend(ggml_backend_blas_reg());
// #endif
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif