diff --git a/Makefile.sync b/Makefile.sync index 949ade809..238d76279 100644 --- a/Makefile.sync +++ b/Makefile.sync @@ -1,6 +1,6 @@ UPSTREAM=https://github.com/ggerganov/llama.cpp.git WORKDIR=llama/vendor -FETCH_HEAD=2016f07bd106c73699ecbaace80f55db5ed95dac +FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5 .PHONY: help help: diff --git a/llama/build-info.cpp b/llama/build-info.cpp index be908c364..27ce8e701 100644 --- a/llama/build-info.cpp +++ b/llama/build-info.cpp @@ -1,4 +1,4 @@ int LLAMA_BUILD_NUMBER = 0; -char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac"; +char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5"; char const *LLAMA_COMPILER = ""; char const *LLAMA_BUILD_TARGET = ""; diff --git a/llama/llama.cpp/common/common.h b/llama/llama.cpp/common/common.h index e6eaa8e80..0a9dc0599 100644 --- a/llama/llama.cpp/common/common.h +++ b/llama/llama.cpp/common/common.h @@ -342,6 +342,8 @@ struct common_params { // multimodal models (see examples/llava) struct common_params_model mmproj; + bool mmproj_use_gpu = true; // use GPU for multimodal model + bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) // embedding diff --git a/llama/llama.cpp/common/json-schema-to-grammar.cpp b/llama/llama.cpp/common/json-schema-to-grammar.cpp index 56043678c..656b3ecaa 100644 --- a/llama/llama.cpp/common/json-schema-to-grammar.cpp +++ b/llama/llama.cpp/common/json-schema-to-grammar.cpp @@ -16,6 +16,9 @@ using json = nlohmann::ordered_json; static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") { auto has_max = max_items != std::numeric_limits::max(); + if (max_items == 0) { + return ""; + } if (min_items == 0 && max_items == 1) { return item_rule + "?"; } diff --git a/llama/llama.cpp/examples/llava/clip-impl.h b/llama/llama.cpp/examples/llava/clip-impl.h index 180ae9880..66cb21ef1 100644 --- 
a/llama/llama.cpp/examples/llava/clip-impl.h +++ b/llama/llama.cpp/examples/llava/clip-impl.h @@ -2,8 +2,6 @@ #include "gguf.h" #include "clip.h" -#include "clip.h" - #include #include #include @@ -17,33 +15,31 @@ #define KEY_FTYPE "general.file_type" #define KEY_NAME "general.name" #define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" -#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" -#define KEY_HAS_GLM_PROJ "clip.has_glm_projector" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" -#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger" #define KEY_USE_GELU "clip.use_gelu" #define KEY_USE_SILU "clip.use_silu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" -#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_N_EMBD "clip.vision.embedding_length" +#define KEY_N_FF "clip.vision.feed_forward_length" +#define KEY_N_BLOCK "clip.vision.block_count" +#define KEY_N_HEAD "clip.vision.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.vision.projection_dim" #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_PROJ_TYPE "clip.projector_type" + +#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl +#define KEY_USE_RMS_NORM 
"clip.use_rms_norm" // for qwen2.5vl #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" +#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" +#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" // @@ -60,7 +56,9 @@ #define TN_ATTN_V "%s.blk.%d.attn_v.%s" #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" #define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" #define TN_LN_1 "%s.blk.%d.ln1.%s" #define TN_LN_2 "%s.blk.%d.ln2.%s" #define TN_LN_PRE "%s.pre_ln.%s" @@ -72,6 +70,8 @@ #define TN_IMAGE_NEWLINE "model.image_newline" #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 +#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 +#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral // mimicpmv #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" @@ -87,18 +87,19 @@ #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s" #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" -#define TN_GLM_BOI_W "adapter.boi" -#define TN_GLM_EOI_W "adapter.eoi" enum projector_type { PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP_NORM, PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDPV2, - PROJECTOR_TYPE_RESAMPLER, + PROJECTOR_TYPE_MINICPMV, PROJECTOR_TYPE_GLM_EDGE, - PROJECTOR_TYPE_MERGER, + PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_GEMMA3, + PROJECTOR_TYPE_IDEFICS3, + PROJECTOR_TYPE_PIXTRAL, + PROJECTOR_TYPE_QWEN25VL, PROJECTOR_TYPE_UNKNOWN, }; @@ -106,10 +107,13 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MLP, "mlp" }, { PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_RESAMPLER, "resampler"}, + { PROJECTOR_TYPE_MINICPMV, 
"resampler"}, { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, - { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, + { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/llama/llama.cpp/examples/llava/clip.cpp b/llama/llama.cpp/examples/llava/clip.cpp index d57b4bd6e..b3218c789 100644 --- a/llama/llama.cpp/examples/llava/clip.cpp +++ b/llama/llama.cpp/examples/llava/clip.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -172,14 +173,18 @@ struct clip_hparams { int32_t projection_dim; int32_t n_head; int32_t n_layer; + int32_t proj_scale_factor = 0; // idefics3 patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; - float eps; + float eps = 1e-6; + float rope_theta = 0.0; std::vector image_grid_pinpoints; int32_t image_crop_resolution; std::unordered_set vision_feature_layer; + int32_t attn_window_size = 0; + int32_t n_wa_pattern = 0; }; struct clip_layer { @@ -199,11 +204,20 @@ struct clip_layer { struct ggml_tensor * ln_1_b = nullptr; // ff - struct ggml_tensor * ff_i_w = nullptr; - struct ggml_tensor * ff_i_b = nullptr; + struct ggml_tensor * ff_i_w = nullptr; // legacy naming + struct ggml_tensor * ff_i_b = nullptr; // legacy naming + struct ggml_tensor * ff_o_w = nullptr; // legacy naming + struct ggml_tensor * ff_o_b = nullptr; // legacy naming - struct ggml_tensor * ff_o_w = nullptr; - struct ggml_tensor * ff_o_b = nullptr; + struct ggml_tensor * ff_up_w = nullptr; + struct ggml_tensor * ff_up_b = nullptr; + struct ggml_tensor * ff_gate_w = nullptr; + struct ggml_tensor * ff_gate_b = nullptr; + struct ggml_tensor * ff_down_w = nullptr; + struct ggml_tensor * ff_down_b = nullptr; + + struct ggml_tensor * ff_g_w = NULL; + struct ggml_tensor * ff_g_b = NULL; // 
layernorm 2 struct ggml_tensor * ln_2_w = nullptr; @@ -249,8 +263,6 @@ struct clip_vision_model { //GLMV-Edge projection struct ggml_tensor * mm_model_adapter_conv_w = nullptr; struct ggml_tensor * mm_model_adapter_conv_b = nullptr; - struct ggml_tensor * boi_w = nullptr; - struct ggml_tensor * eoi_w = nullptr; // MobileVLM projection struct ggml_tensor * mm_model_mlp_1_w = nullptr; @@ -309,16 +321,14 @@ struct clip_vision_model { // gemma3 struct ggml_tensor * mm_input_proj_w = nullptr; struct ggml_tensor * mm_soft_emb_norm_w = nullptr; + + // pixtral + struct ggml_tensor * token_embd_img_break = nullptr; }; struct clip_ctx { - bool has_text_encoder = false; - bool has_vision_encoder = false; bool has_llava_projector = false; - bool has_minicpmv_projector = false; - bool has_glm_projector = false; - bool has_qwen2vl_merger = false; - int minicpmv_version = 2; + int minicpmv_version = 0; struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -341,6 +351,7 @@ struct clip_ctx { ggml_backend_t backend_cpu; ggml_backend_buffer_ptr buf; + int max_nodes = 8192; ggml_backend_sched_ptr sched; clip_image_size load_image_size; @@ -376,23 +387,20 @@ struct clip_ctx { } }; -static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) { +static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) { const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; + int image_size_width = img.nx; + int image_size_height = img.ny; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - const 
float eps = hparams.eps; - - GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1 + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; struct ggml_init_params params = { /*.mem_size =*/ ctx->buf_compute_meta.size(), @@ -519,6 +527,482 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im embeddings = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), embeddings); + + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + + ggml_tensor * cur = embeddings; + const int scale_factor = model.hparams.proj_scale_factor; + const int n_embd = cur->ne[0]; + const int seq = cur->ne[1]; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = std::sqrt(seq); + const int width = std::sqrt(seq); + GGML_ASSERT(scale_factor != 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + seq / (scale_factor * scale_factor), + bsz); + + cur = ggml_mul_mat(ctx0, model.projection, cur); + embeddings = cur; + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} + +// 
implementation of the 2D RoPE without adding a new op in ggml +// this is not efficient (uses double the memory), but works on all backends +// TODO: there was a more efficient implementation which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 +static ggml_tensor * build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_h, + ggml_tensor * pos_w, + const float freq_base +) { + const int64_t n_dim = cur->ne[0]; + const int64_t n_head = cur->ne[1]; + const int64_t n_pos = cur->ne[2]; + + // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) + // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 + // first half of cur will use 1e-0, 1e-2 (even) + // second half of cur will use 1e-1, 1e-3 (odd) + // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even + // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) + // then for the second half, we use freq_scale to shift the inv_freq + // ^ why?
replace (2i) with (2i+1) in the above equation + const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim); + + // first half + ggml_tensor * first; + { + first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + 0); + first = ggml_rope_ext( + ctx0, + first, + pos_h, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + // second half + ggml_tensor * second; + { + second = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + n_dim/2 * ggml_element_size(cur)); + second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors + second = ggml_rope_ext( + ctx0, + second, + pos_w, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + freq_scale_odd, + 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + cur = ggml_concat(ctx0, first, second, 0); + return cur; +} + +static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) { + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL); + + int image_size_width = img.nx; + int image_size_height = img.ny; + + const int patch_size = hparams.patch_size; + const int n_patches_x = image_size_width / patch_size; + const int n_patches_y = image_size_height / patch_size; + const int num_patches = n_patches_x * n_patches_y; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context_ptr ctx0_ptr(ggml_init(params)); + 
auto ctx0 = ctx0_ptr.get(); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + // input raw + struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + // 2D input positions + struct ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + struct ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + + struct ggml_tensor * embeddings = inp; + + // pre-layer norm + embeddings = ggml_mul(ctx0, ggml_rms_norm(ctx0, embeddings, eps), model.pre_ln_w); + + // loop over layers + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur = embeddings; + + // pre-attention norm + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_1_w); + + // self-attention + { + struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches); + Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + + struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur); + + K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches); + K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + + struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur); + + V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = 
ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches); + + cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // pre-ffn norm + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_2_w); + + // feed-forward + { + ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur); + ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur); + gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu + cur = ggml_mul(ctx0, up_proj, gate_proj); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur); + } + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // LlavaMultiModalProjector (with GELU activation) + { + embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + + embeddings = ggml_gelu(ctx0, embeddings); + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + + // arrangement of the [IMG_BREAK] token + { + // not efficient, but works + // the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows] + // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension + // after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows] + + const int n_embd_text = embeddings->ne[0]; + const int n_tokens_output = num_patches + n_patches_y - 1; // one [IMG_BREAK] per row, except the last row + + ggml_tensor * cur = ggml_reshape_3d(ctx0, 
embeddings, n_embd_text, n_patches_x, n_patches_y); + ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y); + tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor + tok = ggml_add(ctx0, tok, model.token_embd_img_break); + cur = ggml_concat(ctx0, cur, tok, 1); + embeddings = ggml_view_2d(ctx0, cur, + n_embd_text, n_tokens_output, + ggml_row_size(cur->type, n_embd_text), 0); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} + +static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_image_f32_batch & imgs) { + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + + const bool use_window_attn = hparams.n_wa_pattern > 0; + + const int n_wa_pattern = hparams.n_wa_pattern; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int patches_w = image_size_width / patch_size; + const int patches_h = image_size_height / patch_size; + const int num_positions = num_patches + (model.class_embedding ? 
1 : 0); + const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + const int batch_size = imgs.entries.size(); + GGML_ASSERT(batch_size == 1); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context_ptr ctx0_ptr(ggml_init(params)); + auto ctx0 = ctx0_ptr.get(); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(image_size_width % (patch_size * 2) == 0); + GGML_ASSERT(image_size_height % (patch_size * 2) == 0); + + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_reshape_4d( + ctx0, inp, + hidden_size * 2, patches_w / 2, patches_h, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); + inp = ggml_reshape_3d( + ctx0, inp, + hidden_size, patches_w * patches_h, batch_size); + + if (model.patch_bias) { + // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); + inp = ggml_add(ctx0, inp, model.patch_bias); + } + struct ggml_tensor * embeddings = inp; + struct ggml_tensor * window_mask = 
nullptr; + struct ggml_tensor * window_idx = nullptr; + struct ggml_tensor * inv_window_idx = nullptr; + + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + embeddings = ggml_rms_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "pre_ln"); + + embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w); + } + + if (use_window_attn) { + // handle window attention inputs + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // embeddings shape: [hidden_size, patches_w * patches_h, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4); + embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + + // rmsnorm1 + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w); + + // self-attention + { + + struct ggml_tensor * Q = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); + + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + Q = ggml_rope_multi( + ctx0, Q, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, 
n_head * batch_size); + + struct ggml_tensor * K = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); + + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_rope_multi( + ctx0, K, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * V = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; + if (full_attn) { + KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + } else { + KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f / sqrtf((float)d_head), 0.0f); + } + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); + } + + // attention output + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // rms norm2 + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w); + + // mlp + // ffn_up + auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); + cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b); + + auto cur_gate = ggml_mul_mat(ctx0, 
model.layers[il].ff_g_w, cur); + cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b); + // TODO : only 2 of these 3 are actually used, should we remove one of them? + if (ctx->use_gelu) { + cur_gate = ggml_gelu_inplace(ctx0, cur_gate); + } else if (ctx->use_silu) { + cur_gate = ggml_silu_inplace(ctx0, cur_gate); + } else { + cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate); + } + cur = ggml_mul(ctx0, cur_gate, cur_up); + + // ffn_down + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // post-layernorm + if (model.post_ln_w) { + embeddings = ggml_rms_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "post_ln"); + + embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w); + } + + embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); + + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + + if (use_window_attn) { + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + + // embeddings shape: [hidden_size, patches_w * patches_h, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size); } // build the graph @@ -528,18 +1012,14 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im } static ggml_cgraph * 
clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (!ctx->has_vision_encoder) { - LOG_ERR("This gguf file seems to have no vision encoder\n"); - return nullptr; - } - const auto & model = ctx->vision_model; const auto & hparams = model.hparams; const int image_size = hparams.image_size; int image_size_width = image_size; int image_size_height = image_size; - if (ctx->has_minicpmv_projector) { + + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height); image_size_width = load_image_size.width; image_size_height = load_image_size.height; @@ -548,7 +1028,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im image_size_height = imgs.entries[0]->ny; } } - else if (ctx->has_qwen2vl_merger) { + + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { // use the image's native resolution when image is avaible if (is_inf) { // if (imgs->data->nx && imgs->data->ny) { @@ -556,12 +1037,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im image_size_height = imgs.entries[0]->ny; } } + const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int patches_w = image_size_width / patch_size; const int patches_h = image_size_height / patch_size; const int num_positions = num_patches + (model.class_embedding ? 1 : 0); - const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions; + const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? 
num_positions * 4 : num_positions; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; @@ -570,7 +1052,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im const int batch_size = imgs.entries.size(); - if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) { + if (ctx->has_llava_projector + || ctx->proj_type == PROJECTOR_TYPE_MINICPMV + || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { GGML_ASSERT(batch_size == 1); } @@ -591,8 +1075,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - if (ctx->has_qwen2vl_merger) { - GGML_ASSERT(image_size_width % (patch_size * 2) == 0); + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { + GGML_ASSERT(image_size_width % (patch_size * 2) == 0); GGML_ASSERT(image_size_height % (patch_size * 2) == 0); auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); @@ -621,40 +1105,30 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im struct ggml_tensor * embeddings = inp; struct ggml_tensor * pos_embed = nullptr; - if (ctx->has_llava_projector) { - // concat class_embeddings and patch_embeddings - if (model.class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, 
GGML_TYPE_F32, hidden_size, num_positions, batch_size); + embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); } struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); ggml_set_name(positions, "positions"); ggml_set_input(positions); - if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding + if (ctx->proj_type != PROJECTOR_TYPE_QWEN2VL) { // qwen2vl does NOT use learned position embeddings embeddings = ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); } - if (ctx->has_minicpmv_projector) { + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { int pos_w = image_size_width/patch_size; int pos_h = image_size_height/patch_size; - if (ctx->minicpmv_version == 2) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 3) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 4) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } + int n_output_dim = clip_n_mmproj_embd(ctx); + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1); ggml_set_name(pos_embed, "pos_embed"); ggml_set_input(pos_embed); } @@ -697,7 +1171,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { Q = ggml_rope_multi( ctx0, Q, positions, nullptr, d_head/2, mrope_sections, 
GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); @@ -709,7 +1183,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { K = ggml_rope_multi( ctx0, K, positions, nullptr, d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); @@ -974,106 +1448,92 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } } // minicpmv projector - else if (ctx->has_minicpmv_projector) - { - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - struct ggml_tensor * q = model.mm_model_query; - { // layernorm - q = ggml_norm(ctx0, q, eps); - q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - } - struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - { // layernorm - v = ggml_norm(ctx0, v, eps); - v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); - } - struct ggml_tensor * k; - { // position - // q = ggml_add(ctx0, q, model.mm_model_pos_embed); - k = ggml_add(ctx0, v, pos_embed); - } - - { // attention - int hidden_size = 4096; - const int d_head = 128; - int n_head = hidden_size/d_head; - int num_query = 96; - if (ctx->minicpmv_version == 2) { - hidden_size = 4096; - n_head = hidden_size/d_head; - num_query = 96; - } - else if (ctx->minicpmv_version == 3) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } - else if (ctx->minicpmv_version == 4) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } - - struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); - struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), 
model.mm_model_attn_k_b); - struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); - // permute - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); - - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); - } - { // layernorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); - } - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { + struct ggml_tensor * q = model.mm_model_query; + { // layernorm + q = ggml_norm(ctx0, q, eps); + q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); } - else { - GGML_ASSERT(false); + struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + { // layernorm + v = ggml_norm(ctx0, v, eps); + v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), 
model.mm_model_ln_kv_b); } + struct ggml_tensor * k; + { // position + // q = ggml_add(ctx0, q, model.mm_model_pos_embed); + k = ggml_add(ctx0, v, pos_embed); + } + + { // attention + int hidden_size = clip_n_mmproj_embd(ctx); + const int d_head = 128; + int n_head = hidden_size/d_head; + int num_query = 96; + if (ctx->minicpmv_version == 2) { + num_query = 96; + } + else if (ctx->minicpmv_version == 3) { + num_query = 64; + } + else if (ctx->minicpmv_version == 4) { + num_query = 64; + } + + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); + struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); + struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); + // permute + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); + + embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); + } + { // layernorm + embeddings = 
ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); + } + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); } + // glm projector - else if (ctx->has_glm_projector) { - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); - embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); - embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); - embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); - embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); - //GLU - { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - embeddings = ggml_gelu_inplace(ctx0, embeddings); - struct ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); - embeddings = ggml_silu_inplace(ctx0, embeddings); - embeddings = ggml_mul(ctx0, embeddings,x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); - } - } else { - GGML_ABORT("fatal error"); + else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); + embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, 
embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, embeddings); + struct ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_silu_inplace(ctx0, embeddings); + embeddings = ggml_mul(ctx0, embeddings,x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); } } - else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); @@ -1094,12 +1554,30 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return clip_image_build_graph_siglip(ctx, imgs); - } else { - // TODO: we should have one build_* function per model - return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + ggml_cgraph * res; + switch (ctx->proj_type) { + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + { + GGML_ASSERT(imgs.entries.size() == 1); + res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + GGML_ASSERT(imgs.entries.size() == 1); + res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]); + 
} break; + case PROJECTOR_TYPE_QWEN25VL: + { + res = clip_image_build_graph_qwen25vl(ctx, imgs); + } break; + default: + { + // TODO: we should have one build_* function per model + res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + } break; } + return res; } struct clip_model_loader { @@ -1109,7 +1587,7 @@ struct clip_model_loader { clip_ctx & ctx_clip; std::string fname; - size_t model_size; // in bytes + size_t model_size = 0; // in bytes // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) { @@ -1160,9 +1638,11 @@ struct clip_model_loader { } void load_hparams() { + auto & hparams = ctx_clip.vision_model.hparams; + // projector type + std::string proj_type; { - std::string proj_type; get_string(KEY_PROJ_TYPE, proj_type, false); if (!proj_type.empty()) { ctx_clip.proj_type = clip_projector_type_from_string(proj_type); @@ -1174,34 +1654,27 @@ struct clip_model_loader { // other hparams { - get_bool(KEY_HAS_TEXT_ENC, ctx_clip.has_text_encoder, false); - get_bool(KEY_HAS_VIS_ENC, ctx_clip.has_vision_encoder, false); - GGML_ASSERT(ctx_clip.has_vision_encoder); - GGML_ASSERT(!ctx_clip.has_text_encoder); - - // legacy keys, use KEY_PROJ_TYPE instead - get_bool(KEY_HAS_LLAVA_PROJ, ctx_clip.has_llava_projector, false); - get_bool(KEY_HAS_MINICPMV_PROJ, ctx_clip.has_minicpmv_projector, false); get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); - get_bool(KEY_HAS_GLM_PROJ, ctx_clip.has_glm_projector, false); - get_bool(KEY_HAS_QWEN2VL_MERGER, ctx_clip.has_qwen2vl_merger, false); - // !!! 
do NOT extend the list above, use KEY_PROJ_TYPE instead get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false); get_bool(KEY_USE_SILU, ctx_clip.use_silu, false); - auto & hparams = ctx_clip.vision_model.hparams; - get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size); - get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head); - get_u32(string_format(KEY_N_FF, "vision"), hparams.n_intermediate); - get_u32(string_format(KEY_N_BLOCK, "vision"), hparams.n_layer); - get_u32(string_format(KEY_PROJ_DIM, "vision"), hparams.projection_dim); - get_f32(string_format(KEY_LAYER_NORM_EPS, "vision"), hparams.eps); - get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PATCH_SIZE, hparams.patch_size); - get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); + get_u32(KEY_N_EMBD, hparams.hidden_size); + get_u32(KEY_N_HEAD, hparams.n_head); + get_u32(KEY_N_FF, hparams.n_intermediate); + get_u32(KEY_N_BLOCK, hparams.n_layer); + get_u32(KEY_PROJ_DIM, hparams.projection_dim); + get_f32(KEY_LAYER_NORM_EPS, hparams.eps); + get_u32(KEY_IMAGE_SIZE, hparams.image_size); + get_u32(KEY_PATCH_SIZE, hparams.patch_size); + get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false); + ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP + || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM + || ctx_clip.proj_type == PROJECTOR_TYPE_LDP + || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2; + { std::string mm_patch_merge_type; get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false); @@ -1234,15 +1707,62 @@ struct clip_model_loader { for (auto & layer : vision_feature_layer) { hparams.vision_feature_layer.insert(layer); } - // Calculate the deepest feature layer based on hparams and projector type - ctx_clip.max_feature_layer = get_deepest_feature_layer(&ctx_clip); - LOG_INF("%s: text_encoder: %d\n", __func__, ctx_clip.has_text_encoder); - LOG_INF("%s: 
vision_encoder: %d\n", __func__, ctx_clip.has_vision_encoder); - LOG_INF("%s: llava_projector: %d\n", __func__, ctx_clip.has_llava_projector); - LOG_INF("%s: minicpmv_projector: %d\n", __func__, ctx_clip.has_minicpmv_projector); + // Calculate the deepest feature layer based on hparams and projector type + // NOTE: This is only used by build_graph_legacy() + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int n_layer = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV + || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE + || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL + || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) { + n_layer += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? 
n_layer : deepest_feature_layer; + } + + // model-specific params + switch (ctx_clip.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + if (ctx_clip.minicpmv_version == 0) { + ctx_clip.minicpmv_version = 2; // default to 2 if not set + } + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + hparams.rope_theta = 10000.0f; + } break; + case PROJECTOR_TYPE_QWEN25VL: + { + get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern); + } break; + default: + break; + } + + LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str()); + LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector); LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); - LOG_INF("%s: glm_projector: %d\n", __func__, ctx_clip.has_glm_projector); + LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); + LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); + LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu); + LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } @@ -1298,9 +1818,6 @@ struct clip_model_loader { vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false); vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); - if (vision_model.patch_embeddings_1 == nullptr) { - ctx_clip.has_qwen2vl_merger = false; - } vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false); @@ -1314,16 +1831,28 @@ struct clip_model_loader { layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight")); layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false); layer.ln_2_w = 
get_tensor(string_format(TN_LN_2, "v", il, "weight"), false); - layer.ff_i_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight")); - layer.ff_o_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight")); layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false); layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false); layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false); layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false); layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false); layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false); - layer.ff_i_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); - layer.ff_o_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false); + + // new naming + layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight")); + layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false); + layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false); + layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false); + layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight")); + layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); + + // legacy naming (the in and out is reversed! 
don't ask me why) + layer.ff_i_w = layer.ff_down_w; + layer.ff_o_w = layer.ff_up_w; + layer.ff_g_w = layer.ff_gate_w; + layer.ff_i_b = layer.ff_down_b; + layer.ff_o_b = layer.ff_up_b; + layer.ff_g_b = layer.ff_gate_b; } switch (ctx_clip.proj_type) { @@ -1388,7 +1917,7 @@ struct clip_model_loader { vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); } break; - case PROJECTOR_TYPE_RESAMPLER: + case PROJECTOR_TYPE_MINICPMV: { // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); @@ -1420,10 +1949,9 @@ struct clip_model_loader { vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight")); vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight")); vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); - vision_model.boi_w = get_tensor(TN_GLM_BOI_W); - vision_model.eoi_w = get_tensor(TN_GLM_EOI_W); } break; - case PROJECTOR_TYPE_MERGER: + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: { vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); @@ -1435,6 +1963,19 @@ struct clip_model_loader { vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); } break; + case PROJECTOR_TYPE_IDEFICS3: + { + vision_model.projection = get_tensor(TN_MM_PROJECTOR); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = 
get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + // [IMG_BREAK] token embedding + vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); + } break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -1503,18 +2044,17 @@ struct clip_model_loader { } void alloc_compute_meta() { - ctx_clip.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); + ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); // create a fake batch clip_image_f32_batch batch; clip_image_f32_ptr img(clip_image_f32_init()); clip_image_size image_size; - image_size.width = clip_get_image_size(&ctx_clip); - image_size.height = clip_get_image_size(&ctx_clip); - int n_patches = clip_get_image_size(&ctx_clip) / image_size.width; - img->nx = n_patches; - img->ny = n_patches; - img->buf.resize(n_patches * image_size.width * image_size.height * 3); + image_size.width = ctx_clip.vision_model.hparams.image_size; + image_size.height = ctx_clip.vision_model.hparams.image_size; + img->nx = image_size.width; + img->ny = image_size.height; + img->buf.resize(image_size.width * image_size.height * 3); batch.entries.push_back(std::move(img)); ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false); @@ -1902,6 +2442,26 @@ struct image_manipulation { } } + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will be aligned to the nearest multiple of align_size + // if H or W size is larger than max_dimension, it will be resized to max_dimension + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { + if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { + return {0, 0}; + } + + float scale = std::min(1.0f, std::min(static_cast(max_dimension) / inp_size.width, + static_cast(max_dimension) / 
inp_size.height)); + + float target_width_f = static_cast(inp_size.width) * scale; + float target_height_f = static_cast(inp_size.height) * scale; + + int aligned_width = GGML_PAD((int)target_width_f, align_size); + int aligned_height = GGML_PAD((int)target_height_f, align_size); + + return {aligned_width, aligned_height}; + } + private: static inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); @@ -2194,11 +2754,6 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - clip_image_size original_size{img->nx, img->ny}; bool pad_to_square = true; auto & params = ctx->vision_model.hparams; @@ -2219,7 +2774,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } return true; } - else if (ctx->has_qwen2vl_merger) { + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { clip_image_u8 resized; auto patch_size = clip_get_patch_size(ctx) * 2; int nx = ceil((float)img->nx / patch_size) * patch_size; @@ -2233,17 +2788,27 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); return true; } - - if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE + || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 + || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { clip_image_u8 resized_image; int sz = params.image_size; - 
image_manipulation::bicubic_resize(*img, resized_image, sz, sz); + image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); clip_image_f32_ptr img_f32(clip_image_f32_init()); //clip_image_save_to_bmp(resized_image, "resized.bmp"); normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); res_imgs->entries.push_back(std::move(img_f32)); return true; } + else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { + clip_image_u8 resized_image; + auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size); + image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(img_f32)); + return true; + } // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -2299,16 +2864,18 @@ void clip_free(clip_ctx * ctx) { delete ctx; } +// deprecated size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - int extra_tokens = ctx->has_glm_projector ? 
2 : 0; - return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float); + const int32_t nx = ctx->vision_model.hparams.image_size; + const int32_t ny = ctx->vision_model.hparams.image_size; + return clip_embd_nbytes_by_img(ctx, nx, ny); } -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) { clip_image_f32 img; img.nx = img_w; img.ny = img_h; - return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); + return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); } int32_t clip_get_image_size(const struct clip_ctx * ctx) { @@ -2338,21 +2905,44 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_grid_pinpoints.size(); } +// deprecated int clip_n_patches(const struct clip_ctx * ctx) { clip_image_f32 img; img.nx = ctx->vision_model.hparams.image_size; img.ny = ctx->vision_model.hparams.image_size; - return clip_n_patches_by_img(ctx, &img); + return clip_n_output_tokens(ctx, &img); } +// deprecated int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + return clip_n_output_tokens(ctx, img); +} + +int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + const int n_total = clip_n_output_tokens(ctx, img); + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0); + } + return n_total; +} + +int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0); + 
} + return 1; +} + +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->vision_model.hparams; int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { n_patches /= 4; - } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { if (ctx->minicpmv_version == 2) { n_patches = 96; } @@ -2362,13 +2952,22 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i else if (ctx->minicpmv_version == 4) { n_patches = 64; } - } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + else { + GGML_ABORT("Unknown minicpmv version"); + } + } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { int patch_size = params.patch_size * 2; int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); n_patches = x_patch * y_patch; } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { n_patches = 256; + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + n_patches /= ctx->vision_model.hparams.proj_scale_factor; + } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { + int n_patches_x = img->nx / params.patch_size; + int n_patches_y = img->ny / params.patch_size; + n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } return n_patches; @@ -2461,11 +3060,6 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co } bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - clip_image_f32_batch imgs; clip_image_f32_ptr 
img_copy(clip_image_f32_init()); *img_copy = *img; @@ -2476,25 +3070,13 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { const clip_image_f32_batch & imgs = *imgs_c_ptr; - - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - int batch_size = imgs.entries.size(); - if (ctx->has_llava_projector) { - GGML_ASSERT(batch_size == 1); // TODO: support multiple images - } - if (ctx->has_minicpmv_projector) { + + if (ctx->has_llava_projector + || ctx->proj_type == PROJECTOR_TYPE_MINICPMV + || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { GGML_ASSERT(batch_size == 1); } - if (ctx->has_glm_projector) { - GGML_ASSERT(batch_size == 1); - ggml_tensor * boi = ctx->vision_model.boi_w; - ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi)); - vec = (float*)(vec+ggml_nelements(boi)); //offset for boi - } // build the inference graph ggml_backend_sched_reset(ctx->sched.get()); @@ -2502,164 +3084,283 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); // set inputs - const auto & model = ctx->vision_model; + const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) { - image_size_width = imgs.entries[0]->nx; - image_size_height = imgs.entries[0]->ny; - } + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (model.class_embedding ? 
1 : 0); - const int pos_w = ctx->load_image_size.width / patch_size; + const int pos_w = ctx->load_image_size.width / patch_size; const int pos_h = ctx->load_image_size.height / patch_size; + const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl + + auto get_inp_tensor = [&gf](const char * name) { + struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + if (inp == nullptr) { + GGML_ABORT("Failed to get tensor %s", name); + } + if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ABORT("Tensor %s is not an input tensor", name); + } + return inp; + }; + + auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + // set input pixel values { - struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - float * data = (float *)malloc(ggml_nbytes(inp_raw)); + size_t nelem = 0; + for (const auto & img : imgs.entries) { + nelem += img->nx * img->ny * 3; + } + std::vector inp_raw(nelem); + + // layout of data (note: the channel dim is unrolled to better visualize the layout): + // + // ┌──W──┐ + // │ H │ channel = R + // ├─────┤ │ + // │ H │ channel = G + // ├─────┤ │ + // │ H │ channel = B + // └─────┘ │ + // ──────┘ x B for (size_t i = 0; i < imgs.entries.size(); i++) { const int nx = imgs.entries[i]->nx; const int ny = imgs.entries[i]->ny; - if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) { - GGML_ASSERT(nx == image_size && ny == image_size); - } - const int n = nx * ny; for 
(int b = 0; b < batch_size; b++) { - for (int k = 0; k < 3; k++) { - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k]; - } + float * batch_entry = inp_raw.data() + b * (3*n); + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + size_t base_src = 3*(y * nx + x); // idx of the first channel + size_t base_dst = y * nx + x; // idx of the first channel + batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; + batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; + batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; } } } } - ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); - free(data); + set_input_f32("inp_raw", inp_raw); } - if (ctx->has_minicpmv_projector) { - { - // inspired from siglip: - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - int bucket_coords_h[1024]; - int bucket_coords_w[1024]; - for (int i = 0; i < pos_h; i++){ - bucket_coords_h[i] = std::floor(70.0*i/pos_h); - } - for (int i = 0; i < pos_w; i++){ - bucket_coords_w[i] = std::floor(70.0*i/pos_w); - } - for (int i = 0, id = 0; i < pos_h; i++){ - for (int j = 0; j < pos_w; j++){ - positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; - } - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - { - // inspired from resampler of Qwen-VL: - // -> https://huggingface.co/Qwen/Qwen-VL/tree/main - // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 - struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, 
"pos_embed"); - int embed_dim = 4096; - if (ctx->minicpmv_version == 2) { - embed_dim = 4096; - } - else if (ctx->minicpmv_version == 3) { - embed_dim = 3584; - } - else if (ctx->minicpmv_version == 4) { - embed_dim = 3584; - } - auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); - - float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); - for(int i=0;i < pos_w * pos_h; ++i){ - for(int j=0; j < embed_dim; ++j){ - pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j]; - } - } - - ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed)); - free(pos_embed_data); - } - } - else { - if (model.class_embedding) { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); - } - - if (ctx->has_qwen2vl_merger) { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - - const int pw = image_size_width / patch_size; - const int ph = image_size_height / patch_size; - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - - int ptr = 0; - for (int y = 0; y < ph; y+=2) + // set input per projector + switch (ctx->proj_type) { + case PROJECTOR_TYPE_MINICPMV: { - for (int x = 0; x < pw; x+=2) - { - for (int dy = 0; dy < 2; dy++) { - for (int dx = 0; dx < 2; dx++) { - positions_data[ptr] = y + dy; - positions_data[num_patches + ptr] = x + dx; - positions_data[num_patches * 2 + ptr] = y + dy; - positions_data[num_patches * 3 + ptr] = x + dx; - ptr++; + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + std::vector positions(pos_h * pos_w); + int bucket_coords_h[1024]; + 
int bucket_coords_w[1024]; + for (int i = 0; i < pos_h; i++){ + bucket_coords_h[i] = std::floor(70.0*i/pos_h); + } + for (int i = 0; i < pos_w; i++){ + bucket_coords_w[i] = std::floor(70.0*i/pos_w); + } + for (int i = 0, id = 0; i < pos_h; i++){ + for (int j = 0; j < pos_w; j++){ + positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } + } + set_input_i32("positions", positions); + + // inspired from resampler of Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + int embed_dim = clip_n_mmproj_embd(ctx); + + // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? + auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + + std::vector pos_embed(embed_dim * pos_w * pos_h); + for(int i = 0; i < pos_w * pos_h; ++i){ + for(int j = 0; j < embed_dim; ++j){ + pos_embed[i * embed_dim + j] = pos_embed_t[i][j]; + } + } + + set_input_f32("pos_embed", pos_embed); + } break; + case PROJECTOR_TYPE_QWEN2VL: + { + const int merge_ratio = 2; + const int pw = image_size_width / patch_size; + const int ph = image_size_height / patch_size; + std::vector positions(num_positions * 4); + int ptr = 0; + for (int y = 0; y < ph; y += merge_ratio) { + for (int x = 0; x < pw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + positions[ ptr] = y + dy; + positions[ num_patches + ptr] = x + dx; + positions[2 * num_patches + ptr] = y + dy; + positions[3 * num_patches + ptr] = x + dx; + ptr++; + } } } } - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - // do nothing - } - else { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_QWEN25VL: + { + // pw * ph = 
number of tokens output by ViT after apply patch merger + // ipw * ipw = number of vision token been processed inside ViT + const int merge_ratio = 2; + const int pw = image_size_width / patch_size / merge_ratio; + const int ph = image_size_height / patch_size / merge_ratio; + const int ipw = image_size_width / patch_size; + const int iph = image_size_height / patch_size; - int* positions_data = (int*)malloc(ggml_nbytes(positions)); + std::vector idx (ph * pw); + std::vector inv_idx(ph * pw); + + if (use_window_attn) { + const int attn_window_size = 112; + const int grid_window = attn_window_size / patch_size / merge_ratio; + int dst = 0; + // [num_vision_tokens, num_vision_tokens] attention mask tensor + std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); + int mask_row = 0; + + for (int y = 0; y < ph; y += grid_window) { + for (int x = 0; x < pw; x += grid_window) { + const int win_h = std::min(grid_window, ph - y); + const int win_w = std::min(grid_window, pw - x); + const int dst_0 = dst; + // group all tokens belong to the same window togather (to a continue range) + for (int dy = 0; dy < win_h; dy++) { + for (int dx = 0; dx < win_w; dx++) { + const int src = (y + dy) * pw + (x + dx); + GGML_ASSERT(src < (int)idx.size()); + GGML_ASSERT(dst < (int)inv_idx.size()); + idx [src] = dst; + inv_idx[dst] = src; + dst++; + } + } + + for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { + int row_offset = mask_row * (ipw * iph); + std::fill( + mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), + mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), + 0.0); + mask_row++; + } + } + } + + set_input_i32("window_idx", idx); + set_input_i32("inv_window_idx", inv_idx); + set_input_f32("window_mask", mask); + } else { + for (int i = 0; i < ph * pw; i++) { + idx[i] = i; + } + } + + const int mpow = merge_ratio * merge_ratio; + std::vector positions(num_positions * 4); + + int ptr = 0; + for (int y = 0; y < iph; y += 
merge_ratio) { + for (int x = 0; x < ipw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + auto remap = idx[ptr / mpow]; + remap = (remap * mpow) + (ptr % mpow); + + positions[ remap] = y + dy; + positions[ num_patches + remap] = x + dx; + positions[2 * num_patches + remap] = y + dy; + positions[3 * num_patches + remap] = x + dx; + ptr++; + } + } + } + } + + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + // set the 2D positions + int n_patches_per_col = image_size_width / patch_size; + std::vector pos_data(num_positions); + // dimension H + for (int i = 0; i < num_positions; i++) { + pos_data[i] = i / n_patches_per_col; + } + set_input_i32("pos_h", pos_data); + // dimension W + for (int i = 0; i < num_positions; i++) { + pos_data[i] = i % n_patches_per_col; + } + set_input_i32("pos_w", pos_data); + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { + // llava and other models + std::vector positions(num_positions); for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; + positions[i] = i; } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + { + // llava and other models + std::vector positions(num_positions); + for (int i = 0; i < num_positions; i++) { + positions[i] = i; + } + set_input_i32("positions", positions); - if (!ctx->has_glm_projector) { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); // The patches vector is used to get rows to index into the embeds with; // we should skip dim 0 only if we have CLS to avoid going out of bounds // when retrieving the rows. int patch_offset = model.class_embedding ? 
1 : 0; - int* patches_data = (int*)malloc(ggml_nbytes(patches)); + std::vector patches(num_patches); for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + patch_offset; + patches[i] = i + patch_offset; } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); - } - } + set_input_i32("patches", patches); + } break; + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + { + // do nothing + } break; + default: + GGML_ABORT("Unknown projector type"); } ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); @@ -2676,13 +3377,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - if (ctx->has_glm_projector) { - //eoi - ggml_tensor * eoi = ctx->vision_model.eoi_w; - int offset = ggml_nelements(embeddings); - ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi)); - } - return true; } @@ -2822,56 +3516,52 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + switch (ctx->proj_type) { + case PROJECTOR_TYPE_LDP: + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + case PROJECTOR_TYPE_LDPV2: + return ctx->vision_model.mm_model_peg_0_b->ne[0]; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_PIXTRAL: + return ctx->vision_model.mm_2_b->ne[0]; + case PROJECTOR_TYPE_MLP_NORM: + return ctx->vision_model.mm_3_b->ne[0]; + case PROJECTOR_TYPE_MINICPMV: + if (ctx->minicpmv_version == 2) { + return 4096; + } else if (ctx->minicpmv_version == 3) { + return 3584; + } else if (ctx->minicpmv_version == 4) { + return 3584; + } + GGML_ABORT("Unknown minicpmv version"); + case PROJECTOR_TYPE_GLM_EDGE: + return ctx->vision_model.mm_model_mlp_3_w->ne[1]; + 
case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + return ctx->vision_model.mm_1_b->ne[0]; + case PROJECTOR_TYPE_GEMMA3: + return ctx->vision_model.mm_input_proj_w->ne[0]; + case PROJECTOR_TYPE_IDEFICS3: + return ctx->vision_model.projection->ne[1]; + default: + GGML_ABORT("Unknown projector type"); } - if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { - return ctx->vision_model.mm_model_peg_0_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - return ctx->vision_model.mm_2_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - return ctx->vision_model.mm_3_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - if (ctx->minicpmv_version == 2) { - return 4096; - } - else if (ctx->minicpmv_version == 3) { - return 3584; - } - else if (ctx->minicpmv_version == 4) { - return 3584; - } - } - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){ - return ctx->vision_model.mm_model_mlp_3_w->ne[1]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - return ctx->vision_model.mm_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return ctx->vision_model.mm_input_proj_w->ne[0]; - } - - std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; - throw std::runtime_error(string_format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } int clip_is_minicpmv(const struct clip_ctx * ctx) { - if (ctx->has_minicpmv_projector) { + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { return ctx->minicpmv_version; } return 0; } bool clip_is_glm(const struct clip_ctx * ctx) { - return ctx->has_glm_projector; + return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE; } bool clip_is_qwen2vl(const struct clip_ctx * ctx) { - return ctx->has_qwen2vl_merger; + return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL; } bool clip_is_llava(const struct clip_ctx * ctx) { @@ -2882,29 +3572,6 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) { return ctx->proj_type == 
PROJECTOR_TYPE_GEMMA3; } -// Determine the number of encoder layers to iterate over -int get_deepest_feature_layer(const struct clip_ctx * ctx) { - // Get the index of the second to last layer; this is the - // default for models that have a llava projector - const auto & hparams = ctx->vision_model.hparams; - int n_layer = hparams.n_layer - 1; - int deepest_feature_layer = -1; - - // Handle other projectors; incrementing here indicates that we - // should use the last encoder layer for the vision features. - if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) { - n_layer += 1; - } - - // If we set explicit vision feature layers, only go up to the deepest one - for (const auto & feature_layer : hparams.vision_feature_layer) { - if (feature_layer > deepest_feature_layer) { - deepest_feature_layer = feature_layer; - } - } - return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer; -} - bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { clip_image_f32 clip_img; clip_img.buf.resize(h * w * 3); diff --git a/llama/llama.cpp/examples/llava/clip.h b/llama/llama.cpp/examples/llava/clip.h index 5fc45d3e2..0a53bd8eb 100644 --- a/llama/llama.cpp/examples/llava/clip.h +++ b/llama/llama.cpp/examples/llava/clip.h @@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); -CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); +CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h); CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx); @@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct 
clip_ctx * ctx); CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx); -CLIP_API int clip_n_patches (const struct clip_ctx * ctx); -CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img); -CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx); +GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx), + "use clip_n_output_tokens instead"); +GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img), + "use clip_n_output_tokens instead"); + +CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// for M-RoPE, this will be the number of token positions in X and Y directions +// for other models, X will be the total number of tokens and Y will be 1 +CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img); +CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// this should be equal to the embedding dimension of the text model +CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); @@ -114,8 +125,6 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); CLIP_API bool clip_is_llava(const struct clip_ctx * ctx); CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx); -CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx); - CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); diff --git a/llama/llama.cpp/examples/llava/llava.cpp b/llama/llama.cpp/examples/llava/llava.cpp index 5eb40bcd1..bab027b50 100644 --- a/llama/llama.cpp/examples/llava/llava.cpp +++ b/llama/llama.cpp/examples/llava/llava.cpp @@ -112,7 +112,7 @@ static struct 
clip_image_grid_shape get_anyres_image_grid_shape(const std::pair< } // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) -static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { +static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) { struct { struct ggml_context * ctx; } model; @@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector model.ctx = ggml_init(params); - struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // fill it with the image embeddings, ignoring the base for (size_t i = 1; i < num_images; i++) { @@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches - *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); + memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * 
(num_images-1)); // grid patches + *n_img_pos_out = static_cast(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input)); // Debug: Test single segments // Current findings: sending base image, sending a segment embedding all works similar to python @@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes_by_img(ctx_clip, nx, ny)); - n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res); + n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res); } *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding - *n_img_pos = clip_n_patches(ctx_clip); clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); + *n_img_pos = clip_n_output_tokens(ctx_clip, img_res); bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 if (!encoded) { LOG_ERR("Unable to encode image\n"); @@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); int n_img_pos_out; - clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); + clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0); + clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input); *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { diff --git a/llama/llama.cpp/include/llama.h b/llama/llama.cpp/include/llama.h index f91896e48..f1628e88f 100644 --- a/llama/llama.cpp/include/llama.h +++ b/llama/llama.cpp/include/llama.h @@ -111,6 
+111,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, + LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, }; enum llama_rope_type { @@ -1237,6 +1238,7 @@ extern "C" { "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 + /// Setting k <= 0 makes this a noop LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 diff --git a/llama/llama.cpp/src/llama-arch.cpp b/llama/llama.cpp/src/llama-arch.cpp index dd01df60a..df42d1a57 100644 --- a/llama/llama.cpp/src/llama-arch.cpp +++ b/llama/llama.cpp/src/llama-arch.cpp @@ -20,6 +20,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_BERT, "bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, @@ -109,6 +110,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" }, { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" }, + { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, @@ -511,6 +513,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_NOMIC_BERT_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, 
"blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_JINA_BERT_V2, { diff --git a/llama/llama.cpp/src/llama-arch.h b/llama/llama.cpp/src/llama-arch.h index b6227eebf..bda9d0714 100644 --- a/llama/llama.cpp/src/llama-arch.h +++ b/llama/llama.cpp/src/llama-arch.h @@ -24,6 +24,7 @@ enum llm_arch { LLM_ARCH_REFACT, LLM_ARCH_BERT, LLM_ARCH_NOMIC_BERT, + LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_JINA_BERT_V2, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, @@ -113,6 +114,7 @@ enum llm_kv { LLM_KV_EXPERT_WEIGHTS_SCALE, LLM_KV_EXPERT_WEIGHTS_NORM, LLM_KV_EXPERT_GATING_FUNC, + LLM_KV_MOE_EVERY_N_LAYERS, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, diff --git a/llama/llama.cpp/src/llama-chat.cpp b/llama/llama.cpp/src/llama-chat.cpp index 721faa4e8..735d2619c 100644 --- a/llama/llama.cpp/src/llama-chat.cpp +++ b/llama/llama.cpp/src/llama-chat.cpp @@ -50,8 +50,8 @@ static const std::map LLM_CHAT_TEMPLATES = { { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 }, { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, - { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 }, - { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 }, + { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 }, + { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 }, { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE }, { "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, @@ -62,6 +62,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "yandex", LLM_CHAT_TEMPLATE_YANDEX }, { "bailing", LLM_CHAT_TEMPLATE_BAILING }, { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 }, 
+ { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { if (tmpl_contains("<|im_start|>")) { return tmpl_contains("<|im_sep|>") ? LLM_CHAT_TEMPLATE_PHI_4 - : LLM_CHAT_TEMPLATE_CHATML; + : tmpl_contains("") + ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml + : LLM_CHAT_TEMPLATE_CHATML; } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { if (tmpl_contains("[SYSTEM_PROMPT]")) { return LLM_CHAT_TEMPLATE_MISTRAL_V7; @@ -119,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { return LLM_CHAT_TEMPLATE_PHI_3; + } else if (tmpl_contains("[gMASK]")) { + return LLM_CHAT_TEMPLATE_CHATGLM_4; } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { return tmpl_contains("") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; + } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) { + return LLM_CHAT_TEMPLATE_GLMEDGE; } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { return LLM_CHAT_TEMPLATE_ZEPHYR; } else if (tmpl_contains("bos_token + message['role']")) { @@ -149,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_LLAMA_3; } else if (tmpl_contains("[gMASK]sop")) { // chatglm3-6b - return LLM_CHAT_TEMPLATE_CHATGML_3; - } else if (tmpl_contains("[gMASK]")) { - return LLM_CHAT_TEMPLATE_CHATGML_4; + return LLM_CHAT_TEMPLATE_CHATGLM_3; } else if (tmpl_contains(LU8("<用户>"))) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF return LLM_CHAT_TEMPLATE_MINICPM; @@ -432,7 +437,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) { + } else if 
(tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) { // chatglm3-6b ss << "[gMASK]" << "sop"; for (auto message : chat) { @@ -442,7 +447,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { ss << "[gMASK]" << ""; for (auto message : chat) { std::string role(message->role); @@ -451,14 +456,6 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>" << "\n" << message->content; - } - if (add_ass) { - ss << "<|assistant|>"; - } } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF for (auto message : chat) { @@ -620,7 +617,23 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|header_start|>assistant<|header_end|>\n\n"; } - } else { + } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) { + // SmolVLM + ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "\n\n"; + } else if (role == "user") { + ss << "User: " << message->content << "\n"; + } else { + ss << "Assistant: " << message->content << "\n"; + } + } + if (add_ass) { + ss << "Assistant:"; + } + } else { // template not supported return -1; } diff --git a/llama/llama.cpp/src/llama-chat.h b/llama/llama.cpp/src/llama-chat.h index 34537ca21..3f5843466 100644 --- a/llama/llama.cpp/src/llama-chat.h +++ b/llama/llama.cpp/src/llama-chat.h @@ -29,8 +29,8 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_DEEPSEEK_3, LLM_CHAT_TEMPLATE_COMMAND_R, LLM_CHAT_TEMPLATE_LLAMA_3, - LLM_CHAT_TEMPLATE_CHATGML_3, - LLM_CHAT_TEMPLATE_CHATGML_4, + LLM_CHAT_TEMPLATE_CHATGLM_3, + LLM_CHAT_TEMPLATE_CHATGLM_4, LLM_CHAT_TEMPLATE_GLMEDGE, 
LLM_CHAT_TEMPLATE_MINICPM, LLM_CHAT_TEMPLATE_EXAONE_3, @@ -41,6 +41,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_YANDEX, LLM_CHAT_TEMPLATE_BAILING, LLM_CHAT_TEMPLATE_LLAMA4, + LLM_CHAT_TEMPLATE_SMOLVLM, LLM_CHAT_TEMPLATE_UNKNOWN, }; diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp index 4b3e6a83e..77177c5ee 100644 --- a/llama/llama.cpp/src/llama-context.cpp +++ b/llama/llama.cpp/src/llama-context.cpp @@ -114,7 +114,7 @@ llama_context::llama_context( } if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", __func__, n_ctx_per_seq, hparams.n_ctx_train); } @@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift( ggml_tensor * shift, ggml_tensor * factors, float freq_base, - float freq_scale, - ggml_backend_buffer * bbuf) const { + float freq_scale) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & yarn_ext_factor = cparams.yarn_ext_factor; @@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift( // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); - if (bbuf) { - for (const auto & backend : backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { - ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); - break; - } - } - } - - tmp = ggml_rope_ext_inplace(ctx0, tmp, + tmp = ggml_rope_ext(ctx0, tmp, shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); @@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, 
freq_scale_l, kv_self->k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); ggml_build_forward_expand(gf, cur); } @@ -1510,8 +1499,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { // set all ids as invalid (negative) std::fill(output_ids.begin(), output_ids.end(), -1); - ggml_backend_buffer_clear(buf_output.get(), 0); - this->n_outputs = 0; this->n_outputs_max = n_outputs_max; diff --git a/llama/llama.cpp/src/llama-context.h b/llama/llama.cpp/src/llama-context.h index a59ff8fd4..30f84bfd3 100644 --- a/llama/llama.cpp/src/llama-context.h +++ b/llama/llama.cpp/src/llama-context.h @@ -172,8 +172,7 @@ private: ggml_tensor * shift, ggml_tensor * factors, float freq_base, - float freq_scale, - ggml_backend_buffer * bbuf) const; + float freq_scale) const; llm_graph_result_ptr build_kv_self_shift( ggml_context * ctx0, diff --git a/llama/llama.cpp/src/llama-graph.cpp b/llama/llama.cpp/src/llama-graph.cpp index d740c1200..b67216a48 100644 --- a/llama/llama.cpp/src/llama-graph.cpp +++ b/llama/llama.cpp/src/llama-graph.cpp @@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + if (ubatch->token && n_pos_per_embd == 4) { + // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D + // the 3 first dims are the same, and 4th dim is all 0 + std::vector pos_data(n_tokens*n_pos_per_embd); + // copy the first dimension + for (int i = 0; i < n_tokens; ++i) { + pos_data[ i] = ubatch->pos[i]; + pos_data[ n_tokens + i] = ubatch->pos[i]; + pos_data[2 * n_tokens + i] = ubatch->pos[i]; + pos_data[3 * n_tokens + i] = 0; // 4th dim is 0 + } + ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos)); + } else { + ggml_backend_tensor_set(pos, ubatch->pos, 0, 
n_tokens*n_pos_per_embd*ggml_element_size(pos)); + } } } @@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { ) * f_attn_temp_scale + 1.0; } - ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale)); + ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale)); } } @@ -598,7 +612,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : res (std::make_unique()) { } -int64_t llm_graph_context::n_pos_per_token() const { +int64_t llm_graph_context::n_pos_per_embd() const { return arch == LLM_ARCH_QWEN2VL ? 4 : 1; } @@ -809,6 +823,10 @@ ggml_tensor * llm_graph_context::build_ffn( if (down) { cur = build_lora_mm(down, cur); + if (arch == LLM_ARCH_GLM4) { + // GLM4 seems to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } } if (down_b) { @@ -916,28 +934,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); - ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); + ggml_tensor * experts = nullptr; + if (gate_exps) { + cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate", il); + } else { + cur = up; + } switch (type_op) { case LLM_FFN_SILU: { - gate = ggml_silu(ctx0, gate); - cb(gate, "ffn_moe_silu", il); + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: { - gate = ggml_gelu(ctx0, gate); - cb(gate, "ffn_moe_gelu", il); + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_moe_gelu", il); } break; default: GGML_ABORT("fatal error"); } - ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); + if 
(gate_exps) { + cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate_par", il); + } - ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); if (!weight_before_ffn) { @@ -1020,11 +1045,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { } ggml_tensor * llm_graph_context::build_inp_pos() const { - auto inp = std::make_unique(n_pos_per_token()); + auto inp = std::make_unique(n_pos_per_embd()); auto & cur = inp->pos; - cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd()); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1033,11 +1058,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { } ggml_tensor * llm_graph_context::build_inp_attn_scale() const { - auto inp = std::make_unique(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); + auto inp = std::make_unique(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); auto & cur = inp->attn_scale; - cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token()); + // this need to be 1x1xN for broadcasting + cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens); ggml_set_input(cur); res->add_input(std::move(inp)); diff --git a/llama/llama.cpp/src/llama-graph.h b/llama/llama.cpp/src/llama-graph.h index 260a2af21..0fe18150b 100644 --- a/llama/llama.cpp/src/llama-graph.h +++ b/llama/llama.cpp/src/llama-graph.h @@ -91,29 +91,27 @@ public: class llm_graph_input_pos : public llm_graph_input_i { public: - llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} virtual ~llm_graph_input_pos() = 
default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * pos = nullptr; // I32 [n_batch] - const int64_t n_pos_per_token = 1; + const int64_t n_pos_per_embd = 1; }; // temperature tuning, used by llama4 class llm_graph_input_attn_temp : public llm_graph_input_i { public: - llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) - : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} + llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) + : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} virtual ~llm_graph_input_attn_temp() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * attn_scale = nullptr; // F32 [n_batch] - const int64_t n_pos_per_token = 1; - const uint32_t n_attn_temp_floor_scale; const float f_attn_temp_scale; }; @@ -430,7 +428,7 @@ struct llm_graph_context { llm_graph_context(const llm_graph_params & params); - int64_t n_pos_per_token() const; + int64_t n_pos_per_embd() const; void cb(ggml_tensor * cur, const char * name, int il) const; diff --git a/llama/llama.cpp/src/llama-hparams.h b/llama/llama.cpp/src/llama-hparams.h index c8a34d521..b6fc7e6df 100644 --- a/llama/llama.cpp/src/llama-hparams.h +++ b/llama/llama.cpp/src/llama-hparams.h @@ -72,6 +72,7 @@ struct llama_hparams { float expert_weights_scale = 0.0; bool expert_weights_norm = false; uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; + uint32_t moe_every_n_layers = 0; float f_norm_eps; float f_norm_rms_eps; diff --git a/llama/llama.cpp/src/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp index c8374159f..ef70486d0 100644 --- a/llama/llama.cpp/src/llama-model.cpp +++ b/llama/llama.cpp/src/llama-model.cpp @@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return 
"780M"; case LLM_TYPE_0_5B: return "0.5B"; + case LLM_TYPE_0_6B: return "0.6B"; case LLM_TYPE_1B: return "1B"; case LLM_TYPE_1_3B: return "1.3B"; case LLM_TYPE_1_4B: return "1.4B"; case LLM_TYPE_1_5B: return "1.5B"; case LLM_TYPE_1_6B: return "1.6B"; + case LLM_TYPE_1_7B: return "1.7B"; case LLM_TYPE_1_8B: return "1.8B"; case LLM_TYPE_2B: return "2B"; case LLM_TYPE_2_8B: return "2.8B"; @@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_15B: return "15B"; case LLM_TYPE_16B: return "16B"; case LLM_TYPE_20B: return "20B"; + case LLM_TYPE_27B: return "27B"; case LLM_TYPE_30B: return "30B"; case LLM_TYPE_32B: return "32B"; case LLM_TYPE_34B: return "34B"; @@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_65B: return "65B"; case LLM_TYPE_70B: return "70B"; case LLM_TYPE_236B: return "236B"; + case LLM_TYPE_290B: return "290B"; case LLM_TYPE_314B: return "314B"; case LLM_TYPE_671B: return "671B"; case LLM_TYPE_SMALL: return "0.1B"; @@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_16x3_8B: return "16x3.8B"; case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B"; case LLM_TYPE_57B_A14B: return "57B.A14B"; - case LLM_TYPE_27B: return "27B"; - case LLM_TYPE_290B: return "290B"; case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; + case LLM_TYPE_30B_A3B: return "30B.A3B"; + case LLM_TYPE_235B_A22B: return "235B.A22B"; default: return "?B"; } } @@ -709,10 +713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } break; case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); if (hparams.n_layer == 12 && hparams.n_embd == 768) { type = LLM_TYPE_137M; @@ -805,6 +811,10 @@ void 
llama_model::load_hparams(llama_model_loader & ml) { { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; + case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; + case 40: type = LLM_TYPE_14B; break; + case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -814,6 +824,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 48: type = LLM_TYPE_30B_A3B; break; + case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -2133,6 +2145,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); @@ -2166,20 +2179,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); } + if (arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0); + } + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - - if (arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) { layer.bo = 
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); - layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); } else { - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + + if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + } else { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + } } layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); @@ -6074,6 +6098,11 @@ struct llm_build_bert : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); + if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); @@ -6126,13 +6155,29 @@ struct llm_build_bert : 
public llm_graph_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - if (model.arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) { + // MoE branch + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + nullptr, + model.layers[il].ffn_down_exps, + nullptr, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_GELU, + false, false, + 0.0f, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + cb(cur, "ffn_moe_out", il); + } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -6140,6 +6185,7 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } else { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -6147,8 +6193,8 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } - cb(cur, "ffn_out", il); // attentions bypass the intermediate layer cur = ggml_add(ctx0, cur, ffn_inp); @@ -13349,6 +13395,7 @@ llm_graph_result_ptr llama_model::build_graph( case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { llm = std::make_unique(*this, params, gf); } break; @@ -13714,6 +13761,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DBRX: case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_STABLELM: case LLM_ARCH_BITNET: case LLM_ARCH_QWEN: diff --git 
a/llama/llama.cpp/src/llama-model.h b/llama/llama.cpp/src/llama-model.h index 72bab5bee..6be91282a 100644 --- a/llama/llama.cpp/src/llama-model.h +++ b/llama/llama.cpp/src/llama-model.h @@ -40,11 +40,13 @@ enum llm_type { LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_5B, + LLM_TYPE_0_6B, LLM_TYPE_1B, LLM_TYPE_1_3B, LLM_TYPE_1_4B, LLM_TYPE_1_5B, LLM_TYPE_1_6B, + LLM_TYPE_1_7B, LLM_TYPE_1_8B, LLM_TYPE_2B, LLM_TYPE_2_8B, @@ -64,6 +66,7 @@ enum llm_type { LLM_TYPE_16B, LLM_TYPE_20B, LLM_TYPE_22B, + LLM_TYPE_27B, LLM_TYPE_30B, LLM_TYPE_32B, LLM_TYPE_34B, @@ -73,6 +76,7 @@ enum llm_type { LLM_TYPE_70B, LLM_TYPE_90B, LLM_TYPE_236B, + LLM_TYPE_290B, LLM_TYPE_314B, LLM_TYPE_671B, LLM_TYPE_SMALL, @@ -87,10 +91,10 @@ enum llm_type { LLM_TYPE_16x3_8B, LLM_TYPE_10B_128x3_66B, LLM_TYPE_57B_A14B, - LLM_TYPE_27B, - LLM_TYPE_290B, LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_128E, // llama4 Maverick + LLM_TYPE_30B_A3B, + LLM_TYPE_235B_A22B, }; struct llama_layer_posnet { diff --git a/llama/llama.cpp/src/llama-sampling.cpp b/llama/llama.cpp/src/llama-sampling.cpp index b1a9dca3c..757310533 100644 --- a/llama/llama.cpp/src/llama-sampling.cpp +++ b/llama/llama.cpp/src/llama-sampling.cpp @@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) // } if (k <= 0) { - k = cur_p->size; + return; } k = std::min(k, (int) cur_p->size); @@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) } cur_p->sorted = true; } + cur_p->size = k; } diff --git a/llama/llama.cpp/src/llama-vocab.cpp b/llama/llama.cpp/src/llama-vocab.cpp index ba37df355..d6515ff65 100644 --- a/llama/llama.cpp/src/llama-vocab.cpp +++ b/llama/llama.cpp/src/llama-vocab.cpp @@ -1497,7 +1497,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama3" || tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe"|| - tokenizer_pre == "falcon3") { + tokenizer_pre == "falcon3" || + tokenizer_pre == 
"pixtral") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true; diff --git a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch index 8f5c3a779..44aa70953 100644 --- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch +++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch @@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644 /** diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index a7febef7..31750b6f 100644 +index 9fb2134f..04ce764e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context { @@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 266d8af4..12886cd3 100644 +index d92392ed..425524d0 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) +@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) } free(ctx); @@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp -index a0667b7d..bd83adc5 100644 +index 140a775f..e33c4ba0 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp -@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { +@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, 
&request, sizeof(request), nullptr, 0); GGML_ASSERT(status); delete ctx; @@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp -index 1de34c96..4600f61e 100644 +index 66b6f2cc..e3e6deae 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp -@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { +@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { ggml_sycl_set_device(ctx->device); delete ctx; @@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644 } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ -@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context { +@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context { static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; delete ctx; @@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644 } static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ +@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_sycl_host_free(buffer->context); @@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index 39f3cd34..c569a8a5 100644 +index c0bdb9e1..03d03064 100644 --- 
a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { +@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; @@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644 } static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe +@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); ggml_vk_host_free(vk_instance.devices[0], buffer->context); diff --git a/llama/patches/0002-pretokenizer.patch b/llama/patches/0002-pretokenizer.patch index e51b43730..ecdabe7e1 100644 --- a/llama/patches/0002-pretokenizer.patch +++ b/llama/patches/0002-pretokenizer.patch @@ -10,7 +10,7 @@ logs instead of throwing an error 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 48060517..a35b498c 100644 +index 50ded286..a9ee9f03 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -31,7 +31,7 @@ index 48060517..a35b498c 100644 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { +@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; clean_spaces = false; } else { diff --git 
a/llama/patches/0003-embeddings.patch b/llama/patches/0003-embeddings.patch index c27dbd7b5..022a83f43 100644 --- a/llama/patches/0003-embeddings.patch +++ b/llama/patches/0003-embeddings.patch @@ -11,10 +11,10 @@ instead of forcing one or the error 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 983385f8..32f59819 100644 +index 5a2eef9b..9c1fe93f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) { int64_t n_outputs_all = 0; // count outputs @@ -23,7 +23,7 @@ index 983385f8..32f59819 100644 for (uint32_t i = 0; i < n_tokens_all; ++i) { n_outputs_all += batch.logits[i] != 0; } -@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} @@ -32,7 +32,7 @@ index 983385f8..32f59819 100644 auto * t_embd = cparams.embeddings ? 
res->get_embd() : nullptr; if (t_embd && res->get_embd_pooled()) { -@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { +@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead diff --git a/llama/patches/0004-clip-unicode.patch b/llama/patches/0004-clip-unicode.patch index 81d61a827..35f54fd3c 100644 --- a/llama/patches/0004-clip-unicode.patch +++ b/llama/patches/0004-clip-unicode.patch @@ -10,12 +10,12 @@ filesystems for paths that include wide characters 1 file changed, 39 insertions(+) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp -index 75970615..d57b4bd6 100644 +index ad3e7df1..b3218c78 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp -@@ -29,6 +29,19 @@ - #include +@@ -30,6 +30,19 @@ #include + #include +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN @@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644 struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; //#define CLIP_DEBUG_FUNCTIONS -@@ -1430,7 +1443,29 @@ struct clip_model_loader { +@@ -1971,7 +1984,29 @@ struct clip_model_loader { { std::vector read_buf; @@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644 if (!fin) { throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); } -@@ -1457,7 +1492,11 @@ struct clip_model_loader { +@@ -1998,7 +2033,11 @@ struct clip_model_loader { ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); } } diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index 76ddc6197..bf0fe310e 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -15,10 +15,10 @@ adds support for the Solar Pro architecture 7 files changed, 248 insertions(+) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index 62e1480b..f754bc8f 100644 +index f2bc8ca7..5ab3f572 100644 
--- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp -@@ -68,6 +68,7 @@ static const std::map LLM_ARCH_NAMES = { +@@ -69,6 +69,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_CHAMELEON, "chameleon" }, @@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644 { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_PLM, "plm" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" }, -@@ -140,6 +141,7 @@ static const std::map LLM_KV_NAMES = { +@@ -142,6 +143,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, @@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644 { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, -@@ -1482,6 +1484,24 @@ static const std::map> LLM_TENSOR_N +@@ -1502,6 +1504,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, }, }, @@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644 { LLM_ARCH_WAVTOKENIZER_DEC, { -@@ -1660,6 +1680,7 @@ static const std::map LLM_TENSOR_INFOS = { +@@ -1680,6 +1700,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, @@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644 {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h -index 98ca00a1..439aaeab 100644 +index 41a023da..525c1b7d 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h -@@ -72,6 +72,7 @@ enum llm_arch { +@@ -73,6 +73,7 @@ enum llm_arch { LLM_ARCH_GRANITE, 
LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, @@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644 LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_PLM, LLM_ARCH_BAILINGMOE, -@@ -144,6 +145,7 @@ enum llm_kv { +@@ -146,6 +147,7 @@ enum llm_kv { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, @@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644 LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, -@@ -344,6 +346,7 @@ enum llm_tensor { +@@ -346,6 +348,7 @@ enum llm_tensor { LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, @@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644 if (il < n_layer) { return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1); diff --git a/src/llama-hparams.h b/src/llama-hparams.h -index 80fcd65d..6e278945 100644 +index 7ee6a5b7..48dce407 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -55,6 +55,8 @@ struct llama_hparams { @@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644 uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; uint32_t n_lora_kv = 0; -@@ -153,6 +155,9 @@ struct llama_hparams { +@@ -154,6 +156,9 @@ struct llama_hparams { // dimension of the recurrent state embeddings uint32_t n_embd_v_s() const; @@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644 llama_model_loader::llama_model_loader( const std::string & fname, diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 6b7bfecf..aba42819 100644 +index 822e2bb2..572378c9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp -@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; @@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644 case LLM_ARCH_WAVTOKENIZER_DEC: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); -@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -3741,6 
+3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); @@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context { +@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context { } }; @@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644 struct llm_build_wavtokenizer_dec : public llm_graph_context { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { ggml_tensor * cur; -@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph( +@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; @@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644 case LLM_ARCH_WAVTOKENIZER_DEC: { llm = std::make_unique(*this, params, gf); -@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { +@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_CHAMELEON: @@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644 return LLAMA_ROPE_TYPE_NORM; diff --git a/src/llama-model.h b/src/llama-model.h -index fd82d106..5865d5e9 100644 +index 95eca002..856e6042 100644 --- a/src/llama-model.h +++ b/src/llama-model.h -@@ -62,6 +62,7 @@ enum llm_type { +@@ -64,6 +64,7 @@ enum llm_type { LLM_TYPE_15B, LLM_TYPE_16B, LLM_TYPE_20B, + LLM_TYPE_22B, + LLM_TYPE_27B, LLM_TYPE_30B, LLM_TYPE_32B, - LLM_TYPE_34B, -@@ -307,6 +308,8 @@ struct llama_layer { +@@ -311,6 +312,8 @@ 
struct llama_layer { struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr; diff --git a/llama/patches/0006-add-mllama-support.patch b/llama/patches/0006-add-mllama-support.patch index e5fa0462c..9283224fe 100644 --- a/llama/patches/0006-add-mllama-support.patch +++ b/llama/patches/0006-add-mllama-support.patch @@ -5,7 +5,6 @@ Subject: [PATCH] add mllama support adds support for the llama 3.2 vision architecture --- - examples/llava/gemma3-cli.cpp | 3 +- examples/llava/llava.cpp | 5 +- examples/llava/mtmd.cpp | 6 +- ggml/src/ggml-backend-reg.cpp | 6 +- @@ -25,34 +24,13 @@ adds support for the llama 3.2 vision architecture src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++- src/llama-model.h | 12 ++ src/llama-quant.cpp | 4 +- - 20 files changed, 475 insertions(+), 22 deletions(-) + 19 files changed, 473 insertions(+), 21 deletions(-) -diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp -index 3d566475..654d1358 100644 ---- a/examples/llava/gemma3-cli.cpp -+++ b/examples/llava/gemma3-cli.cpp -@@ -106,7 +106,7 @@ struct decode_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { -+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -118,6 +118,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp -index 03a22cbb..5eb40bcd 100644 +index c00d16ae..bab027b5 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp -@@ -456,7 +456,7 @@ struct llava_embd_batch { +@@ -457,7 +457,7 @@ struct llava_embd_batch { 
std::vector seq_ids; std::vector logits; llama_batch batch; @@ -61,7 +39,7 @@ index 03a22cbb..5eb40bcd 100644 pos .resize(n_tokens); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); -@@ -468,6 +468,7 @@ struct llava_embd_batch { +@@ -469,6 +469,7 @@ struct llava_embd_batch { /*n_tokens =*/ n_tokens, /*tokens =*/ nullptr, /*embd =*/ embd, @@ -69,7 +47,7 @@ index 03a22cbb..5eb40bcd 100644 /*pos =*/ pos.data(), /*n_seq_id =*/ n_seq_id.data(), /*seq_id =*/ seq_ids.data(), -@@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ +@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ n_eval = n_batch; } float * embd = image_embed->embed+i*n_embd; @@ -79,19 +57,19 @@ index 03a22cbb..5eb40bcd 100644 LOG_ERR("%s : failed to eval\n", __func__); return false; diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp -index 3fd5bebc..f0cec596 100644 +index 7081fd73..c14ac501 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp -@@ -233,7 +233,7 @@ struct decode_embd_batch { +@@ -476,7 +476,7 @@ struct decode_embd_batch { std::vector seq_ids; std::vector logits; llama_batch batch; -- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { -+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); +- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { ++ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { + pos .resize(n_tokens * n_pos_per_embd); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); -@@ -245,6 +245,7 @@ struct decode_embd_batch { +@@ -487,6 +487,7 @@ struct decode_embd_batch { /*n_tokens =*/ n_tokens, /*tokens =*/ 
nullptr, /*embd =*/ embd, @@ -99,16 +77,16 @@ index 3fd5bebc..f0cec596 100644 /*pos =*/ pos.data(), /*n_seq_id =*/ n_seq_id.data(), /*seq_id =*/ seq_ids.data(), -@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, - - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); +@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, + int32_t i_batch = 0; + int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; float * embd = mtmd_get_output_embd(ctx); -- decode_embd_batch batch_img(embd, n_tokens, n_past, 0); +- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); + int n_embd = llama_model_n_embd(llama_get_model(lctx)); -+ decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0); - int64_t t1 = ggml_time_ms(); - ret = llama_decode(lctx, batch_img.batch); - if (ret != 0) { ++ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0); + + const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get()); + const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get()); diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31..82ae1b5b 100644 --- a/ggml/src/ggml-backend-reg.cpp @@ -127,10 +105,10 @@ index 405d8e31..82ae1b5b 100644 register_backend(ggml_backend_rpc_reg()); #endif diff --git a/include/llama.h b/include/llama.h -index 5657fbf0..f91896e4 100644 +index 06c56395..f1628e88 100644 --- a/include/llama.h +++ b/include/llama.h -@@ -255,6 +255,7 @@ extern "C" { +@@ -256,6 +256,7 @@ extern "C" { llama_token * token; float * embd; @@ -138,7 +116,7 @@ index 5657fbf0..f91896e4 100644 llama_pos * pos; int32_t * n_seq_id; llama_seq_id ** seq_id; -@@ -357,6 +358,7 @@ extern "C" { +@@ -358,6 +359,7 @@ extern "C" { bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool no_perf; // whether to measure performance timings @@ -146,7 +124,7 @@ index 
5657fbf0..f91896e4 100644 // Abort callback // if it returns true, execution of llama_decode() will be aborted -@@ -458,6 +460,10 @@ extern "C" { +@@ -459,6 +461,10 @@ extern "C" { struct llama_context_params params), "use llama_init_from_model instead"); @@ -158,7 +136,7 @@ index 5657fbf0..f91896e4 100644 LLAMA_API void llama_free(struct llama_context * ctx); diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index f754bc8f..0568565f 100644 +index 5ab3f572..eb7b5325 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -6,6 +6,7 @@ @@ -169,7 +147,7 @@ index f754bc8f..0568565f 100644 { LLM_ARCH_LLAMA4, "llama4" }, { LLM_ARCH_DECI, "deci" }, { LLM_ARCH_FALCON, "falcon" }, -@@ -142,6 +143,7 @@ static const std::map LLM_KV_NAMES = { +@@ -144,6 +145,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, @@ -177,7 +155,7 @@ index f754bc8f..0568565f 100644 { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, -@@ -271,6 +273,40 @@ static const std::map> LLM_TENSOR_N +@@ -273,6 +275,40 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, @@ -218,7 +196,7 @@ index f754bc8f..0568565f 100644 { LLM_ARCH_DECI, { -@@ -1681,6 +1717,14 @@ static const std::map LLM_TENSOR_INFOS = { +@@ -1701,6 +1737,14 @@ static const std::map LLM_TENSOR_INFOS = { // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -234,7 +212,7 @@ index f754bc8f..0568565f 100644 {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h 
b/src/llama-arch.h -index 439aaeab..6a989034 100644 +index 525c1b7d..bc8a4f0b 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -11,6 +11,7 @@ @@ -245,7 +223,7 @@ index 439aaeab..6a989034 100644 LLM_ARCH_DECI, LLM_ARCH_FALCON, LLM_ARCH_BAICHUAN, -@@ -146,6 +147,7 @@ enum llm_kv { +@@ -148,6 +149,7 @@ enum llm_kv { LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, @@ -253,7 +231,7 @@ index 439aaeab..6a989034 100644 LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, -@@ -347,6 +349,14 @@ enum llm_tensor { +@@ -349,6 +351,14 @@ enum llm_tensor { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, LLM_TENSOR_BSKCN_TV, @@ -297,10 +275,10 @@ index 01d5ca57..8682b0e6 100644 batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 32f59819..0343ba8a 100644 +index 9c1fe93f..cd06ad91 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) { +@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); } @@ -309,7 +287,7 @@ index 32f59819..0343ba8a 100644 } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG -@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) { +@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) { cparams.warmup = value; } @@ -320,7 +298,7 @@ index 32f59819..0343ba8a 100644 void llama_context::set_adapter_lora( llama_adapter_lora * adapter, float scale) { -@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) { +@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) { const int64_t n_embd = hparams.n_embd; @@ -329,7 +307,7 @@ index 32f59819..0343ba8a 100644 const llama_ubatch ubatch = 
sbatch.split_simple(n_tokens); -@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) { const llama_batch & batch = batch_allocr.batch; @@ -341,7 +319,7 @@ index 32f59819..0343ba8a 100644 const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; -@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) { const bool logits_all = n_outputs_all == n_tokens_all; @@ -350,7 +328,7 @@ index 32f59819..0343ba8a 100644 /* simple_split */ !kv_self->recurrent, /* logits_all */ logits_all); -@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) { int32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; @@ -364,7 +342,7 @@ index 32f59819..0343ba8a 100644 const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead -@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { +@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { void llama_context::output_reorder() { auto & out_ids = sbatch.out_ids; if (!out_ids.empty()) { @@ -373,7 +351,7 @@ index 32f59819..0343ba8a 100644 const uint32_t n_embd = model.hparams.n_embd; GGML_ASSERT((size_t) n_outputs == out_ids.size()); -@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { +@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); @@ -382,7 +360,7 @@ index 32f59819..0343ba8a 100644 io.write(&logits_size, sizeof(logits_size)); -@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() { +@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() { 
/*.offload_kqv =*/ true, /*.flash_attn =*/ false, /*.no_perf =*/ true, @@ -390,7 +368,7 @@ index 32f59819..0343ba8a 100644 /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, }; -@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { +@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { ctx->set_warmup(warmup); } @@ -402,7 +380,7 @@ index 32f59819..0343ba8a 100644 ctx->synchronize(); } diff --git a/src/llama-context.h b/src/llama-context.h -index 04facb54..baa03276 100644 +index 5457f077..a50c4afa 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -65,6 +65,7 @@ struct llama_context { @@ -426,10 +404,10 @@ index 30e550f0..85ad91b9 100644 enum llama_pooling_type pooling_type; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp -index a85e9728..d740c120 100644 +index fabb9ca2..b67216a4 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp -@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { +@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { } } @@ -442,7 +420,7 @@ index a85e9728..d740c120 100644 // // llm_graph_context // -@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { +@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); } @@ -469,7 +447,7 @@ index a85e9728..d740c120 100644 llm_graph_input_attn_cross * inp, ggml_cgraph * gf, diff --git a/src/llama-graph.h b/src/llama-graph.h -index d192dc14..260a2af2 100644 +index d0c8d321..0fe18150 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -86,6 +86,7 @@ public: @@ -480,7 +458,7 @@ index d192dc14..260a2af2 100644 }; class llm_graph_input_pos : public llm_graph_input_i { -@@ -285,6 +286,16 @@ public: +@@ -283,6 +284,16 @@ public: const llama_cross * cross = nullptr; }; @@ 
-497,7 +475,7 @@ index d192dc14..260a2af2 100644 // // llm_graph_result // -@@ -493,6 +504,7 @@ struct llm_graph_context { +@@ -491,6 +502,7 @@ struct llm_graph_context { ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_s_copy() const; ggml_tensor * build_inp_s_mask() const; @@ -518,7 +496,7 @@ index 8a667960..6a02de03 100644 + return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h -index 6e278945..c8a34d52 100644 +index 48dce407..b6fc7e6d 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -2,6 +2,8 @@ @@ -546,7 +524,7 @@ index 6e278945..c8a34d52 100644 uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; -@@ -158,6 +162,9 @@ struct llama_hparams { +@@ -159,6 +163,9 @@ struct llama_hparams { // Block skip connection bool n_bskcn(uint32_t n, uint32_t il) const; @@ -593,10 +571,10 @@ index a012aeae..2e11507d 100644 bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { const int kid = gguf_find_key(meta.get(), key.c_str()); diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index aba42819..d051696c 100644 +index 572378c9..9d099f11 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp -@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get general kv ml.get_key(LLM_KV_GENERAL_NAME, name, false); @@ -604,7 +582,7 @@ index aba42819..d051696c 100644 // everything past this point is not vocab-related if (hparams.vocab_only) { -@@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); @@ -612,7 +590,7 @@ index 
aba42819..d051696c 100644 if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); -@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); @@ -624,7 +602,7 @@ index aba42819..d051696c 100644 // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; -@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); @@ -633,7 +611,7 @@ index aba42819..d051696c 100644 if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } -@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.use_kq_norm = false; } } break; @@ -650,7 +628,7 @@ index aba42819..d051696c 100644 case LLM_ARCH_DECI: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_ff = hparams.n_ff(); const int64_t n_embd_gqa = n_embd_v_gqa; @@ -659,7 +637,7 @@ index aba42819..d051696c 100644 const int64_t n_token_types = vocab.n_token_types(); const int64_t n_rot = hparams.n_rot; const int64_t n_expert = hparams.n_expert; -@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -1815,6 +1829,52 @@ bool 
llama_model::load_tensors(llama_model_loader & ml) { } } } break; @@ -712,7 +690,7 @@ index aba42819..d051696c 100644 case LLM_ARCH_DECI: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); -@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context { +@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context { } }; @@ -959,7 +937,7 @@ index aba42819..d051696c 100644 struct llm_build_deci : public llm_graph_context { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; -@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph( +@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; @@ -970,7 +948,7 @@ index aba42819..d051696c 100644 case LLM_ARCH_DECI: { llm = std::make_unique(*this, params, gf); -@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { +@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA4: @@ -979,7 +957,7 @@ index aba42819..d051696c 100644 case LLM_ARCH_BAICHUAN: case LLM_ARCH_STARCODER: diff --git a/src/llama-model.h b/src/llama-model.h -index 5865d5e9..72bab5be 100644 +index 856e6042..6be91282 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -11,6 +11,7 @@ @@ -990,15 +968,15 @@ index 5865d5e9..72bab5be 100644 struct llama_cparams; struct llama_ubatch; -@@ -70,6 +71,7 @@ enum llm_type { +@@ -73,6 +74,7 @@ enum llm_type { LLM_TYPE_40B, LLM_TYPE_65B, LLM_TYPE_70B, + LLM_TYPE_90B, LLM_TYPE_236B, + LLM_TYPE_290B, LLM_TYPE_314B, - LLM_TYPE_671B, -@@ -310,6 +312,16 @@ struct llama_layer { +@@ -314,6 +316,16 @@ struct llama_layer { struct ggml_tensor * bskcn_tv = nullptr; diff --git 
a/llama/patches/0007-add-unpad-operator.patch b/llama/patches/0007-add-unpad-operator.patch index 116545d67..50acfc632 100644 --- a/llama/patches/0007-add-unpad-operator.patch +++ b/llama/patches/0007-add-unpad-operator.patch @@ -18,10 +18,10 @@ adds the unpad operator to GGML 10 files changed, 223 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h -index 8fcc16df..d19fc167 100644 +index 1b8603e7..53ef31b2 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h -@@ -488,6 +488,7 @@ extern "C" { +@@ -489,6 +489,7 @@ extern "C" { GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, GGML_OP_PAD_REFLECT_1D, @@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644 GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, -@@ -1757,6 +1758,15 @@ extern "C" { +@@ -1777,6 +1778,15 @@ extern "C" { int p0, int p1); @@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644 // timesteps: [N,] // return: [N, dim] diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 50400328..432942bf 100644 +index 64405449..34624cca 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c -@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm +@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad_reflect_1d(params, tensor); } break; @@ -60,7 +60,7 @@ index 50400328..432942bf 100644 case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); -@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { +@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: @@ -69,10 +69,10 @@ index 50400328..432942bf 100644 case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp -index 
6050147b..66b8da68 100644 +index 7413192b..becdae07 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp -@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d( +@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d( } } @@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644 static void ggml_compute_forward_arange_f32( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h -index 410a3720..3eca1cf8 100644 +index dc081b9e..a7125555 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h -@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params +@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); @@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 31750b6f..0fef9522 100644 +index 04ce764e..491acccb 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg +@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; @@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644 case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; -@@ -3222,6 +3225,7 @@ static bool 
ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g +@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_UPSCALE: return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_PAD: @@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 12886cd3..b2e95a66 100644 +index 425524d0..112abef6 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte @@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644 GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, -@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass +@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); @@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); -@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex +@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_POOL_2D: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: @@ -260,7 +260,7 @@ index 
12886cd3..b2e95a66 100644 case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: case GGML_OP_LEAKY_RELU: -@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node( +@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node( const int nth = MIN(1024, ne0); @@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644 } break; case GGML_OP_ARANGE: diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal -index 8d6e99e6..71f0f97f 100644 +index 9f4147e9..6ceb3cef 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32( @@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644 device char * dst, constant ggml_metal_kargs_arange & args, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 950772c7..2276b631 100644 +index 7654ae17..3c57aff8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c -@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { +@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "UPSCALE", "PAD", "PAD_REFLECT_1D", @@ -365,16 +365,16 @@ index 950772c7..2276b631 100644 "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", -@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { +@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; --static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); -+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); ++static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", -@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { +@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "upscale(x)", "pad(x)", "pad_reflect_1d(x)", @@ -382,16 +382,16 @@ index 950772c7..2276b631 100644 "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", 
"argsort(x)", -@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { +@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; --static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); -+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); ++static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); -@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( +@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( return result; } diff --git a/llama/patches/0008-fix-deepseek-deseret-regex.patch b/llama/patches/0008-fix-deepseek-deseret-regex.patch index 9b2d33984..5b4753bf8 100644 --- a/llama/patches/0008-fix-deepseek-deseret-regex.patch +++ b/llama/patches/0008-fix-deepseek-deseret-regex.patch @@ -12,7 +12,7 @@ regex 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index a35b498c..032019c9 100644 +index a9ee9f03..1306864e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { diff --git a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch index 5504b1d31..4c2192887 100644 --- a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch +++ b/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch @@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp -index 90679822..56043678 100644 +index 5b3059c2..656b3eca 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp -@@ -346,7 +346,7 @@ private: +@@ -349,7 +349,7 @@ private: friend std::string build_grammar(const 
std::function & cb, const common_grammar_options & options); std::function _fetch_json; bool _dotall; diff --git a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch index c9d4e9ad8..e4b2a4081 100644 --- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch +++ b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch @@ -22,10 +22,10 @@ multiple batches of processing until everything is complete. 4 files changed, 51 insertions(+), 106 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 0343ba8a..4b3e6a83 100644 +index cd06ad91..77177c5e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( +@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( llm_graph_result_ptr llama_context::build_kv_self_defrag( ggml_context * ctx0, @@ -41,7 +41,7 @@ index 0343ba8a..4b3e6a83 100644 #if 0 // CPU defrag // -@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( +@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); } #else @@ -79,7 +79,7 @@ index 0343ba8a..4b3e6a83 100644 ggml_tensor * view_v_src; ggml_tensor * view_v_dst; -@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( +@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( if (cparams.flash_attn) { // NOTE: the V cache is not transposed when using flash attention view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], @@ -122,7 +122,7 @@ index 0343ba8a..4b3e6a83 100644 #endif return res; -@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( +@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( void llama_context::kv_self_update() { auto & kv = kv_self; @@ -131,7 +131,7 @@ index 
0343ba8a..4b3e6a83 100644 if (kv->has_shift) { if (!kv->get_can_shift()) { GGML_ABORT("The current context does not support K-shift"); -@@ -763,8 +744,6 @@ void llama_context::kv_self_update() { +@@ -752,8 +733,6 @@ void llama_context::kv_self_update() { res->set_inputs(nullptr); graph_compute(gf, false); @@ -140,7 +140,7 @@ index 0343ba8a..4b3e6a83 100644 } { -@@ -779,49 +758,28 @@ void llama_context::kv_self_update() { +@@ -768,49 +747,28 @@ void llama_context::kv_self_update() { // defragment the KV cache if needed if (kv->do_defrag) { LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); @@ -202,7 +202,7 @@ index 0343ba8a..4b3e6a83 100644 } enum llama_pooling_type llama_context::pooling_type() const { -@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) { // find KV slot { if (!kv_self->find_slot(ubatch)) { @@ -219,7 +219,7 @@ index 0343ba8a..4b3e6a83 100644 if (!kv_self->recurrent) { diff --git a/src/llama-context.h b/src/llama-context.h -index baa03276..a59ff8fd 100644 +index a50c4afa..30f84bfd 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -5,6 +5,7 @@ @@ -230,7 +230,7 @@ index baa03276..a59ff8fd 100644 #include "ggml-cpp.h" -@@ -180,7 +181,8 @@ private: +@@ -179,7 +180,8 @@ private: llm_graph_result_ptr build_kv_self_defrag( ggml_context * ctx0, diff --git a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch b/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch index eaba3c4c8..6de840a60 100644 --- a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch +++ b/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch @@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants 1 file changed, 2 insertions(+) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt -index f00700da..91d6a7d5 100644 +index 43d9fc4f..4c0d3824 100644 --- 
a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt -@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name) +@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name) endforeach() ggml_add_cpu_backend_variant_impl(${tag_name}) @@ -19,11 +19,11 @@ index f00700da..91d6a7d5 100644 endfunction() ggml_add_backend(CPU) -@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS) +@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS) if (NOT GGML_BACKEND_DL) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") endif() + add_custom_target(ggml-cpu) - ggml_add_cpu_backend_variant(sandybridge AVX) - ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(x64) + ggml_add_cpu_backend_variant(sse42 SSE42) + ggml_add_cpu_backend_variant(sandybridge SSE42 AVX) diff --git a/llama/patches/0013-remove-amx.patch b/llama/patches/0013-remove-amx.patch index 0bbc0a3a3..c27032372 100644 --- a/llama/patches/0013-remove-amx.patch +++ b/llama/patches/0013-remove-amx.patch @@ -1,6 +1,6 @@ From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: jmorganca -Date: Tue, 8 Apr 2025 20:33:01 -0700 +Date: Thu, 1 May 2025 15:05:08 -0700 Subject: [PATCH] remove amx disable amx as it reduces performance on some systems @@ -9,16 +9,16 @@ disable amx as it reduces performance on some systems 1 file changed, 4 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt -index 91d6a7d5..d6b393a2 100644 +index 4c0d3824..79c26312 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt -@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI) +@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS) + 
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) - if (NOT MSVC) - # MSVC doesn't support AMX -- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) +- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) - endif() elseif (GGML_CPU) ggml_add_cpu_backend_variant_impl("") diff --git a/llama/patches/0014-fix-string-arr-kv-loading.patch b/llama/patches/0014-fix-string-arr-kv-loading.patch index 01f1b71eb..5d94ca2c8 100644 --- a/llama/patches/0014-fix-string-arr-kv-loading.patch +++ b/llama/patches/0014-fix-string-arr-kv-loading.patch @@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644 } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 032019c9..ba37df35 100644 +index 1306864e..d6515ff6 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { diff --git a/llama/patches/0015-ollama-debug-tensor.patch b/llama/patches/0015-ollama-debug-tensor.patch index a192bdea8..79d997c75 100644 --- a/llama/patches/0015-ollama-debug-tensor.patch +++ b/llama/patches/0015-ollama-debug-tensor.patch @@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 432942bf..6d4abe4c 100644 +index 34624cca..59bd3c62 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -15,6 +15,8 @@ @@ -20,7 +20,7 @@ index 432942bf..6d4abe4c 100644 #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -@@ 
-2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { +@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(&params, node); diff --git a/llama/patches/0016-add-model-quantizations.patch b/llama/patches/0016-add-model-quantizations.patch index 3d078b03f..2e3be0c68 100644 --- a/llama/patches/0016-add-model-quantizations.patch +++ b/llama/patches/0016-add-model-quantizations.patch @@ -13,10 +13,10 @@ models not supported in llama.cpp 4 files changed, 24 insertions(+) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index 0568565f..dd01df60 100644 +index eb7b5325..df42d1a5 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp -@@ -73,6 +73,7 @@ static const std::map LLM_ARCH_NAMES = { +@@ -74,6 +74,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_PLM, "plm" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" }, @@ -24,7 +24,7 @@ index 0568565f..dd01df60 100644 { LLM_ARCH_UNKNOWN, "(unknown)" }, }; -@@ -1586,6 +1587,22 @@ static const std::map> LLM_TENSOR_N +@@ -1606,6 +1607,22 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, @@ -48,10 +48,10 @@ index 0568565f..dd01df60 100644 LLM_ARCH_UNKNOWN, { diff --git a/src/llama-arch.h b/src/llama-arch.h -index 6a989034..b6227eeb 100644 +index bc8a4f0b..bda9d071 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h -@@ -75,6 +75,7 @@ enum llm_arch { +@@ -76,6 +76,7 @@ enum llm_arch { LLM_ARCH_CHAMELEON, LLM_ARCH_SOLAR, LLM_ARCH_WAVTOKENIZER_DEC, @@ -60,10 +60,10 @@ index 6a989034..b6227eeb 100644 LLM_ARCH_BAILINGMOE, LLM_ARCH_UNKNOWN, diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index d051696c..c8374159 100644 +index 9d099f11..ef70486d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp -@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -1437,6 +1437,7 @@ void llama_model::load_hparams(llama_model_loader &
ml) { default: type = LLM_TYPE_UNKNOWN; } } break; @@ -71,7 +71,7 @@ index d051696c..c8374159 100644 default: throw std::runtime_error("unsupported model architecture"); } -@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { +@@ -13751,6 +13752,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_CHAMELEON: case LLM_ARCH_SOLAR: case LLM_ARCH_BAILINGMOE: diff --git a/llama/patches/0021-add-ollama-vocab-for-grammar-support.patch b/llama/patches/0017-add-ollama-vocab-for-grammar-support.patch similarity index 98% rename from llama/patches/0021-add-ollama-vocab-for-grammar-support.patch rename to llama/patches/0017-add-ollama-vocab-for-grammar-support.patch index 6193b755f..26a91ad9a 100644 --- a/llama/patches/0021-add-ollama-vocab-for-grammar-support.patch +++ b/llama/patches/0017-add-ollama-vocab-for-grammar-support.patch @@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644 const char * grammar_root, bool lazy, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp -index d1497985..b1a9dca3 100644 +index c0a5f934..75731053 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp -@@ -1465,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { +@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { trigger_patterns_c.push_back(trigger_pattern.pattern.c_str()); } @@ -196,7 +196,7 @@ index d1497985..b1a9dca3 100644 ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(), ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size()); -@@ -1547,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( +@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( /* .vocab = */ vocab, /* .grammar_str = */ grammar_str, /* .grammar_root = */ grammar_root, diff --git a/ml/backend/ggml/ggml/include/ggml-cpu.h b/ml/backend/ggml/ggml/include/ggml-cpu.h index 
f5e11f1e1..de77a875e 100644 --- a/ml/backend/ggml/ggml/include/ggml-cpu.h +++ b/ml/backend/ggml/ggml/include/ggml-cpu.h @@ -133,6 +133,11 @@ extern "C" { GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t); + #ifdef __cplusplus } #endif diff --git a/ml/backend/ggml/ggml/include/ggml-rpc.h b/ml/backend/ggml/ggml/include/ggml-rpc.h index c8b6097f7..1e6741127 100644 --- a/ml/backend/ggml/ggml/include/ggml-rpc.h +++ b/ml/backend/ggml/ggml/include/ggml-rpc.h @@ -7,7 +7,7 @@ extern "C" { #endif -#define RPC_PROTO_MAJOR_VERSION 1 +#define RPC_PROTO_MAJOR_VERSION 2 #define RPC_PROTO_MINOR_VERSION 0 #define RPC_PROTO_PATCH_VERSION 0 #define GGML_RPC_MAX_SERVERS 16 diff --git a/ml/backend/ggml/ggml/include/ggml.h b/ml/backend/ggml/ggml/include/ggml.h index d19fc1678..53ef31b22 100644 --- a/ml/backend/ggml/ggml/include/ggml.h +++ b/ml/backend/ggml/ggml/include/ggml.h @@ -393,8 +393,8 @@ extern "C" { // precision enum ggml_prec { - GGML_PREC_DEFAULT, - GGML_PREC_F32, + GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default + GGML_PREC_F32 = 10, }; // model file types @@ -481,6 +481,7 @@ extern "C" { GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, GGML_OP_IM2COL_BACK, + GGML_OP_CONV_2D_DW, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -678,6 +679,9 @@ extern "C" { GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 + // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN + GGML_API bool ggml_is_contiguous_channels(const 
struct ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -1661,7 +1665,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // depthwise + // depthwise (via im2col and mul_mat) GGML_API struct ggml_tensor * ggml_conv_2d_dw( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1673,6 +1677,22 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 + // Depthwise 2D convolution + // may be faster than ggml_conv_2d_dw, but not available in all backends + // a: KW KH 1 C convolution kernel + // b: W H C N input data + // res: W_out H_out C N + GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1); + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ml/backend/ggml/ggml/src/CMakeLists.txt b/ml/backend/ggml/ggml/src/CMakeLists.txt index d6b393a21..79c26312f 100644 --- a/ml/backend/ggml/ggml/src/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/CMakeLists.txt @@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name) set(GGML_CPU_TAG_NAME ${tag_name}) # other: OPENMP LLAMAFILE CPU_HBM foreach (feat NATIVE + SSE42 AVX AVX2 BMI2 AVX_VNNI FMA F16C AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8 AMX_BF16) @@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") endif() add_custom_target(ggml-cpu) - ggml_add_cpu_backend_variant(sandybridge AVX) - ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 
BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI) + ggml_add_cpu_backend_variant(x64) + ggml_add_cpu_backend_variant(sse42 SSE42) + ggml_add_cpu_backend_variant(sandybridge SSE42 AVX) + ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA) + ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) elseif (GGML_CPU) ggml_add_cpu_backend_variant_impl("") endif() diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt index e73a3b69b..9a3085bef 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt @@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) elseif (GGML_AVX) list(APPEND ARCH_FLAGS /arch:AVX) list(APPEND ARCH_DEFINITIONS GGML_AVX) - else () + elseif (GGML_SSE42) list(APPEND ARCH_FLAGS /arch:SSE4.2) list(APPEND ARCH_DEFINITIONS GGML_SSE42) endif() @@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (GGML_NATIVE) list(APPEND ARCH_FLAGS -march=native) else () - list(APPEND ARCH_FLAGS -msse4.2) - list(APPEND ARCH_DEFINITIONS GGML_SSE42) + if (GGML_SSE42) + list(APPEND ARCH_FLAGS -msse4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + endif() if (GGML_F16C) list(APPEND ARCH_FLAGS -mf16c) list(APPEND ARCH_DEFINITIONS GGML_F16C) @@ -350,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # TODO: Separation to determine activation of VX/VXE/VXE2 if (${S390X_M} MATCHES "8561|8562") message(STATUS "z15 target") - list(APPEND ARCH_FLAGS -march=z15 -mtune=z15) + list(APPEND ARCH_FLAGS -march=z15) elseif (${S390X_M} MATCHES "3931") message(STATUS "z16 target") - list(APPEND ARCH_FLAGS -march=z16 -mtune=z16) + list(APPEND 
ARCH_FLAGS -march=z16) + elseif (${S390X_M} MATCHES "9175|9176") + # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version. + message(STATUS "z17 target") + list(APPEND ARCH_FLAGS -march=z17) else() message(STATUS "Unknown target") message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.") diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp index 902ee4346..d775a0363 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -263,7 +263,7 @@ void test_x86_is() { static int ggml_backend_cpu_x86_score() { // FIXME: this does not check for OS support - int score = 0; + int score = 1; cpuid_x86 is; #ifdef GGML_FMA diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c index 6d4abe4c7..59bd3c621 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c @@ -217,7 +217,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .nrows = 1, }, [GGML_TYPE_F16] = { - .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot_type = GGML_TYPE_F16, .nrows = 1, @@ -358,7 +358,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_K, }, [GGML_TYPE_BF16] = { - .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row, + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, @@ -1934,6 +1934,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col_back_f32(params, tensor); } break; + case 
GGML_OP_CONV_2D_DW: + { + ggml_compute_forward_conv_2d_dw(params, tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor); @@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_IM2COL: case GGML_OP_IM2COL_BACK: + case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_2D: { @@ -3172,6 +3177,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g return ggml_graph_compute(cgraph, &cplan); } +void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { + int64_t i = 0; +#if defined(__F16C__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + __m512 x_vec = _mm512_loadu_ps(x + i); + __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm256_storeu_si256((__m256i *)(y + i), y_vec); + } +#endif + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for (; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } +#endif + for (; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(x[i]); + } +} + +void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) { + int64_t i = 0; +#if defined(__F16C__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i)); + __m512 y_vec = _mm512_cvtph_ps(x_vec); + _mm512_storeu_ps(y + i, y_vec); + } +#endif + for (; i + 7 < n; i += 8) { + __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i)); + __m256 y_vec = _mm256_cvtph_ps(x_vec); + _mm256_storeu_ps(y + i, y_vec); + } + for (; i + 3 < n; i += 4) { + __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i)); + __m128 y_vec = _mm_cvtph_ps(x_vec); + 
_mm_storeu_ps(y + i, y_vec); + } +#endif + for (; i < n; ++i) { + y[i] = GGML_FP16_TO_FP32(x[i]); + } +} + +void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) { + int64_t i = 0; + for (; i < n; ++i) { + y[i] = GGML_FP32_TO_BF16(x[i]); + } +} + +void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) { + int64_t i = 0; +#if defined(__AVX2__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + _mm512_storeu_ps(y + i, + _mm512_castsi512_ps( + _mm512_slli_epi32( + _mm512_cvtepu16_epi32( + _mm256_loadu_si256( + (const __m256i *)(x + i))), + 16))); + } +#endif + for (; i + 7 < n; i += 8) { + _mm256_storeu_ps(y + i, + _mm256_castsi256_ps( + _mm256_slli_epi32( + _mm256_cvtepu16_epi32( + _mm_loadu_si128( + (const __m128i *)(x + i))), + 16))); + } +#endif + for (; i < n; i++) { + y[i] = GGML_BF16_TO_FP32(x[i]); + } +} int ggml_cpu_has_avx(void) { #if defined(__AVX__) diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp index 66b8da68f..becdae075 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp @@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16( GGML_ASSERT(i01 >= 0 && i01 < ne01); - ggml_fp16_to_fp32_row( + ggml_cpu_fp16_to_fp32( (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } @@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16( GGML_ASSERT(i01 >= 0 && i01 < ne01); - ggml_bf16_to_fp32_row( + ggml_cpu_bf16_to_fp32( (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } @@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d( } } +// ggml_compute_forward_conv_2d_dw + +struct ggml_conv_2d_dw_params { + int64_t channels; + int64_t batch; + int64_t src_w; + int64_t src_h; + int64_t dst_w; + int64_t dst_h; 
+ int64_t knl_w; + int64_t knl_h; + int stride_x; + int stride_y; + int pad_x; + int pad_y; + int dilation_x; + int dilation_y; +}; + +static void ggml_compute_forward_conv_2d_dw_cwhn( + const ggml_compute_params * params, + const ggml_tensor * src, + const ggml_tensor * kernel, + ggml_tensor * dst, + const ggml_conv_2d_dw_params & p) { + + const int64_t c = p.channels; + const float * knl_data = (const float *)kernel->data; + + const int64_t rows_total = p.dst_h * p.batch; + const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; + const int64_t row_start = params->ith * rows_per_thread; + const int64_t row_end = MIN(row_start + rows_per_thread, rows_total); + +#ifdef GGML_SIMD + const int64_t pkg_size = GGML_F32_EPR; + const int64_t pkg_count = c / pkg_size; + const int64_t c_pkg_end = pkg_count * pkg_size; +#else + const int64_t c_pkg_end = 0; +#endif + + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t dst_y = row % p.dst_h; + const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; + const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; + const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + +#ifdef GGML_SIMD + // Vectorized loop + for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { + GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); + GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); + sum = 
GGML_F32_VEC_FMA(sum, k, s); + } + } + GGML_F32_VEC_STORE(dst_data + c_i, sum); + } +#endif + // Scalar loop + for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] + * src_data[(src_y * p.src_w + src_x) * c + c_i]; + } + } + dst_data[c_i] = sum; + } + } + } +} + +static void ggml_compute_forward_conv_2d_dw_whcn( + const ggml_compute_params * params, + const ggml_tensor * src, + const ggml_tensor * kernel, + ggml_tensor * dst, + const ggml_conv_2d_dw_params & p) { + + const int64_t n = p.channels * p.batch; + const int64_t per_thread = (n + params->nth - 1) / params->nth; + const int64_t start = params->ith * per_thread; + const int64_t end = MIN(start + per_thread, n); + + for (int64_t i = start; i < end; ++i) { + const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; + const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; + float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; + + for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[knl_y * p.knl_w + knl_x] + * src_data[src_y * p.src_w + src_x]; + } + } + dst_data[dst_y * p.dst_w + dst_x] = 
sum; + } + } + } +} + +void ggml_compute_forward_conv_2d_dw( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * src = dst->src[1]; + ggml_conv_2d_dw_params p; + p.channels = src->ne[2]; + p.batch = src->ne[3]; + p.src_w = src->ne[0]; + p.src_h = src->ne[1]; + p.dst_w = dst->ne[0]; + p.dst_h = dst->ne[1]; + p.knl_w = kernel->ne[0]; + p.knl_h = kernel->ne[1]; + p.stride_x = dst->op_params[0]; + p.stride_y = dst->op_params[1]; + p.pad_x = dst->op_params[2]; + p.pad_y = dst->op_params[3]; + p.dilation_x = dst->op_params[4]; + p.dilation_y = dst->op_params[5]; + + GGML_ASSERT(kernel->ne[3] == p.channels); + GGML_ASSERT(dst->ne[3] == p.batch); + + if (ggml_is_contiguous(src)) { + ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p); + } else if (ggml_is_contiguous_channels(src)) { + // kernel should also have channels most contiguous in memory + GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]); + ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p); + } else { + GGML_ABORT("non-contiguous memory layout not supported"); + } +} + // ggml_compute_forward_pool_1d_sk_p0 static void ggml_compute_forward_pool_1d_sk_p0( diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h index 3eca1cf86..a7125555e 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h @@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct 
ggml_tensor * dst); void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h b/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h index 04d10cec2..45c31cf1f 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h @@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { #define GGML_F32_EPR 4 #define GGML_F32x4 vector float -#define GGML_F32x4_ZERO 0.0f +#define GGML_F32x4_ZERO {0.0f} #define GGML_F32x4_SET1 vec_splats #define GGML_F32x4_LOAD(p) vec_xl(0, p) #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh index 8284a0017..2ea014e64 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh @@ -78,13 +78,13 @@ // Moore Threads #define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210) -#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 -#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 -#define GGML_CUDA_CC_NG (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD +#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 +#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 +#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD) #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2) -#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < 
GGML_CUDA_CC_NEXT) +#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG) #define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG) #ifdef __CUDA_ARCH_LIST__ diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu index a224ec0e1..c6dec4276 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu @@ -1,6 +1,8 @@ #include "convert.cuh" #include "dequantize.cuh" +#include + #define CUDA_Q8_0_NE_ALIGN 2048 template @@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t } template -static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) { - const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; +static __global__ void convert_unary( + const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02, + const int64_t s01, const int64_t s02, const int64_t s03) { + const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; - if (i >= k) { + if (i00 >= ne00) { return; } + const int64_t i01 = blockIdx.y; + const int64_t i02 = blockIdx.z % ne02; + const int64_t i03 = blockIdx.z / ne02; + const src_t * x = (const src_t *) vx; - y[i] = float(x[i]); + const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00; + const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00; + y[iy] = float(x[ix]); } template -static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; - convert_unary<<>>(vx, y, k); +static void convert_unary_cuda(const void * vx, dst_t * y, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) { + const dim3 
num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03); + convert_unary<<>> + (vx, y, ne00, ne01, ne02, s01, s02, s03); +} + +template +static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + convert_unary_cuda(vx, y, k, 1, 1, 1, k, k, k, stream); } to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) { switch (type) { case GGML_TYPE_F32: - return convert_unary_cuda; + return convert_unary_cont_cuda; case GGML_TYPE_F16: - return convert_unary_cuda; + return convert_unary_cont_cuda; default: return nullptr; } @@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { case GGML_TYPE_IQ3_S: return dequantize_row_iq3_s_cuda; case GGML_TYPE_F32: - return convert_unary_cuda; + return convert_unary_cont_cuda; case GGML_TYPE_BF16: - return convert_unary_cuda; + return convert_unary_cont_cuda; default: return nullptr; } @@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { case GGML_TYPE_IQ3_S: return dequantize_row_iq3_s_cuda; case GGML_TYPE_F16: - return convert_unary_cuda; + return convert_unary_cont_cuda; + case GGML_TYPE_BF16: + return convert_unary_cont_cuda; + default: + return nullptr; + } +} + +to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return convert_unary_cuda; case GGML_TYPE_BF16: return convert_unary_cuda; default: diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh index 411a13cf1..b65b98e08 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh @@ -3,7 +3,7 @@ #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 template -using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); +using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream); typedef to_t_cuda_t to_fp32_cuda_t; typedef to_t_cuda_t 
to_fp16_cuda_t; @@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type); to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); + +// TODO more general support for non-contiguous inputs + +template +using to_t_nc_cuda_t = void (*)(const void * x, T * y, + int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, + int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream); + +typedef to_t_nc_cuda_t to_fp16_nc_cuda_t; +to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu index ed25646e8..2d46176ea 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu @@ -639,6 +639,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index; } +#else + GGML_UNUSED(disable_indirection_for_this_node); #endif } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu index 4cef53a98..ea8bf6916 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu @@ -33,8 +33,8 @@ static __global__ void k_get_rows( dfloat2 v; dequantize_kernel(src0_row, ib, iqs, v); - dst_row[iybs + iqs + 0] = v.x; - dst_row[iybs + iqs + y_offset] = v.y; + dst_row[iybs + iqs + 0] = float(v.x); + dst_row[iybs + iqs + y_offset] = float(v.y); } template @@ -60,7 +60,7 @@ static __global__ void k_get_rows_float( dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); - dst_row[i00] = src0_row[i00]; + dst_row[i00] = float(src0_row[i00]); } template @@ -86,120 +86,159 @@ static __global__ void k_get_rows_back_float( dst[dst_row*ncols + col] = sum; } 
-template -static void get_rows_cuda( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - +template +static void get_rows_cuda_q( + const void * src0_d, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); const dim3 block_nums(block_num_x, ne10, ne11*ne12); // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); + // const size_t s0 = nb0 / sizeof(dst_t); + const size_t s1 = nb1 / sizeof(dst_t); + const size_t s2 = nb2 / sizeof(dst_t); + const size_t s3 = nb3 / sizeof(dst_t); - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); + const size_t s10 = nb10 / sizeof(int32_t); + const size_t s11 = nb11 / sizeof(int32_t); + const size_t s12 = nb12 / sizeof(int32_t); + // const size_t s13 = nb13 / sizeof(int32_t); GGML_ASSERT(ne00 % 2 == 0); k_get_rows<<>>( - src0_dd, src1_dd, dst_dd, + src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10, ne11,*/ ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); - - GGML_UNUSED(dst); } -template +template static void get_rows_cuda_float( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const 
src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(ne13 == 1); - + const src0_t * src0_d, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; const dim3 block_nums(block_num_x, ne10, ne11*ne12); // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); + // const size_t s0 = nb0 / sizeof(dst_t); + const size_t s1 = nb1 / sizeof(dst_t); + const size_t s2 = nb2 / sizeof(dst_t); + const size_t s3 = nb3 / sizeof(dst_t); - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); + const size_t s10 = nb10 / sizeof(int32_t); + const size_t s11 = nb11 / sizeof(int32_t); + const size_t s12 = nb12 / sizeof(int32_t); + // const size_t s13 = nb13 / sizeof(int32_t); k_get_rows_float<<>>( - src0_dd, src1_dd, dst_dd, + src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10, ne11,*/ ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); +} - GGML_UNUSED(dst); +template +static void ggml_cuda_get_rows_switch_src0_type( + const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, 
const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { + switch (src0_type) { + case GGML_TYPE_F16: + get_rows_cuda_float((const half *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_F32: + get_rows_cuda_float((const float *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_BF16: + get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q4_0: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q4_1: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q5_0: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q5_1: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q8_0: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + default: + // TODO: k-quants + GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type)); + break; + } +} + +void get_rows_cuda( + const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type, + int64_t ne00, size_t nb01, size_t nb02, size_t nb03, + int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12, + size_t nb1, size_t nb2, size_t nb3, + cudaStream_t stream) { + switch (dst_type) { + case GGML_TYPE_F32: 
+ ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_F16: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_BF16: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (nv_bfloat16 *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + default: + GGML_ABORT("%s: unsupported dst type: %s\n", __func__, ggml_type_name(dst_type)); + break; + } } void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - const void * src0_d = (const void *) src0->data; - const int32_t * src1_d = (const int32_t *) src1->data; - float * dst_d = (float *) dst->data; - cudaStream_t stream = ctx.stream(); + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ne13 == 1); GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - switch (src0->type) { - case GGML_TYPE_F16: - get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_F32: - get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q4_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q4_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q5_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q5_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - 
case GGML_TYPE_Q8_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - default: - // TODO: k-quants - GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - break; - } + get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); } void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh index a1ca643f1..3c5bea5f4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh @@ -3,6 +3,13 @@ #define CUDA_GET_ROWS_BLOCK_SIZE 256 #define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256 +void get_rows_cuda( + const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type, + int64_t ne00, size_t nb01, size_t nb02, size_t nb03, + int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12, + size_t nb1, size_t nb2, size_t nb3, + cudaStream_t stream); + void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 0fef9522d..491acccb4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1413,6 +1413,11 @@ static void ggml_cuda_op_mul_mat( const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; + // const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + const int64_t nb13 = src1->nb[3]; + const int64_t nb2 = dst->nb[2]; const int64_t nb3 = dst->nb[3]; @@ -1548,7 +1553,10 @@ static void ggml_cuda_op_mul_mat( dev[id].src1_ddq = 
dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size); if (src1_on_device && src1_is_contiguous) { - quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream); + quantize_src1( + dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10, + nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float), + src1_padded_col_size, ne11, ne12, ne13, stream); CUDA_CHECK(cudaGetLastError()); } } @@ -1643,7 +1651,9 @@ static void ggml_cuda_op_mul_mat( } if (quantize_src1 && !src1_is_contiguous) { - quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream); + quantize_src1( + src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10, + src1_padded_col_size, src1_ncols, 1, 1, stream); CUDA_CHECK(cudaGetLastError()); } @@ -1713,15 +1723,15 @@ static __global__ void k_compute_batched_ptrs( size_t nb12, size_t nb13, size_t nbd2, size_t nbd3, int64_t r2, int64_t r3) { - int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; - int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; + const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; + const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; if (i13 >= ne13 || i12 >= ne12) { return; } - int64_t i03 = i13 / r3; - int64_t i02 = i12 / r2; + const int64_t i03 = i13 / r3; + const int64_t i02 = i12 / r2; ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13; @@ -1735,6 +1745,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); GGML_ASSERT(src0->type == GGML_TYPE_F16); + // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst. + // As long as dst is contiguous this does not matter though. 
+ GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_TENSOR_BINARY_OP_LOCALS const int64_t ne_dst = ggml_nelements(dst); @@ -1743,21 +1757,31 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream)); - void * src0_ddq = src0->data; - half * src0_f16 = (half *) src0_ddq; - float * src1_ddf = (float *) src1->data; - float * dst_ddf = (float *) dst->data; + const half * src0_f16 = (const half *) src0->data; + float * dst_ddf = (float *) dst->data; + + const half * src1_f16 = (const half *) src1->data; + const size_t ts_src1 = ggml_type_size(src1->type); + GGML_ASSERT(nb10 == ts_src1); + int64_t s11 = nb11 / ts_src1; + int64_t s12 = nb12 / ts_src1; + int64_t s13 = nb13 / ts_src1; + ggml_cuda_pool_alloc src1_f16_alloc(ctx.pool()); // convert src1 to fp16 - ggml_cuda_pool_alloc src1_f16_alloc(ctx.pool()); if (src1->type != GGML_TYPE_F16) { - const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type); const int64_t ne_src1 = ggml_nelements(src1); src1_f16_alloc.alloc(ne_src1); GGML_ASSERT(to_fp16_cuda != nullptr); - to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream); + + to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream); + + src1_f16 = src1_f16_alloc.get(); + s11 = ne10; + s12 = ne11*s11; + s13 = ne12*s12; } - half * src1_f16 = src1->type == GGML_TYPE_F16 ? 
(half *) src1_ddf : src1_f16_alloc.get(); ggml_cuda_pool_alloc dst_f16(ctx.pool()); char * dst_t; @@ -1817,13 +1841,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co int i02 = i12 / r2; CUBLAS_CHECK( - cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), - (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), - beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01, - cu_compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half), + src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11, + beta, ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0, + cu_compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } } @@ -1834,15 +1858,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co CUBLAS_CHECK( cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA - (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB - beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC + alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA + src1_f16, CUDA_R_16F, s11, s12, // strideB + beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC ne12*ne13, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { // use cublasGemmBatchedEx - const int ne23 = ne12*ne13; + const int64_t ne23 = ne12*ne13; ggml_cuda_pool_alloc ptrs_src(ctx.pool(), 2*ne23); ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23); @@ -1854,8 +1878,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co ne12, ne13, ne23, nb02, nb03, - 
src1->type == GGML_TYPE_F16 ? nb12 : nb12/2, - src1->type == GGML_TYPE_F16 ? nb13 : nb13/2, + src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half), + src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half), nbd2, nbd3, r2, r3); CUDA_CHECK(cudaGetLastError()); @@ -1864,8 +1888,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00, - (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10, - beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01, + (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11, + beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0, ne23, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -1881,7 +1905,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); - bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) + bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % 2 == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) @@ -1922,12 +1946,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && use_mul_mat_vec && (src0->ne[1] 
< MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { + if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { // the custom F16 vector kernel can be used over batched cuBLAS GEMM // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) - ggml_cuda_mul_mat_vec(ctx, src0, src1, dst); - } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) - && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst); + } else if (!split && use_mul_mat_vec_q) { + ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst); + } else if (!split && use_mul_mat_q) { + ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst); + } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && + !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_mul_mat_vec) { @@ -1941,196 +1969,145 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } } -struct mmid_row_mapping { - int32_t i1; - int32_t i2; -}; - -static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous, - int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping, - const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0, - int64_t ne11, int64_t ne10, - size_t nb11, size_t nb12) { - int32_t iid1 = blockIdx.x; - int32_t id = blockIdx.y; - - const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0); - - if (row_id_i != i02) { - return; - } - - const int64_t i11 = id % ne11; - const int64_t i12 = iid1; - - __shared__ int src1_row; - if (threadIdx.x == 0) { - 
src1_row = atomicAdd(cur_src1_row, 1); - row_mapping[src1_row] = {id, iid1}; - } - __syncthreads(); - - const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12); - float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11); - - for (int i = threadIdx.x; i < ne10; i += blockDim.x) { - src1_row_contiguous[i] = src1_row_original[i]; - } -} - -static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous, - const mmid_row_mapping * __restrict__ row_mapping, - int64_t ne0, - size_t nb1, size_t nb2) { - int32_t i = blockIdx.x; - - const int32_t i1 = row_mapping[i].i1; - const int32_t i2 = row_mapping[i].i2; - - const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1); - float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2); - - for (int j = threadIdx.x; j < ne0; j += blockDim.x) { - dst_row_original[j] = dst_row_contiguous[j]; - } -} - static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * ids = dst->src[2]; + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers"); + GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers"); + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + if (ne2 == 1) { + if (ggml_is_quantized(src0->type)) { + ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst); + } else { + ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst); + } + return; + } + + if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) { + ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst); + return; + } + } 
cudaStream_t stream = ctx.stream(); - const int64_t n_as = ne02; - const int64_t n_ids = ids->ne[0]; + GGML_ASSERT(nb12 % nb11 == 0); + GGML_ASSERT(nb2 % nb1 == 0); + + const ggml_type type_src1_sorted = (src0->type == GGML_TYPE_F16 && !fast_fp16_hardware_available(cc)) + || ggml_is_quantized(src0->type) ? GGML_TYPE_F32 : src0->type; + const ggml_type type_dst_sorted = GGML_TYPE_F32; + const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted); + const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted); + + const int64_t n_expert_used = ids->ne[0]; + const int64_t ne_get_rows = ne12 * n_expert_used; + + std::vector ids_to_sorted_host; + ids_to_sorted_host.reserve(2*ne_get_rows); + std::vector ids_from_sorted_host(ne_get_rows); + + ggml_cuda_pool_alloc ids_buf_dev(ctx.pool(), 2*ne_get_rows); + + std::vector tokens_per_expert(ne02); + + ggml_cuda_pool_alloc src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted); + ggml_cuda_pool_alloc dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted); std::vector ids_host(ggml_nbytes(ids)); - const char * ids_dev = (const char *) ids->data; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); - ggml_tensor src0_row = *src0; - ggml_tensor src1_row = *src1; - ggml_tensor dst_row = *dst; - - char * src0_original = (char *) src0->data; - char * src1_original = (char *) src1->data; - char * dst_original = (char *) dst->data; - - src0_row.ne[2] = 1; - src0_row.ne[3] = 1; - src0_row.nb[3] = nb02; - - src1_row.ne[1] = 1; - src1_row.ne[2] = 1; - src1_row.ne[3] = 1; - src1_row.nb[2] = nb11; - src1_row.nb[3] = nb11; - - dst_row.ne[1] = 1; - dst_row.ne[2] = 1; - dst_row.ne[3] = 1; - dst_row.nb[2] = nb1; - dst_row.nb[3] = nb1; - - if (ne12 == 1) { - for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { - for (int64_t 
id = 0; id < n_ids; id++) { - const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); - - GGML_ASSERT(i02 >= 0 && i02 < n_as); - - const int64_t i11 = id % ne11; - const int64_t i12 = iid1; - - const int64_t i1 = id; - const int64_t i2 = i12; - - src0_row.data = src0_original + i02*nb02; - src1_row.data = src1_original + i11*nb11 + i12*nb12; - dst_row.data = dst_original + i1*nb1 + i2*nb2; - - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); - } - } - } else { - ggml_cuda_pool_alloc src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); - ggml_cuda_pool_alloc dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst)); - - src1_row.data = src1_contiguous.get(); - dst_row.data = dst_contiguous.get(); - - for (int64_t i02 = 0; i02 < n_as; i02++) { - int64_t num_src1_rows = 0; - - for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { - for (int64_t id = 0; id < n_ids; id++) { - const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); - - GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as); - - if (row_id_i != i02) { - continue; - } - - num_src1_rows++; + for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices + for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens + for (int64_t iex = 0; iex < n_expert_used; ++iex) { + const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]); + assert(expert_to_use >= 0 && expert_to_use < ne02); + if (expert_to_use == i02) { + ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size(); + ids_to_sorted_host.push_back(i12*ne11 + iex % ne11); + tokens_per_expert[i02]++; + break; } } - - if (num_src1_rows == 0) { - continue; - } - - ggml_cuda_pool_alloc dev_cur_src1_row(ctx.pool(), 1); - ggml_cuda_pool_alloc dev_row_mapping(ctx.pool(), num_src1_rows); - CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream)); - - { - dim3 block_dims(std::min((unsigned int)ne10, 768u)); - 
dim3 grid_dims(ids->ne[1], n_ids); - k_copy_src1_to_contiguous<<>>( - src1_original, src1_contiguous.get(), - dev_cur_src1_row.get(), dev_row_mapping.get(), - ids_dev, i02, ids->nb[1], ids->nb[0], - ne11, ne10, - nb11, nb12); - CUDA_CHECK(cudaGetLastError()); - } - - src0_row.data = src0_original + i02*nb02; - - GGML_ASSERT(nb11 == sizeof(float)*ne10); - GGML_ASSERT(nb1 == sizeof(float)*ne0); - - src1_row.ne[1] = num_src1_rows; - src1_row.nb[1] = nb11; - src1_row.nb[2] = num_src1_rows*nb11; - src1_row.nb[3] = num_src1_rows*nb11; - - dst_row.ne[1] = num_src1_rows; - dst_row.nb[1] = nb1; - dst_row.nb[2] = num_src1_rows*nb1; - dst_row.nb[3] = num_src1_rows*nb1; - - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); - - { - dim3 block_dims(std::min((unsigned int)ne0, 768u)); - dim3 grid_dims(num_src1_rows); - k_copy_dst_from_contiguous<<>>( - dst_original, dst_contiguous.get(), - dev_row_mapping.get(), - ne0, - nb1, nb2); - CUDA_CHECK(cudaGetLastError()); - } } } + GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows)); + + ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end()); + + CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows; + const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows; + + get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted, + ne10, nb11, nb12, nb13, + ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t), + ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream); + CUDA_CHECK(cudaGetLastError()); + + char * src1_data_cur = (char *) src1_sorted.ptr; + char * dst_data_cur = (char *) dst_sorted.ptr; + for (int64_t i02 = 0; i02 < ne02; ++i02) { + if 
(tokens_per_expert[i02] == 0) { + continue; + } + + ggml_tensor src0_slice = *src0; + src0_slice.ne[2] = 1; + src0_slice.nb[3] = src0_slice.nb[2]; + src0_slice.data = (char *) src0->data + i02*nb02; + + ggml_tensor src1_slice; + memset(&src1_slice, 0, sizeof(src1_slice)); + src1_slice.buffer = src1->buffer; + src1_slice.type = type_src1_sorted; + src1_slice.ne[0] = ne10; + src1_slice.ne[1] = tokens_per_expert[i02]; + src1_slice.ne[2] = 1; + src1_slice.ne[3] = 1; + src1_slice.nb[0] = ts_src1_sorted; + src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0]; + src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1]; + src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2]; + src1_slice.data = src1_data_cur; + + ggml_tensor dst_slice; + memset(&dst_slice, 0, sizeof(dst_slice)); + dst_slice.buffer = dst->buffer; + dst_slice.type = type_dst_sorted; + dst_slice.ne[0] = ne0; + dst_slice.ne[1] = tokens_per_expert[i02]; + dst_slice.ne[2] = 1; + dst_slice.ne[3] = 1; + dst_slice.nb[0] = ts_dst_sorted; + dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0]; + dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1]; + dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2]; + dst_slice.data = dst_data_cur; + + ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice); + CUDA_CHECK(cudaGetLastError()); + + src1_data_cur += src1_slice.nb[2]; + dst_data_cur += dst_slice.nb[2]; + } + + get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type, + ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, + ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t), + nb1, nb2, nb3, stream); } static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { @@ -2495,7 +2472,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_MUL_MAT_ID) { + if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) { 
use_cuda_graph = false; // This node type is not supported by CUDA graph capture #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__); @@ -3209,9 +3186,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - const size_t ts = ggml_type_size(op->src[0]->type); - const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2]; - return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts; + return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]); } case GGML_OP_IM2COL: case GGML_OP_POOL_2D: diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu index b36b43d54..f397a7e03 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu @@ -1,37 +1,10 @@ #include "mmq.cuh" +#include "quantize.cuh" -void ggml_cuda_op_mul_mat_q( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream) { +#include - const int64_t ne00 = src0->ne[0]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - GGML_ASSERT(ne10 % QK8_1 == 0); - - const int64_t ne0 = dst->ne[0]; - - const int64_t row_diff = row_high - row_low; - const int64_t stride00 = ne00 / ggml_blck_size(src0->type); - - int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - - // the main device has a larger memory buffer to hold the results from all GPUs - // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = id == ctx.device ? 
ne0 : row_diff; - - // The stream-k decomposition is only faster for recent NVIDIA GPUs. - // Also its fixup needs to allocate a temporary buffer in the memory pool. - // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. - const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && - ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; - const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; - - switch (src0->type) { +static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { + switch (args.type_x) { case GGML_TYPE_Q4_0: mul_mat_q_case(ctx, args, stream); break; @@ -90,10 +63,195 @@ void ggml_cuda_op_mul_mat_q( GGML_ABORT("fatal error"); break; } +} + +void ggml_cuda_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID. 
+ + GGML_TENSOR_BINARY_OP_LOCALS; + + cudaStream_t stream = ctx.stream(); + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + + const size_t ts_src0 = ggml_type_size(src0->type); + const size_t ts_src1 = ggml_type_size(src1->type); + const size_t ts_dst = ggml_type_size(dst->type); + + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT( nb0 == ts_dst); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); + + const char * src0_d = (const char *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + + const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING); + + const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s1 = dst->nb[1] / ts_dst; + const int64_t s02 = src0->nb[2] / ts_src0; + const int64_t s2 = dst->nb[2] / ts_dst; + const int64_t s03 = src0->nb[3] / ts_src0; + const int64_t s3 = dst->nb[3] / ts_dst; + + const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; + + if (!ids) { + const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 + + get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), nbytes_src1_q8_1); + + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[3] / ts_src1; + quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, + ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); + } + + const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int)); + const int64_t s13 = ne12*s12; + + const mmq_args args = { + src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, + ne00, ne01, ne1, s01, s1, + ne02, ne12, s02, s12, s2, + ne03, ne13, s03, s13, s3, + use_stream_k}; + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); + return; + } + + GGML_ASSERT(ne13 == 1); + GGML_ASSERT(nb12 % 
nb11 == 0); + GGML_ASSERT(nb2 % nb1 == 0); + + const int64_t n_expert_used = ids->ne[0]; + const int64_t ne_get_rows = ne12 * n_expert_used; + + std::vector ids_host(ggml_nbytes(ids)); + std::vector ids_src1_host; + ids_src1_host.reserve(ne_get_rows); + std::vector ids_dst_host; + ids_dst_host.reserve(ne_get_rows); + std::vector tokens_per_expert_host(ne02); + std::vector expert_bounds_host(ne02 + 1); + ggml_cuda_pool_alloc ids_buf_dev(ctx.pool()); + + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices + for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens + for (int64_t iex = 0; iex < n_expert_used; ++iex) { + const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]); + assert(expert_to_use >= 0 && expert_to_use < ne02); + if (expert_to_use == i02) { + ids_src1_host.push_back(i12*(nb12/nb11) + iex % ne11); + ids_dst_host.push_back(i12*ne1 + iex); + tokens_per_expert_host[i02]++; + break; + } + } + } + } + + int32_t cumsum = 0; + for (int64_t i = 0; i < ne02; ++i) { + expert_bounds_host[i] = cumsum; + cumsum += tokens_per_expert_host[i]; + } + expert_bounds_host[ne02] = cumsum; + + std::vector ids_buf_host; + ids_buf_host.reserve(ids_src1_host.size() + ids_dst_host.size() + expert_bounds_host.size()); + ids_buf_host.insert(ids_buf_host.end(), ids_src1_host.begin(), ids_src1_host.end()); + ids_buf_host.insert(ids_buf_host.end(), ids_dst_host.begin(), ids_dst_host.end()); + ids_buf_host.insert(ids_buf_host.end(), expert_bounds_host.begin(), expert_bounds_host.end()); + ids_buf_dev.alloc(ids_buf_host.size() + get_mmq_x_max_host(cc)); // Expert bounds are padded on device. 
+ CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_buf_host.data(), ids_buf_host.size()*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + const int32_t * ids_src1_dev = ids_buf_dev.ptr; + const int32_t * ids_dst_dev = ids_src1_dev + ids_src1_host.size(); + const int32_t * expert_bounds_dev = ids_dst_dev + ids_dst_host.size(); + + const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 + + get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), nbytes_src1_q8_1); + + const int64_t ne11_flat = ne12*n_expert_used; + const int64_t ne12_flat = 1; + const int64_t ne13_flat = 1; + + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[2] / ts_src1; + quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type, + ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream); + } + + const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int)); + const int64_t s13 = ne12*s12; + + // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid. 
+ const mmq_args args = { + src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d, + ne00, ne01, ne_get_rows, s01, s1, + ne02, ne02, s02, s12, s2, + ne03, ne13, s03, s13, s3, + use_stream_k}; + + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); +} + +void ggml_cuda_op_mul_mat_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + const int64_t stride01 = ne00 / ggml_blck_size(src0->type); + + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; + + // The stream-k decomposition is only faster for recent NVIDIA GPUs. + // Also its fixup needs to allocate a temporary buffer in the memory pool. + // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. 
+ const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && + ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; + const mmq_args args = { + src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i, + ne00, row_diff, src1_ncols, stride01, nrows_dst, + 1, 1, 0, 0, 0, + 1, 1, 0, 0, 0, + use_stream_k}; + + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); GGML_UNUSED(src1); GGML_UNUSED(dst); GGML_UNUSED(src1_ddf_i); + GGML_UNUSED(src1_padded_row_size); } bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh index 532358018..8c93e8326 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh @@ -13,9 +13,10 @@ using namespace ggml_cuda_mma; #define MMQ_ITER_K 256 #define MMQ_NWARPS 8 -typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int & kbx0, const int & i_max, const int & stride); -typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00); -typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max); +typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride); +typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00); +typedef void (*mmq_write_back_t)(const float * __restrict__ sum, const int32_t * __restrict__ get_rows_to_sorted, + float * __restrict__ dst, const int stride, const int i_max, const int j_max); enum mmq_q8_1_ds_layout { MMQ_Q8_1_DS_LAYOUT_D4, @@ -155,25 +156,27 @@ static constexpr __device__ int get_mmq_y_device() { #define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + 
mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + mmq_y/8} static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) { - return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 : - type == GGML_TYPE_Q4_1 ? MMQ_DP4A_TXS_Q4_1 : - type == GGML_TYPE_Q5_0 ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_Q5_1 ? MMQ_DP4A_TXS_Q8_1 : - type == GGML_TYPE_Q8_0 ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_Q2_K ? MMQ_DP4A_TXS_Q2_K : - type == GGML_TYPE_Q3_K ? MMQ_DP4A_TXS_Q3_K : - type == GGML_TYPE_Q4_K ? MMQ_DP4A_TXS_Q4_K : - type == GGML_TYPE_Q5_K ? MMQ_DP4A_TXS_Q5_K : - type == GGML_TYPE_Q6_K ? MMQ_DP4A_TXS_Q6_K : - type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ2_XS ? MMQ_DP4A_TXS_Q8_0_16 : - type == GGML_TYPE_IQ2_S ? MMQ_DP4A_TXS_Q8_0_16 : - type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ3_S ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ1_S ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ4_XS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ4_NL ? MMQ_DP4A_TXS_Q8_0 : - tile_x_sizes{0, 0, 0}; + switch (type) { + case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0; + case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1; + case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1; + case GGML_TYPE_Q8_0: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K; + case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K; + case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K; + case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K; + case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K; + case GGML_TYPE_IQ2_XXS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ2_XS: return MMQ_DP4A_TXS_Q8_0_16; + case GGML_TYPE_IQ2_S: return MMQ_DP4A_TXS_Q8_0_16; + case GGML_TYPE_IQ3_XXS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ3_S: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0; + default: return 
tile_x_sizes{0, 0, 0}; + } } #define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4) @@ -189,25 +192,27 @@ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding."); static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding."); static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q4_1 ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q5_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q5_1 ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q8_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q2_K ? MMQ_MMA_TILE_X_K_Q2_K : - type == GGML_TYPE_Q3_K ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_Q4_K ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q5_K ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q6_K ? MMQ_MMA_TILE_X_K_Q6_K : - type == GGML_TYPE_IQ2_XXS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ2_XS ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_IQ2_S ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ3_S ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ1_S ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ4_XS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ4_NL ? 
MMQ_MMA_TILE_X_K_Q8_0 : - 0; + switch (type) { + case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K; + case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q5_K: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q6_K: return MMQ_MMA_TILE_X_K_Q6_K; + case GGML_TYPE_IQ2_XXS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ2_XS: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_IQ2_S: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_IQ3_XXS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ3_S: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0; + default: return 0; + } } #define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1) @@ -229,7 +234,7 @@ static constexpr __device__ int mmq_get_granularity_device(const int /* mmq_x */ // ------------------------------------------------------------ template static __device__ __forceinline__ void load_tiles_q4_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -285,7 +290,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = 
mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y); const int * x_qs = (const int *) x; @@ -324,7 +329,7 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q4_1( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -380,7 +385,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y); const int * x_qs = (const int *) x; @@ -419,7 +424,7 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q5_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -491,7 +496,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_q5_1( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -561,7 +566,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_q8_0( - const char * __restrict__ x, int * __restrict__ x_tile, 
const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -617,7 +622,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y); const int * x_qs = (const int *) x; @@ -647,7 +652,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { typedef tile<16, 8, int> tile_A; typedef tile< 8, 8, int> tile_B; @@ -728,7 +733,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y); const int * x_qs = (const int *) x; @@ -758,7 +763,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { typedef tile<16, 8, int> tile_A; typedef tile< 8, 8, int> tile_B; @@ 
-835,7 +840,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16; const int * x_qs = (const int *) x; @@ -867,7 +872,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -951,7 +956,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_q2_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1007,7 +1012,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y); const int * x_qs = (const int *) x; @@ -1070,7 +1075,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const 
int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -1197,7 +1202,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_q3_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1294,7 +1299,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y); const int * x_qs = (const int *) x; @@ -1336,7 +1341,7 @@ static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, co } template static __device__ __forceinline__ void load_tiles_q4_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1433,7 +1438,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y); const int * x_qs = (const int *) x; @@ -1465,7 +1470,7 @@ static 
__device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q5_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1574,7 +1579,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y); const int * x_qs = (const int *) x; @@ -1606,7 +1611,7 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q6_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1689,7 +1694,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y); const int * x_qs = (const int *) x; @@ -1722,7 +1727,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & 
k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -1831,7 +1836,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_iq4_nl( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1889,7 +1894,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_xxs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1947,7 +1952,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_xs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2003,7 +2008,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2066,7 +2071,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq3_xxs( - const char * __restrict__ 
x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2122,7 +2127,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq3_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2185,7 +2190,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq1_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2241,7 +2246,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq4_xs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2302,8 +2307,8 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void mmq_write_back_dp4a( - const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) { - + const float * __restrict__ sum, const int32_t * __restrict__ ids_dst, float * __restrict__ dst, + const int stride, const int i_max, const int j_max) { #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ 
-2320,15 +2325,15 @@ static __device__ __forceinline__ void mmq_write_back_dp4a( continue; } - dst[j*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[ids_dst[j]*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; } } } template static __device__ __forceinline__ void mmq_write_back_mma( - const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) { - + const float * __restrict__ sum, const int * __restrict__ ids_dst, float * __restrict__ dst, + const int stride, const int i_max, const int j_max) { typedef tile<16, 8, int> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); @@ -2358,7 +2363,7 @@ static __device__ __forceinline__ void mmq_write_back_mma( continue; } - dst[j*stride + i] = sum[(j0/tile_C::J + n)*tile_C::ne + l]; + dst[ids_dst[j]*stride + i] = sum[(j0/tile_C::J + n)*tile_C::ne + l]; } } } @@ -2514,17 +2519,18 @@ struct mmq_type_traits { }; template -static __device__ void mul_mat_q_process_tile( - const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, - const int & ne00, const int & ne01, const int & stride01, const int & ne10, const int & ne11, const int & stride11, const int & ne0, - const int & it, const int & jt, const int & kb0_start, const int & kb0_stop) { +static __device__ __forceinline__ void mul_mat_q_process_tile( + const char * __restrict__ x, const int offset_x, const int * __restrict__ y, + const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup, + const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst, + const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) { constexpr int qk = ggml_cuda_type_traits::qk; constexpr int mmq_y = get_mmq_y_device(); constexpr load_tiles_mmq_t load_tiles = mmq_type_traits::load_tiles; - extern __shared__ char data_mul_mat_q[]; - int * 
tile_y = (int *) data_mul_mat_q; + extern __shared__ int data_mul_mat_q[]; + int * tile_y = data_mul_mat_q + mmq_x; int * tile_x = tile_y + GGML_PAD(mmq_x*(WARP_SIZE + WARP_SIZE/QI8_1), nwarps*WARP_SIZE); #ifdef NEW_MMA_AVAILABLE @@ -2539,16 +2545,11 @@ static __device__ void mul_mat_q_process_tile( float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; - const int tile_x_max_i = ne01 - it*mmq_y - 1; - const int tile_y_max_j = ne11 - jt*mmq_x - 1; - - const int * y = (const int *) yc + jt*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int)); - for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { - load_tiles(x, tile_x, stride01*it*mmq_y + kb0, tile_x_max_i, stride01); + load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); { - const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int)); + const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2564,7 +2565,7 @@ static __device__ void mul_mat_q_process_tile( __syncthreads(); { - const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int)); + const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2581,12 +2582,10 @@ static __device__ void mul_mat_q_process_tile( } if (fixup) { - write_back(sum, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x); + write_back(sum, ids_dst, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x); } else { - write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j); + write_back(sum, ids_dst, 
dst, stride_col_dst, tile_x_max_i, tile_y_max_j); } - - GGML_UNUSED(ne00); GGML_UNUSED(ne10); } @@ -2605,8 +2604,11 @@ template #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) static __global__ void mul_mat_q( - const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, - const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { + const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst, + const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup, + const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst, + const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { // Skip unused template specializations for faster compilation: if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) { @@ -2617,26 +2619,85 @@ static __global__ void mul_mat_q( constexpr int qk = ggml_cuda_type_traits::qk; constexpr int mmq_y = get_mmq_y_device(); + const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x + const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y + + // Initialize the ids for writing back data with just the index. + // For regular matrix multiplications this is never changed. + // For MoE the correct indices are loaded from ids_dst. + extern __shared__ int ids_dst_shared[]; // Stored at beginning of shared memory. 
+#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = j; + } + // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA { + const int wt = blockIdx.z / nchannels_y; + const int zt = blockIdx.z - wt*nchannels_y; + const int jt = blockIdx.y; + const int it = blockIdx.x; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_y; + int col_diff = ncols_y; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + return; + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; + } + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; + constexpr bool fixup = false; mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - blockIdx.x, blockIdx.y, 0, ne00/qk); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + tile_x_max_i, tile_y_max_j, 0, ncols_x/qk); return; } 
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA - const int64_t blocks_per_ne00 = ne00 / qk; + const int64_t blocks_per_ne00 = ncols_x / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; - const int ntx = (ne11 + mmq_x - 1) / mmq_x; // Number of tiles x - const int nty = (ne01 + mmq_y - 1) / mmq_y; // Number of tiles y - // kbc == k block continuous, current index in continuous ijk space. - int64_t kbc = (int64_t) blockIdx.x *blocks_per_ne00*ntx*nty / gridDim.x; - int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x; + int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; kbc -= (kbc % blocks_per_ne00) % blocks_per_iter; kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter; @@ -2645,13 +2706,64 @@ static __global__ void mul_mat_q( int kb0_start = kbc % blocks_per_ne00; int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc); while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) { - const int jt = kbc / (blocks_per_ne00*nty); // j index of current tile. - const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; // i index of current tile. 
+ int tmp = kbc; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_y; + int col_diff = ncols_y; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + kbc += blocks_per_ne00; + kbc -= kbc % blocks_per_ne00; + + kb0_start = 0; + kb0_stop = min(blocks_per_ne00, kbc_stop - kbc); + + continue; + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; + } + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer. 
mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - it, jt, kb0_start, kb0_stop); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); kbc += blocks_per_ne00; kbc -= kbc % blocks_per_ne00; @@ -2664,55 +2776,106 @@ static __global__ void mul_mat_q( return; } - const int jt = kbc / (blocks_per_ne00*nty); - const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; + int tmp = kbc; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_y; + int col_diff = ncols_y; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + return; + } + + // The memory layout for the fixup buffer is always contiguous, therefore reset ids: +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = j; + } + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + 
(zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks. mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - it, jt, kb0_start, kb0_stop); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); } template static __global__ void mul_mat_q_stream_k_fixup( - float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ne00, const int ne01, const int ne11, const int ne0, const int block_num_mmq) { - + const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile, + const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst, + const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) { constexpr int mmq_y = get_mmq_y_device(); constexpr int qk = ggml_cuda_type_traits::qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; - const int64_t blocks_per_ne00 = ne00 / qk; + const int64_t blocks_per_ne00 = ncols_x / qk; float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; - const int ntx = (ne11 + mmq_x - 1) / mmq_x; - const int nty = (ne01 + mmq_y - 1) / mmq_y; + const int ntx = (ncols_y + mmq_x - 1) / mmq_x; + const int nty = (nrows_x + mmq_y - 1) / mmq_y; + + const int bidx0 = blockIdx.x; + + // kbc == k block continuous, current index in continuous ijk space. 
+ int64_t kbc0 = (int64_t) bidx0 *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + + kbc0 -= (kbc0 % blocks_per_ne00) % blocks_per_iter; + kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter; + + const bool did_not_have_any_data = kbc0 == kbc0_stop; + const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0; + const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0; + if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) { + return; + } bool any_fixup = false; - const int bidx_start = ((blockIdx.y*nty + blockIdx.x) * block_num_mmq) / (gridDim.y*gridDim.x); - const int bidx_stop = ((blockIdx.y*nty + blockIdx.x + 1) * block_num_mmq + gridDim.y*gridDim.x - 1) / (gridDim.y*gridDim.x); + // Iterate over previous blocks and sum up partial sums written to fixup buffer. + // All CUDA blocks that get here must have a previous block that needs a fixup. 
+ int64_t bidx = bidx0 - 1; + int64_t kbc_stop = kbc0; + while(true) { + int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + kbc -= (kbc % blocks_per_ne00) % blocks_per_iter; - int64_t kbc_0; - int64_t kbc_stop_0 = (int64_t) bidx_start*blocks_per_ne00*ntx*nty / block_num_mmq; - - for (int bidx = bidx_start; bidx < bidx_stop; ++bidx) { - kbc_0 = kbc_stop_0; - kbc_stop_0 = (int64_t) (bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq; - - const int64_t kbc = kbc_0 - (kbc_0 % blocks_per_ne00) % blocks_per_iter; - const int64_t kbc_stop = kbc_stop_0 - (kbc_stop_0 % blocks_per_ne00) % blocks_per_iter; - - // Skip fixup tile if the MMQ CUDA block never wrote anything to it: - if (kbc == kbc_stop || kbc_stop % blocks_per_ne00 == 0) { - continue; - } - - const int jt = kbc_stop / (blocks_per_ne00*nty); - const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; - - // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block: - if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) { + if (kbc == kbc_stop) { // Did not have any data. + bidx--; + kbc_stop = kbc; continue; } @@ -2729,16 +2892,71 @@ static __global__ void mul_mat_q_stream_k_fixup( sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i]; } } + + // If this block started in a previous tile we are done and don't need to combine additional partial results. 
+ if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) { + break; + } + bidx--; + kbc_stop = kbc; } if (!any_fixup) { return; } - dst += blockIdx.y*mmq_x*ne0 + blockIdx.x*mmq_y; + int tmp = kbc0; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; - const int i_max = ne01 - blockIdx.x*mmq_y - 1; - const int j_max = ne11 - blockIdx.y*mmq_x - 1; + if (!ids_dst) { + const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y; + dst += offset_dst; + + const int i_max = nrows_x - it*mmq_y - 1; + const int j_max = ncols_y - jt*mmq_x - 1; + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (j > j_max) { + return; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + if (need_check && i > i_max) { + continue; + } + + dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + } + } + return; + } + + __shared__ int ids_dst_shared[mmq_x]; + const int col_low = expert_bounds[zt + 0]; + const int col_high = expert_bounds[zt + 1]; + const int col_diff = col_high - col_low; + + for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) { + ids_dst_shared[j] = ids_dst[col_low + j]; + } + + const int offset_dst = it*mmq_y; + dst += offset_dst; + + const int i_max = nrows_x - it*mmq_y - 1; + const int j_max = col_diff - jt*mmq_x - 1; #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { @@ -2756,26 +2974,27 @@ static __global__ void mul_mat_q_stream_k_fixup( continue; } - dst[j*ne0 + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + 
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; } } } struct mmq_args { - const char * x; const char * y; float * dst; - int64_t ne00; int64_t ne01; int64_t stride01; - int64_t ne10; int64_t ne11; int64_t stride11; - int64_t ne0; + const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst; + int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst; + int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst; + int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst; bool use_stream_k; }; template -static int mmq_get_shmem(const int mmq_x, const int mmq_y, const int cc) { +static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc) { const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y); const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type); - const int shmem_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); - const int shmem_y = mmq_x*sizeof(block_q8_1_mmq); - return shmem_x + GGML_PAD(shmem_y, MMQ_NWARPS*WARP_SIZE*sizeof(int)); + const size_t nbs_ids = mmq_x*sizeof(int); + const size_t nbs_x = new_mma_available(cc) ? 
mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); + const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq); + return nbs_ids + nbs_x + GGML_PAD(nbs_y, MMQ_NWARPS*WARP_SIZE*sizeof(int)); } template @@ -2787,86 +3006,114 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1); - const int shmem = mmq_get_shmem(mmq_x, mmq_y, cc); + const int nbytes_shared = mmq_get_nbytes_shared(mmq_x, mmq_y, cc); #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; - if (!shmem_limit_raised[id]) { - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); - shmem_limit_raised[id] = true; + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shared_memory_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared)); + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared)); + shared_memory_limit_raised[id] = true; } #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - const int nty = (args.ne01 + mmq_y - 1) / mmq_y; - const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; - const dim3 block_nums_xy_tiling(nty, ntx, 1); + const int nty = (args.nrows_x + mmq_y - 1) / mmq_y; + const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x; + const int ntzw = args.nchannels_y * args.nsamples_y; + const dim3 block_nums_xy_tiling(nty, ntx, ntzw); + + GGML_ASSERT(args.nchannels_y % args.nchannels_x == 0); + GGML_ASSERT(args.nsamples_y % args.nsamples_x == 0); + const int channel_ratio = args.nchannels_y / args.nchannels_x; + const int sample_ratio = args.nsamples_y / 
args.nsamples_x; if (!args.use_stream_k) { - if (args.ne01 % mmq_y == 0) { + if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; - mul_mat_q<<>> - (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } else { constexpr bool need_check = true; - mul_mat_q<<>> - (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } return; } - const dim3 block_nums_mmq(nsm, 1, 1); + const dim3 block_nums_stream_k(nsm, 1, 1); + const bool fixup_needed = ntx*nty*ntzw % nsm != 0; ggml_cuda_pool & pool = ctx.pool(id); - ggml_cuda_pool_alloc tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y); + ggml_cuda_pool_alloc tmp_fixup(pool); + if (fixup_needed) { + tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y); + } - if (args.ne01 % mmq_y == 0) { + if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; - mul_mat_q<<>> - (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, + args.ncols_x, args.nrows_x, args.ncols_y, 
args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); - mul_mat_q_stream_k_fixup<<>> - (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x); + if (!fixup_needed) { + return; + } + + mul_mat_q_stream_k_fixup<<>> + (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y, + args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } else { constexpr bool need_check = true; - mul_mat_q<<>> - (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); - mul_mat_q_stream_k_fixup<<>> - (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x); + if (!fixup_needed) { + return; + } + + mul_mat_q_stream_k_fixup<<>> + (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y, + args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } } template void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { - const int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - const int smpbo = ggml_cuda_info().devices[id].smpbo; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + const size_t smpbo = 
ggml_cuda_info().devices[id].smpbo; const int mmq_x_max = get_mmq_x_max_host(cc); const int mmq_y = get_mmq_y_host(cc); - const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; - const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; int mmq_x_best = 0; - int nparts_best = INT_MAX; + int ntiles_x_best = INT_MAX; - for (int mmq_x = 8; mmq_x <= mmq_x_max && nparts_best > 1; mmq_x += 8) { + for (int mmq_x = 8; mmq_x <= mmq_x_max && ntiles_x_best > 1; mmq_x += 8) { const int granularity = mmq_get_granularity_host(mmq_x, cc); - if (mmq_x % granularity != 0 || mmq_get_shmem(mmq_x, mmq_y, cc) > smpbo) { + if (mmq_x % granularity != 0 || mmq_get_nbytes_shared(mmq_x, mmq_y, cc) > smpbo) { continue; } - const int ntiles_x = (args.ne11 + mmq_x - 1) / mmq_x; - const int nwaves_xy_tiling = ntiles_x*block_num_y; - const int nparts = use_stream_k ? ntiles_x : nwaves_xy_tiling; + const int ntiles_x = (args.ncols_y + mmq_x - 1) / mmq_x; - if (nparts < nparts_best) { - mmq_x_best = mmq_x; - nparts_best = nparts; + if (ntiles_x < ntiles_x_best) { + mmq_x_best = mmq_x; + ntiles_x_best = ntiles_x; } } @@ -2950,6 +3197,9 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS); // ------------------------------------------------------------------------------------------------------------------------- +void ggml_cuda_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + void ggml_cuda_op_mul_mat_q( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu index b39961cd1..d8c385e23 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu @@ -4,18 +4,23 @@ template static __global__ void mul_mat_vec( - const T * 
__restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row, + const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, + const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row, const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) { - const int64_t row = blockIdx.x; - const int64_t channel = blockIdx.y; - const int64_t sample = blockIdx.z; - const int tid = threadIdx.x; - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + const int64_t row = blockIdx.x; + const int64_t channel_dst = blockIdx.y; + const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int64_t channel_y = ids ? channel_dst % nchannels_y : channel_dst; + const int64_t sample_dst = blockIdx.z; + const int64_t sample_x = sample_dst / sample_ratio; + const int64_t sample_y = sample_dst; + const int tid = threadIdx.x; + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - x += (sample/sample_ratio)*stride_sample_x + (channel/channel_ratio)*stride_channel_x + row*stride_row; - y += sample *stride_sample_y + channel *stride_channel_y; - dst += sample *stride_sample_dst + channel *stride_channel_dst; + x += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row; + y += sample_y *stride_sample_y + channel_y *stride_channel_y; + dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst; const float2 * y2 = (const float2 *) y; @@ -31,12 +36,19 @@ static __global__ void mul_mat_vec( float sumf = 0.0f; - if constexpr (std::is_same::value) { + if constexpr (std::is_same::value) { + const float2 * x2 = (const float2 *) x; + + for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + const float2 
tmpx = x2[col2]; + const float2 tmpy = y2[col2]; + sumf += tmpx.x*tmpy.x; + sumf += tmpx.y*tmpy.y; + } + } else if constexpr (std::is_same::value) { const half2 * x2 = (const half2 *) x; if (std::is_same::value) { - sumf = 0.0f; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = __half22float2(x2[col2]); const float2 tmpy = y2[col2]; @@ -59,8 +71,6 @@ static __global__ void mul_mat_vec( } } else if constexpr (std::is_same::value) { const int * x2 = (const int *) x; - sumf = 0.0f; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { const int tmpx = x2[col2]; const float2 tmpy = y2[col2]; @@ -92,17 +102,17 @@ static __global__ void mul_mat_vec( template static void launch_mul_mat_vec_cuda( - const T * x, const float * y, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); GGML_ASSERT(stride_row % 2 == 0); - GGML_ASSERT(nchannels_y % nchannels_x == 0); - GGML_ASSERT(nsamples_y % nsamples_x == 0); - const int64_t channel_ratio = nchannels_y / nchannels_x; - const int64_t sample_ratio = nsamples_y / nsamples_x; + GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); + GGML_ASSERT( nsamples_dst % nsamples_x == 0); + const int64_t channel_ratio = nchannels_dst / nchannels_x; + const int64_t sample_ratio = 
nsamples_dst / nsamples_x; int device; int warp_size; @@ -124,48 +134,48 @@ static void launch_mul_mat_vec_cuda( } const int smem = warp_size*sizeof(float); - const dim3 block_nums(nrows, nchannels_y, nsamples_y); + const dim3 block_nums(nrows, nchannels_dst, nsamples_dst); const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 64: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 96: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 128: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 160: { 
mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 192: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 224: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 256: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; default: { GGML_ABORT("fatal error"); @@ -175,28 +185,28 @@ static void launch_mul_mat_vec_cuda( template static void mul_mat_vec_cuda( - const T * x, const float * y, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, + const T * x, const float * y, const int32_t * ids, float * dst, + const 
int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, enum ggml_prec prec, cudaStream_t stream) { - switch (prec) { - case GGML_PREC_DEFAULT: { + if constexpr(std::is_same::value) { + if (prec == GGML_PREC_DEFAULT) { launch_mul_mat_vec_cuda - (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case GGML_PREC_F32: { - launch_mul_mat_vec_cuda - (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; + (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + return; + } } + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); } -void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); +void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const 
ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_TENSOR_BINARY_OP_LOCALS; @@ -204,21 +214,24 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const size_t ts_src1 = ggml_type_size(src1->type); const size_t ts_dst = ggml_type_size(dst->type); - GGML_ASSERT(ne11 == 1); - GGML_ASSERT(ne12 == ne2); + GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. GGML_ASSERT(ne13 == ne3); - GGML_ASSERT(nb00 == ts_src0); - GGML_ASSERT(nb10 == ts_src1); - GGML_ASSERT(nb0 == ts_dst); + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); + GGML_ASSERT( nb0 == ts_dst); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const float * src1_d = (const float *) src1->data; + const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; + float * dst_d = (float *) dst->data; const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s1 = dst->nb[1] / ts_dst; const int64_t s02 = src0->nb[2] / ts_src0; const int64_t s12 = src1->nb[2] / ts_src1; const int64_t s2 = dst->nb[2] / ts_dst; @@ -226,14 +239,33 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const int64_t s13 = src1->nb[3] / ts_src1; const int64_t s3 = dst->nb[3] / ts_dst; + // For MUL_MAT_ID the memory layout is different than for MUL_MAT: + const int64_t ncols_dst = ids ? ne2 : ne1; + const int64_t nchannels_y = ids ? ne11 : ne12; + const int64_t nchannels_dst = ids ? 
ne1 : ne2; + const int64_t stride_channel_dst = ids ? s1 : s2; + const int64_t stride_channel_y = ids ? s11 : s12; + + GGML_ASSERT(ncols_dst == 1); + switch (src0->type) { + case GGML_TYPE_F32: { + const float * src0_d = (const float *) src0->data; + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); + } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream()); + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream()); + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); @@ -262,27 +294,34 @@ void ggml_cuda_op_mul_mat_vec( const int64_t stride_row = ne00; const int64_t nchannels_x = 1; const int64_t nchannels_y = 1; + const int64_t nchannels_dst = 1; const int64_t stride_channel_x = 0; const int64_t stride_channel_y = 0; const int64_t stride_channel_dst = 0; const int64_t nsamples_x = 1; - const int64_t nsamples_y = 1; + const int64_t nsamples_dst = 1; const int64_t stride_sample_x = 0; const int64_t stride_sample_y = 0; const int64_t stride_sample_dst = 0; switch (src0->type) { + case GGML_TYPE_F32: { + const float * src0_d = (const float *) src0_dd_i; + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, 
dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, - nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, - nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh index 78a1cd4a6..756e7e1cc 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh @@ -3,7 +3,7 @@ // maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available #define MMV_MAX_ROWS 512 -void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & 
ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); void ggml_cuda_op_mul_mat_vec( ggml_backend_cuda_context & ctx, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu index eef8585a7..132c466fd 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu @@ -1,50 +1,57 @@ #include "mmvq.cuh" +#include "quantize.cuh" #include "vecdotq.cuh" +#include + typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs); static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 : - type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 : - type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 : - type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 : - type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 : - type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 : - type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 : - type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 : - type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 : - type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 : - type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 : - type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 : - type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 : - type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 : - type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 : - type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 : - type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 : - type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 : - type == GGML_TYPE_IQ3_S ? 
vec_dot_iq3_s_q8_1 : - nullptr; + switch (type) { + case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1; + case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1; + case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1; + case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1; + case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1; + case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1; + case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1; + case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1; + case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1; + case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1; + case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1; + case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1; + case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1; + case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1; + case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1; + case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1; + case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1; + case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1; + case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1; + default: return nullptr; + } } static constexpr __device__ int get_vdr_mmvq(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ : - type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ : - type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ : - type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ : - type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ : - type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ : - type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ : - type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ : - type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ : - type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ : - type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ : - type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_XS ? 
VDR_IQ4_XS_Q8_1_MMVQ : - 1; + switch (type) { + case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ; + case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ; + case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ; + case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ; + case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ; + case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ; + case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ; + case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ; + case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ; + case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ; + case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ; + case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ; + case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ; + case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ; + case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ; + case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ; + case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ; + default: return 1; + } } enum mmvq_parameter_table_id { @@ -73,9 +80,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { return MMVQ_PARAMETERS_GENERIC; } -static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) { +static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) { if (table_id == MMVQ_PARAMETERS_GENERIC) { - switch (ncols_y) { + switch (ncols_dst) { case 1: case 2: case 3: @@ -90,7 +97,7 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_paramete return 1; } } else if (table_id == MMVQ_PARAMETERS_GCN) { - switch (ncols_y) { + switch (ncols_dst) { case 1: case 2: case 3: @@ -107,9 +114,9 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_paramete return 1; } -static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) { +static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id) { if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == 
MMVQ_PARAMETERS_GCN) { - switch (ncols_y) { + switch (ncols_dst) { case 1: return 1; case 2: @@ -127,19 +134,21 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int ta return 1; } -template +template // tell the compiler to use as many registers as it wants, see nwarps definition below -__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) +__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst, + const int ncols_x, const int nchannels_y, const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; constexpr int vdr = get_vdr_mmvq(type); constexpr mmvq_parameter_table_id table_id = get_device_table_id(); - constexpr int nwarps = calc_nwarps(ncols_y, table_id); - constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id); + constexpr int nwarps = calc_nwarps(ncols_dst, table_id); + constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id); constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); @@ -147,13 +156,21 @@ static __global__ void mul_mat_vec_q( const int tid = warp_size*threadIdx.y + threadIdx.x; const int row0 = rows_per_cuda_block*blockIdx.x; const int blocks_per_row_x = ncols_x / 
qk; - const int blocks_per_col_y = nrows_y / QK8_1; constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi; - // partial sum for each thread - float tmp[ncols_y][rows_per_cuda_block] = {{0.0f}}; + // The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1. + const int channel_dst = blockIdx.y; + const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst; + const int sample_dst = blockIdx.z; + const int sample_x = sample_dst / sample_ratio; + const int sample_y = sample_dst; - const block_q8_1 * y = (const block_q8_1 *) vy; + // partial sum for each thread + float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}}; + + const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y; + const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x; for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx @@ -162,18 +179,19 @@ static __global__ void mul_mat_vec_q( const int kqs = vdr * (tid % (qi/vdr)); #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { - tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs); + tmp[j][i] += vec_dot_q_cuda( + vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs); } } } - __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size]; + __shared__ float tmp_shared[nwarps-1 > 0 ? 
nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; if (threadIdx.y > 0) { #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i]; @@ -185,9 +203,11 @@ static __global__ void mul_mat_vec_q( return; } + dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0; + // sum up partial sums and write back result #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { #pragma unroll @@ -197,88 +217,121 @@ static __global__ void mul_mat_vec_q( tmp[j][i] = warp_reduce_sum(tmp[j][i]); } - if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < (unsigned)nrows_dst)) { - dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x]; + if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) { + dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x]; } } - - GGML_UNUSED(nrows_x); } -static std::pair calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) { - const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id); - const dim3 block_nums(nblocks, 1, 1); - const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1); +static std::pair calc_launch_params( + const int ncols_dst, const int nrows_x, const int nchannels_y, const int nsamples_y, + const int warp_size, const mmvq_parameter_table_id table_id) { + const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id); + const dim3 block_nums(nblocks, nchannels_y, nsamples_y); + const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1); return {block_nums, 
block_dims}; } template -static void mul_mat_vec_q_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { +static void mul_mat_vec_q_switch_ncols_dst( + const void * vx, const void * vy, const int32_t * ids, float * dst, + const int ncols_x, const int nrows_x, const int ncols_dst, + const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int nchannels_x, const int nchannels_y, const int nchannels_dst, + const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, + cudaStream_t stream) { GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0); - GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); + GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE); + + const int channel_ratio = nchannels_dst / nchannels_x; + const int sample_ratio = nsamples_dst / nsamples_x; const int device = ggml_cuda_get_device(); const int warp_size = ggml_cuda_info().devices[device].warp_size; const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc); - switch (ncols_y) { + GGML_ASSERT(!ids || ncols_dst == 1); + switch (ncols_dst) { case 1: { - constexpr int c_ncols_y = 1; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 1; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 2: { - constexpr int c_ncols_y = 2; - std::pair 
dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 2; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 3: { - constexpr int c_ncols_y = 3; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 3; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 4: { - constexpr int c_ncols_y = 4; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 4; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 5: { - constexpr int c_ncols_y = 5; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 5; + std::pair dims = 
calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 6: { - constexpr int c_ncols_y = 6; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 6; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 7: { - constexpr int c_ncols_y = 7; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 7; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 8: { - constexpr int c_ncols_y = 8; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 8; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + 
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } default: @@ -287,137 +340,213 @@ static void mul_mat_vec_q_cuda( } } -static void mul_mat_vec_q4_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +static void mul_mat_vec_q_switch_type( + const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, float * dst, + const int ncols_x, const int nrows_x, const int ncols_dst, + const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int nchannels_x, const int nchannels_y, const int nchannels_dst, + const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, + cudaStream_t stream) { + switch (type_x) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, 
nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q_switch_ncols_dst + (vx, 
vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ2_XXS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ2_XS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ3_XXS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, 
stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ1_M: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ4_NL: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ4_XS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + default: + GGML_ABORT("fatal error"); + break; + } } -static void mul_mat_vec_q4_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, 
const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { +void ggml_cuda_mul_mat_vec_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID. - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + GGML_TENSOR_BINARY_OP_LOCALS; -static void mul_mat_vec_q5_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + cudaStream_t stream = ctx.stream(); - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + const size_t ts_src0 = ggml_type_size(src0->type); + const size_t ts_src1 = ggml_type_size(src1->type); + const size_t ts_dst = ggml_type_size(dst->type); -static void mul_mat_vec_q5_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT( nb0 == ts_dst); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. -static void mul_mat_vec_q8_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + const float * src1_d = (const float *) src1->data; + const int32_t * ids_d = ids ? 
(const int32_t *) ids->data : nullptr; + float * dst_d = (float *) dst->data; - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1); + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[3] / ts_src1; + quantize_row_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); + } -static void mul_mat_vec_q2_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s11 = ne10_padded / QK8_1; + const int64_t s1 = dst->nb[1] / ts_dst; + const int64_t s02 = src0->nb[2] / ts_src0; + const int64_t s2 = dst->nb[2] / ts_dst; + const int64_t s03 = src0->nb[3] / ts_src0; + const int64_t s3 = dst->nb[3] / ts_dst; - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + const int64_t s12 = ne11*s11; + const int64_t s13 = ne12*s12; -static void mul_mat_vec_q3_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + // For MUL_MAT_ID the memory layout is different than for MUL_MAT: + const int64_t ncols_dst = ids ? ne2 : ne1; + const int64_t nchannels_y = ids ? ne11 : ne12; + const int64_t nchannels_dst = ids ? ne1 : ne2; + const int64_t stride_col_dst = ids ? s2 : s1; + const int64_t stride_col_y = ids ? s12 : s11; + const int64_t stride_channel_dst = ids ? s1 : s2; + const int64_t stride_channel_y = ids ? 
s11 : s12; - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q4_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q5_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q6_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_xxs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_xs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void 
mul_mat_vec_iq3_xxs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq1_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq1_m_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq4_nl_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq4_xs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq3_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_switch_type( + src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00, + ne01, ncols_dst, s01, stride_col_y, stride_col_dst, + ne02, 
nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, stream); } void ggml_cuda_op_mul_mat_vec_q( @@ -440,68 +569,12 @@ void ggml_cuda_op_mul_mat_vec_q( // nrows_dst == nrows of the matrix that the kernel writes into const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - switch (src0->type) { - case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case 
GGML_TYPE_IQ2_XXS: - mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_XS: - mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_S: - mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ3_XXS: - mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ1_S: - mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ1_M: - mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ4_NL: - mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ4_XS: - mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ3_S: - mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - default: - GGML_ABORT("fatal error"); - break; - } + const int stride_row_x = ne00 / ggml_blck_size(src0->type); + const int stride_col_y = src1_padded_row_size / QK8_1; + + mul_mat_vec_q_switch_type( + src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream); GGML_UNUSED(src1); GGML_UNUSED(dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh 
b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh index d9e42fdd6..39dc7d33e 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh @@ -2,6 +2,9 @@ #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. +void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + void ggml_cuda_op_mul_mat_vec_q( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu index 1702e4ce2..931a45ad3 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu @@ -1,30 +1,40 @@ #include "quantize.cuh" #include -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) { - const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; +static __global__ void quantize_q8_1( + const float * __restrict__ x, void * __restrict__ vy, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int ne1, const int ne2) { + const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; - if (ix0 >= kx0_padded) { + if (i0 >= ne0) { return; } - const int64_t ix1 = blockIdx.y; + const int64_t i1 = blockIdx.y; + const int64_t i2 = blockIdx.z % ne2; + const int64_t i3 = blockIdx.z / ne2; - const int64_t i_padded = ix1*kx0_padded + ix0; + const int64_t & i00 = i0; + const int64_t & i01 = i1; + const int64_t & i02 = i2; + const int64_t & i03 = i3; + + const int64_t i_cont = ((i3*ne2 + i2) * ne1 + i1) * ne0 + i0; block_q8_1 * y = (block_q8_1 *) vy; - const int64_t ib = i_padded / QK8_1; // block index - const int64_t iqs = i_padded % QK8_1; // quant index 
+ const int64_t ib = i_cont / QK8_1; // block index + const int64_t iqs = i_cont % QK8_1; // quant index - const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f; + const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f; float amax = fabsf(xi); float sum = xi; amax = warp_reduce_max(amax); - sum = warp_reduce_sum(sum); + sum = warp_reduce_sum(sum); - const float d = amax / 127; + const float d = amax / 127; const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); y[ib].qs[iqs] = q; @@ -39,29 +49,38 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest template static __global__ void quantize_mmq_q8_1( - const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) { + const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int ne1, const int ne2) { constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32; constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32; - const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4; + const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4; - if (ix0 >= kx0_padded) { + if (i0 >= ne0) { return; } - const float4 * x4 = (const float4 *) x; + const int64_t i1 = blockIdx.y; + const int64_t i2 = blockIdx.z % ne2; + const int64_t i3 = blockIdx.z / ne2; - const int64_t ix1 = kx1*blockIdx.z + blockIdx.y; + const int64_t i00 = i0; + const int64_t i01 = ids ? 
ids[i1] : i1; + const int64_t i02 = i2; + const int64_t i03 = i3; + + const float4 * x4 = (const float4 *) x; block_q8_1_mmq * y = (block_q8_1_mmq *) vy; const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel - const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel - const int64_t iqs = ix0 % (4*QK8_1); // quant index in block + const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y; // block index in channel + const int64_t iqs = i0 % (4*QK8_1); // quant index in block // Load 4 floats per thread and calculate max. abs. value between them: - const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f); + const float4 xi = i0 < ne00 ? x4[(i03*s03 + i02*s02 + i01*s01 + i00)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f); float amax = fabsf(xi.x); amax = fmaxf(amax, fabsf(xi.y)); amax = fmaxf(amax, fabsf(xi.z)); @@ -77,7 +96,7 @@ static __global__ void quantize_mmq_q8_1( if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) { sum = xi.x + xi.y + xi.z + xi.w; - // Exchange calculate sum across vals_per_sum/4 threads. + // Calculate sums across vals_per_sum/4 threads. 
#pragma unroll for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) { sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE); @@ -127,40 +146,40 @@ static __global__ void quantize_mmq_q8_1( } void quantize_row_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, - const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + const float * x, const int32_t * ids, void * vy, const ggml_type type_src0, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { + GGML_ASSERT(!ids); + GGML_ASSERT(ne0 % QK8_1 == 0); - GGML_ASSERT(kx0_padded % QK8_1 == 0); - - const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - const dim3 num_blocks(block_num_x, kx1*channels, 1); + const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>(x, vy, kx0, kx0_padded); - - GGML_UNUSED(type_x); + quantize_q8_1<<>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2); + GGML_UNUSED(type_src0); } void quantize_mmq_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, - const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + const float * x, const int32_t * ids, void * vy, const ggml_type type_src0, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { + GGML_ASSERT(ne0 % (4*QK8_1) == 0); - GGML_ASSERT(kx0_padded % (4*QK8_1) == 0); - - const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); - const dim3 num_blocks(block_num_x, kx1, channels); + const int64_t 
block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); + const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1); - switch (mmq_get_q8_1_ds_layout(type_x)) { + switch (mmq_get_q8_1_ds_layout(type_src0)) { case MMQ_Q8_1_DS_LAYOUT_D4: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; case MMQ_Q8_1_DS_LAYOUT_DS4: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; case MMQ_Q8_1_DS_LAYOUT_D2S6: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; default: GGML_ABORT("fatal error"); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh index 03bf322b9..725ab5244 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh @@ -12,13 +12,16 @@ static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); typedef void (*quantize_cuda_t)( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); + const float * x, const int32_t * ids, void * vy, + ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03, + int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream); void quantize_row_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); + const float * x, const int32_t * ids, void * vy, + ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03, + int64_t ne0, int64_t ne1, int64_t ne2, int64_t 
ne3, cudaStream_t stream); void quantize_mmq_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); + const float * x, const int32_t * ids, void * vy, + ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03, + int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh index 40091a0ef..ba195e1d1 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh @@ -1,3 +1,5 @@ +#pragma once + #include "common.cuh" #include diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 297933adb..223dc1807 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -5690,7 +5690,7 @@ kernel void kernel_flash_attn_ext( { float S[Q] = { [0 ... Q-1] = 0.0f }; - float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 }; + float M[Q] = { [0 ... 
Q-1] = -__FLT_MAX__/2 }; // thread indices inside the simdgroup // TODO: see if we can utilize quad-group functions for better performance @@ -5950,7 +5950,7 @@ kernel void kernel_flash_attn_ext( // reduce the warps sequentially for (ushort sg = 1; sg < nsg; ++sg) { float S = { 0.0f }; - float M = { -__FLT16_MAX__/2 }; + float M = { -__FLT_MAX__/2 }; threadgroup_barrier(mem_flags::mem_threadgroup); @@ -6197,7 +6197,7 @@ kernel void kernel_flash_attn_ext_vec( { float S = 0.0f; - float M = -__FLT16_MAX__/2; + float M = -__FLT_MAX__/2; // thread indices inside the simdgroup const short tx = tiisg%NL; diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index b2e95a66c..112abef68 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -44,8 +44,8 @@ static struct ggml_backend_device g_ggml_backend_metal_device; // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; + id mtl_device; + int mtl_device_ref_count; id mtl_library; bool has_simdgroup_reduction; @@ -491,7 +491,259 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_COUNT }; +// +// ggml_metal_heap +// + +struct ggml_metal_heap { + // number of times the heap was unused + int n_unused; + + // total number of buffer allocations in this heap across all computes + int64_t n_alloc; + + // current offset in the heap - we reset this after each node in order to reuse the memory + size_t offs; + + // the currently allocated MTLBuffer objects in this heap + id obj; + + NSMutableArray * bufs; +}; + +static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { + struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap)); + + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + 
desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypePlacement; + desc.size = size; + + heap->n_unused = 0; + heap->n_alloc = 0; + + heap->obj = [device newHeapWithDescriptor:desc]; + if (!heap->obj) { + GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); + + free(heap); + + return false; + } + + [desc release]; + + heap->bufs = [[NSMutableArray alloc] init]; + + return heap; +} + +static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { + heap->offs = 0; + + // count how many graph computes the heap ended up being unused + if ([heap->bufs count] > 0) { + heap->n_unused = 0; + } else { + heap->n_unused++; + } + + for (id buf in heap->bufs) { + [buf release]; + } + [heap->bufs removeAllObjects]; + + // tell the OS that it can reuse this memory if needed + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc + [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; +} + +static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { + if (heap == nil) { + return; + } + + ggml_metal_heap_reset(heap); + + [heap->obj release]; + [heap->bufs release]; + + free(heap); +} + +@interface ggml_metal_heap_ptr : NSObject + +@property (nonatomic, assign) struct ggml_metal_heap * data; + +@end + +@implementation ggml_metal_heap_ptr +@end + +// +// ggml_metal_mem_pool +// + +struct ggml_metal_mem_pool { + id device; + + int n_heaps; // total number of heaps ever created (including those that were removed) + + NSMutableArray * heaps; + NSMutableArray * heaps_to_remove; +}; + +static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { + struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); + + mem_pool->n_heaps = 0; + + mem_pool->heaps = [[NSMutableArray alloc] init]; + mem_pool->heaps_to_remove = [[NSMutableArray alloc] init]; + + return mem_pool; +} + +static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { + 
GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps); + + size_t size_all = 0; + size_t size_cur = 0; + + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data); + GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc); + GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused); + GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]); + + if ([ptr.data->bufs count] > 0) { + size_cur += [ptr.data->obj size]; + } + size_all += [ptr.data->obj size]; + + ggml_metal_heap_free(ptr.data); + [ptr release]; + } + [mem_pool->heaps release]; + [mem_pool->heaps_to_remove release]; + + if (size_all > 0) { + GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0); + } + + free(mem_pool); +} + +static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { + for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) { + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_reset(heap); + + // if the heap hasn't been used for a while, remove it + if (heap->n_unused >= 128) { + [mem_pool->heaps_to_remove addObject:@(i)]; + } + } + + if (mem_pool->heaps_to_remove.count > 0) { + for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) { + NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue]; + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_free(heap); + + [mem_pool->heaps removeObjectAtIndex:index]; + [ptr release]; + } + + [mem_pool->heaps_to_remove removeAllObjects]; + } +} + +static void 
ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + ptr.data->offs = 0; + } +} + +static id ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) { + const size_t alignment = 32; + + const size_t size_aligned = GGML_PAD(size, alignment); + + // try one of the existing heaps + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + struct ggml_metal_heap * heap = ptr.data; + if (heap->offs + size_aligned <= [heap->obj size]) { + // if this is the first buffer in the heap for the current command buffer, tell the OS that + // it cannot free the memory used by the heap + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc + if ([heap->bufs count] == 0) { + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + } + + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return nil; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + return buf; + } + } + + // create a new heap that can fit this buffer + ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; + + struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); + if (heap == NULL) { + GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned); + return NULL; + } + + //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); + + heap_ptr.data = heap; + ggml_metal_heap_reset(heap); + + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, 
size_aligned); + return NULL; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + [mem_pool->heaps addObject:heap_ptr]; + mem_pool->n_heaps++; + + return buf; +} + +struct ggml_metal_command_buffer { + id obj; + + // each command buffer has a memory pool from which it can allocate temporary buffers during the compute + struct ggml_metal_mem_pool * mem_pool; +}; + struct ggml_backend_metal_context { + id device; id queue; dispatch_queue_t d_queue; @@ -516,7 +768,7 @@ struct ggml_backend_metal_context { void (^encode_async)(size_t ith); // n_cb command buffers + 1 used by the main thread - id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; @@ -706,9 +958,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de struct ggml_backend_metal_device_context * ctx_dev = dev->context; id device = ggml_backend_metal_device_acq(ctx_dev); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); - ctx->queue = [device newCommandQueue]; + ctx->device = device; + ctx->queue = [device newCommandQueue]; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -769,7 +1023,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de ctx->gf = nil; ctx->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - ctx->command_buffers[i] = nil; + ctx->cmd_bufs[i].obj = nil; + + ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init(); + ctx->cmd_bufs[i].mem_pool->device = device; } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -1183,6 +1440,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; + for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; 
++i) { + // ctx->cmd_bufs[i].obj is auto released + + ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); + } + dispatch_release(ctx->d_queue); free(ctx); @@ -1489,10 +1752,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } } -static void ggml_metal_encode_node( +static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, - id encoder) { + id encoder, + struct ggml_metal_mem_pool * mem_pool) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -1508,7 +1772,7 @@ static void ggml_metal_encode_node( struct ggml_tensor * dst = node; if (ggml_is_empty(dst)) { - return; + return true; } switch (dst->op) { @@ -1519,7 +1783,7 @@ static void ggml_metal_encode_node( case GGML_OP_PERMUTE: { // noop -> next node - } return; + } return true; default: { } break; @@ -1530,6 +1794,8 @@ static void ggml_metal_encode_node( GGML_ABORT("unsupported op"); } + ggml_metal_mem_pool_clear(mem_pool); + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? 
src0->ne[2] : 0; @@ -2176,26 +2442,76 @@ static void ggml_metal_encode_node( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_metal_kargs_soft_max args = { +// use this branch to test the ggml_metal_mem_pool functionality +#if 0 + // cpy to tmp buffer in MTLHeap + + id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); + if (!h_src0) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0)); + return false; + } + + offs_src0 = 0; + + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, /*.ne02 =*/ ne02, - /*.scale =*/ scale, - /*.max_bias =*/ max_bias, - /*.m0 =*/ m0, - /*.m1 =*/ m1, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne0 =*/ ne00, + /*.ne1 =*/ ne01, + /*.ne2 =*/ ne02, + /*.ne3 =*/ ne03, + /*.nb0 =*/ nb00, + /*.nb1 =*/ nb01, + /*.nb2 =*/ nb02, + /*.nb3 =*/ nb03, + }; + + if (src0->type == GGML_TYPE_F16) { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; + } else { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline]; + } + [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:h_src0 offset:0 atIndex:2]; + + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type)); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; + +#else + id h_src0 = id_src0; +#endif + // softmax + + ggml_metal_kargs_soft_max args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.scale =*/ scale, + /*.max_bias =*/ max_bias, + /*.m0 =*/ m0, + /*.m1 =*/ m1, /*.n_head_log2 =*/ n_head_log2, }; [encoder setComputePipelineState:pipeline]; - [encoder 
setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:0]; if (id_src1) { - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:1]; } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&args length:sizeof(args) atIndex:3]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:3]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; @@ -4634,6 +4950,8 @@ static void ggml_metal_encode_node( GGML_ABORT("fatal error"); } } + + return true; } static enum ggml_status ggml_metal_graph_compute( @@ -4687,25 +5005,25 @@ static enum ggml_status ggml_metal_graph_compute( } // the main thread commits the first few commands immediately - // command_buffer[n_cb] + // cmd_buf[n_cb] { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[n_cb] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; - [command_buffer enqueue]; + [cmd_buf enqueue]; ctx->encode_async(n_cb); } // prepare the rest of the command buffers asynchronously - // command_buffer[0.. n_cb) + // cmd_buf[0.. 
n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[cb_idx] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer enqueue]; + [cmd_buf enqueue]; } } @@ -4714,14 +5032,14 @@ static enum ggml_status ggml_metal_graph_compute( // wait for completion and check status of each command buffer // needed to detect if the device ran out-of-memory for example (#1881) { - id command_buffer = ctx->command_buffers[n_cb]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -4729,20 +5047,20 @@ static enum ggml_status ggml_metal_graph_compute( } for (int i = 0; i < n_cb; ++i) { - id command_buffer = ctx->command_buffers[i]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription 
UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; } - id next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil); + id next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil); if (!next_buffer) { continue; } @@ -5126,8 +5444,9 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id command_buffer = ctx->command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoder]; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + + id encoder = [cmd_buf computeCommandEncoder]; int node_start = 0; int node_end = n_nodes_0; @@ -5139,22 +5458,29 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + ggml_metal_mem_pool_reset(mem_pool); + for (int idx = node_start; idx < node_end; ++idx) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder); + const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); if (should_capture) { [encoder popDebugGroup]; } + + if (!res) { + break; + } } [encoder endEncoding]; if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; + [cmd_buf commit]; } }); } diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 71f0f97ff..6ceb3cef7 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -3237,7 +3237,7 @@ kernel void kernel_flash_attn_ext( { float S[Q] = { [0 ... Q-1] = 0.0f }; - float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 }; + float M[Q] = { [0 ... 
Q-1] = -__FLT_MAX__/2 }; // thread indices inside the simdgroup // TODO: see if we can utilize quad-group functions for better performance @@ -3497,7 +3497,7 @@ kernel void kernel_flash_attn_ext( // reduce the warps sequentially for (ushort sg = 1; sg < nsg; ++sg) { float S = { 0.0f }; - float M = { -__FLT16_MAX__/2 }; + float M = { -__FLT_MAX__/2 }; threadgroup_barrier(mem_flags::mem_threadgroup); @@ -3744,7 +3744,7 @@ kernel void kernel_flash_attn_ext_vec( { float S = 0.0f; - float M = -__FLT16_MAX__/2; + float M = -__FLT_MAX__/2; // thread indices inside the simdgroup const short tx = tiisg%NL; diff --git a/ml/backend/ggml/ggml/src/ggml.c b/ml/backend/ggml/ggml/src/ggml.c index 2276b6312..3c57aff8b 100644 --- a/ml/backend/ggml/ggml/src/ggml.c +++ b/ml/backend/ggml/ggml/src/ggml.c @@ -4,6 +4,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-threading.h" +#include "ggml-cpu.h" #include "ggml.h" // FIXME: required here for quantization functions @@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { } } -// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library -// currently, the ggml_cpu_has_* functions are entirely compile-time void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { - int64_t i = 0; -#if defined(__F16C__) - //if (ggml_cpu_has_f16c()) { - for (; i + 7 < n; i += 8) { - __m256 x_vec = _mm256_loadu_ps(x + i); - __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storeu_si128((__m128i *)(y + i), y_vec); - } - for(; i + 3 < n; i += 4) { - __m128 x_vec = _mm_loadu_ps(x + i); - __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storel_epi64((__m128i *)(y + i), y_vec); - } - //} -#endif - for (; i < n; i++) { + int i = 0; + for (; i < n; ++i) { y[i] = GGML_FP32_TO_FP16(x[i]); } } void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { - int64_t i = 0; -#if 
defined(__AVX512F__) - //if (ggml_cpu_has_avx512()) { - for (; i + 16 <= n; i += 16) { - _mm512_storeu_ps(y + i, - _mm512_castsi512_ps( - _mm512_slli_epi32( - _mm512_cvtepu16_epi32( - _mm256_loadu_si256( - (const __m256i *)(x + i))), - 16))); - } - //} -#endif -#if defined(__AVX2__) - //if (ggml_cpu_has_avx2()) { - for (; i + 8 <= n; i += 8) { - _mm256_storeu_ps(y + i, - _mm256_castsi256_ps( - _mm256_slli_epi32( - _mm256_cvtepu16_epi32( - _mm_loadu_si128( - (const __m128i *)(x + i))), - 16))); - } - //} -#endif - for (; i < n; i++) { + int i = 0; + for (; i < n; ++i) { y[i] = GGML_BF16_TO_FP32(x[i]); } } @@ -956,6 +915,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CONV_TRANSPOSE_1D", "IM2COL", "IM2COL_BACK", + "CONV_2D_DW", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", @@ -994,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1051,6 +1011,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "im2col_back(x)", + "conv_2d_dw(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", @@ -1089,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1346,6 +1307,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; } +bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) { + return + tensor->nb[0] > tensor->nb[2] && + tensor->nb[1] > tensor->nb[0] && + tensor->nb[2] == ggml_type_size(tensor->type); +} + static inline bool ggml_is_padded_1d(const struct ggml_tensor * 
tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -4052,6 +4020,46 @@ struct ggml_tensor * ggml_conv_2d_dw( return result; } +// ggml_conv_2d_dw_direct + +struct ggml_tensor * ggml_conv_2d_dw_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1) { + GGML_ASSERT(a->ne[2] == 1); + GGML_ASSERT(a->ne[3] == b->ne[2]); + int64_t ne[4]; + ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0); + ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1); + ne[2] = b->ne[2]; + ne[3] = b->ne[3]; + + struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne); + + if (ggml_is_contiguous_channels(b)) { + // Result will be permuted the same way as input (CWHN order) + const int64_t type_size = ggml_type_size(result->type); + GGML_ASSERT(ggml_blck_size(result->type) == 1); + result->nb[0] = result->ne[2] * type_size; + result->nb[1] = result->ne[0] * result->nb[0]; + result->nb[2] = type_size; + } + + int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_2D_DW; + result->src[0] = a; + result->src[1] = b; + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {