mirror of https://github.com/ollama/ollama.git (synced 2025-05-10 18:06:33 +02:00)

llama: update to commit e1e8e099 (#10513)

parent e6d2d04121, commit 8dd12c873d
68 changed files with 3783 additions and 1774 deletions
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=2016f07bd106c73699ecbaace80f55db5ed95dac
+FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5

 .PHONY: help
 help:
llama/build-info.cpp (generated, vendored, 2 changes)

@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac";
+char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
llama/llama.cpp/common/common.h (vendored, 2 changes)

@@ -342,6 +342,8 @@ struct common_params {

     // multimodal models (see examples/llava)
     struct common_params_model mmproj;
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false;     // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)

     // embedding
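The two new fields let a caller disable the multimodal projector entirely or keep it off the GPU, independent of the text model. A minimal standalone sketch of the decision these flags drive (hypothetical helper names, not the llama.cpp API):

    #include <cstdio>
    #include <string>
    #include <vector>

    // Simplified stand-ins for the real structs (illustration only).
    struct common_params_model { std::string path; };

    struct common_params {
        common_params_model mmproj;
        bool mmproj_use_gpu = true;  // use GPU for multimodal model
        bool no_mmproj      = false; // explicitly disable multimodal model
        std::vector<std::string> image;
    };

    // Decide whether a multimodal projector should be loaded at all.
    static bool should_load_mmproj(const common_params & p) {
        return !p.no_mmproj && !p.mmproj.path.empty();
    }

    int main() {
        common_params p;
        p.mmproj.path = "mmproj.gguf";
        p.no_mmproj   = true; // user explicitly disabled multimodal support
        std::printf("load mmproj: %s\n", should_load_mmproj(p) ? "yes" : "no");
    }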
@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
     auto has_max = max_items != std::numeric_limits<int>::max();

+    if (max_items == 0) {
+        return "";
+    }
     if (min_items == 0 && max_items == 1) {
         return item_rule + "?";
     }
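build_repetition maps JSON-schema minItems/maxItems bounds onto grammar repetition suffixes; the new early return makes a {0,0} bound produce an empty rule instead of falling through. A standalone sketch of the mapping (simplified, separator handling omitted):

    #include <climits>
    #include <cstdio>
    #include <string>

    // Minimal re-implementation of the bounded-repetition idea for illustration;
    // the real build_repetition also handles separator rules and more cases.
    static std::string repeat_rule(const std::string & item, int min_items, int max_items) {
        if (max_items == 0) return "";                             // {0,0}: match nothing
        if (min_items == 0 && max_items == 1) return item + "?";
        if (min_items == 1 && max_items == INT_MAX) return item + "+";
        if (min_items == 0 && max_items == INT_MAX) return item + "*";
        if (max_items == INT_MAX) return item + "{" + std::to_string(min_items) + ",}";
        return item + "{" + std::to_string(min_items) + "," + std::to_string(max_items) + "}";
    }

    int main() {
        std::printf("'%s'\n", repeat_rule("digit", 0, 0).c_str()); // ''
        std::printf("'%s'\n", repeat_rule("digit", 0, 1).c_str()); // 'digit?'
        std::printf("'%s'\n", repeat_rule("digit", 2, 4).c_str()); // 'digit{2,4}'
    }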
llama/llama.cpp/examples/llava/clip-impl.h (vendored, 50 changes)

@@ -2,8 +2,6 @@
 #include "gguf.h"
-#include "clip.h"
-
 #include "clip.h"

 #include <climits>
 #include <cstdarg>
 #include <string>

@@ -17,33 +15,31 @@
 #define KEY_FTYPE               "general.file_type"
 #define KEY_NAME                "general.name"
 #define KEY_DESCRIPTION         "general.description"
-#define KEY_HAS_TEXT_ENC        "clip.has_text_encoder"
-#define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
-#define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
-#define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
-#define KEY_HAS_GLM_PROJ        "clip.has_glm_projector"
 #define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
-#define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU            "clip.use_gelu"
 #define KEY_USE_SILU            "clip.use_silu"
-#define KEY_N_EMBD              "clip.%s.embedding_length"
-#define KEY_N_FF                "clip.%s.feed_forward_length"
-#define KEY_N_BLOCK             "clip.%s.block_count"
-#define KEY_N_HEAD              "clip.%s.attention.head_count"
-#define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
-#define KEY_PROJ_DIM            "clip.%s.projection_dim"
-#define KEY_TOKENS              "tokenizer.ggml.tokens"
-#define KEY_N_POSITIONS         "clip.text.context_length"
+#define KEY_N_EMBD              "clip.vision.embedding_length"
+#define KEY_N_FF                "clip.vision.feed_forward_length"
+#define KEY_N_BLOCK             "clip.vision.block_count"
+#define KEY_N_HEAD              "clip.vision.attention.head_count"
+#define KEY_LAYER_NORM_EPS      "clip.vision.attention.layer_norm_epsilon"
+#define KEY_PROJ_DIM            "clip.vision.projection_dim"
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
 #define KEY_IMAGE_STD           "clip.vision.image_std"
-#define KEY_PROJ_TYPE           "clip.projector_type"
 #define KEY_FEATURE_LAYER       "clip.vision.feature_layer"
+#define KEY_PROJ_SCALE_FACTOR   "clip.vision.projector.scale_factor"
+#define KEY_PROJ_TYPE           "clip.projector_type"
+
+#define KEY_USE_GLU_MLP         "clip.use_glu_mlp"  // for qwen2.5vl
+#define KEY_USE_RMS_NORM        "clip.use_rms_norm" // for qwen2.5vl

 #define KEY_MM_PATCH_MERGE_TYPE    "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
 #define KEY_IMAGE_CROP_RESOLUTION  "clip.vision.image_crop_resolution"
+#define KEY_WIN_ATTN_PATTERN       "clip.vision.n_wa_pattern"
+#define KEY_ATTN_WINDOW_SIZE       "clip.vision.window_size"


 //

@@ -60,7 +56,9 @@
 #define TN_ATTN_V      "%s.blk.%d.attn_v.%s"
 #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
 #define TN_FFN_DOWN    "%s.blk.%d.ffn_down.%s"
+#define TN_FFN_GATE    "%s.blk.%d.ffn_gate.%s"
 #define TN_FFN_UP      "%s.blk.%d.ffn_up.%s"
+#define TN_FFN_GATE    "%s.blk.%d.ffn_gate.%s"
 #define TN_LN_1        "%s.blk.%d.ln1.%s"
 #define TN_LN_2        "%s.blk.%d.ln2.%s"
 #define TN_LN_PRE      "%s.pre_ln.%s"

@@ -72,6 +70,8 @@
 #define TN_IMAGE_NEWLINE  "model.image_newline"
 #define TN_MM_INP_PROJ    "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N  "mm.soft_emb_norm.weight"    // gemma3
+#define TN_MM_PROJECTOR   "mm.model.fc.weight"         // idefics3
+#define TN_TOK_IMG_BREAK  "v.token_embd.img_break"     // pixtral

 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"

@@ -87,18 +87,19 @@
 #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
 #define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
 #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
-#define TN_GLM_BOI_W            "adapter.boi"
-#define TN_GLM_EOI_W            "adapter.eoi"

 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
-    PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_MINICPMV,
     PROJECTOR_TYPE_GLM_EDGE,
-    PROJECTOR_TYPE_MERGER,
+    PROJECTOR_TYPE_QWEN2VL,
     PROJECTOR_TYPE_GEMMA3,
+    PROJECTOR_TYPE_IDEFICS3,
+    PROJECTOR_TYPE_PIXTRAL,
+    PROJECTOR_TYPE_QWEN25VL,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -106,10 +107,13 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP,       "mlp" },
     { PROJECTOR_TYPE_LDP,       "ldp" },
     { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
-    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_MINICPMV,  "resampler"},
     { PROJECTOR_TYPE_GLM_EDGE,  "adapter"},
-    { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
+    { PROJECTOR_TYPE_QWEN2VL,   "qwen2vl_merger"},
+    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
     { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
+    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
+    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
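clip_projector_type_from_string resolves a GGUF clip.projector_type string by reverse lookup over PROJECTOR_TYPE_NAMES. A reduced standalone sketch of that lookup (abbreviated enum, illustration only):

    #include <cstdio>
    #include <map>
    #include <string>

    enum projector_type { PROJ_MLP, PROJ_PIXTRAL, PROJ_UNKNOWN };

    static const std::map<projector_type, std::string> NAMES = {
        { PROJ_MLP,     "mlp" },
        { PROJ_PIXTRAL, "pixtral" },
    };

    // Find the enum whose display name matches the string from the GGUF header.
    static projector_type from_string(const std::string & s) {
        for (const auto & kv : NAMES) {
            if (kv.second == s) return kv.first;
        }
        return PROJ_UNKNOWN;
    }

    int main() {
        std::printf("%d\n", from_string("pixtral")); // 1 (PROJ_PIXTRAL)
    }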
llama/llama.cpp/examples/llava/clip.cpp (vendored, 1261 changes)

File diff suppressed because it is too large.
llama/llama.cpp/examples/llava/clip.h (vendored, 21 changes)

@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
 CLIP_API void clip_free(struct clip_ctx * ctx);

 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);

 CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);

@@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);

-CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);
+GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
+    "use clip_n_output_tokens instead");
+GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
+    "use clip_n_output_tokens instead");
+
+CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// for M-RoPE, this will be the number of token positions in X and Y directions
+// for other models, X will be the total number of tokens and Y will be 1
+CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// this should be equal to the embedding dimension of the text model
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);

 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);

@@ -114,8 +125,6 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);

-CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
-
 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
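GGML_DEPRECATED keeps the old entry points compiling while steering callers toward clip_n_output_tokens, whose count can depend on the input image. A standalone sketch of the pattern (hypothetical names; the macro here is a simplified stand-in for the real one):

    #include <cstdio>

    // Simplified version of the GGML_DEPRECATED idea used in clip.h: wrap a
    // declaration so compilers emit a warning with a migration hint.
    #if defined(__GNUC__) || defined(__clang__)
    #    define MY_DEPRECATED(decl, hint) decl __attribute__((deprecated(hint)))
    #elif defined(_MSC_VER)
    #    define MY_DEPRECATED(decl, hint) __declspec(deprecated(hint)) decl
    #else
    #    define MY_DEPRECATED(decl, hint) decl
    #endif

    MY_DEPRECATED(int old_token_count(void), "use new_token_count instead");

    int old_token_count(void) { return 576; }   // fixed count, image ignored
    int new_token_count(int img_w, int img_h) { // count depends on the image
        const int patch = 14;
        return (img_w / patch) * (img_h / patch);
    }

    int main() {
        // old_token_count();  // would compile, but with a deprecation warning
        std::printf("%d tokens\n", new_token_count(336, 336));
    }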
llama/llama.cpp/examples/llava/llava.cpp (vendored, 15 changes)

@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 }

 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
     struct {
         struct ggml_context * ctx;
     } model;

@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

     model.ctx = ggml_init(params);

-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {

@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));

     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python

@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+            n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {

@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
         clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
             LOG_ERR("Unable to encode image\n");

@@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

     int n_img_pos_out;
-    clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+    clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
+    clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
     *n_img_pos = n_img_pos_out;

     for (size_t i = 0; i < image_embd_v.size(); i++) {
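These llava.cpp changes replace the model-wide clip_n_patches constant with a per-image clip_n_output_tokens, which matters once grid segments can differ in size. A standalone sketch of the offset bookkeeping (hypothetical token-count function):

    #include <cstdio>
    #include <utility>
    #include <vector>

    // Hypothetical stand-in for clip_n_output_tokens: the token count depends
    // on each segment's size rather than on one model-wide constant.
    static int n_output_tokens(int img_w, int img_h, int patch = 14) {
        return (img_w / patch) * (img_h / patch);
    }

    int main() {
        // An "anyres" grid: base image plus differently sized segments.
        std::vector<std::pair<int,int>> segments = {{336, 336}, {336, 336}, {336, 112}};
        int n_img_pos = 0;
        for (auto [w, h] : segments) {
            n_img_pos += n_output_tokens(w, h); // offset into image_embd grows per segment
        }
        std::printf("total image positions: %d\n", n_img_pos);
    }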
llama/llama.cpp/include/llama.h (vendored, 2 changes)

@@ -111,6 +111,7 @@ extern "C" {
     LLAMA_VOCAB_PRE_TYPE_TRILLION   = 31,
     LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
     LLAMA_VOCAB_PRE_TYPE_LLAMA4     = 33,
+    LLAMA_VOCAB_PRE_TYPE_PIXTRAL    = 34,
 };

 enum llama_rope_type {

@@ -1237,6 +1238,7 @@ extern "C" {
                 "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");

     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);

     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
llama/llama.cpp/src/llama-arch.cpp (vendored, 20 changes)

@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,          "refact" },
     { LLM_ARCH_BERT,            "bert" },
     { LLM_ARCH_NOMIC_BERT,      "nomic-bert" },
+    { LLM_ARCH_NOMIC_BERT_MOE,  "nomic-bert-moe" },
     { LLM_ARCH_JINA_BERT_V2,    "jina-bert-v2" },
     { LLM_ARCH_BLOOM,           "bloom" },
     { LLM_ARCH_STABLELM,        "stablelm" },

@@ -109,6 +110,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,   "%s.expert_weights_scale" },
     { LLM_KV_EXPERT_WEIGHTS_NORM,    "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC,     "%s.expert_gating_func" },
+    { LLM_KV_MOE_EVERY_N_LAYERS,     "%s.moe_every_n_layers" },
     { LLM_KV_POOLING_TYPE,           "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE,            "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },

@@ -511,6 +513,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
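The LLM_TENSOR_NAMES tables map symbolic tensor IDs to printf-style GGUF name templates, which loaders expand per layer. A standalone sketch of that expansion:

    #include <cstdio>

    // The tables above store templates such as "blk.%d.ffn_up_exps";
    // a loader expands them per layer roughly like this.
    int main() {
        const char * tmpl = "blk.%d.ffn_up_exps"; // template from the table above
        char name[64];
        for (int il = 0; il < 3; ++il) {
            std::snprintf(name, sizeof(name), tmpl, il);
            std::printf("%s.weight\n", name); // blk.0.ffn_up_exps.weight, ...
        }
    }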
llama/llama.cpp/src/llama-arch.h (vendored, 2 changes)

@@ -24,6 +24,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,

@@ -113,6 +114,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
llama/llama.cpp/src/llama-chat.cpp (vendored, 43 changes)

@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3",  LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
     { "command-r",  LLM_CHAT_TEMPLATE_COMMAND_R },
     { "llama3",     LLM_CHAT_TEMPLATE_LLAMA_3 },
-    { "chatglm3",   LLM_CHAT_TEMPLATE_CHATGML_3 },
-    { "chatglm4",   LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "chatglm3",   LLM_CHAT_TEMPLATE_CHATGLM_3 },
+    { "chatglm4",   LLM_CHAT_TEMPLATE_CHATGLM_4 },
     { "glmedge",    LLM_CHAT_TEMPLATE_GLMEDGE },
     { "minicpm",    LLM_CHAT_TEMPLATE_MINICPM },
     { "exaone3",    LLM_CHAT_TEMPLATE_EXAONE_3 },

@@ -62,6 +62,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "yandex",     LLM_CHAT_TEMPLATE_YANDEX },
     { "bailing",    LLM_CHAT_TEMPLATE_BAILING },
     { "llama4",     LLM_CHAT_TEMPLATE_LLAMA4 },
+    { "smolvlm",    LLM_CHAT_TEMPLATE_SMOLVLM },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {

@@ -81,6 +82,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     if (tmpl_contains("<|im_start|>")) {
         return tmpl_contains("<|im_sep|>")
             ? LLM_CHAT_TEMPLATE_PHI_4
+            : tmpl_contains("<end_of_utterance>")
+            ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
             : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {

@@ -119,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
+    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
+        return LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {

@@ -149,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;

@@ -432,7 +437,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {

@@ -442,7 +447,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);

@@ -451,14 +456,6 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>" << "\n" << message->content;
-        }
-        if (add_ass) {
-            ss << "<|assistant|>";
-        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {

@@ -620,6 +617,22 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|header_start|>assistant<|header_end|>\n\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
+        // SmolVLM
+        ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "<end_of_utterance>\n";
+            } else {
+                ss << "Assistant: " << message->content << "<end_of_utterance>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
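For reference, a standalone rendering of the SmolVLM branch added above, showing the prompt it produces for a two-message chat (simplified message struct, not the real llm_chat_apply_template signature):

    #include <cstdio>
    #include <sstream>
    #include <string>
    #include <vector>

    struct msg { std::string role, content; };

    int main() {
        std::vector<msg> chat = {
            {"system", "You are helpful."},
            {"user",   "Describe the image."},
        };
        std::ostringstream ss;
        ss << "<|im_start|>"; // BOS only; the body is not chatml
        for (const auto & m : chat) {
            if (m.role == "system")    ss << m.content << "\n\n";
            else if (m.role == "user") ss << "User: " << m.content << "<end_of_utterance>\n";
            else                       ss << "Assistant: " << m.content << "<end_of_utterance>\n";
        }
        ss << "Assistant:"; // add_ass: cue the model to answer
        std::printf("%s\n", ss.str().c_str());
    }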
llama/llama.cpp/src/llama-chat.h (vendored, 5 changes)

@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,

@@ -41,6 +41,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
llama/llama.cpp/src/llama-context.cpp (vendored, 21 changes)

@@ -114,7 +114,7 @@ llama_context::llama_context(
     }

     if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }

@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
         float freq_base,
-        float freq_scale,
-        ggml_backend_buffer * bbuf) const {
+        float freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

     const auto & yarn_ext_factor = cparams.yarn_ext_factor;

@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);

-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);

@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);

-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

         ggml_build_forward_expand(gf, cur);
     }

@@ -1510,8 +1499,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);

-    ggml_backend_buffer_clear(buf_output.get(), 0);
-
     this->n_outputs = 0;
     this->n_outputs_max = n_outputs_max;
llama/llama.cpp/src/llama-context.h (vendored, 3 changes)

@@ -172,8 +172,7 @@ private:
             ggml_tensor * shift,
             ggml_tensor * factors,
             float freq_base,
-            float freq_scale,
-            ggml_backend_buffer * bbuf) const;
+            float freq_scale) const;

     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,
llama/llama.cpp/src/llama-graph.cpp (vendored, 58 changes)

@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;

-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the 3 first dims are the same, and 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }

@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
                 ) * f_attn_temp_scale + 1.0;
         }

-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }

@@ -598,7 +612,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }

-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }

@@ -809,6 +823,10 @@ ggml_tensor * llm_graph_context::build_ffn(

     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }

     if (down_b) {

@@ -916,28 +934,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);

-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }

     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
             {
-                gate = ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }

-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }

-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

     if (!weight_before_ffn) {

@@ -1020,11 +1045,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());

     auto & cur = inp->pos;

-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);

     res->add_input(std::move(inp));

@@ -1033,11 +1058,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);

     auto & cur = inp->attn_scale;

-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this need to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);

     res->add_input(std::move(inp));
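The set_input change above packs 1D text positions into the four M-RoPE position sections (the first three repeat the same position, the fourth is zeroed). A standalone sketch of the resulting layout:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_tokens = 4;
        const int n_pos_per_embd = 4;                 // qwen2vl-style M-RoPE
        std::vector<int32_t> pos1d = {10, 11, 12, 13};

        std::vector<int32_t> pos(n_tokens * n_pos_per_embd);
        for (int i = 0; i < n_tokens; ++i) {
            pos[                i] = pos1d[i];
            pos[    n_tokens + i] = pos1d[i];
            pos[2 * n_tokens + i] = pos1d[i];
            pos[3 * n_tokens + i] = 0;                // 4th section is all zeros
        }
        for (int d = 0; d < n_pos_per_embd; ++d) {    // print one section per row
            for (int i = 0; i < n_tokens; ++i) std::printf("%d ", pos[d * n_tokens + i]);
            std::printf("\n");
        }
    }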
llama/llama.cpp/src/llama-graph.h (vendored, 12 changes)

@@ -91,29 +91,27 @@ public:

 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
     virtual ~llm_graph_input_pos() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const int64_t n_pos_per_token = 1;
+    const int64_t n_pos_per_embd = 1;
 };

 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
     virtual ~llm_graph_input_attn_temp() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

-    const int64_t n_pos_per_token = 1;
-
     const uint32_t n_attn_temp_floor_scale;
     const float f_attn_temp_scale;
 };

@@ -430,7 +428,7 @@ struct llm_graph_context {

     llm_graph_context(const llm_graph_params & params);

-    int64_t n_pos_per_token() const;
+    int64_t n_pos_per_embd() const;

     void cb(ggml_tensor * cur, const char * name, int il) const;
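The build_ffn change in llama-graph.cpp above forces F32 accumulation for GLM4's down projection. A standalone illustration of why accumulator precision matters, shown with float vs double (an fp16 accumulator hits the same wall far sooner, around 2^11):

    #include <cstdio>

    int main() {
        float  acc_f = 0.0f;
        double acc_d = 0.0;
        for (int i = 0; i < (1 << 25); ++i) {
            acc_f += 1.0f; // absorbed entirely once the sum reaches 2^24
            acc_d += 1.0;
        }
        std::printf("float  accumulator: %.0f\n", acc_f); // stalls at 16777216
        std::printf("double accumulator: %.0f\n", acc_d); // 33554432
    }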
llama/llama.cpp/src/llama-hparams.h (vendored, 1 change)

@@ -72,6 +72,7 @@ struct llama_hparams {
     float    expert_weights_scale = 0.0;
     bool     expert_weights_norm  = false;
     uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+    uint32_t moe_every_n_layers   = 0;

     float f_norm_eps;
     float f_norm_rms_eps;
llama/llama.cpp/src/llama-model.cpp (vendored, 58 changes)

@@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
         case LLM_TYPE_0_5B: return "0.5B";
+        case LLM_TYPE_0_6B: return "0.6B";
         case LLM_TYPE_1B: return "1B";
         case LLM_TYPE_1_3B: return "1.3B";
         case LLM_TYPE_1_4B: return "1.4B";
         case LLM_TYPE_1_5B: return "1.5B";
         case LLM_TYPE_1_6B: return "1.6B";
+        case LLM_TYPE_1_7B: return "1.7B";
         case LLM_TYPE_1_8B: return "1.8B";
         case LLM_TYPE_2B: return "2B";
         case LLM_TYPE_2_8B: return "2.8B";

@@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_15B: return "15B";
         case LLM_TYPE_16B: return "16B";
         case LLM_TYPE_20B: return "20B";
+        case LLM_TYPE_27B: return "27B";
         case LLM_TYPE_30B: return "30B";
         case LLM_TYPE_32B: return "32B";
         case LLM_TYPE_34B: return "34B";

@@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_65B: return "65B";
         case LLM_TYPE_70B: return "70B";
         case LLM_TYPE_236B: return "236B";
+        case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";

@@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16x3_8B: return "16x3.8B";
         case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
         case LLM_TYPE_57B_A14B: return "57B.A14B";
-        case LLM_TYPE_27B: return "27B";
-        case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+        case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_235B_A22B: return "235B.A22B";
         default: return "?B";
     }
 }

@@ -709,10 +713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);

                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                     type = LLM_TYPE_137M;

@@ -805,6 +811,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;

@@ -814,6 +824,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;

@@ -2133,6 +2145,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                 type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);

@@ -2166,21 +2179,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                 }

+                if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+                }
+
                 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                 layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                 layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

+                if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
+                    layer.bo            = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+                    layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff, n_expert}, 0);
+                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                    layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                } else {
                     layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                     layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);

-                    if (arch == LLM_ARCH_BERT) {
+                    if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
                         layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
                         layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
                         layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
                     } else {
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                     }
+                }

                 layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
                 layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);

@@ -6074,6 +6098,11 @@ struct llm_build_bert : public llm_graph_context {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

+                if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+
                 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

@@ -6126,13 +6155,29 @@ struct llm_build_bert : public llm_graph_context {
             cb(ffn_inp, "ffn_inp", il);

             // feed-forward network
-            if (model.arch == LLM_ARCH_BERT) {
+            if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+                // MoE branch
+                cur = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        nullptr,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        hparams.n_expert,
+                        hparams.n_expert_used,
+                        LLM_FFN_GELU,
+                        false, false,
+                        0.0f,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+                cb(cur, "ffn_moe_out", il);
+            } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
                 cur = build_ffn(cur,
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                         NULL, NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, il);
                 cb(cur, "ffn_out", il);
             } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
                 cur = build_ffn(cur,
                         model.layers[il].ffn_up, NULL, NULL,

@@ -6140,6 +6185,7 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
             } else {
                 cur = build_ffn(cur,
                         model.layers[il].ffn_up, NULL, NULL,

@@ -6147,8 +6193,8 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, il);
-            }
                 cb(cur, "ffn_out", il);
+            }

             // attentions bypass the intermediate layer
             cur = ggml_add(ctx0, cur, ffn_inp);

@@ -13349,6 +13395,7 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 llm = std::make_unique<llm_build_bert>(*this, params, gf);
             } break;

@@ -13714,6 +13761,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_QWEN:
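The nomic-bert-moe loader above places expert tensors only on layers where i % moe_every_n_layers == 1, and the graph builder selects the MoE branch with the same test. A quick standalone check of which layers that selects:

    #include <cstdio>

    int main() {
        const unsigned n_layer = 12, moe_every_n_layers = 2;
        for (unsigned i = 0; i < n_layer; ++i) {
            bool is_moe = moe_every_n_layers > 0 && i % moe_every_n_layers == 1;
            std::printf("layer %2u: %s\n", i, is_moe ? "moe ffn" : "dense ffn");
        }
    }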
llama/llama.cpp/src/llama-model.h (vendored, 8 changes)

@@ -40,11 +40,13 @@ enum llm_type {
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,

@@ -64,6 +66,7 @@ enum llm_type {
     LLM_TYPE_16B,
     LLM_TYPE_20B,
     LLM_TYPE_22B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,

@@ -73,6 +76,7 @@ enum llm_type {
     LLM_TYPE_70B,
     LLM_TYPE_90B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,

@@ -87,10 +91,10 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
-    LLM_TYPE_290B,
     LLM_TYPE_17B_16E, // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };

 struct llama_layer_posnet {
llama/llama.cpp/src/llama-sampling.cpp (vendored, 3 changes)

@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }

     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }

     k = std::min(k, (int) cur_p->size);

@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
+        cur_p->sorted = true;
     }

     cur_p->size = k;
 }
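With the fix above, k <= 0 now leaves the candidate array untouched instead of re-sorting the full vocabulary. A standalone sketch of the top-k cut (plain logits vector instead of the real llama_token_data_array):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    static void top_k(std::vector<float> & logits, int k) {
        if (k <= 0) return;                   // noop: keep the full distribution
        k = std::min(k, (int) logits.size());
        std::partial_sort(logits.begin(), logits.begin() + k, logits.end(),
                          [](float a, float b) { return a > b; });
        logits.resize(k);                     // keep the k largest logits
    }

    int main() {
        std::vector<float> logits = {0.1f, 2.5f, 1.7f, -0.3f, 0.9f};
        top_k(logits, 3);
        for (float v : logits) std::printf("%.1f ", v); // 2.5 1.7 0.9
        std::printf("\n");
    }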
llama/llama.cpp/src/llama-vocab.cpp (vendored, 3 changes)

@@ -1497,7 +1497,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama3"   ||
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
-                tokenizer_pre == "falcon3") {
+                tokenizer_pre == "falcon3"  ||
+                tokenizer_pre == "pixtral") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             ignore_merges = true;
             add_bos = true;
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
 
  /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index a7febef7..31750b6f 100644
+index 9fb2134f..04ce764e 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {

@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
 
  static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 266d8af4..12886cd3 100644
+index d92392ed..425524d0 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
  }
 
  free(ctx);

@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
 
  static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index a0667b7d..bd83adc5 100644
+index 140a775f..e33c4ba0 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
  GGML_ASSERT(status);
  delete ctx;

@@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644
 
  static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 1de34c96..4600f61e 100644
+index 66b6f2cc..e3e6deae 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
  ggml_sycl_set_device(ctx->device);
 
  delete ctx;

@@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
  static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
  delete ctx;

@@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644
  }
 
  static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
 
  static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_sycl_host_free(buffer->context);

@@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644
 
  static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 39f3cd34..c569a8a5 100644
+index c0bdb9e1..03d03064 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
  ggml_vk_destroy_buffer(ctx->dev_buffer);
  delete ctx;

@@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644
  }
 
  static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
  static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
  ggml_vk_host_free(vk_instance.devices[0], buffer->context);
@@ -10,7 +10,7 @@ logs instead of throwing an error
 1 file changed, 3 insertions(+), 11 deletions(-)
 
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 48060517..a35b498c 100644
+index 50ded286..a9ee9f03 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

@@ -31,7 +31,7 @@ index 48060517..a35b498c 100644
  pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  } else if (
  tokenizer_pre == "llama3" ||
-@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
  clean_spaces = false;
  } else {
@@ -11,10 +11,10 @@ instead of forcing one or the error
 1 file changed, 3 insertions(+), 3 deletions(-)
 
 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 983385f8..32f59819 100644
+index 5a2eef9b..9c1fe93f 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
  int64_t n_outputs_all = 0;
 
  // count outputs

@@ -23,7 +23,7 @@ index 983385f8..32f59819 100644
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
  n_outputs_all += batch.logits[i] != 0;
  }
-@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
  // ggml_graph_dump_dot(gf, NULL, "llama.dot");
  //}
 

@@ -32,7 +32,7 @@ index 983385f8..32f59819 100644
  auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
 
  if (t_embd && res->get_embd_pooled()) {
-@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
  const auto n_embd = hparams.n_embd;
 
  // TODO: use a per-batch flag for logits presence instead
@ -10,12 +10,12 @@ filesystems for paths that include wide characters
|
|||
1 file changed, 39 insertions(+)
|
||||
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index 75970615..d57b4bd6 100644
|
||||
index ad3e7df1..b3218c78 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -29,6 +29,19 @@
|
||||
#include <limits>
|
||||
@@ -30,6 +30,19 @@
|
||||
#include <array>
|
||||
#include <numeric>
|
||||
|
||||
+#if defined(_WIN32)
|
||||
+#define WIN32_LEAN_AND_MEAN
|
||||
|
@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644
|
|||
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
|
||||
|
||||
//#define CLIP_DEBUG_FUNCTIONS
|
||||
@@ -1430,7 +1443,29 @@ struct clip_model_loader {
|
||||
@@ -1971,7 +1984,29 @@ struct clip_model_loader {
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
|
@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644
|
|||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
@@ -1457,7 +1492,11 @@ struct clip_model_loader {
|
||||
@@ -1998,7 +2033,11 @@ struct clip_model_loader {
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 62e1480b..f754bc8f 100644
index f2bc8ca7..5ab3f572 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -69,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
@@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1502,6 +1504,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1680,6 +1700,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 98ca00a1..439aaeab 100644
index 41a023da..525c1b7d 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -72,6 +72,7 @@ enum llm_arch {
@@ -73,6 +73,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
@@ -144,6 +145,7 @@ enum llm_kv {
@@ -146,6 +147,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
@@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -344,6 +346,7 @@ enum llm_tensor {
@@ -346,6 +348,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644
if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 80fcd65d..6e278945 100644
index 7ee6a5b7..48dce407 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -55,6 +55,8 @@ struct llama_hparams {
@@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -153,6 +155,9 @@ struct llama_hparams {
@@ -154,6 +156,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 6b7bfecf..aba42819 100644
index 822e2bb2..572378c9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
}
};
@@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break;
@@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
@@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644
return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h
index fd82d106..5865d5e9 100644
index 95eca002..856e6042 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -62,6 +62,7 @@ enum llm_type {
@@ -64,6 +64,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -307,6 +308,8 @@ struct llama_layer {
@@ -311,6 +312,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

@@ -5,7 +5,6 @@ Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture
---
examples/llava/gemma3-cli.cpp | 3 +-
examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
@@ -25,34 +24,13 @@ adds support for the llama 3.2 vision architecture
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
20 files changed, 475 insertions(+), 22 deletions(-)
19 files changed, 473 insertions(+), 21 deletions(-)
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 3d566475..654d1358 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -118,6 +118,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 03a22cbb..5eb40bcd 100644
index c00d16ae..bab027b5 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -456,7 +456,7 @@ struct llava_embd_batch {
@@ -457,7 +457,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
@@ -61,7 +39,7 @@ index 03a22cbb..5eb40bcd 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -468,6 +468,7 @@ struct llava_embd_batch {
@@ -469,6 +469,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -69,7 +47,7 @@ index 03a22cbb..5eb40bcd 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
@@ -79,19 +57,19 @@ index 03a22cbb..5eb40bcd 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 3fd5bebc..f0cec596 100644
index 7081fd73..c14ac501 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -233,7 +233,7 @@ struct decode_embd_batch {
@@ -476,7 +476,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -245,6 +245,7 @@ struct decode_embd_batch {
@@ -487,6 +487,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -99,16 +77,16 @@ index 3fd5bebc..f0cec596 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_img.batch);
if (ret != 0) {
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp
@@ -127,10 +105,10 @@ index 405d8e31..82ae1b5b 100644
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
index 5657fbf0..f91896e4 100644
index 06c56395..f1628e88 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -255,6 +255,7 @@ extern "C" {
@@ -256,6 +256,7 @@ extern "C" {
llama_token * token;
float * embd;
@@ -138,7 +116,7 @@ index 5657fbf0..f91896e4 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -357,6 +358,7 @@ extern "C" {
@@ -358,6 +359,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
@@ -146,7 +124,7 @@ index 5657fbf0..f91896e4 100644
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@@ -458,6 +460,10 @@ extern "C" {
@@ -459,6 +461,10 @@ extern "C" {
struct llama_context_params params),
"use llama_init_from_model instead");
@@ -158,7 +136,7 @@ index 5657fbf0..f91896e4 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index f754bc8f..0568565f 100644
index 5ab3f572..eb7b5325 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
@@ -169,7 +147,7 @@ index f754bc8f..0568565f 100644
{ LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
@@ -177,7 +155,7 @@ index f754bc8f..0568565f 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -271,6 +273,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
@@ -218,7 +196,7 @@ index f754bc8f..0568565f 100644
{
LLM_ARCH_DECI,
{
@@ -1681,6 +1717,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -234,7 +212,7 @@ index f754bc8f..0568565f 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 439aaeab..6a989034 100644
index 525c1b7d..bc8a4f0b 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -11,6 +11,7 @@
@@ -245,7 +223,7 @@ index 439aaeab..6a989034 100644
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
@@ -146,6 +147,7 @@ enum llm_kv {
@@ -148,6 +149,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
@@ -253,7 +231,7 @@ index 439aaeab..6a989034 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -347,6 +349,14 @@ enum llm_tensor {
@@ -349,6 +351,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
@@ -297,10 +275,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 32f59819..0343ba8a 100644
index 9c1fe93f..cd06ad91 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) {
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}
@@ -309,7 +287,7 @@ index 32f59819..0343ba8a 100644
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) {
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}
@@ -320,7 +298,7 @@ index 32f59819..0343ba8a 100644
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) {
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd;
@@ -329,7 +307,7 @@ index 32f59819..0343ba8a 100644
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch;
@@ -341,7 +319,7 @@ index 32f59819..0343ba8a 100644
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all;
@@ -350,7 +328,7 @@ index 32f59819..0343ba8a 100644
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);
@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
@@ -364,7 +342,7 @@ index 32f59819..0343ba8a 100644
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
@@ -373,7 +351,7 @@ index 32f59819..0343ba8a 100644
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
@@ -382,7 +360,7 @@ index 32f59819..0343ba8a 100644
io.write(&logits_size, sizeof(logits_size));
@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() {
@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
@@ -390,7 +368,7 @@ index 32f59819..0343ba8a 100644
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
}
@@ -402,7 +380,7 @@ index 32f59819..0343ba8a 100644
ctx->synchronize();
}
diff --git a/src/llama-context.h b/src/llama-context.h
index 04facb54..baa03276 100644
index 5457f077..a50c4afa 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -65,6 +65,7 @@ struct llama_context {
@@ -426,10 +404,10 @@ index 30e550f0..85ad91b9 100644
enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index a85e9728..d740c120 100644
index fabb9ca2..b67216a4 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
@@ -442,7 +420,7 @@ index a85e9728..d740c120 100644
//
// llm_graph_context
//
@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}
@@ -469,7 +447,7 @@ index a85e9728..d740c120 100644
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index d192dc14..260a2af2 100644
index d0c8d321..0fe18150 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
@@ -480,7 +458,7 @@ index d192dc14..260a2af2 100644
};
class llm_graph_input_pos : public llm_graph_input_i {
@@ -285,6 +286,16 @@ public:
@@ -283,6 +284,16 @@ public:
const llama_cross * cross = nullptr;
};
@@ -497,7 +475,7 @@ index d192dc14..260a2af2 100644
//
// llm_graph_result
//
@@ -493,6 +504,7 @@ struct llm_graph_context {
@@ -491,6 +502,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const;
@@ -518,7 +496,7 @@ index 8a667960..6a02de03 100644
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 6e278945..c8a34d52 100644
index 48dce407..b6fc7e6d 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@
@@ -546,7 +524,7 @@ index 6e278945..c8a34d52 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -158,6 +162,9 @@ struct llama_hparams {
@@ -159,6 +163,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
@@ -593,10 +571,10 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index aba42819..d051696c 100644
index 572378c9..9d099f11 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
@@ -604,7 +582,7 @@ index aba42819..d051696c 100644
// everything past this point is not vocab-related
if (hparams.vocab_only) {
@@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -612,7 +590,7 @@ index aba42819..d051696c 100644
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
@@ -624,7 +602,7 @@ index aba42819..d051696c 100644
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
@@ -633,7 +611,7 @@ index aba42819..d051696c 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false;
}
} break;
@@ -650,7 +628,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -659,7 +637,7 @@ index aba42819..d051696c 100644
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
@@ -712,7 +690,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
}
};
@@ -959,7 +937,7 @@ index aba42819..d051696c 100644
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
@@ -970,7 +948,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
@@ -979,7 +957,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 5865d5e9..72bab5be 100644
index 856e6042..6be91282 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
@@ -990,15 +968,15 @@ index 5865d5e9..72bab5be 100644
struct llama_cparams;
struct llama_ubatch;
@@ -70,6 +71,7 @@ enum llm_type {
@@ -73,6 +74,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -310,6 +312,16 @@ struct llama_layer {
@@ -314,6 +316,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;

@@ -18,10 +18,10 @@ adds the unpad operator to GGML
10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 8fcc16df..d19fc167 100644
index 1b8603e7..53ef31b2 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -488,6 +488,7 @@ extern "C" {
@@ -489,6 +489,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
@@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1757,6 +1758,15 @@ extern "C" {
@@ -1777,6 +1778,15 @@ extern "C" {
int p0,
int p1);
@@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644
// timesteps: [N,]
// return: [N, dim]
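Note: the added declaration lines are collapsed in this view. As a rough sketch of the intended usage, assuming ggml_unpad mirrors the two-dimension form of ggml_pad (the signature is an assumption, it is not shown in this hunk):

    // hypothetical round trip: pad two dims, then trim the same amounts back off
    struct ggml_tensor * t      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 30, 40);
    struct ggml_tensor * padded = ggml_pad(ctx, t, 2, 2, 0, 0);    // 32 x 42
    struct ggml_tensor * back   = ggml_unpad(ctx, padded, 2, 2);   // 30 x 40 again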
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 50400328..432942bf 100644
index 64405449..34624cca 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
@@ -60,7 +60,7 @@ index 50400328..432942bf 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -69,10 +69,10 @@ index 50400328..432942bf 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 6050147b..66b8da68 100644
index 7413192b..becdae07 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}
@@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644
static void ggml_compute_forward_arange_f32(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 410a3720..3eca1cf8 100644
index dc081b9e..a7125555 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 31750b6f..0fef9522 100644
index 04ce764e..491acccb 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
@@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 12886cd3..b2e95a66 100644
index 425524d0..112abef6 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
@@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU:
@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node(
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
@@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8d6e99e6..71f0f97f 100644
index 9f4147e9..6ceb3cef 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644
device char * dst,
constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 950772c7..2276b631 100644
index 7654ae17..3c57aff8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
@@ -365,16 +365,16 @@ index 950772c7..2276b631 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
@@ -382,16 +382,16 @@ index 950772c7..2276b631 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}

@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a35b498c..032019c9 100644
index a9ee9f03..1306864e 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {

@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 90679822..56043678 100644
index 5b3059c2..656b3eca 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -346,7 +346,7 @@ private:
@@ -349,7 +349,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;

@@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
4 files changed, 51 insertions(+), 106 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 0343ba8a..4b3e6a83 100644
index cd06ad91..77177c5e 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_context * ctx0,
@@ -41,7 +41,7 @@ index 0343ba8a..4b3e6a83 100644
#if 0
// CPU defrag
//
@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
@@ -79,7 +79,7 @@ index 0343ba8a..4b3e6a83 100644
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
@@ -122,7 +122,7 @@ index 0343ba8a..4b3e6a83 100644
#endif
return res;
@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
@@ -131,7 +131,7 @@ index 0343ba8a..4b3e6a83 100644
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
@@ -140,7 +140,7 @@ index 0343ba8a..4b3e6a83 100644
}
{
@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
@@ -202,7 +202,7 @@ index 0343ba8a..4b3e6a83 100644
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
@@ -219,7 +219,7 @@ index 0343ba8a..4b3e6a83 100644
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index baa03276..a59ff8fd 100644
index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
@@ -230,7 +230,7 @@ index baa03276..a59ff8fd 100644
#include "ggml-cpp.h"
@@ -180,7 +181,8 @@ private:
@@ -179,7 +180,8 @@ private:
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,

@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index f00700da..91d6a7d5 100644
index 43d9fc4f..4c0d3824 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,11 +19,11 @@ index f00700da..91d6a7d5 100644
endfunction()
ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
+ add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42)
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)

@@ -1,6 +1,6 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:33:01 -0700
Date: Thu, 1 May 2025 15:05:08 -0700
Subject: [PATCH] remove amx
disable amx as it reduces performance on some systems
@@ -9,16 +9,16 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 91d6a7d5..d6b393a2 100644
index 4c0d3824..79c26312 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
- if (NOT MSVC)
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")

@@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 032019c9..ba37df35 100644
index 1306864e..d6515ff6 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 432942bf..6d4abe4c 100644
index 34624cca..59bd3c62 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index 432942bf..6d4abe4c 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);

@@ -13,10 +13,10 @@ models not supported in llama.cpp
4 files changed, 24 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 0568565f..dd01df60 100644
index eb7b5325..df42d1a5 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -74,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -24,7 +24,7 @@ index 0568565f..dd01df60 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -1586,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1606,6 +1607,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
@@ -48,10 +48,10 @@ index 0568565f..dd01df60 100644
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 6a989034..b6227eeb 100644
index bc8a4f0b..bda9d071 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -75,6 +75,7 @@ enum llm_arch {
@@ -76,6 +76,7 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
@@ -60,10 +60,10 @@ index 6a989034..b6227eeb 100644
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d051696c..c8374159 100644
index 9d099f11..ef70486d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1437,6 +1437,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -71,7 +71,7 @@ index d051696c..c8374159 100644
default: throw std::runtime_error("unsupported model architecture");
}
@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13751,6 +13752,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:

|
@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
|
|||
const char * grammar_root,
|
||||
bool lazy,
|
||||
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
|
||||
index d1497985..b1a9dca3 100644
|
||||
index c0a5f934..75731053 100644
|
||||
--- a/src/llama-sampling.cpp
|
||||
+++ b/src/llama-sampling.cpp
|
||||
@@ -1465,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
|
||||
}
|
||||
|
||||
|
@ -196,7 +196,7 @@ index d1497985..b1a9dca3 100644
|
|||
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
|
||||
|
||||
@@ -1547,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
/* .vocab = */ vocab,
|
||||
/* .grammar_str = */ grammar_str,
|
||||
/* .grammar_root = */ grammar_root,
|
ml/backend/ggml/ggml/include/ggml-cpu.h (vendored, 5 lines changed)
@@ -133,6 +133,11 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
#ifdef __cplusplus
}
#endif
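The four conversion helpers added above are plain arrays-in, arrays-out routines; a minimal sketch of a round trip, assuming only what the declarations show:

    #include "ggml-cpu.h"

    void fp16_roundtrip(void) {
        float       src[4] = {0.5f, 1.0f, -2.0f, 3.25f};
        ggml_fp16_t tmp[4];
        float       dst[4];

        ggml_cpu_fp32_to_fp16(src, tmp, 4); // narrow to half precision
        ggml_cpu_fp16_to_fp32(tmp, dst, 4); // widen back, up to fp16 rounding
    }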
ml/backend/ggml/ggml/include/ggml-rpc.h (vendored, 2 lines changed)
@@ -7,7 +7,7 @@
extern "C" {
#endif
#define RPC_PROTO_MAJOR_VERSION 1
#define RPC_PROTO_MAJOR_VERSION 2
#define RPC_PROTO_MINOR_VERSION 0
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16
ml/backend/ggml/ggml/include/ggml.h (vendored, 26 lines changed)
@@ -393,8 +393,8 @@ extern "C" {
// precision
enum ggml_prec {
GGML_PREC_DEFAULT,
GGML_PREC_F32,
GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
GGML_PREC_F32 = 10,
};
// model file types
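Pinning the enum values matters because the chosen precision is written into ggml_tensor.op_params. A sketch of how a caller typically opts in per tensor (ggml_mul_mat_set_prec is the existing setter in ggml.h; usage shown under that assumption):

    // ask for f32 accumulation on a matmul that is sensitive to f16 error
    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
    ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // stores 10, not 0, in op_params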
@@ -481,6 +481,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
@@ -678,6 +679,9 @@ extern "C" {
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@@ -1661,7 +1665,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// depthwise
// depthwise (via im2col and mul_mat)
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
@@ -1673,6 +1677,22 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
// Depthwise 2D convolution
// may be faster than ggml_conv_2d_dw, but not available in all backends
// a: KW KH 1 C convolution kernel
// b: W H C N input data
// res: W_out H_out C N
GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1);
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
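A minimal sketch of calling the new direct depthwise entry point, using only the shapes documented in the comments above (kernel KW x KH x 1 x C, input W x H x C x N):

    // depthwise 3x3, stride 1, pad 1, dilation 1 over a 64x64, 8-channel input
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 1, 8);
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 8, 1);
    struct ggml_tensor * out    = ggml_conv_2d_dw_direct(ctx, kernel, input,
                                                         1, 1,   // stride0, stride1
                                                         1, 1,   // pad0, pad1
                                                         1, 1);  // dilation0, dilation1
    // out is W_out x H_out x C x N, here 64 x 64 x 8 x 1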
ml/backend/ggml/ggml/src/CMakeLists.txt (vendored, 13 lines changed)
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
set(GGML_CPU_TAG_NAME ${tag_name})
# other: OPENMP LLAMAFILE CPU_HBM
foreach (feat NATIVE
SSE42
AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
@@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42)
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
endif()
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt (vendored, 12 lines changed)
@@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
list(APPEND ARCH_DEFINITIONS GGML_AVX)
else ()
elseif (GGML_SSE42)
list(APPEND ARCH_FLAGS /arch:SSE4.2)
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
endif()
@@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
else ()
if (GGML_SSE42)
list(APPEND ARCH_FLAGS -msse4.2)
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
endif()
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
list(APPEND ARCH_DEFINITIONS GGML_F16C)
@@ -350,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# TODO: Separation to determine activation of VX/VXE/VXE2
if (${S390X_M} MATCHES "8561|8562")
message(STATUS "z15 target")
list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
list(APPEND ARCH_FLAGS -march=z15)
elseif (${S390X_M} MATCHES "3931")
message(STATUS "z16 target")
list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
list(APPEND ARCH_FLAGS -march=z16)
elseif (${S390X_M} MATCHES "9175|9176")
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
message(STATUS "z17 target")
list(APPEND ARCH_FLAGS -march=z17)
else()
message(STATUS "Unknown target")
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
@@ -263,7 +263,7 @@ void test_x86_is() {
 static int ggml_backend_cpu_x86_score() {
     // FIXME: this does not check for OS support

-    int score = 0;
+    int score = 1;
     cpuid_x86 is;

 #ifdef GGML_FMA
96  ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c  vendored
@@ -217,7 +217,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows = 1,
     },
     [GGML_TYPE_F16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
         .nrows = 1,
@@ -358,7 +358,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
@@ -1934,6 +1934,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D_DW:
+            {
+                ggml_compute_forward_conv_2d_dw(params, tensor);
+            } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
                 ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D_DW:
        case GGML_OP_CONV_TRANSPOSE_1D:
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
@@ -3172,6 +3177,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }

+void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
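Editor's note: a hedged usage sketch (not from this diff) of the new CPU converters above. It assumes the declarations are exported via ggml-cpu.h, as the type-traits changes earlier suggest; the test values are illustrative.

// Sketch: round-trip a buffer through the new converters and measure the
// worst-case absolute error introduced by fp16 truncation.
#include <math.h>
#include <stdio.h>
#include "ggml.h"       // ggml_fp16_t
#include "ggml-cpu.h"   // assumed to declare ggml_cpu_fp32_to_fp16 / ggml_cpu_fp16_to_fp32

int main(void) {
    float       src[256], out[256];
    ggml_fp16_t tmp[256];
    for (int i = 0; i < 256; ++i) {
        src[i] = sinf(0.1f * (float) i);
    }
    ggml_cpu_fp32_to_fp16(src, tmp, 256);  // vectorized via F16C/AVX512 when available
    ggml_cpu_fp16_to_fp32(tmp, out, 256);
    float max_err = 0.0f;
    for (int i = 0; i < 256; ++i) {
        float e = fabsf(out[i] - src[i]);
        if (e > max_err) max_err = e;
    }
    printf("max fp16 round-trip error: %g\n", max_err);  // roughly 1e-3 for values in [-1, 1]
    return 0;
}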
176  ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp  vendored
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(

             GGML_ASSERT(i01 >= 0 && i01 < ne01);

-            ggml_fp16_to_fp32_row(
+            ggml_cpu_fp16_to_fp32(
                     (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                                  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
         }
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(

             GGML_ASSERT(i01 >= 0 && i01 < ne01);

-            ggml_bf16_to_fp32_row(
+            ggml_cpu_bf16_to_fp32(
                     (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                                  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
         }
@@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d(
     }
 }

+// ggml_compute_forward_conv_2d_dw
+
+struct ggml_conv_2d_dw_params {
+    int64_t channels;
+    int64_t batch;
+    int64_t src_w;
+    int64_t src_h;
+    int64_t dst_w;
+    int64_t dst_h;
+    int64_t knl_w;
+    int64_t knl_h;
+    int stride_x;
+    int stride_y;
+    int pad_x;
+    int pad_y;
+    int dilation_x;
+    int dilation_y;
+};
+
+static void ggml_compute_forward_conv_2d_dw_cwhn(
+        const ggml_compute_params * params,
+        const ggml_tensor * src,
+        const ggml_tensor * kernel,
+        ggml_tensor * dst,
+        const ggml_conv_2d_dw_params & p) {
+
+    const int64_t c = p.channels;
+    const float * knl_data = (const float *)kernel->data;
+
+    const int64_t rows_total = p.dst_h * p.batch;
+    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+    const int64_t row_start = params->ith * rows_per_thread;
+    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
+#ifdef GGML_SIMD
+    const int64_t pkg_size = GGML_F32_EPR;
+    const int64_t pkg_count = c / pkg_size;
+    const int64_t c_pkg_end = pkg_count * pkg_size;
+#else
+    const int64_t c_pkg_end = 0;
+#endif
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t dst_y = row % p.dst_h;
+        const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
+        for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+            float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
+            const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
+            const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
+
+#ifdef GGML_SIMD
+            // Vectorized loop
+            for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
+                GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
+                        GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
+                        sum = GGML_F32_VEC_FMA(sum, k, s);
+                    }
+                }
+                GGML_F32_VEC_STORE(dst_data + c_i, sum);
+            }
+#endif
+            // Scalar loop
+            for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
+                             * src_data[(src_y * p.src_w + src_x) * c + c_i];
+                    }
+                }
+                dst_data[c_i] = sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_2d_dw_whcn(
+        const ggml_compute_params * params,
+        const ggml_tensor * src,
+        const ggml_tensor * kernel,
+        ggml_tensor * dst,
+        const ggml_conv_2d_dw_params & p) {
+
+    const int64_t n = p.channels * p.batch;
+    const int64_t per_thread = (n + params->nth - 1) / params->nth;
+    const int64_t start = params->ith * per_thread;
+    const int64_t end = MIN(start + per_thread, n);
+
+    for (int64_t i = start; i < end; ++i) {
+        const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
+        const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
+        float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
+
+        for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
+            for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[knl_y * p.knl_w + knl_x]
+                             * src_data[src_y * p.src_w + src_x];
+                    }
+                }
+                dst_data[dst_y * p.dst_w + dst_x] = sum;
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_conv_2d_dw(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * src = dst->src[1];
+    ggml_conv_2d_dw_params p;
+    p.channels = src->ne[2];
+    p.batch = src->ne[3];
+    p.src_w = src->ne[0];
+    p.src_h = src->ne[1];
+    p.dst_w = dst->ne[0];
+    p.dst_h = dst->ne[1];
+    p.knl_w = kernel->ne[0];
+    p.knl_h = kernel->ne[1];
+    p.stride_x = dst->op_params[0];
+    p.stride_y = dst->op_params[1];
+    p.pad_x = dst->op_params[2];
+    p.pad_y = dst->op_params[3];
+    p.dilation_x = dst->op_params[4];
+    p.dilation_y = dst->op_params[5];
+
+    GGML_ASSERT(kernel->ne[3] == p.channels);
+    GGML_ASSERT(dst->ne[3] == p.batch);
+
+    if (ggml_is_contiguous(src)) {
+        ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
+    } else if (ggml_is_contiguous_channels(src)) {
+        // kernel should also have channels most contiguous in memory
+        GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
+        ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
+    } else {
+        GGML_ABORT("non-contiguous memory layout not supported");
+    }
+}
+
 // ggml_compute_forward_pool_1d_sk_p0

 static void ggml_compute_forward_pool_1d_sk_p0(
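Editor's note: to make the indexing in the hunk above easier to follow, here is a hedged, self-contained scalar reference of the WHCN path: each output element is a dot product between the per-channel kernel and the zero-padded input window. Names are illustrative, not from the diff.

// Hedged reference: scalar WHCN depthwise convolution for one (channel, batch)
// plane, mirroring the loop structure of ggml_compute_forward_conv_2d_dw_whcn.
static void conv_2d_dw_plane_ref(
        const float * src, int src_w, int src_h,
        const float * knl, int knl_w, int knl_h,
        float * dst, int dst_w, int dst_h,
        int stride_x, int stride_y, int pad_x, int pad_y,
        int dilation_x, int dilation_y) {
    for (int dy = 0; dy < dst_h; ++dy) {
        for (int dx = 0; dx < dst_w; ++dx) {
            float sum = 0.0f;
            for (int ky = 0; ky < knl_h; ++ky) {
                const int sy = dy*stride_y + ky*dilation_y - pad_y;
                if (sy < 0 || sy >= src_h) continue;  // implicit zero padding
                for (int kx = 0; kx < knl_w; ++kx) {
                    const int sx = dx*stride_x + kx*dilation_x - pad_x;
                    if (sx < 0 || sx >= src_w) continue;
                    sum += knl[ky*knl_w + kx] * src[sy*src_w + sx];
                }
            }
            dst[dy*dst_w + dx] = sum;
        }
    }
}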
1  ml/backend/ggml/ggml/src/ggml-cpu/ops.h  vendored
@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32_EPR 4

 #define GGML_F32x4              vector float
-#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_ZERO         {0.0f}
 #define GGML_F32x4_SET1         vec_splats
 #define GGML_F32x4_LOAD(p)      vec_xl(0, p)
 #define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
@@ -78,13 +78,13 @@
 // Moore Threads
 #define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)

-#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG  (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

 #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_QY1(cc)      (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
-#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
+#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
 #define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)

 #ifdef __CUDA_ARCH_LIST__
53  ml/backend/ggml/ggml/src/ggml-cuda/convert.cu  vendored
@@ -1,6 +1,8 @@
 #include "convert.cuh"
 #include "dequantize.cuh"

+#include <cstdint>
+
 #define CUDA_Q8_0_NE_ALIGN 2048

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
 }

 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(
+        const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
+        const int64_t s01, const int64_t s02, const int64_t s03) {
+    const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

-    if (i >= k) {
+    if (i00 >= ne00) {
         return;
     }

+    const int64_t i01 = blockIdx.y;
+    const int64_t i02 = blockIdx.z % ne02;
+    const int64_t i03 = blockIdx.z / ne02;
+
     const src_t * x = (const src_t *) vx;

-    y[i] = float(x[i]);
+    const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
+    const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
+    y[iy] = float(x[ix]);
 }

 template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+static void convert_unary_cuda(const void * vx, dst_t * y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
+    const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
+    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
+        (vx, y, ne00, ne01, ne02, s01, s02, s03);
 }

+template <typename src_t, typename dst_t>
+static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
+}
+
 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
+            return convert_unary_cont_cuda<half>;
         default:
             return nullptr;
     }
@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_BF16:
-            return convert_unary_cuda<nv_bfloat16>;
+            return convert_unary_cont_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }
@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
+            return convert_unary_cont_cuda<half>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cont_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }
 }
+
+to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return convert_unary_cuda<float>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cuda<nv_bfloat16>;
+        default:
12  ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh  vendored
@@ -3,7 +3,7 @@
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

 template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);

 typedef to_t_cuda_t<float> to_fp32_cuda_t;
 typedef to_t_cuda_t<half>  to_fp16_cuda_t;
@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);

 to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
+
+// TODO more general support for non-contiguous inputs
+
+template<typename T>
+using to_t_nc_cuda_t = void (*)(const void * x, T * y,
+    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
+    int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
+
+typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
+to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
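Editor's note: the new `to_t_nc_cuda_t` signature above takes element strides rather than byte strides. A hedged sketch of how a caller would derive them from a tensor's `nb[]` byte strides (this mirrors the callers in the diff but the helper itself is illustrative):

// Hedged sketch: element strides for the non-contiguous converters.
// Assumes nb[0] equals the element size, as the callers assert.
#include <cstdint>
#include <cstddef>

struct strides_nc {
    int64_t s01, s02, s03;
};

static strides_nc element_strides(const size_t nb[4], size_t type_size) {
    strides_nc s;
    s.s01 = (int64_t)(nb[1] / type_size);  // elements between rows
    s.s02 = (int64_t)(nb[2] / type_size);  // elements between matrices
    s.s03 = (int64_t)(nb[3] / type_size);  // elements between batches
    return s;
}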
2  ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu  vendored
@@ -639,6 +639,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }
+#else
+    GGML_UNUSED(disable_indirection_for_this_node);
 #endif

 }
181  ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu  vendored
@@ -33,8 +33,8 @@ static __global__ void k_get_rows(
     dfloat2 v;
     dequantize_kernel(src0_row, ib, iqs, v);

-    dst_row[iybs + iqs + 0]        = v.x;
-    dst_row[iybs + iqs + y_offset] = v.y;
+    dst_row[iybs + iqs + 0]        = float(v.x);
+    dst_row[iybs + iqs + y_offset] = float(v.y);
 }

 template<typename src0_t, typename dst_t>
@@ -60,7 +60,7 @@ static __global__ void k_get_rows_float(
     dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
     const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);

-    dst_row[i00] = src0_row[i00];
+    dst_row[i00] = float(src0_row[i00]);
 }

 template<typename grad_t, typename dst_t>
@@ -86,120 +86,159 @@ static __global__ void k_get_rows_back_float(
     dst[dst_row*ncols + col] = sum;
 }

-template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
+template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
+static void get_rows_cuda_q(
+    const void * src0_d, const int32_t * src1_d, dst_t * dst_d,
+    const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+    const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
+    const size_t nb1, const size_t nb2, const size_t nb3,
+    cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
     const dim3 block_nums(block_num_x, ne10, ne11*ne12);

     // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
+    // const size_t s0 = nb0 / sizeof(dst_t);
+    const size_t s1 = nb1 / sizeof(dst_t);
+    const size_t s2 = nb2 / sizeof(dst_t);
+    const size_t s3 = nb3 / sizeof(dst_t);

-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
+    const size_t s10 = nb10 / sizeof(int32_t);
+    const size_t s11 = nb11 / sizeof(int32_t);
+    const size_t s12 = nb12 / sizeof(int32_t);
+    // const size_t s13 = nb13 / sizeof(int32_t);

     GGML_ASSERT(ne00 % 2 == 0);

     k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
-        src0_dd, src1_dd, dst_dd,
+        src0_d, src1_d, dst_d,
         ne00, /*ne01, ne02, ne03,*/
         /*ne10, ne11,*/ ne12, /*ne13,*/
         /* s0,*/ s1, s2, s3,
         /* nb00,*/ nb01, nb02, nb03,
         s10, s11, s12/*, s13*/);
-
-    GGML_UNUSED(dst);
 }

-template<typename src0_t>
+template<typename src0_t, typename dst_t>
 static void get_rows_cuda_float(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(ne13 == 1);
-
+    const src0_t * src0_d, const int32_t * src1_d, dst_t * dst_d,
+    const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+    const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
+    const size_t nb1, const size_t nb2, const size_t nb3,
+    cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
     const dim3 block_nums(block_num_x, ne10, ne11*ne12);

     // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
+    // const size_t s0 = nb0 / sizeof(dst_t);
+    const size_t s1 = nb1 / sizeof(dst_t);
+    const size_t s2 = nb2 / sizeof(dst_t);
+    const size_t s3 = nb3 / sizeof(dst_t);

-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
+    const size_t s10 = nb10 / sizeof(int32_t);
+    const size_t s11 = nb11 / sizeof(int32_t);
+    const size_t s12 = nb12 / sizeof(int32_t);
+    // const size_t s13 = nb13 / sizeof(int32_t);

     k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
-        src0_dd, src1_dd, dst_dd,
+        src0_d, src1_d, dst_d,
         ne00, /*ne01, ne02, ne03,*/
         /*ne10, ne11,*/ ne12, /*ne13,*/
         /* s0,*/ s1, s2, s3,
         /* nb00,*/ nb01, nb02, nb03,
         s10, s11, s12/*, s13*/);
-
-    GGML_UNUSED(dst);
 }

+template <typename dst_t>
+static void ggml_cuda_get_rows_switch_src0_type(
+    const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d,
+    const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+    const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
+    const size_t nb1, const size_t nb2, const size_t nb3,
+    cudaStream_t stream) {
+    switch (src0_type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda_float((const half *) src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda_float((const float *) src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_BF16:
+            get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda_q<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda_q<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda_q<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda_q<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda_q<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type));
+            break;
+    }
+}
+
+void get_rows_cuda(
+    const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
+    int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
+    int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12,
+    size_t nb1, size_t nb2, size_t nb3,
+    cudaStream_t stream) {
+    switch (dst_type) {
+        case GGML_TYPE_F32:
+            ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_F16:
+            ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_BF16:
+            ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (nv_bfloat16 *) dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        default:
+            GGML_ABORT("%s: unsupported dst type: %s\n", __func__, ggml_type_name(dst_type));
+            break;
+    }
+}
+
 void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];

-    const void * src0_d = (const void *) src0->data;
-    const int32_t * src1_d = (const int32_t *) src1->data;
-    float * dst_d = (float *) dst->data;
-
     cudaStream_t stream = ctx.stream();

     GGML_TENSOR_BINARY_OP_LOCALS

     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
     GGML_ASSERT(ne13 == 1);

     GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
     GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
     GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));

-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream);
-            break;
-        case GGML_TYPE_F32:
-            get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
-            break;
-        default:
-            // TODO: k-quants
-            GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            break;
-    }
+    get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type,
+        ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
 }

 void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -3,6 +3,13 @@
 #define CUDA_GET_ROWS_BLOCK_SIZE 256
 #define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256

+void get_rows_cuda(
+    const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
+    int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
+    int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12,
+    size_t nb1, size_t nb2, size_t nb3,
+    cudaStream_t stream);
+
 void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
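Editor's note: the refactor above moves get_rows to a raw-pointer, type-dispatched interface usable from other kernels (notably mul_mat_id below). For orientation, a hedged CPU reference of the operation's core semantics on contiguous float data; names are illustrative.

// Hedged reference: row r of the output is row ids[r] of the source matrix.
#include <cstdint>
#include <cstring>

static void get_rows_ref(const float * src0, int64_t ne00,      // ne00 = row length
                         const int32_t * ids, int64_t n_rows,   // gather indices
                         float * dst) {
    for (int64_t r = 0; r < n_rows; ++r) {
        std::memcpy(dst + r*ne00, src0 + (int64_t) ids[r]*ne00, ne00*sizeof(float));
    }
}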
363  ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu  vendored
@@ -1413,6 +1413,11 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];

+    // const int64_t nb10 = src1->nb[0];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+    const int64_t nb13 = src1->nb[3];
+
     const int64_t nb2 = dst->nb[2];
     const int64_t nb3 = dst->nb[3];
@@ -1548,7 +1553,10 @@ static void ggml_cuda_op_mul_mat(
         dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);

         if (src1_on_device && src1_is_contiguous) {
-            quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
+            quantize_src1(
+                dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10,
+                nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
+                src1_padded_col_size, ne11, ne12, ne13, stream);
             CUDA_CHECK(cudaGetLastError());
         }
     }
@@ -1643,7 +1651,9 @@ static void ggml_cuda_op_mul_mat(
             }

             if (quantize_src1 && !src1_is_contiguous) {
-                quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
+                quantize_src1(
+                    src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
+                    src1_padded_col_size, src1_ncols, 1, 1, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -1713,15 +1723,15 @@ static __global__ void k_compute_batched_ptrs(
         size_t nb12, size_t nb13,
         size_t nbd2, size_t nbd3,
         int64_t r2, int64_t r3) {
-    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
+    const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;

     if (i13 >= ne13 || i12 >= ne12) {
         return;
     }

-    int64_t i03 = i13 / r3;
-    int64_t i02 = i12 / r2;
+    const int64_t i03 = i13 / r3;
+    const int64_t i02 = i12 / r2;

     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
@@ -1735,6 +1745,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);

+    // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
+    // As long as dst is contiguous this does not matter though.
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
     GGML_TENSOR_BINARY_OP_LOCALS

     const int64_t ne_dst = ggml_nelements(dst);
@@ -1743,21 +1757,31 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co

     CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));

-    void * src0_ddq = src0->data;
-    half * src0_f16 = (half *) src0_ddq;
-    float * src1_ddf = (float *) src1->data;
+    const half * src0_f16 = (const half *) src0->data;
     float * dst_ddf = (float *) dst->data;

-    // convert src1 to fp16
+    const half * src1_f16 = (const half *) src1->data;
+    const size_t ts_src1 = ggml_type_size(src1->type);
+    GGML_ASSERT(nb10 == ts_src1);
+    int64_t s11 = nb11 / ts_src1;
+    int64_t s12 = nb12 / ts_src1;
+    int64_t s13 = nb13 / ts_src1;
     ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
+
+    // convert src1 to fp16
     if (src1->type != GGML_TYPE_F16) {
-        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
         const int64_t ne_src1 = ggml_nelements(src1);
         src1_f16_alloc.alloc(ne_src1);
         GGML_ASSERT(to_fp16_cuda != nullptr);
-        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+
+        to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
+
+        src1_f16 = src1_f16_alloc.get();
+        s11 = ne10;
+        s12 = ne11*s11;
+        s13 = ne12*s12;
     }
-    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();

     ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
     char * dst_t;
@@ -1817,11 +1841,11 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 int i02 = i12 / r2;

                 CUBLAS_CHECK(
-                cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+                cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                     ne01, ne11, ne10,
-                    alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
-                           (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
-                    beta,  (       char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
+                    alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
+                           src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11,
+                    beta,  (       char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
                     cu_compute_type,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
             }
@@ -1834,15 +1858,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
-                       (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
-                beta,  (       char *)    dst_t, cu_data_type, ne01, nb2/nb0,     // strideC
+                alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
+                       src1_f16, CUDA_R_16F, s11,       s12,       // strideB
+                beta,     dst_t, cu_data_type, ne0,     ne1*ne0,   // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
-        const int ne23 = ne12*ne13;
+        const int64_t ne23 = ne12*ne13;

         ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
         ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
@@ -1854,8 +1878,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
-                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
-                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
+                src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
+                src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
                 nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
@@ -1864,8 +1888,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
-                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
+                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
                 ne23,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
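Editor's note: the batched-GEMM hunks above rely on the broadcast mapping computed in k_compute_batched_ptrs. A hedged standalone sketch of that mapping (names illustrative): with r2 = ne12/ne02 and r3 = ne13/ne03, several src1 batch slots share one src0 matrix.

// Hedged sketch: map a src1 batch index (i12, i13) to the src0 matrix it
// multiplies against, collapsing the broadcast dimensions by integer division.
#include <cstdint>

static void src0_batch_index(int64_t i12, int64_t i13,
                             int64_t r2, int64_t r3,
                             int64_t * i02, int64_t * i03) {
    *i02 = i12 / r2;  // r2 consecutive src1 matrices reuse src0 matrix i02
    *i03 = i13 / r3;  // likewise along the outermost batch dimension
}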
@@ -1881,7 +1905,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);

-    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
+    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
@@ -1922,12 +1946,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+        // the custom F16 vector kernel can be used over batched cuBLAS GEMM
+        // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
-        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
-            && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_vec_q) {
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_q) {
         ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
+        !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {
@@ -1941,196 +1969,145 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     }
 }

-struct mmid_row_mapping {
-    int32_t i1;
-    int32_t i2;
-};
-
-static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
-                                                 int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
-                                                 const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
-                                                 int64_t ne11, int64_t ne10,
-                                                 size_t nb11, size_t nb12) {
-    int32_t iid1 = blockIdx.x;
-    int32_t id = blockIdx.y;
-
-    const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
-
-    if (row_id_i != i02) {
-        return;
-    }
-
-    const int64_t i11 = id % ne11;
-    const int64_t i12 = iid1;
-
-    __shared__ int src1_row;
-    if (threadIdx.x == 0) {
-        src1_row = atomicAdd(cur_src1_row, 1);
-        row_mapping[src1_row] = {id, iid1};
-    }
-    __syncthreads();
-
-    const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
-    float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
-
-    for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
-        src1_row_contiguous[i] = src1_row_original[i];
-    }
-}
-
-static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
-                                                  const mmid_row_mapping * __restrict__ row_mapping,
-                                                  int64_t ne0,
-                                                  size_t nb1, size_t nb2) {
-    int32_t i = blockIdx.x;
-
-    const int32_t i1 = row_mapping[i].i1;
-    const int32_t i2 = row_mapping[i].i2;
-
-    const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
-    float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
-
-    for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
-        dst_row_original[j] = dst_row_contiguous[j];
-    }
-}
-
 static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
     const ggml_tensor * ids = dst->src[2];

-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
-
     GGML_TENSOR_BINARY_OP_LOCALS

+    GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+
+    if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        if (ne2 == 1) {
+            if (ggml_is_quantized(src0->type)) {
+                ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
+            } else {
+                ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
+            }
+            return;
+        }
+
+        if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
+            ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
+            return;
+        }
+    }
+
     cudaStream_t stream = ctx.stream();

-    const int64_t n_as = ne02;
-    const int64_t n_ids = ids->ne[0];
+    GGML_ASSERT(nb12 % nb11 == 0);
+    GGML_ASSERT(nb2 % nb1 == 0);
+
+    const ggml_type type_src1_sorted = (src0->type == GGML_TYPE_F16 && !fast_fp16_hardware_available(cc))
+        || ggml_is_quantized(src0->type) ? GGML_TYPE_F32 : src0->type;
+    const ggml_type type_dst_sorted = GGML_TYPE_F32;
+    const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted);
+    const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted);
+
+    const int64_t n_expert_used = ids->ne[0];
+    const int64_t ne_get_rows = ne12 * n_expert_used;
+
+    std::vector<int32_t> ids_to_sorted_host;
+    ids_to_sorted_host.reserve(2*ne_get_rows);
+    std::vector<int32_t> ids_from_sorted_host(ne_get_rows);
+
+    ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool(), 2*ne_get_rows);
+
+    std::vector<int32_t> tokens_per_expert(ne02);
+
+    ggml_cuda_pool_alloc<char> src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted);
+    ggml_cuda_pool_alloc<char> dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted);

     std::vector<char> ids_host(ggml_nbytes(ids));
-    const char * ids_dev = (const char *) ids->data;
-    CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
     CUDA_CHECK(cudaStreamSynchronize(stream));

-    ggml_tensor src0_row = *src0;
-    ggml_tensor src1_row = *src1;
-    ggml_tensor dst_row = *dst;
-
-    char * src0_original = (char *) src0->data;
-    char * src1_original = (char *) src1->data;
-    char * dst_original = (char *) dst->data;
-
-    src0_row.ne[2] = 1;
-    src0_row.ne[3] = 1;
-    src0_row.nb[3] = nb02;
-
-    src1_row.ne[1] = 1;
-    src1_row.ne[2] = 1;
-    src1_row.ne[3] = 1;
-    src1_row.nb[2] = nb11;
-    src1_row.nb[3] = nb11;
-
-    dst_row.ne[1] = 1;
-    dst_row.ne[2] = 1;
-    dst_row.ne[3] = 1;
-    dst_row.nb[2] = nb1;
-    dst_row.nb[3] = nb1;
-
-    if (ne12 == 1) {
-        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-            for (int64_t id = 0; id < n_ids; id++) {
-                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
-
-                GGML_ASSERT(i02 >= 0 && i02 < n_as);
-
-                const int64_t i11 = id % ne11;
-                const int64_t i12 = iid1;
-
-                const int64_t i1 = id;
-                const int64_t i2 = i12;
-
-                src0_row.data = src0_original + i02*nb02;
-                src1_row.data = src1_original + i11*nb11 + i12*nb12;
-                dst_row.data = dst_original + i1*nb1 + i2*nb2;
-
-                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+    for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
+        for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
+            for (int64_t iex = 0; iex < n_expert_used; ++iex) {
+                const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
+                assert(expert_to_use >= 0 && expert_to_use < ne02);
+                if (expert_to_use == i02) {
+                    ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size();
+                    ids_to_sorted_host.push_back(i12*ne11 + iex % ne11);
+                    tokens_per_expert[i02]++;
+                    break;
+                }
             }
         }
-    } else {
-        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
-        ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
+    }
+    GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows));

-        src1_row.data = src1_contiguous.get();
-        dst_row.data = dst_contiguous.get();
+    ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end());

-        for (int64_t i02 = 0; i02 < n_as; i02++) {
-            int64_t num_src1_rows = 0;
+    CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));

-            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-                for (int64_t id = 0; id < n_ids; id++) {
-                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+    const int32_t * ids_to_sorted   = ids_buf_dev.ptr + 0*ne_get_rows;
+    const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows;

-                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+    get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted,
+        ne10, nb11, nb12, nb13,
+        ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
+        ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream);
+    CUDA_CHECK(cudaGetLastError());

-                    if (row_id_i != i02) {
-                        continue;
-                    }
+    char * src1_data_cur = (char *) src1_sorted.ptr;
+    char * dst_data_cur  = (char *) dst_sorted.ptr;
+    for (int64_t i02 = 0; i02 < ne02; ++i02) {
+        if (tokens_per_expert[i02] == 0) {
+            continue;
+        }

-                    num_src1_rows++;
-                }
-            }
+        ggml_tensor src0_slice = *src0;
+        src0_slice.ne[2] = 1;
+        src0_slice.nb[3] = src0_slice.nb[2];
+        src0_slice.data  = (char *) src0->data + i02*nb02;

-            if (num_src1_rows == 0) {
-                continue;
-            }
+        ggml_tensor src1_slice;
+        memset(&src1_slice, 0, sizeof(src1_slice));
+        src1_slice.buffer = src1->buffer;
+        src1_slice.type = type_src1_sorted;
+        src1_slice.ne[0] = ne10;
+        src1_slice.ne[1] = tokens_per_expert[i02];
+        src1_slice.ne[2] = 1;
+        src1_slice.ne[3] = 1;
+        src1_slice.nb[0] = ts_src1_sorted;
+        src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0];
+        src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1];
+        src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2];
+        src1_slice.data = src1_data_cur;

-            ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
-            ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
-            CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
+        ggml_tensor dst_slice;
+        memset(&dst_slice, 0, sizeof(dst_slice));
+        dst_slice.buffer = dst->buffer;
+        dst_slice.type = type_dst_sorted;
+        dst_slice.ne[0] = ne0;
+        dst_slice.ne[1] = tokens_per_expert[i02];
+        dst_slice.ne[2] = 1;
+        dst_slice.ne[3] = 1;
+        dst_slice.nb[0] = ts_dst_sorted;
+        dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0];
+        dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1];
+        dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2];
+        dst_slice.data = dst_data_cur;

-            {
-                dim3 block_dims(std::min((unsigned int)ne10, 768u));
-                dim3 grid_dims(ids->ne[1], n_ids);
-                k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                    src1_original, src1_contiguous.get(),
-                    dev_cur_src1_row.get(), dev_row_mapping.get(),
-                    ids_dev, i02, ids->nb[1], ids->nb[0],
-                    ne11, ne10,
-                    nb11, nb12);
-                CUDA_CHECK(cudaGetLastError());
-            }
+        ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice);
+        CUDA_CHECK(cudaGetLastError());

-            src0_row.data = src0_original + i02*nb02;
+        src1_data_cur += src1_slice.nb[2];
+        dst_data_cur  += dst_slice.nb[2];
+    }

-            GGML_ASSERT(nb11 == sizeof(float)*ne10);
-            GGML_ASSERT(nb1 == sizeof(float)*ne0);
-
-            src1_row.ne[1] = num_src1_rows;
-            src1_row.nb[1] = nb11;
-            src1_row.nb[2] = num_src1_rows*nb11;
-            src1_row.nb[3] = num_src1_rows*nb11;
-
-            dst_row.ne[1] = num_src1_rows;
-            dst_row.nb[1] = nb1;
-            dst_row.nb[2] = num_src1_rows*nb1;
-            dst_row.nb[3] = num_src1_rows*nb1;
-
-            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
-
-            {
-                dim3 block_dims(std::min((unsigned int)ne0, 768u));
-                dim3 grid_dims(num_src1_rows);
-                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                    dst_original, dst_contiguous.get(),
-                    dev_row_mapping.get(),
-                    ne0,
-                    nb1, nb2);
-                CUDA_CHECK(cudaGetLastError());
-            }
-        }
-    }
+    get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type,
+        ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted,
+        ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
+        nb1, nb2, nb3, stream);
 }

 static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
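Editor's note: the rewritten mul_mat_id above gathers token rows into per-expert contiguous slices before the matrix multiplications. A hedged host-side sketch of that index construction, with illustrative names and a plain row-major ids layout instead of the byte-stride indexing used in the diff:

// Hedged sketch: for each expert, record the source row of every token routed
// to it (to_sorted), plus where each (token, slot) pair lands in sorted order
// (from_sorted), mirroring the loops in ggml_cuda_mul_mat_id.
#include <cstdint>
#include <vector>

struct expert_sort {
    std::vector<int32_t> to_sorted;         // sorted position -> source row in src1
    std::vector<int32_t> from_sorted;       // (token, slot)   -> sorted position
    std::vector<int32_t> tokens_per_expert; // slice sizes for the per-expert GEMMs
};

static expert_sort build_expert_sort(const int32_t * ids, // [n_tokens][n_expert_used], row-major
                                     int64_t n_tokens, int64_t n_expert_used,
                                     int64_t n_experts, int64_t ne11) {
    expert_sort s;
    s.from_sorted.resize(n_tokens*n_expert_used);
    s.tokens_per_expert.assign(n_experts, 0);
    for (int64_t e = 0; e < n_experts; ++e) {
        for (int64_t t = 0; t < n_tokens; ++t) {
            for (int64_t k = 0; k < n_expert_used; ++k) {
                if (ids[t*n_expert_used + k] == e) {
                    s.from_sorted[t*n_expert_used + k] = (int32_t) s.to_sorted.size();
                    s.to_sorted.push_back((int32_t)(t*ne11 + k % ne11));
                    s.tokens_per_expert[e]++;
                    break;  // one slice entry per (token, expert) pair
                }
            }
        }
    }
    return s;
}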
@@ -2495,7 +2472,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }

-        if (node->op == GGML_OP_MUL_MAT_ID) {
+        if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
@@ -3209,9 +3186,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         }
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK: {
-            const size_t ts = ggml_type_size(op->src[0]->type);
-            const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
-            return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
+            return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
         }
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
220  ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu  vendored
@@ -1,37 +1,10 @@
 #include "mmq.cuh"
 #include "quantize.cuh"

-void ggml_cuda_op_mul_mat_q(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
+#include <vector>

-    const int64_t ne00 = src0->ne[0];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    GGML_ASSERT(ne10 % QK8_1 == 0);
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-    const int64_t stride00 = ne00 / ggml_blck_size(src0->type);
-
-    int id = ggml_cuda_get_device();
-    const int cc = ggml_cuda_info().devices[id].cc;
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
-
-    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
-    // Also its fixup needs to allocate a temporary buffer in the memory pool.
-    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
-        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
-    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
-
-    switch (src0->type) {
+static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
+    switch (args.type_x) {
         case GGML_TYPE_Q4_0:
             mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
             break;
@ -90,10 +63,195 @@ void ggml_cuda_op_mul_mat_q(
|
|||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_mul_mat_q(
|
||||
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
||||
GGML_ASSERT( src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
const size_t ts_src0 = ggml_type_size(src0->type);
|
||||
const size_t ts_src1 = ggml_type_size(src1->type);
|
||||
const size_t ts_dst = ggml_type_size(dst->type);
|
||||
|
||||
GGML_ASSERT( nb00 == ts_src0);
|
||||
GGML_ASSERT( nb10 == ts_src1);
|
||||
GGML_ASSERT( nb0 == ts_dst);
|
||||
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
|
||||
|
||||
const char * src0_d = (const char *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
|
||||
const int64_t s01 = src0->nb[1] / ts_src0;
|
||||
const int64_t s1 = dst->nb[1] / ts_dst;
|
||||
const int64_t s02 = src0->nb[2] / ts_src0;
|
||||
const int64_t s2 = dst->nb[2] / ts_dst;
|
||||
const int64_t s03 = src0->nb[3] / ts_src0;
|
||||
const int64_t s3 = dst->nb[3] / ts_dst;
|
||||
|
||||
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
|
||||
|
||||
if (!ids) {
|
||||
const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
|
||||
get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
|
||||
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);
|
||||
|
||||
{
|
||||
const int64_t s11 = src1->nb[1] / ts_src1;
|
||||
const int64_t s12 = src1->nb[2] / ts_src1;
|
||||
const int64_t s13 = src1->nb[3] / ts_src1;
|
||||
quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
|
||||
ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
|
||||
}
|
||||
|
||||
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
|
||||
const int64_t s13 = ne12*s12;
|
||||
|
||||
const mmq_args args = {
|
||||
src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
|
||||
ne00, ne01, ne1, s01, s1,
|
||||
ne02, ne12, s02, s12, s2,
|
||||
ne03, ne13, s03, s13, s3,
|
||||
use_stream_k};
|
||||
ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(ne13 == 1);
|
||||
GGML_ASSERT(nb12 % nb11 == 0);
|
||||
GGML_ASSERT(nb2 % nb1 == 0);
|
||||
|
||||
const int64_t n_expert_used = ids->ne[0];
|
||||
const int64_t ne_get_rows = ne12 * n_expert_used;
|
||||
|
||||
std::vector<char> ids_host(ggml_nbytes(ids));
|
||||
std::vector<int32_t> ids_src1_host;
|
||||
ids_src1_host.reserve(ne_get_rows);
|
||||
std::vector<int32_t> ids_dst_host;
|
||||
ids_dst_host.reserve(ne_get_rows);
|
||||
std::vector<int32_t> tokens_per_expert_host(ne02);
|
||||
std::vector<int32_t> expert_bounds_host(ne02 + 1);
|
||||
ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool());
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
|
||||
for (int64_t iex = 0; iex < n_expert_used; ++iex) {
|
||||
const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
|
||||
assert(expert_to_use >= 0 && expert_to_use < ne02);
|
||||
if (expert_to_use == i02) {
|
||||
ids_src1_host.push_back(i12*(nb12/nb11) + iex % ne11);
|
||||
ids_dst_host.push_back(i12*ne1 + iex);
|
||||
tokens_per_expert_host[i02]++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int32_t cumsum = 0;
|
||||
for (int64_t i = 0; i < ne02; ++i) {
|
||||
expert_bounds_host[i] = cumsum;
|
||||
cumsum += tokens_per_expert_host[i];
|
||||
}
|
||||
expert_bounds_host[ne02] = cumsum;
|
||||
|
||||
std::vector<int32_t> ids_buf_host;
|
||||
ids_buf_host.reserve(ids_src1_host.size() + ids_dst_host.size() + expert_bounds_host.size());
|
||||
ids_buf_host.insert(ids_buf_host.end(), ids_src1_host.begin(), ids_src1_host.end());
|
||||
ids_buf_host.insert(ids_buf_host.end(), ids_dst_host.begin(), ids_dst_host.end());
|
||||
ids_buf_host.insert(ids_buf_host.end(), expert_bounds_host.begin(), expert_bounds_host.end());
|
||||
ids_buf_dev.alloc(ids_buf_host.size() + get_mmq_x_max_host(cc)); // Expert bounds are padded on device.
|
||||
CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_buf_host.data(), ids_buf_host.size()*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
const int32_t * ids_src1_dev = ids_buf_dev.ptr;
|
||||
const int32_t * ids_dst_dev = ids_src1_dev + ids_src1_host.size();
|
||||
const int32_t * expert_bounds_dev = ids_dst_dev + ids_dst_host.size();
|
||||
|
||||
const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 +
|
||||
get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
|
||||
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);
|
||||
|
||||
const int64_t ne11_flat = ne12*n_expert_used;
|
||||
const int64_t ne12_flat = 1;
|
||||
const int64_t ne13_flat = 1;
|
||||
|
||||
{
|
||||
const int64_t s11 = src1->nb[1] / ts_src1;
|
||||
const int64_t s12 = src1->nb[2] / ts_src1;
|
||||
const int64_t s13 = src1->nb[2] / ts_src1;
|
||||
quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type,
|
||||
ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
|
||||
}
|
||||
|
||||
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
|
||||
const int64_t s13 = ne12*s12;
|
||||
|
||||
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
|
||||
const mmq_args args = {
|
||||
src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d,
|
||||
ne00, ne01, ne_get_rows, s01, s1,
|
||||
ne02, ne02, s02, s12, s2,
|
||||
ne03, ne13, s03, s13, s3,
|
||||
use_stream_k};
|
||||
|
||||
ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_mul_mat_q(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, cudaStream_t stream) {
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
GGML_ASSERT(ne10 % QK8_1 == 0);
|
||||
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
const int64_t stride01 = ne00 / ggml_blck_size(src0->type);
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// nrows_dst == nrows of the matrix that the kernel writes into
|
||||
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
|
||||
|
||||
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
|
||||
// Also its fixup needs to allocate a temporary buffer in the memory pool.
|
||||
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
|
||||
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
|
||||
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
|
||||
const mmq_args args = {
|
||||
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
|
||||
ne00, row_diff, src1_ncols, stride01, nrows_dst,
|
||||
1, 1, 0, 0, 0,
|
||||
1, 1, 0, 0, 0,
|
||||
use_stream_k};
|
||||
|
||||
ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
GGML_UNUSED(src1_ddf_i);
|
||||
GGML_UNUSED(src1_padded_row_size);
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
|
|
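The expert bookkeeping in ggml_cuda_mul_mat_q above boils down to an exclusive prefix sum over per-expert token counts: expert e owns the row range [expert_bounds[e], expert_bounds[e+1]) of the flattened, expert-sorted buffers. A minimal host-side sketch of that step (an illustrative helper, not part of the vendored sources):

#include <cstdint>
#include <vector>

// Exclusive prefix sum over per-expert token counts; the final entry is the
// total number of routed (token, expert) pairs.
std::vector<int32_t> make_expert_bounds(const std::vector<int32_t> & tokens_per_expert) {
    std::vector<int32_t> bounds(tokens_per_expert.size() + 1);
    int32_t cumsum = 0;
    for (size_t e = 0; e < tokens_per_expert.size(); ++e) {
        bounds[e] = cumsum;
        cumsum += tokens_per_expert[e];
    }
    bounds.back() = cumsum;
    return bounds;
}

For example, with four experts and counts {2, 0, 3, 1} this yields bounds {0, 2, 2, 5, 6}, so expert 2 reads rows [2, 5) of the sorted buffer.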
632 ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh vendored
File diff suppressed because it is too large
169 ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu vendored
@@ -4,18 +4,23 @@

template <typename T, typename type_acc, int block_size>
static __global__ void mul_mat_vec(
        const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
        const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
        const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row,
        const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
        const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) {
    const int64_t row = blockIdx.x;
    const int64_t channel = blockIdx.y;
    const int64_t sample = blockIdx.z;
    const int64_t channel_dst = blockIdx.y;
    const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio;
    const int64_t channel_y = ids ? channel_dst % nchannels_y : channel_dst;
    const int64_t sample_dst = blockIdx.z;
    const int64_t sample_x = sample_dst / sample_ratio;
    const int64_t sample_y = sample_dst;
    const int tid = threadIdx.x;
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    x   += (sample/sample_ratio)*stride_sample_x + (channel/channel_ratio)*stride_channel_x + row*stride_row;
    y   += sample *stride_sample_y + channel *stride_channel_y;
    dst += sample *stride_sample_dst + channel *stride_channel_dst;
    x   += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
    y   += sample_y *stride_sample_y + channel_y *stride_channel_y;
    dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst;

    const float2 * y2 = (const float2 *) y;

@@ -31,12 +36,19 @@ static __global__ void mul_mat_vec(

    float sumf = 0.0f;

    if constexpr (std::is_same<T, half>::value) {
    if constexpr (std::is_same<T, float>::value) {
        const float2 * x2 = (const float2 *) x;

        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
            const float2 tmpx = x2[col2];
            const float2 tmpy = y2[col2];
            sumf += tmpx.x*tmpy.x;
            sumf += tmpx.y*tmpy.y;
        }
    } else if constexpr (std::is_same<T, half>::value) {
        const half2 * x2 = (const half2 *) x;

        if (std::is_same<type_acc, float>::value) {
            sumf = 0.0f;

            for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
                const float2 tmpx = __half22float2(x2[col2]);
                const float2 tmpy = y2[col2];
@@ -59,8 +71,6 @@ static __global__ void mul_mat_vec(
        }
    } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
        const int * x2 = (const int *) x;
        sumf = 0.0f;

        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
            const int tmpx = x2[col2];
            const float2 tmpy = y2[col2];
@@ -92,17 +102,17 @@ static __global__ void mul_mat_vec(

template <typename T, typename type_acc>
static void launch_mul_mat_vec_cuda(
        const T * x, const float * y, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
        const T * x, const float * y, const int32_t * ids, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
        const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
        cudaStream_t stream) {
    GGML_ASSERT(ncols      % 2 == 0);
    GGML_ASSERT(stride_row % 2 == 0);
    GGML_ASSERT(nchannels_y % nchannels_x == 0);
    GGML_ASSERT(nsamples_y  % nsamples_x  == 0);
    const int64_t channel_ratio = nchannels_y / nchannels_x;
    const int64_t sample_ratio  = nsamples_y  / nsamples_x;
    GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
    GGML_ASSERT(       nsamples_dst  % nsamples_x  == 0);
    const int64_t channel_ratio = nchannels_dst / nchannels_x;
    const int64_t sample_ratio  = nsamples_dst  / nsamples_x;
    int device;
    int warp_size;

@@ -124,48 +134,48 @@ static void launch_mul_mat_vec_cuda(
    }

    const int smem = warp_size*sizeof(float);
    const dim3 block_nums(nrows, nchannels_y, nsamples_y);
    const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
    const dim3 block_dims(block_size_best, 1, 1);
    switch (block_size_best) {
        case 32: {
            mul_mat_vec<T, type_acc, 32><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 64: {
            mul_mat_vec<T, type_acc, 64><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 96: {
            mul_mat_vec<T, type_acc, 96><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 128: {
            mul_mat_vec<T, type_acc, 128><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 160: {
            mul_mat_vec<T, type_acc, 160><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 192: {
            mul_mat_vec<T, type_acc, 192><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 224: {
            mul_mat_vec<T, type_acc, 224><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 256: {
            mul_mat_vec<T, type_acc, 256><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        default: {
            GGML_ABORT("fatal error");
@@ -175,28 +185,28 @@ static void launch_mul_mat_vec_cuda(

template<typename T>
static void mul_mat_vec_cuda(
        const T * x, const float * y, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
        const T * x, const float * y, const int32_t * ids, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
        const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
        enum ggml_prec prec, cudaStream_t stream) {
    switch (prec) {
        case GGML_PREC_DEFAULT: {
    if constexpr(std::is_same<T, half>::value) {
        if (prec == GGML_PREC_DEFAULT) {
            launch_mul_mat_vec_cuda<T, half>
                (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
        } break;
        case GGML_PREC_F32: {
            launch_mul_mat_vec_cuda<T, float>
                (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
        } break;
                (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
                 stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            return;
        }
    }
    launch_mul_mat_vec_cuda<T, float>
        (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
         stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
}

void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
    GGML_ASSERT( src1->type == GGML_TYPE_F32);
    GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32);
    GGML_ASSERT( dst->type  == GGML_TYPE_F32);

    GGML_TENSOR_BINARY_OP_LOCALS;

@@ -204,21 +214,24 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
    const size_t ts_src1 = ggml_type_size(src1->type);
    const size_t ts_dst  = ggml_type_size(dst->type);

    GGML_ASSERT(ne11 == 1);
    GGML_ASSERT(ne12 == ne2);
    GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
    GGML_ASSERT(ne13 == ne3);

    GGML_ASSERT(nb00 == ts_src0);
    GGML_ASSERT(nb10 == ts_src1);
    GGML_ASSERT(nb0  == ts_dst);
    GGML_ASSERT( nb00 == ts_src0);
    GGML_ASSERT( nb10 == ts_src1);
    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
    GGML_ASSERT( nb0  == ts_dst);

    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;

    const float   * src1_d = (const float   *) src1->data;
    const int32_t * ids_d  = ids ? (const int32_t *) ids->data : nullptr;
    float * dst_d = (float *) dst->data;

    const int64_t s01 = src0->nb[1] / ts_src0;
    const int64_t s11 = src1->nb[1] / ts_src1;
    const int64_t s1  = dst->nb[1]  / ts_dst;
    const int64_t s02 = src0->nb[2] / ts_src0;
    const int64_t s12 = src1->nb[2] / ts_src1;
    const int64_t s2  = dst->nb[2]  / ts_dst;
@@ -226,14 +239,33 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
    const int64_t s13 = src1->nb[3] / ts_src1;
    const int64_t s3  = dst->nb[3]  / ts_dst;

    // For MUL_MAT_ID the memory layout is different than for MUL_MAT:
    const int64_t ncols_dst          = ids ? ne2 : ne1;
    const int64_t nchannels_y        = ids ? ne11 : ne12;
    const int64_t nchannels_dst      = ids ? ne1 : ne2;
    const int64_t stride_channel_dst = ids ? s1 : s2;
    const int64_t stride_channel_y   = ids ? s11 : s12;

    GGML_ASSERT(ncols_dst == 1);

    switch (src0->type) {
        case GGML_TYPE_F32: {
            const float * src0_d = (const float *) src0->data;
            mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01,
                ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
                ne03, ne3, s03, s13, s3, prec, ctx.stream());
        } break;
        case GGML_TYPE_F16: {
            const half * src0_d = (const half *) src0->data;
            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream());
            mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01,
                ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
                ne03, ne3, s03, s13, s3, prec, ctx.stream());
        } break;
        case GGML_TYPE_BF16: {
            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream());
            mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01,
                ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
                ne03, ne3, s03, s13, s3, prec, ctx.stream());
        } break;
        default:
            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
@@ -262,27 +294,34 @@ void ggml_cuda_op_mul_mat_vec(
    const int64_t stride_row         = ne00;
    const int64_t nchannels_x        = 1;
    const int64_t nchannels_y        = 1;
    const int64_t nchannels_dst      = 1;
    const int64_t stride_channel_x   = 0;
    const int64_t stride_channel_y   = 0;
    const int64_t stride_channel_dst = 0;
    const int64_t nsamples_x         = 1;
    const int64_t nsamples_y         = 1;
    const int64_t nsamples_dst       = 1;
    const int64_t stride_sample_x    = 0;
    const int64_t stride_sample_y    = 0;
    const int64_t stride_sample_dst  = 0;

    switch (src0->type) {
        case GGML_TYPE_F32: {
            const float * src0_d = (const float *) src0_dd_i;
            mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
        } break;
        case GGML_TYPE_F16: {
            const half * src0_d = (const half *) src0_dd_i;
            mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
                nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
            mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
        } break;
        case GGML_TYPE_BF16: {
            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
            mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
                nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
            mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
        } break;
        default:
            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
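The key difference between the MUL_MAT and MUL_MAT_ID paths in the mul_mat_vec kernel above is how a destination channel finds its source channels. A hypothetical host-side restatement of that mapping (names are illustrative, mirroring the kernel's index math):

#include <cstdint>

struct ChannelMap {
    int64_t channel_x;
    int64_t channel_y;
};

// ids == nullptr: plain MUL_MAT, x channels are broadcast over dst channels
// (GQA-style ratio). ids != nullptr: MUL_MAT_ID, ids picks which expert
// matrix (channel of x) each dst channel reads from.
ChannelMap map_channels(const int32_t * ids, int64_t channel_dst,
                        int64_t channel_ratio, int64_t nchannels_y) {
    if (ids) {
        return {ids[channel_dst], channel_dst % nchannels_y};
    }
    return {channel_dst / channel_ratio, channel_dst};
}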
2 ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh vendored
@@ -3,7 +3,7 @@
// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
#define MMV_MAX_ROWS 512

void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

void ggml_cuda_op_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
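The header change simply threads the optional ids tensor through the entry point; callers that pass nullptr get the old MUL_MAT behaviour. A hypothetical call-site sketch, assuming ggml's convention that a MUL_MAT_ID node stores the routing tensor in dst->src[2]:

// Illustrative dispatch helper (not part of the vendored sources).
void dispatch_mul_mat_vec(ggml_backend_cuda_context & ctx, ggml_tensor * dst, bool is_mul_mat_id) {
    // MUL_MAT passes no ids; MUL_MAT_ID passes the I32 routing tensor.
    const ggml_tensor * ids = is_mul_mat_id ? dst->src[2] : nullptr;
    ggml_cuda_mul_mat_vec(ctx, dst->src[0], dst->src[1], ids, dst);
}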
623 ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu vendored
@ -1,50 +1,57 @@
|
|||
#include "mmvq.cuh"
|
||||
#include "quantize.cuh"
|
||||
#include "vecdotq.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
|
||||
|
||||
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
|
||||
return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
|
||||
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
|
||||
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
|
||||
type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
|
||||
type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
|
||||
type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
|
||||
type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
|
||||
type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
|
||||
type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
|
||||
type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
|
||||
type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
|
||||
type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
|
||||
type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
|
||||
type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
|
||||
type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
|
||||
type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
|
||||
type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
|
||||
type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
|
||||
type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
|
||||
nullptr;
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1;
|
||||
case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1;
|
||||
case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1;
|
||||
case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1;
|
||||
case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1;
|
||||
case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1;
|
||||
case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1;
|
||||
case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1;
|
||||
case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1;
|
||||
case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1;
|
||||
case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1;
|
||||
case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1;
|
||||
case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1;
|
||||
case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1;
|
||||
case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1;
|
||||
case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1;
|
||||
case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
|
||||
case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
|
||||
case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1;
|
||||
default: return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
|
||||
1;
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
|
||||
enum mmvq_parameter_table_id {
|
||||
|
@ -73,9 +80,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
|||
return MMVQ_PARAMETERS_GENERIC;
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) {
|
||||
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC) {
|
||||
switch (ncols_y) {
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
|
@ -90,7 +97,7 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_paramete
|
|||
return 1;
|
||||
}
|
||||
} else if (table_id == MMVQ_PARAMETERS_GCN) {
|
||||
switch (ncols_y) {
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
|
@ -107,9 +114,9 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_paramete
|
|||
return 1;
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) {
|
||||
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
|
||||
switch (ncols_y) {
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
return 1;
|
||||
case 2:
|
||||
|
@ -127,19 +134,21 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int ta
|
|||
return 1;
|
||||
}
|
||||
|
||||
template <ggml_type type, int ncols_y>
|
||||
template <ggml_type type, int ncols_dst>
|
||||
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
||||
__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mul_mat_vec_q(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst,
|
||||
const int ncols_x, const int nchannels_y, const int stride_row_x, const int stride_col_y, const int stride_col_dst,
|
||||
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int vdr = get_vdr_mmvq(type);
|
||||
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
|
||||
constexpr int nwarps = calc_nwarps(ncols_y, table_id);
|
||||
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id);
|
||||
constexpr int nwarps = calc_nwarps(ncols_dst, table_id);
|
||||
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id);
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
||||
|
@ -147,13 +156,21 @@ static __global__ void mul_mat_vec_q(
|
|||
const int tid = warp_size*threadIdx.y + threadIdx.x;
|
||||
const int row0 = rows_per_cuda_block*blockIdx.x;
|
||||
const int blocks_per_row_x = ncols_x / qk;
|
||||
const int blocks_per_col_y = nrows_y / QK8_1;
|
||||
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
|
||||
|
||||
// partial sum for each thread
|
||||
float tmp[ncols_y][rows_per_cuda_block] = {{0.0f}};
|
||||
// The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1.
|
||||
const int channel_dst = blockIdx.y;
|
||||
const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio;
|
||||
const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst;
|
||||
const int sample_dst = blockIdx.z;
|
||||
const int sample_x = sample_dst / sample_ratio;
|
||||
const int sample_y = sample_dst;
|
||||
|
||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||
// partial sum for each thread
|
||||
float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}};
|
||||
|
||||
const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
|
||||
const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;
|
||||
|
||||
for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
|
||||
const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
|
||||
|
@ -162,18 +179,19 @@ static __global__ void mul_mat_vec_q(
|
|||
const int kqs = vdr * (tid % (qi/vdr));
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_y; ++j) {
|
||||
for (int j = 0; j < ncols_dst; ++j) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
||||
tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
|
||||
tmp[j][i] += vec_dot_q_cuda(
|
||||
vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size];
|
||||
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
|
||||
if (threadIdx.y > 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_y; ++j) {
|
||||
for (int j = 0; j < ncols_dst; ++j) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
||||
tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
|
||||
|
@ -185,9 +203,11 @@ static __global__ void mul_mat_vec_q(
|
|||
return;
|
||||
}
|
||||
|
||||
dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0;
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_y; ++j) {
|
||||
for (int j = 0; j < ncols_dst; ++j) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
||||
#pragma unroll
|
||||
|
@ -197,88 +217,121 @@ static __global__ void mul_mat_vec_q(
|
|||
tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
|
||||
}
|
||||
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < (unsigned)nrows_dst)) {
|
||||
dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) {
|
||||
dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
GGML_UNUSED(nrows_x);
|
||||
}
|
||||
|
||||
static std::pair<dim3, dim3> calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) {
|
||||
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id);
|
||||
const dim3 block_nums(nblocks, 1, 1);
|
||||
const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1);
|
||||
static std::pair<dim3, dim3> calc_launch_params(
|
||||
const int ncols_dst, const int nrows_x, const int nchannels_y, const int nsamples_y,
|
||||
const int warp_size, const mmvq_parameter_table_id table_id) {
|
||||
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
|
||||
const dim3 block_nums(nblocks, nchannels_y, nsamples_y);
|
||||
const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1);
|
||||
return {block_nums, block_dims};
|
||||
}
|
||||
|
||||
template <ggml_type type>
|
||||
static void mul_mat_vec_q_cuda(
|
||||
const void * vx, const void * vy, float * dst,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q_switch_ncols_dst(
|
||||
const void * vx, const void * vy, const int32_t * ids, float * dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst,
|
||||
const int stride_row_x, const int stride_col_y, const int stride_col_dst,
|
||||
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
|
||||
const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
|
||||
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
|
||||
GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE);
|
||||
|
||||
const int channel_ratio = nchannels_dst / nchannels_x;
|
||||
const int sample_ratio = nsamples_dst / nsamples_x;
|
||||
|
||||
const int device = ggml_cuda_get_device();
|
||||
const int warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);
|
||||
|
||||
switch (ncols_y) {
|
||||
GGML_ASSERT(!ids || ncols_dst == 1);
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
{
|
||||
constexpr int c_ncols_y = 1;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
constexpr int c_ncols_dst = 1;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
constexpr int c_ncols_y = 2;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
constexpr int c_ncols_dst = 2;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 3:
|
||||
{
|
||||
constexpr int c_ncols_y = 3;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
constexpr int c_ncols_dst = 3;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
constexpr int c_ncols_y = 4;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
constexpr int c_ncols_dst = 4;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 5:
|
||||
{
|
||||
constexpr int c_ncols_y = 5;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
constexpr int c_ncols_dst = 5;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 6:
|
||||
{
|
||||
constexpr int c_ncols_y = 6;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
constexpr int c_ncols_dst = 6;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 7:
|
||||
{
|
||||
constexpr int c_ncols_y = 7;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
constexpr int c_ncols_dst = 7;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 8:
|
||||
{
|
||||
constexpr int c_ncols_y = 8;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||
constexpr int c_ncols_dst = 8;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
@ -287,137 +340,213 @@ static void mul_mat_vec_q_cuda(
|
|||
}
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q4_0_q8_1_cuda(
|
||||
const void * vx, const void * vy, float * dst,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
||||
|
||||
mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
||||
static void mul_mat_vec_q_switch_type(
|
||||
const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, float * dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst,
|
||||
const int stride_row_x, const int stride_col_y, const int stride_col_dst,
|
||||
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
|
||||
const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
switch (type_x) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_0>
|
||||
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_1>
|
||||
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_0>
|
||||
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_1>
|
||||
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q8_0>
|
||||
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
|
||||
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
stream);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q3_K>
|
||||
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_K>
|
||||
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_Q5_K:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_K>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_Q6_K:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q6_K>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_IQ2_XXS:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XXS>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_IQ2_XS:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XS>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_IQ2_S:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_S>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_IQ3_XXS:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_XXS>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_S>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_IQ1_M:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_M>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_IQ4_NL:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_NL>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_IQ4_XS:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_S>
                (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
                stream);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
}

static void mul_mat_vec_q4_1_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q5_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q5_1_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q8_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q2_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q3_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q4_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q5_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q6_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq2_xxs_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq2_xs_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq2_s_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq3_xxs_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq1_s_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq1_m_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq4_nl_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq4_xs_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq3_s_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

void ggml_cuda_mul_mat_vec_q(
    ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
    GGML_ASSERT( src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.

    GGML_TENSOR_BINARY_OP_LOCALS;

    cudaStream_t stream = ctx.stream();

    const size_t ts_src0 = ggml_type_size(src0->type);
    const size_t ts_src1 = ggml_type_size(src1->type);
    const size_t ts_dst = ggml_type_size(dst->type);

    GGML_ASSERT( nb00 == ts_src0);
    GGML_ASSERT( nb10 == ts_src1);
    GGML_ASSERT( nb0 == ts_dst);
    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));

    GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.

    const float * src1_d = (const float *) src1->data;
    const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
    float * dst_d = (float *) dst->data;

    const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
    ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
    {
        const int64_t s11 = src1->nb[1] / ts_src1;
        const int64_t s12 = src1->nb[2] / ts_src1;
        const int64_t s13 = src1->nb[3] / ts_src1;
        quantize_row_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
    }

    const int64_t s01 = src0->nb[1] / ts_src0;
    const int64_t s11 = ne10_padded / QK8_1;
    const int64_t s1 = dst->nb[1] / ts_dst;
    const int64_t s02 = src0->nb[2] / ts_src0;
    const int64_t s2 = dst->nb[2] / ts_dst;
    const int64_t s03 = src0->nb[3] / ts_src0;
    const int64_t s3 = dst->nb[3] / ts_dst;

    const int64_t s12 = ne11*s11;
    const int64_t s13 = ne12*s12;

    // For MUL_MAT_ID the memory layout is different than for MUL_MAT:
    const int64_t ncols_dst = ids ? ne2 : ne1;
    const int64_t nchannels_y = ids ? ne11 : ne12;
    const int64_t nchannels_dst = ids ? ne1 : ne2;
    const int64_t stride_col_dst = ids ? s2 : s1;
    const int64_t stride_col_y = ids ? s12 : s11;
    const int64_t stride_channel_dst = ids ? s1 : s2;
    const int64_t stride_channel_y = ids ? s11 : s12;

    mul_mat_vec_q_switch_type(
        src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00,
        ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
        ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
        ne03, ne3, s03, s13, s3, stream);
}

void ggml_cuda_op_mul_mat_vec_q(

@ -440,68 +569,12 @@ void ggml_cuda_op_mul_mat_vec_q(
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_1:
            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_0:
            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_1:
            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q8_0:
            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q2_K:
            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q3_K:
            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_K:
            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_K:
            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q6_K:
            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_XXS:
            mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_XS:
            mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_S:
            mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ3_XXS:
            mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ1_M:
            mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ4_NL:
            mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ4_XS:
            mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
    const int stride_row_x = ne00 / ggml_blck_size(src0->type);
    const int stride_col_y = src1_padded_row_size / QK8_1;

    mul_mat_vec_q_switch_type(
        src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
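
For orientation, the new entry point pads src1's row length and sizes the temporary q8_1 buffer accordingly. A small host-side C sketch of that arithmetic follows; the constants QK8_1 = 32, MATRIX_ROW_PADDING = 512 and sizeof(block_q8_1) = 36 are assumptions taken from the ggml-cuda headers, not from this diff, so verify them before relying on the numbers:

// Size of the temporary q8_1 buffer allocated for src1 before the MMVQ dispatch.
// Assumed constants (verify in the ggml-cuda headers): QK8_1 = 32,
// MATRIX_ROW_PADDING = 512, sizeof(block_q8_1) = 36 bytes.
#include <stdint.h>
#include <stdio.h>

#define QK8_1              32
#define MATRIX_ROW_PADDING 512
#define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

int main(void) {
    const int64_t ne10 = 4096, ne11 = 1, ne12 = 1, ne13 = 1; // example src1 dims
    const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
    // one block_q8_1 per QK8_1 floats, rows padded so every block is complete
    const int64_t bytes = ne13*ne12 * ne11*ne10_padded * 36 / QK8_1;
    printf("padded cols = %lld, q8_1 buffer = %lld bytes\n",
           (long long) ne10_padded, (long long) bytes);
    return 0;
}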

3 ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh vendored
@ -2,6 +2,9 @@

#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.

void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

void ggml_cuda_op_mul_mat_vec_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,

91 ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu vendored
@ -1,23 +1,33 @@
#include "quantize.cuh"
#include <cstdint>

static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
static __global__ void quantize_q8_1(
        const float * __restrict__ x, void * __restrict__ vy,
        const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
        const int64_t ne0, const int ne1, const int ne2) {
    const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

    if (ix0 >= kx0_padded) {
    if (i0 >= ne0) {
        return;
    }

    const int64_t ix1 = blockIdx.y;
    const int64_t i1 = blockIdx.y;
    const int64_t i2 = blockIdx.z % ne2;
    const int64_t i3 = blockIdx.z / ne2;

    const int64_t i_padded = ix1*kx0_padded + ix0;
    const int64_t & i00 = i0;
    const int64_t & i01 = i1;
    const int64_t & i02 = i2;
    const int64_t & i03 = i3;

    const int64_t i_cont = ((i3*ne2 + i2) * ne1 + i1) * ne0 + i0;

    block_q8_1 * y = (block_q8_1 *) vy;

    const int64_t ib = i_padded / QK8_1; // block index
    const int64_t iqs = i_padded % QK8_1; // quant index
    const int64_t ib = i_cont / QK8_1; // block index
    const int64_t iqs = i_cont % QK8_1; // quant index

    const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
    const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f;
    float amax = fabsf(xi);
    float sum = xi;

@ -39,29 +49,38 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest

template <mmq_q8_1_ds_layout ds_layout>
static __global__ void quantize_mmq_q8_1(
    const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
    const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
    const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
    const int64_t ne0, const int ne1, const int ne2) {

    constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
    constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;

    const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
    const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;

    if (ix0 >= kx0_padded) {
    if (i0 >= ne0) {
        return;
    }

    const float4 * x4 = (const float4 *) x;
    const int64_t i1 = blockIdx.y;
    const int64_t i2 = blockIdx.z % ne2;
    const int64_t i3 = blockIdx.z / ne2;

    const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
    const int64_t i00 = i0;
    const int64_t i01 = ids ? ids[i1] : i1;
    const int64_t i02 = i2;
    const int64_t i03 = i3;

    const float4 * x4 = (const float4 *) x;

    block_q8_1_mmq * y = (block_q8_1_mmq *) vy;

    const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
    const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
    const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
    const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y; // block index in channel
    const int64_t iqs = i0 % (4*QK8_1); // quant index in block

    // Load 4 floats per thread and calculate max. abs. value between them:
    const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    const float4 xi = i0 < ne00 ? x4[(i03*s03 + i02*s02 + i01*s01 + i00)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    float amax = fabsf(xi.x);
    amax = fmaxf(amax, fabsf(xi.y));
    amax = fmaxf(amax, fabsf(xi.z));

@ -77,7 +96,7 @@ static __global__ void quantize_mmq_q8_1(
    if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) {
        sum = xi.x + xi.y + xi.z + xi.w;

        // Exchange calculate sum across vals_per_sum/4 threads.
        // Calculate sums across vals_per_sum/4 threads.
#pragma unroll
        for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
            sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);

@ -127,40 +146,40 @@ static __global__ void quantize_mmq_q8_1(
}

void quantize_row_q8_1_cuda(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
    const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
    const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
    GGML_ASSERT(!ids);
    GGML_ASSERT(ne0 % QK8_1 == 0);

    GGML_ASSERT(kx0_padded % QK8_1 == 0);

    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, kx1*channels, 1);
    const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);

    GGML_UNUSED(type_x);
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
    GGML_UNUSED(type_src0);
}

void quantize_mmq_q8_1_cuda(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
    const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
    const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
    GGML_ASSERT(ne0 % (4*QK8_1) == 0);

    GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);

    const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
    const dim3 num_blocks(block_num_x, kx1, channels);
    const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
    const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
    switch (mmq_get_q8_1_ds_layout(type_x)) {
    switch (mmq_get_q8_1_ds_layout(type_src0)) {
        case MMQ_Q8_1_DS_LAYOUT_D4:
            quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
                <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
                <<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
            break;
        case MMQ_Q8_1_DS_LAYOUT_DS4:
            quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_DS4>
                <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
                <<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
            break;
        case MMQ_Q8_1_DS_LAYOUT_D2S6:
            quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D2S6>
                <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
                <<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
            break;
        default:
            GGML_ABORT("fatal error");
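
The reworked quantize_q8_1 kernel reads the source through explicit element strides and writes a contiguous, padded q8_1 destination. A minimal host-side C sketch of the index mapping it uses (the helper names and example numbers are illustrative, not part of the patch):

// Sketch of the index mapping in the reworked quantize_q8_1 kernel
// (host-side C; names and values are illustrative, not part of the patch).
#include <stdint.h>
#include <stdio.h>

#define QK8_1 32 // assumed from ggml-cuda; verify against the headers

int main(void) {
    // destination coordinates (i0 runs over the padded row length ne0)
    const int64_t i0 = 40, i1 = 2, i2 = 0, i3 = 0;
    const int64_t ne0 = 4096, ne1 = 8, ne2 = 1;          // padded dst dims (example)
    const int64_t ne00 = 4000;                           // real (unpadded) row length
    const int64_t s01 = 4000, s02 = 32000, s03 = 32000;  // src element strides

    // contiguous index in the padded destination
    const int64_t i_cont = ((i3*ne2 + i2)*ne1 + i1)*ne0 + i0;
    const int64_t ib  = i_cont / QK8_1; // block index
    const int64_t iqs = i_cont % QK8_1; // quant index inside the block

    // source gather: out-of-range columns (i0 >= ne00) quantize as 0.0f
    const int64_t src_idx = i3*s03 + i2*s02 + i1*s01 + i0;

    printf("ib=%lld iqs=%lld src=%lld in-range=%d\n",
           (long long) ib, (long long) iqs, (long long) src_idx, i0 < ne00);
    return 0;
}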

15 ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh vendored
@ -12,13 +12,16 @@ static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");

typedef void (*quantize_cuda_t)(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
    const ggml_type type_x, cudaStream_t stream);
    const float * x, const int32_t * ids, void * vy,
    ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);

void quantize_row_q8_1_cuda(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
    const ggml_type type_x, cudaStream_t stream);
    const float * x, const int32_t * ids, void * vy,
    ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);

void quantize_mmq_q8_1_cuda(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
    const ggml_type type_x, cudaStream_t stream);
    const float * x, const int32_t * ids, void * vy,
    ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);

@ -1,3 +1,5 @@
#pragma once

#include "common.cuh"
#include <cstdint>

@ -5690,7 +5690,7 @@ kernel void kernel_flash_attn_ext(

    {
        float S[Q] = { [0 ... Q-1] = 0.0f };
        float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
        float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 };

        // thread indices inside the simdgroup
        // TODO: see if we can utilize quad-group functions for better performance

@ -5950,7 +5950,7 @@ kernel void kernel_flash_attn_ext(
        // reduce the warps sequentially
        for (ushort sg = 1; sg < nsg; ++sg) {
            float S = { 0.0f };
            float M = { -__FLT16_MAX__/2 };
            float M = { -__FLT_MAX__/2 };

            threadgroup_barrier(mem_flags::mem_threadgroup);

@ -6197,7 +6197,7 @@ kernel void kernel_flash_attn_ext_vec(

    {
        float S = 0.0f;
        float M = -__FLT16_MAX__/2;
        float M = -__FLT_MAX__/2;

        // thread indices inside the simdgroup
        const short tx = tiisg%NL;
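
These hunks swap the f16 sentinel for the f32 one when initializing the running maximum M of the online-softmax accumulation. Since M is a 32-bit float here, -__FLT_MAX__/2 keeps scores below the f16 range correctly ordered instead of saturating at the sentinel. A hedged C sketch of the idea (illustrative reasoning, not the kernel code):

// Why the running max starts at -FLT_MAX/2: M is a 32-bit float accumulator,
// so the old f16 sentinel (about -32752) could exceed a legitimate attention
// score and freeze the running max at the sentinel (illustrative sketch).
#include <float.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    float M = -FLT_MAX/2; // identity element for the running max in f32
    const float scores[] = { -70000.0f, -65504.0f, -1.0f }; // -65504 is the f16 max magnitude
    for (int i = 0; i < 3; ++i) {
        M = fmaxf(M, scores[i]); // scores below the f16 range are still tracked
    }
    printf("M = %f\n", M);
    return 0;
}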

384 ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m vendored
@ -491,7 +491,259 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_COUNT
};

//
// ggml_metal_heap
//

struct ggml_metal_heap {
    // number of times the heap was unused
    int n_unused;

    // total number of buffer allocations in this heap across all computes
    int64_t n_alloc;

    // current offset in the heap - we reset this after each node in order to reuse the memory
    size_t offs;

    // the currently allocated MTLBuffer objects in this heap
    id<MTLHeap> obj;

    NSMutableArray * bufs;
};

static struct ggml_metal_heap * ggml_metal_heap_init(id<MTLDevice> device, size_t size) {
    struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap));

    MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init];
    desc.storageMode = MTLStorageModePrivate;
    desc.cpuCacheMode = MTLCPUCacheModeDefaultCache;
    desc.type = MTLHeapTypePlacement;
    desc.size = size;

    heap->n_unused = 0;
    heap->n_alloc = 0;

    heap->obj = [device newHeapWithDescriptor:desc];
    if (!heap->obj) {
        GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size);

        free(heap);

        return false;
    }

    [desc release];

    heap->bufs = [[NSMutableArray alloc] init];

    return heap;
}

static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) {
    heap->offs = 0;

    // count how many graph computes the heap ended up being unused
    if ([heap->bufs count] > 0) {
        heap->n_unused = 0;
    } else {
        heap->n_unused++;
    }

    for (id<MTLBuffer> buf in heap->bufs) {
        [buf release];
    }
    [heap->bufs removeAllObjects];

    // tell the OS that it can reuse this memory if needed
    // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc
    [heap->obj setPurgeableState:MTLPurgeableStateVolatile];
}

static void ggml_metal_heap_free(struct ggml_metal_heap * heap) {
    if (heap == nil) {
        return;
    }

    ggml_metal_heap_reset(heap);

    [heap->obj release];
    [heap->bufs release];

    free(heap);
}

@interface ggml_metal_heap_ptr : NSObject

@property (nonatomic, assign) struct ggml_metal_heap * data;

@end

@implementation ggml_metal_heap_ptr
@end

//
// ggml_metal_mem_pool
//

struct ggml_metal_mem_pool {
    id<MTLDevice> device;

    int n_heaps; // total number of heaps ever created (including those that were removed)

    NSMutableArray * heaps;
    NSMutableArray * heaps_to_remove;
};

static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) {
    struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool));

    mem_pool->n_heaps = 0;

    mem_pool->heaps = [[NSMutableArray alloc] init];
    mem_pool->heaps_to_remove = [[NSMutableArray alloc] init];

    return mem_pool;
}

static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) {
    GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps);

    size_t size_all = 0;
    size_t size_cur = 0;

    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
        GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data);
        GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc);
        GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused);
        GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0);
        GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]);

        if ([ptr.data->bufs count] > 0) {
            size_cur += [ptr.data->obj size];
        }
        size_all += [ptr.data->obj size];

        ggml_metal_heap_free(ptr.data);
        [ptr release];
    }
    [mem_pool->heaps release];
    [mem_pool->heaps_to_remove release];

    if (size_all > 0) {
        GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0);
        GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0);
    }

    free(mem_pool);
}

static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {
    for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) {
        ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i];

        struct ggml_metal_heap * heap = ptr.data;
        ggml_metal_heap_reset(heap);

        // if the heap hasn't been used for a while, remove it
        if (heap->n_unused >= 128) {
            [mem_pool->heaps_to_remove addObject:@(i)];
        }
    }

    if (mem_pool->heaps_to_remove.count > 0) {
        for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) {
            NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue];
            ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index];

            struct ggml_metal_heap * heap = ptr.data;
            ggml_metal_heap_free(heap);

            [mem_pool->heaps removeObjectAtIndex:index];
            [ptr release];
        }

        [mem_pool->heaps_to_remove removeAllObjects];
    }
}

static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) {
    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
        ptr.data->offs = 0;
    }
}

static id<MTLBuffer> ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) {
    const size_t alignment = 32;

    const size_t size_aligned = GGML_PAD(size, alignment);

    // try one of the existing heaps
    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
        struct ggml_metal_heap * heap = ptr.data;
        if (heap->offs + size_aligned <= [heap->obj size]) {
            // if this is the first buffer in the heap for the current command buffer, tell the OS that
            // it cannot free the memory used by the heap
            // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc
            if ([heap->bufs count] == 0) {
                [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile];
            }

            id<MTLBuffer> buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs];
            if (buf == nil) {
                GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned);
                return nil;
            }

            heap->n_alloc++;
            heap->offs += size_aligned;

            [heap->bufs addObject:buf];

            return buf;
        }
    }

    // create a new heap that can fit this buffer
    ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new];

    struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned);
    if (heap == NULL) {
        GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned);
        return NULL;
    }

    //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]);

    heap_ptr.data = heap;
    ggml_metal_heap_reset(heap);

    [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile];
    id<MTLBuffer> buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs];
    if (buf == nil) {
        GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned);
        return NULL;
    }

    heap->n_alloc++;
    heap->offs += size_aligned;

    [heap->bufs addObject:buf];

    [mem_pool->heaps addObject:heap_ptr];
    mem_pool->n_heaps++;

    return buf;
}

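ggml_metal_mem_pool_alloc is, at its core, a bump allocator over placement MTLHeap objects. A minimal C model of just the offset bookkeeping (the Metal object creation is deliberately left out; names are illustrative, not part of the patch):

// Bump-allocation bookkeeping of ggml_metal_mem_pool_alloc, modeled in C
// (illustrative; the real code creates id<MTLBuffer> objects inside an MTLHeap).
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define ALIGNMENT 32
#define PAD(x, n) (((x) + (n) - 1) / (n) * (n)) // same rounding as GGML_PAD

typedef struct { size_t offs, size; } heap_state; // hypothetical stand-in for ggml_metal_heap

// on success, *offs_out is the placement offset passed to newBufferWithLength:options:offset:
static bool pool_try_alloc(heap_state * h, size_t size, size_t * offs_out) {
    const size_t size_aligned = PAD(size, (size_t) ALIGNMENT);
    if (h->offs + size_aligned > h->size) {
        return false; // does not fit: the caller creates a new heap of size_aligned
    }
    *offs_out = h->offs;
    h->offs += size_aligned;
    return true;
}

int main(void) {
    heap_state h = { 0, 4096 };
    size_t offs;
    while (pool_try_alloc(&h, 1000, &offs)) { // 1000 rounds up to 1024
        printf("buffer at offset %zu\n", offs);
    }
    return 0; // per-node reuse: ggml_metal_mem_pool_clear() just resets offs to 0
}
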
struct ggml_metal_command_buffer {
    id<MTLCommandBuffer> obj;

    // each command buffer has a memory pool from which it can allocate temporary buffers during the compute
    struct ggml_metal_mem_pool * mem_pool;
};

struct ggml_backend_metal_context {
    id<MTLDevice> device;
    id<MTLCommandQueue> queue;

    dispatch_queue_t d_queue;

@ -516,7 +768,7 @@ struct ggml_backend_metal_context {
    void (^encode_async)(size_t ith);

    // n_cb command buffers + 1 used by the main thread
    id<MTLCommandBuffer> command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
    struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1];

    // abort ggml_metal_graph_compute if callback returns true
    ggml_abort_callback abort_callback;

@ -706,8 +958,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
    struct ggml_backend_metal_device_context * ctx_dev = dev->context;

    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);

    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

    ctx->device = device;
    ctx->queue = [device newCommandQueue];
    if (ctx->queue == nil) {
        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);

@ -769,7 +1023,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
    ctx->gf = nil;
    ctx->encode_async = nil;
    for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
        ctx->command_buffers[i] = nil;
        ctx->cmd_bufs[i].obj = nil;

        ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init();
        ctx->cmd_bufs[i].mem_pool->device = device;
    }

#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)

@ -1183,6 +1440,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {

    [ctx->queue release];

    for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
        // ctx->cmd_bufs[i].obj is auto released

        ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool);
    }

    dispatch_release(ctx->d_queue);

    free(ctx);

@ -1489,10 +1752,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
    }
}

static void ggml_metal_encode_node(
static bool ggml_metal_encode_node(
        ggml_backend_t backend,
        int idx,
        id<MTLComputeCommandEncoder> encoder) {
        id<MTLComputeCommandEncoder> encoder,
        struct ggml_metal_mem_pool * mem_pool) {
    struct ggml_backend_metal_context * ctx = backend->context;
    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;

@ -1508,7 +1772,7 @@ static void ggml_metal_encode_node(
    struct ggml_tensor * dst = node;

    if (ggml_is_empty(dst)) {
        return;
        return true;
    }

    switch (dst->op) {

@ -1519,7 +1783,7 @@ static void ggml_metal_encode_node(
        case GGML_OP_PERMUTE:
            {
                // noop -> next node
            } return;
            } return true;
        default:
            {
            } break;

@ -1530,6 +1794,8 @@ static void ggml_metal_encode_node(
        GGML_ABORT("unsupported op");
    }

    ggml_metal_mem_pool_clear(mem_pool);

    const int64_t ne00 = src0 ? src0->ne[0] : 0;
    const int64_t ne01 = src0 ? src0->ne[1] : 0;
    const int64_t ne02 = src0 ? src0->ne[2] : 0;

@ -2176,6 +2442,56 @@ static void ggml_metal_encode_node(
                const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
                const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

                // use this branch to test the ggml_metal_mem_pool functionality
#if 0
                // cpy to tmp buffer in MTLHeap

                id<MTLBuffer> h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0));
                if (!h_src0) {
                    GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0));
                    return false;
                }

                offs_src0 = 0;

                ggml_metal_kargs_cpy args_cpy = {
                    /*.ne00 =*/ ne00,
                    /*.ne01 =*/ ne01,
                    /*.ne02 =*/ ne02,
                    /*.ne03 =*/ ne03,
                    /*.nb00 =*/ nb00,
                    /*.nb01 =*/ nb01,
                    /*.nb02 =*/ nb02,
                    /*.nb03 =*/ nb03,
                    /*.ne0 =*/ ne00,
                    /*.ne1 =*/ ne01,
                    /*.ne2 =*/ ne02,
                    /*.ne3 =*/ ne03,
                    /*.nb0 =*/ nb00,
                    /*.nb1 =*/ nb01,
                    /*.nb2 =*/ nb02,
                    /*.nb3 =*/ nb03,
                };

                if (src0->type == GGML_TYPE_F16) {
                    [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline];
                } else {
                    [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline];
                }
                [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0];
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
                [encoder setBuffer:h_src0 offset:0 atIndex:2];

                GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
                int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type));

                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)];

#else
                id<MTLBuffer> h_src0 = id_src0;
#endif
                // softmax

                ggml_metal_kargs_soft_max args = {
                    /*.ne00 =*/ ne00,
                    /*.ne01 =*/ ne01,

@ -2188,11 +2504,11 @@ static void ggml_metal_encode_node(
                };

                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:h_src0 offset:offs_src0 atIndex:0];
                if (id_src1) {
                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                } else {
                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
                    [encoder setBuffer:h_src0 offset:offs_src0 atIndex:1];
                }
                [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                [encoder setBytes:&args length:sizeof(args) atIndex:3];

@ -4634,6 +4950,8 @@ static void ggml_metal_encode_node(
                GGML_ABORT("fatal error");
            }
    }

    return true;
}

static enum ggml_status ggml_metal_graph_compute(

@ -4687,25 +5005,25 @@ static enum ggml_status ggml_metal_graph_compute(
    }

    // the main thread commits the first few commands immediately
    // command_buffer[n_cb]
    // cmd_buf[n_cb]
    {
        id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
        ctx->command_buffers[n_cb] = command_buffer;
        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
        ctx->cmd_bufs[n_cb].obj = cmd_buf;

        [command_buffer enqueue];
        [cmd_buf enqueue];
        ctx->encode_async(n_cb);
    }

    // prepare the rest of the command buffers asynchronously
    // command_buffer[0.. n_cb)
    // cmd_buf[0.. n_cb)
    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
        id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
        ctx->command_buffers[cb_idx] = command_buffer;
        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
        ctx->cmd_bufs[cb_idx].obj = cmd_buf;

        // always enqueue the first two command buffers
        // enqueue all of the command buffers if we don't need to abort
        if (cb_idx < 2 || ctx->abort_callback == NULL) {
            [command_buffer enqueue];
            [cmd_buf enqueue];
        }
    }

@ -4714,14 +5032,14 @@ static enum ggml_status ggml_metal_graph_compute(
    // wait for completion and check status of each command buffer
    // needed to detect if the device ran out-of-memory for example (#1881)
    {
        id<MTLCommandBuffer> command_buffer = ctx->command_buffers[n_cb];
        [command_buffer waitUntilCompleted];
        id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
        [cmd_buf waitUntilCompleted];

        MTLCommandBufferStatus status = [command_buffer status];
        MTLCommandBufferStatus status = [cmd_buf status];
        if (status != MTLCommandBufferStatusCompleted) {
            GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
            if (status == MTLCommandBufferStatusError) {
                GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
                GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
            }

            return GGML_STATUS_FAILED;

@ -4729,20 +5047,20 @@ static enum ggml_status ggml_metal_graph_compute(
    }

    for (int i = 0; i < n_cb; ++i) {
        id<MTLCommandBuffer> command_buffer = ctx->command_buffers[i];
        [command_buffer waitUntilCompleted];
        id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
        [cmd_buf waitUntilCompleted];

        MTLCommandBufferStatus status = [command_buffer status];
        MTLCommandBufferStatus status = [cmd_buf status];
        if (status != MTLCommandBufferStatusCompleted) {
            GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
            if (status == MTLCommandBufferStatusError) {
                GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
                GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
            }

            return GGML_STATUS_FAILED;
        }

        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil);
        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
        if (!next_buffer) {
            continue;
        }

@ -5126,8 +5444,9 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {

        const int n_nodes_per_cb = ctx->n_nodes_per_cb;

        id<MTLCommandBuffer> command_buffer = ctx->command_buffers[cb_idx];
        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
        id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[cb_idx].obj;

        id<MTLComputeCommandEncoder> encoder = [cmd_buf computeCommandEncoder];

        int node_start = 0;
        int node_end = n_nodes_0;

@ -5139,22 +5458,29 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {

        const bool should_capture = ctx->capture_next_compute;

        struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool;
        ggml_metal_mem_pool_reset(mem_pool);

        for (int idx = node_start; idx < node_end; ++idx) {
            if (should_capture) {
                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
            }

            ggml_metal_encode_node(backend, idx, encoder);
            const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);

            if (should_capture) {
                [encoder popDebugGroup];
            }

            if (!res) {
                break;
            }
        }

        [encoder endEncoding];

        if (cb_idx < 2 || ctx->abort_callback == NULL) {
            [command_buffer commit];
            [cmd_buf commit];
        }
    });
}

@ -3237,7 +3237,7 @@ kernel void kernel_flash_attn_ext(

    {
        float S[Q] = { [0 ... Q-1] = 0.0f };
        float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
        float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 };

        // thread indices inside the simdgroup
        // TODO: see if we can utilize quad-group functions for better performance

@ -3497,7 +3497,7 @@ kernel void kernel_flash_attn_ext(
        // reduce the warps sequentially
        for (ushort sg = 1; sg < nsg; ++sg) {
            float S = { 0.0f };
            float M = { -__FLT16_MAX__/2 };
            float M = { -__FLT_MAX__/2 };

            threadgroup_barrier(mem_flags::mem_threadgroup);

@ -3744,7 +3744,7 @@ kernel void kernel_flash_attn_ext_vec(

    {
        float S = 0.0f;
        float M = -__FLT16_MAX__/2;
        float M = -__FLT_MAX__/2;

        // thread indices inside the simdgroup
        const short tx = tiisg%NL;

104 ml/backend/ggml/ggml/src/ggml.c vendored
@ -4,6 +4,7 @@
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-threading.h"
#include "ggml-cpu.h"
#include "ggml.h"

// FIXME: required here for quantization functions

@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
    }
}

// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
// currently, the ggml_cpu_has_* functions are entirely compile-time
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
    int64_t i = 0;
#if defined(__F16C__)
    //if (ggml_cpu_has_f16c()) {
        for (; i + 7 < n; i += 8) {
            __m256 x_vec = _mm256_loadu_ps(x + i);
            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
            _mm_storeu_si128((__m128i *)(y + i), y_vec);
        }
        for(; i + 3 < n; i += 4) {
            __m128 x_vec = _mm_loadu_ps(x + i);
            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
            _mm_storel_epi64((__m128i *)(y + i), y_vec);
        }
    //}
#endif
    for (; i < n; i++) {
    int i = 0;
    for (; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(x[i]);
    }
}

void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
    int64_t i = 0;
#if defined(__AVX512F__)
    //if (ggml_cpu_has_avx512()) {
        for (; i + 16 <= n; i += 16) {
            _mm512_storeu_ps(y + i,
                _mm512_castsi512_ps(
                    _mm512_slli_epi32(
                        _mm512_cvtepu16_epi32(
                            _mm256_loadu_si256(
                                (const __m256i *)(x + i))),
                        16)));
        }
    //}
#endif
#if defined(__AVX2__)
    //if (ggml_cpu_has_avx2()) {
        for (; i + 8 <= n; i += 8) {
            _mm256_storeu_ps(y + i,
                _mm256_castsi256_ps(
                    _mm256_slli_epi32(
                        _mm256_cvtepu16_epi32(
                            _mm_loadu_si128(
                                (const __m128i *)(x + i))),
                        16)));
        }
    //}
#endif
    for (; i < n; i++) {
    int i = 0;
    for (; i < n; ++i) {
        y[i] = GGML_BF16_TO_FP32(x[i]);
    }
}

@ -956,6 +915,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CONV_TRANSPOSE_1D",
    "IM2COL",
    "IM2COL_BACK",
    "CONV_2D_DW",
    "CONV_TRANSPOSE_2D",
    "POOL_1D",
    "POOL_2D",

@ -994,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "OPT_STEP_ADAMW",
};

static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",

@ -1051,6 +1011,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "conv_transpose_1d(x)",
    "im2col(x)",
    "im2col_back(x)",
    "conv_2d_dw(x)",
    "conv_transpose_2d(x)",
    "pool_1d(x)",
    "pool_2d(x)",

@ -1089,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "adamw(x)",
};

static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@ -1346,6 +1307,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
    return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
}

bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
    return
        tensor->nb[0] > tensor->nb[2] &&
        tensor->nb[1] > tensor->nb[0] &&
        tensor->nb[2] == ggml_type_size(tensor->type);
}
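
ggml_is_contiguous_channels() recognizes the CWHN layout used by the new direct depthwise path: channels are the innermost dimension, so nb[2] equals the element size while nb[0] skips a whole pixel's worth of channels. A worked C example with made-up sizes:

// CWHN stride layout for an f32 tensor with ne = {W=4, H=3, C=2, N=1}
// (example numbers are mine, not from the patch).
#include <stdio.h>

int main(void) {
    const long type_size = 4;          // sizeof(float)
    const long ne[4] = { 4, 3, 2, 1 }; // W, H, C, N
    long nb[4];
    nb[2] = type_size;        // channel stride: channels are contiguous
    nb[0] = ne[2] * nb[2];    // width stride skips one pixel's channels
    nb[1] = ne[0] * nb[0];    // row stride
    nb[3] = ne[1] * nb[1];    // batch stride
    // satisfies ggml_is_contiguous_channels:
    //   nb[0] > nb[2] && nb[1] > nb[0] && nb[2] == type_size
    printf("nb = {%ld, %ld, %ld, %ld}\n", nb[0], nb[1], nb[2], nb[3]);
    return 0;
}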

static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

@ -4052,6 +4020,46 @@ struct ggml_tensor * ggml_conv_2d_dw(
    return result;
}

// ggml_conv_2d_dw_direct

struct ggml_tensor * ggml_conv_2d_dw_direct(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
        int stride0,
        int stride1,
        int pad0,
        int pad1,
        int dilation0,
        int dilation1) {
    GGML_ASSERT(a->ne[2] == 1);
    GGML_ASSERT(a->ne[3] == b->ne[2]);
    int64_t ne[4];
    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
    ne[2] = b->ne[2];
    ne[3] = b->ne[3];

    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);

    if (ggml_is_contiguous_channels(b)) {
        // Result will be permuted the same way as input (CWHN order)
        const int64_t type_size = ggml_type_size(result->type);
        GGML_ASSERT(ggml_blck_size(result->type) == 1);
        result->nb[0] = result->ne[2] * type_size;
        result->nb[1] = result->ne[0] * result->nb[0];
        result->nb[2] = type_size;
    }

    int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
    ggml_set_op_params(result, params, sizeof(params));

    result->op = GGML_OP_CONV_2D_DW;
    result->src[0] = a;
    result->src[1] = b;
    return result;
}
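
ggml_conv_2d_dw_direct derives each spatial output extent via ggml_calc_conv_output_size. Assuming the standard formula ggml uses for convolutions, floor((in + 2*pad - dilation*(kernel-1) - 1)/stride) + 1 (an assumption; the function body is not part of this hunk), a quick sanity check with example values:

// Sketch of the conv output-size arithmetic (assumed formula; example values are mine).
#include <stdio.h>

static long conv_out(long ins, long ks, int s, int p, int d) {
    return (ins + 2*p - d*(ks - 1) - 1) / s + 1;
}

int main(void) {
    // e.g. 64x64 input, 3x3 depthwise kernel, stride 2, pad 1, dilation 1 -> 32x32
    printf("%ld\n", conv_out(64, 3, 2, 1, 1));
    return 0;
}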

// ggml_conv_transpose_2d_p0

static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {