diff --git a/.gitattributes b/.gitattributes
index 4bcd95b0b..b1279845b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -15,6 +15,10 @@ ml/backend/**/*.cu linguist-vendored
 ml/backend/**/*.cuh linguist-vendored
 ml/backend/**/*.m linguist-vendored
 ml/backend/**/*.metal linguist-vendored
+ml/backend/**/CMakeLists.txt linguist-vendored
+
+llama/build-info.cpp linguist-generated
+ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated
 
 * text=auto
 *.go text eol=lf
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index f8e1cadf4..8af8812fb 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -163,5 +163,5 @@ jobs:
       - uses: actions/checkout@v4
       - name: Verify patches apply cleanly and do not change files
        run: |
-          make -f Makefile.sync clean checkout sync
+          make -f Makefile.sync clean sync
           git diff --compact-summary --exit-code
diff --git a/Makefile.sync b/Makefile.sync
index 3001487de..007282746 100644
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -15,7 +15,11 @@ help:
 	@echo "  make -f $(lastword $(MAKEFILE_LIST)) clean sync"
 
 .PHONY: sync
-sync: llama/llama.cpp ml/backend/ggml/ggml apply-patches
+sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml apply-patches
+
+.PHONY: llama/build-info.cpp
+llama/build-info.cpp: llama/build-info.cpp.in
+	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@
 
 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/ apply-patches
diff --git a/llama/build-info.cpp b/llama/build-info.cpp
index b2c1dba73..e169b926b 100644
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "ba1cb19cdd0d92e012e0f6e009e0620f854b6afd";
+char const *LLAMA_COMMIT = "46e3556e01b824e52395fb050b29804b6cff2a7c";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
diff --git a/llama/build-info.cpp.in b/llama/build-info.cpp.in
new file mode 100644
index 000000000..07cbd0e4d
--- /dev/null
+++ b/llama/build-info.cpp.in
@@ -0,0 +1,4 @@
+int LLAMA_BUILD_NUMBER = 0;
+char const *LLAMA_COMMIT = "@FETCH_HEAD@";
+char const *LLAMA_COMPILER = "";
+char const *LLAMA_BUILD_TARGET = "";
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
deleted file mode 100755
index d7874e6ea..000000000
--- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-
-from glob import glob
-import os
-
-TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_F16"]
-
-SOURCE_FATTN_VEC = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../fattn-vec-f{vkq_size}.cuh"
-
-DECL_FATTN_VEC_F{vkq_size}_CASE({head_size}, {type_k}, {type_v});
-"""
-
-SOURCE_FATTN_WMMA_START = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../fattn-wmma-f16.cuh"
-
-"""
-
-SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}, {kq_acc_t});\n"
-
-TYPES_MMQ = [
-    "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
-    "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
-    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
-    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
-]
-
-SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../mmq.cuh"
-
-DECL_MMQ_CASE({type});
-"""
-
-
-def get_short_name(long_quant_name):
-    return long_quant_name.replace("GGML_TYPE_", "").lower()
-
-
-def get_head_sizes(type_k, type_v):
-    if type_k == "GGML_TYPE_F16" and type_v == "GGML_TYPE_F16":
-        return [64, 128, 256]
-    if type_k == "GGML_TYPE_F16":
-        return [64, 128]
-    return [128]
-
-
-for filename in glob("*.cu"):
-    os.remove(filename)
-
-for vkq_size in [16, 32]:
-    for type_k in TYPES_KV:
-        for type_v in TYPES_KV:
-            for head_size in get_head_sizes(type_k, type_v):
-                with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
-                    f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))
-
-for kq_acc_t in ["half", "float"]:
-    for cols_per_block in [8, 16, 32]:
-        if kq_acc_t == "float" and cols_per_block == 8:
-            continue
-
-        with open(f"fattn-wmma-f16-instance-kq{kq_acc_t}-cpb{cols_per_block}.cu", "w") as f:
-            f.write(SOURCE_FATTN_WMMA_START)
-
-            for head_size in [64, 80, 96, 112, 128, 256]:
-                if cols_per_block == 8 and head_size % 32 != 0: # wmma fragment is 8x32
-                    continue
-                if kq_acc_t == "float" and cols_per_block == 32 and head_size == 256: # register spilling, bad performance
-                    continue
-                f.write(SOURCE_FATTN_WMMA_CASE.format(kq_acc_t=kq_acc_t, cols_per_block=cols_per_block, head_size=head_size))
-
-for type in TYPES_MMQ:
-    with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
-        f.write(SOURCE_MMQ.format(type=type))