mirror of
https://github.com/ollama/ollama.git
synced 2025-05-11 02:16:36 +02:00
103 lines
5 KiB
Diff
103 lines
5 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Saman <saman.khatir@amd.com>
|
|
Date: Wed, 19 Mar 2025 14:02:26 -0700
|
|
Subject: [PATCH] add rdna4 support
|
|
|
|
---
|
|
ggml/src/ggml-cuda/common.cuh | 6 ++++--
|
|
ggml/src/ggml-cuda/mmq.cu | 2 +-
|
|
ggml/src/ggml-cuda/mmq.cuh | 4 ++--
|
|
ggml/src/ggml-cuda/mmvq.cu | 4 ++--
|
|
ggml/src/ggml-cuda/vendors/hip.h | 4 ++++
|
|
5 files changed, 13 insertions(+), 7 deletions(-)
|
|
|
|
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
|
|
index adf0d3ec..b24593fc 100644
|
|
--- a/ggml/src/ggml-cuda/common.cuh
|
|
+++ b/ggml/src/ggml-cuda/common.cuh
|
|
@@ -61,11 +61,13 @@
|
|
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
|
|
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
|
|
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
|
|
+#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
|
|
|
|
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
|
|
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
|
|
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
|
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
|
|
+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
|
|
+#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
|
|
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
|
|
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
|
|
|
|
@@ -386,7 +388,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
|
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
|
|
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
|
-#elif defined(RDNA3)
|
|
+#elif defined(RDNA3) || defined(RDNA4)
|
|
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
|
#elif defined(__gfx1010__) || defined(__gfx900__)
|
|
int tmp1;
|
|
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
|
|
index 10f2ebb1..933d945c 100644
|
|
--- a/ggml/src/ggml-cuda/mmq.cu
|
|
+++ b/ggml/src/ggml-cuda/mmq.cu
|
|
@@ -149,5 +149,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
|
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
|
}
|
|
|
|
- return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
|
+ return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
|
}
|
|
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
|
index 0451c65f..66ce2bc9 100644
|
|
--- a/ggml/src/ggml-cuda/mmq.cuh
|
|
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
|
@@ -2577,9 +2577,9 @@ static __device__ void mul_mat_q_process_tile(
|
|
|
|
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
|
-#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
|
+#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
|
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
|
-#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
|
+#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
|
#else
|
|
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
|
__launch_bounds__(WARP_SIZE*nwarps, 1)
|
|
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
|
|
index 4fb466ca..23ae7abc 100644
|
|
--- a/ggml/src/ggml-cuda/mmvq.cu
|
|
+++ b/ggml/src/ggml-cuda/mmvq.cu
|
|
@@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
|
|
|
|
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
|
|
|
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
|
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4))
|
|
constexpr int nwarps = 1;
|
|
constexpr int rows_per_cuda_block = 1;
|
|
#else
|
|
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
|
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
|
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
|
|
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) && !defined(RDNA4)
|
|
|
|
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
|
const int row0 = rows_per_cuda_block*blockIdx.x;
|
|
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
|
|
index 81964611..a62544b5 100644
|
|
--- a/ggml/src/ggml-cuda/vendors/hip.h
|
|
+++ b/ggml/src/ggml-cuda/vendors/hip.h
|
|
@@ -150,6 +150,10 @@
|
|
#define CDNA
|
|
#endif
|
|
|
|
+#if defined(__gfx1200__) || defined(__gfx1201__)
|
|
+#define RDNA4
|
|
+#endif
|
|
+
|
|
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
|
defined(__gfx1150__) || defined(__gfx1151__)
|
|
#define RDNA3
|