Skip to content

Commit

Permalink
CUDA: MMQ code deduplication + iquant support (ggerganov#8495)
Browse files Browse the repository at this point in the history
* CUDA: MMQ code deduplication + iquant support

* 1 less parallel job for CI build
  • Loading branch information
JohannesGaessler authored Jul 20, 2024
1 parent 07283b1 commit 69c487f
Show file tree
Hide file tree
Showing 11 changed files with 809 additions and 648 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,7 @@ jobs:
mkdir build
cd build
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
- name: Determine tag name
id: tag
Expand Down
24 changes: 24 additions & 0 deletions ggml/src/ggml-cuda/mmq.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
case GGML_TYPE_Q6_K:
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_XXS:
mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_XS:
mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_S:
mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ3_XXS:
mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
break;
case GGML_TYPE_IQ3_S:
mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ1_S:
mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ4_XS:
mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
break;
Expand Down Expand Up @@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL:
mmq_supported = true;
Expand Down
1,377 changes: 731 additions & 646 deletions ggml/src/ggml-cuda/mmq.cuh

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion ggml/src/ggml-cuda/template-instances/generate_cu_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
TYPES_MMQ = [
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
"GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
]

SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
Expand Down
5 changes: 5 additions & 0 deletions ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
5 changes: 5 additions & 0 deletions ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
5 changes: 5 additions & 0 deletions ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
5 changes: 5 additions & 0 deletions ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
5 changes: 5 additions & 0 deletions ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
5 changes: 5 additions & 0 deletions ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
21 changes: 21 additions & 0 deletions ggml/src/ggml-cuda/vecdotq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
}

template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
const int * v, const int * u, const float * d8_0, const float & d8_1) {

float sumf = 0.0f;

#pragma unroll
for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
int sumi = 0;

#pragma unroll
for (int i = i0; i < i0 + QI8_0/2; ++i) {
// SIMD dot product of quantized values
sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
}

sumf += d8_0[i0/(QI8_0/2)]*sumi;
}

return d8_1*sumf;
}

#define VDR_Q2_K_Q8_1_MMVQ 1
#define VDR_Q2_K_Q8_1_MMQ 4

Expand Down

0 comments on commit 69c487f

Please sign in to comment.