From be58c2b64b5fda88f64584eaa78c43b651e31fbe Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 09:56:30 +0800 Subject: [PATCH 01/13] support q8 sparse operator for CPU --- ggml-cuda.cu | 2 +- ggml-quants.c | 20 ++++++ ggml-quants.h | 1 + ggml.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 195 insertions(+), 7 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f64bf8d5..91175908 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9271,7 +9271,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func(tensor->src[0], tensor->src[1], tensor); - // CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaDeviceSynchronize()); return true; } diff --git a/ggml-quants.c b/ggml-quants.c index b563aaa6..ddf443ae 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2422,6 +2422,26 @@ static inline __m128i get_scale_shuffle(int i) { return _mm_loadu_si128((const __m128i*)k_shuffle + i); } #endif +void ggml_axpy_q8_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { + const int qk = QK8_0; + const int nb = n / qk; + assert(n % qk == 0); + assert(nb % 2 == 0); + + const block_q8_0 * restrict x = vx; + + float *res = (float *)vz; + float scale_fp32 = GGML_FP16_TO_FP32(scale); + for (int i = 0; i < nb; i++) { + float result_scale = GGML_FP16_TO_FP32(x[i].d) * scale_fp32; + int offset = i * QK8_0; + + for (int j = 0; j < qk; ++j) { + const int v0 = (x[i].qs[j]); + res[offset + j] = res[offset + j] + ((float)(v0 * (int)alpha) * result_scale); + } + } +} void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { const int qk = QK8_0; diff --git a/ggml-quants.h b/ggml-quants.h index 6308f483..07b02b98 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -210,6 +210,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); +void ggml_axpy_q8_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); // Dot product diff --git a/ggml.c b/ggml.c index ee4eadc9..0f84a7e7 100644 --- a/ggml.c +++ b/ggml.c @@ -130,7 +130,7 @@ void ggml_print_backtrace(void) { } #endif -// #define GGML_PERF +#define GGML_PERF #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 @@ -14601,7 +14601,160 @@ static void ggml_compute_forward_mul_mat_axpy( _freea(vec); #endif } +static void ggml_compute_forward_mul_mat_axpy_q8_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + // const bool src1_cont = ggml_is_contiguous(src1); + + // ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + const float threshold = 
sparse_pred_threshold; + + // GGML_ASSERT(ne0 == ne01); + // GGML_ASSERT(ne1 == ne11); + // GGML_ASSERT(ne2 == ne12); + // GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + // const int64_t r2 = ne12/ne02; + // const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + if (params->type == GGML_TASK_INIT) { + ggml_set_zero(dst); + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + atomic_store(params->aic, 0); + + return; + } + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + ggml_fp16_t* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + struct ggml_tensor *src2 = dst->src[2]; + + // parallelize by src0 rows + // const int64_t dr = (src2->ne[0] + 8*nth - 1)/(8*nth); + const int64_t dr = (src2->ne[0] + nth - 1)/(nth); + const int nr = ggml_nrows(src0); + + const int64_t ir10 = dr*ith; + // const int64_t ir11 = MIN(ir10 + dr, src2->ne[0]); + + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; + float *idx = src2->data; + int idx_row_size = src2->nb[1]; + int *gid = (int *)(dst->src[3]->data); + // printf("down %d up %d ne00 %d\n", ir10, ir11, ne00); + +#if defined(_MSC_VER) + float* vec = (float *)_malloca(ne00 * 4 * sizeof(float)); +#else + float vec[ne00*4]; +#endif + void *vy = vec; + char* src0_row = (char *) src0->data; + for (int col_idx = 0; col_idx < nr1; col_idx++) { + // const block_q8_0 * restrict nerual = wdata; + const block_q8_0 *restrict nerual = (block_q8_0 *)((char *)wdata + col_idx * row_size); + idx = (float *)((char *)src2->data + col_idx * idx_row_size); + memset(vy, 0, ne00 * 4); + // while(true) { + // const int ir0 = atomic_fetch_add(params->aic, dr); + for (int64_t ir1 = ir10; ir1 < ir10 + dr; ir1++) + { + if (ir1 >= nr) + break; + if (gid[ir1] == 1) + continue; + if (idx[ir1] < threshold) + continue; + int bid = ir1 / QK8_0; + int qsid = ir1 % QK8_0; + int b = (int)nerual[bid].qs[qsid]; + if (b == 0) + continue; + ggml_fp16_t d = nerual[bid].d; + ggml_axpy_q8_0_q8_0(ne00, src0_row + nb01 * ir1, vy, vy, b, d); + } + // if (ir0 + dr >= nr) + // break; + // } + + // float *res = (float *)(dst->data); + float *res = (float *)((char *)(dst->data) + col_idx * nb1); + float *tmp = (float *)vy; + int i; + + // 计算剩余的元素个数 + int remainder = ne00 % 8; +#if defined(__AVX2__) + // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) + { + __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 + __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 + _mm256_storeu_ps(res + i, result); // 存储结果到res中 + } + + // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) + { + res[i] += tmp[i]; + } +#else + for (i = 0; i < ne00; i++) { + res[i] += tmp[i]; + } +#endif + } +#if 
defined(_MSC_VER) + _freea(vec); +#endif +} static void ggml_compute_forward_mul_mat_axpy_q4_0( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -15058,13 +15211,27 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_mul_mat_axpy_dense(params, tensor->src[0], tensor->src[1], tensor); } else if (src3 != NULL){ - if (tensor->src[0]->type != GGML_TYPE_Q4_0) { - ggml_compute_forward_mul_mat_axpy(params, tensor->src[0], tensor->src[1], tensor); - } - else { + switch (tensor->src[0]->type) + { + case GGML_TYPE_Q4_0: ggml_compute_forward_mul_mat_axpy_q4_0(params, tensor->src[0], tensor->src[1], tensor); - + break; + case GGML_TYPE_Q8_0: + ggml_compute_forward_mul_mat_axpy_q8_0(params, tensor->src[0], tensor->src[1], tensor); + break; + case GGML_TYPE_F16: + ggml_compute_forward_mul_mat_axpy(params, tensor->src[0], tensor->src[1], tensor); + break; + default: + printf("We do not support type %d\n", tensor->src[0]->type); + break; } + // if (tensor->src[0]->type != GGML_TYPE_Q4_0) { + // ggml_compute_forward_mul_mat_axpy(params, tensor->src[0], tensor->src[1], tensor); + // } + // else { + // ggml_compute_forward_mul_mat_axpy_q4_0(params, tensor->src[0], tensor->src[1], tensor); + // } } else { ggml_compute_forward_mul_mat_axpy_head(params, tensor->src[0], tensor->src[1], tensor); From 6d83a8b357c2971fb0204f305531fbb1f0598b4b Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 10:48:44 +0800 Subject: [PATCH 02/13] add avx2 for q8 --- ggml-quants.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/ggml-quants.c b/ggml-quants.c index ddf443ae..a1573016 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2429,7 +2429,62 @@ void ggml_axpy_q8_0_q8_0(const int n, const void * restrict vx, const void * res assert(nb % 2 == 0); const block_q8_0 * restrict x = vx; +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + __m256i alpha_v = _mm256_set1_epi16((short)alpha); + // Main loop + for (int i = 0; i < nb; ++i) { + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(scale) ); + __m256i bx = _mm256_loadu_si256((void const*)((char *)&x[i].qs[0])); + //16个数计算 + __m128i m_a = _mm256_extracti128_si256(bx, 0); + __m256i m_x = _mm256_cvtepi8_epi16(m_a); //16 elements + m_x = _mm256_mullo_epi16(m_x, alpha_v); + __m128i x_0 = _mm256_extracti128_si256(m_x, 0); + __m256i x0_32 = _mm256_cvtepi16_epi32(x_0); + __m256 fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + + + __m256 by = _mm256_loadu_ps((const __m256 *)((char *)vy+i*128)); + + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)((char*)vz + i*128), by); + //second phase + x_0 = _mm256_extracti128_si256(m_x, 1); + x0_32 = _mm256_cvtepi16_epi32(x_0); + fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + by = _mm256_loadu_ps((const __m256 *)((char*)vy+i*128+32)); + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)((char*)vz + i*128+32), by); + + //third phase + m_a = _mm256_extracti128_si256(bx, 1); + m_x = _mm256_cvtepi8_epi16(m_a); + m_x = _mm256_mullo_epi16(m_x, alpha_v); + x_0 = _mm256_extracti128_si256(m_x, 0); + x0_32 = _mm256_cvtepi16_epi32(x_0); + fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + by = _mm256_loadu_ps((const __m256 *)((char*)vy+i*128+64)); + + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)((char*)vz + i*128+64), by); + + //fourth phase + x_0 = 
_mm256_extracti128_si256(m_x, 1); + x0_32 = _mm256_cvtepi16_epi32(x_0); + fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + by = _mm256_loadu_ps((const __m256 *)((char*)vy+i*128+96)); + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)((char*)vz + i*128+96), by); + } +#else float *res = (float *)vz; float scale_fp32 = GGML_FP16_TO_FP32(scale); for (int i = 0; i < nb; i++) { @@ -2441,6 +2496,7 @@ void ggml_axpy_q8_0_q8_0(const int n, const void * restrict vx, const void * res res[offset + j] = res[offset + j] + ((float)(v0 * (int)alpha) * result_scale); } } +#endif } void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { From b5c8c08b824d43d720570f64aa0b2238b771bdcf Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 12:16:21 +0800 Subject: [PATCH 03/13] support q8 --- ggml-cuda.cu | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 91175908..800dde3f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5330,6 +5330,16 @@ static void dequantize_mul_mat_batch_q4_0_cuda_sparse(const void * vx, const dfl dequantize_mul_mat_batch_sparse <<>>(vx, y, dst, ncols, nrows, src1_ncols, dst_ne0, lst, idx); +} +static void dequantize_mul_mat_batch_q8_0_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_ncols, int dst_ne0, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + + dequantize_mul_mat_batch_sparse + <<>>(vx, y, dst, ncols, nrows, src1_ncols, dst_ne0, lst, idx); + } static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { @@ -5466,6 +5476,14 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows, lst, idx); } +static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream, const int *lst, const float *idx) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows, lst, idx); +} static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { @@ -5596,6 +5614,16 @@ static void dequantize_axpy_vec_q4_0_cuda(const void * vx, const dfloat * y, flo dequantize_mul_mat_axpy <<>>(vx, y, dst, ncols, nrows); } +static void dequantize_axpy_sparse_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + // dequantize_mul_mat_axpy + // <<>>(vx, y, dst, ncols, nrows); + dequantize_mul_mat_axpy_sparse + <<>>(vx, y, dst, ncols, nrows, lst, idx); +} static void dequantize_axpy_sparse_vec_q4_0_cuda(const void * vx, const dfloat * 
y, float * dst, const int ncols, const int nrows, cudaStream_t stream, int *lst, float *idx) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -5607,6 +5635,8 @@ static void dequantize_axpy_sparse_vec_q4_0_cuda(const void * vx, const dfloat * <<>>(vx, y, dst, ncols, nrows, lst, idx); } + + static void dequantize_axpy_sparse_batch_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -5615,6 +5645,14 @@ static void dequantize_axpy_sparse_batch_q4_0_cuda(const void * vx, const dfloat dequantize_mul_mat_axpy_sparse_batch <<>>(vx, y, dst, ncols, nrows, src1_rows, src1_ncols, lst, idx); } +static void dequantize_axpy_sparse_batch_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy_sparse_batch + <<>>(vx, y, dst, ncols, nrows, src1_rows, src1_ncols, lst, idx); +} static void convert_axpy_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -6951,6 +6989,9 @@ inline void ggml_cuda_op_mul_mat_batch_sparse( case GGML_TYPE_Q4_0: dequantize_mul_mat_batch_q4_0_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_batch_q8_0_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); + break; case GGML_TYPE_F16: convert_mul_mat_batch_f16_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); break; @@ -7199,6 +7240,12 @@ inline void ggml_cuda_op_mul_mat_vec_sparse_q( row_lookup, sparse_idx ); break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream, + row_lookup, sparse_idx + ); + break; default: GGML_ASSERT(false && "Unsupported type"); break; @@ -7563,6 +7610,15 @@ inline void ggml_cuda_op_dequantize_axpy( dequantize_axpy_sparse_batch_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, src1_ncols, stream, row_lookup, sparse_idx); } break; + case GGML_TYPE_Q8_0: + if (sparse_idx == NULL) { + dequantize_axpy_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } else if (ne11 == 1) { + dequantize_axpy_sparse_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream, row_lookup, sparse_idx); + } else { + dequantize_axpy_sparse_batch_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, src1_ncols, stream, row_lookup, sparse_idx); + } + break; case GGML_TYPE_F16: if (sparse_idx == NULL) { convert_axpy_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); @@ -8723,7 +8779,9 @@ static void ggml_cuda_mul_mat_sparse(const ggml_tensor * src0, const ggml_tensor ggml_cuda_op_mul_mat(src0, src1, dst, 
ggml_cuda_op_mul_mat_vec_sparse_dequantized, false); break; case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_sparse_q, true); + // ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); break; default: GGML_ASSERT(false && "unsupported type for sparse matrix multiplication"); From ce5464d45407ba1ea08ec5218d549364562909b7 Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 15:42:14 +0800 Subject: [PATCH 04/13] support q4_1 in cpu scalar --- ggml-quants.c | 32 +++++++++- ggml-quants.h | 1 + ggml.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 1 deletion(-) diff --git a/ggml-quants.c b/ggml-quants.c index a1573016..1019eb91 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -5,7 +5,7 @@ #include #include #include - +#include #ifdef __ARM_NEON // if YCM cannot find , make a symbolic link to it, for example: @@ -2583,6 +2583,36 @@ void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * res } #endif } +void ggml_axpy_q4_1_q8_1(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { + const int qk = QK8_1; + const int nb = n / qk; + assert(n % qk == 0); + assert(nb % 2 == 0); + + const block_q4_1 * restrict x = vx; + + float *res = (float *)vz; + float scale_fp32 = GGML_FP16_TO_FP32(scale); + for (int i = 0; i < nb; i++) { + float result_scale = GGML_FP16_TO_FP32(x[i].d) * scale_fp32; + int offset = i * QK4_1; + float m = GGML_FP16_TO_FP32(x[i].m); + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F); + const int v1 = (x[i].qs[j] >> 4); + float x0 = v0*GGML_FP16_TO_FP32(x[i].d) + m; + float x1 = v1*GGML_FP16_TO_FP32(x[i].d) + m; + float y0 = scale_fp32 * alpha; + // res[offset + j] = res[offset + j] + x0*y0; + // res[offset + j + qk/2] = res[offset + j + qk/2] + x1*y0; + // float res = ((int)v0*(int)alpha)*GGML_FP16_TO_FP32(x[i].d)*scale_fp32+ m*y0; + // float res = ((int)v0*(int)alpha)*result_scale+ m*y0; + res[offset + j] = res[offset + j] + ((int)v0*(int)alpha)*result_scale+ m*y0; + res[offset + j + qk/2] = res[offset + j + qk/2] + ((int)v1*(int)alpha)*result_scale+ m*y0; + } + } +} void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { diff --git a/ggml-quants.h b/ggml-quants.h index 07b02b98..f622374a 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -212,6 +212,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int void ggml_axpy_q8_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); +void ggml_axpy_q4_1_q8_1(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); diff --git a/ggml.c b/ggml.c index 0f84a7e7..ea4ae91f 100644 --- a/ggml.c +++ b/ggml.c @@ -14755,6 +14755,162 @@ static void ggml_compute_forward_mul_mat_axpy_q8_0( _freea(vec); #endif } +static void ggml_compute_forward_mul_mat_axpy_q4_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = 
ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + // const bool src1_cont = ggml_is_contiguous(src1); + + // ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type vec_dot_type = type_traits[type].vec_dot_type; + //TODO: not hardcode + vec_dot_type = GGML_TYPE_Q8_0; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + const float threshold = sparse_pred_threshold; + + // GGML_ASSERT(ne0 == ne01); + // GGML_ASSERT(ne1 == ne11); + // GGML_ASSERT(ne2 == ne12); + // GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + // const int64_t r2 = ne12/ne02; + // const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + if (params->type == GGML_TASK_INIT) { + ggml_set_zero(dst); + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + atomic_store(params->aic, 0); + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + ggml_fp16_t* wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + struct ggml_tensor *src2 = dst->src[2]; + + // parallelize by src0 rows + // const int64_t dr = (src2->ne[0] + 8*nth - 1)/(8*nth); + const int64_t dr = (src2->ne[0] + nth - 1)/(nth); + const int nr = ggml_nrows(src0); + + const int64_t ir10 = dr*ith; + // const int64_t ir11 = MIN(ir10 + dr, src2->ne[0]); + + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; + float *idx = src2->data; + int idx_row_size = src2->nb[1]; + int *gid = (int *)(dst->src[3]->data); + // printf("down %d up %d ne00 %d\n", ir10, ir11, ne00); + +#if defined(_MSC_VER) + float* vec = (float *)_malloca(ne00 * 4 * sizeof(float)); +#else + float vec[ne00*4]; +#endif + void *vy = vec; + char* src0_row = (char *) src0->data; + for (int col_idx = 0; col_idx < nr1; col_idx++) { + // const block_q8_0 * restrict nerual = wdata; + const block_q8_0 *restrict nerual = (block_q8_0 *)((char *)wdata + col_idx * row_size); + idx = (float *)((char *)src2->data + col_idx * idx_row_size); + memset(vy, 0, ne00 * 4); + // while(true) { + // const int ir0 = atomic_fetch_add(params->aic, dr); + for (int64_t ir1 = ir10; ir1 < ir10 + dr; ir1++) + { + if (ir1 >= nr) + break; + if (gid[ir1] == 1) + continue; + if (idx[ir1] < threshold) + continue; + int bid = ir1 / QK8_1; + int qsid = ir1 % QK8_1; + int b = (int)nerual[bid].qs[qsid]; + if (b == 0) + continue; + ggml_fp16_t d = nerual[bid].d; + ggml_axpy_q4_1_q8_1(ne00, src0_row + nb01 * ir1, vy, vy, b, d); + } + // if (ir0 + dr >= nr) + // break; + // } + + // float *res = (float *)(dst->data); + float *res = (float *)((char *)(dst->data) + col_idx * nb1); + float *tmp = (float *)vy; + int i; + + // 计算剩余的元素个数 + int remainder = ne00 % 8; +#if defined(__AVX2__) + // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) + { + __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 + __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 + _mm256_storeu_ps(res + i, result); // 存储结果到res中 + } + + // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) + { + res[i] += tmp[i]; + } +#else + for (i = 0; i < ne00; i++) { + res[i] += tmp[i]; + } +#endif + } +#if defined(_MSC_VER) + _freea(vec); +#endif +} static void ggml_compute_forward_mul_mat_axpy_q4_0( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -15216,6 +15372,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm case GGML_TYPE_Q4_0: ggml_compute_forward_mul_mat_axpy_q4_0(params, tensor->src[0], tensor->src[1], tensor); break; + case GGML_TYPE_Q4_1: + ggml_compute_forward_mul_mat_axpy_q4_1(params, tensor->src[0], tensor->src[1], tensor); + break; case GGML_TYPE_Q8_0: ggml_compute_forward_mul_mat_axpy_q8_0(params, tensor->src[0], tensor->src[1], tensor); break; From d7c074ea2b3ddb1725bc6d63b848ff53391594e9 Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 16:04:19 +0800 Subject: [PATCH 05/13] minor --- ggml-quants.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 1019eb91..6d99fb91 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2584,32 +2584,27 @@ void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * res #endif } void ggml_axpy_q4_1_q8_1(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { + // TODO AVX2 const int qk = 
QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(nb % 2 == 0); const block_q4_1 * restrict x = vx; - float *res = (float *)vz; float scale_fp32 = GGML_FP16_TO_FP32(scale); for (int i = 0; i < nb; i++) { float result_scale = GGML_FP16_TO_FP32(x[i].d) * scale_fp32; int offset = i * QK4_1; float m = GGML_FP16_TO_FP32(x[i].m); + float y0 = scale_fp32 * alpha; + float tmp = m * y0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[i].qs[j] & 0x0F); const int v1 = (x[i].qs[j] >> 4); - float x0 = v0*GGML_FP16_TO_FP32(x[i].d) + m; - float x1 = v1*GGML_FP16_TO_FP32(x[i].d) + m; - float y0 = scale_fp32 * alpha; - // res[offset + j] = res[offset + j] + x0*y0; - // res[offset + j + qk/2] = res[offset + j + qk/2] + x1*y0; - // float res = ((int)v0*(int)alpha)*GGML_FP16_TO_FP32(x[i].d)*scale_fp32+ m*y0; - // float res = ((int)v0*(int)alpha)*result_scale+ m*y0; - res[offset + j] = res[offset + j] + ((int)v0*(int)alpha)*result_scale+ m*y0; - res[offset + j + qk/2] = res[offset + j + qk/2] + ((int)v1*(int)alpha)*result_scale+ m*y0; + res[offset + j] = res[offset + j] + ((int)v0*(int)alpha)*result_scale+ tmp; + res[offset + j + qk/2] = res[offset + j + qk/2] + ((int)v1*(int)alpha)*result_scale+ tmp; } } } From 450d36f2be460e77f9876673c722125c25bbca76 Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 16:15:59 +0800 Subject: [PATCH 06/13] support q4_1 --- ggml-cuda.cu | 58 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 800dde3f..72ccb730 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5325,11 +5325,18 @@ static void dequantize_mul_mat_batch_q4_0_cuda_sparse(const void * vx, const dfl const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - // printf("ncols %d, nrows %d, src1_ncols %d, dst_ne0 %d\n", ncols, nrows, src1_ncols, dst_ne0); - dequantize_mul_mat_batch_sparse <<>>(vx, y, dst, ncols, nrows, src1_ncols, dst_ne0, lst, idx); +} +static void dequantize_mul_mat_batch_q4_1_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_ncols, int dst_ne0, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_batch_sparse + <<>>(vx, y, dst, ncols, nrows, src1_ncols, dst_ne0, lst, idx); + } static void dequantize_mul_mat_batch_q8_0_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_ncols, int dst_ne0, cudaStream_t stream, int *lst, float *idx) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); @@ -5476,6 +5483,14 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows, lst, idx); } +static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream, const int *lst, const float *idx) { + GGML_ASSERT(ncols % QK4_1 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows, lst, idx); +} static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * 
vy, float * dst, const int ncols, const int nrows, cudaStream_t stream, const int *lst, const float *idx) { GGML_ASSERT(ncols % QK8_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -5629,11 +5644,17 @@ static void dequantize_axpy_sparse_vec_q4_0_cuda(const void * vx, const dfloat * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - // dequantize_mul_mat_axpy - // <<>>(vx, y, dst, ncols, nrows); dequantize_mul_mat_axpy_sparse <<>>(vx, y, dst, ncols, nrows, lst, idx); } +static void dequantize_axpy_sparse_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy_sparse + <<>>(vx, y, dst, ncols, nrows, lst, idx); +} @@ -5645,6 +5666,14 @@ static void dequantize_axpy_sparse_batch_q4_0_cuda(const void * vx, const dfloat dequantize_mul_mat_axpy_sparse_batch <<>>(vx, y, dst, ncols, nrows, src1_rows, src1_ncols, lst, idx); } +static void dequantize_axpy_sparse_batch_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy_sparse_batch + <<>>(vx, y, dst, ncols, nrows, src1_rows, src1_ncols, lst, idx); +} static void dequantize_axpy_sparse_batch_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -6989,6 +7018,9 @@ inline void ggml_cuda_op_mul_mat_batch_sparse( case GGML_TYPE_Q4_0: dequantize_mul_mat_batch_q4_0_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_batch_q4_1_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); + break; case GGML_TYPE_Q8_0: dequantize_mul_mat_batch_q8_0_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); break; @@ -7240,6 +7272,12 @@ inline void ggml_cuda_op_mul_mat_vec_sparse_q( row_lookup, sparse_idx ); break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream, + row_lookup, sparse_idx + ); + break; case GGML_TYPE_Q8_0: mul_mat_vec_q8_0_q8_1_cuda( src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream, @@ -7610,9 +7648,18 @@ inline void ggml_cuda_op_dequantize_axpy( dequantize_axpy_sparse_batch_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, src1_ncols, stream, row_lookup, sparse_idx); } break; + case GGML_TYPE_Q4_1: + if (sparse_idx == NULL) { + GGML_ASSERT(false); + } else if (ne11 == 1) { + dequantize_axpy_sparse_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream, 
row_lookup, sparse_idx); + } else { + dequantize_axpy_sparse_batch_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, src1_ncols, stream, row_lookup, sparse_idx); + } + break; case GGML_TYPE_Q8_0: if (sparse_idx == NULL) { - dequantize_axpy_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + GGML_ASSERT(false); } else if (ne11 == 1) { dequantize_axpy_sparse_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream, row_lookup, sparse_idx); } else { @@ -8779,6 +8826,7 @@ static void ggml_cuda_mul_mat_sparse(const ggml_tensor * src0, const ggml_tensor ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_sparse_dequantized, false); break; case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: case GGML_TYPE_Q8_0: ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_sparse_q, true); // ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); From 1848cc29d43e30404c03c63831dbdb8958730419 Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 16:43:54 +0800 Subject: [PATCH 07/13] support q5_0 for cpu --- ggml-quants.c | 91 +++++++++++++++++++++++++++++ ggml-quants.h | 1 + ggml.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+) diff --git a/ggml-quants.c b/ggml-quants.c index 6d99fb91..8a0c06ae 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2583,6 +2583,97 @@ void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * res } #endif } +void ggml_axpy_q5_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { + const int qk = QK8_0; + const int nb = n / qk; + assert(n % qk == 0); + assert(nb % 2 == 0); + + const block_q5_0 * restrict x = vx; +#if defined(__AVX22__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + __m256i alpha_v = _mm256_set1_epi16((short)alpha); + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(scale) ); + __m256i bx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
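+ // NOTE: this vector path appears to mirror the Q4_0 AXPY kernel and is guarded by
+ // __AVX22__ (presumably never defined), so the scalar loop below is what actually runs.
+ // As written it only offsets the low nibbles by 8 and never folds in the high bits from
+ // x[i].qh, whereas Q5_0 needs ((qs[j] & 0x0F) | xh) - 16 as done in the scalar fallback.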
+ const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + //16个数计算 + __m128i m_a = _mm256_extracti128_si256(bx, 0); + __m256i m_x = _mm256_cvtepi8_epi16(m_a); //16 elements + m_x = _mm256_mullo_epi16(m_x, alpha_v); + __m128i x_0 = _mm256_extracti128_si256(m_x, 0); + __m256i x0_32 = _mm256_cvtepi16_epi32(x_0); + __m256 fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + + + __m256 by = _mm256_loadu_ps((const __m256 *)((char *)vy+i*128)); + + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)((char*)vz + i*128), by); + //second phase + + x_0 = _mm256_extracti128_si256(m_x, 1); + x0_32 = _mm256_cvtepi16_epi32(x_0); + fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + by = _mm256_loadu_ps((const __m256 *)((char*)vy+i*128+32)); + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)((char*)vz + i*128+32), by); + + //third phase + m_a = _mm256_extracti128_si256(bx, 1); + m_x = _mm256_cvtepi8_epi16(m_a); + m_x = _mm256_mullo_epi16(m_x, alpha_v); + x_0 = _mm256_extracti128_si256(m_x, 0); + x0_32 = _mm256_cvtepi16_epi32(x_0); + fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + by = _mm256_loadu_ps((const __m256 *)((char*)vy+i*128+64)); + + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)((char*)vz + i*128+64), by); + + //fourth phase + x_0 = _mm256_extracti128_si256(m_x, 1); + x0_32 = _mm256_cvtepi16_epi32(x_0); + fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + by = _mm256_loadu_ps((const __m256 *)((char*)vy+i*128+96)); + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)((char*)vz + i*128+96), by); + + } +#else + float *res = (float *)vz; + float scale_fp32 = GGML_FP16_TO_FP32(scale); + for (int i = 0; i < nb; i++) { + float result_scale = GGML_FP16_TO_FP32(x[i].d) * scale_fp32; + int offset = i * QK5_0; + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + // const int v0 = (x[i].qs[j] & 0x0F) - 8; + // const int v1 = (x[i].qs[j] >> 4) - 8; + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t v0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t v1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + res[offset + j] = res[offset + j] + ((float)(v0 * (int)alpha) * result_scale); + res[offset + j + qk/2] = res[offset + j + qk/2] + ((float)(v1 * (int)alpha) * result_scale); + } + } +#endif +} void ggml_axpy_q4_1_q8_1(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { // TODO AVX2 const int qk = QK8_1; diff --git a/ggml-quants.h b/ggml-quants.h index f622374a..cfc03ab0 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -212,6 +212,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int void ggml_axpy_q8_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); +void ggml_axpy_q5_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); void ggml_axpy_q4_1_q8_1(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); // Dot product diff --git a/ggml.c b/ggml.c index ea4ae91f..d1f38b25 100644 --- a/ggml.c +++ 
b/ggml.c @@ -15065,6 +15065,160 @@ static void ggml_compute_forward_mul_mat_axpy_q4_0( _freea(vec); #endif } +static void ggml_compute_forward_mul_mat_axpy_q5_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + // const bool src1_cont = ggml_is_contiguous(src1); + + // ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + const float threshold = sparse_pred_threshold; + + // GGML_ASSERT(ne0 == ne01); + // GGML_ASSERT(ne1 == ne11); + // GGML_ASSERT(ne2 == ne12); + // GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + // const int64_t r2 = ne12/ne02; + // const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + if (params->type == GGML_TASK_INIT) { + ggml_set_zero(dst); + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + atomic_store(params->aic, 0); + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + ggml_fp16_t* wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + struct ggml_tensor *src2 = dst->src[2]; + + // parallelize by src0 rows + // const int64_t dr = (src2->ne[0] + 8*nth - 1)/(8*nth); + const int64_t dr = (src2->ne[0] + nth - 1)/(nth); + const int nr = ggml_nrows(src0); + + const int64_t ir10 = dr*ith; + // const int64_t ir11 = MIN(ir10 + dr, src2->ne[0]); + + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; + float *idx = src2->data; + int idx_row_size = src2->nb[1]; + int *gid = (int *)(dst->src[3]->data); + // printf("down %d up %d ne00 %d\n", ir10, ir11, ne00); + +#if defined(_MSC_VER) + float* vec = (float *)_malloca(ne00 * 4 * sizeof(float)); +#else + float vec[ne00*4]; +#endif + void *vy = vec; + char* src0_row = (char *) src0->data; + for (int col_idx = 0; col_idx < nr1; col_idx++) { + // const block_q8_0 * restrict nerual = wdata; + const block_q8_0 *restrict nerual = (block_q8_0 *)((char *)wdata + col_idx * row_size); + idx = (float *)((char *)src2->data + col_idx * idx_row_size); + memset(vy, 0, ne00 * 4); + // while(true) { + // const int ir0 = atomic_fetch_add(params->aic, dr); + for (int64_t ir1 = ir10; ir1 < ir10 + dr; ir1++) + { + if (ir1 >= nr) + break; + if (gid[ir1] == 1) + continue; + if (idx[ir1] < threshold) + continue; + int bid = ir1 / QK8_0; + int qsid = ir1 % QK8_0; + int b = (int)nerual[bid].qs[qsid]; + if (b == 0) + continue; + ggml_fp16_t d = nerual[bid].d; + ggml_axpy_q5_0_q8_0(ne00, src0_row + nb01 * ir1, vy, vy, b, d); + } + // if (ir0 + dr >= nr) + // break; + // } + + // float *res = (float *)(dst->data); + float *res = (float *)((char *)(dst->data) + col_idx * nb1); + float *tmp = (float *)vy; + int i; + + // 计算剩余的元素个数 + int remainder = ne00 % 8; +#if defined(__AVX2__) + // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) + { + __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 + __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 + _mm256_storeu_ps(res + i, result); // 存储结果到res中 + } + + // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) + { + res[i] += tmp[i]; + } +#else + for (i = 0; i < ne00; i++) { + res[i] += tmp[i]; + } +#endif + } +#if defined(_MSC_VER) + _freea(vec); +#endif +} atomic_flag g_axpy_head_lock = ATOMIC_FLAG_INIT; static void ggml_compute_forward_mul_mat_axpy_head( const struct ggml_compute_params * params, @@ -15375,6 +15529,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm case GGML_TYPE_Q4_1: ggml_compute_forward_mul_mat_axpy_q4_1(params, tensor->src[0], tensor->src[1], tensor); break; + case GGML_TYPE_Q5_0: + ggml_compute_forward_mul_mat_axpy_q5_0(params, tensor->src[0], tensor->src[1], tensor); + break; case GGML_TYPE_Q8_0: ggml_compute_forward_mul_mat_axpy_q8_0(params, tensor->src[0], tensor->src[1], tensor); break; From 86f66617cdff29b1b10a0cb483576a44e1f2c8c6 Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 17:32:08 +0800 Subject: [PATCH 08/13] q5 support --- ggml-cuda.cu | 53 +++++++++++++++++++++++++++++++++++++++++-- ggml-quants.c | 63 --------------------------------------------------- 2 files changed, 51 insertions(+), 65 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 72ccb730..fc10ba7a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5336,7 +5336,14 @@ static void dequantize_mul_mat_batch_q4_1_cuda_sparse(const void * vx, const dfl const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); 
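    // launch geometry shared by these sparse batch kernels: a (1, ceil(nrows / GGML_CUDA_MMV_Y))
    // grid of WARP_SIZE x GGML_CUDA_MMV_Y thread blocks, i.e. roughly one warp per matrix row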
dequantize_mul_mat_batch_sparse <<>>(vx, y, dst, ncols, nrows, src1_ncols, dst_ne0, lst, idx); - +} +static void dequantize_mul_mat_batch_q5_0_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_ncols, int dst_ne0, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_batch_sparse + <<>>(vx, y, dst, ncols, nrows, src1_ncols, dst_ne0, lst, idx); } static void dequantize_mul_mat_batch_q8_0_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_ncols, int dst_ne0, cudaStream_t stream, int *lst, float *idx) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); @@ -5491,6 +5498,14 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows, lst, idx); } +static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream, const int *lst, const float *idx) { + GGML_ASSERT(ncols % QK5_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows, lst, idx); +} static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream, const int *lst, const float *idx) { GGML_ASSERT(ncols % QK8_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -5655,7 +5670,14 @@ static void dequantize_axpy_sparse_vec_q4_1_cuda(const void * vx, const dfloat * dequantize_mul_mat_axpy_sparse <<>>(vx, y, dst, ncols, nrows, lst, idx); } - +static void dequantize_axpy_sparse_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy_sparse + <<>>(vx, y, dst, ncols, nrows, lst, idx); +} static void dequantize_axpy_sparse_batch_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { @@ -5674,6 +5696,14 @@ static void dequantize_axpy_sparse_batch_q4_1_cuda(const void * vx, const dfloat dequantize_mul_mat_axpy_sparse_batch <<>>(vx, y, dst, ncols, nrows, src1_rows, src1_ncols, lst, idx); } +static void dequantize_axpy_sparse_batch_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy_sparse_batch + <<>>(vx, y, dst, ncols, nrows, src1_rows, src1_ncols, lst, idx); +} static void dequantize_axpy_sparse_batch_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int 
src1_ncols, cudaStream_t stream, int *lst, float *idx) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -7021,6 +7051,9 @@ inline void ggml_cuda_op_mul_mat_batch_sparse( case GGML_TYPE_Q4_1: dequantize_mul_mat_batch_q4_1_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_batch_q5_0_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); + break; case GGML_TYPE_Q8_0: dequantize_mul_mat_batch_q8_0_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); break; @@ -7278,6 +7311,12 @@ inline void ggml_cuda_op_mul_mat_vec_sparse_q( row_lookup, sparse_idx ); break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream, + row_lookup, sparse_idx + ); + break; case GGML_TYPE_Q8_0: mul_mat_vec_q8_0_q8_1_cuda( src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream, @@ -7657,6 +7696,15 @@ inline void ggml_cuda_op_dequantize_axpy( dequantize_axpy_sparse_batch_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, src1_ncols, stream, row_lookup, sparse_idx); } break; + case GGML_TYPE_Q5_0: + if (sparse_idx == NULL) { + GGML_ASSERT(false); + } else if (ne11 == 1) { + dequantize_axpy_sparse_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream, row_lookup, sparse_idx); + } else { + dequantize_axpy_sparse_batch_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, src1_ncols, stream, row_lookup, sparse_idx); + } + break; case GGML_TYPE_Q8_0: if (sparse_idx == NULL) { GGML_ASSERT(false); @@ -8827,6 +8875,7 @@ static void ggml_cuda_mul_mat_sparse(const ggml_tensor * src0, const ggml_tensor break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: case GGML_TYPE_Q8_0: ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_sparse_q, true); // ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); diff --git a/ggml-quants.c b/ggml-quants.c index 8a0c06ae..ff058351 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2590,68 +2590,6 @@ void ggml_axpy_q5_0_q8_0(const int n, const void * restrict vx, const void * res assert(nb % 2 == 0); const block_q5_0 * restrict x = vx; -#if defined(__AVX22__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - __m256i alpha_v = _mm256_set1_epi16((short)alpha); - // Main loop - for (int i = 0; i < nb; ++i) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(scale) ); - __m256i bx = bytes_from_nibbles_32(x[i].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
- const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - //16个数计算 - __m128i m_a = _mm256_extracti128_si256(bx, 0); - __m256i m_x = _mm256_cvtepi8_epi16(m_a); //16 elements - m_x = _mm256_mullo_epi16(m_x, alpha_v); - __m128i x_0 = _mm256_extracti128_si256(m_x, 0); - __m256i x0_32 = _mm256_cvtepi16_epi32(x_0); - __m256 fx0 = _mm256_cvtepi32_ps(x0_32); - fx0 = _mm256_mul_ps(fx0, d); - - - __m256 by = _mm256_loadu_ps((const __m256 *)((char *)vy+i*128)); - - by = _mm256_add_ps(by, fx0); - _mm256_storeu_ps((__m256*)((char*)vz + i*128), by); - //second phase - - x_0 = _mm256_extracti128_si256(m_x, 1); - x0_32 = _mm256_cvtepi16_epi32(x_0); - fx0 = _mm256_cvtepi32_ps(x0_32); - fx0 = _mm256_mul_ps(fx0, d); - by = _mm256_loadu_ps((const __m256 *)((char*)vy+i*128+32)); - by = _mm256_add_ps(by, fx0); - _mm256_storeu_ps((__m256*)((char*)vz + i*128+32), by); - - //third phase - m_a = _mm256_extracti128_si256(bx, 1); - m_x = _mm256_cvtepi8_epi16(m_a); - m_x = _mm256_mullo_epi16(m_x, alpha_v); - x_0 = _mm256_extracti128_si256(m_x, 0); - x0_32 = _mm256_cvtepi16_epi32(x_0); - fx0 = _mm256_cvtepi32_ps(x0_32); - fx0 = _mm256_mul_ps(fx0, d); - by = _mm256_loadu_ps((const __m256 *)((char*)vy+i*128+64)); - - by = _mm256_add_ps(by, fx0); - _mm256_storeu_ps((__m256*)((char*)vz + i*128+64), by); - - //fourth phase - x_0 = _mm256_extracti128_si256(m_x, 1); - x0_32 = _mm256_cvtepi16_epi32(x_0); - fx0 = _mm256_cvtepi32_ps(x0_32); - fx0 = _mm256_mul_ps(fx0, d); - by = _mm256_loadu_ps((const __m256 *)((char*)vy+i*128+96)); - by = _mm256_add_ps(by, fx0); - _mm256_storeu_ps((__m256*)((char*)vz + i*128+96), by); - - } -#else float *res = (float *)vz; float scale_fp32 = GGML_FP16_TO_FP32(scale); for (int i = 0; i < nb; i++) { @@ -2672,7 +2610,6 @@ void ggml_axpy_q5_0_q8_0(const int n, const void * restrict vx, const void * res res[offset + j + qk/2] = res[offset + j + qk/2] + ((float)(v1 * (int)alpha) * result_scale); } } -#endif } void ggml_axpy_q4_1_q8_1(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { // TODO AVX2 From 0c665085fa83b728ac3d165784b70ae42739a6e1 Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 17:55:02 +0800 Subject: [PATCH 09/13] q5_1 support --- ggml-cuda.cu | 2 +- ggml-quants.c | 31 ++++++++++ ggml-quants.h | 1 + ggml.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 192 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index fc10ba7a..1b697d61 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9426,7 +9426,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func(tensor->src[0], tensor->src[1], tensor); - CUDA_CHECK(cudaDeviceSynchronize()); + // CUDA_CHECK(cudaDeviceSynchronize()); return true; } diff --git a/ggml-quants.c b/ggml-quants.c index ff058351..442e3520 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2611,6 +2611,37 @@ void ggml_axpy_q5_0_q8_0(const int n, const void * restrict vx, const void * res } } } +void ggml_axpy_q5_1_q8_1(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { + const int qk = QK8_1; + const int nb = n / qk; + assert(n % qk == 0); + assert(nb % 2 == 0); + + const block_q5_1 * restrict x = vx; + float *res = (float *)vz; + float scale_fp32 = GGML_FP16_TO_FP32(scale); + for (int i = 0; i < nb; i++) { + float result_scale = GGML_FP16_TO_FP32(x[i].d) * scale_fp32; + int offset = i * QK5_1; + uint32_t qh; + 
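+ // gather the 32 high (5th) bits of the block into qh; below, xh_0 and xh_1 move bit j
+ // (resp. bit j+16) into bit position 4, so ((qs[j] & 0x0F) | xh) reconstructs the unsigned
+ // 5-bit quant, which is then scaled by d and offset by the block minimum m (no -16 bias,
+ // unlike Q5_0)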
memcpy(&qh, x[i].qh, sizeof(qh)); + float m = GGML_FP16_TO_FP32(x[i].m); + float y0 = scale_fp32 * alpha; + float tmp = m * y0; + + for (int j = 0; j < qk/2; ++j) { + // const int v0 = (x[i].qs[j] & 0x0F) - 8; + // const int v1 = (x[i].qs[j] >> 4) - 8; + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t v0 = ((x[i].qs[j] & 0x0F) | xh_0); + const int32_t v1 = ((x[i].qs[j] >> 4) | xh_1); + res[offset + j] = res[offset + j] + ((float)(v0 * (int)alpha) * result_scale) + tmp; + res[offset + j + qk/2] = res[offset + j + qk/2] + ((float)(v1 * (int)alpha) * result_scale) + tmp; + } + } +} void ggml_axpy_q4_1_q8_1(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { // TODO AVX2 const int qk = QK8_1; diff --git a/ggml-quants.h b/ggml-quants.h index cfc03ab0..34f33197 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -213,6 +213,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int void ggml_axpy_q8_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); void ggml_axpy_q5_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); +void ggml_axpy_q5_1_q8_1(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); void ggml_axpy_q4_1_q8_1(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); // Dot product diff --git a/ggml.c b/ggml.c index d1f38b25..5318f57f 100644 --- a/ggml.c +++ b/ggml.c @@ -14911,6 +14911,162 @@ static void ggml_compute_forward_mul_mat_axpy_q4_1( _freea(vec); #endif } +static void ggml_compute_forward_mul_mat_axpy_q5_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + // const bool src1_cont = ggml_is_contiguous(src1); + + // ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type vec_dot_type = type_traits[type].vec_dot_type; + //TODO: not hardcode + vec_dot_type = GGML_TYPE_Q8_0; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + const float threshold = sparse_pred_threshold; + + // GGML_ASSERT(ne0 == ne01); + // GGML_ASSERT(ne1 == ne11); + // GGML_ASSERT(ne2 == ne12); + // GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + // const int64_t r2 = ne12/ne02; + // const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + if (params->type == GGML_TASK_INIT) { + ggml_set_zero(dst); + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = 
ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + atomic_store(params->aic, 0); + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + ggml_fp16_t* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + struct ggml_tensor *src2 = dst->src[2]; + + // parallelize by src0 rows + // const int64_t dr = (src2->ne[0] + 8*nth - 1)/(8*nth); + const int64_t dr = (src2->ne[0] + nth - 1)/(nth); + const int nr = ggml_nrows(src0); + + const int64_t ir10 = dr*ith; + // const int64_t ir11 = MIN(ir10 + dr, src2->ne[0]); + + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; + float *idx = src2->data; + int idx_row_size = src2->nb[1]; + int *gid = (int *)(dst->src[3]->data); + // printf("down %d up %d ne00 %d\n", ir10, ir11, ne00); + +#if defined(_MSC_VER) + float* vec = (float *)_malloca(ne00 * 4 * sizeof(float)); +#else + float vec[ne00*4]; +#endif + void *vy = vec; + char* src0_row = (char *) src0->data; + for (int col_idx = 0; col_idx < nr1; col_idx++) { + // const block_q8_0 * restrict nerual = wdata; + const block_q8_0 *restrict nerual = (block_q8_0 *)((char *)wdata + col_idx * row_size); + idx = (float *)((char *)src2->data + col_idx * idx_row_size); + memset(vy, 0, ne00 * 4); + // while(true) { + // const int ir0 = atomic_fetch_add(params->aic, dr); + for (int64_t ir1 = ir10; ir1 < ir10 + dr; ir1++) + { + if (ir1 >= nr) + break; + if (gid[ir1] == 1) + continue; + if (idx[ir1] < threshold) + continue; + int bid = ir1 / QK8_1; + int qsid = ir1 % QK8_1; + int b = (int)nerual[bid].qs[qsid]; + if (b == 0) + continue; + ggml_fp16_t d = nerual[bid].d; + ggml_axpy_q5_1_q8_1(ne00, src0_row + nb01 * ir1, vy, vy, b, d); + } + // if (ir0 + dr >= nr) + // break; + // } + + // float *res = (float *)(dst->data); + float *res = (float *)((char *)(dst->data) + col_idx * nb1); + float *tmp = (float *)vy; + int i; + + // 计算剩余的元素个数 + int remainder = ne00 % 8; +#if defined(__AVX2__) + // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) + { + __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 + __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 + _mm256_storeu_ps(res + i, result); // 存储结果到res中 + } + + // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) + { + res[i] += tmp[i]; + } +#else + for (i = 0; i < ne00; i++) { + res[i] += tmp[i]; + } +#endif + } +#if defined(_MSC_VER) + _freea(vec); +#endif +} static void ggml_compute_forward_mul_mat_axpy_q4_0( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -15183,11 +15339,6 @@ static void ggml_compute_forward_mul_mat_axpy_q5_0( ggml_fp16_t d = nerual[bid].d; ggml_axpy_q5_0_q8_0(ne00, src0_row + nb01 * ir1, vy, vy, b, d); } - // if (ir0 + dr >= nr) - // break; - // } - - // float *res = (float *)(dst->data); float *res = (float *)((char *)(dst->data) + col_idx * nb1); float *tmp = (float *)vy; int i; @@ -15532,6 +15683,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm case GGML_TYPE_Q5_0: ggml_compute_forward_mul_mat_axpy_q5_0(params, tensor->src[0], tensor->src[1], tensor); break; + case 
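The sparse forward pass above only touches rows that survive three filters: rows assigned to the GPU (gid[ir1] == 1) are skipped, rows whose predictor score idx[ir1] falls below sparse_pred_threshold are skipped, and rows whose quantized activation is exactly zero contribute nothing. A stripped-down sketch of that gating, with hypothetical array arguments standing in for the tensor plumbing (the per-row activation scale is a simplification of the per-block d used by the real kernel):

    #include <stddef.h>
    #include <stdint.h>

    // axpy_row() stands in for ggml_axpy_q5_1_q8_1 / ggml_axpy_q8_0_q8_0.
    typedef void (*axpy_row_fn)(int n, const void * row, float * acc, int8_t alpha, float scale);

    static void sparse_axpy_column(int nrows, int ncols,
                                   const float  * idx,     // predictor scores, one per row
                                   const int    * gid,     // 1 if the row is handled on the GPU
                                   const int8_t * act_q,   // quantized activation per row
                                   const float  * act_d,   // activation scale per row (simplified)
                                   float threshold,
                                   const void * rows, size_t row_stride,
                                   axpy_row_fn axpy_row, float * acc) {
        for (int r = 0; r < nrows; ++r) {
            if (gid[r] == 1)        continue; // row belongs to the GPU split
            if (idx[r] < threshold) continue; // predictor says the neuron is inactive
            if (act_q[r] == 0)      continue; // zero activation contributes nothing
            axpy_row(ncols, (const char *) rows + r * row_stride, acc, act_q[r], act_d[r]);
        }
    }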
GGML_TYPE_Q5_1: + ggml_compute_forward_mul_mat_axpy_q5_1(params, tensor->src[0], tensor->src[1], tensor); + break; case GGML_TYPE_Q8_0: ggml_compute_forward_mul_mat_axpy_q8_0(params, tensor->src[0], tensor->src[1], tensor); break; From 7a04b460518cbe9ec105dacdfb07e8c0884d6507 Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 18:10:41 +0800 Subject: [PATCH 10/13] q5_1 support --- ggml-cuda.cu | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 1b697d61..77c36735 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5345,6 +5345,14 @@ static void dequantize_mul_mat_batch_q5_0_cuda_sparse(const void * vx, const dfl dequantize_mul_mat_batch_sparse <<>>(vx, y, dst, ncols, nrows, src1_ncols, dst_ne0, lst, idx); } +static void dequantize_mul_mat_batch_q5_1_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_ncols, int dst_ne0, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_batch_sparse + <<>>(vx, y, dst, ncols, nrows, src1_ncols, dst_ne0, lst, idx); +} static void dequantize_mul_mat_batch_q8_0_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_ncols, int dst_ne0, cudaStream_t stream, int *lst, float *idx) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -5506,6 +5514,14 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows, lst, idx); } +static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream, const int *lst, const float *idx) { + GGML_ASSERT(ncols % QK5_1 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows, lst, idx); +} static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream, const int *lst, const float *idx) { GGML_ASSERT(ncols % QK8_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -5678,6 +5694,15 @@ static void dequantize_axpy_sparse_vec_q5_0_cuda(const void * vx, const dfloat * dequantize_mul_mat_axpy_sparse <<>>(vx, y, dst, ncols, nrows, lst, idx); } +static void dequantize_axpy_sparse_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy_sparse + <<>>(vx, y, dst, ncols, nrows, lst, idx); +} + static void dequantize_axpy_sparse_batch_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { @@ -5704,6 +5729,14 @@ static void dequantize_axpy_sparse_batch_q5_0_cuda(const void * vx, const dfloat 
dequantize_mul_mat_axpy_sparse_batch <<>>(vx, y, dst, ncols, nrows, src1_rows, src1_ncols, lst, idx); } +static void dequantize_axpy_sparse_batch_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy_sparse_batch + <<>>(vx, y, dst, ncols, nrows, src1_rows, src1_ncols, lst, idx); +} static void dequantize_axpy_sparse_batch_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -7054,6 +7087,9 @@ inline void ggml_cuda_op_mul_mat_batch_sparse( case GGML_TYPE_Q5_0: dequantize_mul_mat_batch_q5_0_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_batch_q5_1_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); + break; case GGML_TYPE_Q8_0: dequantize_mul_mat_batch_q8_0_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, row_lookup, sparse_idx); break; @@ -7317,6 +7353,12 @@ inline void ggml_cuda_op_mul_mat_vec_sparse_q( row_lookup, sparse_idx ); break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream, + row_lookup, sparse_idx + ); + break; case GGML_TYPE_Q8_0: mul_mat_vec_q8_0_q8_1_cuda( src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream, @@ -7705,6 +7747,15 @@ inline void ggml_cuda_op_dequantize_axpy( dequantize_axpy_sparse_batch_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, src1_ncols, stream, row_lookup, sparse_idx); } break; + case GGML_TYPE_Q5_1: + if (sparse_idx == NULL) { + GGML_ASSERT(false); + } else if (ne11 == 1) { + dequantize_axpy_sparse_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream, row_lookup, sparse_idx); + } else { + dequantize_axpy_sparse_batch_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, src1_ncols, stream, row_lookup, sparse_idx); + } + break; case GGML_TYPE_Q8_0: if (sparse_idx == NULL) { GGML_ASSERT(false); @@ -8876,6 +8927,7 @@ static void ggml_cuda_mul_mat_sparse(const ggml_tensor * src0, const ggml_tensor case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_sparse_q, true); // ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); From e2b668dec8599716a2906804cd880d2312df0a5e Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 18:21:35 +0800 Subject: [PATCH 11/13] minor --- ggml.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 5318f57f..6f45f665 100644 --- a/ggml.c +++ b/ggml.c @@ -14293,13 +14293,13 @@ static void ggml_axpy_avx_f16(const int n, const ggml_fp16_t * restrict vx, cons #if defined(__AVX2__) float *result = (float *)vz; float alpha_f32 = GGML_FP16_TO_FP32(alpha); - __m256 scale = _mm256_set1_ps(alpha_f32); // 创建scale向量 + __m256 scale = 
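All of the CUDA wrappers introduced for Q5_1 use the same launch shape: thread blocks of WARP_SIZE x GGML_CUDA_MMV_Y, with enough blocks to cover nrows in chunks of GGML_CUDA_MMV_Y via a ceiling division; the same idiom also sizes the per-thread row range dr on the CPU side. A small host-side sketch of that sizing arithmetic (plain C; the example row counts are illustrative, not taken from this patch):

    #include <assert.h>

    // Ceiling division, as in
    //   block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;   (CUDA grid)
    //   dr          = (src2->ne[0] + nth - 1) / nth;                     (CPU row split)
    static int ceil_div(int n, int chunk) {
        return (n + chunk - 1) / chunk;
    }

    int main(void) {
        assert(ceil_div(4096, 1)  == 4096); // one MMV_Y-sized chunk per grid row
        assert(ceil_div(11008, 8) == 1376); // e.g. splitting 11008 FFN rows over 8 threads
        return 0;
    }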
_mm256_set1_ps(alpha_f32); // create scale for (int i = 0; i < n; i += 8) { __m128i vx_low = _mm_loadu_si128((__m128i const*)(&vx[i])); - __m256 vx_f32 = _mm256_cvtph_ps(vx_low); // 转换vx为fp32 - __m256 vy_f32 = _mm256_loadu_ps((float const*)(result+ i)); // 加载vy - __m256 res = _mm256_fmadd_ps(vx_f32, scale, vy_f32); // 执行向量加法和乘法操作 - _mm256_storeu_ps((float*)(&result[i]), res); // 存储结果 + __m256 vx_f32 = _mm256_cvtph_ps(vx_low); // conver vx to fp32 + __m256 vy_f32 = _mm256_loadu_ps((float const*)(result+ i)); // load vy + __m256 res = _mm256_fmadd_ps(vx_f32, scale, vy_f32); + _mm256_storeu_ps((float*)(&result[i]), res); // store the result } #else float *res = (float *)vz; From e1002f613564b025311a53214d3fc61177fe36ea Mon Sep 17 00:00:00 2001 From: syx Date: Sun, 31 Mar 2024 19:10:03 +0800 Subject: [PATCH 12/13] minor --- ggml.c | 114 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/ggml.c b/ggml.c index 6f45f665..f39444d2 100644 --- a/ggml.c +++ b/ggml.c @@ -14414,9 +14414,9 @@ static void ggml_compute_forward_mul_mat_axpy_dense( break; } - // 获取锁 + while (atomic_flag_test_and_set(&g_axpy_dense_lock)) { - // 如果锁已经被占用,则等待 + } float *res = (float *)(dst->data); @@ -14424,19 +14424,19 @@ static void ggml_compute_forward_mul_mat_axpy_dense( int i; - // 计算剩余的元素个数 + int remainder = ne00 % 8; #if defined(__AVX2__) - // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) { - __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 - __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 - __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 - _mm256_storeu_ps(res + i, result); // 存储结果到res中 + __m256 res_vec = _mm256_loadu_ps(res + i); + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); + __m256 result = _mm256_add_ps(res_vec, tmp_vec); + _mm256_storeu_ps(res + i, result); } - // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) { res[i] += tmp[i]; } @@ -14581,10 +14581,10 @@ static void ggml_compute_forward_mul_mat_axpy( #if defined(__AVX2__) // 使用AVX指令进行向量化计算 for (i = 0; i < ne00 - remainder; i += 8) { - __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 - __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 - __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 - _mm256_storeu_ps(res + i, result); // 存储结果到res中 + __m256 res_vec = _mm256_loadu_ps(res + i); + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); + __m256 result = _mm256_add_ps(res_vec, tmp_vec); + _mm256_storeu_ps(res + i, result); } // 处理剩余的元素 @@ -14728,19 +14728,19 @@ static void ggml_compute_forward_mul_mat_axpy_q8_0( float *tmp = (float *)vy; int i; - // 计算剩余的元素个数 + int remainder = ne00 % 8; #if defined(__AVX2__) - // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) { - __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 - __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 - __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 - _mm256_storeu_ps(res + i, result); // 存储结果到res中 + __m256 res_vec = _mm256_loadu_ps(res + i); + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); + __m256 result = _mm256_add_ps(res_vec, tmp_vec); + _mm256_storeu_ps(res + i, result); } - // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) { res[i] += tmp[i]; @@ -14884,19 +14884,19 @@ static void ggml_compute_forward_mul_mat_axpy_q4_1( float *tmp = (float *)vy; int i; - // 计算剩余的元素个数 + int remainder = ne00 % 8; #if defined(__AVX2__) - // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) { - __m256 res_vec = _mm256_loadu_ps(res + 
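The accumulation epilogue repeated in every axpy variant above adds the thread-local buffer into dst eight floats at a time and finishes the last ne00 % 8 elements with a scalar loop. A self-contained sketch of that pattern, guarded so it also builds without AVX2 (illustrative only):

    #if defined(__AVX2__)
    #include <immintrin.h>
    #endif

    // res[i] += tmp[i] for i in [0, n): 8-wide AVX2 body plus a scalar tail,
    // matching the epilogue of the ggml_compute_forward_mul_mat_axpy_* kernels.
    static void vec_acc_f32(int n, float * res, const float * tmp) {
        int i = 0;
    #if defined(__AVX2__)
        const int tail = n % 8;
        for (; i < n - tail; i += 8) {
            __m256 r = _mm256_loadu_ps(res + i);
            __m256 t = _mm256_loadu_ps(tmp + i);
            _mm256_storeu_ps(res + i, _mm256_add_ps(r, t));
        }
    #endif
        for (; i < n; ++i) {
            res[i] += tmp[i];
        }
    }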
i); // 加载res中的8个浮点数 - __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 - __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 - _mm256_storeu_ps(res + i, result); // 存储结果到res中 + __m256 res_vec = _mm256_loadu_ps(res + i); + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); + __m256 result = _mm256_add_ps(res_vec, tmp_vec); + _mm256_storeu_ps(res + i, result); } - // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) { res[i] += tmp[i]; @@ -15040,19 +15040,19 @@ static void ggml_compute_forward_mul_mat_axpy_q5_1( float *tmp = (float *)vy; int i; - // 计算剩余的元素个数 + int remainder = ne00 % 8; #if defined(__AVX2__) - // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) { - __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 - __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 - __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 - _mm256_storeu_ps(res + i, result); // 存储结果到res中 + __m256 res_vec = _mm256_loadu_ps(res + i); + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); + __m256 result = _mm256_add_ps(res_vec, tmp_vec); + _mm256_storeu_ps(res + i, result); } - // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) { res[i] += tmp[i]; @@ -15194,19 +15194,19 @@ static void ggml_compute_forward_mul_mat_axpy_q4_0( float *tmp = (float *)vy; int i; - // 计算剩余的元素个数 + int remainder = ne00 % 8; #if defined(__AVX2__) - // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) { - __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 - __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 - __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 - _mm256_storeu_ps(res + i, result); // 存储结果到res中 + __m256 res_vec = _mm256_loadu_ps(res + i); + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); + __m256 result = _mm256_add_ps(res_vec, tmp_vec); + _mm256_storeu_ps(res + i, result); } - // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) { res[i] += tmp[i]; @@ -15343,19 +15343,19 @@ static void ggml_compute_forward_mul_mat_axpy_q5_0( float *tmp = (float *)vy; int i; - // 计算剩余的元素个数 + int remainder = ne00 % 8; #if defined(__AVX2__) - // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) { - __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 - __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 - __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 - _mm256_storeu_ps(res + i, result); // 存储结果到res中 + __m256 res_vec = _mm256_loadu_ps(res + i); + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); + __m256 result = _mm256_add_ps(res_vec, tmp_vec); + _mm256_storeu_ps(res + i, result); } - // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) { res[i] += tmp[i]; @@ -15479,28 +15479,28 @@ static void ggml_compute_forward_mul_mat_axpy_head( break; } - // 获取锁 + / while (atomic_flag_test_and_set(&g_axpy_head_lock)) { - // 如果锁已经被占用,则等待 + } float *res = (float *)(dst->data); float *tmp = (float *)vy; int i; - // 计算剩余的元素个数 + int remainder = ne00 % 8; #if defined(__AVX2__) - // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) { - __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 - __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 - __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 - _mm256_storeu_ps(res + i, result); // 存储结果到res中 + __m256 res_vec = _mm256_loadu_ps(res + i); + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); + __m256 result = _mm256_add_ps(res_vec, tmp_vec); + _mm256_storeu_ps(res + i, result); } - // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) { res[i] += tmp[i]; } From 
4f126960dbe6a036c267d2203e78653f5473aa80 Mon Sep 17 00:00:00 2001
From: Holden
Date: Sun, 31 Mar 2024 23:42:42 +0800
Subject: [PATCH 13/13] fix syntax error

---
 ggml.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index f39444d2..f2837bcb 100644
--- a/ggml.c
+++ b/ggml.c
@@ -15479,7 +15479,6 @@ static void ggml_compute_forward_mul_mat_axpy_head(
             break;
     }

-    /
     while (atomic_flag_test_and_set(&g_axpy_head_lock)) {

     }
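The stray '/' removed here sat just above the spin lock that serializes the final accumulation in ggml_compute_forward_mul_mat_axpy_head. The lock itself is the standard C11 atomic_flag pattern: spin on atomic_flag_test_and_set until it returns false, run the critical section, then release with atomic_flag_clear. A minimal sketch of that pattern, using a local demo flag rather than the real g_axpy_head_lock (illustrative only):

    #include <stdatomic.h>

    static atomic_flag demo_lock = ATOMIC_FLAG_INIT;
    static float demo_sum = 0.0f;

    // The first thread to flip the flag proceeds; the others busy-wait until
    // atomic_flag_clear releases the lock.
    static void demo_accumulate(const float * tmp, int n) {
        while (atomic_flag_test_and_set(&demo_lock)) {
            // lock is held by another thread
        }
        for (int i = 0; i < n; ++i) {
            demo_sum += tmp[i];
        }
        atomic_flag_clear(&demo_lock);
    }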