Skip to content

Commit bd38dde

Browse files
authored
vulkan: support copy from f32 to q4_0/q4_1/q5_0/q5_1/q8_0/iq4_nl (#11166)
* vulkan: support copy from f32 to q4_0/q4_1/q5_0/q5_1/q8_0/iq4_nl. Shaders are based on cpy.cu.
* vulkan: support copy from q4_0/q4_1/q5_0/q5_1/q8_0/iq4_nl to f32
* ggml: copy q->f32 assumes some contiguity in the destination
1 parent 466300f commit bd38dde

File tree

7 files changed

+446
-5
lines changed

7 files changed

+446
-5
lines changed

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3967,6 +3967,57 @@ static void ggml_compute_forward_dup_bytes(
39673967
}
39683968
}
39693969

3970+
static void ggml_compute_forward_dup_q(
3971+
const struct ggml_compute_params * params,
3972+
struct ggml_tensor * dst) {
3973+
3974+
const struct ggml_tensor * src0 = dst->src[0];
3975+
const struct ggml_tensor * src1 = dst->src[1];
3976+
3977+
GGML_TENSOR_BINARY_OP_LOCALS
3978+
3979+
const enum ggml_type type = src0->type;
3980+
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
3981+
3982+
size_t qk = ggml_blck_size(type);
3983+
const int64_t nr = ggml_nelements(src1) / qk;
3984+
3985+
// destination must be contiguous in the first dimension
3986+
GGML_ASSERT(nb10 == ggml_type_size(dst->type));
3987+
// must either have first dimension large enough to hold a row, or fully contiguous
3988+
GGML_ASSERT((ne10 % qk) == 0 || ggml_is_contiguous(dst));
3989+
3990+
const int ith = params->ith;
3991+
const int nth = params->nth;
3992+
3993+
const int dr = (nr + nth - 1)/nth;
3994+
3995+
// row range for this thread
3996+
const int ir0 = dr*ith;
3997+
const int ir1 = MIN(ir0 + dr, nr);
3998+
3999+
for (int64_t ir = ir0; ir < ir1; ++ir) {
4000+
4001+
uint32_t i = ir * qk;
4002+
4003+
const int64_t i03 = i/(ne00 * ne01 * ne02);
4004+
const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
4005+
const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
4006+
const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
4007+
const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
4008+
4009+
const int64_t i13 = i/(ne10 * ne11 * ne12);
4010+
const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
4011+
const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
4012+
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
4013+
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
4014+
4015+
dequantize_row_q(
4016+
(const void *) ((char *) src0->data + x_offset),
4017+
(float *) ((char *) dst->data + dst_offset), qk);
4018+
}
4019+
}
4020+
39704021
static void ggml_compute_forward_dup(
39714022
const struct ggml_compute_params * params,
39724023
struct ggml_tensor * dst) {
@@ -3993,6 +4044,10 @@ static void ggml_compute_forward_dup(
39934044
} break;
39944045
default:
39954046
{
4047+
if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
4048+
ggml_compute_forward_dup_q(params, dst);
4049+
break;
4050+
}
39964051
GGML_ABORT("fatal error");
39974052
}
39984053
}

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 72 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,8 @@ struct vk_device_struct {
228228
vk_pipeline pipeline_repeat_f32;
229229
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
230230
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
231+
vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
232+
vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
231233
vk_pipeline pipeline_norm_f32;
232234
vk_pipeline pipeline_group_norm_f32;
233235
vk_pipeline pipeline_rms_norm_f32;
@@ -1965,6 +1967,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
19651967
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
19661968
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
19671969

1970+
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
1971+
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
1972+
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
1973+
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
1974+
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
1975+
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
1976+
1977+
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
1978+
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
1979+
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_0], "cpy_q5_0_f32", cpy_q5_0_f32_len, cpy_q5_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
1980+
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
1981+
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
1982+
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
1983+
19681984
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
19691985
ggml_vk_create_pipeline(device, device->pipeline_add_f32_norepeat, "add_f32_norepeat", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
19701986
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
@@ -3689,6 +3705,33 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
36893705
return ctx->device->pipeline_cpy_f16_f16;
36903706
}
36913707
}
3708+
if (src->type == GGML_TYPE_F32) {
3709+
switch (to) {
3710+
case GGML_TYPE_Q4_0:
3711+
case GGML_TYPE_Q4_1:
3712+
case GGML_TYPE_Q5_0:
3713+
case GGML_TYPE_Q5_1:
3714+
case GGML_TYPE_Q8_0:
3715+
case GGML_TYPE_IQ4_NL:
3716+
return ctx->device->pipeline_cpy_f32_quant[to];
3717+
default:
3718+
break;
3719+
}
3720+
}
3721+
3722+
if (to == GGML_TYPE_F32) {
3723+
switch (src->type) {
3724+
case GGML_TYPE_Q4_0:
3725+
case GGML_TYPE_Q4_1:
3726+
case GGML_TYPE_Q5_0:
3727+
case GGML_TYPE_Q5_1:
3728+
case GGML_TYPE_Q8_0:
3729+
case GGML_TYPE_IQ4_NL:
3730+
return ctx->device->pipeline_cpy_quant_f32[src->type];
3731+
default:
3732+
break;
3733+
}
3734+
}
36923735

36933736
std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
36943737
GGML_ABORT("fatal error");
@@ -5160,7 +5203,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
51605203
}
51615204
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
51625205
std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
5163-
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
5206+
GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
51645207
GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
51655208
GGML_ASSERT(dst->buffer != nullptr);
51665209
const uint64_t ne00 = src0->ne[0];
@@ -7905,12 +7948,36 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
79057948
{
79067949
ggml_type src0_type = op->src[0]->type;
79077950
ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
7908-
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
7909-
return true;
7951+
7952+
if (src0_type == GGML_TYPE_F32) {
7953+
switch (src1_type) {
7954+
case GGML_TYPE_F32:
7955+
case GGML_TYPE_F16:
7956+
case GGML_TYPE_Q4_0:
7957+
case GGML_TYPE_Q4_1:
7958+
case GGML_TYPE_Q5_0:
7959+
case GGML_TYPE_Q5_1:
7960+
case GGML_TYPE_Q8_0:
7961+
case GGML_TYPE_IQ4_NL:
7962+
return true;
7963+
default:
7964+
break;
7965+
}
79107966
}
7911-
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
7912-
return true;
7967+
if (src1_type == GGML_TYPE_F32) {
7968+
switch (src0_type) {
7969+
case GGML_TYPE_Q4_0:
7970+
case GGML_TYPE_Q4_1:
7971+
case GGML_TYPE_Q5_0:
7972+
case GGML_TYPE_Q5_1:
7973+
case GGML_TYPE_Q8_0:
7974+
case GGML_TYPE_IQ4_NL:
7975+
return true;
7976+
default:
7977+
break;
7978+
}
79137979
}
7980+
79147981
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
79157982
return true;
79167983
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#version 450
2+
3+
#include "types.comp"
4+
#include "generic_unary_head.comp"
5+
#include "dequant_funcs.comp"
6+
7+
#if defined(DATA_A_IQ4_NL)
8+
// 16 invocations needed for init_iq4nl_shmem
9+
layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
10+
#else
11+
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
12+
#endif
13+
14+
// Dequantizes one quantized block of QUANT_K values from the source and writes
// the resulting floats to data_d. Each workgroup handles a single block.
void main() {
#if defined(DATA_A_IQ4_NL)
    // Every invocation of the workgroup takes part in filling the IQ4_NL table;
    // after that only invocation 0 continues with the actual copy.
    init_iq4nl_shmem();
    if (gl_LocalInvocationIndex != 0) {
        return;
    }
#endif

    // Flat index of the first element handled by this workgroup.
    // NOTE(review): the 262144 / 512 factors presumably mirror how the host
    // splits the dispatch across z / y - confirm against the dispatch code.
    const uint first_elem = gl_WorkGroupID.x * QUANT_K
                          + gl_WorkGroupID.y * 512
                          + gl_WorkGroupID.z * 262144;

    // Nothing to do for workgroups past the end of the tensor.
    if (first_elem >= p.ne) {
        return;
    }

    const uint out_base = get_doffset() + dst_idx(first_elem);
    const uint block    = src0_idx_quant(first_elem, QUANT_K);

    const uint a_offset = 0;
    const vec2 dm = get_dm(block, a_offset);

    // Four values are dequantized per iteration, then scaled by dm.x and shifted by dm.y.
    [[unroll]] for (int j = 0; j < QUANT_K; j += 4) {
        vec4 q = dequantize4(block, j / QUANT_R, a_offset);
        q = q * dm.x + vec4(dm.y);

#if QUANT_R == 2
        // Components 0 and 2 land in the first half of the output block,
        // components 1 and 3 in the second half (offset by QUANT_K/2).
        data_d[out_base + j/2 + 0]             = q.x;
        data_d[out_base + j/2 + QUANT_K/2 + 0] = q.y;
        data_d[out_base + j/2 + 1]             = q.z;
        data_d[out_base + j/2 + QUANT_K/2 + 1] = q.w;
#else
        // The four components go to four consecutive output slots.
        data_d[out_base + j + 0] = q.x;
        data_d[out_base + j + 1] = q.y;
        data_d[out_base + j + 2] = q.z;
        data_d[out_base + j + 3] = q.w;
#endif
    }
}

0 commit comments

Comments
 (0)