Commit 465219b

CLBlast: Add outer loops over src0 for broadcasting in mulmat
Reduce repeated dequantization of the same data.
1 parent d1031cf commit 465219b
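
The change in a nutshell: the old loops iterated over src1's batch indices (i13, i12) and derived the matching src0 slice by division (i03 = i13 / r3, i02 = i12 / r2), relying on the pi02/pi03 cache to skip repeat uploads. The new loops invert the nesting so that a src0 slice is copied (or dequantized) once and then reused by an inner loop over the src1 slices that broadcast against it; the TODO comments note that the r3 > 1 case could hoist this one level further. Below is a minimal, self-contained C++ sketch of the index mapping only; the ne/r values are placeholders chosen for illustration and are not taken from ggml.

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical batch sizes and broadcast factors, for illustration only.
    const int64_t ne02 = 2, ne03 = 2;   // src0 batch dims
    const int64_t r2   = 3, r3   = 2;   // r2 = ne12/ne02, r3 = ne13/ne03
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                // In the new nesting, the (i03, i02) slice of src0 would be
                // uploaded or dequantized once here ...
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    // ... and reused by every (i13, i12) slice of src1 that
                    // broadcasts against it. The pairs visited are exactly the
                    // ones the old mapping i02 = i12 / r2, i03 = i13 / r3 produced.
                    printf("src0(%lld,%lld) <- src1(%lld,%lld)\n",
                           (long long) i03, (long long) i02,
                           (long long) i13, (long long) i12);
                }
            }
        }
    }
    return 0;
}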

File tree

1 file changed: ggml-opencl.cpp (+161 additions, -169 deletions)


ggml-opencl.cpp

Lines changed: 161 additions & 169 deletions
@@ -1489,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy data to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
-            CL_CHECK(clFinish(queue));
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                }
 
-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                 clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                 ne01, ne11, ne10,
-                                                                 alpha,
-                                                                 d_X, x_offset, ne00,
-                                                                 d_Y, 0, ne10,
-                                                                 beta,
-                                                                 d_D, 0, ne01,
-                                                                 &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // copy src1 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                                         clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                         ne01, ne11, ne10,
+                                                                         alpha,
+                                                                         d_X, x_offset, ne00,
+                                                                         d_Y, 0, ne10,
+                                                                         beta,
+                                                                         d_D, 0, ne01,
+                                                                         &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
 
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                }
+            }
         }
     }
 
@@ -1589,73 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
 
-            // copy src0 to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-
-            // convert src1 to fp16
-            // TODO: use multiple threads
-            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
-            if (src1_cont_rows) {
-                if (src1_cont_cols) {
-                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 }
-                else {
-                    for (int64_t i11 = 0; i11 < ne11; i11++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // convert src1 to fp16
+                    // TODO: use multiple threads
+                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
+                    if (src1_cont_rows) {
+                        if (src1_cont_cols) {
+                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                        }
+                        else {
+                            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+                            }
+                        }
                     }
-                }
-            }
-            else {
-                for (int64_t i11 = 0; i11 < ne11; i11++) {
-                    for (int64_t i10 = 0; i10 < ne10; i10++) {
-                        // very slow due to no inlining
-                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                    else {
+                        for (int64_t i11 = 0; i11 < ne11; i11++) {
+                            for (int64_t i10 = 0; i10 < ne10; i10++) {
+                                // very slow due to no inlining
+                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                            }
+                        }
                     }
-                }
-            }
-
-            // copy src1 to device
-            CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
-
-            CL_CHECK(clFinish(queue));
 
-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
-                                                                clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                ne01, ne11, ne10,
-                                                                alpha,
-                                                                d_X, x_offset, ne00,
-                                                                d_Y, 0, ne10,
-                                                                beta,
-                                                                d_D, 0, ne01,
-                                                                &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                    // copy src1 to device
+                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
+
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
+                                                                        clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                        ne01, ne11, ne10,
+                                                                        alpha,
+                                                                        d_X, x_offset, ne00,
+                                                                        d_Y, 0, ne10,
+                                                                        beta,
+                                                                        d_D, 0, ne01,
+                                                                        &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
 
-            // copy dst to host, then convert to float
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+                    // copy dst to host, then convert to float
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
-            ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                }
+            }
         }
     }
 
@@ -1718,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy src0 to device if necessary
-            if (src0->backend == GGML_BACKEND_CPU) {
-                if (i02 != pi02 || i03 != pi03) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy and dequantize src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                // copy src0 to device if necessary
+                if (src0->backend == GGML_BACKEND_CPU) {
                     events.emplace_back();
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                    pi02 = i02;
-                    pi03 = i03;
-                }
-            } else if (src0->backend == GGML_BACKEND_GPU) {
-                d_Q = (cl_mem) src0->extra;
-            } else {
-                GGML_ASSERT(false);
-            }
-            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                // copy src1 to device
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
-                // compute
-                const size_t global = ne01 * local;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                const cl_int ncols = ne00;
-                events.emplace_back();
-                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
-                // convert src0 to fp32 on device
-                const size_t global = x_ne / global_denom;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-
-                // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
-                events.emplace_back();
-
-                // wait for conversion
-                CL_CHECK(clFinish(queue));
-
-                // compute
-                clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                     clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                     ne01, ne11, ne10,
-                                                                     alpha,
-                                                                     d_X, 0, ne00,
-                                                                     d_Y, 0, ne10,
-                                                                     beta,
-                                                                     d_D, 0, ne01,
-                                                                     &queue, events.data() + ev_idx++);
-
-                if (status != clblast::StatusCode::kSuccess) {
+                } else if (src0->backend == GGML_BACKEND_GPU) {
+                    d_Q = (cl_mem) src0->extra;
+                } else {
                     GGML_ASSERT(false);
                 }
-            }
 
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
-            for (auto *event : events) {
-                clReleaseEvent(event);
-            }
+                if (!mul_mat_vec) {
+                    // convert src0 to fp32 on device
+                    const size_t global = x_ne / global_denom;
+                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                }
 
-            ev_idx = 0;
-            events.clear();
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                        // copy src1 to device
+                        events.emplace_back();
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
+
+                        // compute
+                        const size_t global = ne01 * local;
+                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                        const cl_int ncols = ne00;
+                        events.emplace_back();
+                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
+                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
+                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
+                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
+                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                    } else { // CLBlast matrix matrix multiplication
+                        // copy src1 to device
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                        // wait for conversion
+                        CL_CHECK(clFinish(queue));
+
+                        // compute
+                        events.emplace_back();
+                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                                             clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                             ne01, ne11, ne10,
+                                                                             alpha,
+                                                                             d_X, 0, ne00,
+                                                                             d_Y, 0, ne10,
+                                                                             beta,
+                                                                             d_D, 0, ne01,
+                                                                             &queue, events.data() + ev_idx++);
+
+                        if (status != clblast::StatusCode::kSuccess) {
+                            GGML_ASSERT(false);
+                        }
+                    }
+
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+                    for (auto *event : events) {
+                        clReleaseEvent(event);
+                    }
+
+                    ev_idx = 0;
+                    events.clear();
+                }
+            }
         }
     }
 
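For the quantized path (ggml_cl_mul_mat_q_f32) this is where the "reduce repeated dequantization" from the commit message comes from: in the GEMM branch the to_fp32_cl kernel used to run once per (i13, i12) src1 slice, and now runs before the inner i12 loop, once per (i13, i02) pair. A rough counting sketch with made-up batch sizes (not taken from ggml):

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical shapes, for illustration only: src0 has ne02 x ne03 batch
    // slices and src1 broadcasts against it with factors r2 and r3.
    const int64_t ne02 = 4, ne03 = 1;
    const int64_t r2 = 8, r3 = 1;
    const int64_t ne12 = ne02 * r2, ne13 = ne03 * r3;

    // Old nesting (GEMM branch): to_fp32_cl ran inside the i12 loop,
    // i.e. once per src1 slice.
    const int64_t old_dequants = ne13 * ne12;

    // New nesting: it runs before the i12 loop, once per (i13, i02) pair;
    // the TODO in the diff notes that r3 > 1 still repeats this across i13.
    const int64_t new_dequants = ne13 * ne02;

    printf("to_fp32_cl launches: old = %lld, new = %lld\n",
           (long long) old_dequants, (long long) new_dequants);
    return 0;
}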
0 commit comments