
Commit fd08ab8

some cleanup with tinyblas backend
1 parent 4a5ac65 commit fd08ab8

13 files changed: +262 -148 lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -84,8 +84,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
 
 # change the default for these ggml options
-if (NOT DEFINED GGML_LLAMAFILE)
-    set(GGML_LLAMAFILE_DEFAULT ON)
+if (NOT DEFINED GGML_TINYBLAS)
+    set(GGML_TINYBLAS ON)
 endif()
 
 if (NOT DEFINED GGML_AMX)

docs/android.md

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ $ cmake \
     -DCMAKE_C_FLAGS="-march=armv8.7a" \
     -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
     -DGGML_OPENMP=OFF \
-    -DGGML_LLAMAFILE=OFF \
+    -DGGML_TINYBLAS=OFF \
     -B build-android
 ```

docs/build.md

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.
 
 **Notes**:
 
-- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
+- For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`.
 - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
 - For faster repeated compilation, install [ccache](https://ccache.dev/).
 - For debug builds, there are two cases:
@@ -393,4 +393,4 @@ To read documentation for how to build on Android, [click here](./android.md)
 
 Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
 
-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`).

ggml/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
@@ -57,8 +57,8 @@ else()
 endif()
 
 # defaults
-if (NOT GGML_LLAMAFILE_DEFAULT)
-    set(GGML_LLAMAFILE_DEFAULT OFF)
+if (NOT GGML_TINYBLAS_DEFAULT)
+    set(GGML_TINYBLAS_DEFAULT OFF)
 endif()
 
 if (NOT GGML_CUDA_GRAPHS_DEFAULT)
@@ -124,8 +124,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
 option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
     "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
-
+option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF)
 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
@@ -231,6 +230,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
+    include/ggml-tinyblas.h
     include/ggml-vulkan.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

ggml/src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -256,6 +256,7 @@ ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
+ggml_add_backend(TINYBLAS)
 ggml_add_backend(Vulkan)
 ggml_add_backend(MUSA)

ggml/src/ggml-backend-reg.cpp

Lines changed: 5 additions & 3 deletions
@@ -91,10 +91,12 @@ struct ggml_backend_registry {
             return;
         }
 
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+        GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n",
             __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
-#endif
+        //#ifndef NDEBUG
+        //    GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+        //        __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
+        //#endif
         backends.push_back(reg);
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i));

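With this change, backend registration is logged unconditionally at INFO level instead of only in debug builds. Below is a minimal sketch of what the resulting startup line looks like, reusing the format string from the hunk above; the function name `register_backend` and the backend name `TINYBLAS` are illustrative assumptions, since the real values come from `__func__`, `ggml_backend_reg_name(reg)` and `ggml_backend_reg_dev_count(reg)` at runtime.

```c
/* Sketch only: reproduces the new INFO-level registration message.
 * "register_backend" and "TINYBLAS" are placeholders; the real code
 * substitutes __func__, ggml_backend_reg_name(reg) and
 * ggml_backend_reg_dev_count(reg). */
#include <stdio.h>

int main(void) {
    const char * func = "register_backend"; /* placeholder for __func__ */
    const char * name = "TINYBLAS";         /* placeholder backend name */
    size_t ndev = 1;                        /* placeholder device count */
    printf("%s: registered backend %s (%zu devices)\n", func, name, ndev);
    return 0;
}
```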
ggml/src/ggml-common.h

Lines changed: 41 additions & 17 deletions
@@ -6,7 +6,20 @@
 typedef uint16_t ggml_half;
 typedef uint32_t ggml_half2;
 
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CPP)
+#include <cstdint>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+// std-c++ allow anonymous unions but some compiler warn on it
+#define GGML_COMMON_AGGR_U data
+// std-c++ do not allow it.
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_METAL)
@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)
@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_HIP)
@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_SYCL)
@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
 typedef sycl::half ggml_half;
 typedef sycl::half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #endif
@@ -154,9 +171,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
@@ -175,9 +192,9 @@ typedef struct {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
        ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
    uint8_t qh[4]; // 5-th bit of quants
    uint8_t qs[QK5_1 / 2]; // nibbles / quants
 } block_q5_1;
@@ -196,9 +213,9 @@ typedef struct {
        struct {
            ggml_half d; // delta
            ggml_half s; // d * sum(qs[i])
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
        ggml_half2 ds;
-    };
+    } GGML_COMMON_AGGR_U;
    int8_t qs[QK8_1]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
@@ -261,9 +278,9 @@ typedef struct {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
        ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
@@ -288,9 +305,9 @@ typedef struct {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
        ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];           // 4--bit quants
 } block_q4_K;
@@ -305,9 +322,9 @@ typedef struct {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
        ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8];           // quants, high bit
    uint8_t qs[QK_K/2];           // quants, low 4 bits
@@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
 #define GGML_TABLE_END() };
 
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CPP)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_METAL)
 #include <metal_stdlib>

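To make the renamed aggregate macros concrete, here is a minimal sketch (not part of the commit) of how `block_q4_1` expands under two of the declaration modes shown above. `QK4_1` is 32 in ggml, only the fields from the hunks are reproduced, and the type names `block_q4_1_c` / `block_q4_1_dev` are local to this sketch.

```c
/* Sketch of the GGML_COMMON_AGGR_U / GGML_COMMON_AGGR_S expansion for block_q4_1.
 * Reconstructed from the hunks above for illustration; not part of the commit. */
#include <stdint.h>

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;
#define QK4_1 32

/* C declaration (GGML_COMMON_DECL_C): both macros expand to nothing, so the
 * union and the inner struct stay anonymous (C11) and fields are reached
 * directly as blk.d, blk.m and blk.dm. */
typedef struct {
    union {
        struct {
            ggml_half d;       /* delta */
            ggml_half m;       /* min   */
        };                     /* GGML_COMMON_AGGR_S -> (empty) */
        ggml_half2 dm;
    };                         /* GGML_COMMON_AGGR_U -> (empty) */
    uint8_t qs[QK4_1 / 2];     /* nibbles / quants */
} block_q4_1_c;

/* CUDA/HIP/SYCL declaration: GGML_COMMON_AGGR_U stays empty while
 * GGML_COMMON_AGGR_S expands to "data", so the half pair is reached as
 * blk.data.d / blk.data.m and the packed value as blk.dm. */
typedef struct {
    union {
        struct {
            ggml_half d;
            ggml_half m;
        } data;                /* GGML_COMMON_AGGR_S -> data */
        ggml_half2 dm;
    };                         /* GGML_COMMON_AGGR_U -> (empty) */
    uint8_t qs[QK4_1 / 2];
} block_q4_1_dev;
```

The new `GGML_COMMON_DECL_CPP` branch names both members `data`, per the comments in the hunk: standard C++ does not allow anonymous structs, and while it does allow anonymous unions, some compilers warn on them.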
ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 0 additions & 10 deletions
@@ -44,16 +44,6 @@ if (GGML_OPENMP)
     endif()
 endif()
 
-if (GGML_LLAMAFILE)
-    message(STATUS "Using llamafile")
-
-    add_compile_definitions(GGML_USE_LLAMAFILE)
-
-    target_sources(ggml-cpu PRIVATE
-                   llamafile/sgemm.cpp
-                   llamafile/sgemm.h)
-endif()
-
 if (GGML_CPU_HBM)
     find_library(memkind memkind REQUIRED)

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 0 additions & 59 deletions
@@ -39,14 +39,6 @@
 #include <omp.h>
 #endif
 
-#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
-#undef GGML_USE_LLAMAFILE
-#endif
-
-#ifdef GGML_USE_LLAMAFILE
-#include "llamafile/sgemm.h"
-#endif
-
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -7466,33 +7458,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-#if GGML_USE_LLAMAFILE
-    // broadcast factors
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    if (src1_cont) {
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)src1->data + i12*nb12 + i13*nb13,
-                                     nb11/ggml_type_size(src1->type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     src1->type,
-                                     dst->type))
-                    goto UseGgmlGemm1;
-        return;
-    }
-UseGgmlGemm1:;
-#endif
-
     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;
 
@@ -7530,30 +7495,6 @@ UseGgmlGemm1:;
 
     ggml_barrier(params->threadpool);
 
-#if GGML_USE_LLAMAFILE
-    if (src1->type != vec_dot_type) {
-        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
-                                     row_size/ggml_type_size(vec_dot_type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     vec_dot_type,
-                                     dst->type))
-                    goto UseGgmlGemm2;
-        return;
-    }
-UseGgmlGemm2:;
-#endif
-
     // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
     const int64_t nr0 = ne0;

ggml/src/ggml-tinyblas/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,5 @@
+message(STATUS "Using TINYBLAS")
+
 add_library(ggml-tinyblas
             ggml-tinyblas.cpp
             )
@@ -225,6 +227,10 @@ endif()
 target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
 
+#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES CXX_STANDARD 17)
+#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS "-std=c++17")
+target_compile_features (ggml-tinyblas PRIVATE cxx_std_17)
+
 if (EMSCRIPTEN)
     set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()
