Skip to content

Commit 977ab13

Browse files
ggerganov authored and hodlen committed
metal : enable shader debugging (cmake option) (ggml-org#4705)
* ggml : disable fast-math for Metal (cmake build only)

  ggml-ci

* metal : fix Metal API debug warnings
* cmake : add -fno-inline for Metal build (ggml-org#4545)
* metal : fix API debug warnings
* metal : fix compile warnings
* metal : use uint64_t for strides
* cmake : rename option to LLAMA_METAL_SHADER_DEBUG
* metal : fix mat-vec Q8_0 kernel for BS > 1
* metal : normalize mat-vec kernel signatures
* cmake : respect LLAMA_QKK_64 option
* metal : fix mat-vec Q4_K kernel for QK_K == 64

  ggml-ci
1 parent cac3eee commit 977ab13

File tree

5 files changed

+329
-230
lines changed

5 files changed

+329
-230
lines changed

CMakeLists.txt

Lines changed: 31 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,7 @@ option(LLAMA_HIP_UMA "llama: use HIP unified memory arch
9595
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
9696
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
9797
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
98+
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
9899
option(LLAMA_MPI "llama: use MPI" OFF)
99100
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
100101

@@ -154,9 +155,9 @@ if (APPLE AND LLAMA_ACCELERATE)
154155
endif()
155156

156157
if (LLAMA_METAL)
157-
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
158-
find_library(METAL_FRAMEWORK Metal REQUIRED)
159-
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
158+
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
159+
find_library(METAL_FRAMEWORK Metal REQUIRED)
160+
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
160161

161162
message(STATUS "Metal framework found")
162163
set(GGML_HEADERS_METAL ggml-metal.h)
@@ -173,6 +174,33 @@ if (LLAMA_METAL)
173174
# copy ggml-metal.metal to bin directory
174175
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
175176

177+
if (LLAMA_METAL_SHADER_DEBUG)
178+
# custom command to do the following:
179+
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
180+
# xcrun -sdk macosx metallib ggml-metal.air -o ggml.metallib
181+
#
182+
# note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
183+
# disabling fast math is needed in order to pass tests/test-backend-ops
184+
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
185+
set(XC_FLAGS -fno-fast-math -fno-inline -g)
186+
if (LLAMA_QKK_64)
187+
set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
188+
endif()
189+
190+
add_custom_command(
191+
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
192+
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
193+
COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
194+
DEPENDS ggml-metal.metal
195+
COMMENT "Compiling Metal kernels"
196+
)
197+
198+
add_custom_target(
199+
ggml-metal ALL
200+
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
201+
)
202+
endif()
203+
176204
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
177205
${FOUNDATION_LIBRARY}
178206
${METAL_FRAMEWORK}

ci/run.sh

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,12 @@ sd=`dirname $0`
3030
cd $sd/../
3131
SRC=`pwd`
3232

33+
CMAKE_EXTRA=""
34+
35+
if [ ! -z ${GG_BUILD_METAL} ]; then
36+
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
37+
fi
38+
3339
## helpers
3440

3541
# download a file if it does not exist or if it is outdated
@@ -81,8 +87,8 @@ function gg_run_ctest_debug {
8187

8288
set -e
8389

84-
(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
85-
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
90+
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
91+
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
8692

8793
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
8894

@@ -109,8 +115,8 @@ function gg_run_ctest_release {
109115

110116
set -e
111117

112-
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
113-
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
118+
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
119+
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
114120

115121
if [ -z ${GG_BUILD_LOW_PERF} ]; then
116122
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log

ggml-metal.m

Lines changed: 19 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -257,13 +257,14 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
257257
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
258258
#endif
259259
NSError * error = nil;
260-
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
260+
NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
261261
if (libPath != nil) {
262+
// pre-compiled library found
262263
NSURL * libURL = [NSURL fileURLWithPath:libPath];
263264
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
264265
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
265266
} else {
266-
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
267+
GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
267268

268269
NSString * sourcePath;
269270
NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
@@ -291,6 +292,13 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
291292
options = [MTLCompileOptions new];
292293
options.preprocessorMacros = @{ @"QK_K" : @(64) };
293294
#endif
295+
// try to disable fast-math
296+
// NOTE: this seems to have no effect whatsoever
297+
// instead, in order to disable fast-math, we have to build ggml.metallib from the command line
298+
// using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
299+
// and go through the "pre-compiled library found" path above
300+
//[options setFastMathEnabled:false];
301+
294302
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
295303
}
296304

@@ -1230,7 +1238,7 @@ void ggml_metal_graph_compute(
12301238
// not sure how to avoid this
12311239
// TODO: make a simpler cpy_bytes kernel
12321240

1233-
const int nth = MIN(1024, ne00);
1241+
const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);
12341242

12351243
[encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
12361244
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1285,7 +1293,7 @@ void ggml_metal_graph_compute(
12851293
[encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
12861294
[encoder setBytes:&offs length:sizeof(offs) atIndex:27];
12871295

1288-
const int nth = MIN(1024, ne0);
1296+
const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);
12891297

12901298
[encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
12911299
} break;
@@ -1785,8 +1793,9 @@ void ggml_metal_graph_compute(
17851793
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
17861794
[encoder setBytes:&idx length:sizeof(idx) atIndex:18];
17871795
// TODO: how to make this an array? read Metal docs
1788-
for (int j = 0; j < n_as; ++j) {
1789-
struct ggml_tensor * src_cur = dst->src[2 + j];
1796+
for (int j = 0; j < 8; ++j) {
1797+
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
1798+
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
17901799

17911800
size_t offs_src_cur = 0;
17921801
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@@ -1909,8 +1918,9 @@ void ggml_metal_graph_compute(
19091918
[encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
19101919
[encoder setBytes:&idx length:sizeof(idx) atIndex:22];
19111920
// TODO: how to make this an array? read Metal docs
1912-
for (int j = 0; j < n_as; ++j) {
1913-
struct ggml_tensor * src_cur = dst->src[2 + j];
1921+
for (int j = 0; j < 8; ++j) {
1922+
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
1923+
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
19141924

19151925
size_t offs_src_cur = 0;
19161926
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@@ -2229,7 +2239,7 @@ void ggml_metal_graph_compute(
22292239
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
22302240
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
22312241

2232-
const int nth = MIN(1024, ne0);
2242+
const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);
22332243

22342244
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
22352245
} break;

0 commit comments

Comments
 (0)