@@ -257,13 +257,14 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
257
257
bundle = [NSBundle bundleForClass: [GGMLMetalClass class ]];
258
258
#endif
259
259
NSError * error = nil ;
260
- NSString * libPath = [bundle pathForResource: @" default " ofType: @" metallib" ];
260
+ NSString * libPath = [bundle pathForResource: @" ggml " ofType: @" metallib" ];
261
261
if (libPath != nil ) {
262
+ // pre-compiled library found
262
263
NSURL * libURL = [NSURL fileURLWithPath: libPath];
263
264
GGML_METAL_LOG_INFO (" %s : loading '%s '\n " , __func__, [libPath UTF8String ]);
264
265
ctx->library = [ctx->device newLibraryWithURL: libURL error: &error];
265
266
} else {
266
- GGML_METAL_LOG_INFO (" %s : default .metallib not found, loading from source\n " , __func__);
267
+ GGML_METAL_LOG_INFO (" %s : ggml .metallib not found, loading from source\n " , __func__);
267
268
268
269
NSString * sourcePath;
269
270
NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo ].environment objectForKey: @" GGML_METAL_PATH_RESOURCES" ];
@@ -291,6 +292,13 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
291
292
options = [MTLCompileOptions new ];
292
293
options.preprocessorMacros = @{ @" QK_K" : @(64 ) };
293
294
#endif
295
+ // try to disable fast-math
296
+ // NOTE: this seems to have no effect whatsoever
297
+ // instead, in order to disable fast-math, we have to build ggml.metallib from the command line
298
+ // using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air, then xcrun -sdk macosx metallib ggml-metal.air -o ggml.metallib
299
+ // and go through the "pre-compiled library found" path above
300
+ // [options setFastMathEnabled:false];
301
+
294
302
ctx->library = [ctx->device newLibraryWithSource: src options: options error: &error];
295
303
}
296
304
@@ -1230,7 +1238,7 @@ void ggml_metal_graph_compute(
1230
1238
// not sure how to avoid this
1231
1239
// TODO: make a simpler cpy_bytes kernel
1232
1240
1233
- const int nth = MIN (1024 , ne00);
1241
+ const int nth = MIN (( int ) ctx-> pipeline_cpy_f32_f32 . maxTotalThreadsPerThreadgroup , ne00);
1234
1242
1235
1243
[encoder setComputePipelineState: ctx->pipeline_cpy_f32_f32];
1236
1244
[encoder setBuffer: id_src0 offset: offs_src0 atIndex: 0 ];
@@ -1285,7 +1293,7 @@ void ggml_metal_graph_compute(
1285
1293
[encoder setBytes: &pnb3 length: sizeof (pnb3) atIndex: 26 ];
1286
1294
[encoder setBytes: &offs length: sizeof (offs) atIndex: 27 ];
1287
1295
1288
- const int nth = MIN (1024 , ne0 );
1296
+ const int nth = MIN (( int ) ctx-> pipeline_add . maxTotalThreadsPerThreadgroup , ne00 );
1289
1297
1290
1298
[encoder dispatchThreadgroups: MTLSizeMake (ne11, ne12, ne13) threadsPerThreadgroup: MTLSizeMake (nth, 1 , 1 )];
1291
1299
} break ;
@@ -1785,8 +1793,9 @@ void ggml_metal_graph_compute(
1785
1793
[encoder setBytes: &r3 length: sizeof (r3) atIndex: 17 ];
1786
1794
[encoder setBytes: &idx length: sizeof (idx) atIndex: 18 ];
1787
1795
// TODO: how to make this an array? read Metal docs
1788
- for (int j = 0 ; j < n_as; ++j) {
1789
- struct ggml_tensor * src_cur = dst->src [2 + j];
1796
+ for (int j = 0 ; j < 8 ; ++j) {
1797
+ // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
1798
+ struct ggml_tensor * src_cur = dst->src [2 + (j % n_as)];
1790
1799
1791
1800
size_t offs_src_cur = 0 ;
1792
1801
id <MTLBuffer > id_src_cur = ggml_metal_get_buffer (ctx, src_cur, &offs_src_cur);
@@ -1909,8 +1918,9 @@ void ggml_metal_graph_compute(
1909
1918
[encoder setBytes: &r3 length: sizeof (r3) atIndex: 21 ];
1910
1919
[encoder setBytes: &idx length: sizeof (idx) atIndex: 22 ];
1911
1920
// TODO: how to make this an array? read Metal docs
1912
- for (int j = 0 ; j < n_as; ++j) {
1913
- struct ggml_tensor * src_cur = dst->src [2 + j];
1921
+ for (int j = 0 ; j < 8 ; ++j) {
1922
+ // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
1923
+ struct ggml_tensor * src_cur = dst->src [2 + (j % n_as)];
1914
1924
1915
1925
size_t offs_src_cur = 0 ;
1916
1926
id <MTLBuffer > id_src_cur = ggml_metal_get_buffer (ctx, src_cur, &offs_src_cur);
@@ -2229,7 +2239,7 @@ void ggml_metal_graph_compute(
2229
2239
[encoder setBytes: &nb3 length: sizeof (nb3) atIndex: 17 ];
2230
2240
[encoder setBytes: &sf length: sizeof (sf) atIndex: 18 ];
2231
2241
2232
- const int nth = MIN (1024 , ne0);
2242
+ const int nth = MIN (( int ) ctx-> pipeline_upscale_f32 . maxTotalThreadsPerThreadgroup , ne0);
2233
2243
2234
2244
[encoder dispatchThreadgroups: MTLSizeMake (ne1, ne2, ne3) threadsPerThreadgroup: MTLSizeMake (nth, 1 , 1 )];
2235
2245
} break ;
0 commit comments