Skip to content

Commit 210b970

Browse files
mingfeimaarthw
authored andcommitted
add amx kernel for gemm (ggml-org#8998)
add intel amx isa detection add vnni kernel for gemv cases add vnni and amx kernel support for block_q8_0 code cleanup fix packing B issue enable openmp fine tune amx kernel switch to aten parallel pattern add error message for nested parallelism code cleanup add f16 support in ggml-amx add amx kernels for QK_K quant formats: Q4_K, Q5_K, Q6_K and IQ4_XS update CMakeList update README fix some compilation warning fix compiler warning when amx is not enabled minor change ggml-ci move ggml_amx_init from ggml.c to ggml-amx/mmq.cpp ggml-ci update CMakeLists with -mamx-tile, -mamx-int8 and -mamx-bf16 ggml-ci add amx as an ggml-backend update header file, the old path for immintrin.h has changed to ggml-cpu-impl.h minor change update CMakeLists.txt minor change apply weight prepacking in set_tensor method in ggml-backend fix compile error ggml-ci minor change ggml-ci update CMakeLists.txt ggml-ci add march dependency minor change ggml-ci change ggml_backend_buffer_is_host to return false for amx backend ggml-ci fix supports_op use device reg for AMX backend ggml-ci minor change ggml-ci minor change fix rebase set .buffer_from_host_ptr to be false for AMX backend
1 parent cbfd0e7 commit 210b970

File tree

14 files changed

+3204
-7
lines changed

14 files changed

+3204
-7
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
8888
set(GGML_LLAMAFILE_DEFAULT ON)
8989
endif()
9090

91+
if (NOT DEFINED GGML_AMX)
92+
set(GGML_AMX ON)
93+
endif()
94+
9195
if (NOT DEFINED GGML_CUDA_GRAPHS)
9296
set(GGML_CUDA_GRAPHS_DEFAULT ON)
9397
endif()

Makefile

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,6 @@ GGML_METAL := 1
9393
DEPRECATE_WARNING := 1
9494
endif
9595

96-
ifdef LLAMA_OPENMP
97-
GGML_OPENMP := 1
98-
DEPRECATE_WARNING := 1
99-
endif
100-
10196
ifdef LLAMA_RPC
10297
GGML_RPC := 1
10398
DEPRECATE_WARNING := 1
@@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
584579
OBJ_GGML += ggml/src/llamafile/sgemm.o
585580
endif
586581

582+
ifndef GGML_NO_AMX
583+
MK_CPPFLAGS += -DGGML_USE_AMX
584+
OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
585+
endif
586+
587587
ifdef GGML_RPC
588588
MK_CPPFLAGS += -DGGML_USE_RPC
589589
OBJ_GGML += ggml/src/ggml-rpc.o
@@ -1086,6 +1086,19 @@ ggml/src/llamafile/sgemm.o: \
10861086
$(CXX) $(CXXFLAGS) -c $< -o $@
10871087
endif # GGML_NO_LLAMAFILE
10881088

1089+
ifndef GGML_NO_AMX
1090+
ggml/src/ggml-amx.o: \
1091+
ggml/src/ggml-amx.cpp \
1092+
ggml/include/ggml-amx.h
1093+
$(CXX) $(CXXFLAGS) -c $< -o $@
1094+
1095+
ggml/src/ggml-amx/mmq.o: \
1096+
ggml/src/ggml-amx/mmq.cpp \
1097+
ggml/src/ggml-amx/mmq.h \
1098+
ggml/include/ggml.h
1099+
$(CXX) $(CXXFLAGS) -c $< -o $@
1100+
endif
1101+
10891102
ifdef GGML_RPC
10901103
ggml/src/ggml-rpc.o: \
10911104
ggml/src/ggml-rpc.cpp \
@@ -1237,6 +1250,7 @@ clean:
12371250
rm -vrf ggml/src/ggml-metal-embed.metal
12381251
rm -vrf ggml/src/ggml-cuda/*.o
12391252
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
1253+
rm -vrf ggml/src/ggml-amx/*.o
12401254
rm -rvf $(BUILD_TARGETS)
12411255
rm -rvf $(TEST_TARGETS)
12421256
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ variety of hardware - locally and in the cloud.
6363

6464
- Plain C/C++ implementation without any dependencies
6565
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
66-
- AVX, AVX2 and AVX512 support for x86 architectures
66+
- AVX, AVX2, AVX512 and AMX support for x86 architectures
6767
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
6868
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
6969
- Vulkan and SYCL backend support

ggml/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
9999
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
100100
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
101101
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
102+
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
103+
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
104+
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
102105
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
103106
if (NOT MSVC)
104107
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@@ -158,6 +161,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
158161
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
159162
option(GGML_OPENMP "ggml: use OpenMP" ON)
160163
option(GGML_RPC "ggml: use RPC" OFF)
164+
option(GGML_AMX "ggml: use AMX" OFF)
161165
option(GGML_SYCL "ggml: use SYCL" OFF)
162166
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
163167
set (GGML_SYCL_TARGET "INTEL" CACHE STRING

ggml/include/ggml-amx.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#pragma once
2+
3+
#include "ggml.h"
4+
#include "ggml-backend.h"
5+
6+
7+
#ifdef __cplusplus
8+
extern "C" {
9+
#endif
10+
11+
// buffer_type API
12+
GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
13+
14+
GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);
15+
16+
// backend API
17+
GGML_API ggml_backend_t ggml_backend_amx_init(void);
18+
19+
GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
20+
21+
GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);
22+
23+
#ifdef __cplusplus
24+
}
25+
#endif

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2488,6 +2488,7 @@ extern "C" {
24882488
GGML_API int ggml_cpu_has_avx512_vbmi(void);
24892489
GGML_API int ggml_cpu_has_avx512_vnni(void);
24902490
GGML_API int ggml_cpu_has_avx512_bf16(void);
2491+
GGML_API int ggml_cpu_has_amx_int8 (void);
24912492
GGML_API int ggml_cpu_has_fma (void);
24922493
GGML_API int ggml_cpu_has_neon (void);
24932494
GGML_API int ggml_cpu_has_sve (void);

ggml/src/CMakeLists.txt

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,26 @@ if (GGML_LLAMAFILE)
267267
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
268268
endif()
269269

270+
if (GGML_AMX)
271+
if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
272+
else()
273+
set(GGML_AMX OFF)
274+
message(WARNING "AMX requires gcc version > 11.0. Turning off GGML_AMX.")
275+
endif()
276+
277+
if (GGML_AMX)
278+
message(STATUS "Using AMX")
279+
280+
list(APPEND GGML_CDEF_PUBLIC GGML_USE_AMX)
281+
282+
file(GLOB GGML_HEADERS_AMX "ggml-amx/*.h")
283+
list(APPEND GGML_HEADERS_AMX "../include/ggml-amx.h")
284+
285+
file(GLOB GGML_SOURCES_AMX "ggml-amx/*.cpp")
286+
list(APPEND GGML_SOURCES_AMX "ggml-amx.cpp")
287+
endif()
288+
endif()
289+
270290
if (GGML_CUDA)
271291
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
272292

@@ -1180,6 +1200,18 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
11801200
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
11811201
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
11821202
endif()
1203+
if (GGML_AMX_TILE)
1204+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
1205+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
1206+
endif()
1207+
if (GGML_AMX_INT8)
1208+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
1209+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
1210+
endif()
1211+
if (GGML_AMX_BF16)
1212+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
1213+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
1214+
endif()
11831215
elseif (GGML_AVX2)
11841216
list(APPEND ARCH_FLAGS /arch:AVX2)
11851217
elseif (GGML_AVX)
@@ -1215,6 +1247,15 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
12151247
if (GGML_AVX512_BF16)
12161248
list(APPEND ARCH_FLAGS -mavx512bf16)
12171249
endif()
1250+
if (GGML_AMX_TILE)
1251+
list(APPEND ARCH_FLAGS -mamx-tile)
1252+
endif()
1253+
if (GGML_AMX_INT8)
1254+
list(APPEND ARCH_FLAGS -mamx-int8)
1255+
endif()
1256+
if (GGML_AMX_BF16)
1257+
list(APPEND ARCH_FLAGS -mamx-bf16)
1258+
endif()
12181259
endif()
12191260
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
12201261
message(STATUS "PowerPC detected")
@@ -1340,6 +1381,7 @@ add_library(ggml
13401381
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
13411382
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
13421383
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
1384+
${GGML_SOURCES_AMX} ${GGML_HEADERS_AMX}
13431385
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
13441386
ggml-aarch64.c ggml-aarch64.h
13451387
)

0 commit comments

Comments
 (0)