Skip to content

Commit 9c57861

Browse files
authored
Merge branch 'ggerganov:master' into ag_cuda_graphs
2 parents 0640427 + ca7f29f commit 9c57861

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

75 files changed

+3945
-8694
lines changed

.github/workflows/bench.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ on:
3232
- cron: '04 2 * * *'
3333

3434
concurrency:
35-
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
35+
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
3636
cancel-in-progress: true
3737

3838
jobs:

.github/workflows/build.yml

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ jobs:
3232
- name: Clone
3333
id: checkout
3434
uses: actions/checkout@v4
35+
with:
36+
fetch-depth: 0
3537

3638
- name: Dependencies
3739
id: depends
@@ -88,6 +90,8 @@ jobs:
8890
- name: Clone
8991
id: checkout
9092
uses: actions/checkout@v4
93+
with:
94+
fetch-depth: 0
9195

9296
- name: Dependencies
9397
id: depends
@@ -206,6 +210,8 @@ jobs:
206210
- name: Clone
207211
id: checkout
208212
uses: actions/checkout@v4
213+
with:
214+
fetch-depth: 0
209215

210216
- name: Dependencies
211217
id: depends
@@ -238,6 +244,33 @@ jobs:
238244
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
239245
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
240246
247+
- name: Determine tag name
248+
id: tag
249+
shell: bash
250+
run: |
251+
BUILD_NUMBER="$(git rev-list --count HEAD)"
252+
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
253+
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
254+
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
255+
else
256+
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
257+
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
258+
fi
259+
260+
- name: Pack artifacts
261+
id: pack_artifacts
262+
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
263+
run: |
264+
cp LICENSE ./build/bin/
265+
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
266+
267+
- name: Upload artifacts
268+
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
269+
uses: actions/upload-artifact@v4
270+
with:
271+
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
272+
name: llama-bin-ubuntu-x64.zip
273+
241274
# ubuntu-latest-cmake-sanitizer:
242275
# runs-on: ubuntu-latest
243276
#
@@ -560,6 +593,63 @@ jobs:
560593
run: |
561594
make swift
562595
596+
windows-msys2:
597+
runs-on: windows-latest
598+
599+
strategy:
600+
fail-fast: false
601+
matrix:
602+
include:
603+
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
604+
- { sys: CLANG64, env: clang-x86_64, build: Release }
605+
606+
steps:
607+
- name: Clone
608+
uses: actions/checkout@v4
609+
610+
- name: Setup ${{ matrix.sys }}
611+
uses: msys2/setup-msys2@v2
612+
with:
613+
update: true
614+
msystem: ${{matrix.sys}}
615+
install: >-
616+
base-devel
617+
mingw-w64-${{matrix.env}}-toolchain
618+
mingw-w64-${{matrix.env}}-cmake
619+
mingw-w64-${{matrix.env}}-openblas
620+
621+
- name: Build using make
622+
shell: msys2 {0}
623+
run: |
624+
make -j $(nproc)
625+
626+
- name: Clean after building using make
627+
shell: msys2 {0}
628+
run: |
629+
make clean
630+
631+
- name: Build using make w/ OpenBLAS
632+
shell: msys2 {0}
633+
run: |
634+
make LLAMA_OPENBLAS=1 -j $(nproc)
635+
636+
- name: Build using CMake
637+
shell: msys2 {0}
638+
run: |
639+
cmake -B build
640+
cmake --build build --config ${{ matrix.build }} -j $(nproc)
641+
642+
- name: Clean after building using CMake
643+
shell: msys2 {0}
644+
run: |
645+
rm -rf build
646+
647+
- name: Build using CMake w/ OpenBLAS
648+
shell: msys2 {0}
649+
run: |
650+
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
651+
cmake --build build --config ${{ matrix.build }} -j $(nproc)
652+
563653
windows-latest-cmake:
564654
runs-on: windows-latest
565655

.github/workflows/server.yml

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ on:
2323
- cron: '2 4 * * *'
2424

2525
concurrency:
26-
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
26+
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
2727
cancel-in-progress: true
2828

2929
jobs:
@@ -41,23 +41,16 @@ jobs:
4141
sanitizer: ""
4242
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
4343

44-
container:
45-
image: ubuntu:latest
46-
ports:
47-
- 8888
48-
options: --cpus 4
49-
5044
steps:
5145
- name: Dependencies
5246
id: depends
5347
run: |
54-
apt-get update
55-
apt-get -y install \
48+
sudo apt-get update
49+
sudo apt-get -y install \
5650
build-essential \
5751
xxd \
5852
git \
5953
cmake \
60-
python3-pip \
6154
curl \
6255
wget \
6356
language-pack-en \
@@ -70,6 +63,17 @@ jobs:
7063
fetch-depth: 0
7164
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
7265

66+
- name: Python setup
67+
id: setup_python
68+
uses: actions/setup-python@v5
69+
with:
70+
python-version: '3.11'
71+
72+
- name: Tests dependencies
73+
id: test_dependencies
74+
run: |
75+
pip install -r examples/server/tests/requirements.txt
76+
7377
- name: Verify server deps
7478
id: verify_server_deps
7579
run: |
@@ -100,10 +104,6 @@ jobs:
100104
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
101105
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
102106
103-
- name: Tests dependencies
104-
id: test_dependencies
105-
run: |
106-
pip install -r examples/server/tests/requirements.txt
107107
108108
- name: Tests
109109
id: server_integration_tests
@@ -129,6 +129,7 @@ jobs:
129129
uses: actions/checkout@v4
130130
with:
131131
fetch-depth: 0
132+
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
132133

133134
- name: libCURL
134135
id: get_libcurl

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ lcov-report/
3434
gcovr-report/
3535

3636
build*
37+
!build.zig
3738
cmake-build-*
3839
out/
3940
tmp/
@@ -100,6 +101,9 @@ qnt-*.txt
100101
perf-*.txt
101102

102103
examples/jeopardy/results.txt
104+
examples/server/*.html.hpp
105+
examples/server/*.js.hpp
106+
examples/server/*.mjs.hpp
103107

104108
poetry.lock
105109
poetry.toml

CMakeLists.txt

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,7 @@ else()
4343
set(LLAMA_METAL_DEFAULT OFF)
4444
endif()
4545

46-
# TODO: fix this for Android CI
47-
# https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
48-
#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
49-
# set(LLAMA_LLAMAFILE_DEFAULT OFF)
50-
#else()
51-
# set(LLAMA_LLAMAFILE_DEFAULT ON)
52-
#endif()
53-
54-
# TODO: temporary disable until MoE is fixed
55-
# https://github.com/ggerganov/llama.cpp/pull/6716
56-
set(LLAMA_LLAMAFILE_DEFAULT OFF)
46+
set(LLAMA_LLAMAFILE_DEFAULT ON)
5747

5848
# general
5949
option(BUILD_SHARED_LIBS "build shared libraries" OFF)

Makefile

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -384,10 +384,6 @@ ifdef LLAMA_OPENBLAS
384384
MK_LDFLAGS += $(shell pkg-config --libs openblas)
385385
endif # LLAMA_OPENBLAS
386386

387-
# TODO: temporary disable until MoE is fixed
388-
# https://github.com/ggerganov/llama.cpp/pull/6716
389-
LLAMA_NO_LLAMAFILE := 1
390-
391387
ifndef LLAMA_NO_LLAMAFILE
392388
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
393389
OBJS += sgemm.o
@@ -699,7 +695,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
699695
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
700696
$(CXX) $(CXXFLAGS) -c $< -o $@
701697

702-
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
698+
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
703699
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
704700

705701
common.o: common/common.cpp $(COMMON_H_DEPS)
@@ -772,7 +768,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
772768
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
773769
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
774770

775-
quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
771+
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
776772
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
777773
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
778774

@@ -800,10 +796,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
800796
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
801797
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
802798

803-
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
799+
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
804800
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
805801
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
806802

803+
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
804+
examples/server/%.hpp: examples/server/public/% Makefile
805+
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
806+
echo "unsigned char $${NAME}[] = {" && \
807+
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
808+
echo "};" && \
809+
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
810+
) > $@
811+
807812
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
808813
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
809814
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

README-sycl.md

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -229,12 +229,12 @@ source /opt/intel/oneapi/setvars.sh
229229
# Build LLAMA with MKL BLAS acceleration for intel GPU
230230
mkdir -p build && cd build
231231

232-
# Option 1: Use FP16 for better performance in long-prompt inference
233-
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
234-
235-
# Option 2: Use FP32 by default
232+
# Option 1: Use FP32 (recommended for better performance in most cases)
236233
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
237234

235+
# Option 2: Use FP16
236+
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
237+
238238
# Build all binaries
239239
cmake --build . --config Release -j -v
240240
```
@@ -250,12 +250,12 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
250250
# Build LLAMA with Nvidia BLAS acceleration through SYCL
251251
mkdir -p build && cd build
252252

253-
# Option 1: Use FP16 for better performance in long-prompt inference
254-
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
255-
256-
# Option 2: Use FP32 by default
253+
# Option 1: Use FP32 (recommended for better performance in most cases)
257254
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
258255

256+
# Option 2: Use FP16
257+
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
258+
259259
# Build all binaries
260260
cmake --build . --config Release -j -v
261261

@@ -416,6 +416,10 @@ mkdir -p build
416416
cd build
417417
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
418418
419+
# Option 1: Use FP32 (recommended for better performance in most cases)
420+
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
421+
422+
# Option 2: Use FP16
419423
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
420424
421425
make -j

0 commit comments

Comments
 (0)