Skip to content

Commit f53c47e

Browse files
ggerganov authored and dsx1986 committed
llama : refactor sampling v2 (ggml-org#9294)
- Add `struct llama_sampler` and `struct llama_sampler_i`
- Add `llama_sampler_` API
- Add `llama_sampler_chain_` API for chaining multiple samplers
- Remove `LLAMA_API_INTERNAL`
- Add `llama_perf_` API and remove old `llama_print_timings` and `llama_reset_timings`
1 parent 3f4bea1 commit f53c47e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+3401
-2818
lines changed

Makefile

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -927,7 +927,6 @@ OBJ_COMMON = \
927927
common/ngram-cache.o \
928928
common/sampling.o \
929929
common/train.o \
930-
common/grammar-parser.o \
931930
common/build-info.o \
932931
common/json-schema-to-grammar.o
933932

@@ -1167,11 +1166,6 @@ common/console.o: \
11671166
common/console.h
11681167
$(CXX) $(CXXFLAGS) -c $< -o $@
11691168

1170-
common/grammar-parser.o: \
1171-
common/grammar-parser.cpp \
1172-
common/grammar-parser.h
1173-
$(CXX) $(CXXFLAGS) -c $< -o $@
1174-
11751169
common/json-schema-to-grammar.o: \
11761170
common/json-schema-to-grammar.cpp \
11771171
common/json-schema-to-grammar.h

common/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,6 @@ add_library(${TARGET} STATIC
5858
sampling.cpp
5959
console.h
6060
console.cpp
61-
grammar-parser.h
62-
grammar-parser.cpp
6361
json.hpp
6462
json-schema-to-grammar.cpp
6563
train.h

common/common.cpp

Lines changed: 37 additions & 72 deletions
Large diffs are not rendered by default.

common/common.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,6 @@ struct cpu_params {
7777
};
7878

7979
struct gpt_params {
80-
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
81-
8280
int32_t n_predict = -1; // new tokens to predict
8381
int32_t n_ctx = 0; // context size
8482
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -120,8 +118,7 @@ struct gpt_params {
120118
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
121119
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
122120

123-
// // sampling parameters
124-
struct llama_sampling_params sparams;
121+
struct gpt_sampler_params sparams;
125122

126123
std::string model = ""; // model path
127124
std::string model_draft = ""; // draft model for speculative decoding
@@ -185,7 +182,6 @@ struct gpt_params {
185182
bool flash_attn = false; // flash attention
186183

187184
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
188-
bool ignore_eos = false; // ignore generated EOS tokens
189185
bool logits_all = false; // return logits for all tokens in the batch
190186
bool use_mmap = true; // use mmap for faster loads
191187
bool use_mlock = false; // use mlock to keep model in memory

0 commit comments

Comments (0)