
Commit f115cba

sampling : drop cfg + simplify more
ggml-ci
1 parent 6e49744 commit f115cba

16 files changed: +90 -282 lines changed


common/common.cpp

Lines changed: 2 additions & 34 deletions
@@ -310,7 +310,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         string_process_escapes(params.prompt);
         string_process_escapes(params.input_prefix);
         string_process_escapes(params.input_suffix);
-        string_process_escapes(sparams.cfg_negative_prompt);
         for (auto & antiprompt : params.antiprompt) {
             string_process_escapes(antiprompt);
         }
@@ -321,8 +320,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         params.kv_overrides.back().key[0] = 0;
     }

-    if (params.sparams.seed == LLAMA_DEFAULT_SEED) {
-        params.sparams.seed = time(NULL);
+    if (sparams.seed == LLAMA_DEFAULT_SEED) {
+        sparams.seed = time(NULL);
     }

     return true;
@@ -665,30 +664,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         sparams.mirostat_tau = std::stof(argv[i]);
         return true;
     }
-    if (arg == "--cfg-negative-prompt") {
-        CHECK_ARG
-        sparams.cfg_negative_prompt = argv[i];
-        return true;
-    }
-    if (arg == "--cfg-negative-prompt-file") {
-        CHECK_ARG
-        std::ifstream file(argv[i]);
-        if (!file) {
-            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
-        if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
-            sparams.cfg_negative_prompt.pop_back();
-        }
-        return true;
-    }
-    if (arg == "--cfg-scale") {
-        CHECK_ARG
-        sparams.cfg_scale = std::stof(argv[i]);
-        return true;
-    }
     if (arg == "-b" || arg == "--batch-size") {
         CHECK_ARG
         params.n_batch = std::stoi(argv[i]);
@@ -1577,11 +1552,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
         "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
         "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
-    options.push_back({ "main", " --cfg-negative-prompt PROMPT",
-        "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
-    options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
-        "negative prompt file to use for guidance" });
-    options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
     options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
         "set custom jinja chat template (default: template taken from model's metadata)\n"
         "if suffix/prefix are specified, template will be disabled\n"
@@ -3258,8 +3228,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l

     fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
     fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
-    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
     fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
     fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
     fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);

common/sampling.cpp

Lines changed: 6 additions & 87 deletions
@@ -29,7 +29,6 @@ struct llama_sampling_context * llama_sampling_init(const struct gpt_sampling_pa
     lp.mirostat = params.mirostat;
     lp.mirostat_tau = params.mirostat_tau;
     lp.mirostat_eta = params.mirostat_eta;
-    lp.cfg_scale = params.cfg_scale;
     lp.penalize_nl = params.penalize_nl;
     lp.ignore_eos = params.ignore_eos;

@@ -51,9 +50,6 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {

 void llama_sampling_reset(llama_sampling_context * ctx) {
     llama_sampling_reset(ctx->smpl);
-
-    ctx->cur.clear();
-    ctx->org.clear();
 }

 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
@@ -219,61 +215,11 @@ static void sampler_queue(
     }
 }

-llama_token_data_array llama_sampling_prepare(
+void llama_sampling_prepare(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
         int idx) {
-    const gpt_sampling_params & params = ctx_sampling->params;
-
-    auto & cur = ctx_sampling->cur;
-
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
-
-    // apply params.logit_bias map
-    for (const auto & logit_bias : params.logit_bias) {
-        logits[logit_bias.token] += logit_bias.bias;
-    }
-
-    if (params.ignore_eos) {
-        logits[llama_token_eos(llama_get_model(ctx_main))] = -INFINITY;
-    }
-
-    llama_sampling * smpl = ctx_sampling->smpl;
-
-    if (ctx_cfg) {
-        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sampling_cfg(smpl, logits, logits_guidance);
-    }
-
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
-    cur.resize(n_vocab);
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
-
-    // apply penalties
-    {
-        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
-
-        llama_sampling_penalties(smpl, &cur_p);
-
-        if (!params.penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
-                }
-            }
-        }
-    }
-
-    return cur_p;
+    llama_sampling_set_logits(ctx_sampling->smpl, llama_get_logits_ith(ctx_main, idx));
 }

 static llama_token llama_sampling_sample(
@@ -325,41 +271,14 @@ static llama_token llama_sampling_sample(
 llama_token llama_sampling_sample(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
         int idx) {
-    llama_token_data_array cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx);
-
-    if (ctx_sampling->params.grammar.empty()) {
-        return llama_sampling_sample(ctx_sampling, &cur_p);
-    }
+    llama_sampling_prepare(ctx_sampling, ctx_main, idx);

-    // TODO: this logic is confusing, try to figure out a better way to handle this
+    auto * cur_p = llama_sampling_get_candidates(ctx_sampling->smpl);

-    // store the original candidates
-    ctx_sampling->org = ctx_sampling->cur;
-    llama_token_data_array org_p = { ctx_sampling->org.data(), ctx_sampling->org.size(), false };
+    llama_sampling_grammar(ctx_sampling->smpl, cur_p);

-    llama_token id = llama_sampling_sample(ctx_sampling, &cur_p);
-
-    // Create an array with a single token data element for the sampled id
-    llama_token_data single_token_data = { id, 1.0f, 0.0f };
-    llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
-
-    // Apply grammar constraints to the single token
-    llama_sampling_grammar(ctx_sampling->smpl, &single_token_data_array);
-
-    // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
-    const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-
-    if (!is_valid) {
-        llama_sampling_grammar(ctx_sampling->smpl, &org_p);
-
-        id = llama_sampling_sample(ctx_sampling, &org_p);
-
-        ctx_sampling->cur = std::move(ctx_sampling->org);
-    }
-
-    return id;
+    return llama_sampling_sample(ctx_sampling, cur_p);
 }

 void llama_sampling_accept(
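With the cur/org buffers removed from llama_sampling_context, code that used to build its own llama_token_data_array from ctx_sampling->cur now has to go through the sampler object instead. A minimal sketch of that pattern, using only calls touched by this commit and assuming llama_sampling_get_candidates returns a llama_token_data_array pointer (the candidate-inspection loop is illustrative, not part of the commit):

    // after llama_decode() has produced logits for the batch:
    llama_sampling_prepare(ctx_sampling, ctx, 0);  // feeds the logits at index 0 into ctx_sampling->smpl

    // the candidate array is now owned by the llama_sampling object, not by the common context
    llama_token_data_array * cur_p = llama_sampling_get_candidates(ctx_sampling->smpl);

    // optionally constrain the candidates with the grammar before picking a token
    if (!ctx_sampling->params.grammar.empty()) {
        llama_sampling_grammar(ctx_sampling->smpl, cur_p);
    }

    // illustrative only: peek at the first few candidates
    for (size_t i = 0; i < cur_p->size && i < 5; ++i) {
        fprintf(stderr, "candidate %zu: token %d, logit %.3f\n", i, (int) cur_p->data[i].id, cur_p->data[i].logit);
    }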

common/sampling.h

Lines changed: 2 additions & 13 deletions
@@ -50,11 +50,6 @@ typedef struct gpt_sampling_params {

     std::string grammar; // optional BNF-like grammar to constrain sampling

-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float cfg_scale = 1.f; // how strong is guidance
-
     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
 } gpt_sampling_params;

@@ -65,9 +60,6 @@ struct llama_sampling_context {
     gpt_sampling_params params;

     llama_sampling * smpl;
-
-    std::vector<llama_token_data> cur;
-    std::vector<llama_token_data> org;
 };

 // Create a new sampling context instance.
@@ -101,11 +93,10 @@ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vecto
 std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);

 // Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
-llama_token_data_array llama_sampling_prepare(
+void llama_sampling_prepare(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
-        int idx = 0);
+        int idx);

 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
@@ -117,7 +108,6 @@ llama_token_data_array llama_sampling_prepare(
 // - ctx_sampling: sampling-specific context
 //
 // optional:
-// - ctx_cfg: context to use for classifier-free guidance
 // - idx: sample from llama_get_logits_ith(ctx, idx)
 //
 // returns:
@@ -131,7 +121,6 @@ llama_token_data_array llama_sampling_prepare(
 llama_token llama_sampling_sample(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
         int idx = -1);

 void llama_sampling_accept(
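On the caller side the change is mechanical: every call site simply drops the ctx_cfg argument, as the example diffs below show. A sketch of the common per-token loop with the updated signatures, assuming ctx_sampling and ctx have already been set up and llama_decode has been called:

    // pick the next token from the last set of logits (idx defaults to -1)
    const llama_token id = llama_sampling_sample(ctx_sampling, ctx);

    // update repetition-penalty and grammar state with the accepted token
    llama_sampling_accept(ctx_sampling, id, true);

    if (llama_token_is_eog(llama_get_model(ctx), id)) {
        // end of generation
    }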

examples/infill/infill.cpp

Lines changed: 1 addition & 1 deletion
@@ -417,7 +417,7 @@ int main(int argc, char ** argv) {
         embd.clear();

         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx);

             llama_sampling_accept(ctx_sampling, id, true);

examples/llava/llava-cli.cpp

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
 static const char * sample(struct llama_sampling_context * ctx_sampling,
                            struct llama_context * ctx_llama,
                            int * n_past) {
-    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
+    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama);
     llama_sampling_accept(ctx_sampling, id, true);
     static std::string ret;
     if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {

examples/llava/minicpmv-cli.cpp

Lines changed: 1 addition & 1 deletion
@@ -166,7 +166,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
 static const char * sample(struct llama_sampling_context * ctx_sampling,
                            struct llama_context * ctx_llama,
                            int * n_past) {
-    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
+    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama);
     llama_sampling_accept(ctx_sampling, id, true);
     static std::string ret;
     if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {

examples/lookahead/lookahead.cpp

Lines changed: 3 additions & 3 deletions
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {

         // sample first token
         {
-            id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
+            id = llama_sampling_sample(ctx_sampling, ctx, 0);

             llama_sampling_accept(ctx_sampling, id, true);

@@ -283,7 +283,7 @@ int main(int argc, char ** argv) {
             }

             // sample the next token
-            id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
+            id = llama_sampling_sample(ctx_sampling, ctx, i_batch);

             llama_sampling_accept(ctx_sampling, id, true);

@@ -360,7 +360,7 @@ int main(int argc, char ** argv) {
                 if (v == 0) {
                     // sample from the last level
                     for (int i = 0; i < W; i++) {
-                        tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+                        tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
                     }
                 } else {
                     for (int i = 0; i < W; i++) {

examples/lookup/lookup.cpp

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ int main(int argc, char ** argv){
     int i_dft = 0;
     while (true) {
         // sample from the target model
-        llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
+        llama_token id = llama_sampling_sample(ctx_sampling, ctx, i_dft);

         llama_sampling_accept(ctx_sampling, id, true);

0 commit comments
