cont

ggerganov · ggerganov · commit 861ad6f0f67e · 2024-08-29T12:56:43.000+03:00
ggml-ci
diff --git a/common/common.cpp b/common/common.cpp
@@ -584,12 +584,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "--samplers") {
         CHECK_ARG
         const auto sampler_names = string_split(argv[i], ';');
-        sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
+        sparams.samplers = llama_sampling_types_from_names(sampler_names, true);
         return true;
     }
     if (arg == "--sampling-seq") {
         CHECK_ARG
-        sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
+        sparams.samplers = llama_sampling_types_from_chars(argv[i]);
         return true;
     }
     if (arg == "--top-p") {
@@ -1438,9 +1438,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
     std::string sampler_type_chars;
     std::string sampler_type_names;
-    for (const auto sampler_type : sparams.samplers_sequence) {
-        sampler_type_chars += static_cast<char>(sampler_type);
-        sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
+    for (const auto & sampler : sparams.samplers) {
+        sampler_type_chars += llama_sampling_type_to_chr(sampler);
+        sampler_type_names += llama_sampling_type_to_str(sampler) + ";";
     }
     sampler_type_names.pop_back();
 
diff --git a/common/sampling.cpp b/common/sampling.cpp
@@ -32,6 +32,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_model * m
         lparams.penalize_nl       = params.penalize_nl;
         lparams.ignore_eos        = params.ignore_eos;
 
+        lparams.n_samplers = params.samplers.size();
+
         result->smpl = llama_sampling_init(model, lparams);
 
         llama_sampling_set_grammar   (result->smpl, params.grammar.c_str(), "root");
@@ -101,7 +103,7 @@ std::string llama_sampling_print(const gpt_sampling_params & params) {
 std::string llama_sampling_order_print(const gpt_sampling_params & params) {
     std::string result = "CFG -> Penalties ";
     if (params.mirostat == 0) {
-        for (auto sampler_type : params.samplers_sequence) {
+        for (auto sampler_type : params.samplers) {
             const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
             if (!sampler_type_name.empty()) {
                 result += "-> " + sampler_type_name + " ";
@@ -114,6 +116,18 @@ std::string llama_sampling_order_print(const gpt_sampling_params & params) {
     return result;
 }
 
+char llama_sampling_type_to_chr(llama_sampler_type sampler_type) { // one-character code for a sampler type; llama_sampling_types_from_chars() keys its lookup table on these
+    switch (sampler_type) {
+        case LLAMA_SAMPLER_TYPE_TOP_K:       return 'k';
+        case LLAMA_SAMPLER_TYPE_TFS_Z:       return 'f';
+        case LLAMA_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case LLAMA_SAMPLER_TYPE_TOP_P:       return 'p';
+        case LLAMA_SAMPLER_TYPE_MIN_P:       return 'm';
+        case LLAMA_SAMPLER_TYPE_TEMPERATURE: return 't';
+        default : return '?'; // any value not listed above yields the '?' placeholder
+    }
+}
+
 std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
     switch (sampler_type) {
         case LLAMA_SAMPLER_TYPE_TOP_K:       return "top_k";
@@ -128,26 +142,26 @@ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
 
 std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
     std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
-        {"top_k",       LLAMA_SAMPLER_TYPE_TOP_K},
-        {"top_p",       LLAMA_SAMPLER_TYPE_TOP_P},
-        {"typical_p",   LLAMA_SAMPLER_TYPE_TYPICAL_P},
-        {"min_p",       LLAMA_SAMPLER_TYPE_MIN_P},
-        {"tfs_z",       LLAMA_SAMPLER_TYPE_TFS_Z},
-        {"temperature", LLAMA_SAMPLER_TYPE_TEMPERATURE}
+        { "top_k",       LLAMA_SAMPLER_TYPE_TOP_K },
+        { "top_p",       LLAMA_SAMPLER_TYPE_TOP_P },
+        { "typical_p",   LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       LLAMA_SAMPLER_TYPE_MIN_P },
+        { "tfs_z",       LLAMA_SAMPLER_TYPE_TFS_Z },
+        { "temperature", LLAMA_SAMPLER_TYPE_TEMPERATURE },
     };
 
     // since samplers names are written multiple ways
     // make it ready for both system names and input names
     std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
-        {"top-k",       LLAMA_SAMPLER_TYPE_TOP_K},
-        {"top-p",       LLAMA_SAMPLER_TYPE_TOP_P},
-        {"nucleus",     LLAMA_SAMPLER_TYPE_TOP_P},
-        {"typical-p",   LLAMA_SAMPLER_TYPE_TYPICAL_P},
-        {"typical",     LLAMA_SAMPLER_TYPE_TYPICAL_P},
-        {"min-p",       LLAMA_SAMPLER_TYPE_MIN_P},
-        {"tfs-z",       LLAMA_SAMPLER_TYPE_TFS_Z},
-        {"tfs",         LLAMA_SAMPLER_TYPE_TFS_Z},
-        {"temp",        LLAMA_SAMPLER_TYPE_TEMPERATURE}
+        { "top-k",       LLAMA_SAMPLER_TYPE_TOP_K },
+        { "top-p",       LLAMA_SAMPLER_TYPE_TOP_P },
+        { "nucleus",     LLAMA_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       LLAMA_SAMPLER_TYPE_MIN_P },
+        { "tfs-z",       LLAMA_SAMPLER_TYPE_TFS_Z },
+        { "tfs",         LLAMA_SAMPLER_TYPE_TFS_Z },
+        { "temp",        LLAMA_SAMPLER_TYPE_TEMPERATURE },
     };
 
     std::vector<llama_sampler_type> sampler_types;
@@ -172,12 +186,12 @@ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vecto
 
 std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
     std::unordered_map<char, llama_sampler_type> sampler_name_map {
-        {'k', LLAMA_SAMPLER_TYPE_TOP_K},
-        {'p', LLAMA_SAMPLER_TYPE_TOP_P},
-        {'y', LLAMA_SAMPLER_TYPE_TYPICAL_P},
-        {'m', LLAMA_SAMPLER_TYPE_MIN_P},
-        {'f', LLAMA_SAMPLER_TYPE_TFS_Z},
-        {'t', LLAMA_SAMPLER_TYPE_TEMPERATURE}
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TOP_K),       LLAMA_SAMPLER_TYPE_TOP_K },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TFS_Z),       LLAMA_SAMPLER_TYPE_TFS_Z },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TYPICAL_P),   LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TOP_P),       LLAMA_SAMPLER_TYPE_TOP_P },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_MIN_P),       LLAMA_SAMPLER_TYPE_MIN_P },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TEMPERATURE), LLAMA_SAMPLER_TYPE_TEMPERATURE }
     };
 
     std::vector<llama_sampler_type> sampler_types;
@@ -199,10 +213,10 @@ static void sampler_queue(
 
     const gpt_sampling_params & params = ctx_sampling->params;
 
-    const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
+    const std::vector<llama_sampler_type> & samplers = params.samplers;
 
-    for (auto sampler_type : samplers_sequence) {
-        switch (sampler_type) {
+    for (const auto & sampler : samplers) {
+        switch (sampler) {
             case LLAMA_SAMPLER_TYPE_TOP_K:       llama_sampling_top_k    (smpl, cur_p); break;
             case LLAMA_SAMPLER_TYPE_TFS_Z:       llama_sampling_tail_free(smpl, cur_p); break;
             case LLAMA_SAMPLER_TYPE_TYPICAL_P:   llama_sampling_typical  (smpl, cur_p); break;
diff --git a/common/sampling.h b/common/sampling.h
@@ -30,7 +30,7 @@ typedef struct gpt_sampling_params {
     bool    penalize_nl       = false; // consider newlines as a repeatable token
     bool    ignore_eos        = false;
 
-    std::vector<llama_sampler_type> samplers_sequence = {
+    std::vector<llama_sampler_type> samplers = {
         LLAMA_SAMPLER_TYPE_TOP_K,
         LLAMA_SAMPLER_TYPE_TFS_Z,
         LLAMA_SAMPLER_TYPE_TYPICAL_P,
@@ -78,6 +78,7 @@ std::string llama_sampling_print(const gpt_sampling_params & params);
 // Print sampling order into a string
 std::string llama_sampling_order_print(const gpt_sampling_params & params);
 
+char        llama_sampling_type_to_chr(llama_sampler_type sampler_type);
 std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
 
 std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
@@ -155,7 +155,7 @@ while n_cur <= n_len {
         llama_sampling_top_p(smpl, &candidates_p)
         llama_sampling_temp (smpl, &candidates_p)
 
-        let new_token_id = llama_sampling_sample(smpl, &candidates_p)
+        let new_token_id = llama_sampling_sample_dist(smpl, &candidates_p)
 
         // const llama_token new_token_id = llama_sampling_sample_greedy(smpl, &candidates_p);
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -1039,17 +1039,17 @@ struct server_context {
         }
 
         {
-            const auto & samplers_sequence = data.find("samplers");
-            if (samplers_sequence != data.end() && samplers_sequence->is_array()) {
+            const auto & samplers = data.find("samplers");
+            if (samplers != data.end() && samplers->is_array()) {
                 std::vector<std::string> sampler_names;
-                for (const auto & sampler_name : *samplers_sequence) {
+                for (const auto & sampler_name : *samplers) {
                     if (sampler_name.is_string()) {
                         sampler_names.emplace_back(sampler_name);
                     }
                 }
-                slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
+                slot.sparams.samplers = llama_sampling_types_from_names(sampler_names, false);
             } else {
-                slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
+                slot.sparams.samplers = default_sparams.samplers;
             }
         }
 
@@ -1265,10 +1265,10 @@ struct server_context {
     }
 
     json get_formated_generation(const server_slot & slot) const {
-        std::vector<std::string> samplers_sequence;
-        samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
-        for (const auto & sampler_type : slot.sparams.samplers_sequence) {
-            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
+        std::vector<std::string> samplers;
+        samplers.reserve(slot.sparams.samplers.size());
+        for (const auto & sampler : slot.sparams.samplers) {
+            samplers.emplace_back(llama_sampling_type_to_str(sampler));
         }
 
         return json {
@@ -1302,7 +1302,7 @@ struct server_context {
             {"n_probs",                   slot.sparams.n_probs},
             {"min_keep",                  slot.sparams.min_keep},
             {"grammar",                   slot.sparams.grammar},
-            {"samplers",                  samplers_sequence},
+            {"samplers",                  samplers},
         };
     }
 
diff --git a/include/llama.h b/include/llama.h
@@ -400,7 +400,7 @@ extern "C" {
         float    mirostat_tau;      // target entropy
         float    mirostat_eta;      // learning rate
 
-        // samples
+        // samplers
         int32_t n_samplers;
         enum llama_sampler_type samplers[LLAMA_MAX_SAMPLERS];
 
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
@@ -40,6 +40,10 @@ struct llama_sampling * llama_sampling_init_impl(const struct llama_vocab & voca
 
     result->prev = ring_buffer<llama_token>(params.n_prev);
 
+    for (int i = 0; i < params.n_samplers; ++i) {
+        result->samplers.push_back(params.samplers[i]);
+    }
+
     llama_sampling_set_rng_seed_impl(*result, params.seed);
 
     return result;
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
@@ -27,10 +27,12 @@ struct llama_sampling {
 
     const struct llama_vocab & vocab;
 
-    struct llama_grammar * grammar = nullptr;
+    std::vector<llama_sampler_type> samplers;
 
     ring_buffer<llama_token> prev;
 
+    struct llama_grammar * grammar = nullptr;
+
     // mirostat sampler state
     float mirostat_mu;
 
diff --git a/src/llama.cpp b/src/llama.cpp
@@ -17430,7 +17430,7 @@ struct llama_sampling_params llama_sampling_default_params() {
         /*.mirostat_tau      =*/ 5.00f,
         /*.mirostat_eta      =*/ 0.10f,
         /*.n_samplers        =*/ 3,
-        /*.samplers          =*/ { LLAMA_SAMPLER_TYPE_TOP_K, LLAMA_SAMPLER_TYPE_TOP_P, LLAMA_SAMPLER_TYPE_TEMPERATURE },
+        /*.samplers          =*/ { LLAMA_SAMPLER_TYPE_TEMPERATURE, LLAMA_SAMPLER_TYPE_TOP_K, LLAMA_SAMPLER_TYPE_TOP_P, },
         /*.penalize_nl       =*/ false,
         /*.ignore_eos        =*/ false,
     };

Original file line number	Diff line number	Diff line change
`@@ -1039,17 +1039,17 @@ struct server_context {`
`1039`	`1039`	`}`
`1040`	`1040`
`1041`	`1041`	`{`
`1042`		`- const auto & samplers_sequence = data.find("samplers");`
`1043`		`- if (samplers_sequence != data.end() && samplers_sequence->is_array()) {`
	`1042`	`+ const auto & samplers = data.find("samplers");`
	`1043`	`+ if (samplers != data.end() && samplers->is_array()) {`
`1044`	`1044`	`std::vector<std::string> sampler_names;`
`1045`		`- for (const auto & sampler_name : *samplers_sequence) {`
	`1045`	`+ for (const auto & sampler_name : *samplers) {`
`1046`	`1046`	`if (sampler_name.is_string()) {`
`1047`	`1047`	`sampler_names.emplace_back(sampler_name);`
`1048`	`1048`	`}`
`1049`	`1049`	`}`
`1050`		`- slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);`
	`1050`	`+ slot.sparams.samplers = llama_sampling_types_from_names(sampler_names, false);`
`1051`	`1051`	`} else {`
`1052`		`- slot.sparams.samplers_sequence = default_sparams.samplers_sequence;`
	`1052`	`+ slot.sparams.samplers = default_sparams.samplers;`
`1053`	`1053`	`}`
`1054`	`1054`	`}`
`1055`	`1055`
`@@ -1265,10 +1265,10 @@ struct server_context {`
`1265`	`1265`	`}`
`1266`	`1266`
`1267`	`1267`	`json get_formated_generation(const server_slot & slot) const {`
`1268`		`- std::vector<std::string> samplers_sequence;`
`1269`		`- samplers_sequence.reserve(slot.sparams.samplers_sequence.size());`
`1270`		`- for (const auto & sampler_type : slot.sparams.samplers_sequence) {`
`1271`		`- samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));`
	`1268`	`+ std::vector<std::string> samplers;`
	`1269`	`+ samplers.reserve(slot.sparams.samplers.size());`
	`1270`	`+ for (const auto & sampler : slot.sparams.samplers) {`
	`1271`	`+ samplers.emplace_back(llama_sampling_type_to_str(sampler));`
`1272`	`1272`	`}`
`1273`	`1273`
`1274`	`1274`	`return json {`
`@@ -1302,7 +1302,7 @@ struct server_context {`
`1302`	`1302`	`{"n_probs", slot.sparams.n_probs},`
`1303`	`1303`	`{"min_keep", slot.sparams.min_keep},`
`1304`	`1304`	`{"grammar", slot.sparams.grammar},`
`1305`		`- {"samplers", samplers_sequence},`
	`1305`	`+ {"samplers", samplers},`
`1306`	`1306`	`};`
`1307`	`1307`	`}`
`1308`	`1308`