Skip to content

Commit 81471a7

Browse files
committed
llama : refactor samplers (wip)
ggml-ci
1 parent 9630a50 commit 81471a7

File tree

4 files changed

+79
-72
lines changed

4 files changed

+79
-72
lines changed

common/sampling.cpp

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -116,38 +116,38 @@ std::string llama_sampling_order_print(const gpt_sampling_params & params) {
116116

117117
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
118118
switch (sampler_type) {
119-
case llama_sampler_type::TOP_K: return "top_k";
120-
case llama_sampler_type::TFS_Z: return "tfs_z";
121-
case llama_sampler_type::TYPICAL_P: return "typical_p";
122-
case llama_sampler_type::TOP_P: return "top_p";
123-
case llama_sampler_type::MIN_P: return "min_p";
124-
case llama_sampler_type::TEMPERATURE: return "temperature";
119+
case LLAMA_SAMPLER_TYPE_TOP_K: return "top_k";
120+
case LLAMA_SAMPLER_TYPE_TFS_Z: return "tfs_z";
121+
case LLAMA_SAMPLER_TYPE_TYPICAL_P: return "typical_p";
122+
case LLAMA_SAMPLER_TYPE_TOP_P: return "top_p";
123+
case LLAMA_SAMPLER_TYPE_MIN_P: return "min_p";
124+
case LLAMA_SAMPLER_TYPE_TEMPERATURE: return "temperature";
125125
default : return "";
126126
}
127127
}
128128

129129
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
130130
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
131-
{"top_k", llama_sampler_type::TOP_K},
132-
{"top_p", llama_sampler_type::TOP_P},
133-
{"typical_p", llama_sampler_type::TYPICAL_P},
134-
{"min_p", llama_sampler_type::MIN_P},
135-
{"tfs_z", llama_sampler_type::TFS_Z},
136-
{"temperature", llama_sampler_type::TEMPERATURE}
131+
{"top_k", LLAMA_SAMPLER_TYPE_TOP_K},
132+
{"top_p", LLAMA_SAMPLER_TYPE_TOP_P},
133+
{"typical_p", LLAMA_SAMPLER_TYPE_TYPICAL_P},
134+
{"min_p", LLAMA_SAMPLER_TYPE_MIN_P},
135+
{"tfs_z", LLAMA_SAMPLER_TYPE_TFS_Z},
136+
{"temperature", LLAMA_SAMPLER_TYPE_TEMPERATURE}
137137
};
138138

139139
// since samplers names are written multiple ways
140140
// make it ready for both system names and input names
141141
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
142-
{"top-k", llama_sampler_type::TOP_K},
143-
{"top-p", llama_sampler_type::TOP_P},
144-
{"nucleus", llama_sampler_type::TOP_P},
145-
{"typical-p", llama_sampler_type::TYPICAL_P},
146-
{"typical", llama_sampler_type::TYPICAL_P},
147-
{"min-p", llama_sampler_type::MIN_P},
148-
{"tfs-z", llama_sampler_type::TFS_Z},
149-
{"tfs", llama_sampler_type::TFS_Z},
150-
{"temp", llama_sampler_type::TEMPERATURE}
142+
{"top-k", LLAMA_SAMPLER_TYPE_TOP_K},
143+
{"top-p", LLAMA_SAMPLER_TYPE_TOP_P},
144+
{"nucleus", LLAMA_SAMPLER_TYPE_TOP_P},
145+
{"typical-p", LLAMA_SAMPLER_TYPE_TYPICAL_P},
146+
{"typical", LLAMA_SAMPLER_TYPE_TYPICAL_P},
147+
{"min-p", LLAMA_SAMPLER_TYPE_MIN_P},
148+
{"tfs-z", LLAMA_SAMPLER_TYPE_TFS_Z},
149+
{"tfs", LLAMA_SAMPLER_TYPE_TFS_Z},
150+
{"temp", LLAMA_SAMPLER_TYPE_TEMPERATURE}
151151
};
152152

153153
std::vector<llama_sampler_type> sampler_types;
@@ -172,12 +172,12 @@ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vecto
172172

173173
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
174174
std::unordered_map<char, llama_sampler_type> sampler_name_map {
175-
{'k', llama_sampler_type::TOP_K},
176-
{'p', llama_sampler_type::TOP_P},
177-
{'y', llama_sampler_type::TYPICAL_P},
178-
{'m', llama_sampler_type::MIN_P},
179-
{'f', llama_sampler_type::TFS_Z},
180-
{'t', llama_sampler_type::TEMPERATURE}
175+
{'k', LLAMA_SAMPLER_TYPE_TOP_K},
176+
{'p', LLAMA_SAMPLER_TYPE_TOP_P},
177+
{'y', LLAMA_SAMPLER_TYPE_TYPICAL_P},
178+
{'m', LLAMA_SAMPLER_TYPE_MIN_P},
179+
{'f', LLAMA_SAMPLER_TYPE_TFS_Z},
180+
{'t', LLAMA_SAMPLER_TYPE_TEMPERATURE}
181181
};
182182

183183
std::vector<llama_sampler_type> sampler_types;
@@ -203,12 +203,12 @@ static void sampler_queue(
203203

204204
for (auto sampler_type : samplers_sequence) {
205205
switch (sampler_type) {
206-
case llama_sampler_type::TOP_K: llama_sampling_top_k (smpl, cur_p); break;
207-
case llama_sampler_type::TFS_Z: llama_sampling_tail_free(smpl, cur_p); break;
208-
case llama_sampler_type::TYPICAL_P: llama_sampling_typical (smpl, cur_p); break;
209-
case llama_sampler_type::TOP_P: llama_sampling_top_p (smpl, cur_p); break;
210-
case llama_sampler_type::MIN_P: llama_sampling_min_p (smpl, cur_p); break;
211-
case llama_sampler_type::TEMPERATURE: llama_sampling_temp (smpl, cur_p); break;
206+
case LLAMA_SAMPLER_TYPE_TOP_K: llama_sampling_top_k (smpl, cur_p); break;
207+
case LLAMA_SAMPLER_TYPE_TFS_Z: llama_sampling_tail_free(smpl, cur_p); break;
208+
case LLAMA_SAMPLER_TYPE_TYPICAL_P: llama_sampling_typical (smpl, cur_p); break;
209+
case LLAMA_SAMPLER_TYPE_TOP_P: llama_sampling_top_p (smpl, cur_p); break;
210+
case LLAMA_SAMPLER_TYPE_MIN_P: llama_sampling_min_p (smpl, cur_p); break;
211+
case LLAMA_SAMPLER_TYPE_TEMPERATURE: llama_sampling_temp (smpl, cur_p); break;
212212
default : break;
213213
}
214214
}

common/sampling.h

Lines changed: 28 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,47 +5,38 @@
55
#include <string>
66
#include <vector>
77

8-
// sampler types
9-
enum class llama_sampler_type : char {
10-
TOP_K = 'k',
11-
TOP_P = 'p',
12-
MIN_P = 'm',
13-
TFS_Z = 'f',
14-
TYPICAL_P = 'y',
15-
TEMPERATURE = 't'
16-
};
17-
188
// sampling parameters
199
typedef struct gpt_sampling_params {
20-
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
21-
int32_t n_prev = 64; // number of previous tokens to remember
22-
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
23-
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
24-
int32_t top_k = 40; // <= 0 to use vocab size
25-
float top_p = 0.95f; // 1.0 = disabled
26-
float min_p = 0.05f; // 0.0 = disabled
27-
float tfs_z = 1.00f; // 1.0 = disabled
28-
float typical_p = 1.00f; // 1.0 = disabled
29-
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
30-
float dynatemp_range = 0.00f; // 0.0 = disabled
31-
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
32-
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
33-
float penalty_repeat = 1.00f; // 1.0 = disabled
34-
float penalty_freq = 0.00f; // 0.0 = disabled
35-
float penalty_present = 0.00f; // 0.0 = disabled
36-
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
37-
float mirostat_tau = 5.00f; // target entropy
38-
float mirostat_eta = 0.10f; // learning rate
39-
bool penalize_nl = false; // consider newlines as a repeatable token
40-
bool ignore_eos = false;
10+
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
11+
12+
int32_t n_prev = 64; // number of previous tokens to remember
13+
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
14+
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
15+
int32_t top_k = 40; // <= 0 to use vocab size
16+
float top_p = 0.95f; // 1.0 = disabled
17+
float min_p = 0.05f; // 0.0 = disabled
18+
float tfs_z = 1.00f; // 1.0 = disabled
19+
float typical_p = 1.00f; // 1.0 = disabled
20+
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
21+
float dynatemp_range = 0.00f; // 0.0 = disabled
22+
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
23+
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
24+
float penalty_repeat = 1.00f; // 1.0 = disabled
25+
float penalty_freq = 0.00f; // 0.0 = disabled
26+
float penalty_present = 0.00f; // 0.0 = disabled
27+
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
28+
float mirostat_tau = 5.00f; // target entropy
29+
float mirostat_eta = 0.10f; // learning rate
30+
bool penalize_nl = false; // consider newlines as a repeatable token
31+
bool ignore_eos = false;
4132

4233
std::vector<llama_sampler_type> samplers_sequence = {
43-
llama_sampler_type::TOP_K,
44-
llama_sampler_type::TFS_Z,
45-
llama_sampler_type::TYPICAL_P,
46-
llama_sampler_type::TOP_P,
47-
llama_sampler_type::MIN_P,
48-
llama_sampler_type::TEMPERATURE
34+
LLAMA_SAMPLER_TYPE_TOP_K,
35+
LLAMA_SAMPLER_TYPE_TFS_Z,
36+
LLAMA_SAMPLER_TYPE_TYPICAL_P,
37+
LLAMA_SAMPLER_TYPE_TOP_P,
38+
LLAMA_SAMPLER_TYPE_MIN_P,
39+
LLAMA_SAMPLER_TYPE_TEMPERATURE
4940
};
5041

5142
std::string grammar; // optional BNF-like grammar to constrain sampling

include/llama.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
4747
#define LLAMA_STATE_SEQ_VERSION 2
4848

49+
#define LLAMA_MAX_SAMPLERS 16
50+
4951
#ifdef __cplusplus
5052
extern "C" {
5153
#endif
@@ -203,6 +205,16 @@ extern "C" {
203205
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
204206
};
205207

208+
enum llama_sampler_type {
209+
LLAMA_SAMPLER_TYPE_NONE = 0,
210+
LLAMA_SAMPLER_TYPE_TOP_K = 1,
211+
LLAMA_SAMPLER_TYPE_TOP_P = 2,
212+
LLAMA_SAMPLER_TYPE_MIN_P = 3,
213+
LLAMA_SAMPLER_TYPE_TFS_Z = 4,
214+
LLAMA_SAMPLER_TYPE_TYPICAL_P = 5,
215+
LLAMA_SAMPLER_TYPE_TEMPERATURE = 6,
216+
};
217+
206218
typedef struct llama_token_data {
207219
llama_token id; // token id
208220
float logit; // log-odds of the token
@@ -387,7 +399,10 @@ extern "C" {
387399
int32_t mirostat; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
388400
float mirostat_tau; // target entropy
389401
float mirostat_eta; // learning rate
390-
float cfg_scale; // classifier-free guidance scale
402+
403+
// samples
404+
int32_t n_samplers;
405+
enum llama_sampler_type samplers[LLAMA_MAX_SAMPLERS];
391406

392407
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
393408
bool penalize_nl; // consider newlines as a repeatable token

src/llama.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17429,7 +17429,8 @@ struct llama_sampling_params llama_sampling_default_params() {
1742917429
/*.mirostat =*/ 0,
1743017430
/*.mirostat_tau =*/ 5.00f,
1743117431
/*.mirostat_eta =*/ 0.10f,
17432-
/*.cfg_scale =*/ 1.00f,
17432+
/*.n_samplers =*/ 3,
17433+
/*.samplers =*/ { LLAMA_SAMPLER_TYPE_TOP_K, LLAMA_SAMPLER_TYPE_TOP_P, LLAMA_SAMPLER_TYPE_TEMPERATURE },
1743317434
/*.penalize_nl =*/ false,
1743417435
/*.ignore_eos =*/ false,
1743517436
};

0 commit comments

Comments (0)