diff --git a/.gitignore b/.gitignore
index b862a0415f279..5c397c7501528 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,7 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
+/infill
 /libllama.so
 /llama-bench
 /main
diff --git a/Makefile b/Makefile
index e07db8afa16f5..31800ae39bc68 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
@@ -513,6 +513,8 @@ main: examples/main/main.cpp build-info.h ggml.
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
+infill: examples/infill/infill.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt
new file mode 100644
index 0000000000000..046f9b1e7f4b2
--- /dev/null
+++ b/examples/infill/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(TARGET infill)
+add_executable(${TARGET} infill.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/infill/README.md b/examples/infill/README.md
new file mode 100644
index 0000000000000..8c97f719b00a0
--- /dev/null
+++ b/examples/infill/README.md
@@ -0,0 +1,41 @@
+# llama.cpp/example/infill
+
+This example shows how to use infill mode with Code Llama models that support it.
+Currently the 7B and 13B models support infill mode.
+
+Infill supports most of the options available in the main example.
+
+For further information, have a look at the main example's README.md in `llama.cpp/examples/main/README.md`.
+
+## Common Options
+
+In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models; a minimal example invocation follows the list:
+
+- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
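+
+As a quick start, the sketch below builds the new target and asks the model to complete a small function body non-interactively. The model file name is only a placeholder (use any Code Llama GGUF file you have locally), and the `--in-prefix`/`--in-suffix` options used here are described in the next section:
+
+```bash
+# build the infill example (Makefile build; a CMake build typically places the binary under build/bin)
+make infill
+
+# fill in the body of a function between the given prefix and suffix
+./infill -m models/codellama-7b.Q5_K_S.gguf -c 4096 -n 64 \
+    --in-prefix "int add(int a, int b) {\n    return " --in-suffix "\n}\n"
+```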
+
+## Input Prompts
+
+The `infill` program provides several ways to interact with the LLaMA models using input prompts:
+
+- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
+- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
+- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
+
+## Interaction
+
+The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. Interactive mode can be triggered using `--interactive` or `--interactive-first`.
+
+### Interaction Options
+
+- `-i, --interactive`: Run the program in interactive mode, allowing users to get real-time code suggestions from the model.
+- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
+- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
+
+### Example
+
+```bash
+./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n    print(\"hell" --in-suffix "\n    print(\"goodbye world\")\n    "
+```
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
new file mode 100644
index 0000000000000..233c12a0878c1
--- /dev/null
+++ b/examples/infill/infill.cpp
@@ -0,0 +1,771 @@
+#include "common.h"
+
+#include "console.h"
+#include "llama.h"
+#include "build-info.h"
+#include "grammar-parser.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+static llama_context           ** g_ctx;
+static llama_model             ** g_model;
+static gpt_params               * g_params;
+static std::vector<llama_token> * g_input_tokens;
+static std::ostringstream       * g_output_ss;
+static std::vector<llama_token> * g_output_tokens;
+static bool is_interacting = false;
+
+void write_logfile(
+    const llama_context * ctx, const gpt_params & params, const llama_model * model,
+    const std::vector<llama_token> & input_tokens, const std::string & output,
+    const std::vector<llama_token> & output_tokens
+) {
+    if (params.logdir.empty()) {
+        return;
+    }
+
+    const std::string timestamp = get_sortable_timestamp();
+
+    const bool success = create_directory_with_parents(params.logdir);
+    if (!success) {
+        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+                __func__, params.logdir.c_str());
+        return;
+    }
+
+    const std::string logfile_path = params.logdir + timestamp + ".yml";
+    FILE * logfile = fopen(logfile_path.c_str(), "w");
+
+    if (logfile == NULL) {
+        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        return;
+    }
+
+    fprintf(logfile, "binary: infill\n");
+    char model_desc[128];
+    llama_model_desc(model, model_desc, sizeof(model_desc));
+    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+
+    fprintf(logfile, "\n");
+    fprintf(logfile, "######################\n");
+    fprintf(logfile, "# Generation Results #\n");
+    fprintf(logfile, "######################\n");
+    fprintf(logfile, "\n");
+
+    dump_string_yaml_multiline(logfile, "output",
output.c_str()); + dump_vector_int_yaml(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +void sigint_handler(int signo) { + if (signo == SIGINT) { + if (!is_interacting) { + is_interacting = true; + } else { + console::cleanup(); + printf("\n"); + llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); + _exit(130); + } + } +} +#endif + +int main(int argc, char ** argv) { + gpt_params params; + g_params = ¶ms; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("infill", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + + if (params.perplexity) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.embedding) { + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + if (params.instruct) { + printf("\n************\n"); + printf("%s: please use the 'main' tool for instruct mode\n", __func__); + printf("************\n\n"); + + return 0; + } + if (!params.antiprompt.empty()) { + printf("\n************\n"); + printf("%s: please use the 'main' tool for antiprompt mode\n", __func__); + printf("************\n\n"); + + return 0; + } + if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { + printf("\n************\n"); + printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); + printf("************\n\n"); + + return 0; + } + if (params.random_prompt) { + printf("\n************\n"); + printf("%s: please use the 'main' tool for random prompt mode\n", __func__); + printf("************\n\n"); + + return 0; + } + if (!params.path_prompt_cache.empty()) { + printf("\n************\n"); + printf("%s: infill does not support prompt caching\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.rope_freq_base != 10000.0) { + LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 1.0) { + LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); + } + + LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + LOG_TEE("%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + + LOG("%s: llama backend init\n", __func__); + llama_backend_init(params.numa); + + llama_model * model; + llama_context * ctx; + llama_context * ctx_guidance = NULL; + g_model = &model; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + LOG("%s: load the model and apply lora adapter, if any\n", __func__); + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (params.cfg_scale > 1.f) { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + 
ctx_guidance = llama_new_context_with_model(model, lparams);
+    }
+
+    if (model == NULL) {
+        LOG_TEE("%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+
+    const int n_ctx_train = llama_n_ctx_train(ctx);
+    if (params.n_ctx > n_ctx_train) {
+        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                __func__, n_ctx_train, params.n_ctx);
+    } else if (params.n_ctx < 8) {
+        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+
+    // print system information
+    {
+        LOG_TEE("\n");
+        LOG_TEE("system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
+    // export the cgraph and exit
+    if (params.export_cgraph) {
+        llama_eval_export(ctx, "llama.ggml");
+        llama_free(ctx);
+        llama_free_model(model);
+
+        return 0;
+    }
+
+    const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    LOG("add_bos: %d\n", add_bos);
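+
+    // The infill prompt below is assembled in fill-in-the-middle (FIM) order:
+    // the model's special prefix token, the tokenized --in-prefix text, the special
+    // suffix token, the tokenized --in-suffix text, and finally the "middle" token,
+    // after which the model generates the code that belongs between prefix and suffix.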
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (ctx_guidance) { + LOG_TEE("\n"); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int) guidance_inp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + } + } + + if (params.n_keep > 0) { + LOG_TEE("%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + LOG_TEE("'\n"); + } + LOG_TEE("\n"); + } + + if (params.interactive) { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + + LOG_TEE("%s: interactive mode on.\n", __func__); + + if (params.input_prefix_bos) { + LOG_TEE("Input prefix with BOS\n"); + } + + if (!params.input_prefix.empty()) { + LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + } + + if (!params.input_suffix.empty()) { + LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + } + } + LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", + params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("\n\n"); + + struct llama_grammar * grammar = NULL; + grammar_parser::parse_state parsed_grammar; + + if (!params.grammar.empty()) { + parsed_grammar = grammar_parser::parse(params.grammar.c_str()); + // will be empty (default) if there are parse errors + if (parsed_grammar.rules.empty()) { + return 1; + } + LOG_TEE("%s: grammar:\n", __func__); + grammar_parser::print_grammar(stderr, parsed_grammar); + LOG_TEE("\n"); + + { + auto it = params.logit_bias.find(llama_token_eos(ctx)); + if (it != params.logit_bias.end() && it->second == -INFINITY) { + LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); + } + } + + std::vector grammar_rules(parsed_grammar.c_rules()); + grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + } + + // TODO: replace with ring-buffer + std::vector last_tokens(n_ctx); + std::fill(last_tokens.begin(), last_tokens.end(), 0); + LOG_TEE("\n##### Infill mode #####\n\n"); + if (params.infill) { + printf("\n************\n"); + printf("no need to specify '--infill', always running infill\n"); + printf("************\n\n"); + } + if (params.interactive) { + const char *control_message; + if (params.multiline_input) { + control_message = " - To return control to LLaMa, end your input with '\\'.\n" + 
" - To return control without starting a new line, end your input with '/'.\n"; + } else { + control_message = " - Press Return to return control to LLaMa.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input with '\\'.\n"; + } + LOG_TEE("== Running in interactive mode. ==\n"); +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); +#endif + LOG_TEE( "%s\n", control_message); + + is_interacting = params.interactive_first; + } + + bool input_echo = true; + + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; + int n_past_guidance = 0; + + std::vector input_tokens; g_input_tokens = &input_tokens; + std::vector output_tokens; g_output_tokens = &output_tokens; + std::ostringstream output_ss; g_output_ss = &output_ss; + + // the first thing we will do is to output the prompt, so set color accordingly + console::set_display(console::prompt); + + std::vector embd; + std::vector embd_guidance; + + const int n_vocab = llama_n_vocab(ctx); + + std::vector candidates; + candidates.reserve(n_vocab); + + while (n_remain != 0 || params.interactive) { + // predict + if (!embd.empty()) { + // Note: n_ctx - 4 here is to match the logic for commandLine prompt handling via + // --prompt or --file which uses the same value. + int max_embd_size = n_ctx - 4; + + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int) embd.size() > max_embd_size) { + const int skipped_tokens = (int) embd.size() - max_embd_size; + embd.resize(max_embd_size); + + console::set_display(console::error); + printf("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); + console::set_display(console::reset); + fflush(stdout); + } + + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + if (params.n_predict == -2) { + LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep); + + // always keep the first token - BOS + n_past = std::max(1, params.n_keep); + n_past_guidance = std::max(1, params.n_keep + guidance_offset); + + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + + // insert n_left/2 tokens at the start of embd from last_tokens + embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size()); + + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + + } + + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + + if (ctx_guidance) { + int input_size = 0; + llama_token * input_buf = NULL; + + if (n_past_guidance < (int) guidance_inp.size()) { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + original_prompt_len < embd.end()) { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() + ); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance)); + } else { + input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { + int n_eval = (int) embd.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + + if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past += n_eval; + + LOG("n_past = %d\n", n_past); + } + + } + + embd.clear(); + embd_guidance.clear(); + + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { + + const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates); + + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(id); + + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens)); + + embd.push_back(id); + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + + LOG("n_remain: %d\n", n_remain); + } else { + // some user input remains from prompt or interaction, forward it to processing + LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + 
last_tokens.erase(last_tokens.begin());
+                last_tokens.push_back(embd_inp[n_consumed]);
+                ++n_consumed;
+                if ((int) embd.size() >= params.n_batch) {
+                    break;
+                }
+            }
+        }
+
+        // display text
+        if (input_echo) {
+            for (auto id : embd) {
+                const std::string token_str = llama_token_to_piece(ctx, id);
+                printf("%s", token_str.c_str());
+
+                if (embd.size() > 1) {
+                    input_tokens.push_back(id);
+                } else {
+                    output_tokens.push_back(id);
+                    output_ss << token_str;
+                }
+            }
+            fflush(stdout);
+        }
+        // reset color to default if there is no pending user input
+        if (input_echo && (int) embd_inp.size() == n_consumed) {
+            console::set_display(console::reset);
+        }
+
+        // if not currently processing queued inputs;
+        if ((int) embd_inp.size() <= n_consumed) {
+
+            // deal with eot token in infill mode
+            if ((last_tokens.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
+                if(is_interacting && !params.interactive_first) {
+                    // print an eot token
+                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
+                }
+                fflush(stdout);
+                printf("\n");
+                console::set_display(console::user_input);
+                std::string buffer;
+                std::string line;
+                bool another_line=true;
+                // set a new prefix via stdin
+                do {
+                    another_line = console::readline(line, params.multiline_input);
+                    buffer += line;
+                } while (another_line);
+                // check if we got an empty line, if so we use the old input
+                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
+                    params.input_prefix = buffer;
+                }
+                buffer.clear();
+                // set a new suffix via stdin
+                do {
+                    another_line = console::readline(line, params.multiline_input);
+                    buffer += line;
+                } while (another_line);
+                // check if we got an empty line
+                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
+                    params.input_suffix = buffer;
+                }
+                buffer.clear();
+                // done taking input, reset color
+                console::set_display(console::reset);
+                // tokenize new prefix and suffix
+                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
+                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
+                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
+                embd_inp = inp_pfx;
+                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                embd_inp.push_back(llama_token_middle(ctx));
+                embd.clear();
+                embd_guidance.clear();
+                n_remain = params.n_predict;
+                n_past = 0;
+                n_consumed = 0;
+                // LOG_TEE("took new input\n");
+                is_interacting = false;
+            }
+            // deal with end of text token in interactive mode
+            else if (last_tokens.back() == llama_token_eos(ctx)) {
+                LOG("found EOS token\n");
+
+                if (params.interactive) {
+
+                    is_interacting = true;
+                    printf("\n");
+                    console::set_display(console::user_input);
+                    fflush(stdout);
+                }
+            }
+
+            if (n_past > 0 && is_interacting && !params.interactive) {
+                LOG("waiting for user input\n");
+
+                if (params.input_prefix_bos) {
+                    LOG("adding input prefix BOS token\n");
+                    embd_inp.push_back(llama_token_bos(ctx));
+                }
+
+                std::string buffer;
+                if (!params.input_prefix.empty()) {
+                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                    buffer += params.input_prefix;
+                    printf("%s", buffer.c_str());
+                }
+
+                std::string line;
+                bool another_line = true;
+                do {
+                    another_line = console::readline(line, params.multiline_input);
+                    buffer += line;
+                } while (another_line);
+
+                // done taking input, reset color
+                console::set_display(console::reset);
+
+                // Add tokens to embd only if the input buffer is non-empty
+                // Entering a
empty line lets the user pass control back + if (buffer.length() > 1) { + // append input suffix if any + if (!params.input_suffix.empty()) { + LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + buffer += params.input_suffix; + printf("%s", params.input_suffix.c_str()); + } + + LOG("buffer: '%s'\n", buffer.c_str()); + + const size_t original_size = embd_inp.size(); + + const auto line_inp = ::llama_tokenize(ctx, buffer, false); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + output_tokens.push_back(token); + output_ss << llama_token_to_piece(ctx, token); + } + + n_remain -= line_inp.size(); + LOG("n_remain: %d\n", n_remain); + } else { + LOG("empty line, passing control back\n"); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0) { + if (is_interacting) { + // reset grammar state if we're restarting generation + if (grammar != NULL) { + llama_grammar_free(grammar); + + std::vector grammar_rules(parsed_grammar.c_rules()); + grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), + parsed_grammar.symbol_ids.at("root")); + } + } + is_interacting = false; + } + } + + // end of text token + if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) { + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). + if (params.interactive && n_remain <= 0 && params.n_predict >= 0 ) { + n_remain = params.n_predict; + is_interacting = true; + } + } + if (!params.interactive && n_remain <= 0) { + printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); + fflush(stdout); + } + + llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + + if (ctx_guidance) { llama_free(ctx_guidance); } + llama_free(ctx); + llama_free_model(model); + + if (grammar != NULL) { + llama_grammar_free(grammar); + } + llama_backend_free(); + +#ifndef LOG_DISABLE_LOGS + LOG_TEE("Log end\n") +#endif // LOG_DISABLE_LOGS + + return 0; +} diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 365640ffb55cd..15d7974c6d667 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -239,15 +239,8 @@ int main(int argc, char ** argv) { LOG("add_bos: %d\n", add_bos); std::vector embd_inp; - if(params.infill) { - std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos); - std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos); - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); - embd_inp = inp_pfx; - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(ctx)); - } else if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { + + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); } else { @@ -716,58 +709,9 @@ int main(int argc, char ** argv) { LOG("found antiprompt: %s\n", last_output.c_str()); } } - // deal with eot token in infill mode - if ((last_tokens.back() == llama_token_eot(ctx) || 
is_interacting) && params.infill && params.interactive){ - if(is_interacting && !params.interactive_first) { - // print an eot token - printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); - } - fflush(stdout); - printf("\n"); - console::set_display(console::user_input); - std::string buffer; - std::string line; - bool another_line=true; - // set a new prefix via stdin - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - // check if we got an empty line, if so we use the old input - if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { - params.input_prefix = buffer; - } - buffer.clear(); - // set a new suffix via stdin - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - // check if we got an empty line - if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { - params.input_suffix = buffer; - } - buffer.clear(); - // done taking input, reset color - console::set_display(console::reset); - // tokenize new prefix and suffix - std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos); - std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos); - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); - embd_inp = inp_pfx; - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(ctx)); - embd.clear(); - embd_guidance.clear(); - n_remain = params.n_predict; - n_past = 0; - n_consumed = 0; - // LOG_TEE("took new input\n"); - is_interacting = false; - } + // deal with end of text token in interactive mode - else if (last_tokens.back() == llama_token_eos(ctx)) { + if (last_tokens.back() == llama_token_eos(ctx)) { LOG("found EOS token\n"); if (params.interactive) { @@ -787,7 +731,7 @@ int main(int argc, char ** argv) { } } - if (n_past > 0 && is_interacting && !(params.infill && params.interactive)) { + if (n_past > 0 && is_interacting ) { LOG("waiting for user input\n"); if (params.instruct) { @@ -881,23 +825,17 @@ int main(int argc, char ** argv) { // end of text token if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) { - if (!params.infill){ - LOG_TEE(" [end of text]\n"); - } + LOG_TEE(" [end of text]\n"); break; } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). - if (params.interactive && n_remain <= 0 && params.n_predict >= 0 ) { + if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { n_remain = params.n_predict; is_interacting = true; } } - if (params.infill && !params.interactive && n_remain <= 0) { - printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); - fflush(stdout); - } if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());