diff --git a/examples/tts/orpheus-tts.cpp b/examples/tts/orpheus-tts.cpp index 622ec46fde05a..45595e9552fc0 100644 --- a/examples/tts/orpheus-tts.cpp +++ b/examples/tts/orpheus-tts.cpp @@ -298,7 +298,7 @@ int main(int argc, char **argv) { params.model = params.vocoder.model; params.n_batch = 2; - params.embedding = true + params.embedding = true; // disable warmup, SNAC doesn't care about BOS or EOS tokens; params.warmup = false; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index ca4adaa781cb3..5bec63e2e79ff 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -312,9 +312,7 @@ llama_context::llama_context( // reserve pp graph first so that buffers are only allocated once { - LLAMA_LOG_DEBUG("here 3\n"); llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto * gf = graph_init(); graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT); if (!ggml_backend_sched_reserve(sched.get(), gf)) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e711b28684837..bee6e6bd359b4 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1495,8 +1495,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (flags & TENSOR_NOT_REQUIRED) { return nullptr; } - return nullptr; - //throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str())); + throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str())); } // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops @@ -1591,8 +1590,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { return t; } } - fprintf(stderr, "create_tensor: Creating '%s' with ne=[%ld, %ld, %ld]\n", - tn_str.c_str(), ne.begin()[0], ne.begin()[1], ne.begin()[2]); return ml.create_tensor(ctx, tn, ne, flags); };