Skip to content

server : fix slot selection by lru #10126

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ struct server_slot {
if (is_processing()) {
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);

t_last_used = ggml_time_us();
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
state = SLOT_STATE_IDLE;
callback_on_release(id);
Expand Down Expand Up @@ -730,7 +731,7 @@ struct server_context {

// find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
int max_lcs_len = 0;
int lcs_len = 0;
float similarity = 0;

for (server_slot & slot : slots) {
Expand All @@ -745,20 +746,21 @@ struct server_context {
}

// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);

// fraction of the common subsequence length compared to the current slot's prompt length
similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());

// select the current slot if the criteria match
if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
max_lcs_len = lcs_len;
if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
lcs_len = cur_lcs_len;
similarity = cur_similarity;
ret = &slot;
}
}

if (ret != nullptr) {
SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
}
}

Expand Down
14 changes: 7 additions & 7 deletions examples/server/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,20 +453,20 @@ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tok
}

// get the lengths of the input sequences
int a_len = a.size();
int b_len = b.size();
size_t a_len = a.size();
size_t b_len = b.size();

// initialize the maximum length of the longest common subsequence (LCS)
int max_length = 0;
size_t max_length = 0;

// use two rows instead of a 2D matrix to optimize space
std::vector<int> prev_row(b_len + 1, 0);
std::vector<int> curr_row(b_len + 1, 0);
std::vector<size_t> prev_row(b_len + 1, 0);
std::vector<size_t> curr_row(b_len + 1, 0);

// iterate through the elements of a
for (int i = 1; i <= a_len; i++) {
for (size_t i = 1; i <= a_len; i++) {
// iterate through the elements of b
for (int j = 1; j <= b_len; j++) {
for (size_t j = 1; j <= b_len; j++) {
// if elements at the current positions match
if (a[i - 1] == b[j - 1]) {
// if it's the first element of either sequence, set LCS length to 1
Expand Down
Loading