
Commit 33c6bf9

WIP: Compute per-layer LIM scores during imatrix
*WARNING*: This is mostly vibe code. Hope I'm not wasting y'all's time.

Compute Layer Importance Modification (LIM) scores. The goal of this PR is to rank the layers of a given tensor type in order of sensitivity to quantization error. Given that `llama-quantize --custom-q ...` now accepts regex, it may be possible to use these LIM scores to decide which layers of a given tensor to quantize more or less aggressively, in an attempt to preserve generation quality (e.g. low perplexity) while reducing memory footprint compared to using the same quant size across all layers of a given tensor.

This experimental PR was motivated by this comment and PR: ggml-org/llama.cpp#12718

I may force-push this after more testing and experimenting to see if it is actually doing the right thing and if the output is actually useful for improving quantization quality, e.g. PPL per GiB... This may just be a big mistake, lol.

This is built on the existing imatrix computation and assumes that the values of `x[j]` are the "activations" coming right in/out of the given tensor layer. I don't know GGML and generally work in Python or vanilla C, not so much C++, so a lot of this was vibe coded while running the [ubergarm/DeepSeek-V3-0324-GGUF IQ4_K_R4 quant](https://huggingface.co/ubergarm/DeepSeek-V3-0324-GGUF/tree/main/DeepSeek-V3-0324-IQ4_K_R4). So this is partially an experiment in actually using an LLM instead of just enjoying the meta of manual quantization min-maxing.

```
@misc{dumitru2024layerwisequantizationpragmaticeffective,
      title={Layer-Wise Quantization: A Pragmatic and Effective Method for Quantizing LLMs Beyond Integer Bit-Levels},
      author={Razvan-Gabriel Dumitru and Vikas Yadav and Rishabh Maheshwary and Paul-Ioan Clotan and Sathwik Tejaswi Madhusudhan and Mihai Surdeanu},
      year={2024},
      eprint={2406.17415},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2406.17415},
      code={https://github.com/RazvanDu/LayerwiseQuant/},
}
```
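For reference, the score this patch prints is the negated cosine similarity between a layer's stored input activations and the next layer's input (used here as a stand-in for the layer's output), per the paper cited above:

$$
\mathrm{LIM}_\ell = -\cos\left(x_\ell,\, x_{\ell+1}\right) = -\frac{x_\ell \cdot x_{\ell+1}}{\lVert x_\ell \rVert \, \lVert x_{\ell+1} \rVert}
$$

A score near $+1$ means the layer's output points away from its input (the layer transforms its input heavily), which the paper treats as a more important layer; a score near $-1$ means the layer barely changes its input.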
1 parent: c01449a

File tree: 3 files changed, +113 -1 lines changed

common/common.cpp

Lines changed: 4 additions & 0 deletions

```diff
@@ -1419,6 +1419,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.compute_ppl = false;
         return true;
     }
+    if (arg == "--no-lim") {
+        params.compute_lim = false;
+        return true;
+    }
     if (arg == "--chunk" || arg == "--from-chunk") {
         CHECK_ARG
         params.i_chunk = std::stoi(argv[i]);
```
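(With the flag parsed here, LIM computation stays on by default and can be disabled at the command line, e.g. `llama-imatrix -m model.gguf -f some-text.txt --no-lim`, matching the usage string updated in the imatrix example below; the binary name may differ per build.)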

common/common.h

Lines changed: 1 addition & 0 deletions

```diff
@@ -270,6 +270,7 @@ struct gpt_params {

     bool process_output = false; // collect data for the output tensor
     bool compute_ppl    = true;  // whether to compute perplexity
+    bool compute_lim    = true;  // whether to compute and show Layer Importance Scores https://arxiv.org/pdf/2406.17415

     // cvector-generator params
     int n_pca_batch = 100;
```

examples/imatrix/imatrix.cpp

Lines changed: 108 additions & 1 deletion

```diff
@@ -13,6 +13,7 @@
 #include <cstring>
 #include <ctime>
 #include <sstream>
+#include <string>
 #include <thread>
 #include <mutex>
 #include <vector>
@@ -30,12 +31,13 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s \\\n"
             "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
-            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+            "       [--no-ppl] [--no-lim] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
             "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
     LOG_TEE("\n");
 }

 struct Stats {
+    std::vector<float> activations;
     std::vector<float> values;
     std::vector<int> counts;
     int ncall = 0;
@@ -48,6 +50,7 @@ class IMatrixCollector {
     void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
+    void compute_lim();
     bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
@@ -131,6 +134,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         ++e.ncall;

         if (e.values.empty()) {
+            e.activations.resize(src1->ne[0]*n_as, 0);
             e.values.resize(src1->ne[0]*n_as, 0);
             e.counts.resize(src1->ne[0]*n_as, 0);
             e.n_as = n_as;
@@ -162,6 +166,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                 const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);

                 for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                    e.activations[e_start + j] = x[j];
                     e.values[e_start + j] += x[j]*x[j];
                     e.counts[e_start + j]++;
                     if (!std::isfinite(e.values[e_start + j])) {
@@ -183,7 +188,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
     } else {
         auto & e = m_stats[wname];
+
         if (e.values.empty()) {
+            e.activations.resize(src1->ne[0], 0);
             e.values.resize(src1->ne[0], 0);
             e.counts.resize(src1->ne[0], 0);
         }
@@ -198,6 +205,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         for (int row = 0; row < (int)(src1->ne[1]*src1->ne[2]); ++row) {
             const float * x = data + row * src1->ne[0];
             for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                e.activations[j] = x[j];
                 e.values[j] += x[j]*x[j];
                 e.counts[j]++;
                 if (!std::isfinite(e.values[j])) {
@@ -396,6 +404,99 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
     return true;
 }

+// Extract layer number from keys like "blk.17.ffn_gate.weight"
+int extract_layer(const std::string& name) {
+    size_t p1 = name.find('.') + 1; // Skip "blk."
+    size_t p2 = name.find('.', p1); // Find next "."
+    return std::stoi(name.substr(p1, p2 - p1));
+}
+
+void IMatrixCollector::compute_lim() {
+    if (m_stats.empty()) {
+        fprintf(stderr, "%s: no data collected - cannot compute LIM scores\n", __func__);
+        return;
+    }
+    printf("\n===\n");
+    printf("Computing Layer Importance Modification (LIM) Scores...\n");
+
+    // Convert to vector and sort by layer number
+    std::vector<std::pair<std::string, Stats>> sorted_pairs(m_stats.begin(), m_stats.end());
+    std::sort(sorted_pairs.begin(), sorted_pairs.end(),
+        [](const auto& a, const auto& b) {
+            return extract_layer(a.first) < extract_layer(b.first);
+        }
+    );
+
+    // Group activations by tensor type (e.g., ffn_gate, attn_k, etc.)
+    std::unordered_map<std::string, std::vector<std::pair<int, const std::vector<float>*>>> tensor_groups;
+
+    for (const auto& pair : sorted_pairs) {
+        std::string full_name = pair.first;
+        size_t p1 = full_name.find('.') + 1; // Skip "blk."
+        size_t p2 = full_name.find('.', p1); // Find next "."
+        int layer = std::stoi(full_name.substr(p1, p2 - p1));
+        std::string tensor_name = full_name.substr(p2 + 1, full_name.rfind('.') - p2 - 1);
+
+        tensor_groups[tensor_name].emplace_back(layer, &pair.second.activations);
+    }
+
+    // Calculate LIM scores for each tensor type
+    for (const auto& group : tensor_groups) {
+        const std::string& tensor_name = group.first;
+        const auto& layers = group.second;
+
+        printf("\nTensor: %s\n", tensor_name.c_str());
+        printf("Layer\tLIM Score\n");
+        printf("-----\t---------\n");
+
+        // Need at least 2 layers to compute LIM scores
+        if (layers.size() < 2) {
+            printf("(Need at least 2 layers to compute LIM scores)\n");
+            continue;
+        }
+
+        // For each layer, compare with next layer's input (current layer's output)
+        for (size_t i = 0; i < layers.size() - 1; i++) {
+            int layer = layers[i].first;
+            const std::vector<float>& input_acts = *layers[i].second;
+            const std::vector<float>& output_acts = *layers[i+1].second;
+
+            // Check if activation sizes match
+            if (input_acts.size() != output_acts.size()) {
+                printf("%d\t(skipped - dimension mismatch: %zu vs %zu)\n",
+                       layer, input_acts.size(), output_acts.size());
+                continue;
+            }
+
+            // Calculate dot product and magnitudes
+            float dot_product = 0.0f;
+            float input_magnitude = 0.0f;
+            float output_magnitude = 0.0f;
+
+            for (size_t j = 0; j < input_acts.size(); j++) {
+                dot_product += input_acts[j] * output_acts[j];
+                input_magnitude += input_acts[j] * input_acts[j];
+                output_magnitude += output_acts[j] * output_acts[j];
+            }
+
+            input_magnitude = sqrtf(input_magnitude);
+            output_magnitude = sqrtf(output_magnitude);
+
+            // Avoid division by zero
+            if (input_magnitude == 0 || output_magnitude == 0) {
+                printf("%d\t(skipped - zero magnitude)\n", layer);
+                continue;
+            }
+
+            // Calculate cosine similarity and LIM score
+            float cosine_sim = dot_product / (input_magnitude * output_magnitude);
+            float lim_score = -cosine_sim;
+
+            printf("%d\t%.4f\n", layer, lim_score);
+        }
+    }
+}
+
 static IMatrixCollector g_collector;

 static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -683,10 +784,16 @@ int main(int argc, char ** argv) {

     llama_print_timings(ctx);

+    if (params.compute_lim) {
+        g_collector.compute_lim();
+    }
+
     llama_free(ctx);
     llama_free_model(model);

     llama_backend_free();

+
+
     return 0;
 }
```
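One caveat with the WIP code above: `extract_layer()` calls `std::stoi()` unconditionally, so any stats key that doesn't follow the `blk.<N>.<name>.weight` pattern (e.g. `output.weight` when `--process-output` is used) would throw. Below is a minimal standalone sketch, not part of the commit and with hypothetical helper names, showing one way to parse layer numbers defensively plus the same negated-cosine LIM calculation on toy activation vectors:

```cpp
// Standalone sketch (hypothetical helpers, not part of the commit):
// defensive layer parsing plus the same negated-cosine LIM score.
#include <cmath>
#include <cstdio>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

// Return the layer number for keys like "blk.17.ffn_gate.weight",
// or std::nullopt for keys that don't match (e.g. "output.weight").
static std::optional<int> try_extract_layer(const std::string & name) {
    if (name.rfind("blk.", 0) != 0) return std::nullopt;
    size_t p1 = 4;                  // first char after "blk."
    size_t p2 = name.find('.', p1); // end of the layer number
    if (p2 == std::string::npos || p2 == p1) return std::nullopt;
    try {
        return std::stoi(name.substr(p1, p2 - p1));
    } catch (const std::exception &) {
        return std::nullopt;
    }
}

// LIM score = -cos(input, output), same formula as compute_lim() above.
static float lim_score(const std::vector<float> & in, const std::vector<float> & out) {
    float dot = 0.0f, nin = 0.0f, nout = 0.0f;
    for (size_t j = 0; j < in.size(); ++j) {
        dot  += in[j] * out[j];
        nin  += in[j] * in[j];
        nout += out[j] * out[j];
    }
    if (nin == 0.0f || nout == 0.0f) return 0.0f; // degenerate: no signal
    return -dot / (std::sqrt(nin) * std::sqrt(nout));
}

int main() {
    printf("%d\n", try_extract_layer("blk.17.ffn_gate.weight").value_or(-1)); // 17
    printf("%d\n", try_extract_layer("output.weight").value_or(-1));          // -1 (skipped)

    // Identical activations -> cosine 1 -> LIM -1 (layer barely changes its input).
    std::vector<float> a = {1.0f, 2.0f, 3.0f};
    printf("%.4f\n", lim_score(a, a)); // -1.0000
    return 0;
}
```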
