Description
I tried this code:
#![feature(new_zeroed_alloc)]
trait Square {
fn sqr(self) -> Self;
}
impl Square for f32 {
fn sqr(self) -> Self {
self * self
}
}
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd)]
struct Lab {
l: f32,
a: f32,
b: f32,
}
impl Eq for Lab {}
impl Ord for Lab {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.partial_cmp(other).unwrap()
}
}
fn msc_iter_mf(outputs: &mut [Lab; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, Lab, u32)>, radiussy_squared: f32) {
for (rgb, lab, _) in known_rgbs {
let mut center = *lab;
loop {
let (newcenter_count, newcenter) = known_rgbs.iter()
.filter(|(_, lab, _)| {
let distance_squared = (lab.l - center.l).sqr() + (lab.a - center.a).sqr() + (lab.b - center.b).sqr();
distance_squared <= radiussy_squared
})
.fold((0,
Lab {
l: 0f32,
a: 0f32,
b: 0f32,
}),
|(cnt, acc), (_, lab, _freq)| {
(cnt + _freq,
Lab {
l: acc.l + lab.l * (*_freq as f32),
a: acc.a + lab.a * (*_freq as f32),
b: acc.b + lab.b * (*_freq as f32),
})
});
let newcenter = Lab {
l: newcenter.l / newcenter_count as f32,
a: newcenter.a / newcenter_count as f32,
b: newcenter.b / newcenter_count as f32,
};
if newcenter == center {
break;
}
center = newcenter;
}
outputs[*rgb as usize] = center;
}
}
fn msc_iter_p(outputs: &mut [Lab; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, Lab, u32)>, radiussy_squared: f32) {
for (rgb, lab, _) in known_rgbs {
let mut center = *lab;
loop {
let (newcenter_count, newcenter) = known_rgbs.iter()
.filter(|(_, lab, _)| {
let distance_squared = (lab.l - center.l).sqr() + (lab.a - center.a).sqr() + (lab.b - center.b).sqr();
distance_squared <= radiussy_squared
})
.fold((0,
Lab {
l: 0f32,
a: 0f32,
b: 0f32,
}),
|(cnt, acc), (_, lab, _freq)| {
(cnt + 1,
Lab {
l: acc.l + lab.l,
a: acc.a + lab.a,
b: acc.b + lab.b,
})
});
let newcenter = Lab {
l: newcenter.l / newcenter_count as f32,
a: newcenter.a / newcenter_count as f32,
b: newcenter.b / newcenter_count as f32,
};
if newcenter == center {
break;
}
center = newcenter;
}
outputs[*rgb as usize] = center;
}
}
fn msc_for_mf(outputs: &mut [Lab; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, Lab, u32)>, radiussy_squared: f32) {
for (rgb, lab, _) in known_rgbs {
let mut center = *lab;
loop {
let (mut newcenter_count, mut newcenter) = (0,
Lab {
l: 0f32,
a: 0f32,
b: 0f32,
});
for (_, lab, freq) in known_rgbs {
let distance_squared = (lab.l - center.l).sqr() + (lab.a - center.a).sqr() + (lab.b - center.b).sqr();
if distance_squared <= radiussy_squared {
newcenter_count += *freq;
newcenter.l += lab.l * (*freq as f32);
newcenter.a += lab.a * (*freq as f32);
newcenter.b += lab.b * (*freq as f32);
}
}
newcenter.l /= newcenter_count as f32;
newcenter.a /= newcenter_count as f32;
newcenter.b /= newcenter_count as f32;
if newcenter == center {
outputs[*rgb as usize] = center;
break;
}
center = newcenter;
}
}
}
fn msc_for_p(outputs: &mut [Lab; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, Lab, u32)>, radiussy_squared: f32) {
for (rgb, lab, _) in known_rgbs {
let mut center = *lab;
loop {
let (mut newcenter_count, mut newcenter) = (0,
Lab {
l: 0f32,
a: 0f32,
b: 0f32,
});
for (_, lab, _) in known_rgbs {
let distance_squared = (lab.l - center.l).sqr() + (lab.a - center.a).sqr() + (lab.b - center.b).sqr();
if distance_squared <= radiussy_squared {
newcenter_count += 1;
newcenter.l += lab.l;
newcenter.a += lab.a;
newcenter.b += lab.b;
}
}
newcenter.l /= newcenter_count as f32;
newcenter.a /= newcenter_count as f32;
newcenter.b /= newcenter_count as f32;
if newcenter == center {
outputs[*rgb as usize] = center;
break;
}
center = newcenter;
}
}
}
fn msc_for_p_cold(outputs: &mut [Lab; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, Lab, u32)>, radiussy_squared: f32) {
for (rgb, lab, _) in known_rgbs {
let mut center = *lab;
loop {
let (mut newcenter_count, mut newcenter) = (0,
Lab {
l: 0f32,
a: 0f32,
b: 0f32,
});
for (_, lab, _) in known_rgbs {
let distance_squared = (lab.l - center.l).sqr() + (lab.a - center.a).sqr() + (lab.b - center.b).sqr();
if distance_squared <= radiussy_squared {
#[cold]
fn cold() {}
cold();
newcenter_count += 1;
newcenter.l += lab.l;
newcenter.a += lab.a;
newcenter.b += lab.b;
}
}
newcenter.l /= newcenter_count as f32;
newcenter.a /= newcenter_count as f32;
newcenter.b /= newcenter_count as f32;
if newcenter == center {
outputs[*rgb as usize] = center;
break;
}
center = newcenter;
}
}
}
fn main() {
use std::io::BufRead;
let mut outputs = unsafe { Box::<[Lab; 0xFFFFFF + 1]>::new_zeroed().assume_init() };
let mut known_rgbs = vec![];
for l in std::io::BufReader::new(std::fs::File::open("kr_").unwrap()).lines().map(Result::unwrap) {
let mut iter = l.split_whitespace();
known_rgbs.push((iter.next().unwrap().parse::<u32>().unwrap(),
Lab {
l: iter.next().unwrap().parse::<f32>().unwrap(),
a: iter.next().unwrap().parse::<f32>().unwrap(),
b: iter.next().unwrap().parse::<f32>().unwrap(),
},
1));
}
macro_rules! one {
($f:ident) => {
let start = std::time::Instant::now();
$f(&mut *outputs, &known_rgbs, 0.02f32 * 0.02f32);
let end = std::time::Instant::now();
println!("{}:\t{:?}", stringify!($f), end - start);
}
}
one!(msc_iter_mf);
one!(msc_iter_p);
one!(msc_for_mf);
one!(msc_for_p);
one!(msc_for_p_cold);
}
kr_.gz (real, non-synthetic, data; shouldn't really matter though)
This is five identical implementations (if freq is fixed at 1, which it is).
I expected to see this happen: *_p
variants are faster-or-at-worst-identical to *_mf
.
Instead, this happened: *_mf
is 30% (iter
) or 21% (for
) faster than *_p
despite being more computationally complex.
@zopsicle analysed this godbolt of the fors as "linear_core_p
does the addition unconditionally, then conditionally stores the result. linear_core_mf
preserves the original control flow. If you put #[cold] fn cold() {} cold();
inside the if statement it will preserve the control flow." and they were right, but the _cold
variant is still 7% worse than *_mf
.
This is obviously wrong.
Measurements, for me, on a i7-2600:
msc_iter_mf: 16.235556598
msc_iter_p: 21.680596694
msc_for_mf: 15.856701278
msc_for_p: 19.5196018094
msc_for_p_cold: 14.865306603
Meta
rustc --version --verbose
:
rustc 1.85.1 (4eb161250 2025-03-15)
binary: rustc
commit-hash: 4eb161250e340c8f48f66e2b929ef4a5bed7c181
commit-date: 2025-03-15
host: x86_64-pc-windows-gnu
release: 1.85.1
LLVM version: 19.1.7
Building in release mode in cargo and --codegen opt-level=3
out cargo.
Please note that I am not interested in making the code faster overall, just in the pessimisation that leads to the relative difference between the variants. (You know how programmers are.)