Commit 206251b

[WIP] Improve VecCache under parallel frontend
This replaces the single Vec allocation with a series of progressively larger buckets. With the cfg for parallel enabled but with -Zthreads=1, this looks like a slight regression in instruction and cycle counts (<0.1%).

With the parallel frontend at -Zthreads=4, this is an improvement over our current Lock-based approach (-5% wall-time, from 5.788s to 5.4688s on libcore), likely due to reducing the bouncing of the cache line holding the lock. At -Zthreads=32 it's a huge improvement (-46%: 8.829s -> 4.7319s).

FIXME: Extract the internals to rustc_data_structures, add safety comments, etc.
1 parent 02f7806 commit 206251b

File tree

  • compiler/rustc_query_system/src/query

1 file changed: +226, -33 lines

compiler/rustc_query_system/src/query/caches.rs

Lines changed: 226 additions & 33 deletions
@@ -2,13 +2,16 @@ use crate::dep_graph::DepNodeIndex;

 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::sharded::{self, Sharded};
-use rustc_data_structures::sync::{Lock, OnceLock};
+use rustc_data_structures::sync::{AtomicUsize, OnceLock};
 use rustc_hir::def_id::LOCAL_CRATE;
-use rustc_index::{Idx, IndexVec};
+use rustc_index::Idx;
 use rustc_span::def_id::DefId;
 use rustc_span::def_id::DefIndex;
 use std::fmt::Debug;
 use std::hash::Hash;
+use std::marker::PhantomData;
+use std::mem::offset_of;
+use std::sync::atomic::{AtomicPtr, AtomicU32, Ordering};

 pub trait QueryCache: Sized {
     type Key: Hash + Eq + Copy + Debug;
@@ -100,16 +103,173 @@ where
     }
 }

-pub struct VecCache<K: Idx, V> {
-    cache: Lock<IndexVec<K, Option<(V, DepNodeIndex)>>>,
+struct Slot<V> {
+    // We never construct &Slot<V> so it's fine for this to not be in an UnsafeCell.
+    value: V,
+    // This is both an index and a once-lock.
+    //
+    // 0: not yet initialized.
+    // 1: lock held, initializing.
+    // 2..u32::MAX - 2: initialized.
+    index_and_lock: AtomicU32,
 }

 impl<K: Idx, V> Default for VecCache<K, V> {
     fn default() -> Self {
-        VecCache { cache: Default::default() }
+        VecCache {
+            buckets: Default::default(),
+            key: PhantomData,
+            len: Default::default(),
+            present: Default::default(),
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+struct SlotIndex {
+    bucket_idx: usize,
+    entries: usize,
+    index_in_bucket: usize,
+}
+
+impl SlotIndex {
+    #[inline]
+    fn from_index(idx: u32) -> Self {
+        let mut bucket = idx.checked_ilog2().unwrap_or(0) as usize;
+        let entries;
+        let running_sum;
+        if bucket <= 11 {
+            entries = 1 << 12;
+            running_sum = 0;
+            bucket = 0;
+        } else {
+            entries = 1 << bucket;
+            running_sum = entries;
+            bucket = bucket - 11;
+        }
+        SlotIndex { bucket_idx: bucket, entries, index_in_bucket: idx as usize - running_sum }
+    }
+
+    #[inline]
+    unsafe fn get<V: Copy>(&self, buckets: &[AtomicPtr<Slot<V>>; 21]) -> Option<(V, u32)> {
+        // SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 21, i.e.,
+        // in-bounds of buckets.
+        let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) };
+        let ptr = bucket.load(Ordering::Acquire);
+        // Bucket is not yet initialized: then we obviously won't find this entry in that bucket.
+        if ptr.is_null() {
+            return None;
+        }
+        // SAFETY: Follows from preconditions on `buckets` and `self`.
+        let slot = unsafe { ptr.add(self.index_in_bucket) };
+
+        // SAFETY:
+        let index_and_lock =
+            unsafe { &*slot.byte_add(offset_of!(Slot<V>, index_and_lock)).cast::<AtomicU32>() };
+        let current = index_and_lock.load(Ordering::Acquire);
+        let index = match current {
+            0 => return None,
+            // Treat "initializing" as actually just not initialized at all.
+            // The only reason this is a separate state is that `complete` calls could race and
+            // we can't allow that, but from load perspective there's no difference.
+            1 => return None,
+            _ => current - 2,
+        };
+
+        // SAFETY:
+        // * slot is a valid pointer (buckets are always valid for the index we get).
+        // * value is initialized since we saw a >= 2 index above.
+        // * `V: Copy`, so safe to read.
+        let value = unsafe { slot.byte_add(offset_of!(Slot<V>, value)).cast::<V>().read() };
+        Some((value, index))
+    }
+
+    /// Returns true if this successfully put into the map.
+    #[inline]
+    unsafe fn put<V>(&self, buckets: &[AtomicPtr<Slot<V>>; 21], value: V, extra: u32) -> bool {
+        static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+        // SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 21, i.e.,
+        // in-bounds of buckets.
+        let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) };
+        let mut ptr = bucket.load(Ordering::Acquire);
+        let _allocator_guard;
+        if ptr.is_null() {
+            // If we load null, then acquire the global lock; this path is quite cold, so it's cheap
+            // to use a global lock.
+            _allocator_guard = LOCK.lock();
+            // And re-load the value.
+            ptr = bucket.load(Ordering::Acquire);
+        }
+
+        // OK, now under the allocator lock, if we're still null then it's definitely us that will
+        // initialize this bucket.
+        if ptr.is_null() {
+            let bucket_layout =
+                std::alloc::Layout::array::<Slot<V>>(self.entries as usize).unwrap();
+            // SAFETY: Always >0 entries in each bucket.
+            let allocated = unsafe { std::alloc::alloc_zeroed(bucket_layout).cast::<Slot<V>>() };
+            if allocated.is_null() {
+                std::alloc::handle_alloc_error(bucket_layout);
+            }
+            bucket.store(allocated, Ordering::Release);
+            ptr = allocated;
+        }
+        assert!(!ptr.is_null());
+
+        // SAFETY: `index_in_bucket` is always in-bounds of the allocated array.
+        let slot = unsafe { ptr.add(self.index_in_bucket) };
+
+        let index_and_lock =
+            unsafe { &*slot.byte_add(offset_of!(Slot<V>, index_and_lock)).cast::<AtomicU32>() };
+        match index_and_lock.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed) {
+            Ok(_) => {
+                // We have acquired the initialization lock. It is our job to write `value` and
+                // then set the lock to the real index.
+
+                unsafe {
+                    slot.byte_add(offset_of!(Slot<V>, value)).cast::<V>().write(value);
+                }
+
+                index_and_lock.store(extra.checked_add(2).unwrap(), Ordering::Release);
+
+                true
+            }
+
+            // Treat "initializing" as actually initialized: we lost the race and should skip
+            // any updates to this slot.
+            //
+            // FIXME: Is that OK? Some of our other implementations replace the entry instead of
+            // preserving the earliest write. We could do a parking-lot or spinlock here...
+            Err(1) => false,
+
+            // This slot was already populated. Also ignore, currently this is the same as
+            // "initializing".
+            Err(_) => false,
+        }
     }
 }

+pub struct VecCache<K: Idx, V> {
+    // Entries per bucket:
+    // Bucket 0: 4096 2^12
+    // Bucket 1: 4096 2^12
+    // Bucket 2: 8192
+    // Bucket 3: 16384
+    // ...
+    // Bucket 19: 1073741824
+    // Bucket 20: 2147483648
+    // The total number of entries if all buckets are initialized is u32::MAX-1.
+    buckets: [AtomicPtr<Slot<V>>; 21],
+
+    // Present and len are only used during incremental and self-profiling.
+    // They are an optimization over iterating the full buckets array.
+    present: [AtomicPtr<Slot<()>>; 21],
+    len: AtomicUsize,
+
+    key: PhantomData<K>,
+}
+
 impl<K, V> QueryCache for VecCache<K, V>
 where
     K: Eq + Idx + Copy + Debug,
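Aside: the mapping computed by `SlotIndex::from_index` above is just "take the log2 of the key, with everything below 4096 collapsed into bucket 0". The following standalone sketch of that arithmetic is not part of this commit (the `bucket_for` helper is hypothetical); it only makes the doubling-bucket layout concrete and can be run on its own.

// Hypothetical helper mirroring the math in SlotIndex::from_index:
// returns (bucket_idx, index_in_bucket) for a u32 key.
fn bucket_for(idx: u32) -> (usize, usize) {
    let log = idx.checked_ilog2().unwrap_or(0) as usize;
    if log <= 11 {
        // Keys 0..4096 all land in bucket 0.
        (0, idx as usize)
    } else {
        // Bucket `log - 11` holds 1 << log entries and starts at key 1 << log.
        (log - 11, idx as usize - (1 << log))
    }
}

fn main() {
    assert_eq!(bucket_for(0), (0, 0));
    assert_eq!(bucket_for(4095), (0, 4095)); // last slot of bucket 0
    assert_eq!(bucket_for(4096), (1, 0));    // bucket 1 also holds 4096 slots
    assert_eq!(bucket_for(8191), (1, 4095));
    assert_eq!(bucket_for(8192), (2, 0));    // from here on, capacities double
    assert_eq!(bucket_for(u32::MAX), (20, (1 << 31) - 1));
}

Buckets 0 and 1 each hold 4096 slots and every later bucket doubles, which is why 21 lazily allocated buckets are enough to cover the whole u32 key space.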
@@ -120,20 +280,41 @@ where

     #[inline(always)]
     fn lookup(&self, key: &K) -> Option<(V, DepNodeIndex)> {
-        let lock = self.cache.lock();
-        if let Some(Some(value)) = lock.get(*key) { Some(*value) } else { None }
+        let key = u32::try_from(key.index()).unwrap();
+        let slot_idx = SlotIndex::from_index(key);
+        match unsafe { slot_idx.get(&self.buckets) } {
+            Some((value, idx)) => Some((value, DepNodeIndex::from_u32(idx))),
+            None => None,
+        }
     }

     #[inline]
     fn complete(&self, key: K, value: V, index: DepNodeIndex) {
-        let mut lock = self.cache.lock();
-        lock.insert(key, (value, index));
+        let key = u32::try_from(key.index()).unwrap();
+        let slot_idx = SlotIndex::from_index(key);
+        if unsafe { slot_idx.put(&self.buckets, value, index.index() as u32) } {
+            let present_idx = self.len.fetch_add(1, Ordering::Relaxed);
+            let slot = SlotIndex::from_index(present_idx as u32);
+            // We should always be uniquely putting due to `len` fetch_add returning unique values.
+            assert!(unsafe { slot.put(&self.present, (), key) });
+        }
     }

     fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
-        for (k, v) in self.cache.lock().iter_enumerated() {
-            if let Some(v) = v {
-                f(&k, &v.0, v.1);
+        for idx in 0..self.len.load(Ordering::Relaxed) {
+            let key = SlotIndex::from_index(idx as u32);
+            match unsafe { key.get(&self.present) } {
+                // This shouldn't happen in our current usage (iter is really only
+                // used long after queries are done running), but if we hit this in practice it's
+                // probably fine to just break early.
+                None => unreachable!(),
+                Some(((), key)) => {
+                    let key = K::new(key as usize);
+                    // unwrap() is OK: present entries are always written only after we put the real
+                    // entry.
+                    let value = self.lookup(&key).unwrap();
+                    f(&key, &value.0, value.1);
+                }
            }
        }
    }
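The reason `lookup` above can skip locking entirely is the little state machine documented on `Slot`: 0 means empty, 1 means a writer holds the per-slot initialization lock, and any value n >= 2 means the slot has been published together with index n - 2. Below is a minimal, self-contained sketch of that protocol on a single slot, under the assumption that a Release store after the value write, paired with an Acquire load before the value read, is what makes the unsynchronized read sound; the `OnceSlot` type and its methods are illustrative only, not rustc's code.

use std::cell::UnsafeCell;
use std::sync::atomic::{AtomicU32, Ordering};

// Sketch of a per-slot "once-lock": state 0 = empty, 1 = being written,
// n + 2 = written, carrying index n alongside the payload.
struct OnceSlot {
    value: UnsafeCell<u32>,
    state: AtomicU32,
}

// Sharing across threads is sound because the value is only written while the
// CAS-guarded "initializing" state is held, and only read after state >= 2.
unsafe impl Sync for OnceSlot {}

impl OnceSlot {
    const fn new() -> Self {
        OnceSlot { value: UnsafeCell::new(0), state: AtomicU32::new(0) }
    }

    /// First caller wins; racing or later callers are ignored.
    fn put(&self, value: u32, index: u32) -> bool {
        match self.state.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed) {
            Ok(_) => {
                // We hold the init "lock": write the value, then publish the index.
                unsafe { *self.value.get() = value };
                self.state.store(index + 2, Ordering::Release);
                true
            }
            // The empty -> locked CAS failed: someone else is (or was) writing.
            Err(_) => false,
        }
    }

    fn get(&self) -> Option<(u32, u32)> {
        match self.state.load(Ordering::Acquire) {
            0 | 1 => None, // empty, or still being initialized
            n => Some((unsafe { *self.value.get() }, n - 2)),
        }
    }
}

fn main() {
    let slot = OnceSlot::new();
    assert_eq!(slot.get(), None);
    assert!(slot.put(42, 7));
    assert!(!slot.put(99, 8)); // lost the race: the first write is kept
    assert_eq!(slot.get(), Some((42, 7)));
}

In the real cache this state lives in `index_and_lock`, and a losing writer simply drops its value, which is why `SlotIndex::put` returns a bool.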
@@ -142,10 +323,7 @@ where
 pub struct DefIdCache<V> {
     /// Stores the local DefIds in a dense map. Local queries are much more often dense, so this is
     /// a win over hashing query keys at marginal memory cost (~5% at most) compared to FxHashMap.
-    ///
-    /// The second element of the tuple is the set of keys actually present in the IndexVec, used
-    /// for faster iteration in `iter()`.
-    local: Lock<(IndexVec<DefIndex, Option<(V, DepNodeIndex)>>, Vec<DefIndex>)>,
+    local: VecCache<DefIndex, V>,
     foreign: DefaultCache<DefId, V>,
 }

@@ -165,8 +343,7 @@ where
     #[inline(always)]
     fn lookup(&self, key: &DefId) -> Option<(V, DepNodeIndex)> {
         if key.krate == LOCAL_CRATE {
-            let cache = self.local.lock();
-            cache.0.get(key.index).and_then(|v| *v)
+            self.local.lookup(&key.index)
         } else {
             self.foreign.lookup(key)
         }
@@ -175,27 +352,43 @@ where
     #[inline]
     fn complete(&self, key: DefId, value: V, index: DepNodeIndex) {
         if key.krate == LOCAL_CRATE {
-            let mut cache = self.local.lock();
-            let (cache, present) = &mut *cache;
-            let slot = cache.ensure_contains_elem(key.index, Default::default);
-            if slot.is_none() {
-                // FIXME: Only store the present set when running in incremental mode. `iter` is not
-                // used outside of saving caches to disk and self-profile.
-                present.push(key.index);
-            }
-            *slot = Some((value, index));
+            self.local.complete(key.index, value, index)
         } else {
             self.foreign.complete(key, value, index)
         }
     }

     fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
-        let guard = self.local.lock();
-        let (cache, present) = &*guard;
-        for &idx in present.iter() {
-            let value = cache[idx].unwrap();
-            f(&DefId { krate: LOCAL_CRATE, index: idx }, &value.0, value.1);
-        }
+        self.local.iter(&mut |key, value, index| {
+            f(&DefId { krate: LOCAL_CRATE, index: *key }, value, index);
+        });
         self.foreign.iter(f);
     }
 }
+
+#[test]
+fn slot_index_exhaustive() {
+    let mut buckets = [0u32; 21];
+    for idx in 0..=u32::MAX {
+        buckets[SlotIndex::from_index(idx).bucket_idx] += 1;
+    }
+    let mut prev = None::<SlotIndex>;
+    for idx in 0..u32::MAX {
+        let slot_idx = SlotIndex::from_index(idx);
+        if let Some(p) = prev {
+            if p.bucket_idx == slot_idx.bucket_idx {
+                assert_eq!(p.index_in_bucket + 1, slot_idx.index_in_bucket);
+            } else {
+                assert_eq!(slot_idx.index_in_bucket, 0);
+            }
+        } else {
+            assert_eq!(idx, 0);
+            assert_eq!(slot_idx.index_in_bucket, 0);
+            assert_eq!(slot_idx.bucket_idx, 0);
+        }
+
+        assert_eq!(buckets[slot_idx.bucket_idx], slot_idx.entries as u32);
+
+        prev = Some(slot_idx);
+    }
+}
