@@ -1559,41 +1559,20 @@ pub unsafe fn write_volatile<T>(dst: *mut T, src: T) {
///
/// # Safety
/// `a` must be a power of two.
- #[lang = "align_offset"]
- #[rustc_do_not_const_check]
- #[cfg(not(bootstrap))]
- pub(crate) const unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
-     // SAFETY: Caller ensures that `a` is a power of two.
-     unsafe { const_align_offset::<T>(p.addr(), a) }
- }
-
- #[lang = "align_offset"]
- #[cfg(bootstrap)]
- pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
-     // SAFETY: Caller ensures that `a` is a power of two.
-     unsafe { const_align_offset::<T>(p.addr(), a) }
- }
-
- /// Align address `addr`.
- ///
- /// Calculate offset (in terms of elements of `size_of::<T>()` stride) that has to be applied
- /// to address `addr` so that `addr` would get aligned to `a`.
///
- /// Note: This implementation has been carefully tailored to not panic. It is UB for this to panic.
+ /// # Notes
+ /// This implementation has been carefully tailored to not panic. It is UB for this to panic.
/// The only real change that can be made here is change of `INV_TABLE_MOD_16` and associated
/// constants.
///
- /// # Safety
- /// `a` must be a power of two.
- ///
/// If we ever decide to make it possible to call the intrinsic with `a` that is not a
/// power-of-two, it will probably be more prudent to just change to a naive implementation rather
/// than trying to adapt this to accommodate that change.
///
/// Any questions go to @nagisa.
- #[cfg_attr(not(bootstrap), lang = "const_align_offset")]
- #[rustc_allow_const_fn_unstable(const_exact_div)]
- pub(crate) const unsafe fn const_align_offset<T: Sized>(addr: usize, a: usize) -> usize {
+ #[lang = "align_offset"]
+ #[cfg(not(bootstrap))]
+ pub(crate) const unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
    // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
    // 1, where the method versions of these operations are not inlined.
    use intrinsics::{
@@ -1650,6 +1629,171 @@ pub(crate) const unsafe fn const_align_offset<T: Sized>(addr: usize, a: usize) -> usize {
        }
    }

+     let stride = mem::size_of::<T>();
+
+     // SAFETY: At runtime transmuting a pointer to `usize` is always safe, because they have the
+     // same layout. During const eval we hook this function to ensure that the pointer always has
+     // an address (only the standard library can do this).
+     let addr = unsafe { mem::transmute(p) };
+
+     // SAFETY: `a` is a power-of-two, therefore non-zero.
+     let a_minus_one = unsafe { unchecked_sub(a, 1) };
+
+     if stride == 0 {
+         // SPECIAL_CASE: handle 0-sized types. No matter how many times we step, the address will
+         // stay the same, so no offset will be able to align the pointer unless it is already
+         // aligned. This branch _will_ be optimized out as `stride` is known at compile-time.
+         let p_mod_a = addr & a_minus_one;
+         return if p_mod_a == 0 { 0 } else { usize::MAX };
+     }
+
+     // SAFETY: the `stride == 0` case has been handled by the special case above.
+     let a_mod_stride = unsafe { unchecked_rem(a, stride) };
+     if a_mod_stride == 0 {
+         // SPECIAL_CASE: In cases where `a` is divisible by `stride`, the byte offset to align a
+         // pointer can be computed more simply through `-p (mod a)`. In the off-chance the byte
+         // offset is not a multiple of `stride`, the input pointer was misaligned and no pointer
+         // offset will be able to produce a `p` aligned to the specified `a`.
+         //
+         // The naive `-p (mod a)` equation inhibits LLVM's ability to select instructions
+         // like `lea`. We compute `(round_up_to_next_alignment(p, a) - p)` instead. This
+         // redistributes operations around the load-bearing, but pessimizing `and` instruction
+         // sufficiently for LLVM to be able to utilize the various optimizations it knows about.
+         //
+         // LLVM handles the branch here particularly nicely. If this branch needs to be evaluated
+         // at runtime, it will produce a mask `if addr_mod_stride == 0 { 0 } else { usize::MAX }`
+         // in a branch-free way and then bitwise-OR it with whatever result the `-p mod a`
+         // computation produces.

+         // SAFETY: the `stride == 0` case has been handled by the special case above.
+         let addr_mod_stride = unsafe { unchecked_rem(addr, stride) };
+
+         return if addr_mod_stride == 0 {
+             let aligned_address = wrapping_add(addr, a_minus_one) & wrapping_sub(0, a);
+             let byte_offset = wrapping_sub(aligned_address, addr);
+             // SAFETY: `stride` is non-zero. This is guaranteed to divide exactly as well, because
+             // `addr` has been verified to be aligned to the original type's alignment requirements.
+             unsafe { exact_div(byte_offset, stride) }
+         } else {
+             usize::MAX
+         };
+     }
+
+     // GENERAL_CASE: From here on we're handling the very general case where `addr` may be
+     // misaligned, there isn't an obvious relationship between `stride` and `a` that we can take
+     // advantage of, etc. This case produces machine code that isn't particularly high quality,
+     // compared to the special cases above. The code produced here is still within the realm of
+     // miracles, given the situations this case has to deal with.
+
+     // SAFETY: `a` is a power-of-two, hence non-zero. The `stride == 0` case is handled above.
+     let gcdpow = unsafe { cttz_nonzero(stride).min(cttz_nonzero(a)) };
+     // SAFETY: `gcdpow` has an upper-bound that's at most the number of bits in a `usize`.
+     let gcd = unsafe { unchecked_shl(1usize, gcdpow) };
+     // SAFETY: `gcd` is always greater or equal to 1.
+     if addr & unsafe { unchecked_sub(gcd, 1) } == 0 {
+         // This branch solves for the following linear congruence equation:
+         //
+         // ` p + so = 0 mod a `
+         //
+         // `p` here is the pointer value, `s` - stride of `T`, `o` offset in `T`s, and `a` - the
+         // requested alignment.
+         //
+         // With `g = gcd(a, s)`, and the above condition asserting that `p` is also divisible by
+         // `g`, we can denote `a' = a/g`, `s' = s/g`, `p' = p/g`, then this becomes equivalent to:
+         //
+         // ` p' + s'o = 0 mod a' `
+         // ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
+         //
+         // The first term is "the relative alignment of `p` to `a`" (divided by the `g`), the
+         // second term is "how does incrementing `p` by `s` bytes change the relative alignment of
+         // `p`" (again divided by `g`). Division by `g` is necessary to make the inverse well
+         // formed if `a` and `s` are not co-prime.
+         //
+         // Furthermore, the result produced by this solution is not "minimal", so it is necessary
+         // to take the result `o mod lcm(s, a)`. This `lcm(s, a)` is the same as `a'`.

+         // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
+         // `a`.
+         let a2 = unsafe { unchecked_shr(a, gcdpow) };
+         // SAFETY: `a2` is non-zero. Shifting `a` by `gcdpow` cannot shift out any of the set bits
+         // in `a` (of which it has exactly one).
+         let a2minus1 = unsafe { unchecked_sub(a2, 1) };
+         // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
+         // `a`.
+         let s2 = unsafe { unchecked_shr(stride & a_minus_one, gcdpow) };
+         // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
+         // `a`. Furthermore, the subtraction cannot overflow, because `a2 = a >> gcdpow` will
+         // always be strictly greater than `(p % a) >> gcdpow`.
+         let minusp2 = unsafe { unchecked_sub(a2, unchecked_shr(addr & a_minus_one, gcdpow)) };
+         // SAFETY: `a2` is a power-of-two, as proven above. `s2` is strictly less than `a2`
+         // because `(s % a) >> gcdpow` is strictly less than `a >> gcdpow`.
+         return wrapping_mul(minusp2, unsafe { mod_inv(s2, a2) }) & a2minus1;
+     }
+
+     // Cannot be aligned at all.
+     usize::MAX
+ }
+
+ #[lang = "align_offset"]
+ #[cfg(bootstrap)]
+ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
+     // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
+     // 1, where the method versions of these operations are not inlined.
+     use intrinsics::{
+         cttz_nonzero, exact_div, unchecked_rem, unchecked_shl, unchecked_shr, unchecked_sub,
+         wrapping_add, wrapping_mul, wrapping_sub,
+     };
+
+     /// Calculate the multiplicative modular inverse of `x` modulo `m`.
+     ///
+     /// This implementation is tailored for `align_offset` and has the following preconditions:
+     ///
+     /// * `m` is a power-of-two;
+     /// * `x < m`; (if `x ≥ m`, pass in `x % m` instead)
+     ///
+     /// Implementation of this function shall not panic. Ever.
+     #[inline]
+     unsafe fn mod_inv(x: usize, m: usize) -> usize {
+         /// Multiplicative modular inverse table modulo 2⁴ = 16.
+         ///
+         /// Note that this table does not contain values where the inverse does not exist (i.e.,
+         /// for `0⁻¹ mod 16`, `2⁻¹ mod 16`, etc.)
+         const INV_TABLE_MOD_16: [u8; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
+         /// Modulo for which the `INV_TABLE_MOD_16` is intended.
+         const INV_TABLE_MOD: usize = 16;
+         /// INV_TABLE_MOD²
+         const INV_TABLE_MOD_SQUARED: usize = INV_TABLE_MOD * INV_TABLE_MOD;
+
+         let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;
+         // SAFETY: `m` is required to be a power-of-two, hence non-zero.
+         let m_minus_one = unsafe { unchecked_sub(m, 1) };
+         if m <= INV_TABLE_MOD {
+             table_inverse & m_minus_one
+         } else {
+             // We iterate "up" using the following formula:
+             //
+             // $$ xy ≡ 1 (mod 2ⁿ) → xy (2 - xy) ≡ 1 (mod 2²ⁿ) $$
+             //
+             // until 2²ⁿ ≥ m. Then we can reduce to our desired `m` by taking the result `mod m`.
+             let mut inverse = table_inverse;
+             let mut going_mod = INV_TABLE_MOD_SQUARED;
+             loop {
+                 // y = y * (2 - xy) mod n
+                 //
+                 // Note that we use wrapping operations here intentionally – the original formula
+                 // uses, e.g., subtraction `mod n`. It is entirely fine to do them `mod
+                 // usize::MAX` instead, because we take the result `mod n` at the end anyway.
+                 inverse = wrapping_mul(inverse, wrapping_sub(2usize, wrapping_mul(x, inverse)));
+                 if going_mod >= m {
+                     return inverse & m_minus_one;
+                 }
+                 going_mod = wrapping_mul(going_mod, going_mod);
+             }
+         }
+     }
+
+     let addr = p.addr();
    let stride = mem::size_of::<T>();
    // SAFETY: `a` is a power-of-two, therefore non-zero.
    let a_minus_one = unsafe { unchecked_sub(a, 1) };
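
As a usage sketch (not part of this commit), the cases above can be observed through the stable `<*const T>::align_offset` method, which bottoms out in this intrinsic. The documentation permits returning `usize::MAX` even when alignment would be possible, so the sketch only checks the modular property when a finite offset comes back:

fn main() {
    // `a % stride == 0` special case: `u16` has stride 2 and the requested
    // alignment 4 is a multiple of it.
    let x = [0u16; 8];
    let p = x.as_ptr();
    let offset = p.align_offset(4); // counted in elements, not bytes
    if offset != usize::MAX {
        assert_eq!(p.wrapping_add(offset) as usize % 4, 0);
    }

    // General case: `[u16; 3]` has stride 6, which shares only a factor of 2
    // with alignment 8. For an address of 2: a' = 4, s' = 3, 3⁻¹ ≡ 3 (mod 4),
    // so o = ((4 - 1) * 3) mod 4 = 1, and 2 + 6 * 1 = 8 is aligned.
    let q = 2usize as *const [u16; 3];
    let o = q.align_offset(8);
    if o != usize::MAX {
        assert_eq!((q as usize).wrapping_add(o * 6) % 8, 0);
    }

    // `stride == 0` special case: a misaligned pointer to a zero-sized type
    // can never be aligned by stepping, so `usize::MAX` is the only answer.
    let z = 1usize as *const ();
    assert_eq!(z.align_offset(2), usize::MAX);
}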
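For illustration, here is a safe-Rust port of the `mod_inv` shown above (a sketch, not the commit's code: the intrinsics are replaced with the corresponding `wrapping_*` methods, and the demo sticks to small moduli so `going_mod` stays well away from overflow). It demonstrates the precision-doubling formula from the comments:

fn mod_inv(x: usize, m: usize) -> usize {
    // Preconditions from the original: `m` is a power of two and `x < m` is odd.
    const INV_TABLE_MOD_16: [usize; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
    let table_inverse = INV_TABLE_MOD_16[(x & 15) >> 1];
    if m <= 16 {
        return table_inverse & (m - 1);
    }
    let mut inverse = table_inverse;
    let mut going_mod = 256; // 16²
    loop {
        // If x·inverse ≡ 1 (mod 2ⁿ), the next value satisfies x·inverse ≡ 1 (mod 2²ⁿ).
        inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
        if going_mod >= m {
            return inverse & (m - 1);
        }
        going_mod = going_mod.wrapping_mul(going_mod);
    }
}

fn main() {
    // Worked instance: the table gives 3⁻¹ ≡ 11 (mod 16); one lifting step gives
    // 11·(2 − 3·11) ≡ 171 (mod 256), and indeed 3·171 = 513 = 2·256 + 1.
    assert_eq!(mod_inv(3, 256), 171);
    assert_eq!(3 * mod_inv(3, 1 << 16) % (1 << 16), 1);
}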