Skip to content

Commit 4838746

Browse files
committed
WIP
1 parent f3316bd commit 4838746

File tree

1 file changed

+103
-96
lines changed

1 file changed

+103
-96
lines changed

src/float/div.rs

Lines changed: 103 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,12 @@ Division routines must solve for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`
2020
`res = (m_a / m_b) * 2^{p_a - p_b}`
2121
- Check for early exits (infinity, zero, etc).
2222
- If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
23+
- Set the implicit bit so math is correct.
2324
- Shift the significand (with implicit bit) fully left so that arithmetic can happen with
2425
greater precision.
25-
- Calculate the reciprocal of `b`, `x`
26-
- Multiply: `res = m_a * x_b * 2^{p_a - p_b}`
27-
- Reapply rounding
26+
- Calculate the reciprocal of `b`, `x`.
27+
- Multiply: `res = m_a * x_b * 2^{p_a - p_b}`.
28+
- Reapply rounding.
2829
2930
The reciprocal and multiplication steps must happen with more bits of precision than the
3031
mantissa has; otherwise, precision would be lost rounding at each step. It is sufficient to use
@@ -49,7 +50,6 @@ x_{n+1} = x_n - f(x_n) / f'(x_n)
4950
5051
Applying this to finding the reciprocal:
5152
52-
5353
```text
5454
1 / x = b
5555
@@ -75,6 +75,14 @@ use crate::float::Float;
7575
use crate::int::{CastFrom, CastInto, DInt, HInt, Int, MinInt};
7676
use core::mem::size_of;
7777

78+
macro_rules! guess {
79+
($ty:ty) => {
80+
const { (INITIAL_GUESS >> (u128::BITS - <$ty>::BITS)) as $ty }
81+
};
82+
}
83+
84+
const INITIAL_GUESS: u128 = 0x7504f333f9de6108b2fb1366eaa6a542;
85+
7886
/// Type-specific configuration used for float division
7987
trait FloatDivision: Float
8088
where
@@ -171,7 +179,9 @@ impl FloatDivision for f32 {
171179
/// for float32 division. This is expected to be useful for some 16-bit
172180
/// targets. Not used by default as it requires performing more work during
173181
/// rounding and would hardly help on regular 32- or 64-bit targets.
174-
const C_HW: HalfRep<Self> = 0x7504;
182+
const C_HW: HalfRep<Self> = guess!(HalfRep<Self>);
183+
184+
// 0x7504;
175185
}
176186

177187
impl FloatDivision for f64 {
@@ -185,13 +195,16 @@ impl FloatDivision for f64 {
185195
impl FloatDivision for f128 {
186196
const HALF_ITERATIONS: usize = 4;
187197

188-
const C_HW: HalfRep<Self> = 0x7504F333 << (HalfRep::<Self>::BITS - 32);
198+
// const C_HW: HalfRep<Self> = 0x7504F333 << (HalfRep::<Self>::BITS - 32);
199+
const C_HW: HalfRep<Self> = 0x7504f333f9de6108;
189200
}
190201

191202
extern crate std;
192203
#[allow(unused)]
193204
use std::{dbg, fmt, println};
194205

206+
// TODO: try adding const where possible
207+
195208
fn div<F>(a: F, b: F) -> F
196209
where
197210
F: FloatDivision,
@@ -332,7 +345,8 @@ where
332345
b_significand,
333346
);
334347

335-
// Transform to a fixed-point representation
348+
// Transform to a fixed-point representation. We know this is in the range [1.0, 2.0) since
349+
// the implicit bit has been set explicitly.
336350
let b_uq1 = b_significand << (F::BITS - significand_bits - 1);
337351

338352
println!("b_uq1: {:#034x}", b_uq1);
@@ -384,94 +398,92 @@ where
384398
// b/2 is subtracted to obtain x0) wrapped to [0, 1) range.
385399
let c_hw = F::C_HW;
386400

401+
debug_assert!(
402+
b_uq1_hw & HalfRep::<F>::ONE << (HalfRep::<F>::BITS - 1) > HalfRep::<F>::ZERO
403+
);
387404
// b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572,
388405
// so x0 fits to UQ0.HW without wrapping.
389-
let x_uq0_hw: HalfRep<F> = {
390-
let mut x_uq0_hw: HalfRep<F> =
391-
c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */);
392-
393-
// An e_0 error is comprised of errors due to
394-
// * x0 being an inherently imprecise first approximation of 1/b_hw
395-
// * C_hw being some (irrational) number **truncated** to W0 bits
396-
// Please note that e_0 is calculated against the infinitely precise
397-
// reciprocal of b_hw (that is, **truncated** version of b).
398-
//
399-
// e_0 <= 3/4 - 1/sqrt(2) + 2^-W0
400-
//
401-
// By construction, 1 <= b < 2
402-
// f(x) = x * (2 - b*x) = 2*x - b*x^2
403-
// f'(x) = 2 * (1 - b*x)
404-
//
405-
// On the [0, 1] interval, f(0) = 0,
406-
// then it increses until f(1/b) = 1 / b, maximum on (0, 1),
407-
// then it decreses to f(1) = 2 - b
408-
//
409-
// Let g(x) = x - f(x) = b*x^2 - x.
410-
// On (0, 1/b), g(x) < 0 <=> f(x) > x
411-
// On (1/b, 1], g(x) > 0 <=> f(x) < x
412-
//
413-
// For half-width iterations, b_hw is used instead of b.
414-
for _ in 0..half_iterations {
415-
// corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
416-
// of corr_UQ1_hw.
417-
// "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
418-
// On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided
419-
// no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is
420-
// expected to be strictly positive because b_UQ1_hw has its highest bit set
421-
// and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1).
422-
let corr_uq1_hw: HalfRep<F> = zero
423-
.wrapping_sub(
424-
(F::Int::from(x_uq0_hw).wrapping_mul(F::Int::from(b_uq1_hw))) >> hw,
425-
)
426-
.cast();
427-
428-
// Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally
429-
// obtaining an UQ1.(HW-1) number and proving its highest bit could be
430-
// considered to be 0 to be able to represent it in UQ0.HW.
431-
// From the above analysis of f(x), if corr_UQ1_hw would be represented
432-
// without any intermediate loss of precision (that is, in twice_rep_t)
433-
// x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly
434-
// less otherwise. On the other hand, to obtain [1.]000..., one have to pass
435-
// 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due
436-
// to 1.0 being not representable as UQ0.HW).
437-
// The fact corr_UQ1_hw was virtually round up (due to result of
438-
// multiplication being **first** truncated, then negated - to improve
439-
// error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw.
440-
x_uq0_hw = (F::Int::from(x_uq0_hw).wrapping_mul(F::Int::from(corr_uq1_hw))
441-
>> (hw - 1))
442-
.cast();
443-
444-
// Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t
445-
// representation. In the latter case, x_UQ0_hw will be either 0 or 1 after
446-
// any number of iterations, so just subtract 2 from the reciprocal
447-
// approximation after last iteration.
448-
//
449-
// In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW:
450-
// corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1
451-
// = 1 - e_n * b_hw + 2*eps1
452-
// x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2
453-
// = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2
454-
// = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2
455-
// e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2
456-
// = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw
457-
// \------ >0 -------/ \-- >0 ---/
458-
// abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U)
459-
}
406+
let mut x_uq0_hw: HalfRep<F> =
407+
c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */);
408+
409+
// An e_0 error is comprised of errors due to
410+
// * x0 being an inherently imprecise first approximation of 1/b_hw
411+
// * C_hw being some (irrational) number **truncated** to W0 bits
412+
// Please note that e_0 is calculated against the infinitely precise
413+
// reciprocal of b_hw (that is, **truncated** version of b).
414+
//
415+
// e_0 <= 3/4 - 1/sqrt(2) + 2^-W0
416+
//
417+
// By construction, 1 <= b < 2
418+
// f(x) = x * (2 - b*x) = 2*x - b*x^2
419+
// f'(x) = 2 * (1 - b*x)
420+
//
421+
// On the [0, 1] interval, f(0) = 0,
422+
// then it increases until f(1/b) = 1 / b, maximum on (0, 1),
423+
// then it decreases to f(1) = 2 - b
424+
//
425+
// Let g(x) = x - f(x) = b*x^2 - x.
426+
// On (0, 1/b), g(x) < 0 <=> f(x) > x
427+
// On (1/b, 1], g(x) > 0 <=> f(x) < x
428+
//
429+
// For half-width iterations, b_hw is used instead of b.
430+
for _ in 0..half_iterations {
431+
// corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
432+
// of corr_UQ1_hw.
433+
// "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
434+
// On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided
435+
// no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is
436+
// expected to be strictly positive because b_UQ1_hw has its highest bit set
437+
// and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1).
438+
let corr_uq1_hw: HalfRep<F> = zero
439+
.wrapping_sub((F::Int::from(x_uq0_hw).wrapping_mul(F::Int::from(b_uq1_hw))) >> hw)
440+
.cast();
460441

461-
// For initial half-width iterations, U = 2^-HW
462-
// Let abs(e_n) <= u_n * U,
463-
// then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U)
464-
// u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2)
442+
// Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally
443+
// obtaining an UQ1.(HW-1) number and proving its highest bit could be
444+
// considered to be 0 to be able to represent it in UQ0.HW.
445+
// From the above analysis of f(x), if corr_UQ1_hw would be represented
446+
// without any intermediate loss of precision (that is, in twice_rep_t)
447+
// x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly
448+
// less otherwise. On the other hand, to obtain [1.]000..., one has to pass
449+
// 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due
450+
// to 1.0 being not representable as UQ0.HW).
451+
// The fact that corr_UQ1_hw was virtually rounded up (due to the result of
452+
// multiplication being **first** truncated, then negated - to improve
453+
// error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw.
454+
x_uq0_hw =
455+
(F::Int::from(x_uq0_hw).wrapping_mul(F::Int::from(corr_uq1_hw)) >> (hw - 1)).cast();
456+
457+
// Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t
458+
// representation. In the latter case, x_UQ0_hw will be either 0 or 1 after
459+
// any number of iterations, so just subtract 2 from the reciprocal
460+
// approximation after last iteration.
465461
//
466-
// Account for possible overflow (see above). For an overflow to occur for the
467-
// first time, for "ideal" corr_UQ1_hw (that is, without intermediate
468-
// truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either maximum
469-
// value representable in UQ0.HW or less by 1. This means that 1/b_hw have to
470-
// be not below that value (see g(x) above), so it is safe to decrement just
471-
// once after the final iteration. On the other hand, an effective value of
472-
// divisor changes after this point (from b_hw to b), so adjust here.
473-
x_uq0_hw.wrapping_sub(HalfRep::<F>::ONE)
474-
};
462+
// In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW:
463+
// corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1
464+
// = 1 - e_n * b_hw + 2*eps1
465+
// x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2
466+
// = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2
467+
// = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2
468+
// e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2
469+
// = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw
470+
// \------ >0 -------/ \-- >0 ---/
471+
// abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U)
472+
}
473+
474+
// For initial half-width iterations, U = 2^-HW
475+
// Let abs(e_n) <= u_n * U,
476+
// then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U)
477+
// u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2)
478+
//
479+
// Account for possible overflow (see above). For an overflow to occur for the
480+
// first time, for "ideal" corr_UQ1_hw (that is, without intermediate
481+
// truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either maximum
482+
// value representable in UQ0.HW or less by 1. This means that 1/b_hw has to
483+
// be not below that value (see g(x) above), so it is safe to decrement just
484+
// once after the final iteration. On the other hand, an effective value of
485+
// divisor changes after this point (from b_hw to b), so adjust here.
486+
x_uq0_hw = x_uq0_hw.wrapping_sub(HalfRep::<F>::ONE);
475487

476488
// Error estimations for full-precision iterations are calculated just
477489
// as above, but with U := 2^-W and taking extra decrementing into account.
@@ -544,11 +556,6 @@ where
544556
as u32)
545557
.cast();
546558
}
547-
} else {
548-
assert!(
549-
F::BITS != 32,
550-
"native full iterations onlydoaijfoisd supports f32"
551-
);
552559
}
553560

554561
// Finally, account for possible overflow, as explained above.

0 commit comments

Comments
 (0)