@@ -20,11 +20,12 @@ Division routines must solve for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`
20
20
`res = (m_a / m_b) * 2^{p_a - p_b}`
21
21
- Check for early exits (infinity, zero, etc).
22
22
- If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
23
+ - Set the implicit bit so math is correct.
23
24
- Shift the significand (with implicit bit) fully left so that arithmetic can happen with
24
25
greater precision.
25
- - Calculate the reciprocal of `b`, `x`
26
- - Multiply: `res = m_a * x_b * 2^{p_a - p_b}`
27
- - Reapply rounding
26
+ - Calculate the reciprocal of `b`, `x_b`.
27
+ - Multiply: `res = m_a * x_b * 2^{p_a - p_b}`.
28
+ - Reapply rounding.
28
29
29
30
The reciprocal and multiplication steps must happen with more bits of precision than the
30
31
mantissa has; otherwise, precision would be lost rounding at each step. It is sufficient to use
@@ -49,7 +50,6 @@ x_{n+1} = x_n - f(x_n) / f'(x_n)
49
50
50
51
Applying this to finding the reciprocal:
51
52
52
-
53
53
```text
54
54
1 / x = b
55
55
@@ -75,6 +75,14 @@ use crate::float::Float;
75
75
use crate :: int:: { CastFrom , CastInto , DInt , HInt , Int , MinInt } ;
76
76
use core:: mem:: size_of;
77
77
78
+ macro_rules! guess {
79
+ ( $ty: ty) => {
80
+ const { ( INITIAL_GUESS >> ( u128 :: BITS - <$ty>:: BITS ) ) as $ty }
81
+ } ;
82
+ }
83
+
84
+ const INITIAL_GUESS : u128 = 0x7504f333f9de6108b2fb1366eaa6a542 ;
85
+
78
86
/// Type-specific configuration used for float division
79
87
trait FloatDivision : Float
80
88
where
@@ -171,7 +179,9 @@ impl FloatDivision for f32 {
171
179
/// for float32 division. This is expected to be useful for some 16-bit
172
180
/// targets. Not used by default as it requires performing more work during
173
181
/// rounding and would hardly help on regular 32- or 64-bit targets.
174
- const C_HW : HalfRep < Self > = 0x7504 ;
182
+ const C_HW : HalfRep < Self > = guess ! ( HalfRep <Self >) ;
183
+
184
+ // 0x7504;
175
185
}
176
186
177
187
impl FloatDivision for f64 {
@@ -185,13 +195,16 @@ impl FloatDivision for f64 {
185
195
impl FloatDivision for f128 {
186
196
const HALF_ITERATIONS : usize = 4 ;
187
197
188
- const C_HW : HalfRep < Self > = 0x7504F333 << ( HalfRep :: < Self > :: BITS - 32 ) ;
198
+ // const C_HW: HalfRep<Self> = 0x7504F333 << (HalfRep::<Self>::BITS - 32);
199
+ const C_HW : HalfRep < Self > = 0x7504f333f9de6108 ;
189
200
}
190
201
191
202
extern crate std;
192
203
#[ allow( unused) ]
193
204
use std:: { dbg, fmt, println} ;
194
205
206
+ // TODO: try adding const where possible
207
+
195
208
fn div < F > ( a : F , b : F ) -> F
196
209
where
197
210
F : FloatDivision ,
@@ -332,7 +345,8 @@ where
332
345
b_significand,
333
346
) ;
334
347
335
- // Transform to a fixed-point representation
348
+ // Transform to a fixed-point representation. We know this is in the range [1.0, 2.0) since
349
+ // the implicit bit is set.
336
350
let b_uq1 = b_significand << ( F :: BITS - significand_bits - 1 ) ;
337
351
338
352
println ! ( "b_uq1: {:#034x}" , b_uq1) ;
@@ -384,94 +398,92 @@ where
384
398
// b/2 is subtracted to obtain x0) wrapped to [0, 1) range.
385
399
let c_hw = F :: C_HW ;
386
400
401
+ debug_assert ! (
402
+ b_uq1_hw & HalfRep :: <F >:: ONE << ( HalfRep :: <F >:: BITS - 1 ) > HalfRep :: <F >:: ZERO
403
+ ) ;
387
404
// b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572,
388
405
// so x0 fits to UQ0.HW without wrapping.
389
- let x_uq0_hw: HalfRep < F > = {
390
- let mut x_uq0_hw: HalfRep < F > =
391
- c_hw. wrapping_sub ( b_uq1_hw /* exact b_hw/2 as UQ0.HW */ ) ;
392
-
393
- // An e_0 error is comprised of errors due to
394
- // * x0 being an inherently imprecise first approximation of 1/b_hw
395
- // * C_hw being some (irrational) number **truncated** to W0 bits
396
- // Please note that e_0 is calculated against the infinitely precise
397
- // reciprocal of b_hw (that is, **truncated** version of b).
398
- //
399
- // e_0 <= 3/4 - 1/sqrt(2) + 2^-W0
400
- //
401
- // By construction, 1 <= b < 2
402
- // f(x) = x * (2 - b*x) = 2*x - b*x^2
403
- // f'(x) = 2 * (1 - b*x)
404
- //
405
- // On the [0, 1] interval, f(0) = 0,
406
- // then it increses until f(1/b) = 1 / b, maximum on (0, 1),
407
- // then it decreses to f(1) = 2 - b
408
- //
409
- // Let g(x) = x - f(x) = b*x^2 - x.
410
- // On (0, 1/b), g(x) < 0 <=> f(x) > x
411
- // On (1/b, 1], g(x) > 0 <=> f(x) < x
412
- //
413
- // For half-width iterations, b_hw is used instead of b.
414
- for _ in 0 ..half_iterations {
415
- // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
416
- // of corr_UQ1_hw.
417
- // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
418
- // On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided
419
- // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is
420
- // expected to be strictly positive because b_UQ1_hw has its highest bit set
421
- // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1).
422
- let corr_uq1_hw: HalfRep < F > = zero
423
- . wrapping_sub (
424
- ( F :: Int :: from ( x_uq0_hw) . wrapping_mul ( F :: Int :: from ( b_uq1_hw) ) ) >> hw,
425
- )
426
- . cast ( ) ;
427
-
428
- // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally
429
- // obtaining an UQ1.(HW-1) number and proving its highest bit could be
430
- // considered to be 0 to be able to represent it in UQ0.HW.
431
- // From the above analysis of f(x), if corr_UQ1_hw would be represented
432
- // without any intermediate loss of precision (that is, in twice_rep_t)
433
- // x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly
434
- // less otherwise. On the other hand, to obtain [1.]000..., one have to pass
435
- // 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due
436
- // to 1.0 being not representable as UQ0.HW).
437
- // The fact corr_UQ1_hw was virtually round up (due to result of
438
- // multiplication being **first** truncated, then negated - to improve
439
- // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw.
440
- x_uq0_hw = ( F :: Int :: from ( x_uq0_hw) . wrapping_mul ( F :: Int :: from ( corr_uq1_hw) )
441
- >> ( hw - 1 ) )
442
- . cast ( ) ;
443
-
444
- // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t
445
- // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after
446
- // any number of iterations, so just subtract 2 from the reciprocal
447
- // approximation after last iteration.
448
- //
449
- // In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW:
450
- // corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1
451
- // = 1 - e_n * b_hw + 2*eps1
452
- // x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2
453
- // = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2
454
- // = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2
455
- // e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2
456
- // = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw
457
- // \------ >0 -------/ \-- >0 ---/
458
- // abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U)
459
- }
406
+ let mut x_uq0_hw: HalfRep < F > =
407
+ c_hw. wrapping_sub ( b_uq1_hw /* exact b_hw/2 as UQ0.HW */ ) ;
408
+
409
+ // An e_0 error is comprised of errors due to
410
+ // * x0 being an inherently imprecise first approximation of 1/b_hw
411
+ // * C_hw being some (irrational) number **truncated** to W0 bits
412
+ // Please note that e_0 is calculated against the infinitely precise
413
+ // reciprocal of b_hw (that is, **truncated** version of b).
414
+ //
415
+ // e_0 <= 3/4 - 1/sqrt(2) + 2^-W0
416
+ //
417
+ // By construction, 1 <= b < 2
418
+ // f(x) = x * (2 - b*x) = 2*x - b*x^2
419
+ // f'(x) = 2 * (1 - b*x)
420
+ //
421
+ // On the [0, 1] interval, f(0) = 0,
422
+ // then it increases until f(1/b) = 1 / b, maximum on (0, 1),
423
+ // then it decreases to f(1) = 2 - b
424
+ //
425
+ // Let g(x) = x - f(x) = b*x^2 - x.
426
+ // On (0, 1/b), g(x) < 0 <=> f(x) > x
427
+ // On (1/b, 1], g(x) > 0 <=> f(x) < x
428
+ //
429
+ // For half-width iterations, b_hw is used instead of b.
430
+ for _ in 0 ..half_iterations {
431
+ // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
432
+ // of corr_UQ1_hw.
433
+ // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
434
+ // On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided
435
+ // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is
436
+ // expected to be strictly positive because b_UQ1_hw has its highest bit set
437
+ // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1).
438
+ let corr_uq1_hw: HalfRep < F > = zero
439
+ . wrapping_sub ( ( F :: Int :: from ( x_uq0_hw) . wrapping_mul ( F :: Int :: from ( b_uq1_hw) ) ) >> hw)
440
+ . cast ( ) ;
460
441
461
- // For initial half-width iterations, U = 2^-HW
462
- // Let abs(e_n) <= u_n * U,
463
- // then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U)
464
- // u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2)
442
+ // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally
443
+ // obtaining an UQ1.(HW-1) number and proving its highest bit could be
444
+ // considered to be 0 to be able to represent it in UQ0.HW.
445
+ // From the above analysis of f(x), if corr_UQ1_hw would be represented
446
+ // without any intermediate loss of precision (that is, in twice_rep_t)
447
+ // x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly
448
+ // less otherwise. On the other hand, to obtain [1.]000..., one has to pass
449
+ // 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due
450
+ // to 1.0 being not representable as UQ0.HW).
451
+ // The fact that corr_UQ1_hw was virtually rounded up (due to the result of
452
+ // multiplication being **first** truncated, then negated - to improve
453
+ // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw.
454
+ x_uq0_hw =
455
+ ( F :: Int :: from ( x_uq0_hw) . wrapping_mul ( F :: Int :: from ( corr_uq1_hw) ) >> ( hw - 1 ) ) . cast ( ) ;
456
+
457
+ // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t
458
+ // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after
459
+ // any number of iterations, so just subtract 2 from the reciprocal
460
+ // approximation after last iteration.
465
461
//
466
- // Account for possible overflow (see above). For an overflow to occur for the
467
- // first time, for "ideal" corr_UQ1_hw (that is, without intermediate
468
- // truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either maximum
469
- // value representable in UQ0.HW or less by 1. This means that 1/b_hw have to
470
- // be not below that value (see g(x) above), so it is safe to decrement just
471
- // once after the final iteration. On the other hand, an effective value of
472
- // divisor changes after this point (from b_hw to b), so adjust here.
473
- x_uq0_hw. wrapping_sub ( HalfRep :: < F > :: ONE )
474
- } ;
462
+ // In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW:
463
+ // corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1
464
+ // = 1 - e_n * b_hw + 2*eps1
465
+ // x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2
466
+ // = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2
467
+ // = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2
468
+ // e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2
469
+ // = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw
470
+ // \------ >0 -------/ \-- >0 ---/
471
+ // abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U)
472
+ }
473
+
474
+ // For initial half-width iterations, U = 2^-HW
475
+ // Let abs(e_n) <= u_n * U,
476
+ // then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U)
477
+ // u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2)
478
+ //
479
+ // Account for possible overflow (see above). For an overflow to occur for the
480
+ // first time, for "ideal" corr_UQ1_hw (that is, without intermediate
481
+ // truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either maximum
482
+ // value representable in UQ0.HW or less by 1. This means that 1/b_hw has to
483
+ // be not below that value (see g(x) above), so it is safe to decrement just
484
+ // once after the final iteration. On the other hand, an effective value of
485
+ // divisor changes after this point (from b_hw to b), so adjust here.
486
+ x_uq0_hw = x_uq0_hw. wrapping_sub ( HalfRep :: < F > :: ONE ) ;
475
487
476
488
// Error estimations for full-precision iterations are calculated just
477
489
// as above, but with U := 2^-W and taking extra decrementing into account.
@@ -544,11 +556,6 @@ where
544
556
as u32 )
545
557
. cast ( ) ;
546
558
}
547
- } else {
548
- assert ! (
549
- F :: BITS != 32 ,
550
- "native full iterations onlydoaijfoisd supports f32"
551
- ) ;
552
559
}
553
560
554
561
// Finally, account for possible overflow, as explained above.
0 commit comments