1
1
/*
2
+ Floating point division routines.
2
3
3
- Solving for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`.
4
+ This module documentation gives an overview of the method used. More documentation is inline.
5
+
6
+ Relevant notation:
7
+
8
+ - `m_a`: the mantissa of `a`, in base 2
9
+ - `p_a`: the exponent of `a`, in base 2. I.e. `a = m_a * 2^p_a`
10
+ - `uqN` (e.g. `uq1`): this refers to Q notation for fixed-point numbers. UQ1.31 is an unsigned
11
+ fixed-point number with 1 integral bit, and 31 decimal bits. A `uqN` variable of type `uM`
12
+ will have N bits of integer and M-N bits of fraction.
13
+ - `hw`: half width, i.e. for `f64` this will be a `u32`.
14
+
15
+ # Method Overview
16
+
17
+ Division routines must solve for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`. Process:
4
18
5
19
- Separate the exponent and significand
6
- `res = (m_a / m_b) * 2^( p_a - p_b) `
7
- - Check for early exits
20
+ `res = (m_a / m_b) * 2^{ p_a - p_b} `
21
+ - Check for early exits (infinity, zero, etc).
8
22
- If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
9
- - Shift the significand (with implicit bit) fully left so that arithmetic can happen with greater
10
- precision.
11
- - Calculate the reciprocal of `b`, `r`
12
- - Multiply: `res = m_a * r_b * 2^(p_a - p_b)`
23
+ - Shift the significand (with implicit bit) fully left so that arithmetic can happen with
24
+ greater precision.
25
+ - Calculate the reciprocal of `b`, `x`
26
+ - Multiply: `res = m_a * x_b * 2^{p_a - p_b}`
27
+ - Reapply rounding
28
+
29
+ The reciprocal and multiplication steps must happen with more bits of precision than the
30
+ mantissa has; otherwise, precision would be lost rounding at each step. It is sufficient to use
31
+ the float's same-sized integer.
32
+
33
+ # Reciprocal calculation
34
+
35
+ Calculating the reciprocal is the most complicated part of this process. It uses the
36
+ [Newton-Raphson method], which picks an initial estimation (of the reciprocal) and performs
37
+ a number of iterations to increase its precision.
38
+
39
+ In general, Newton's method takes the following form:
40
+
41
+ ```
42
+ `x_n` is a guess or the result of a previous iteration. Increasing `n` converges to the
43
+ desired result.
44
+
45
+ The result is a zero of `f(x)`.
46
+
47
+ x_{n+1} = x_n - f(x_n) / f'(x_n)
48
+ ```
49
+
50
+ Applying this to finding the reciprocal:
51
+
52
+
53
+ ```text
54
+ 1 / x = b
55
+
56
+ Rearrange so we can solve by finding a zero
57
+ 0 = (1 / x) - b = f(x)
58
+
59
+ f'(x) = -x^{-2}
60
+
61
+ x_{n+1} = 2*x_n - b*x_n^2
62
+
63
+
64
+ ```
65
+
66
+ [Newton-Raphson method]: https://en.wikipedia.org/wiki/Newton%27s_method
13
67
14
68
The most complicated part of this process is calculating the reciprocal.
15
69
16
- Note that variables named e.g. `uq0` refer to Q notation. E.g. Q1.31 refers to a fixed-point
17
- number that has 1 bit of integer and 31 bits of decimal.
18
70
19
71
*/
20
72
73
+ use super :: HalfRep ;
21
74
use crate :: float:: Float ;
22
75
use crate :: int:: { CastFrom , CastInto , DInt , HInt , Int , MinInt } ;
23
-
24
- use super :: HalfRep ;
76
+ use core:: mem:: size_of;
25
77
26
78
/// Type-specific configuration used for float division
27
79
trait FloatDivision : Float
32
84
const HALF_ITERATIONS : usize ;
33
85
34
86
/// Iterations that are done at the full float's width. Must be at least one.
35
- const FULL_ITERATIONS : usize ;
87
+ const FULL_ITERATIONS : usize = 1 ;
36
88
37
- const USE_NATIVE_FULL_ITERATIONS : bool = false ;
89
+ const USE_NATIVE_FULL_ITERATIONS : bool = size_of :: < Self > ( ) < size_of :: < * const ( ) > ( ) ;
38
90
39
91
/// C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW
40
92
/// with W0 being either 16 or 32 and W0 <= HW.
@@ -90,10 +142,30 @@ where
90
142
} ;
91
143
}
92
144
145
+ /// Calculate the number of iterations required to get needed precision of a float type.
146
+ ///
147
+ /// This returns `(h, f)` where `h` is the number of iterations to be donei using integers
148
+ /// at half the float's width, and `f` is the number of iterations done using integers of the
149
+ /// float's full width. Doing some iterations at half width is an optimization when the float
150
+ /// is larger than a word.
151
+ ///
152
+ /// ASSUMPTION: the initial estimate should have at least 8 bits of precision. If this is not
153
+ /// true, results will be inaccurate.
154
+ const fn calc_iterations < F : Float > ( ) -> ( usize , usize ) {
155
+ // Precision doubles with each iteration
156
+ let total_iterations = F :: BITS . ilog2 ( ) as usize - 2 ;
157
+
158
+ if size_of :: < F > ( ) < size_of :: < * const ( ) > ( ) {
159
+ // No need to use half iterations if math at the half
160
+ ( 0 , total_iterations)
161
+ } else {
162
+ ( total_iterations - 1 , 1 )
163
+ }
164
+ }
165
+
93
166
impl FloatDivision for f32 {
94
167
const HALF_ITERATIONS : usize = 0 ;
95
168
const FULL_ITERATIONS : usize = 3 ;
96
- const USE_NATIVE_FULL_ITERATIONS : bool = true ;
97
169
98
170
/// Use 16-bit initial estimation in case we are using half-width iterations
99
171
/// for float32 division. This is expected to be useful for some 16-bit
@@ -104,15 +176,14 @@ impl FloatDivision for f32 {
104
176
105
177
impl FloatDivision for f64 {
106
178
const HALF_ITERATIONS : usize = 3 ;
107
- const FULL_ITERATIONS : usize = 1 ;
179
+
108
180
/// HW is at least 32. Shifting into the highest bits if needed.
109
181
const C_HW : HalfRep < Self > = 0x7504F333 << ( HalfRep :: < Self > :: BITS - 32 ) ;
110
182
}
111
183
112
184
#[ cfg( not( feature = "no-f16-f128" ) ) ]
113
185
impl FloatDivision for f128 {
114
186
const HALF_ITERATIONS : usize = 4 ;
115
- const FULL_ITERATIONS : usize = 1 ;
116
187
117
188
const C_HW : HalfRep < Self > = 0x7504F333 << ( HalfRep :: < Self > :: BITS - 32 ) ;
118
189
}
@@ -169,6 +240,7 @@ where
169
240
let inf_rep = exponent_mask;
170
241
let quiet_bit = implicit_bit >> 1 ;
171
242
let qnan_rep = exponent_mask | quiet_bit;
243
+ let ( half_iterations, full_iterations) = calc_iterations :: < F > ( ) ;
172
244
173
245
let a_rep = a. repr ( ) ;
174
246
let b_rep = b. repr ( ) ;
@@ -300,7 +372,7 @@ where
300
372
// NB: Using half-width iterations increases computation errors due to
301
373
// rounding, so error estimations have to be computed taking the selected
302
374
// mode into account!
303
- let mut x_uq0 = if F :: HALF_ITERATIONS > 0 {
375
+ let mut x_uq0 = if half_iterations > 0 {
304
376
// Starting with (n-1) half-width iterations
305
377
let b_uq1_hw: HalfRep < F > = b_uq1. hi ( ) ;
306
378
@@ -339,7 +411,7 @@ where
339
411
// On (1/b, 1], g(x) > 0 <=> f(x) < x
340
412
//
341
413
// For half-width iterations, b_hw is used instead of b.
342
- for _ in 0 ..F :: HALF_ITERATIONS {
414
+ for _ in 0 ..half_iterations {
343
415
// corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
344
416
// of corr_UQ1_hw.
345
417
// "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
@@ -458,12 +530,12 @@ where
458
530
x_uq0
459
531
} ;
460
532
461
- if F :: USE_NATIVE_FULL_ITERATIONS {
533
+ if full_iterations > 1 {
462
534
// Need to use concrete types since `F::Int::D` might not support math. So, restrict to
463
535
// one type.
464
536
assert ! ( F :: BITS == 32 , "native full iterations only supports f32" ) ;
465
537
466
- for _ in 0 ..F :: FULL_ITERATIONS {
538
+ for _ in 0 ..full_iterations {
467
539
let corr_uq1: u32 = 0 . wrapping_sub (
468
540
( ( CastInto :: < u32 > :: cast ( x_uq0) as u64 ) * ( CastInto :: < u32 > :: cast ( b_uq1) as u64 ) )
469
541
>> F :: BITS ,
@@ -472,6 +544,11 @@ where
472
544
as u32 )
473
545
. cast ( ) ;
474
546
}
547
+ } else {
548
+ assert ! (
549
+ F :: BITS != 32 ,
550
+ "native full iterations onlydoaijfoisd supports f32"
551
+ ) ;
475
552
}
476
553
477
554
// Finally, account for possible overflow, as explained above.
0 commit comments