Skip to content

Commit f3316bd

Browse files
committed
Better calculations
1 parent b400920 commit f3316bd

File tree

1 file changed

+97
-20
lines changed

1 file changed

+97
-20
lines changed

src/float/div.rs

Lines changed: 97 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,79 @@
11
/*
2+
Floating point division routines.
23
3-
Solving for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`.
4+
This module documentation gives an overview of the method used. More documentation is inline.
5+
6+
Relevant notation:
7+
8+
- `m_a`: the mantissa of `a`, in base 2
9+
- `p_a`: the exponent of `a`, in base 2. I.e. `a = m_a * 2^p_a`
10+
- `uqN` (e.g. `uq1`): this refers to Q notation for fixed-point numbers. UQ1.31 is an unsigned
11+
fixed-point number with 1 integral bit, and 31 decimal bits. A `uqN` variable of type `uM`
12+
will have N bits of integer and M-N bits of fraction.
13+
- `hw`: half width, i.e. for `f64` this will be a `u32`.
14+
15+
# Method Overview
16+
17+
Division routines must solve for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`. Process:
418
519
- Separate the exponent and significand
6-
`res = (m_a / m_b) * 2^(p_a - p_b)`
7-
- Check for early exits
20+
`res = (m_a / m_b) * 2^{p_a - p_b}`
21+
- Check for early exits (infinity, zero, etc).
822
- If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
9-
- Shift the significand (with implicit bit) fully left so that arithmetic can happen with greater
10-
precision.
11-
- Calculate the reciprocal of `b`, `r`
12-
- Multiply: `res = m_a * r_b * 2^(p_a - p_b)`
23+
- Shift the significand (with implicit bit) fully left so that arithmetic can happen with
24+
greater precision.
25+
- Calculate the reciprocal of `b`, `x`
26+
- Multiply: `res = m_a * x_b * 2^{p_a - p_b}`
27+
- Reapply rounding
28+
29+
The reciprocal and multiplication steps must happen with more bits of precision than the
30+
mantissa has; otherwise, precision would be lost rounding at each step. It is sufficient to use
31+
the float's same-sized integer.
32+
33+
# Reciprocal calculation
34+
35+
Calculating the reciprocal is the most complicated part of this process. It uses the
36+
[Newton-Raphson method], which picks an initial estimation (of the reciprocal) and performs
37+
a number of iterations to increase its precision.
38+
39+
In general, Newton's method takes the following form:
40+
41+
```
42+
`x_n` is a guess or the result of a previous iteration. Increasing `n` converges to the
43+
desired result.
44+
45+
The result is a zero of `f(x)`.
46+
47+
x_{n+1} = x_n - f(x_n) / f'(x_n)
48+
```
49+
50+
Applying this to finding the reciprocal:
51+
52+
53+
```text
54+
1 / x = b
55+
56+
Rearrange so we can solve by finding a zero
57+
0 = (1 / x) - b = f(x)
58+
59+
f'(x) = -x^{-2}
60+
61+
x_{n+1} = 2*x_n - b*x_n^2
62+
63+
64+
```
65+
66+
[Newton-Raphson method]: https://en.wikipedia.org/wiki/Newton%27s_method
1367
1468
The most complicated part of this process is calculating the reciprocal.
1569
16-
Note that variables named e.g. `uq0` refer to Q notation. E.g. Q1.31 refers to a fixed-point
17-
number that has 1 bit of integer and 31 bits of decimal.
1870
1971
*/
2072

73+
use super::HalfRep;
2174
use crate::float::Float;
2275
use crate::int::{CastFrom, CastInto, DInt, HInt, Int, MinInt};
23-
24-
use super::HalfRep;
76+
use core::mem::size_of;
2577

2678
/// Type-specific configuration used for float division
2779
trait FloatDivision: Float
@@ -32,9 +84,9 @@ where
3284
const HALF_ITERATIONS: usize;
3385

3486
/// Iterations that are done at the full float's width. Must be at least one.
35-
const FULL_ITERATIONS: usize;
87+
const FULL_ITERATIONS: usize = 1;
3688

37-
const USE_NATIVE_FULL_ITERATIONS: bool = false;
89+
const USE_NATIVE_FULL_ITERATIONS: bool = size_of::<Self>() < size_of::<*const ()>();
3890

3991
/// C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW
4092
/// with W0 being either 16 or 32 and W0 <= HW.
@@ -90,10 +142,30 @@ where
90142
};
91143
}
92144

145+
/// Calculate the number of iterations required to get needed precision of a float type.
146+
///
147+
/// This returns `(h, f)` where `h` is the number of iterations to be donei using integers
148+
/// at half the float's width, and `f` is the number of iterations done using integers of the
149+
/// float's full width. Doing some iterations at half width is an optimization when the float
150+
/// is larger than a word.
151+
///
152+
/// ASSUMPTION: the initial estimate should have at least 8 bits of precision. If this is not
153+
/// true, results will be inaccurate.
154+
const fn calc_iterations<F: Float>() -> (usize, usize) {
155+
// Precision doubles with each iteration
156+
let total_iterations = F::BITS.ilog2() as usize - 2;
157+
158+
if size_of::<F>() < size_of::<*const ()>() {
159+
// No need to use half iterations if math at the half
160+
(0, total_iterations)
161+
} else {
162+
(total_iterations - 1, 1)
163+
}
164+
}
165+
93166
impl FloatDivision for f32 {
94167
const HALF_ITERATIONS: usize = 0;
95168
const FULL_ITERATIONS: usize = 3;
96-
const USE_NATIVE_FULL_ITERATIONS: bool = true;
97169

98170
/// Use 16-bit initial estimation in case we are using half-width iterations
99171
/// for float32 division. This is expected to be useful for some 16-bit
@@ -104,15 +176,14 @@ impl FloatDivision for f32 {
104176

105177
impl FloatDivision for f64 {
106178
const HALF_ITERATIONS: usize = 3;
107-
const FULL_ITERATIONS: usize = 1;
179+
108180
/// HW is at least 32. Shifting into the highest bits if needed.
109181
const C_HW: HalfRep<Self> = 0x7504F333 << (HalfRep::<Self>::BITS - 32);
110182
}
111183

112184
#[cfg(not(feature = "no-f16-f128"))]
113185
impl FloatDivision for f128 {
114186
const HALF_ITERATIONS: usize = 4;
115-
const FULL_ITERATIONS: usize = 1;
116187

117188
const C_HW: HalfRep<Self> = 0x7504F333 << (HalfRep::<Self>::BITS - 32);
118189
}
@@ -169,6 +240,7 @@ where
169240
let inf_rep = exponent_mask;
170241
let quiet_bit = implicit_bit >> 1;
171242
let qnan_rep = exponent_mask | quiet_bit;
243+
let (half_iterations, full_iterations) = calc_iterations::<F>();
172244

173245
let a_rep = a.repr();
174246
let b_rep = b.repr();
@@ -300,7 +372,7 @@ where
300372
// NB: Using half-width iterations increases computation errors due to
301373
// rounding, so error estimations have to be computed taking the selected
302374
// mode into account!
303-
let mut x_uq0 = if F::HALF_ITERATIONS > 0 {
375+
let mut x_uq0 = if half_iterations > 0 {
304376
// Starting with (n-1) half-width iterations
305377
let b_uq1_hw: HalfRep<F> = b_uq1.hi();
306378

@@ -339,7 +411,7 @@ where
339411
// On (1/b, 1], g(x) > 0 <=> f(x) < x
340412
//
341413
// For half-width iterations, b_hw is used instead of b.
342-
for _ in 0..F::HALF_ITERATIONS {
414+
for _ in 0..half_iterations {
343415
// corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
344416
// of corr_UQ1_hw.
345417
// "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
@@ -458,12 +530,12 @@ where
458530
x_uq0
459531
};
460532

461-
if F::USE_NATIVE_FULL_ITERATIONS {
533+
if full_iterations > 1 {
462534
// Need to use concrete types since `F::Int::D` might not support math. So, restrict to
463535
// one type.
464536
assert!(F::BITS == 32, "native full iterations only supports f32");
465537

466-
for _ in 0..F::FULL_ITERATIONS {
538+
for _ in 0..full_iterations {
467539
let corr_uq1: u32 = 0.wrapping_sub(
468540
((CastInto::<u32>::cast(x_uq0) as u64) * (CastInto::<u32>::cast(b_uq1) as u64))
469541
>> F::BITS,
@@ -472,6 +544,11 @@ where
472544
as u32)
473545
.cast();
474546
}
547+
} else {
548+
assert!(
549+
F::BITS != 32,
550+
"native full iterations onlydoaijfoisd supports f32"
551+
);
475552
}
476553

477554
// Finally, account for possible overflow, as explained above.

0 commit comments

Comments
 (0)