Skip to content

Commit 0273200

Browse files
Refactor BCMath _bc_do_sub (#14132)
_bc_do_sub now uses SIMD to perform calculations at high speed. Moved the macros used for SIMD to `private.h`, and added some constants and macros.
1 parent bb21d19 commit 0273200

File tree

4 files changed

+126
-17
lines changed

4 files changed

+126
-17
lines changed

ext/bcmath/libbcmath/src/convert.c

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,11 @@
1616

1717
#include "bcmath.h"
1818
#include "convert.h"
19+
#include "private.h"
1920
#ifdef __SSE2__
2021
# include <emmintrin.h>
2122
#endif
2223

23-
/* This will be 0x01010101 for 32-bit and 0x0101010101010101 */
24-
#define SWAR_ONES (~((size_t) 0) / 0xFF)
25-
/* This repeats a byte `x` into an entire 32/64-bit word.
26-
* Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
27-
#define SWAR_REPEAT(x) (SWAR_ONES * (x))
28-
2924
static char *bc_copy_and_shift_numbers(char *restrict dest, const char *source, const char *source_end, unsigned char shift, bool add)
3025
{
3126
size_t bulk_shift = SWAR_REPEAT(shift);

ext/bcmath/libbcmath/src/doaddsub.c

Lines changed: 62 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -124,27 +124,26 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
124124
bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
125125
{
126126
bc_num diff;
127-
size_t diff_scale, diff_len;
128-
size_t min_scale, min_len;
129-
size_t borrow, count;
127+
/* The caller guarantees that the magnitude of n1 is larger than that of n2. */
128+
size_t diff_len = EXPECTED(n1->n_len >= n2->n_len) ? n1->n_len : n2->n_len;
129+
size_t diff_scale = MAX(n1->n_scale, n2->n_scale);
130+
/* Same condition as EXPECTED before, but using EXPECTED again will make it slower. */
131+
size_t min_len = n1->n_len >= n2->n_len ? n2->n_len : n1->n_len;
132+
size_t min_scale = MIN(n1->n_scale, n2->n_scale);
133+
size_t min_bytes = min_len + min_scale;
134+
size_t borrow = 0;
135+
size_t count;
130136
int val;
131137
char *n1ptr, *n2ptr, *diffptr;
132138

133139
/* Allocate temporary storage. */
134-
diff_len = MAX(n1->n_len, n2->n_len);
135-
diff_scale = MAX(n1->n_scale, n2->n_scale);
136-
min_len = MIN(n1->n_len, n2->n_len);
137-
min_scale = MIN(n1->n_scale, n2->n_scale);
138140
diff = bc_new_num (diff_len, MAX(diff_scale, scale_min));
139141

140142
/* Initialize the subtract. */
141143
n1ptr = (char *) (n1->n_value + n1->n_len + n1->n_scale - 1);
142144
n2ptr = (char *) (n2->n_value + n2->n_len + n2->n_scale - 1);
143145
diffptr = (char *) (diff->n_value + diff_len + diff_scale - 1);
144146

145-
/* Subtract the numbers. */
146-
borrow = 0;
147-
148147
/* Take care of the longer scaled number. */
149148
if (n1->n_scale != min_scale) {
150149
/* n1 has the longer scale */
@@ -166,7 +165,59 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
166165
}
167166

168167
/* Now do the equal length scale and integer parts. */
169-
for (count = 0; count < min_len + min_scale; count++) {
168+
count = 0;
169+
/* Uses SIMD to perform calculations at high speed. */
170+
if (min_bytes >= sizeof(BC_UINT_T)) {
171+
diffptr++;
172+
n1ptr++;
173+
n2ptr++;
174+
while (count + sizeof(BC_UINT_T) <= min_bytes) {
175+
diffptr -= sizeof(BC_UINT_T);
176+
n1ptr -= sizeof(BC_UINT_T);
177+
n2ptr -= sizeof(BC_UINT_T);
178+
179+
BC_UINT_T n1bytes;
180+
BC_UINT_T n2bytes;
181+
memcpy(&n1bytes, n1ptr, sizeof(n1bytes));
182+
memcpy(&n2bytes, n2ptr, sizeof(n2bytes));
183+
184+
#if BC_LITTLE_ENDIAN
185+
/* Little endian requires changing the order of bytes. */
186+
n1bytes = BC_BSWAP(n1bytes);
187+
n2bytes = BC_BSWAP(n2bytes);
188+
#endif
189+
190+
n1bytes -= n2bytes + borrow;
191+
/* If the most significant bit is 1, the subtraction borrowed from beyond this word. */
192+
bool tmp_borrow = n1bytes & ((BC_UINT_T) 1 << (8 * sizeof(BC_UINT_T) - 1));
193+
194+
/*
195+
* Check the most significant bit of each byte; if it is 1, that byte borrowed from the next
196+
* more significant digit. Because a byte wraps around at 256 rather than at 10, a byte that
197+
* borrowed ends up with an extra 6 in its lower 4 bits.
198+
* Therefore, for every byte that borrowed, clear the upper 4 bits and subtract
199+
* 6 from the lower 4 bits to adjust it to the correct decimal digit.
200+
*/
201+
BC_UINT_T borrow_mask = ((n1bytes & SWAR_REPEAT(0x80)) >> 7) * 0x06;
202+
n1bytes = (n1bytes & SWAR_REPEAT(0x0F)) - borrow_mask;
203+
204+
#if BC_LITTLE_ENDIAN
205+
/* Little endian requires changing the order of bytes back. */
206+
n1bytes = BC_BSWAP(n1bytes);
207+
#endif
208+
209+
memcpy(diffptr, &n1bytes, sizeof(n1bytes));
210+
211+
borrow = tmp_borrow;
212+
count += sizeof(BC_UINT_T);
213+
}
214+
diffptr--;
215+
n1ptr--;
216+
n2ptr--;
217+
}
218+
219+
/* Calculate the remaining bytes that are less than the size of BC_UINT_T using a normal loop. */
220+
for (; count < min_bytes; count++) {
170221
val = *n1ptr-- - *n2ptr-- - borrow;
171222
if (val < 0) {
172223
val += BASE;

ext/bcmath/libbcmath/src/private.h

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,68 @@
3434
#include <stdbool.h>
3535
#include <stddef.h>
3636

37+
/* This will be 0x01010101 for 32-bit and 0x0101010101010101 for 64-bit */
38+
#define SWAR_ONES (~((size_t) 0) / 0xFF)
39+
/* This repeats a byte `x` into an entire 32/64-bit word.
40+
* Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
41+
#define SWAR_REPEAT(x) (SWAR_ONES * (x))
42+
43+
/* Byte swap */
44+
#if defined(_MSC_VER)
45+
# include <stdlib.h>
46+
# define BSWAP32(u) _byteswap_ulong(u)
47+
# define BSWAP64(u) _byteswap_uint64(u)
48+
#else
49+
# ifdef __has_builtin
50+
# if __has_builtin(__builtin_bswap32)
51+
# define BSWAP32(u) __builtin_bswap32(u)
52+
# endif // __has_builtin(__builtin_bswap32)
53+
# if __has_builtin(__builtin_bswap64)
54+
# define BSWAP64(u) __builtin_bswap64(u)
55+
# endif // __has_builtin(__builtin_bswap64)
56+
# elif defined(__GNUC__)
57+
# define BSWAP32(u) __builtin_bswap32(u)
58+
# define BSWAP64(u) __builtin_bswap64(u)
59+
# endif // __has_builtin
60+
#endif // defined(_MSC_VER)
61+
#ifndef BSWAP32
62+
/* Portable fallback for when neither the MSVC intrinsic nor a compiler builtin
 * has already defined BSWAP32.
 * Returns `u` with the order of its four bytes reversed.
 * Declared `static inline` so that every translation unit including this header
 * gets its own definition: a plain `inline` definition in C99 and later emits no
 * external symbol, so the call can fail to link when it is not inlined. */
static inline uint32_t BSWAP32(uint32_t u)
{
	return (((u & 0xff000000) >> 24)
		| ((u & 0x00ff0000) >> 8)
		| ((u & 0x0000ff00) << 8)
		| ((u & 0x000000ff) << 24));
}
69+
#endif
70+
#ifndef BSWAP64
71+
/* Portable fallback for when neither the MSVC intrinsic nor a compiler builtin
 * has already defined BSWAP64.
 * Returns `u` with the order of its eight bytes reversed.
 * Declared `static inline` so that every translation unit including this header
 * gets its own definition: a plain `inline` definition in C99 and later emits no
 * external symbol, so the call can fail to link when it is not inlined. */
static inline uint64_t BSWAP64(uint64_t u)
{
	return (((u & 0xff00000000000000ULL) >> 56)
		| ((u & 0x00ff000000000000ULL) >> 40)
		| ((u & 0x0000ff0000000000ULL) >> 24)
		| ((u & 0x000000ff00000000ULL) >> 8)
		| ((u & 0x00000000ff000000ULL) << 8)
		| ((u & 0x0000000000ff0000ULL) << 24)
		| ((u & 0x000000000000ff00ULL) << 40)
		| ((u & 0x00000000000000ffULL) << 56));
}
82+
#endif
83+
84+
#if SIZEOF_SIZE_T >= 8
85+
#define BC_BSWAP(u) BSWAP64(u)
86+
#define BC_UINT_T uint64_t
87+
#else
88+
#define BC_BSWAP(u) BSWAP32(u)
89+
#define BC_UINT_T uint32_t
90+
#endif
91+
92+
#ifdef WORDS_BIGENDIAN
93+
#define BC_LITTLE_ENDIAN 0
94+
#else
95+
#define BC_LITTLE_ENDIAN 1
96+
#endif
97+
98+
3799
/* routines */
38100
int _bc_do_compare (bc_num n1, bc_num n2, bool use_sign);
39101
bc_num _bc_do_add (bc_num n1, bc_num n2, size_t scale_min);

ext/bcmath/libbcmath/src/str2num.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131

3232
#include "bcmath.h"
3333
#include "convert.h"
34+
#include "private.h"
3435
#include <stdbool.h>
3536
#include <stddef.h>
3637
#ifdef __SSE2__

0 commit comments

Comments
 (0)