diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c index 91d940bb41b8e..6fb2776170ea9 100644 --- a/ext/bcmath/libbcmath/src/recmul.c +++ b/ext/bcmath/libbcmath/src/recmul.c @@ -58,12 +58,68 @@ static inline void bc_digits_adjustment(BC_UINT_T *prod_uint, size_t prod_arr_si } } +/* This is based on the technique described in https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html. + * This function transforms AABBCCDD into 1000 * AA + 100 * BB + 10 * CC + DD, + * with the caveat that all components must be in the interval [0, 25] to prevent overflow + * due to the multiplication by power of 10 (10 * 25 = 250 is the largest number that fits in a byte). + * The advantage of this method instead of using shifts + 3 multiplications is that this is cheaper + * due to its divide-and-conquer nature. + */ +#if SIZEOF_SIZE_T == 4 +static uint32_t bc_parse_chunk_chars(const char *str) +{ + uint32_t tmp; + memcpy(&tmp, str, sizeof(tmp)); +#if !BC_LITTLE_ENDIAN + tmp = BC_BSWAP(tmp); +#endif + + uint32_t lower_digits = (tmp & 0x0f000f00) >> 8; + uint32_t upper_digits = (tmp & 0x000f000f) * 10; + + tmp = lower_digits + upper_digits; + + lower_digits = (tmp & 0x00ff0000) >> 16; + upper_digits = (tmp & 0x000000ff) * 100; + + return lower_digits + upper_digits; +} +#elif SIZEOF_SIZE_T == 8 +static uint64_t bc_parse_chunk_chars(const char *str) +{ + uint64_t tmp; + memcpy(&tmp, str, sizeof(tmp)); +#if !BC_LITTLE_ENDIAN + tmp = BC_BSWAP(tmp); +#endif + + uint64_t lower_digits = (tmp & 0x0f000f000f000f00) >> 8; + uint64_t upper_digits = (tmp & 0x000f000f000f000f) * 10; + + tmp = lower_digits + upper_digits; + + lower_digits = (tmp & 0x00ff000000ff0000) >> 16; + upper_digits = (tmp & 0x000000ff000000ff) * 100; + + tmp = lower_digits + upper_digits; + + lower_digits = (tmp & 0x0000ffff00000000) >> 32; + upper_digits = (tmp & 0x000000000000ffff) * 10000; + + return lower_digits + upper_digits; +} +#endif + /* * Converts BCD to uint, going backwards from pointer n by the number of * characters specified by len. */ static inline BC_UINT_T bc_partial_convert_to_uint(const char *n, size_t len) { + if (len == BC_MUL_UINT_DIGITS) { + return bc_parse_chunk_chars(n - BC_MUL_UINT_DIGITS + 1); + } + BC_UINT_T num = 0; BC_UINT_T base = 1; @@ -92,7 +148,7 @@ static inline void bc_convert_to_uint(BC_UINT_T *n_uint, const char *nend, size_ * If the n_values of n1 and n2 are both 4 (32-bit) or 8 (64-bit) digits or less, * the calculation will be performed at high speed without using an array. */ -static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_num *prod) +static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len, bc_num *prod) { const char *n1end = n1->n_value + n1len - 1; const char *n2end = n2->n_value + n2len - 1; @@ -112,6 +168,52 @@ static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc } } +#if BC_LITTLE_ENDIAN +# define BC_ENCODE_LUT(A, B) ((A) | (B) << 4) +#else +# define BC_ENCODE_LUT(A, B) ((B) | (A) << 4) +#endif + +#define LUT_ITERATE(_, A) \ + _(A, 0), _(A, 1), _(A, 2), _(A, 3), _(A, 4), _(A, 5), _(A, 6), _(A, 7), _(A, 8), _(A, 9) + +/* This LUT encodes the decimal representation of numbers 0-100 + * such that we can avoid taking modulos and divisions which would be slow. */ +static const unsigned char LUT[100] = { + LUT_ITERATE(BC_ENCODE_LUT, 0), + LUT_ITERATE(BC_ENCODE_LUT, 1), + LUT_ITERATE(BC_ENCODE_LUT, 2), + LUT_ITERATE(BC_ENCODE_LUT, 3), + LUT_ITERATE(BC_ENCODE_LUT, 4), + LUT_ITERATE(BC_ENCODE_LUT, 5), + LUT_ITERATE(BC_ENCODE_LUT, 6), + LUT_ITERATE(BC_ENCODE_LUT, 7), + LUT_ITERATE(BC_ENCODE_LUT, 8), + LUT_ITERATE(BC_ENCODE_LUT, 9), +}; + +static inline unsigned short bc_expand_lut(unsigned char c) +{ + return (c & 0x0f) | (c & 0xf0) << 4; +} + +/* Writes the character representation of the number encoded in value. + * E.g. if value = 1234, then the string "1234" will be written to str. */ +static void bc_write_bcd_representation(uint32_t value, char *str) +{ + uint32_t upper = value / 100; /* e.g. 12 */ + uint32_t lower = value % 100; /* e.g. 34 */ + +#if BC_LITTLE_ENDIAN + /* Note: little endian, so `lower` comes before `upper`! */ + uint32_t digits = bc_expand_lut(LUT[lower]) << 16 | bc_expand_lut(LUT[upper]); +#else + /* Note: big endian, so `upper` comes before `lower`! */ + uint32_t digits = bc_expand_lut(LUT[upper]) << 16 | bc_expand_lut(LUT[lower]); +#endif + memcpy(str, &digits, sizeof(digits)); +} + /* * Converts the BCD of bc_num by 4 (32 bits) or 8 (64 bits) digits to an array of BC_UINT_Ts. * The array is generated starting with the smaller digits. @@ -180,10 +282,14 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len, bc char *pend = pptr + prodlen - 1; i = 0; while (i < prod_arr_size - 1) { - for (size_t j = 0; j < BC_MUL_UINT_DIGITS; j++) { - *pend-- = prod_uint[i] % BASE; - prod_uint[i] /= BASE; - } +#if BC_MUL_UINT_DIGITS == 4 + bc_write_bcd_representation(prod_uint[i], pend - 3); + pend -= 4; +#else + bc_write_bcd_representation(prod_uint[i] / 10000, pend - 7); + bc_write_bcd_representation(prod_uint[i] % 10000, pend - 3); + pend -= 8; +#endif i++; }