From 536d2f7cd5e86cf5aabdb63a83d1c427435bdebd Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Sun, 5 May 2024 21:13:55 +0900
Subject: [PATCH 01/10] Added some definitions to bcmath.h

Moved SWAR_ONES and SWAR_REPEAT to the header, and added byte-swap helpers
(BSWAP32/BSWAP64/BC_BSWAP), BC_UINT_T and BC_LITTLE_ENDIAN.
---
 ext/bcmath/libbcmath/src/bcmath.h  | 60 ++++++++++++++++++++++++++++++
 ext/bcmath/libbcmath/src/convert.c |  6 ---
 2 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/bcmath.h b/ext/bcmath/libbcmath/src/bcmath.h
index b8e3b4c7fb86a..c394982b3af23 100644
--- a/ext/bcmath/libbcmath/src/bcmath.h
+++ b/ext/bcmath/libbcmath/src/bcmath.h
@@ -78,6 +78,66 @@ typedef struct bc_struct {
 #define LONG_MAX 0x7ffffff
 #endif
 
+/* This will be 0x01010101 for 32-bit and 0x0101010101010101 for 64-bit */
+#define SWAR_ONES (~((size_t) 0) / 0xFF)
+/* This repeats a byte `x` into an entire 32/64-bit word.
+ * Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
+#define SWAR_REPEAT(x) (SWAR_ONES * (x))
+
+/* Bytes swap */
+#if defined(_MSC_VER)
+#  include <stdlib.h>
+#  define BSWAP32(u) _byteswap_ulong(u)
+#  define BSWAP64(u) _byteswap_uint64(u)
+#else
+#  ifdef __has_builtin
+#    if __has_builtin(__builtin_bswap32)
+#      define BSWAP32(u) __builtin_bswap32(u)
+#    endif // __has_builtin(__builtin_bswap32)
+#    if __has_builtin(__builtin_bswap64)
+#      define BSWAP64(u) __builtin_bswap64(u)
+#    endif // __has_builtin(__builtin_bswap64)
+#  elif defined(__GNUC__)
+#    define BSWAP32(u) __builtin_bswap32(u)
+#    define BSWAP64(u) __builtin_bswap64(u)
+#  endif // __has_builtin
+#endif // defined(_MSC_VER)
+#ifndef BSWAP32
+inline uint32_t BSWAP32(uint32_t u)
+{
+  return (((u & 0xff000000) >> 24)
+          | ((u & 0x00ff0000) >>  8)
+          | ((u & 0x0000ff00) <<  8)
+          | ((u & 0x000000ff) << 24));
+}
+#endif
+#ifndef BSWAP64
+inline uint64_t BSWAP64(uint64_t u)
+{
+   return (((u & 0xff00000000000000ULL) >> 56)
+          | ((u & 0x00ff000000000000ULL) >> 40)
+          | ((u & 0x0000ff0000000000ULL) >> 24)
+          | ((u & 0x000000ff00000000ULL) >>  8)
+          | ((u & 0x00000000ff000000ULL) <<  8)
+          | ((u & 0x0000000000ff0000ULL) << 24)
+          | ((u & 0x000000000000ff00ULL) << 40)
+          | ((u & 0x00000000000000ffULL) << 56));
+}
+#endif
+
+#if SIZEOF_SIZE_T >= 8
+#define BC_BSWAP(u) BSWAP64(u)
+#define BC_UINT_T uint64_t
+#else
+#define BC_BSWAP(u) BSWAP32(u)
+#define BC_UINT_T uint32_t
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define BC_LITTLE_ENDIAN 0
+#else
+#define BC_LITTLE_ENDIAN 1
+#endif
 
 /* Function Prototypes */
 
diff --git a/ext/bcmath/libbcmath/src/convert.c b/ext/bcmath/libbcmath/src/convert.c
index 953dde27bb13a..5164215a333f3 100644
--- a/ext/bcmath/libbcmath/src/convert.c
+++ b/ext/bcmath/libbcmath/src/convert.c
@@ -20,12 +20,6 @@
 # include <emmintrin.h>
 #endif
 
-/* This will be 0x01010101 for 32-bit and 0x0101010101010101 */
-#define SWAR_ONES (~((size_t) 0) / 0xFF)
-/* This repeats a byte `x` into an entire 32/64-bit word.
- * Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
-#define SWAR_REPEAT(x) (SWAR_ONES * (x))
-
 static char *bc_copy_and_shift_numbers(char *restrict dest, const char *source, const char *source_end, unsigned char shift, bool add)
 {
 	size_t bulk_shift = SWAR_REPEAT(shift);

From 670eada0f69e1a62a575c4b48088b96801c143dc Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Sun, 5 May 2024 22:34:34 +0900
Subject: [PATCH 02/10] Refactor _bc_do_sub

_bc_do_sub now uses SWAR (SIMD within a register) to subtract one machine word of digits at a time.
---
 ext/bcmath/libbcmath/src/doaddsub.c | 77 +++++++++++++++++++++++------
 1 file changed, 62 insertions(+), 15 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/doaddsub.c b/ext/bcmath/libbcmath/src/doaddsub.c
index 650321d7ff6f4..b3c60ac7bbfc3 100644
--- a/ext/bcmath/libbcmath/src/doaddsub.c
+++ b/ext/bcmath/libbcmath/src/doaddsub.c
@@ -124,36 +124,33 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
 bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 {
 	bc_num diff;
-	size_t diff_scale, diff_len;
-	size_t min_scale, min_len;
-	size_t borrow, count;
+
+	size_t diff_scale = MAX(n1->n_scale, n2->n_scale);
+	size_t diff_len = EXPECTED(n1->n_len >= n2->n_len) ? n1->n_len : n2->n_len;
+	size_t min_scale = MIN(n1->n_scale, n2->n_scale);
+	size_t min_len = n1->n_len >= n2->n_len ? n2->n_len : n1->n_len;
+	size_t min_bytes = min_len + min_scale;
+	size_t borrow = 0;
 	int val;
 	char *n1ptr, *n2ptr, *diffptr;
 
 	/* Allocate temporary storage. */
-	diff_len = MAX(n1->n_len, n2->n_len);
-	diff_scale = MAX(n1->n_scale, n2->n_scale);
-	min_len = MIN(n1->n_len, n2->n_len);
-	min_scale = MIN(n1->n_scale, n2->n_scale);
-	diff = bc_new_num (diff_len, MAX(diff_scale, scale_min));
+	diff = bc_new_num (n1->n_len, MAX(diff_scale, scale_min));
 
 	/* Initialize the subtract. */
 	n1ptr = (char *) (n1->n_value + n1->n_len + n1->n_scale - 1);
 	n2ptr = (char *) (n2->n_value + n2->n_len + n2->n_scale - 1);
 	diffptr = (char *) (diff->n_value + diff_len + diff_scale - 1);
 
-	/* Subtract the numbers. */
-	borrow = 0;
-
 	/* Take care of the longer scaled number. */
 	if (n1->n_scale != min_scale) {
 		/* n1 has the longer scale */
-		for (count = n1->n_scale - min_scale; count > 0; count--) {
+		for (size_t count = n1->n_scale - min_scale; count > 0; count--) {
 			*diffptr-- = *n1ptr--;
 		}
 	} else {
 		/* n2 has the longer scale */
-		for (count = n2->n_scale - min_scale; count > 0; count--) {
+		for (size_t count = n2->n_scale - min_scale; count > 0; count--) {
 			val = -*n2ptr-- - borrow;
 			if (val < 0) {
 				val += BASE;
@@ -166,7 +163,57 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 	}
 
 	/* Now do the equal length scale and integer parts. */
-	for (count = 0; count < min_len + min_scale; count++) {
+	size_t sub_count = 0;
+	if (min_bytes >= sizeof(BC_UINT_T)) {
+		diffptr++;
+		n1ptr++;
+		n2ptr++;
+		while (sub_count + sizeof(BC_UINT_T) <= min_bytes) {
+			diffptr -= sizeof(BC_UINT_T);
+			n1ptr -= sizeof(BC_UINT_T);
+			n2ptr -= sizeof(BC_UINT_T);
+
+			BC_UINT_T n1bytes;
+			BC_UINT_T n2bytes;
+			memcpy(&n1bytes, n1ptr, sizeof(n1bytes));
+			memcpy(&n2bytes, n2ptr, sizeof(n2bytes));
+
+#if BC_LITTLE_ENDIAN
+			/* Bytes swap */
+			n1bytes = BC_BSWAP(n1bytes);
+			n2bytes = BC_BSWAP(n2bytes);
+#endif
+
+			n1bytes -= (n2bytes + borrow);
+			/* If the most significant 4 bits of the 8 bytes are not 0, a carry-down has occurred. */
+			bool tmp_borrow = n1bytes >= ((BC_UINT_T) 0x10 << (8 * (sizeof(BC_UINT_T) - 1)));
+
+			/*
+			 * If any one of the upper 4 bits of each of the 8 bytes is 1, subtract 6 from that byte.
+			 * The fact that the upper 4 bits are not 0 means that a carry-down has occurred, and when
+			 * the hexadecimal number is carried down, there is a difference of 6 from the decimal
+			 * calculation, so 6 is subtracted.
+			 * Also, set all upper 4 bits to 0.
+			 */
+			BC_UINT_T borrow_mask = (((n1bytes | (n1bytes >> 1) | (n1bytes >> 2) | (n1bytes >> 3)) & SWAR_REPEAT(0x10)) * 0x06) >> 4;
+			n1bytes = (n1bytes & SWAR_REPEAT(0x0F)) - borrow_mask;
+
+#if BC_LITTLE_ENDIAN
+			/* Bytes swap */
+			n1bytes = BC_BSWAP(n1bytes);
+#endif
+
+			memcpy(diffptr, &n1bytes, sizeof(n1bytes));
+
+			borrow = tmp_borrow;
+			sub_count += sizeof(BC_UINT_T);
+		}
+		diffptr--;
+		n1ptr--;
+		n2ptr--;
+	}
+
+	for (; sub_count < min_bytes; sub_count++) {
 		val = *n1ptr-- - *n2ptr-- - borrow;
 		if (val < 0) {
 			val += BASE;
@@ -179,7 +226,7 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 
 	/* If n1 has more digits than n2, we now do that subtract. */
 	if (diff_len != min_len) {
-		for (count = diff_len - min_len; count > 0; count--) {
+		for (size_t count = diff_len - min_len; count > 0; count--) {
 			val = *n1ptr-- - borrow;
 			if (val < 0) {
 				val += BASE;

From 9a715bbf5e207953685353c3236df93f327ebe7f Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 6 May 2024 07:20:36 +0900
Subject: [PATCH 03/10] Temporarily revert the _bc_do_sub refactor

---
 ext/bcmath/libbcmath/src/doaddsub.c | 77 ++++++-----------------------
 1 file changed, 15 insertions(+), 62 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/doaddsub.c b/ext/bcmath/libbcmath/src/doaddsub.c
index b3c60ac7bbfc3..650321d7ff6f4 100644
--- a/ext/bcmath/libbcmath/src/doaddsub.c
+++ b/ext/bcmath/libbcmath/src/doaddsub.c
@@ -124,33 +124,36 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
 bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 {
 	bc_num diff;
-
-	size_t diff_scale = MAX(n1->n_scale, n2->n_scale);
-	size_t diff_len = EXPECTED(n1->n_len >= n2->n_len) ? n1->n_len : n2->n_len;
-	size_t min_scale = MIN(n1->n_scale, n2->n_scale);
-	size_t min_len = n1->n_len >= n2->n_len ? n2->n_len : n1->n_len;
-	size_t min_bytes = min_len + min_scale;
-	size_t borrow = 0;
+	size_t diff_scale, diff_len;
+	size_t min_scale, min_len;
+	size_t borrow, count;
 	int val;
 	char *n1ptr, *n2ptr, *diffptr;
 
 	/* Allocate temporary storage. */
-	diff = bc_new_num (n1->n_len, MAX(diff_scale, scale_min));
+	diff_len = MAX(n1->n_len, n2->n_len);
+	diff_scale = MAX(n1->n_scale, n2->n_scale);
+	min_len = MIN(n1->n_len, n2->n_len);
+	min_scale = MIN(n1->n_scale, n2->n_scale);
+	diff = bc_new_num (diff_len, MAX(diff_scale, scale_min));
 
 	/* Initialize the subtract. */
 	n1ptr = (char *) (n1->n_value + n1->n_len + n1->n_scale - 1);
 	n2ptr = (char *) (n2->n_value + n2->n_len + n2->n_scale - 1);
 	diffptr = (char *) (diff->n_value + diff_len + diff_scale - 1);
 
+	/* Subtract the numbers. */
+	borrow = 0;
+
 	/* Take care of the longer scaled number. */
 	if (n1->n_scale != min_scale) {
 		/* n1 has the longer scale */
-		for (size_t count = n1->n_scale - min_scale; count > 0; count--) {
+		for (count = n1->n_scale - min_scale; count > 0; count--) {
 			*diffptr-- = *n1ptr--;
 		}
 	} else {
 		/* n2 has the longer scale */
-		for (size_t count = n2->n_scale - min_scale; count > 0; count--) {
+		for (count = n2->n_scale - min_scale; count > 0; count--) {
 			val = -*n2ptr-- - borrow;
 			if (val < 0) {
 				val += BASE;
@@ -163,57 +166,7 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 	}
 
 	/* Now do the equal length scale and integer parts. */
-	size_t sub_count = 0;
-	if (min_bytes >= sizeof(BC_UINT_T)) {
-		diffptr++;
-		n1ptr++;
-		n2ptr++;
-		while (sub_count + sizeof(BC_UINT_T) <= min_bytes) {
-			diffptr -= sizeof(BC_UINT_T);
-			n1ptr -= sizeof(BC_UINT_T);
-			n2ptr -= sizeof(BC_UINT_T);
-
-			BC_UINT_T n1bytes;
-			BC_UINT_T n2bytes;
-			memcpy(&n1bytes, n1ptr, sizeof(n1bytes));
-			memcpy(&n2bytes, n2ptr, sizeof(n2bytes));
-
-#if BC_LITTLE_ENDIAN
-			/* Bytes swap */
-			n1bytes = BC_BSWAP(n1bytes);
-			n2bytes = BC_BSWAP(n2bytes);
-#endif
-
-			n1bytes -= (n2bytes + borrow);
-			/* If the most significant 4 bits of the 8 bytes are not 0, a carry-down has occurred. */
-			bool tmp_borrow = n1bytes >= ((BC_UINT_T) 0x10 << (8 * (sizeof(BC_UINT_T) - 1)));
-
-			/*
-			 * If any one of the upper 4 bits of each of the 8 bytes is 1, subtract 6 from that byte.
-			 * The fact that the upper 4 bits are not 0 means that a carry-down has occurred, and when
-			 * the hexadecimal number is carried down, there is a difference of 6 from the decimal
-			 * calculation, so 6 is subtracted.
-			 * Also, set all upper 4 bits to 0.
-			 */
-			BC_UINT_T borrow_mask = (((n1bytes | (n1bytes >> 1) | (n1bytes >> 2) | (n1bytes >> 3)) & SWAR_REPEAT(0x10)) * 0x06) >> 4;
-			n1bytes = (n1bytes & SWAR_REPEAT(0x0F)) - borrow_mask;
-
-#if BC_LITTLE_ENDIAN
-			/* Bytes swap */
-			n1bytes = BC_BSWAP(n1bytes);
-#endif
-
-			memcpy(diffptr, &n1bytes, sizeof(n1bytes));
-
-			borrow = tmp_borrow;
-			sub_count += sizeof(BC_UINT_T);
-		}
-		diffptr--;
-		n1ptr--;
-		n2ptr--;
-	}
-
-	for (; sub_count < min_bytes; sub_count++) {
+	for (count = 0; count < min_len + min_scale; count++) {
 		val = *n1ptr-- - *n2ptr-- - borrow;
 		if (val < 0) {
 			val += BASE;
@@ -226,7 +179,7 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 
 	/* If n1 has more digits than n2, we now do that subtract. */
 	if (diff_len != min_len) {
-		for (size_t count = diff_len - min_len; count > 0; count--) {
+		for (count = diff_len - min_len; count > 0; count--) {
 			val = *n1ptr-- - borrow;
 			if (val < 0) {
 				val += BASE;

From 63f69688c0f7dc2be23926140019a83cdcdeca01 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 6 May 2024 08:19:18 +0900
Subject: [PATCH 04/10] Use SWAR in _bc_do_sub

---
 ext/bcmath/libbcmath/src/doaddsub.c | 52 ++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/ext/bcmath/libbcmath/src/doaddsub.c b/ext/bcmath/libbcmath/src/doaddsub.c
index 650321d7ff6f4..f628c7e363d2d 100644
--- a/ext/bcmath/libbcmath/src/doaddsub.c
+++ b/ext/bcmath/libbcmath/src/doaddsub.c
@@ -166,7 +166,57 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 	}
 
 	/* Now do the equal length scale and integer parts. */
-	for (count = 0; count < min_len + min_scale; count++) {
+	count = 0;
+	if (min_len + min_scale >= sizeof(BC_UINT_T)) {
+		diffptr++;
+		n1ptr++;
+		n2ptr++;
+		while (count + sizeof(BC_UINT_T) <= min_len + min_scale) {
+			diffptr -= sizeof(BC_UINT_T);
+			n1ptr -= sizeof(BC_UINT_T);
+			n2ptr -= sizeof(BC_UINT_T);
+
+			BC_UINT_T n1bytes;
+			BC_UINT_T n2bytes;
+			memcpy(&n1bytes, n1ptr, sizeof(n1bytes));
+			memcpy(&n2bytes, n2ptr, sizeof(n2bytes));
+
+#if BC_LITTLE_ENDIAN
+			/* Bytes swap */
+			n1bytes = BC_BSWAP(n1bytes);
+			n2bytes = BC_BSWAP(n2bytes);
+#endif
+
+			n1bytes -= n2bytes + borrow;
+			/* If the most significant 4 bits of the 8 bytes are not 0, a carry-down has occurred. */
+			bool tmp_borrow = n1bytes & ((BC_UINT_T) 1 << (8 * sizeof(BC_UINT_T) - 1));
+
+			/*
+			 * If any one of the upper 4 bits of each of the 8 bytes is 1, subtract 6 from that byte.
+			 * The fact that the upper 4 bits are not 0 means that a carry-down has occurred, and when
+			 * the hexadecimal number is carried down, there is a difference of 6 from the decimal
+			 * calculation, so 6 is subtracted.
+			 * Also, set all upper 4 bits to 0.
+			 */
+			BC_UINT_T borrow_mask = ((n1bytes & SWAR_REPEAT(0x80)) >> 7) * 0x06;
+			n1bytes = (n1bytes & SWAR_REPEAT(0x0F)) - borrow_mask;
+
+#if BC_LITTLE_ENDIAN
+			/* Bytes swap */
+			n1bytes = BC_BSWAP(n1bytes);
+#endif
+
+			memcpy(diffptr, &n1bytes, sizeof(n1bytes));
+
+			borrow = tmp_borrow;
+			count += sizeof(BC_UINT_T);
+		}
+		diffptr--;
+		n1ptr--;
+		n2ptr--;
+	}
+
+	for (; count < min_len + min_scale; count++) {
 		val = *n1ptr-- - *n2ptr-- - borrow;
 		if (val < 0) {
 			val += BASE;

From 1dede5a69bec79f0bd0020bc4768954ae77fa6c7 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 6 May 2024 09:18:03 +0900
Subject: [PATCH 05/10] refactor initialization

---
 ext/bcmath/libbcmath/src/doaddsub.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/doaddsub.c b/ext/bcmath/libbcmath/src/doaddsub.c
index f628c7e363d2d..fdc96b95e3e04 100644
--- a/ext/bcmath/libbcmath/src/doaddsub.c
+++ b/ext/bcmath/libbcmath/src/doaddsub.c
@@ -124,17 +124,16 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
 bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 {
 	bc_num diff;
-	size_t diff_scale, diff_len;
-	size_t min_scale, min_len;
-	size_t borrow, count;
+	size_t diff_len = MAX(n1->n_len, n2->n_len);
+	size_t diff_scale = MAX(n1->n_scale, n2->n_scale);
+	size_t min_len = MIN(n1->n_len, n2->n_len);
+	size_t min_scale = MIN(n1->n_scale, n2->n_scale);
+	size_t borrow = 0;
+	size_t count;
 	int val;
 	char *n1ptr, *n2ptr, *diffptr;
 
 	/* Allocate temporary storage. */
-	diff_len = MAX(n1->n_len, n2->n_len);
-	diff_scale = MAX(n1->n_scale, n2->n_scale);
-	min_len = MIN(n1->n_len, n2->n_len);
-	min_scale = MIN(n1->n_scale, n2->n_scale);
 	diff = bc_new_num (diff_len, MAX(diff_scale, scale_min));
 
 	/* Initialize the subtract. */
@@ -142,9 +141,6 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 	n2ptr = (char *) (n2->n_value + n2->n_len + n2->n_scale - 1);
 	diffptr = (char *) (diff->n_value + diff_len + diff_scale - 1);
 
-	/* Subtract the numbers. */
-	borrow = 0;
-
 	/* Take care of the longer scaled number. */
 	if (n1->n_scale != min_scale) {
 		/* n1 has the longer scale */

From 6a29fb72e7ab9ea6ab9a4949c1289ea7be526ca8 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 6 May 2024 09:23:30 +0900
Subject: [PATCH 06/10] use EXPECTED

---
 ext/bcmath/libbcmath/src/doaddsub.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/doaddsub.c b/ext/bcmath/libbcmath/src/doaddsub.c
index fdc96b95e3e04..82dbfed490bc5 100644
--- a/ext/bcmath/libbcmath/src/doaddsub.c
+++ b/ext/bcmath/libbcmath/src/doaddsub.c
@@ -124,9 +124,9 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
 bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 {
 	bc_num diff;
-	size_t diff_len = MAX(n1->n_len, n2->n_len);
+	size_t diff_len = EXPECTED(n1->n_len >= n2->n_len) ? n1->n_len : n2->n_len;
 	size_t diff_scale = MAX(n1->n_scale, n2->n_scale);
-	size_t min_len = MIN(n1->n_len, n2->n_len);
+	size_t min_len = n1->n_len >= n2->n_len ? n2->n_len : n1->n_len;
 	size_t min_scale = MIN(n1->n_scale, n2->n_scale);
 	size_t borrow = 0;
 	size_t count;

From ce78b1449656cf17a997669daa7e08617e070c9c Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 6 May 2024 09:33:08 +0900
Subject: [PATCH 07/10] Calculate `min_len + min_scale` first

---
 ext/bcmath/libbcmath/src/doaddsub.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/doaddsub.c b/ext/bcmath/libbcmath/src/doaddsub.c
index 82dbfed490bc5..0fec8b6a1912f 100644
--- a/ext/bcmath/libbcmath/src/doaddsub.c
+++ b/ext/bcmath/libbcmath/src/doaddsub.c
@@ -128,6 +128,7 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 	size_t diff_scale = MAX(n1->n_scale, n2->n_scale);
 	size_t min_len = n1->n_len >= n2->n_len ? n2->n_len : n1->n_len;
 	size_t min_scale = MIN(n1->n_scale, n2->n_scale);
+	size_t min_bytes = min_len + min_scale;
 	size_t borrow = 0;
 	size_t count;
 	int val;
@@ -163,11 +164,11 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 
 	/* Now do the equal length scale and integer parts. */
 	count = 0;
-	if (min_len + min_scale >= sizeof(BC_UINT_T)) {
+	if (min_bytes >= sizeof(BC_UINT_T)) {
 		diffptr++;
 		n1ptr++;
 		n2ptr++;
-		while (count + sizeof(BC_UINT_T) <= min_len + min_scale) {
+		while (count + sizeof(BC_UINT_T) <= min_bytes) {
 			diffptr -= sizeof(BC_UINT_T);
 			n1ptr -= sizeof(BC_UINT_T);
 			n2ptr -= sizeof(BC_UINT_T);
@@ -212,7 +213,7 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 		n2ptr--;
 	}
 
-	for (; count < min_len + min_scale; count++) {
+	for (; count < min_bytes; count++) {
 		val = *n1ptr-- - *n2ptr-- - borrow;
 		if (val < 0) {
 			val += BASE;

From 2b6cbadf120b728c5ec1554968fbeb076cfbd491 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 6 May 2024 12:52:51 +0900
Subject: [PATCH 08/10] Moved some constants and macros to private.h

---
 ext/bcmath/libbcmath/src/bcmath.h  | 60 -----------------------------
 ext/bcmath/libbcmath/src/convert.c |  1 +
 ext/bcmath/libbcmath/src/private.h | 62 ++++++++++++++++++++++++++++++
 ext/bcmath/libbcmath/src/str2num.c |  1 +
 4 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/bcmath.h b/ext/bcmath/libbcmath/src/bcmath.h
index c394982b3af23..b8e3b4c7fb86a 100644
--- a/ext/bcmath/libbcmath/src/bcmath.h
+++ b/ext/bcmath/libbcmath/src/bcmath.h
@@ -78,66 +78,6 @@ typedef struct bc_struct {
 #define LONG_MAX 0x7ffffff
 #endif
 
-/* This will be 0x01010101 for 32-bit and 0x0101010101010101 for 64-bit */
-#define SWAR_ONES (~((size_t) 0) / 0xFF)
-/* This repeats a byte `x` into an entire 32/64-bit word.
- * Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
-#define SWAR_REPEAT(x) (SWAR_ONES * (x))
-
-/* Bytes swap */
-#if defined(_MSC_VER)
-#  include <stdlib.h>
-#  define BSWAP32(u) _byteswap_ulong(u)
-#  define BSWAP64(u) _byteswap_uint64(u)
-#else
-#  ifdef __has_builtin
-#    if __has_builtin(__builtin_bswap32)
-#      define BSWAP32(u) __builtin_bswap32(u)
-#    endif // __has_builtin(__builtin_bswap32)
-#    if __has_builtin(__builtin_bswap64)
-#      define BSWAP64(u) __builtin_bswap64(u)
-#    endif // __has_builtin(__builtin_bswap64)
-#  elif defined(__GNUC__)
-#    define BSWAP32(u) __builtin_bswap32(u)
-#    define BSWAP64(u) __builtin_bswap64(u)
-#  endif // __has_builtin
-#endif // defined(_MSC_VER)
-#ifndef BSWAP32
-inline uint32_t BSWAP32(uint32_t u)
-{
-  return (((u & 0xff000000) >> 24)
-          | ((u & 0x00ff0000) >>  8)
-          | ((u & 0x0000ff00) <<  8)
-          | ((u & 0x000000ff) << 24));
-}
-#endif
-#ifndef BSWAP64
-inline uint64_t BSWAP64(uint64_t u)
-{
-   return (((u & 0xff00000000000000ULL) >> 56)
-          | ((u & 0x00ff000000000000ULL) >> 40)
-          | ((u & 0x0000ff0000000000ULL) >> 24)
-          | ((u & 0x000000ff00000000ULL) >>  8)
-          | ((u & 0x00000000ff000000ULL) <<  8)
-          | ((u & 0x0000000000ff0000ULL) << 24)
-          | ((u & 0x000000000000ff00ULL) << 40)
-          | ((u & 0x00000000000000ffULL) << 56));
-}
-#endif
-
-#if SIZEOF_SIZE_T >= 8
-#define BC_BSWAP(u) BSWAP64(u)
-#define BC_UINT_T uint64_t
-#else
-#define BC_BSWAP(u) BSWAP32(u)
-#define BC_UINT_T uint32_t
-#endif
-
-#ifdef WORDS_BIGENDIAN
-#define BC_LITTLE_ENDIAN 0
-#else
-#define BC_LITTLE_ENDIAN 1
-#endif
 
 /* Function Prototypes */
 
diff --git a/ext/bcmath/libbcmath/src/convert.c b/ext/bcmath/libbcmath/src/convert.c
index 5164215a333f3..01617c0662c64 100644
--- a/ext/bcmath/libbcmath/src/convert.c
+++ b/ext/bcmath/libbcmath/src/convert.c
@@ -16,6 +16,7 @@
 
 #include "bcmath.h"
 #include "convert.h"
+#include "private.h"
 #ifdef __SSE2__
 # include <emmintrin.h>
 #endif
diff --git a/ext/bcmath/libbcmath/src/private.h b/ext/bcmath/libbcmath/src/private.h
index 481a651128320..f21bef665f954 100644
--- a/ext/bcmath/libbcmath/src/private.h
+++ b/ext/bcmath/libbcmath/src/private.h
@@ -34,6 +34,68 @@
 #include <stdbool.h>
 #include <stddef.h>
 
+/* This will be 0x01010101 for 32-bit and 0x0101010101010101 for 64-bit */
+#define SWAR_ONES (~((size_t) 0) / 0xFF)
+/* This repeats a byte `x` into an entire 32/64-bit word.
+ * Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
+#define SWAR_REPEAT(x) (SWAR_ONES * (x))
+
+/* Byte swap: prefer compiler intrinsics, otherwise fall back to portable C. */
+#if defined(_MSC_VER)
+#  include <stdlib.h>
+#  define BSWAP32(u) _byteswap_ulong(u)
+#  define BSWAP64(u) _byteswap_uint64(u)
+#else
+#  ifdef __has_builtin
+#    if __has_builtin(__builtin_bswap32)
+#      define BSWAP32(u) __builtin_bswap32(u)
+#    endif // __has_builtin(__builtin_bswap32)
+#    if __has_builtin(__builtin_bswap64)
+#      define BSWAP64(u) __builtin_bswap64(u)
+#    endif // __has_builtin(__builtin_bswap64)
+#  elif defined(__GNUC__)
+#    define BSWAP32(u) __builtin_bswap32(u)
+#    define BSWAP64(u) __builtin_bswap64(u)
+#  endif // __has_builtin
+#endif // defined(_MSC_VER)
+#ifndef BSWAP32 /* no intrinsic was selected above: portable fallback */
+static inline uint32_t BSWAP32(uint32_t u) /* static: a header-defined inline needs internal linkage */
+{
+  return (((u & 0xff000000) >> 24)  /* byte 3 -> byte 0 */
+          | ((u & 0x00ff0000) >>  8)  /* byte 2 -> byte 1 */
+          | ((u & 0x0000ff00) <<  8)  /* byte 1 -> byte 2 */
+          | ((u & 0x000000ff) << 24)); /* byte 0 -> byte 3 */
+}
+#endif /* BSWAP32 */
+#ifndef BSWAP64 /* no intrinsic was selected above: portable fallback */
+static inline uint64_t BSWAP64(uint64_t u) /* static: a header-defined inline needs internal linkage */
+{
+   return (((u & 0xff00000000000000ULL) >> 56)  /* byte 7 -> byte 0 */
+          | ((u & 0x00ff000000000000ULL) >> 40)  /* byte 6 -> byte 1 */
+          | ((u & 0x0000ff0000000000ULL) >> 24)  /* byte 5 -> byte 2 */
+          | ((u & 0x000000ff00000000ULL) >>  8)  /* byte 4 -> byte 3 */
+          | ((u & 0x00000000ff000000ULL) <<  8)  /* byte 3 -> byte 4 */
+          | ((u & 0x0000000000ff0000ULL) << 24)  /* byte 2 -> byte 5 */
+          | ((u & 0x000000000000ff00ULL) << 40)  /* byte 1 -> byte 6 */
+          | ((u & 0x00000000000000ffULL) << 56)); /* byte 0 -> byte 7 */
+}
+#endif /* BSWAP64 */
+
+#if SIZEOF_SIZE_T >= 8
+#define BC_BSWAP(u) BSWAP64(u)
+#define BC_UINT_T uint64_t
+#else
+#define BC_BSWAP(u) BSWAP32(u)
+#define BC_UINT_T uint32_t
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define BC_LITTLE_ENDIAN 0
+#else
+#define BC_LITTLE_ENDIAN 1
+#endif
+
+
 /* routines */
 int _bc_do_compare (bc_num n1, bc_num n2, bool use_sign);
 bc_num _bc_do_add (bc_num n1, bc_num n2, size_t scale_min);
diff --git a/ext/bcmath/libbcmath/src/str2num.c b/ext/bcmath/libbcmath/src/str2num.c
index 370d899772ff8..4787e16b53a2b 100644
--- a/ext/bcmath/libbcmath/src/str2num.c
+++ b/ext/bcmath/libbcmath/src/str2num.c
@@ -31,6 +31,7 @@
 
 #include "bcmath.h"
 #include "convert.h"
+#include "private.h"
 #include <stdbool.h>
 #include <stddef.h>
 #ifdef __SSE2__

From 318601cc514b2c35e5a8778b6731a4f86ca21e97 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 6 May 2024 20:04:26 +0900
Subject: [PATCH 09/10] Added and fixed comments

---
 ext/bcmath/libbcmath/src/doaddsub.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/doaddsub.c b/ext/bcmath/libbcmath/src/doaddsub.c
index 0fec8b6a1912f..2952805f52b1c 100644
--- a/ext/bcmath/libbcmath/src/doaddsub.c
+++ b/ext/bcmath/libbcmath/src/doaddsub.c
@@ -124,8 +124,10 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
 bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 {
 	bc_num diff;
+	/* The caller guarantees that n1 is the larger number, so n1->n_len >= n2->n_len is expected. */
 	size_t diff_len = EXPECTED(n1->n_len >= n2->n_len) ? n1->n_len : n2->n_len;
 	size_t diff_scale = MAX(n1->n_scale, n2->n_scale);
+	/* Same condition as EXPECTED before, but using EXPECTED again will make it slower. */
 	size_t min_len = n1->n_len >= n2->n_len ? n2->n_len : n1->n_len;
 	size_t min_scale = MIN(n1->n_scale, n2->n_scale);
 	size_t min_bytes = min_len + min_scale;
@@ -164,6 +166,7 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 
 	/* Now do the equal length scale and integer parts. */
 	count = 0;
+	/* Use SWAR (SIMD within a register) to subtract sizeof(BC_UINT_T) digits per iteration. */
 	if (min_bytes >= sizeof(BC_UINT_T)) {
 		diffptr++;
 		n1ptr++;
@@ -179,27 +182,27 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 			memcpy(&n2bytes, n2ptr, sizeof(n2bytes));
 
 #if BC_LITTLE_ENDIAN
-			/* Bytes swap */
+			/* Little endian requires changing the order of bytes. */
 			n1bytes = BC_BSWAP(n1bytes);
 			n2bytes = BC_BSWAP(n2bytes);
 #endif
 
 			n1bytes -= n2bytes + borrow;
-			/* If the most significant 4 bits of the 8 bytes are not 0, a carry-down has occurred. */
+			/* If the most significant bit is set, a borrow out of this word has occurred. */
 			bool tmp_borrow = n1bytes & ((BC_UINT_T) 1 << (8 * sizeof(BC_UINT_T) - 1));
 
 			/*
-			 * If any one of the upper 4 bits of each of the 8 bytes is 1, subtract 6 from that byte.
-			 * The fact that the upper 4 bits are not 0 means that a carry-down has occurred, and when
-			 * the hexadecimal number is carried down, there is a difference of 6 from the decimal
-			 * calculation, so 6 is subtracted.
-			 * Also, set all upper 4 bits to 0.
+			 * Check the most significant bit of each of the 16 bytes, and if it is 1, a carry down has
+			 * occurred. When carrying down occurs, due to the difference between decimal and hexadecimal
+			 * numbers, an extra 6 is added to the lower 4 bits.
+			 * Therefore, for a byte that has been carried down, set all the upper 4 bits to 0 and subtract
+			 * 6 from the lower 4 bits to adjust it to the correct value as a decimal number.
 			 */
 			BC_UINT_T borrow_mask = ((n1bytes & SWAR_REPEAT(0x80)) >> 7) * 0x06;
 			n1bytes = (n1bytes & SWAR_REPEAT(0x0F)) - borrow_mask;
 
 #if BC_LITTLE_ENDIAN
-			/* Bytes swap */
+			/* Little endian requires changing the order of bytes back. */
 			n1bytes = BC_BSWAP(n1bytes);
 #endif
 
@@ -213,6 +216,7 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 		n2ptr--;
 	}
 
+	/* Calculate the remaining bytes that are less than the size of BC_UINT_T using a normal loop. */
 	for (; count < min_bytes; count++) {
 		val = *n1ptr-- - *n2ptr-- - borrow;
 		if (val < 0) {

From 86a0084f81ba72d90af03b8d5ed240c68fe867ac Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 6 May 2024 20:06:21 +0900
Subject: [PATCH 10/10] fixed comment

---
 ext/bcmath/libbcmath/src/doaddsub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/bcmath/libbcmath/src/doaddsub.c b/ext/bcmath/libbcmath/src/doaddsub.c
index 2952805f52b1c..eb2f7b6645d7a 100644
--- a/ext/bcmath/libbcmath/src/doaddsub.c
+++ b/ext/bcmath/libbcmath/src/doaddsub.c
@@ -192,7 +192,7 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 			bool tmp_borrow = n1bytes & ((BC_UINT_T) 1 << (8 * sizeof(BC_UINT_T) - 1));
 
 			/*
-			 * Check the most significant bit of each of the 16 bytes, and if it is 1, a carry down has
+			 * Check the most significant bit of each of the bytes, and if it is 1, a carry down has
 			 * occurred. When carrying down occurs, due to the difference between decimal and hexadecimal
 			 * numbers, an extra 6 is added to the lower 4 bits.
 			 * Therefore, for a byte that has been carried down, set all the upper 4 bits to 0 and subtract