From e8141973715b216c224d6b0c05ae5e92c868c02f Mon Sep 17 00:00:00 2001
From: Alex Dowad <alexinbeijing@gmail.com>
Date: Wed, 20 Dec 2023 09:57:56 +0200
Subject: [PATCH] Fix bug in mb_get_substr_slow (sometimes outputs wrong number
 of characters)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thanks to Maurício Fauth for finding and reporting this bug.

The bug was introduced in October 2022. It originally only affected
text encodings which do not have a fixed byte width per characters
and for which mbstring does not have an mblen_table. However, I recently
made another change to mbstring, such that mb_substr no longer relies
on the mblen_table even if one is available. Because of this change,
the bug earlier introduced in October 2022 now affected a greater
number of text encodings, including UTF-8.
---
 ext/mbstring/mbstring.c           | 5 +++--
 ext/mbstring/tests/mb_substr.phpt | 6 ++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 405806604a0e4..ad15dd6768e03 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -2092,9 +2092,10 @@ static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t
 		if (from >= out_len) {
 			from -= out_len;
 		} else {
-			enc->from_wchar(wchar_buf + from, MIN(out_len - from, len), &buf, !in_len || out_len >= len);
+			size_t needed_codepoints = MIN(out_len - from, len);
+			enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
 			from = 0;
-			len -= MIN(out_len, len);
+			len -= needed_codepoints;
 		}
 	}
 
diff --git a/ext/mbstring/tests/mb_substr.phpt b/ext/mbstring/tests/mb_substr.phpt
index ad94c3b6069b1..8c376fb5e6ddf 100644
--- a/ext/mbstring/tests/mb_substr.phpt
+++ b/ext/mbstring/tests/mb_substr.phpt
@@ -133,6 +133,11 @@ echo "Regression:\n";
 $str = "\xbd\xbd\xbd\xbd\xbd\xbd\xbd\xbe\xbd\xbd\xbd\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x89\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x00\x00\x00\x00\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8b\x8b\x8b\xbd\xbd\xbd\xbd\xbd\xbd\xbd\xbe\x01:O\xaa\xd3";
 echo bin2hex(mb_substr($str, 0, 128, "JIS")), "\n";
 
+/* Alex messed up when reimplementing mb_substr and, in cases where `from` is non-zero and
+ * the number of characters to extract is more than 128, miscalculated where to end the substring
+ * Thanks to Maurício Fauth for finding the issue */
+var_dump(mb_substr('Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum dapibus feugiat ex non cursus. Pellentesque vestibulum tellus sit lectus.', 19, -1));
+
 ?>
 --EXPECT--
 EUC-JP:
@@ -213,3 +218,4 @@ Testing agreement with mb_strpos on invalid UTF-8 string:
 ?AAA
 Regression:
 1b28493d3d3d3d3d3d3d3e3d3d3d1b28423f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f000000003f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f1b28493d3d3d3d3d3d3d3e1b2842013a4f1b28492a1b2842
+string(121) "it amet, consectetur adipiscing elit. Vestibulum dapibus feugiat ex non cursus. Pellentesque vestibulum tellus sit lectus"