Skip to content

Commit 0e540ed

Browse files
committed
Merge branch 'PHP-8.2'
* PHP-8.2: Fix mangled kana output for JIS encoding
2 parents 94c9a47 + 8f84192 commit 0e540ed

File tree

2 files changed

+36
-0
lines changed

2 files changed

+36
-0
lines changed

ext/mbstring/libmbfl/filters/mbfilter_jis.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,17 @@ static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t
584584
} else if (c < 0x80) {
585585
*out++ = c;
586586
} else if (c >= 0xA1 && c <= 0xDF) {
587+
/* GR-invoked Kana; "GR" stands for "graphics right" and refers to bytes
588+
* with the most significant bit set (in the context of ISO-2022 encoding).
589+
*
590+
* In this regard, Wikipedia states:
591+
* "Other, older variants known as JIS7 and JIS8 build directly on the 7-bit and 8-bit
592+
* encodings defined by JIS X 0201 and allow use of JIS X 0201 kana from G1 without
593+
* escape sequences, using Shift Out and Shift In or setting the eighth bit
594+
* (GR-invoked), respectively."
595+
*
596+
* Note that we support both the 'JIS7' use of 0xE/0xF Shift Out/Shift In codes
597+
* and the 'JIS8' use of GR-invoked Kana */
587598
*out++ = 0xFEC0 + c;
588599
} else {
589600
*out++ = MBFL_BAD_INPUT;
@@ -731,6 +742,13 @@ static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool
731742
buf->state = ASCII;
732743
}
733744
out = mb_convert_buf_add(out, s);
745+
} else if (s >= 0xA1 && s <= 0xDF) {
746+
if (buf->state != JISX_0201_KANA) {
747+
MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
748+
out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
749+
buf->state = JISX_0201_KANA;
750+
}
751+
out = mb_convert_buf_add(out, s & 0x7F);
734752
} else if (s < 0x8080) { /* JIS X 0208 */
735753
if (buf->state != JISX_0208) {
736754
MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);

ext/mbstring/tests/iso2022jp_encoding.phpt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,23 @@ testValidString("\x20\x3E", "\x1B\$B!1\x1B(B", 'UTF-16BE', 'ISO-2022-JP', false)
218218

219219
echo "Other mappings from Unicode -> ISO-2022-JP are OK\n";
220220

221+
// Single bytes from 0xA1-0xDF can be used to encode kana in JIS8
222+
$grInvoked = [
223+
"\xA3" => "\x1B(I\x23\x1B(B",
224+
"\xB1" => "\x1B(I\x31\x1B(B",
225+
"\xC2" => "\x1B(I\x42\x1B(B",
226+
"\xDF" => "\x1B(I\x5F\x1B(B"
227+
];
228+
foreach ($grInvoked as $gr => $jisx) {
229+
// JISX 0201 is used as the canonical form for outputting kana
230+
testValidString($gr, $jisx, 'JIS', 'JIS', false);
231+
if (mb_convert_encoding($gr, 'UTF-16BE', 'JIS') !== mb_convert_encoding($jisx, 'UTF-16BE', 'JIS'))
232+
die("Equivalent GR byte and JISX 0201 sequence do not decode to the same codepoint");
233+
}
234+
235+
echo "GR-invoked kana support OK\n";
236+
237+
// Check handling of BOM
221238
convertInvalidString("\xFF\xFE", "%", "UTF-16BE", "JIS", false);
222239
convertInvalidString("\xFF\xFE", "%", "UTF-16BE", "ISO-2022-JP", false);
223240

@@ -239,4 +256,5 @@ JIS X 0208 support OK
239256
JIS X 0212 support OK
240257
All escape sequences work as expected
241258
Other mappings from Unicode -> ISO-2022-JP are OK
259+
GR-invoked kana support OK
242260
Done!

0 commit comments

Comments
 (0)