Skip to content

Commit 6719cc7

Browse files
committed
Make strtolower() and strtoupper() do ASCII case conversion
Implement RFC https://wiki.php.net/rfc/strtolower-ascii
1 parent e69fb48 commit 6719cc7

25 files changed

+513
-901
lines changed

UPGRADING

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@ PHP 8.2 UPGRADE NOTES
1919
1. Backward Incompatible Changes
2020
========================================
2121

22+
- Standard:
23+
. strtolower() and strtoupper() are no longer locale-sensitive. They now
24+
perform ASCII case conversion. Use mb_strtolower() or mb_strtoupper() for
25+
localized case conversion. Similarly, stristr(), stripos(), strripos(),
26+
ucfirst(), ucwords(), str_ireplace() and strip_tags() now use ASCII case conversion.
27+
2228
========================================
2329
2. New Features
2430
========================================

Zend/tests/lc_ctype_inheritance.phpt

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,30 @@ LC_CTYPE=de_DE
99
--FILE--
1010
<?php
1111

12+
echo "inherited\n";
13+
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
14+
echo 'preg_match(\w, \xe4): ';
15+
var_dump(preg_match('/\w/', "\xe4"));
16+
1217
var_dump(setlocale(LC_CTYPE, "0"));
13-
var_dump(bin2hex(strtoupper("\xe4")));
18+
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
19+
echo 'preg_match(\w, \xe4): ';
1420
var_dump(preg_match('/\w/', "\xe4"));
21+
22+
echo "de_DE\n";
1523
var_dump(setlocale(LC_CTYPE, "de_DE", "de-DE") !== false);
16-
var_dump(bin2hex(strtoupper("\xe4")));
24+
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
25+
echo 'preg_match(\w, \xe4): ';
1726
var_dump(preg_match('/\w/', "\xe4"));
1827
?>
1928
--EXPECT--
29+
inherited
30+
ctype_lower(\xe4): n
31+
preg_match(\w, \xe4): int(0)
2032
string(1) "C"
21-
string(2) "e4"
22-
int(0)
33+
ctype_lower(\xe4): n
34+
preg_match(\w, \xe4): int(0)
35+
de_DE
2336
bool(true)
24-
string(2) "c4"
25-
int(1)
37+
ctype_lower(\xe4): y
38+
preg_match(\w, \xe4): int(1)

Zend/zend_operators.c

Lines changed: 168 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,27 @@ static const unsigned char tolower_map[256] = {
7373
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
7474
};
7575

76-
#define zend_tolower_ascii(c) (tolower_map[(unsigned char)(c)])
76+
/* Byte-indexed upper-casing table: entries 0x61..0x7a ('a'..'z') hold
 * 0x41..0x5a ('A'..'Z'); every other entry holds its own index, so bytes
 * outside 'a'..'z' (including 0x80..0xff) are left unchanged. */
static const unsigned char toupper_map[256] = {
77+
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
78+
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
79+
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
80+
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
81+
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
82+
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
83+
0x60,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
84+
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x7b,0x7c,0x7d,0x7e,0x7f,
85+
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
86+
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
87+
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
88+
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
89+
0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
90+
0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
91+
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
92+
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
93+
};
94+
95+
/* Internal table lookup: casts c to unsigned char and indexes tolower_map. */
#define i_zend_tolower_ascii(c) (tolower_map[(unsigned char)(c)])
96+
/* Internal table lookup: casts c to unsigned char and indexes toupper_map. */
#define i_zend_toupper_ascii(c) (toupper_map[(unsigned char)(c)])
7797

7898
/**
7999
* Functions using locale lowercase:
@@ -2686,11 +2706,51 @@ static zend_always_inline void zend_str_tolower_impl(char *dest, const char *str
26862706
}
26872707
#endif
26882708
while (p < end) {
2689-
*q++ = zend_tolower_ascii(*p++);
2709+
*q++ = i_zend_tolower_ascii(*p++);
26902710
}
26912711
}
26922712
/* }}} */
26932713

2714+
/* Upper-case `length` bytes of `str` into `dest` using ASCII-only rules.
 *
 * Only the bytes 'a'..'z' are changed; every other byte value is copied
 * through as-is. No terminating NUL is written. `dest` may be the very same
 * buffer as `str` (in-place conversion): each position is read before it is
 * written.
 */
static zend_always_inline void zend_str_toupper_impl(char *dest, const char *str, size_t length) /* {{{ */ {
	const unsigned char *p = (const unsigned char*)str;
	const unsigned char *end = p + length;
	unsigned char *q = (unsigned char*)dest;
#ifdef __SSE2__
	if (length >= 16) {
		/* The byte compares are signed, so bytes >= 0x80 are negative and
		 * never fall inside the open interval ('a' - 1, 'z' + 1). */
		const __m128i _a = _mm_set1_epi8('a' - 1);
		const __m128i z_ = _mm_set1_epi8('z' + 1);
		const __m128i delta = _mm_set1_epi8('a' - 'A');
		do {
			__m128i op = _mm_loadu_si128((const __m128i*)p);
			__m128i gt = _mm_cmpgt_epi8(op, _a);
			__m128i lt = _mm_cmplt_epi8(op, z_);
			/* 0xff in every lane that holds a lowercase ASCII letter. */
			__m128i mingle = _mm_and_si128(gt, lt);
			__m128i sub = _mm_and_si128(mingle, delta);
			__m128i upper = _mm_sub_epi8(op, sub);
			_mm_storeu_si128((__m128i *)q, upper);
			p += 16;
			q += 16;
		/* Compare the remaining byte count instead of forming `p + 16`,
		 * which could point beyond one-past-the-end of the buffer. */
		} while ((size_t)(end - p) >= 16);
	}
#endif
	/* Scalar tail (or the whole string when SSE2 is unavailable). */
	while (p < end) {
		*q++ = i_zend_toupper_ascii(*p++);
	}
}
/* }}} */
2741+
2742+
/* Exported function form of the ASCII lower-case table lookup. The argument
 * is reduced to unsigned char before indexing, so any int value is accepted. */
ZEND_API int ZEND_FASTCALL zend_tolower_ascii(int c) /* {{{ */
{
	const int lowered = i_zend_tolower_ascii(c);

	return lowered;
}
/* }}} */
2747+
2748+
/* Exported function form of the ASCII upper-case table lookup. The argument
 * is reduced to unsigned char before indexing, so any int value is accepted. */
ZEND_API int ZEND_FASTCALL zend_toupper_ascii(int c) /* {{{ */
{
	const int raised = i_zend_toupper_ascii(c);

	return raised;
}
/* }}} */
2753+
26942754
ZEND_API char* ZEND_FASTCALL zend_str_tolower_copy(char *dest, const char *source, size_t length) /* {{{ */
26952755
{
26962756
zend_str_tolower_impl(dest, source, length);
@@ -2699,25 +2759,46 @@ ZEND_API char* ZEND_FASTCALL zend_str_tolower_copy(char *dest, const char *sourc
26992759
}
27002760
/* }}} */
27012761

2762+
/* Store the ASCII upper-cased form of source[0..length) in dest, write a NUL
 * at dest[length], and return dest. dest therefore needs length + 1 bytes. */
ZEND_API char* ZEND_FASTCALL zend_str_toupper_copy(char *dest, const char *source, size_t length) /* {{{ */
{
	char *const terminator = dest + length;

	zend_str_toupper_impl(dest, source, length);
	*terminator = '\0';

	return dest;
}
/* }}} */
2769+
27022770
/* Allocate length + 1 bytes with emalloc() and fill them with the
 * lower-cased, NUL-terminated copy produced by zend_str_tolower_copy(). */
ZEND_API char* ZEND_FASTCALL zend_str_tolower_dup(const char *source, size_t length) /* {{{ */
{
	char *buffer = (char *)emalloc(length+1);

	return zend_str_tolower_copy(buffer, source, length);
}
/* }}} */
27072775

2776+
/* Allocate length + 1 bytes with emalloc() and fill them with the
 * upper-cased, NUL-terminated copy produced by zend_str_toupper_copy(). */
ZEND_API char* ZEND_FASTCALL zend_str_toupper_dup(const char *source, size_t length) /* {{{ */
{
	char *buffer = (char *)emalloc(length+1);

	return zend_str_toupper_copy(buffer, source, length);
}
/* }}} */
2781+
27082782
/* Lower-case the first `length` bytes of `str` in place; the buffer serves
 * as both input and output of zend_str_tolower_impl(). */
ZEND_API void ZEND_FASTCALL zend_str_tolower(char *str, size_t length) /* {{{ */
{
	const char *input = (const char*)str;

	zend_str_tolower_impl(str, input, length);
}
/* }}} */
27132787

2788+
/* Upper-case the first `length` bytes of `str` in place; the buffer serves
 * as both input and output of zend_str_toupper_impl(). */
ZEND_API void ZEND_FASTCALL zend_str_toupper(char *str, size_t length) /* {{{ */
{
	const char *input = (const char*)str;

	zend_str_toupper_impl(str, input, length);
}
/* }}} */
2793+
2794+
27142795
ZEND_API char* ZEND_FASTCALL zend_str_tolower_dup_ex(const char *source, size_t length) /* {{{ */
27152796
{
27162797
const unsigned char *p = (const unsigned char*)source;
27172798
const unsigned char *end = p + length;
27182799

27192800
while (p < end) {
2720-
if (*p != zend_tolower_ascii(*p)) {
2801+
if (*p != i_zend_tolower_ascii(*p)) {
27212802
char *res = (char*)emalloc(length + 1);
27222803
unsigned char *r;
27232804

@@ -2735,6 +2816,30 @@ ZEND_API char* ZEND_FASTCALL zend_str_tolower_dup_ex(const char *source, size_t
27352816
}
27362817
/* }}} */
27372818

2819+
/* Return an emalloc()ed, NUL-terminated upper-cased copy of
 * source[0..length), or NULL when no byte of the input would be changed by
 * ASCII upper-casing (nothing is allocated in that case). */
ZEND_API char* ZEND_FASTCALL zend_str_toupper_dup_ex(const char *source, size_t length) /* {{{ */
{
	const unsigned char *p = (const unsigned char*)source;
	const unsigned char *end = p + length;

	while (p < end) {
		if (*p != i_zend_toupper_ascii(*p)) {
			/* Count of leading bytes that are already in upper-case form. */
			size_t prefix = (size_t)(p - (const unsigned char*)source);
			char *res = (char*)emalloc(length + 1);

			if (prefix != 0) {
				memcpy(res, source, prefix);
			}
			/* Address the output by offset into `res`: subtracting pointers
			 * that belong to two different allocations is undefined. */
			zend_str_toupper_impl(res + prefix, (const char*)p, (size_t)(end - p));
			res[length] = '\0';
			return res;
		}
		p++;
	}
	return NULL;
}
/* }}} */
2842+
27382843
ZEND_API zend_string* ZEND_FASTCALL zend_string_tolower_ex(zend_string *str, bool persistent) /* {{{ */
27392844
{
27402845
size_t length = ZSTR_LEN(str);
@@ -2771,13 +2876,67 @@ ZEND_API zend_string* ZEND_FASTCALL zend_string_tolower_ex(zend_string *str, boo
27712876
#endif
27722877

27732878
while (p < end) {
2774-
if (*p != zend_tolower_ascii(*p)) {
2879+
if (*p != i_zend_tolower_ascii(*p)) {
2880+
zend_string *res = zend_string_alloc(length, persistent);
2881+
memcpy(ZSTR_VAL(res), ZSTR_VAL(str), p - (unsigned char*) ZSTR_VAL(str));
2882+
2883+
unsigned char *q = p + (ZSTR_VAL(res) - ZSTR_VAL(str));
2884+
while (p < end) {
2885+
*q++ = i_zend_tolower_ascii(*p++);
2886+
}
2887+
ZSTR_VAL(res)[length] = '\0';
2888+
return res;
2889+
}
2890+
p++;
2891+
}
2892+
2893+
return zend_string_copy(str);
2894+
}
2895+
/* }}} */
2896+
2897+
ZEND_API zend_string* ZEND_FASTCALL zend_string_toupper_ex(zend_string *str, bool persistent) /* {{{ */
2898+
{
2899+
size_t length = ZSTR_LEN(str);
2900+
unsigned char *p = (unsigned char *) ZSTR_VAL(str);
2901+
unsigned char *end = p + length;
2902+
2903+
#ifdef __SSE2__
2904+
while (p + 16 <= end) {
2905+
const __m128i _a = _mm_set1_epi8('a' - 1);
2906+
const __m128i z_ = _mm_set1_epi8('z' + 1);
2907+
__m128i op = _mm_loadu_si128((__m128i*)p);
2908+
__m128i gt = _mm_cmpgt_epi8(op, _a);
2909+
__m128i lt = _mm_cmplt_epi8(op, z_);
2910+
__m128i mingle = _mm_and_si128(gt, lt);
2911+
if (_mm_movemask_epi8(mingle)) {
2912+
zend_string *res = zend_string_alloc(length, persistent);
2913+
memcpy(ZSTR_VAL(res), ZSTR_VAL(str), p - (unsigned char *) ZSTR_VAL(str));
2914+
unsigned char *q = p + (ZSTR_VAL(res) - ZSTR_VAL(str));
2915+
2916+
/* Uppercase the chunk we already compared. */
2917+
const __m128i delta = _mm_set1_epi8('a' - 'A');
2918+
__m128i add = _mm_and_si128(mingle, delta);
2919+
__m128i upper = _mm_sub_epi8(op, add);
2920+
_mm_storeu_si128((__m128i *) q, upper);
2921+
2922+
/* Uppercase the rest of the string. */
2923+
p += 16; q += 16;
2924+
zend_str_toupper_impl((char *) q, (const char *) p, end - p);
2925+
ZSTR_VAL(res)[length] = '\0';
2926+
return res;
2927+
}
2928+
p += 16;
2929+
}
2930+
#endif
2931+
2932+
while (p < end) {
2933+
if (*p != i_zend_toupper_ascii(*p)) {
27752934
zend_string *res = zend_string_alloc(length, persistent);
27762935
memcpy(ZSTR_VAL(res), ZSTR_VAL(str), p - (unsigned char*) ZSTR_VAL(str));
27772936

27782937
unsigned char *q = p + (ZSTR_VAL(res) - ZSTR_VAL(str));
27792938
while (p < end) {
2780-
*q++ = zend_tolower_ascii(*p++);
2939+
*q++ = i_zend_toupper_ascii(*p++);
27812940
}
27822941
ZSTR_VAL(res)[length] = '\0';
27832942
return res;
@@ -2832,8 +2991,8 @@ ZEND_API int ZEND_FASTCALL zend_binary_strcasecmp(const char *s1, size_t len1, c
28322991

28332992
len = MIN(len1, len2);
28342993
while (len--) {
2835-
c1 = zend_tolower_ascii(*(unsigned char *)s1++);
2836-
c2 = zend_tolower_ascii(*(unsigned char *)s2++);
2994+
c1 = i_zend_tolower_ascii(*(unsigned char *)s1++);
2995+
c2 = i_zend_tolower_ascii(*(unsigned char *)s2++);
28372996
if (c1 != c2) {
28382997
return c1 - c2;
28392998
}
@@ -2853,8 +3012,8 @@ ZEND_API int ZEND_FASTCALL zend_binary_strncasecmp(const char *s1, size_t len1,
28533012
}
28543013
len = MIN(length, MIN(len1, len2));
28553014
while (len--) {
2856-
c1 = zend_tolower_ascii(*(unsigned char *)s1++);
2857-
c2 = zend_tolower_ascii(*(unsigned char *)s2++);
3015+
c1 = i_zend_tolower_ascii(*(unsigned char *)s1++);
3016+
c2 = i_zend_tolower_ascii(*(unsigned char *)s2++);
28583017
if (c1 != c2) {
28593018
return c1 - c2;
28603019
}

Zend/zend_operators.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,13 +433,22 @@ ZEND_API int ZEND_FASTCALL string_compare_function(zval *op1, zval *op2);
433433
ZEND_API int ZEND_FASTCALL string_case_compare_function(zval *op1, zval *op2);
434434
ZEND_API int ZEND_FASTCALL string_locale_compare_function(zval *op1, zval *op2);
435435

436+
/* Case-conversion helpers; each toupper declaration mirrors the signature of
 * its tolower counterpart. */
ZEND_API int ZEND_FASTCALL zend_tolower_ascii(int c);
437+
ZEND_API int ZEND_FASTCALL zend_toupper_ascii(int c);
438+
436439
ZEND_API void ZEND_FASTCALL zend_str_tolower(char *str, size_t length);
440+
ZEND_API void ZEND_FASTCALL zend_str_toupper(char *str, size_t length);
437441
ZEND_API char* ZEND_FASTCALL zend_str_tolower_copy(char *dest, const char *source, size_t length);
442+
ZEND_API char* ZEND_FASTCALL zend_str_toupper_copy(char *dest, const char *source, size_t length);
438443
ZEND_API char* ZEND_FASTCALL zend_str_tolower_dup(const char *source, size_t length);
444+
ZEND_API char* ZEND_FASTCALL zend_str_toupper_dup(const char *source, size_t length);
439445
ZEND_API char* ZEND_FASTCALL zend_str_tolower_dup_ex(const char *source, size_t length);
446+
ZEND_API char* ZEND_FASTCALL zend_str_toupper_dup_ex(const char *source, size_t length);
440447
ZEND_API zend_string* ZEND_FASTCALL zend_string_tolower_ex(zend_string *str, bool persistent);
448+
ZEND_API zend_string* ZEND_FASTCALL zend_string_toupper_ex(zend_string *str, bool persistent);
441449

442450
/* Shorthand for zend_string_tolower_ex() with `persistent` set to 0. */
#define zend_string_tolower(str) zend_string_tolower_ex(str, 0)
451+
/* Shorthand for zend_string_toupper_ex() with `persistent` set to 0. */
#define zend_string_toupper(str) zend_string_toupper_ex(str, 0)
443452

444453
ZEND_API int ZEND_FASTCALL zend_binary_zval_strcmp(zval *s1, zval *s2);
445454
ZEND_API int ZEND_FASTCALL zend_binary_zval_strncmp(zval *s1, zval *s2, zval *s3);

ext/intl/grapheme/grapheme_string.c

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@
2727
#include <unicode/ustring.h>
2828
#include <unicode/ubrk.h>
2929

30-
#include "ext/standard/php_string.h"
31-
3230
/* }}} */
3331

3432
#define GRAPHEME_EXTRACT_TYPE_COUNT 0
@@ -179,9 +177,9 @@ PHP_FUNCTION(grapheme_stripos)
179177
char *haystack_dup, *needle_dup;
180178
int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
181179
needle_dup = estrndup(needle, needle_len);
182-
php_strtolower(needle_dup, needle_len);
180+
zend_str_tolower(needle_dup, needle_len);
183181
haystack_dup = estrndup(haystack, haystack_len);
184-
php_strtolower(haystack_dup, haystack_len);
182+
zend_str_tolower(haystack_dup, haystack_len);
185183

186184
found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
187185

@@ -295,9 +293,9 @@ PHP_FUNCTION(grapheme_strripos)
295293
char *needle_dup, *haystack_dup;
296294

297295
needle_dup = estrndup(needle, needle_len);
298-
php_strtolower(needle_dup, needle_len);
296+
zend_str_tolower(needle_dup, needle_len);
299297
haystack_dup = estrndup(haystack, haystack_len);
300-
php_strtolower(haystack_dup, haystack_len);
298+
zend_str_tolower(haystack_dup, haystack_len);
301299

302300
ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
303301

ext/ldap/ldap.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@
4545
#define __STDC__ 1
4646
#endif
4747

48-
#include "ext/standard/php_string.h"
4948
#include "ext/standard/info.h"
5049

5150
#ifdef HAVE_LDAP_SASL
@@ -2012,7 +2011,8 @@ PHP_FUNCTION(ldap_get_entries)
20122011
ldap_value_free_len(ldap_value);
20132012

20142013
attr_len = strlen(attribute);
2015-
zend_hash_str_update(Z_ARRVAL(tmp1), php_strtolower(attribute, attr_len), attr_len, &tmp2);
2014+
zend_str_tolower(attribute, attr_len);
2015+
zend_hash_str_update(Z_ARRVAL(tmp1), attribute, attr_len, &tmp2);
20162016
add_index_string(&tmp1, num_attrib, attribute);
20172017

20182018
num_attrib++;

ext/mbstring/mbstring.c

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3374,9 +3374,7 @@ static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t
33743374

33753375
if (fld_name != NULL && fld_val != NULL) {
33763376
zval val;
3377-
/* FIXME: some locale free implementation is
3378-
* really required here,,, */
3379-
php_strtoupper(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3377+
zend_str_toupper(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
33803378
ZVAL_STR(&val, fld_val);
33813379

33823380
zend_hash_update(ht, fld_name, &val);
@@ -3422,9 +3420,7 @@ static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t
34223420
}
34233421
if (fld_name != NULL && fld_val != NULL) {
34243422
zval val;
3425-
/* FIXME: some locale free implementation is
3426-
* really required here,,, */
3427-
php_strtoupper(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
3423+
zend_str_toupper(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
34283424
ZVAL_STR(&val, fld_val);
34293425

34303426
zend_hash_update(ht, fld_name, &val);

0 commit comments

Comments
 (0)