Skip to content

Commit 6fdf447

Browse files
committed
Make strtolower() and strtoupper() do ASCII case conversion
Implement RFC https://wiki.php.net/rfc/strtolower-ascii
1 parent 1959bbf commit 6fdf447

12 files changed

+318
-865
lines changed

UPGRADING

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,14 @@ PHP 8.2 UPGRADE NOTES
1919
1. Backward Incompatible Changes
2020
========================================
2121

22+
- Standard:
23+
. strtolower() and strtoupper() are no longer locale-sensitive. They now
24+
perform ASCII case conversion, as if the locale were "C". Use
25+
mb_strtolower() if you want localized case conversion. Similarly, stristr(),
26+
stripos(), strripos(), lcfirst(), ucfirst(), ucwords(), str_ireplace(),
27+
array_change_key_case() and sorting with SORT_FLAG_CASE use ASCII case
28+
conversion.
29+
2230
========================================
2331
2. New Features
2432
========================================

Zend/tests/lc_ctype_inheritance.phpt

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,30 @@ LC_CTYPE=de_DE
99
--FILE--
1010
<?php
1111

12+
echo "inherited\n";
13+
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
14+
echo 'preg_match(\w, \xe4): ';
15+
var_dump(preg_match('/\w/', "\xe4"));
16+
1217
var_dump(setlocale(LC_CTYPE, "0"));
13-
var_dump(bin2hex(strtoupper("\xe4")));
18+
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
19+
echo 'preg_match(\w, \xe4): ';
1420
var_dump(preg_match('/\w/', "\xe4"));
21+
22+
echo "de_DE\n";
1523
var_dump(setlocale(LC_CTYPE, "de_DE", "de-DE") !== false);
16-
var_dump(bin2hex(strtoupper("\xe4")));
24+
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
25+
echo 'preg_match(\w, \xe4): ';
1726
var_dump(preg_match('/\w/', "\xe4"));
1827
?>
1928
--EXPECTF--
29+
inherited
30+
ctype_lower(\xe4): n
31+
preg_match(\w, \xe4): int(0)
2032
string(%d) "C%r(\.UTF-8)?%r"
21-
string(2) "e4"
22-
int(0)
33+
ctype_lower(\xe4): n
34+
preg_match(\w, \xe4): int(0)
35+
de_DE
2336
bool(true)
24-
string(2) "c4"
25-
int(1)
37+
ctype_lower(\xe4): y
38+
preg_match(\w, \xe4): int(1)

Zend/zend_operators.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,9 @@ ZEND_API const unsigned char zend_toupper_map[256] = {
126126
* Functions using locale lowercase:
127127
zend_binary_strncasecmp_l
128128
zend_binary_strcasecmp_l
129+
* Functions using ascii lowercase:
129130
string_compare_function_ex
130131
string_case_compare_function
131-
* Functions using ascii lowercase:
132132
zend_str_tolower_copy
133133
zend_str_tolower_dup
134134
zend_str_tolower
@@ -1997,7 +1997,7 @@ ZEND_API int ZEND_FASTCALL string_compare_function_ex(zval *op1, zval *op2, bool
19971997
int ret;
19981998

19991999
if (case_insensitive) {
2000-
ret = zend_binary_strcasecmp_l(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
2000+
ret = zend_binary_strcasecmp(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
20012001
} else {
20022002
ret = zend_binary_strcmp(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str2));
20032003
}
@@ -2037,13 +2037,13 @@ ZEND_API int ZEND_FASTCALL string_case_compare_function(zval *op1, zval *op2) /*
20372037
if (Z_STR_P(op1) == Z_STR_P(op2)) {
20382038
return 0;
20392039
} else {
2040-
return zend_binary_strcasecmp_l(Z_STRVAL_P(op1), Z_STRLEN_P(op1), Z_STRVAL_P(op2), Z_STRLEN_P(op2));
2040+
return zend_binary_strcasecmp(Z_STRVAL_P(op1), Z_STRLEN_P(op1), Z_STRVAL_P(op2), Z_STRLEN_P(op2));
20412041
}
20422042
} else {
20432043
zend_string *tmp_str1, *tmp_str2;
20442044
zend_string *str1 = zval_get_tmp_string(op1, &tmp_str1);
20452045
zend_string *str2 = zval_get_tmp_string(op2, &tmp_str2);
2046-
int ret = zend_binary_strcasecmp_l(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
2046+
int ret = zend_binary_strcasecmp(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
20472047

20482048
zend_tmp_string_release(tmp_str1);
20492049
zend_tmp_string_release(tmp_str2);

ext/pdo_dblib/dblib_stmt.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
#include "php.h"
2323
#include "php_ini.h"
24-
#include "ext/standard/php_string.h"
2524
#include "ext/standard/info.h"
2625
#include "pdo/php_pdo.h"
2726
#include "pdo/php_pdo_driver.h"

ext/standard/string.c

Lines changed: 25 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,52 +1346,15 @@ PHP_FUNCTION(strtok)
13461346
/* {{{ php_strtoupper */
13471347
PHPAPI char *php_strtoupper(char *s, size_t len)
13481348
{
1349-
unsigned char *c;
1350-
const unsigned char *e;
1351-
1352-
c = (unsigned char *)s;
1353-
e = (unsigned char *)c+len;
1354-
1355-
while (c < e) {
1356-
*c = toupper(*c);
1357-
c++;
1358-
}
1349+
zend_str_toupper(s, len);
13591350
return s;
13601351
}
13611352
/* }}} */
13621353

13631354
/* {{{ php_string_toupper */
13641355
PHPAPI zend_string *php_string_toupper(zend_string *s)
13651356
{
1366-
unsigned char *c;
1367-
const unsigned char *e;
1368-
1369-
if (EXPECTED(!BG(ctype_string))) {
1370-
return zend_string_toupper(s);
1371-
}
1372-
c = (unsigned char *)ZSTR_VAL(s);
1373-
e = c + ZSTR_LEN(s);
1374-
1375-
while (c < e) {
1376-
if (islower(*c)) {
1377-
unsigned char *r;
1378-
zend_string *res = zend_string_alloc(ZSTR_LEN(s), 0);
1379-
1380-
if (c != (unsigned char*)ZSTR_VAL(s)) {
1381-
memcpy(ZSTR_VAL(res), ZSTR_VAL(s), c - (unsigned char*)ZSTR_VAL(s));
1382-
}
1383-
r = c + (ZSTR_VAL(res) - ZSTR_VAL(s));
1384-
while (c < e) {
1385-
*r = toupper(*c);
1386-
r++;
1387-
c++;
1388-
}
1389-
*r = '\0';
1390-
return res;
1391-
}
1392-
c++;
1393-
}
1394-
return zend_string_copy(s);
1357+
return zend_string_toupper(s);
13951358
}
13961359
/* }}} */
13971360

@@ -1404,56 +1367,22 @@ PHP_FUNCTION(strtoupper)
14041367
Z_PARAM_STR(arg)
14051368
ZEND_PARSE_PARAMETERS_END();
14061369

1407-
RETURN_STR(php_string_toupper(arg));
1370+
RETURN_STR(zend_string_toupper(arg));
14081371
}
14091372
/* }}} */
14101373

14111374
/* {{{ php_strtolower */
14121375
PHPAPI char *php_strtolower(char *s, size_t len)
14131376
{
1414-
unsigned char *c;
1415-
const unsigned char *e;
1416-
1417-
c = (unsigned char *)s;
1418-
e = c+len;
1419-
1420-
while (c < e) {
1421-
*c = tolower(*c);
1422-
c++;
1423-
}
1377+
zend_str_tolower(s, len);
14241378
return s;
14251379
}
14261380
/* }}} */
14271381

14281382
/* {{{ php_string_tolower */
14291383
PHPAPI zend_string *php_string_tolower(zend_string *s)
14301384
{
1431-
if (EXPECTED(!BG(ctype_string))) {
1432-
return zend_string_tolower(s);
1433-
}
1434-
1435-
unsigned char *c = (unsigned char *)ZSTR_VAL(s);
1436-
const unsigned char *e = c + ZSTR_LEN(s);
1437-
while (c < e) {
1438-
if (isupper(*c)) {
1439-
unsigned char *r;
1440-
zend_string *res = zend_string_alloc(ZSTR_LEN(s), 0);
1441-
1442-
if (c != (unsigned char*)ZSTR_VAL(s)) {
1443-
memcpy(ZSTR_VAL(res), ZSTR_VAL(s), c - (unsigned char*)ZSTR_VAL(s));
1444-
}
1445-
r = c + (ZSTR_VAL(res) - ZSTR_VAL(s));
1446-
while (c < e) {
1447-
*r = tolower(*c);
1448-
r++;
1449-
c++;
1450-
}
1451-
*r = '\0';
1452-
return res;
1453-
}
1454-
c++;
1455-
}
1456-
return zend_string_copy(s);
1385+
return zend_string_tolower(s);
14571386
}
14581387
/* }}} */
14591388

@@ -1466,7 +1395,7 @@ PHP_FUNCTION(strtolower)
14661395
Z_PARAM_STR(str)
14671396
ZEND_PARSE_PARAMETERS_END();
14681397

1469-
RETURN_STR(php_string_tolower(str));
1398+
RETURN_STR(zend_string_tolower(str));
14701399
}
14711400
/* }}} */
14721401

@@ -1758,8 +1687,8 @@ PHP_FUNCTION(pathinfo)
17581687
case insensitive strstr */
17591688
PHPAPI char *php_stristr(char *s, char *t, size_t s_len, size_t t_len)
17601689
{
1761-
php_strtolower(s, s_len);
1762-
php_strtolower(t, t_len);
1690+
zend_str_tolower(s, s_len);
1691+
zend_str_tolower(t, t_len);
17631692
return (char*)php_memnstr(s, t, t_len, s + s_len);
17641693
}
17651694
/* }}} */
@@ -1982,8 +1911,8 @@ PHP_FUNCTION(stripos)
19821911
RETURN_FALSE;
19831912
}
19841913

1985-
haystack_dup = php_string_tolower(haystack);
1986-
needle_dup = php_string_tolower(needle);
1914+
haystack_dup = zend_string_tolower(haystack);
1915+
needle_dup = zend_string_tolower(needle);
19871916
found = (char*)php_memnstr(ZSTR_VAL(haystack_dup) + offset,
19881917
ZSTR_VAL(needle_dup), ZSTR_LEN(needle_dup), ZSTR_VAL(haystack_dup) + ZSTR_LEN(haystack));
19891918

@@ -2077,18 +2006,17 @@ PHP_FUNCTION(strripos)
20772006
}
20782007
e = ZSTR_VAL(haystack) + (ZSTR_LEN(haystack) + (size_t)offset);
20792008
}
2080-
/* Borrow that ord_needle buffer to avoid repeatedly tolower()ing needle */
2081-
lowered = tolower(*ZSTR_VAL(needle));
2009+
lowered = zend_tolower_ascii(*ZSTR_VAL(needle));
20822010
while (e >= p) {
2083-
if (tolower(*e) == lowered) {
2011+
if (zend_tolower_ascii(*e) == lowered) {
20842012
RETURN_LONG(e - p + (offset > 0 ? offset : 0));
20852013
}
20862014
e--;
20872015
}
20882016
RETURN_FALSE;
20892017
}
20902018

2091-
haystack_dup = php_string_tolower(haystack);
2019+
haystack_dup = zend_string_tolower(haystack);
20922020
if (offset >= 0) {
20932021
if ((size_t)offset > ZSTR_LEN(haystack)) {
20942022
zend_string_release_ex(haystack_dup, 0);
@@ -2112,7 +2040,7 @@ PHP_FUNCTION(strripos)
21122040
}
21132041
}
21142042

2115-
needle_dup = php_string_tolower(needle);
2043+
needle_dup = zend_string_tolower(needle);
21162044
if ((found = (char *)zend_memnrstr(p, ZSTR_VAL(needle_dup), ZSTR_LEN(needle_dup), e))) {
21172045
RETVAL_LONG(found - ZSTR_VAL(haystack_dup));
21182046
zend_string_release_ex(needle_dup, 0);
@@ -2647,7 +2575,7 @@ PHP_FUNCTION(chr)
26472575
static zend_string* php_ucfirst(zend_string *str)
26482576
{
26492577
const unsigned char ch = ZSTR_VAL(str)[0];
2650-
unsigned char r = toupper(ch);
2578+
unsigned char r = zend_toupper_ascii(ch);
26512579
if (r == ch) {
26522580
return zend_string_copy(str);
26532581
} else {
@@ -2679,7 +2607,7 @@ PHP_FUNCTION(ucfirst)
26792607
Lowercase the first character of the word in a native string */
26802608
static zend_string* php_lcfirst(zend_string *str)
26812609
{
2682-
unsigned char r = tolower(ZSTR_VAL(str)[0]);
2610+
unsigned char r = zend_tolower_ascii(ZSTR_VAL(str)[0]);
26832611
if (r == ZSTR_VAL(str)[0]) {
26842612
return zend_string_copy(str);
26852613
} else {
@@ -2732,10 +2660,10 @@ PHP_FUNCTION(ucwords)
27322660
ZVAL_STRINGL(return_value, ZSTR_VAL(str), ZSTR_LEN(str));
27332661
r = Z_STRVAL_P(return_value);
27342662

2735-
*r = toupper((unsigned char) *r);
2663+
*r = zend_toupper_ascii((unsigned char) *r);
27362664
for (r_end = r + Z_STRLEN_P(return_value) - 1; r < r_end; ) {
27372665
if (mask[(unsigned char)*r++]) {
2738-
*r = toupper((unsigned char) *r);
2666+
*r = zend_toupper_ascii((unsigned char) *r);
27392667
}
27402668
}
27412669
}
@@ -3067,11 +2995,11 @@ static zend_string* php_char_to_str_ex(zend_string *str, char from, char *to, si
30672995
if (case_sensitivity) {
30682996
char_count = count_chars(ZSTR_VAL(str), ZSTR_LEN(str), from);
30692997
} else {
3070-
lc_from = tolower(from);
30712998
char_count = 0;
2999+
lc_from = zend_tolower_ascii(from);
30723000
source_end = ZSTR_VAL(str) + ZSTR_LEN(str);
30733001
for (source = ZSTR_VAL(str); source < source_end; source++) {
3074-
if (tolower(*source) == lc_from) {
3002+
if (zend_tolower_ascii(*source) == lc_from) {
30753003
char_count++;
30763004
}
30773005
}
@@ -3111,7 +3039,7 @@ static zend_string* php_char_to_str_ex(zend_string *str, char from, char *to, si
31113039
} else {
31123040
source_end = ZSTR_VAL(str) + ZSTR_LEN(str);
31133041
for (source = ZSTR_VAL(str); source < source_end; source++) {
3114-
if (tolower(*source) == lc_from) {
3042+
if (zend_tolower_ascii(*source) == lc_from) {
31153043
memcpy(target, to, to_len);
31163044
target += to_len;
31173045
} else {
@@ -4345,7 +4273,7 @@ static zend_long php_str_replace_in_subject(
43454273
zend_long old_replace_count = replace_count;
43464274

43474275
if (!lc_subject_str) {
4348-
lc_subject_str = php_string_tolower(subject_str);
4276+
lc_subject_str = zend_string_tolower(subject_str);
43494277
}
43504278
tmp_result = php_str_to_str_i_ex(subject_str, ZSTR_VAL(lc_subject_str),
43514279
search_str, replace_value, replace_len, &replace_count);
@@ -4398,7 +4326,7 @@ static zend_long php_str_replace_in_subject(
43984326
ZSTR_VAL(search_str), ZSTR_LEN(search_str),
43994327
ZSTR_VAL(replace_str), ZSTR_LEN(replace_str), &replace_count));
44004328
} else {
4401-
lc_subject_str = php_string_tolower(subject_str);
4329+
lc_subject_str = zend_string_tolower(subject_str);
44024330
ZVAL_STR(result, php_str_to_str_i_ex(subject_str, ZSTR_VAL(lc_subject_str),
44034331
search_str, ZSTR_VAL(replace_str), ZSTR_LEN(replace_str), &replace_count));
44044332
zend_string_release_ex(lc_subject_str, 0);
@@ -4941,7 +4869,7 @@ int php_tag_find(char *tag, size_t len, const char *set) {
49414869

49424870
n = norm;
49434871
t = tag;
4944-
c = tolower(*t);
4872+
c = zend_tolower_ascii(*t);
49454873
/*
49464874
normalize the tag removing leading and trailing whitespace
49474875
and turn any <a whatever...> into just <a> and any </tag>
@@ -4969,7 +4897,7 @@ int php_tag_find(char *tag, size_t len, const char *set) {
49694897
}
49704898
break;
49714899
}
4972-
c = tolower(*(++t));
4900+
c = zend_tolower_ascii(*(++t));
49734901
}
49744902
*(n++) = '>';
49754903
*n = '\0';

ext/standard/tests/strings/bug79986.phpt

Lines changed: 0 additions & 13 deletions
This file was deleted.

0 commit comments

Comments
 (0)