Skip to content

Commit 0c9181b

Browse files
committed
Add function in zend_test to check UTF8 flag is added
Also add test to check what strings are marked as having the flag
1 parent 0b9fb63 commit 0c9181b

File tree

5 files changed

+290
-1
lines changed

5 files changed

+290
-1
lines changed

ext/zend_test/test.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,17 @@ static ZEND_FUNCTION(zend_test_zend_ini_str)
424424
RETURN_STR(ZT_G(str_test));
425425
}
426426

427+
/* Test helper exposed to userland: takes exactly one string argument and
 * returns, as a bool, the result of ZSTR_IS_VALID_UTF8() on that string. */
static ZEND_FUNCTION(zend_test_is_string_marked_as_valid_utf8)
428+
{
429+
zend_string *str;
430+
431+
ZEND_PARSE_PARAMETERS_START(1, 1)
432+
Z_PARAM_STR(str)
433+
ZEND_PARSE_PARAMETERS_END();
434+
435+
RETURN_BOOL(ZSTR_IS_VALID_UTF8(str));
436+
}
437+
427438
static ZEND_FUNCTION(ZendTestNS2_namespaced_func)
428439
{
429440
ZEND_PARSE_PARAMETERS_NONE();

ext/zend_test/test.stub.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ function zend_test_zend_ini_str(): string {}
178178
function zend_test_zend_call_stack_get(): ?array {}
179179
function zend_test_zend_call_stack_use_all(): int {}
180180
#endif
181+
182+
// Stub for the zend_test helper that takes one string and returns a bool (valid-UTF-8 marker check).
function zend_test_is_string_marked_as_valid_utf8(string $string): bool {}
181183
}
182184

183185
namespace ZendTestNS {

ext/zend_test/test_arginfo.h

Lines changed: 7 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
--TEST--
2+
Check that strings are marked as valid UTF-8
3+
--EXTENSIONS--
4+
zend_test
5+
--FILE--
6+
<?php
7+
echo "Empty strings:\n";
8+
$s = "";
9+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
10+
11+
echo "Known strings:\n";
12+
$s = "c";
13+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
14+
15+
echo "Integer cast to string:\n";
16+
$i = 2563;
17+
$s = (string) $i;
18+
var_dump($s);
19+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
20+
21+
echo "Float cast to string:\n";
22+
$f = 26.7;
23+
$s = (string) $f;
24+
var_dump($s);
25+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
26+
$f = 2e100;
27+
$s = (string) $f;
28+
var_dump($s);
29+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
30+
31+
echo "Concatenation known valid UTF-8 strings in variables:\n";
32+
$s1 = "f";
33+
$s2 = "o";
34+
$s = $s1 . $s2;
35+
var_dump($s);
36+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
37+
38+
echo "Multiple concatenation known valid UTF-8 strings in variables:\n";
39+
$s1 = "f";
40+
$s2 = "o";
41+
$s = $s1 . $s2 . $s2;
42+
var_dump($s);
43+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
44+
45+
echo "Concatenation known valid UTF-8 in assignment:\n";
46+
$s = "f" . "o";
47+
var_dump($s);
48+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
49+
50+
echo "Multiple concatenation known valid UTF-8 in assignment:\n";
51+
$s = "f" . "o" . "o";
52+
var_dump($s);
53+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
54+
55+
echo "Concatenation known valid UTF-8 string with empty string in variables:\n";
56+
$s1 = "f";
57+
$s2 = "";
58+
$s = $s1 . $s2;
59+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
60+
$s1 = "f";
61+
$s2 = "";
62+
$s = $s2 . $s1;
63+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
64+
65+
echo "Concatenation known valid UTF-8 string with empty string in assignment:\n";
66+
$s = "f" . "";
67+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
68+
$s = "" . "f";
69+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
70+
71+
echo "Concatenation in loop:\n";
72+
const COPY_TIMES = 10_000;
73+
$string = "a";
74+
75+
$string_concat = $string;
76+
for ($i = 1; $i < COPY_TIMES; $i++) {
77+
$string_concat = $string_concat . $string;
78+
}
79+
var_dump(zend_test_is_string_marked_as_valid_utf8($string_concat));
80+
81+
echo "Concatenation in loop (compound assignment):\n";
82+
$string = "a";
83+
84+
$string_concat = $string;
85+
for ($i = 1; $i < COPY_TIMES; $i++) {
86+
$string_concat .= $string;
87+
}
88+
var_dump(zend_test_is_string_marked_as_valid_utf8($string_concat));
89+
90+
echo "Concatenation of objects:\n";
91+
// Minimal class whose __toString() always returns "z"; lets the test concatenate objects.
class ToString {
92+
public function __toString() : string{
93+
return "z";
94+
}
95+
}
96+
$o = new ToString();
97+
$s = $o . $o;
98+
var_dump($s);
99+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
100+
101+
?>
102+
--EXPECT--
103+
Empty strings:
104+
bool(true)
105+
Known strings:
106+
bool(true)
107+
Integer cast to string:
108+
string(4) "2563"
109+
bool(false)
110+
Float cast to string:
111+
string(4) "26.7"
112+
bool(false)
113+
string(8) "2.0E+100"
114+
bool(false)
115+
Concatenation known valid UTF-8 strings in variables:
116+
string(2) "fo"
117+
bool(false)
118+
Multiple concatenation known valid UTF-8 strings in variables:
119+
string(3) "foo"
120+
bool(false)
121+
Concatenation known valid UTF-8 in assignment:
122+
string(2) "fo"
123+
bool(false)
124+
Multiple concatenation known valid UTF-8 in assignment:
125+
string(3) "foo"
126+
bool(false)
127+
Concatenation known valid UTF-8 string with empty string in variables:
128+
bool(true)
129+
bool(true)
130+
Concatenation known valid UTF-8 string with empty string in assignment:
131+
bool(true)
132+
bool(true)
133+
Concatenation in loop:
134+
bool(false)
135+
Concatenation in loop (compound assignment):
136+
bool(false)
137+
Concatenation of objects:
138+
string(2) "zz"
139+
bool(false)
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
--TEST--
2+
Check that invalid UTF-8 strings are NOT marked as valid UTF-8
3+
--EXTENSIONS--
4+
zend_test
5+
--FILE--
6+
<?php
7+
// Invalid 2 Octet Sequence
8+
$non_utf8 = "\xc3\x28";
9+
10+
echo "Integer cast to string concatenated to invalid UTF-8:\n";
11+
$i = 2563;
12+
$s = (string) $i;
13+
$s .= "\xc3\x28";
14+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
15+
16+
echo "Float cast to string concatenated to invalid UTF-8:\n";
17+
$f = 26.7;
18+
$s = (string) $f;
19+
$s .= "\xc3\x28";
20+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
21+
$f = 2e100;
22+
$s = (string) $f;
23+
$s .= "\xc3\x28";
24+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
25+
26+
echo "Concatenation known valid UTF-8 strings in variables, followed by concatenation of invalid UTF-8:\n";
27+
$s1 = "f";
28+
$s2 = "o";
29+
$s = $s1 . $s2;
30+
$s = $s . $non_utf8;
31+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
32+
33+
echo "Multiple concatenation known valid UTF-8 strings in variables, followed by concatenation of invalid UTF-8:\n";
34+
$s1 = "f";
35+
$s2 = "o";
36+
$s = $s1 . $s2 . $non_utf8;
37+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
38+
39+
echo "Concatenation known valid UTF-8 with invalid UTF-8 in assignment:\n";
40+
$s = "f" . "\xc3\x28";
41+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
42+
43+
// The "foo" string matches with a "Foo" class which is registered by the zend_test extension.
44+
// That class name does not have the "valid UTF-8" flag because class names in general
45+
// don't have to be UTF-8. As the "foo" string here goes through the interning logic,
46+
// the string gets replaced by the "foo" string from the class, which does
47+
// not have the "valid UTF-8" flag. We therefore choose a different test case: "fxo".
48+
// The previous "foo" test case works because it is not interned. NOTE(review): no "fxo" case appears in this test; the case below builds "f" . "o" . "\xc3\x28", so this comment looks misplaced -- confirm.
49+
echo "Multiple concatenation known valid UTF-8 and invalid UTF-8 in assignment:\n";
50+
$s = "f" . "o" . "\xc3\x28";
51+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
52+
53+
echo "Concatenation known valid UTF-8 string with empty string in variables, followed by concatenation of invalid UTF-8:\n";
54+
$s1 = "f";
55+
$s2 = "";
56+
$s = $s1 . $s2;
57+
$s = $s . $non_utf8;
58+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
59+
$s1 = "f";
60+
$s2 = "";
61+
$s = $s2 . $s1;
62+
$s = $s . $non_utf8;
63+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
64+
65+
echo "Concatenation known valid UTF-8 string with empty string in assignment, followed by concatenation of invalid UTF-8:\n";
66+
$s = "f" . "";
67+
$s = $s . $non_utf8;
68+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
69+
$s = "" . "f";
70+
$s = $s . $non_utf8;
71+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
72+
73+
echo "Concatenation in loop:\n";
74+
const COPY_TIMES = 10_000;
75+
$string = "a";
76+
77+
$string_concat = $string;
78+
for ($i = 1; $i < COPY_TIMES; $i++) {
79+
$string_concat = $string_concat . $string;
80+
}
81+
$string_concat = $string_concat . $non_utf8;
82+
var_dump(zend_test_is_string_marked_as_valid_utf8($string_concat));
83+
84+
echo "Concatenation in loop (compound assignment):\n";
85+
$string = "a";
86+
87+
$string_concat = $string;
88+
for ($i = 1; $i < COPY_TIMES; $i++) {
89+
$string_concat .= $string;
90+
}
91+
$string_concat = $string_concat . $non_utf8;
92+
var_dump(zend_test_is_string_marked_as_valid_utf8($string_concat));
93+
94+
echo "Concatenation of objects:\n";
95+
// Minimal class whose __toString() always returns "z"; lets the test concatenate objects.
class ToString {
96+
public function __toString() : string{
97+
return "z";
98+
}
99+
}
100+
$o = new ToString();
101+
$s = $o . $o;
102+
$s = $s . $non_utf8;
103+
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
104+
105+
?>
106+
--EXPECT--
107+
Integer cast to string concatenated to invalid UTF-8:
108+
bool(false)
109+
Float cast to string concatenated to invalid UTF-8:
110+
bool(false)
111+
bool(false)
112+
Concatenation known valid UTF-8 strings in variables, followed by concatenation of invalid UTF-8:
113+
bool(false)
114+
Multiple concatenation known valid UTF-8 strings in variables, followed by concatenation of invalid UTF-8:
115+
bool(false)
116+
Concatenation known valid UTF-8 with invalid UTF-8 in assignment:
117+
bool(false)
118+
Multiple concatenation known valid UTF-8 and invalid UTF-8 in assignment:
119+
bool(false)
120+
Concatenation known valid UTF-8 string with empty string in variables, followed by concatenation of invalid UTF-8:
121+
bool(false)
122+
bool(false)
123+
Concatenation known valid UTF-8 string with empty string in assignment, followed by concatenation of invalid UTF-8:
124+
bool(false)
125+
bool(false)
126+
Concatenation in loop:
127+
bool(false)
128+
Concatenation in loop (compound assignment):
129+
bool(false)
130+
Concatenation of objects:
131+
bool(false)

0 commit comments

Comments
 (0)