Skip to content

Mark certain strings as valid utf8 based on operations #10463

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions Zend/zend_operators.c
Original file line number Diff line number Diff line change
Expand Up @@ -1887,7 +1887,7 @@ ZEND_API zend_result ZEND_FASTCALL shift_right_function(zval *result, zval *op1,

ZEND_API zend_result ZEND_FASTCALL concat_function(zval *result, zval *op1, zval *op2) /* {{{ */
{
zval *orig_op1 = op1;
zval *orig_op1 = op1;
zval op1_copy, op2_copy;

ZVAL_UNDEF(&op1_copy);
Expand Down Expand Up @@ -1955,6 +1955,11 @@ ZEND_API zend_result ZEND_FASTCALL concat_function(zval *result, zval *op1, zval
size_t op2_len = Z_STRLEN_P(op2);
size_t result_len = op1_len + op2_len;
zend_string *result_str;
uint32_t flags = 0;

if (ZSTR_IS_VALID_UTF8(Z_STR_P(op1)) && ZSTR_IS_VALID_UTF8(Z_STR_P(op2))) {
flags = IS_STR_VALID_UTF8;
}

if (UNEXPECTED(op1_len > ZSTR_MAX_LEN - op2_len)) {
zend_throw_error(NULL, "String size overflow");
Expand All @@ -1976,6 +1981,7 @@ ZEND_API zend_result ZEND_FASTCALL concat_function(zval *result, zval *op1, zval
i_zval_ptr_dtor(result);
}
}
GC_ADD_FLAGS(result_str, flags);

/* This has to happen first to account for the cases where result == op1 == op2 and
* the realloc is done. In this case this line will also update Z_STRVAL_P(op2) to
Expand Down Expand Up @@ -3218,7 +3224,9 @@ ZEND_API zend_string* ZEND_FASTCALL zend_long_to_str(zend_long num) /* {{{ */
} else {
char buf[MAX_LENGTH_OF_LONG + 1];
char *res = zend_print_long_to_buf(buf + sizeof(buf) - 1, num);
return zend_string_init(res, buf + sizeof(buf) - 1 - res, 0);
zend_string *str = zend_string_init(res, buf + sizeof(buf) - 1 - res, 0);
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this assumption valid even for files with arbitrary charsets/encodings, and arbitrary settings of zend.multibyte? Genuinely asking, since I've never used anything but UTF-8.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if I totally understand the question.

AFAIK, numbers in UTF-16 would be encoded over two bytes. But looking at the implementation of zend_print_long_to_buf() it seems to just create an ASCII string and doesn't care about zend_multibyte.

However, my understanding of what zend_multibyte does is to convert the current script file into an internal encoding to process the file, which is most likely going to be UTF-8. But that's a good question I don't know the answer. But it seems like we assume anyway that the encoding is going to be ASCII compatible

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it looks like these functions always return ASCII strings.

I don't think this is related at all to files with arbitrary charsets; yes, string literals in such files might use some other text encoding, but this is not about string literals. There is no reason why PHP would use zend_long_to_str to create string objects for string literals embedded in a source file.

return str;
}
}
/* }}} */
Expand All @@ -3230,7 +3238,9 @@ ZEND_API zend_string* ZEND_FASTCALL zend_ulong_to_str(zend_ulong num)
} else {
char buf[MAX_LENGTH_OF_LONG + 1];
char *res = zend_print_ulong_to_buf(buf + sizeof(buf) - 1, num);
return zend_string_init(res, buf + sizeof(buf) - 1 - res, 0);
zend_string *str = zend_string_init(res, buf + sizeof(buf) - 1 - res, 0);
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
return str;
}
}

Expand Down Expand Up @@ -3272,7 +3282,9 @@ ZEND_API zend_string* ZEND_FASTCALL zend_u64_to_str(uint64_t num)
} else {
char buf[20 + 1];
char *res = zend_print_u64_to_buf(buf + sizeof(buf) - 1, num);
return zend_string_init(res, buf + sizeof(buf) - 1 - res, 0);
zend_string *str = zend_string_init(res, buf + sizeof(buf) - 1 - res, 0);
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
return str;
}
}

Expand All @@ -3283,7 +3295,9 @@ ZEND_API zend_string* ZEND_FASTCALL zend_i64_to_str(int64_t num)
} else {
char buf[20 + 1];
char *res = zend_print_i64_to_buf(buf + sizeof(buf) - 1, num);
return zend_string_init(res, buf + sizeof(buf) - 1 - res, 0);
zend_string *str = zend_string_init(res, buf + sizeof(buf) - 1 - res, 0);
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
return str;
}
}

Expand All @@ -3293,7 +3307,9 @@ ZEND_API zend_string* ZEND_FASTCALL zend_double_to_str(double num)
/* Model snprintf precision behavior. */
int precision = (int) EG(precision);
zend_gcvt(num, precision ? precision : 1, '.', 'E', buf);
return zend_string_init(buf, strlen(buf), 0);
zend_string *str = zend_string_init(buf, strlen(buf), 0);
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
return str;
}

ZEND_API zend_uchar ZEND_FASTCALL is_numeric_str_function(const zend_string *str, zend_long *lval, double *dval) /* {{{ */
Expand Down
23 changes: 15 additions & 8 deletions Zend/zend_string.c
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,19 @@ ZEND_API zend_string* ZEND_FASTCALL zend_interned_string_find_permanent(zend_str
return zend_interned_string_ht_lookup(str, &interned_strings_permanent);
}

static zend_string* ZEND_FASTCALL zend_init_string_for_interning(zend_string *str, bool persistent) {
uint32_t flags = 0;
if (ZSTR_IS_VALID_UTF8(str)) {
flags = IS_STR_VALID_UTF8;
}
zend_ulong h = ZSTR_H(str);
zend_string_delref(str);
str = zend_string_init(ZSTR_VAL(str), ZSTR_LEN(str), persistent);
GC_ADD_FLAGS(str, flags);
ZSTR_H(str) = h;
return str;
}

static zend_string* ZEND_FASTCALL zend_new_interned_string_permanent(zend_string *str)
{
zend_string *ret;
Expand All @@ -208,10 +221,7 @@ static zend_string* ZEND_FASTCALL zend_new_interned_string_permanent(zend_string

ZEND_ASSERT(GC_FLAGS(str) & GC_PERSISTENT);
if (GC_REFCOUNT(str) > 1) {
zend_ulong h = ZSTR_H(str);
zend_string_delref(str);
str = zend_string_init(ZSTR_VAL(str), ZSTR_LEN(str), 1);
ZSTR_H(str) = h;
str = zend_init_string_for_interning(str, true);
}

return zend_add_interned_string(str, &interned_strings_permanent, IS_STR_PERMANENT);
Expand Down Expand Up @@ -249,10 +259,7 @@ static zend_string* ZEND_FASTCALL zend_new_interned_string_request(zend_string *
}
#endif
if (GC_REFCOUNT(str) > 1) {
zend_ulong h = ZSTR_H(str);
zend_string_delref(str);
str = zend_string_init(ZSTR_VAL(str), ZSTR_LEN(str), 0);
ZSTR_H(str) = h;
str = zend_init_string_for_interning(str, false);
}

ret = zend_add_interned_string(str, &CG(interned_strings), 0);
Expand Down
1 change: 1 addition & 0 deletions Zend/zend_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ END_EXTERN_C()
/*---*/

#define ZSTR_IS_INTERNED(s) (GC_FLAGS(s) & IS_STR_INTERNED)
#define ZSTR_IS_VALID_UTF8(s) (GC_FLAGS(s) & IS_STR_VALID_UTF8)

#define ZSTR_EMPTY_ALLOC() zend_empty_string
#define ZSTR_CHAR(c) zend_one_char_string[c]
Expand Down
18 changes: 18 additions & 0 deletions Zend/zend_vm_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,11 @@ ZEND_VM_HANDLER(8, ZEND_CONCAT, CONST|TMPVAR|CV, CONST|TMPVAR|CV, SPEC(NO_CONST_
zend_string *op1_str = Z_STR_P(op1);
zend_string *op2_str = Z_STR_P(op2);
zend_string *str;
uint32_t flags = 0;

if (ZSTR_IS_VALID_UTF8(op1_str) && ZSTR_IS_VALID_UTF8(op2_str)) {
flags = IS_STR_VALID_UTF8;
}

if (OP1_TYPE != IS_CONST && UNEXPECTED(ZSTR_LEN(op1_str) == 0)) {
if (OP2_TYPE == IS_CONST || OP2_TYPE == IS_CV) {
Expand Down Expand Up @@ -412,6 +417,7 @@ ZEND_VM_HANDLER(8, ZEND_CONCAT, CONST|TMPVAR|CV, CONST|TMPVAR|CV, SPEC(NO_CONST_
}
str = zend_string_extend(op1_str, len + ZSTR_LEN(op2_str), 0);
memcpy(ZSTR_VAL(str) + len, ZSTR_VAL(op2_str), ZSTR_LEN(op2_str)+1);
GC_ADD_FLAGS(str, flags);
ZVAL_NEW_STR(EX_VAR(opline->result.var), str);
if (OP2_TYPE & (IS_TMP_VAR|IS_VAR)) {
zend_string_release_ex(op2_str, 0);
Expand All @@ -420,6 +426,7 @@ ZEND_VM_HANDLER(8, ZEND_CONCAT, CONST|TMPVAR|CV, CONST|TMPVAR|CV, SPEC(NO_CONST_
str = zend_string_alloc(ZSTR_LEN(op1_str) + ZSTR_LEN(op2_str), 0);
memcpy(ZSTR_VAL(str), ZSTR_VAL(op1_str), ZSTR_LEN(op1_str));
memcpy(ZSTR_VAL(str) + ZSTR_LEN(op1_str), ZSTR_VAL(op2_str), ZSTR_LEN(op2_str)+1);
GC_ADD_FLAGS(str, flags);
ZVAL_NEW_STR(EX_VAR(opline->result.var), str);
if (OP1_TYPE & (IS_TMP_VAR|IS_VAR)) {
zend_string_release_ex(op1_str, 0);
Expand Down Expand Up @@ -3140,6 +3147,11 @@ ZEND_VM_COLD_CONSTCONST_HANDLER(53, ZEND_FAST_CONCAT, CONST|TMPVAR|CV, CONST|TMP
zend_string *op1_str = Z_STR_P(op1);
zend_string *op2_str = Z_STR_P(op2);
zend_string *str;
uint32_t flags = 0;

if (ZSTR_IS_VALID_UTF8(op1_str) && ZSTR_IS_VALID_UTF8(op2_str)) {
flags = IS_STR_VALID_UTF8;
}

if (OP1_TYPE != IS_CONST && UNEXPECTED(ZSTR_LEN(op1_str) == 0)) {
if (OP2_TYPE == IS_CONST || OP2_TYPE == IS_CV) {
Expand All @@ -3165,6 +3177,7 @@ ZEND_VM_COLD_CONSTCONST_HANDLER(53, ZEND_FAST_CONCAT, CONST|TMPVAR|CV, CONST|TMP

str = zend_string_extend(op1_str, len + ZSTR_LEN(op2_str), 0);
memcpy(ZSTR_VAL(str) + len, ZSTR_VAL(op2_str), ZSTR_LEN(op2_str)+1);
GC_ADD_FLAGS(str, flags);
ZVAL_NEW_STR(EX_VAR(opline->result.var), str);
if (OP2_TYPE & (IS_TMP_VAR|IS_VAR)) {
zend_string_release_ex(op2_str, 0);
Expand All @@ -3173,6 +3186,7 @@ ZEND_VM_COLD_CONSTCONST_HANDLER(53, ZEND_FAST_CONCAT, CONST|TMPVAR|CV, CONST|TMP
str = zend_string_alloc(ZSTR_LEN(op1_str) + ZSTR_LEN(op2_str), 0);
memcpy(ZSTR_VAL(str), ZSTR_VAL(op1_str), ZSTR_LEN(op1_str));
memcpy(ZSTR_VAL(str) + ZSTR_LEN(op1_str), ZSTR_VAL(op2_str), ZSTR_LEN(op2_str)+1);
GC_ADD_FLAGS(str, flags);
ZVAL_NEW_STR(EX_VAR(opline->result.var), str);
if (OP1_TYPE & (IS_TMP_VAR|IS_VAR)) {
zend_string_release_ex(op1_str, 0);
Expand Down Expand Up @@ -3233,6 +3247,10 @@ ZEND_VM_COLD_CONSTCONST_HANDLER(53, ZEND_FAST_CONCAT, CONST|TMPVAR|CV, CONST|TMP
str = zend_string_alloc(ZSTR_LEN(op1_str) + ZSTR_LEN(op2_str), 0);
memcpy(ZSTR_VAL(str), ZSTR_VAL(op1_str), ZSTR_LEN(op1_str));
memcpy(ZSTR_VAL(str) + ZSTR_LEN(op1_str), ZSTR_VAL(op2_str), ZSTR_LEN(op2_str)+1);

if (ZSTR_IS_VALID_UTF8(op1_str) && ZSTR_IS_VALID_UTF8(op2_str)) {
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
}
ZVAL_NEW_STR(EX_VAR(opline->result.var), str);
if (OP1_TYPE != IS_CONST) {
zend_string_release_ex(op1_str, 0);
Expand Down
Loading