From 2a49922890dffe5d570093fe47a27da5030df4ee Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sat, 24 May 2025 14:52:06 +0200 Subject: [PATCH] Avoid string copies in ext/intl after string conversion Introduces intl_convert_utf8_to_utf16_zstr() to convert a UTF-8 string to a UTF-16 string zend_string* instance. This way we avoid a double copy later from a UChar* into a zend_string*. --- UPGRADING | 4 +++ ext/intl/collator/collator_convert.c | 37 ++++++--------------- ext/intl/intl_convert.c | 48 ++++++++++++++++++++++++++++ ext/intl/intl_convert.h | 4 +++ 4 files changed, 66 insertions(+), 27 deletions(-) diff --git a/UPGRADING b/UPGRADING index 400187a40f2c6..457bf58fda3d6 100644 --- a/UPGRADING +++ b/UPGRADING @@ -523,6 +523,10 @@ PHP 8.5 UPGRADE NOTES . Add OPcode specialization for `=== []` and `!== []` comparisons. . Creating exception objects is now much faster. +- Intl: + . Now avoids creating extra string copies when converting strings + for use in the collator. + - ReflectionProperty: . Improved performance of the following methods: getValue(), getRawValue(), isInitialized(), setValue(), setRawValue(). diff --git a/ext/intl/collator/collator_convert.c b/ext/intl/collator/collator_convert.c index 463348f2a196b..26fdb0526eb9d 100644 --- a/ext/intl/collator/collator_convert.c +++ b/ext/intl/collator/collator_convert.c @@ -38,8 +38,6 @@ static void collator_convert_hash_item_from_utf8_to_utf16( { const char* old_val; size_t old_val_len; - UChar* new_val = NULL; - int32_t new_val_len = 0; zval znew_val; /* Process string values only. */ @@ -49,17 +47,13 @@ static void collator_convert_hash_item_from_utf8_to_utf16( old_val = Z_STRVAL_P( hashData ); old_val_len = Z_STRLEN_P( hashData ); - /* Convert it from UTF-8 to UTF-16LE and save the result to new_val[_len]. */ - intl_convert_utf8_to_utf16( &new_val, &new_val_len, old_val, old_val_len, status ); + /* Convert it from UTF-8 to UTF-16LE. */ + zend_string *zstr = intl_convert_utf8_to_utf16_zstr( old_val, old_val_len, status ); if( U_FAILURE( *status ) ) return; /* Update current hash item with the converted value. */ - ZVAL_STRINGL( &znew_val, (char*)new_val, UBYTES(new_val_len + 1) ); - //??? - efree(new_val); - /* hack to fix use of initialized value */ - Z_STRLEN(znew_val) = Z_STRLEN(znew_val) - UBYTES(1); + ZVAL_NEW_STR( &znew_val, zstr ); if( hashKey) { @@ -176,23 +170,19 @@ zval* collator_convert_zstr_utf16_to_utf8( zval* utf16_zval, zval *rv ) zend_string *collator_convert_zstr_utf8_to_utf16(zend_string *utf8_str) { - UChar *ustr = NULL; - int32_t ustr_len = 0; UErrorCode status = U_ZERO_ERROR; /* Convert the string to UTF-16. */ - intl_convert_utf8_to_utf16( - &ustr, &ustr_len, + zend_string *zstr = intl_convert_utf8_to_utf16_zstr( ZSTR_VAL(utf8_str), ZSTR_LEN(utf8_str), &status); // FIXME Or throw error or use intl internal error handler if (U_FAILURE(status)) { php_error(E_WARNING, "Error casting object to string in collator_convert_zstr_utf8_to_utf16()"); + zstr = ZSTR_EMPTY_ALLOC(); } - zend_string *zstr = zend_string_init((char *) ustr, UBYTES(ustr_len), 0); - efree((char *)ustr); return zstr; } @@ -203,8 +193,6 @@ zval* collator_convert_object_to_string( zval* obj, zval *rv ) { zval* zstr = NULL; UErrorCode status = U_ZERO_ERROR; - UChar* ustr = NULL; - int32_t ustr_len = 0; /* Bail out if it's not an object. */ if( Z_TYPE_P( obj ) != IS_OBJECT ) @@ -229,25 +217,20 @@ zval* collator_convert_object_to_string( zval* obj, zval *rv ) } /* Convert the string to UTF-16. */ - intl_convert_utf8_to_utf16( - &ustr, &ustr_len, + zend_string *converted_str = intl_convert_utf8_to_utf16_zstr( Z_STRVAL_P( zstr ), Z_STRLEN_P( zstr ), &status ); // FIXME Or throw error or use intl internal error handler - if( U_FAILURE( status ) ) + if( U_FAILURE( status ) ) { php_error( E_WARNING, "Error casting object to string in collator_convert_object_to_string()" ); + converted_str = ZSTR_EMPTY_ALLOC(); + } /* Cleanup zstr to hold utf16 string. */ zval_ptr_dtor_str( zstr ); /* Set string. */ - ZVAL_STRINGL( zstr, (char*)ustr, UBYTES(ustr_len)); - //??? - efree((char *)ustr); - - /* Don't free ustr cause it's set in zstr without copy. - * efree( ustr ); - */ + ZVAL_STR( zstr, converted_str ); return zstr; } diff --git a/ext/intl/intl_convert.c b/ext/intl/intl_convert.c index 3514b81ffe7a6..39f70821e117d 100644 --- a/ext/intl/intl_convert.c +++ b/ext/intl/intl_convert.c @@ -104,6 +104,54 @@ void intl_convert_utf8_to_utf16( } /* }}} */ +/* Convert given string from UTF-8 to UTF-16 to a zend_string* + * + * @param src String to convert. + * @param src_len Length of the source string. + * @param status Conversion status. + * + * @return zend_string* on success and NULL on failure + */ +zend_string *intl_convert_utf8_to_utf16_zstr( + const char* src, size_t src_len, + UErrorCode* status ) +{ + int32_t dst_len = 0; + + *status = U_ZERO_ERROR; + + if(src_len > INT32_MAX) { + /* we cannot fit this string */ + *status = U_BUFFER_OVERFLOW_ERROR; + return NULL; + } + + /* Pre-flight */ + u_strFromUTF8( NULL, 0, &dst_len, src, (int32_t)src_len, status ); + if( *status != U_BUFFER_OVERFLOW_ERROR && *status != U_STRING_NOT_TERMINATED_WARNING ) + return NULL; + + /* Note: l=sizeof(UChar)-1 because we need sizeof(UChar) bytes for the NUL terminator instead of 1. */ + zend_string *dst = zend_string_safe_alloc(sizeof(UChar), dst_len, sizeof(UChar) - 1, false); + UChar *dst_buf = (UChar *) ZSTR_VAL(dst); + ZEND_ASSERT((ZSTR_LEN(dst) - 1) / 2 == dst_len); + /* However, the length must not include the NUL terminator that we included previously. */ + ZSTR_LEN(dst)--; + + /* Convert source string from UTF-8 to UTF-16. */ + *status = U_ZERO_ERROR; + u_strFromUTF8( dst_buf, dst_len + 1, NULL, src, src_len, status ); + if( U_FAILURE( *status ) ) + { + zend_string_efree( dst ); + return NULL; + } + + dst_buf[dst_len] = 0; + + return dst; +} + /* {{{ intl_convert_utf16_to_utf8 * Convert given string from UTF-16 to UTF-8. * diff --git a/ext/intl/intl_convert.h b/ext/intl/intl_convert.h index 5ea4adbe579e7..5cc3a671333dc 100644 --- a/ext/intl/intl_convert.h +++ b/ext/intl/intl_convert.h @@ -23,6 +23,10 @@ void intl_convert_utf8_to_utf16( const char* src, size_t src_len, UErrorCode* status ); +zend_string *intl_convert_utf8_to_utf16_zstr( + const char* src, size_t src_len, + UErrorCode* status ); + zend_string* intl_convert_utf16_to_utf8( const UChar* src, int32_t src_len, UErrorCode* status );