@@ -175,6 +175,9 @@ static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf
175
175
176
176
static int mbfl_filt_conv_utf16_wchar_flush (mbfl_convert_filter * filter );
177
177
static size_t mb_utf16_to_wchar (unsigned char * * in , size_t * in_len , uint32_t * buf , size_t bufsize , unsigned int * state );
178
+ static zend_string * mb_cut_utf16 (unsigned char * str , size_t from , size_t len , unsigned char * end );
179
+ static zend_string * mb_cut_utf16be (unsigned char * str , size_t from , size_t len , unsigned char * end );
180
+ static zend_string * mb_cut_utf16le (unsigned char * str , size_t from , size_t len , unsigned char * end );
178
181
179
182
static const char * mbfl_encoding_utf16_aliases [] = {"utf16" , NULL };
180
183
@@ -190,7 +193,7 @@ const mbfl_encoding mbfl_encoding_utf16 = {
190
193
mb_utf16_to_wchar ,
191
194
mb_wchar_to_utf16be ,
192
195
NULL ,
193
- NULL ,
196
+ mb_cut_utf16
194
197
};
195
198
196
199
const mbfl_encoding mbfl_encoding_utf16be = {
@@ -205,7 +208,7 @@ const mbfl_encoding mbfl_encoding_utf16be = {
205
208
mb_utf16be_to_wchar ,
206
209
mb_wchar_to_utf16be ,
207
210
NULL ,
208
- NULL ,
211
+ mb_cut_utf16be
209
212
};
210
213
211
214
const mbfl_encoding mbfl_encoding_utf16le = {
@@ -220,7 +223,7 @@ const mbfl_encoding mbfl_encoding_utf16le = {
220
223
mb_utf16le_to_wchar ,
221
224
mb_wchar_to_utf16le ,
222
225
NULL ,
223
- NULL ,
226
+ mb_cut_utf16le
224
227
};
225
228
226
229
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
@@ -1043,3 +1046,89 @@ static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *b
1043
1046
}
1044
1047
1045
1048
#endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
1049
+
1050
+ static zend_string * mb_cut_utf16be (unsigned char * str , size_t from , size_t len , unsigned char * end )
1051
+ {
1052
+ if (len > end - (str + from )) {
1053
+ len = end - (str + from );
1054
+ }
1055
+ from &= ~1 ;
1056
+ len &= ~1 ;
1057
+ unsigned char * start = str + from ;
1058
+ if (len < 2 || (end - start ) < 2 ) {
1059
+ return zend_empty_string ;
1060
+ }
1061
+ /* Check if 1st codepoint is 2nd part of surrogate pair */
1062
+ if (from > 0 ) {
1063
+ uint32_t start_cp = (* start << 8 ) + * (start + 1 );
1064
+ if (start_cp >= 0xDC00 && start_cp <= 0xDFFF ) {
1065
+ uint32_t preceding_cp = (* (start - 2 ) << 8 ) + * (start - 1 );
1066
+ if (preceding_cp >= 0xD800 && preceding_cp <= 0xDBFF ) {
1067
+ from -= 2 ;
1068
+ }
1069
+ }
1070
+ }
1071
+ /* Same for ending cut point */
1072
+ unsigned char * _end = start + len ;
1073
+ if (_end > end ) {
1074
+ _end = end ;
1075
+ }
1076
+ uint32_t ending_cp = (* (_end - 2 ) << 8 ) + * (_end - 1 );
1077
+ if (ending_cp >= 0xD800 && ending_cp <= 0xDBFF ) {
1078
+ _end -= 2 ;
1079
+ }
1080
+ return zend_string_init_fast ((char * )start , _end - start );
1081
+ }
1082
+
1083
+ static zend_string * mb_cut_utf16le (unsigned char * str , size_t from , size_t len , unsigned char * end )
1084
+ {
1085
+ if (len > end - (str + from )) {
1086
+ len = end - (str + from );
1087
+ }
1088
+ from &= ~1 ;
1089
+ len &= ~1 ;
1090
+ unsigned char * start = str + from ;
1091
+ if (len < 2 || (end - start ) < 2 ) {
1092
+ return zend_empty_string ;
1093
+ }
1094
+ /* Check if 1st codepoint is 2nd part of surrogate pair */
1095
+ if (from > 0 ) {
1096
+ uint32_t start_cp = (* (start + 1 ) << 8 ) + * start ;
1097
+ if (start_cp >= 0xDC00 && start_cp <= 0xDFFF ) {
1098
+ uint32_t preceding_cp = (* (start - 1 ) << 8 ) + * (start - 2 );
1099
+ if (preceding_cp >= 0xD800 && preceding_cp <= 0xDBFF ) {
1100
+ from -= 2 ;
1101
+ }
1102
+ }
1103
+ }
1104
+ /* Same for ending cut point */
1105
+ unsigned char * _end = start + len ;
1106
+ if (_end > end ) {
1107
+ _end = end ;
1108
+ }
1109
+ uint32_t ending_cp = (* (_end - 1 ) << 8 ) + * (_end - 2 );
1110
+ if (ending_cp >= 0xD800 && ending_cp <= 0xDBFF ) {
1111
+ _end -= 2 ;
1112
+ }
1113
+ return zend_string_init_fast ((char * )start , _end - start );
1114
+ }
1115
+
1116
+ static zend_string * mb_cut_utf16 (unsigned char * str , size_t from , size_t len , unsigned char * end )
1117
+ {
1118
+ if (len < 2 || (end - str ) < 2 ) {
1119
+ return zend_empty_string ;
1120
+ }
1121
+ uint32_t cp = (* str << 8 ) + * (str + 1 );
1122
+ if (cp == 0xFFFE ) {
1123
+ /* Little-endian BOM */
1124
+ if (from < 2 ) {
1125
+ from = 2 ;
1126
+ }
1127
+ return mb_cut_utf16le (str , from , len , end );
1128
+ } else {
1129
+ if (cp == 0xFEFF && from < 2 ) {
1130
+ from = 2 ;
1131
+ }
1132
+ return mb_cut_utf16be (str , from , len , end );
1133
+ }
1134
+ }
0 commit comments