Skip to content

Commit a1c1ee6

Browse files
committed
Don't use opaque for encoding detection score
The opaque field is also used by the htmlentities filter, so storing the detection score there means we end up trying to free the score value as if it were a pointer. Rather than being overly tricky, simply allocate a separate structure to hold the number of illegal characters and the score.
1 parent d2ccea1 commit a1c1ee6

File tree

3 files changed

+27
-14
lines changed

3 files changed

+27
-14
lines changed

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -285,24 +285,23 @@ size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
285285
/*
286286
* encoding detector
287287
*/
288-
static int mbfl_estimate_encoding_likelihood(int c, void* data)
288+
static int mbfl_estimate_encoding_likelihood(int c, void *void_data)
289289
{
290-
mbfl_convert_filter *filter = *((mbfl_convert_filter**)data);
291-
uintptr_t *score = (uintptr_t*)(&filter->opaque);
290+
mbfl_encoding_detector_data *data = void_data;
292291

293292
/* Receive wchars decoded from test string using candidate encoding
294293
* If the test string was invalid in the candidate encoding, we assume
295294
* it's the wrong one. */
296295
if (c & MBFL_WCSGROUP_THROUGH) {
297-
filter->num_illegalchar++;
296+
data->num_illegalchars++;
298297
} else if (php_unicode_is_cntrl(c) || php_unicode_is_private(c)) {
299298
/* Otherwise, count how many control characters and 'private use'
300299
* codepoints we see. Those are rarely used and may indicate that
301300
* the candidate encoding is not the right one. */
302-
*score += 10;
301+
data->score += 10;
303302
} else if (php_unicode_is_punct(c)) {
304303
/* Punctuation is also less common than letters/digits */
305-
(*score)++;
304+
data->score++;
306305
}
307306
return c;
308307
}
@@ -315,14 +314,14 @@ mbfl_encoding_detector *mbfl_encoding_detector_new(const mbfl_encoding **elist,
315314

316315
mbfl_encoding_detector *identd = emalloc(sizeof(mbfl_encoding_detector));
317316
identd->filter_list = ecalloc(elistsz, sizeof(mbfl_convert_filter*));
317+
identd->filter_data = ecalloc(elistsz, sizeof(mbfl_encoding_detector_data));
318318

319319
int filter_list_size = 0;
320320
for (int i = 0; i < elistsz; i++) {
321321
mbfl_convert_filter *filter = mbfl_convert_filter_new(elist[i], &mbfl_encoding_wchar,
322-
mbfl_estimate_encoding_likelihood, NULL, &identd->filter_list[i]);
322+
mbfl_estimate_encoding_likelihood, NULL, &identd->filter_data[filter_list_size]);
323323
if (filter) {
324324
identd->filter_list[filter_list_size++] = filter;
325-
filter->opaque = (void*)0;
326325
}
327326
}
328327
identd->filter_list_size = filter_list_size;
@@ -336,6 +335,7 @@ void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd)
336335
mbfl_convert_filter_delete(identd->filter_list[i]);
337336
}
338337
efree(identd->filter_list);
338+
efree(identd->filter_data);
339339
efree(identd);
340340
}
341341

@@ -351,7 +351,7 @@ int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *str
351351
mbfl_convert_filter *filter = identd->filter_list[i];
352352
if (!filter->num_illegalchar) {
353353
(*filter->filter_function)(*p, filter);
354-
if (filter->num_illegalchar) {
354+
if (identd->filter_data[i].num_illegalchars) {
355355
bad++;
356356
}
357357
}
@@ -374,14 +374,15 @@ int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *str
374374

375375
const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd)
376376
{
377-
uintptr_t best_score = UINT_MAX; /* Low score is 'better' */
377+
size_t best_score = SIZE_MAX; /* Low score is 'better' */
378378
const mbfl_encoding *enc = NULL;
379379

380380
for (int i = 0; i < identd->filter_list_size; i++) {
381381
mbfl_convert_filter *filter = identd->filter_list[i];
382-
if (!filter->num_illegalchar && (uintptr_t)filter->opaque < best_score) {
382+
mbfl_encoding_detector_data *data = &identd->filter_data[i];
383+
if (!data->num_illegalchars && data->score < best_score) {
383384
enc = filter->from;
384-
best_score = (uintptr_t)filter->opaque;
385+
best_score = data->score;
385386
}
386387
}
387388

ext/mbstring/libmbfl/mbfl/mbfilter.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,14 @@ MBFLAPI extern size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd);
154154
*/
155155
typedef struct _mbfl_encoding_detector mbfl_encoding_detector;
156156

157+
typedef struct {
158+
size_t num_illegalchars;
159+
size_t score;
160+
} mbfl_encoding_detector_data;
161+
157162
struct _mbfl_encoding_detector {
158163
mbfl_convert_filter **filter_list;
164+
mbfl_encoding_detector_data *filter_data;
159165
int filter_list_size;
160166
int strict;
161167
};

ext/mbstring/tests/bug81298.phpt

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,16 @@ Bug #81298: mb_detect_encoding() segfaults when 7bit encoding is specified
33
--FILE--
44
<?php
55

6-
var_dump(mb_detect_encoding("foobar", "7bit"));
7-
var_dump(mb_detect_encoding("foobar", "7bit,ascii"));
6+
var_dump(mb_detect_encoding("foobar.", "7bit"));
7+
var_dump(mb_detect_encoding("foobar.", "7bit,ascii"));
8+
var_dump(mb_detect_encoding("foobar.", "7bit,ascii,utf8"));
9+
var_dump(mb_detect_encoding("foobar.", "html"));
10+
var_dump(mb_detect_encoding("foobar.", "ascii,html"));
811

912
?>
1013
--EXPECT--
1114
bool(false)
1215
string(5) "ASCII"
16+
string(5) "ASCII"
17+
string(13) "HTML-ENTITIES"
18+
string(5) "ASCII"

0 commit comments

Comments
 (0)