Skip to content

Commit f4a896c

Browse files
committed
- PHP uses a big-endian representation when it converts code unit sequences to integers in order to store the entity maps.
The code in traverse_for_entities assumed little-endian; this is now fixed. (In practice this does not matter, because there are no Unicode or entity mappings for multi-byte encodings other than UTF-8; for that reason the relevant code was commented out for performance reasons.)
1 parent 17dc181 commit f4a896c

File tree

1 file changed

+56
-13
lines changed

1 file changed

+56
-13
lines changed

ext/standard/html.c

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -456,7 +456,7 @@ static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
456456
/* }}} */
457457

458458
/* {{{ php_utf32_utf8 */
459-
size_t php_utf32_utf8(unsigned char *buf, int k)
459+
static size_t php_utf32_utf8(unsigned char *buf, unsigned k)
460460
{
461461
size_t retval = 0;
462462

@@ -487,6 +487,47 @@ size_t php_utf32_utf8(unsigned char *buf, int k)
487487
}
488488
/* }}} */
489489

490+
/* {{{ php_mb2_int_to_char
 * Unpack an integer holding one or two 8-bit code units (stored big endian,
 * i.e. the lead byte in the higher-order position) back into a byte sequence.
 *
 * buf: destination; the caller must provide room for at least two bytes.
 * k:   packed code units; must not exceed 0xFFFF (checked with assert()).
 *
 * Returns the number of bytes written to buf (1 or 2). */
static size_t php_mb2_int_to_char(unsigned char *buf, unsigned k)
{
	unsigned char *out = buf;

	assert(k <= 0xFFFFU);

	/* emit the lead byte only when the value does not fit in one unit */
	if (k > 0xFFU) {
		*out++ = k >> 8;
	}
	/* the low-order byte is always the last unit of the sequence */
	*out++ = k & 0xFFU;

	return (size_t)(out - buf);
}
/* }}} */
506+
507+
/* {{{ php_mb3_int_to_char
 * Unpack an integer holding one to three 8-bit code units (stored big endian,
 * i.e. the lead byte in the highest-order position) back into a byte sequence.
 * For EUC-JP.
 *
 * buf: destination; the caller must provide room for at least three bytes.
 * k:   packed code units; must not exceed 0xFFFFFF (checked with assert()).
 *
 * Returns the number of bytes written to buf (1, 2 or 3). */
static size_t php_mb3_int_to_char(unsigned char *buf, unsigned k)
{
	unsigned char *out = buf;

	assert(k <= 0xFFFFFFU);

	/* emit the units from most to least significant, skipping the
	 * leading ones that the value is too small to contain */
	if (k > 0xFFFFU) {
		*out++ = k >> 16;
	}
	if (k > 0xFFU) {
		*out++ = (k >> 8) & 0xFFU;
	}
	*out++ = k & 0xFFU;

	return (size_t)(out - buf);
}
/* }}} */
529+
530+
490531
/* {{{ unimap_bsearc_cmp
491532
* Binary search of unicode code points in unicode <--> charset mapping.
492533
* Returns the code point in the target charset (whose mapping table was given) or 0 if
@@ -817,21 +858,23 @@ static void traverse_for_entities(char *ret, int *retlen_p, int all, int quote_s
817858
case cs_big5hkscs:
818859
case cs_sjis:
819860
case cs_gb2312:
820-
/* one or two bytes */
821-
*(q++) = (code & 0xFFU);
822-
if (0xFF00U & code) { /* 2 */
823-
*(q++) = (code >> 8);
824-
}
861+
/* we don't have named entity or unicode mappings for these yet,
862+
* so we're guaranteed code <= 0xFF */
863+
#if 0
864+
q += php_mb2_int_to_char((unsigned char*)q, code);
865+
#else
866+
assert(code <= 0xFFU);
867+
*(q++) = code;
868+
#endif
825869
break;
826870

827871
case cs_eucjp:
828-
/* one to three bytes */
829-
*(q++) = code & 0xFFU;
830-
if (0xFFFF00U & code) { /* 2 */
831-
*(q++) = ((code >> 8) & 0xFFU);
832-
if (0xFF0000U & code) /* 3 */
833-
*(q++) = (code >> 16);
834-
}
872+
#if 0 /* idem */
873+
q += php_mb2_int_to_char((unsigned char*)q, code);
874+
#else
875+
assert(code <= 0xFFU);
876+
*(q++) = code;
877+
#endif
835878
break;
836879

837880
default:

0 commit comments

Comments
 (0)