@@ -104,17 +104,21 @@ zend_result dom_modern_document_implementation_read(dom_object *obj, zval *retva
104
104
105
105
/* Initialises the decoding/encoding context `ctx`.
 * In the added (+) lines of this hunk, decode_data and encode_data are both set
 * to the UTF-8 encoding data, the encoder is initialised over
 * ctx->encoding_output, the decoder is initialised over ctx->codepoints, and a
 * replacement sequence is set on both the encoder and the decoder.
 * The removed (-) lines previously set decode_data to NULL and fast_path to true.
 * The results of the lexbor init/replace_set calls are discarded via (void) casts. */
static void dom_decoding_encoding_ctx_init (dom_decoding_encoding_ctx * ctx )
106
106
{
107
- ctx -> encode_data = lxb_encoding_data (LXB_ENCODING_UTF_8 );
108
- ctx -> decode_data = NULL ;
109
- /* Set fast path on by default so that the decoder finishing is skipped if this was never initialised properly. */
110
- ctx -> fast_path = true;
107
+ ctx -> decode_data = ctx -> encode_data = lxb_encoding_data (LXB_ENCODING_UTF_8 );
111
108
(void ) lxb_encoding_encode_init (
112
109
& ctx -> encode ,
113
110
ctx -> encode_data ,
114
111
ctx -> encoding_output ,
115
112
sizeof (ctx -> encoding_output ) / sizeof (* ctx -> encoding_output )
116
113
);
117
114
(void ) lxb_encoding_encode_replace_set (& ctx -> encode , LXB_ENCODING_REPLACEMENT_BYTES , LXB_ENCODING_REPLACEMENT_SIZE );
115
+ (void ) lxb_encoding_decode_init (
116
+ & ctx -> decode ,
117
+ ctx -> decode_data ,
118
+ ctx -> codepoints ,
119
+ sizeof (ctx -> codepoints ) / sizeof (* ctx -> codepoints )
120
+ );
121
+ (void ) lxb_encoding_decode_replace_set (& ctx -> decode , LXB_ENCODING_REPLACEMENT_BUFFER , LXB_ENCODING_REPLACEMENT_BUFFER_LEN );
118
122
}
119
123
120
124
static const char * dom_lexbor_tokenizer_error_code_to_string (lxb_html_tokenizer_error_id_t id )
@@ -523,6 +527,8 @@ static bool dom_decode_encode_fast_path(
523
527
size_t * tree_error_offset
524
528
)
525
529
{
530
+ decoding_encoding_ctx -> decode .status = LXB_STATUS_OK ;
531
+
526
532
const lxb_char_t * buf_ref = * buf_ref_ref ;
527
533
const lxb_char_t * last_output = buf_ref ;
528
534
while (buf_ref != buf_end ) {
@@ -551,6 +557,17 @@ static bool dom_decode_encode_fast_path(
551
557
)) {
552
558
goto fail_oom ;
553
559
}
560
+
561
+ if (codepoint == LXB_ENCODING_DECODE_CONTINUE ) {
562
+ ZEND_ASSERT (buf_ref == buf_end );
563
+ /* The decoder needs more data but the entire buffer is consumed.
564
+ * All valid data has been output, and if the remaining data for the code point
565
+ * is invalid, the next call will output the replacement bytes. */
566
+ * buf_ref_ref = buf_ref ;
567
+ decoding_encoding_ctx -> decode .status = LXB_STATUS_CONTINUE ;
568
+ return true;
569
+ }
570
+
554
571
if (!dom_process_parse_chunk (
555
572
ctx ,
556
573
document ,
@@ -563,6 +580,7 @@ static bool dom_decode_encode_fast_path(
563
580
)) {
564
581
goto fail_oom ;
565
582
}
583
+
566
584
last_output = buf_ref ;
567
585
}
568
586
}
@@ -676,29 +694,22 @@ static bool dom_parse_decode_encode_finish(
676
694
size_t * tree_error_offset
677
695
)
678
696
{
679
- if (!decoding_encoding_ctx -> fast_path ) {
680
- /* Fast path handles codepoints one by one, so this part is not applicable in that case */
681
- (void ) lxb_encoding_decode_finish (& decoding_encoding_ctx -> decode );
682
- size_t decoding_buffer_size = lxb_encoding_decode_buf_used (& decoding_encoding_ctx -> decode );
683
- if (decoding_buffer_size > 0 ) {
684
- const lxb_codepoint_t * codepoints_ref = (const lxb_codepoint_t * ) decoding_encoding_ctx -> codepoints ;
685
- const lxb_codepoint_t * codepoints_end = codepoints_ref + decoding_buffer_size ;
686
- (void ) decoding_encoding_ctx -> encode_data -> encode (& decoding_encoding_ctx -> encode , & codepoints_ref , codepoints_end );
687
- if (!dom_process_parse_chunk (
688
- ctx ,
689
- document ,
690
- parser ,
691
- lxb_encoding_encode_buf_used (& decoding_encoding_ctx -> encode ),
692
- decoding_encoding_ctx -> encoding_output ,
693
- decoding_buffer_size ,
694
- tokenizer_error_offset ,
695
- tree_error_offset
696
- )) {
697
- return false;
698
- }
699
- }
697
+ lxb_status_t status ;
698
+
699
+ status = lxb_encoding_decode_finish (& decoding_encoding_ctx -> decode );
700
+ ZEND_ASSERT (status == LXB_STATUS_OK );
701
+
702
+ size_t decoding_buffer_size = lxb_encoding_decode_buf_used (& decoding_encoding_ctx -> decode );
703
+ if (decoding_buffer_size > 0 ) {
704
+ const lxb_codepoint_t * codepoints_ref = (const lxb_codepoint_t * ) decoding_encoding_ctx -> codepoints ;
705
+ const lxb_codepoint_t * codepoints_end = codepoints_ref + decoding_buffer_size ;
706
+ status = decoding_encoding_ctx -> encode_data -> encode (& decoding_encoding_ctx -> encode , & codepoints_ref , codepoints_end );
707
+ ZEND_ASSERT (status == LXB_STATUS_OK );
708
+ /* No need to produce output here, as we finish the encoder below and pass the chunk. */
700
709
}
701
- (void ) lxb_encoding_encode_finish (& decoding_encoding_ctx -> encode );
710
+
711
+ status = lxb_encoding_encode_finish (& decoding_encoding_ctx -> encode );
712
+ ZEND_ASSERT (status == LXB_STATUS_OK );
702
713
if (lxb_encoding_encode_buf_used (& decoding_encoding_ctx -> encode )
703
714
&& !dom_process_parse_chunk (
704
715
ctx ,
0 commit comments