From cd3637b0fa8f760f606999899156d2cb7d6004cd Mon Sep 17 00:00:00 2001 From: Guillaume Nodet Date: Wed, 1 Mar 2023 15:07:23 +0100 Subject: [PATCH 1/2] Fix reading comments with UTF chars (fixes #238) --- .../plexus/util/xml/pull/MXParser.java | 28 +++++++++++++++---- .../plexus/util/xml/pull/MXParserTest.java | 26 +++++++++++++++++ 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java index d44c9a7f..9cdcdbf1 100644 --- a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java +++ b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java @@ -2981,8 +2981,8 @@ private void parseComment() // implements XML 1.0 Section 2.5 Comments // ASSUMPTION: seen - ch = more(); + cch = more(); + int ch; + char cch2; + if ( Character.isHighSurrogate( cch ) ) + { + cch2 = more(); + ch = Character.toCodePoint( cch, cch2 ); + } + else + { + cch2 = 0; + ch = cch; + } if ( seenDashDash && ch != '>' ) { throw new XmlPullParserException( "in comment after two dashes (--) next character must be >" @@ -3074,7 +3086,11 @@ else if ( ch == '\n' ) { if ( pcEnd >= pc.length ) ensurePC( pcEnd ); - pc[pcEnd++] = ch; + pc[pcEnd++] = cch; + if ( cch2 != 0 ) + { + pc[pcEnd++] = cch2; + } } normalizedCR = false; } @@ -4153,7 +4169,7 @@ private static boolean isS( char ch ) // ch != '\u0000' ch < '\uFFFE' // private char printable(char ch) { return ch; } - private static String printable( char ch ) + private static String printable( int ch ) { if ( ch == '\n' ) { @@ -4175,7 +4191,7 @@ else if ( ch == '\'' ) { return "\\u" + Integer.toHexString( ch ); } - return "" + ch; + return Character.toString( ch ); } private static String printable( String s ) diff --git a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java index e5e04708..cba42b32 100644 --- 
a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java +++ b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java @@ -1511,4 +1511,30 @@ public void testReplacementInPCArrayWithShorterCharArray() fail( "should not raise exception: " + e ); } } + + /** + * Ensures a comment containing a supplementary character (emoji encoded as a surrogate pair) is parsed and returned intact + */ + @Test + public void testUnicode() throws IOException { + String input = "<project><!--ALL TEH BOMS! \uD83D\uDCA3 --></project>"; + + try + { + MXParser parser = new MXParser(); + parser.setInput( new StringReader( input ) ); + + assertEquals( XmlPullParser.START_TAG, parser.nextToken() ); + assertEquals( "project", parser.getName() ); + assertEquals( XmlPullParser.COMMENT, parser.nextToken() ); + assertEquals( "ALL TEH BOMS! \uD83D\uDCA3 ", parser.getText() ); + assertEquals( XmlPullParser.END_TAG, parser.nextToken() ); + assertEquals( "project", parser.getName() ); + } + catch ( XmlPullParserException e ) + { + e.printStackTrace(); + fail( "should not raise exception: " + e ); + } + } } From 212374fd4520dac1a23b699b2bfbac94289ea29a Mon Sep 17 00:00:00 2001 From: Guillaume Nodet Date: Wed, 1 Mar 2023 15:35:48 +0100 Subject: [PATCH 2/2] Fix printable methods to account for UTF chars --- .../org/codehaus/plexus/util/xml/pull/MXParser.java | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java index 9cdcdbf1..e9fc1182 100644 --- a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java +++ b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java @@ -4191,18 +4191,25 @@ else if ( ch == '\'' ) { return "\\u" + Integer.toHexString( ch ); } - return Character.toString( ch ); + if ( Character.isBmpCodePoint( ch ) ) + { + return Character.toString( ( char ) ch ); + } + else + { + return new String( new char[] { Character.highSurrogate( ch ), Character.lowSurrogate( ch ) } ); + } } private static String printable( String s ) { if ( s == null )
return null; final int sLen = s.length(); StringBuilder buf = new StringBuilder( sLen + 10 ); - for ( int i = 0; i < sLen; ++i ) + for ( int i = 0; i < sLen; i += Character.charCount( s.codePointAt( i ) ) ) /* i is a UTF-16 char index: step past the whole code point so a surrogate pair is consumed once */ { - buf.append( printable( s.charAt( i ) ) ); + buf.append( printable( s.codePointAt( i ) ) ); } s = buf.toString(); return s;