@@ -805,23 +805,23 @@ fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
805
805
806
806
/// Determines if a vector of `u16` contains valid UTF-16
807
807
pub fn is_utf16 ( v : & [ u16 ] ) -> bool {
808
- let len = v. len ( ) ;
809
- let mut i = 0 u ;
810
- while i < len {
811
- let u = v [ i ] ;
812
-
813
- if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
814
- i += 1 u ;
808
+ let mut it = v. iter ( ) ;
809
+ macro_rules! next ( ( $ret : expr ) => {
810
+ match it . next ( ) { Some ( u ) => * u , None => return $ret }
811
+ }
812
+ )
813
+ loop {
814
+ let u = next ! ( true ) ;
815
815
816
- } else {
817
- if i+1 u < len { return false ; }
818
- let u2 = v[ i+1 u] ;
819
- if u < 0xD7FF_u16 || u > 0xDBFF_u16 { return false ; }
820
- if u2 < 0xDC00_u16 || u2 > 0xDFFF_u16 { return false ; }
821
- i += 2 u;
816
+ match char:: from_u32 ( u as u32 ) {
817
+ Some ( _) => { }
818
+ None => {
819
+ let u2 = next ! ( false ) ;
820
+ if u < 0xD7FF || u > 0xDBFF ||
821
+ u2 < 0xDC00 || u2 > 0xDFFF { return false ; }
822
+ }
822
823
}
823
824
}
824
- return true ;
825
825
}
826
826
827
827
/// Iterates over the utf-16 characters in the specified slice, yielding each
@@ -3511,6 +3511,65 @@ mod tests {
3511
3511
assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
3512
3512
}
3513
3513
3514
+ #[test]
3515
+ fn test_is_utf16() {
3516
+ macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
3517
+
3518
+ // non-surrogates
3519
+ pos!([0x0000],
3520
+ [0x0001, 0x0002],
3521
+ [0xD7FF],
3522
+ [0xE000]);
3523
+
3524
+ // surrogate pairs (randomly generated with Python 3's
3525
+ // .encode('utf-16be'))
3526
+ pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
3527
+ [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
3528
+ [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
3529
+
3530
+ // mixtures (also random)
3531
+ pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
3532
+ [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
3533
+ [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
3534
+
3535
+ // negative tests
3536
+ macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
3537
+
3538
+ neg!(
3539
+ // surrogate + regular unit
3540
+ [0xdb45, 0x0000],
3541
+ // surrogate + lead surrogate
3542
+ [0xd900, 0xd900],
3543
+ // unterminated surrogate
3544
+ [0xd8ff],
3545
+ // trail surrogate without a lead
3546
+ [0xddb7]);
3547
+
3548
+ // random byte sequences that Python 3's .decode('utf-16be')
3549
+ // failed on
3550
+ neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
3551
+ [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
3552
+ [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
3553
+ [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
3554
+ [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
3555
+ [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
3556
+ [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
3557
+ [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
3558
+ [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
3559
+ [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
3560
+ [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
3561
+ [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
3562
+ [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
3563
+ [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
3564
+ [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
3565
+ [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
3566
+ [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
3567
+ [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
3568
+ [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
3569
+ [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
3570
+ [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
3571
+ }
3572
+
3514
3573
#[test]
3515
3574
fn test_raw_from_c_str() {
3516
3575
unsafe {
@@ -3666,10 +3725,11 @@ mod tests {
3666
3725
3667
3726
for p in pairs. iter( ) {
3668
3727
let ( s, u) = ( * p) . clone( ) ;
3669
- assert!( s. to_utf16( ) == u) ;
3670
- assert!( from_utf16( u) == s) ;
3671
- assert!( from_utf16( s. to_utf16( ) ) == s) ;
3672
- assert!( from_utf16( u) . to_utf16( ) == u) ;
3728
+ assert!( is_utf16( u) ) ;
3729
+ assert_eq!( s. to_utf16( ) , u) ;
3730
+ assert_eq!( from_utf16( u) , s) ;
3731
+ assert_eq!( from_utf16( s. to_utf16( ) ) , s) ;
3732
+ assert_eq!( from_utf16( u) . to_utf16( ) , u) ;
3673
3733
}
3674
3734
}
3675
3735
0 commit comments