diff --git a/src/ascii_char.rs b/src/ascii_char.rs index 49c1b21..f726df6 100644 --- a/src/ascii_char.rs +++ b/src/ascii_char.rs @@ -376,7 +376,8 @@ impl AsciiChar { #[inline] #[must_use] pub unsafe fn from_ascii_unchecked(ch: u8) -> Self { - ch.to_ascii_char_unchecked() + // SAFETY: Caller guarantees `ch` is within bounds of ascii. + unsafe { ch.to_ascii_char_unchecked() } } /// Converts an ASCII character into a `u8`. @@ -659,12 +660,20 @@ impl AsciiChar { /// ``` #[must_use] pub fn as_printable_char(self) -> char { - unsafe { - match self as u8 { - b' '..=b'~' => self.as_char(), - 127 => '␡', - _ => char::from_u32_unchecked(self as u32 + '␀' as u32), - } + match self as u8 { + // Non printable characters + // SAFETY: From codepoint 0x2400 ('␀') to 0x241f (`␟`), there are characters representing + // the unprintable characters from 0x0 to 0x1f, ordered correctly. + // As `b` is guaranteed to be within 0x0 to 0x1f, the conversion represents a + // valid character. + b @ 0x0..=0x1f => unsafe { char::from_u32_unchecked(u32::from('␀') + u32::from(b)) }, + + // 0x7f (delete) has it's own character at codepoint 0x2420, not 0x247f, so it is special + // cased to return it's character + 0x7f => '␡', + + // All other characters are printable, and per function contract use `Self::as_char` + _ => self.as_char(), } } @@ -825,6 +834,14 @@ pub trait ToAsciiChar { fn to_ascii_char(self) -> Result; /// Convert to `AsciiChar` without checking that it is an ASCII character. + /// + /// # Safety + /// Calling this function with a value outside of the ascii range, `0x0` to `0x7f` inclusive, + /// is undefined behavior. + // TODO: Make sure this is the contract we want to express in this function. + // It is ambigous if numbers such as `0xffffff20_u32` are valid ascii characters, + // as this function returns `Ascii::Space` due to the cast to `u8`, even though + // `to_ascii_char` returns `Err()`. unsafe fn to_ascii_char_unchecked(self) -> AsciiChar; } @@ -833,6 +850,7 @@ impl ToAsciiChar for AsciiChar { fn to_ascii_char(self) -> Result { Ok(self) } + #[inline] unsafe fn to_ascii_char_unchecked(self) -> AsciiChar { self @@ -846,7 +864,10 @@ impl ToAsciiChar for u8 { } #[inline] unsafe fn to_ascii_char_unchecked(self) -> AsciiChar { - mem::transmute(self) + // SAFETY: Caller guarantees `self` is within bounds of the enum + // variants, so this cast successfully produces a valid ascii + // variant + unsafe { mem::transmute::(self) } } } @@ -857,34 +878,38 @@ impl ToAsciiChar for u8 { impl ToAsciiChar for i8 { #[inline] fn to_ascii_char(self) -> Result { - (self as u32).to_ascii_char() + u32::from(self as u8).to_ascii_char() } #[inline] unsafe fn to_ascii_char_unchecked(self) -> AsciiChar { - mem::transmute(self) + // SAFETY: Caller guarantees `self` is within bounds of the enum + // variants, so this cast successfully produces a valid ascii + // variant + unsafe { mem::transmute::(self as u8) } } } impl ToAsciiChar for char { #[inline] fn to_ascii_char(self) -> Result { - (self as u32).to_ascii_char() + u32::from(self).to_ascii_char() } #[inline] unsafe fn to_ascii_char_unchecked(self) -> AsciiChar { - (self as u32).to_ascii_char_unchecked() + // SAFETY: Caller guarantees we're within ascii range. + unsafe { u32::from(self).to_ascii_char_unchecked() } } } impl ToAsciiChar for u32 { fn to_ascii_char(self) -> Result { - unsafe { - match self { - 0..=127 => Ok(self.to_ascii_char_unchecked()), - _ => Err(ToAsciiCharError(())), - } + match self { + // SAFETY: We're within the valid ascii range in this branch. + 0x0..=0x7f => Ok(unsafe { self.to_ascii_char_unchecked() }), + _ => Err(ToAsciiCharError(())), } } + #[inline] unsafe fn to_ascii_char_unchecked(self) -> AsciiChar { // Note: This cast discards the top bytes, this may cause problems, see diff --git a/src/ascii_str.rs b/src/ascii_str.rs index 2d1fbbd..836f983 100644 --- a/src/ascii_str.rs +++ b/src/ascii_str.rs @@ -28,6 +28,7 @@ impl AsciiStr { #[inline] #[must_use] pub fn as_str(&self) -> &str { + // SAFETY: All variants of `AsciiChar` are valid bytes for a `str`. unsafe { &*(self as *const AsciiStr as *const str) } } @@ -35,6 +36,7 @@ impl AsciiStr { #[inline] #[must_use] pub fn as_bytes(&self) -> &[u8] { + // SAFETY: All variants of `AsciiChar` are valid `u8`, given they're `repr(u8)`. unsafe { &*(self as *const AsciiStr as *const [u8]) } } @@ -105,6 +107,10 @@ impl AsciiStr { /// Converts anything that can be represented as a byte slice to an `AsciiStr` without checking /// for non-ASCII characters.. /// + /// # Safety + /// If any of the bytes in `bytes` do not represent valid ascii characters, calling + /// this function is undefined behavior. + /// /// # Examples /// ``` /// # use ascii::AsciiStr; @@ -114,7 +120,9 @@ impl AsciiStr { #[inline] #[must_use] pub unsafe fn from_ascii_unchecked(bytes: &[u8]) -> &AsciiStr { - bytes.as_ascii_str_unchecked() + // SAFETY: Caller guarantees all bytes in `bytes` are valid + // ascii characters. + unsafe { bytes.as_ascii_str_unchecked() } } /// Returns the number of characters / bytes in this ASCII sequence. @@ -878,6 +886,10 @@ pub trait AsAsciiStr { } /// Convert to an ASCII slice without checking for non-ASCII characters. /// + /// # Safety + /// Calling this function when `self` contains non-ascii characters is + /// undefined behavior. + /// /// # Examples /// unsafe fn as_ascii_str_unchecked(&self) -> &AsciiStr; @@ -893,6 +905,7 @@ pub trait AsMutAsciiStr: AsAsciiStr { fn slice_ascii_mut(&mut self, range: R) -> Result<&mut AsciiStr, AsAsciiStrError> where R: SliceIndex<[Self::Inner], Output = [Self::Inner]>; + /// Convert to a mutable ASCII slice. /// /// # Errors @@ -900,7 +913,12 @@ pub trait AsMutAsciiStr: AsAsciiStr { fn as_mut_ascii_str(&mut self) -> Result<&mut AsciiStr, AsAsciiStrError> { self.slice_ascii_mut(..) } + /// Convert to a mutable ASCII slice without checking for non-ASCII characters. + /// + /// # Safety + /// Calling this function when `self` contains non-ascii characters is + /// undefined behavior. unsafe fn as_mut_ascii_str_unchecked(&mut self) -> &mut AsciiStr; } @@ -916,8 +934,10 @@ where { ::slice_ascii(*self, range) } + unsafe fn as_ascii_str_unchecked(&self) -> &AsciiStr { - ::as_ascii_str_unchecked(*self) + // SAFETY: Caller guarantees `self` does not contain non-ascii characters + unsafe { ::as_ascii_str_unchecked(*self) } } } @@ -934,7 +954,8 @@ where } unsafe fn as_ascii_str_unchecked(&self) -> &AsciiStr { - ::as_ascii_str_unchecked(*self) + // SAFETY: Caller guarantees `self` does not contain non-ascii characters + unsafe { ::as_ascii_str_unchecked(*self) } } } @@ -950,26 +971,31 @@ where } unsafe fn as_mut_ascii_str_unchecked(&mut self) -> &mut AsciiStr { - ::as_mut_ascii_str_unchecked(*self) + // SAFETY: Caller guarantees `self` does not contain non-ascii characters + unsafe { ::as_mut_ascii_str_unchecked(*self) } } } impl AsAsciiStr for AsciiStr { type Inner = AsciiChar; + fn slice_ascii(&self, range: R) -> Result<&AsciiStr, AsAsciiStrError> where R: SliceIndex<[AsciiChar], Output = [AsciiChar]>, { self.slice.slice_ascii(range) } + #[inline] fn as_ascii_str(&self) -> Result<&AsciiStr, AsAsciiStrError> { Ok(self) } + #[inline] unsafe fn as_ascii_str_unchecked(&self) -> &AsciiStr { self } + #[inline] fn get_ascii(&self, index: usize) -> Option { self.slice.get_ascii(index) @@ -982,6 +1008,7 @@ impl AsMutAsciiStr for AsciiStr { { self.slice.slice_ascii_mut(range) } + #[inline] unsafe fn as_mut_ascii_str_unchecked(&mut self) -> &mut AsciiStr { self @@ -999,14 +1026,17 @@ impl AsAsciiStr for [AsciiChar] { None => Err(AsAsciiStrError(self.len())), } } + #[inline] fn as_ascii_str(&self) -> Result<&AsciiStr, AsAsciiStrError> { Ok(self.into()) } + #[inline] unsafe fn as_ascii_str_unchecked(&self) -> &AsciiStr { - self.into() + <&AsciiStr>::from(self) } + #[inline] fn get_ascii(&self, index: usize) -> Option { self.get(index).cloned() @@ -1025,7 +1055,7 @@ impl AsMutAsciiStr for [AsciiChar] { } #[inline] unsafe fn as_mut_ascii_str_unchecked(&mut self) -> &mut AsciiStr { - self.into() + <&mut AsciiStr>::from(self) } } @@ -1045,9 +1075,11 @@ impl AsAsciiStr for [u8] { Err(AsAsciiStrError(self.len())) } } + fn as_ascii_str(&self) -> Result<&AsciiStr, AsAsciiStrError> { + // is_ascii is likely optimized if self.is_ascii() { - // is_ascii is likely optimized + // SAFETY: `is_ascii` guarantees all bytes are within ascii range. unsafe { Ok(self.as_ascii_str_unchecked()) } } else { Err(AsAsciiStrError( @@ -1055,10 +1087,11 @@ impl AsAsciiStr for [u8] { )) } } + #[inline] unsafe fn as_ascii_str_unchecked(&self) -> &AsciiStr { - let ptr = self as *const [u8] as *const AsciiStr; - &*ptr + // SAFETY: Caller guarantees `self` does not contain non-ascii characters + unsafe { &*(self as *const [u8] as *const AsciiStr) } } } impl AsMutAsciiStr for [u8] { @@ -1079,9 +1112,11 @@ impl AsMutAsciiStr for [u8] { Err(AsAsciiStrError(len)) } } + fn as_mut_ascii_str(&mut self) -> Result<&mut AsciiStr, AsAsciiStrError> { + // is_ascii() is likely optimized if self.is_ascii() { - // is_ascii() is likely optimized + // SAFETY: `is_ascii` guarantees all bytes are within ascii range. unsafe { Ok(self.as_mut_ascii_str_unchecked()) } } else { Err(AsAsciiStrError( @@ -1089,10 +1124,11 @@ impl AsMutAsciiStr for [u8] { )) } } + #[inline] unsafe fn as_mut_ascii_str_unchecked(&mut self) -> &mut AsciiStr { - let ptr = self as *mut [u8] as *mut AsciiStr; - &mut *ptr + // SAFETY: Caller guarantees `self` does not contain non-ascii characters + unsafe { &mut *(self as *mut [u8] as *mut AsciiStr) } } } @@ -1109,7 +1145,8 @@ impl AsAsciiStr for str { } #[inline] unsafe fn as_ascii_str_unchecked(&self) -> &AsciiStr { - self.as_bytes().as_ascii_str_unchecked() + // SAFETY: Caller guarantees `self` does not contain non-ascii characters + unsafe { self.as_bytes().as_ascii_str_unchecked() } } } impl AsMutAsciiStr for str { @@ -1117,32 +1154,43 @@ impl AsMutAsciiStr for str { where R: SliceIndex<[u8], Output = [u8]>, { - let (ptr, len) = if let Some(slice) = self.as_bytes().get(range) { - if !slice.is_ascii() { + // SAFETY: We don't modify the reference in this function, and the caller may + // only modify it to include valid ascii characters. + let bytes = unsafe { self.as_bytes_mut() }; + match bytes.get_mut(range) { + // Valid ascii slice + Some(slice) if slice.is_ascii() => { + // SAFETY: All bytes are ascii, so this cast is valid + let ptr = slice.as_mut_ptr() as *mut AsciiChar; + let len = slice.len(); + + // SAFETY: The pointer is valid for `len` elements, as it came + // from a slice. + unsafe { + let slice = core::slice::from_raw_parts_mut(ptr, len); + Ok(<&mut AsciiStr>::from(slice)) + } + } + Some(slice) => { + let not_ascii_len = slice.iter().copied().take_while(u8::is_ascii).count(); let offset = slice.as_ptr() as usize - self.as_ptr() as usize; - let not_ascii = slice.iter().take_while(|&b| b.is_ascii()).count(); - return Err(AsAsciiStrError(offset + not_ascii)); + + Err(AsAsciiStrError(offset + not_ascii_len)) } - (slice.as_ptr(), slice.len()) - } else { - return Err(AsAsciiStrError(self.len())); - }; - unsafe { - let ptr = ptr as *const AsciiChar as *mut AsciiChar; - let slice = core::slice::from_raw_parts_mut(ptr, len); - Ok(slice.into()) + None => Err(AsAsciiStrError(self.len())), } } fn as_mut_ascii_str(&mut self) -> Result<&mut AsciiStr, AsAsciiStrError> { - match self.bytes().position(|b| b > 127) { + match self.bytes().position(|b| !b.is_ascii()) { Some(index) => Err(AsAsciiStrError(index)), + // SAFETY: All bytes were iterated, and all were ascii None => unsafe { Ok(self.as_mut_ascii_str_unchecked()) }, } } #[inline] unsafe fn as_mut_ascii_str_unchecked(&mut self) -> &mut AsciiStr { - let ptr = self as *mut str as *mut AsciiStr; - &mut *ptr + // SAFETY: Caller guarantees `self` does not contain non-ascii characters + &mut *(self as *mut str as *mut AsciiStr) } } @@ -1162,7 +1210,8 @@ impl AsAsciiStr for CStr { } #[inline] unsafe fn as_ascii_str_unchecked(&self) -> &AsciiStr { - self.to_bytes().as_ascii_str_unchecked() + // SAFETY: Caller guarantees `self` does not contain non-ascii characters + unsafe { self.to_bytes().as_ascii_str_unchecked() } } } diff --git a/src/ascii_string.rs b/src/ascii_string.rs index b5a66db..6e0fc99 100644 --- a/src/ascii_string.rs +++ b/src/ascii_string.rs @@ -54,10 +54,12 @@ impl AsciiString { /// /// This is highly unsafe, due to the number of invariants that aren't checked: /// - /// * The memory at `ptr` need to have been previously allocated by the same allocator this + /// * The memory at `buf` need to have been previously allocated by the same allocator this /// library uses. /// * `length` needs to be less than or equal to `capacity`. /// * `capacity` needs to be the correct value. + /// * `buf` must have `length` valid ascii elements and contain a total of `capacity` total, + /// possibly, uninitialized, elements. /// /// Violating these may cause problems like corrupting the allocator's internal datastructures. /// @@ -86,7 +88,10 @@ impl AsciiString { #[must_use] pub unsafe fn from_raw_parts(buf: *mut AsciiChar, length: usize, capacity: usize) -> Self { AsciiString { - vec: Vec::from_raw_parts(buf, length, capacity), + // SAFETY: Caller guarantees `buf` was previously allocated by this library, + // that `buf` contains `length` valid ascii elements and has a total + // capacity of `capacity` elements. + vec: unsafe { Vec::from_raw_parts(buf, length, capacity) }, } } @@ -104,13 +109,19 @@ impl AsciiString { B: Into>, { let mut bytes = bytes.into(); - let vec = Vec::from_raw_parts( - bytes.as_mut_ptr() as *mut AsciiChar, - bytes.len(), - bytes.capacity(), - ); + // SAFETY: The caller guarantees all bytes are valid ascii bytes. + let ptr = bytes.as_mut_ptr() as *mut AsciiChar; + let length = bytes.len(); + let capacity = bytes.capacity(); mem::forget(bytes); - AsciiString { vec } + + // SAFETY: We guarantee all invariants, as we got the + // pointer, length and capacity from a `Vec`, + // and we also guarantee the pointer is valid per + // the `SAFETY` notice above. + let vec = Vec::from_raw_parts(ptr, length, capacity); + + Self { vec } } /// Converts anything that can represent a byte buffer into an `AsciiString`. @@ -130,14 +141,13 @@ impl AsciiString { where B: Into> + AsRef<[u8]>, { - unsafe { - match bytes.as_ref().as_ascii_str() { - Ok(_) => Ok(AsciiString::from_ascii_unchecked(bytes)), - Err(e) => Err(FromAsciiError { - error: e, - owner: bytes, - }), - } + match bytes.as_ref().as_ascii_str() { + // SAFETY: `as_ascii_str` guarantees all bytes are valid ascii bytes. + Ok(_) => Ok(unsafe { AsciiString::from_ascii_unchecked(bytes) }), + Err(e) => Err(FromAsciiError { + error: e, + owner: bytes, + }), } } @@ -443,19 +453,18 @@ impl From> for AsciiString { } impl Into> for AsciiString { - fn into(self) -> Vec { - unsafe { - let v = Vec::from_raw_parts( - self.vec.as_ptr() as *mut u8, - self.vec.len(), - self.vec.capacity(), - ); - - // We forget `self` to avoid freeing it at the end of the scope. - // Otherwise, the returned `Vec` would point to freed memory. - mem::forget(self); - v - } + fn into(mut self) -> Vec { + // SAFETY: All ascii bytes are valid `u8`, as we are `repr(u8)`. + // Note: We forget `self` to avoid `self.vec` from being deallocated. + let ptr = self.vec.as_mut_ptr() as *mut u8; + let length = self.vec.len(); + let capacity = self.vec.capacity(); + mem::forget(self); + + // SAFETY: We guarantee all invariants due to getting `ptr`, `length` + // and `capacity` from a `Vec`. We also guarantee `ptr` is valid + // due to the `SAFETY` block above. + unsafe { Vec::from_raw_parts(ptr, length, capacity) } } } @@ -476,6 +485,7 @@ impl<'a> From<&'a [AsciiChar]> for AsciiString { impl Into for AsciiString { #[inline] fn into(self) -> String { + // SAFETY: All ascii bytes are `utf8`. unsafe { String::from_utf8_unchecked(self.into()) } } } @@ -704,7 +714,12 @@ impl Error for FromAsciiError { /// Convert vectors into `AsciiString`. pub trait IntoAsciiString: Sized { /// Convert to `AsciiString` without checking for non-ASCII characters. + /// + /// # Safety + /// If `self` contains non-ascii characters, calling this function is + /// undefined behavior. unsafe fn into_ascii_string_unchecked(self) -> AsciiString; + /// Convert to `AsciiString`. /// /// # Errors @@ -750,7 +765,8 @@ macro_rules! impl_into_ascii_string { impl<'a> IntoAsciiString for $wider { #[inline] unsafe fn into_ascii_string_unchecked(self) -> AsciiString { - AsciiString::from_ascii_unchecked(self) + // SAFETY: Caller guarantees `self` only has valid ascii bytes + unsafe { AsciiString::from_ascii_unchecked(self) } } #[inline] @@ -764,7 +780,8 @@ macro_rules! impl_into_ascii_string { impl IntoAsciiString for $wider { #[inline] unsafe fn into_ascii_string_unchecked(self) -> AsciiString { - AsciiString::from_ascii_unchecked(self) + // SAFETY: Caller guarantees `self` only has valid ascii bytes + unsafe { AsciiString::from_ascii_unchecked(self) } } #[inline] @@ -781,22 +798,22 @@ impl_into_ascii_string! {'a, &'a [u8]} impl_into_ascii_string! {String} impl_into_ascii_string! {'a, &'a str} -/// Note that the trailing null byte will be removed in the conversion. +/// # Notes +/// The trailing null byte `CString` has will be removed during this conversion impl IntoAsciiString for CString { #[inline] unsafe fn into_ascii_string_unchecked(self) -> AsciiString { - AsciiString::from_ascii_unchecked(self.into_bytes()) + // SAFETY: Caller guarantees `self` only has valid ascii bytes + unsafe { AsciiString::from_ascii_unchecked(self.into_bytes()) } } fn into_ascii_string(self) -> Result> { AsciiString::from_ascii(self.into_bytes_with_nul()) .map_err(|FromAsciiError { error, owner }| { FromAsciiError { - owner: unsafe { - // The null byte is preserved from the original - // `CString`, so this is safe. - CString::from_vec_unchecked(owner) - }, + // SAFETY: We don't discard the NULL byte from the original + // string, so we ensure that it's null terminated + owner: unsafe { CString::from_vec_unchecked(owner) }, error, } }) @@ -812,12 +829,15 @@ impl IntoAsciiString for CString { impl<'a> IntoAsciiString for &'a CStr { #[inline] unsafe fn into_ascii_string_unchecked(self) -> AsciiString { - AsciiString::from_ascii_unchecked(self.to_bytes()) + // SAFETY: Caller guarantees `self` only has valid ascii bytes + unsafe { AsciiString::from_ascii_unchecked(self.to_bytes()) } } fn into_ascii_string(self) -> Result> { AsciiString::from_ascii(self.to_bytes_with_nul()) .map_err(|FromAsciiError { error, owner }| FromAsciiError { + // SAFETY: We don't discard the NULL byte from the original + // string, so we ensure that it's null terminated owner: unsafe { CStr::from_ptr(owner.as_ptr() as *const _) }, error, }) @@ -837,7 +857,8 @@ where { #[inline] unsafe fn into_ascii_string_unchecked(self) -> AsciiString { - IntoAsciiString::into_ascii_string_unchecked(self.into_owned()) + // SAFETY: Caller guarantees `self` only has valid ascii bytes + unsafe { IntoAsciiString::into_ascii_string_unchecked(self.into_owned()) } } fn into_ascii_string(self) -> Result> { @@ -898,6 +919,7 @@ mod tests { assert_eq!(ascii_str.len(), 3); assert_eq!(ascii_str.as_slice(), expected_chars); + // SAFETY: "baz" only contains valid ascii characters. let ascii_str_unchecked = unsafe { cstring.into_ascii_string_unchecked() }; assert_eq!(ascii_str_unchecked.len(), 3); assert_eq!(ascii_str_unchecked.as_slice(), expected_chars); diff --git a/src/free_functions.rs b/src/free_functions.rs index 4dbff7a..42f355f 100644 --- a/src/free_functions.rs +++ b/src/free_functions.rs @@ -17,12 +17,11 @@ use ascii_char::{AsciiChar, ToAsciiChar}; pub fn caret_encode>(c: C) -> Option { // The formula is explained in the Wikipedia article. let c = c.into() ^ 0b0100_0000; - unsafe { - if c >= b'?' && c <= b'_' { - Some(c.to_ascii_char_unchecked()) - } else { - None - } + if c >= b'?' && c <= b'_' { + // SAFETY: All bytes between '?' (0x3F) and '_' (0x5f) are valid ascii characters. + Some(unsafe { c.to_ascii_char_unchecked() }) + } else { + None } } @@ -51,10 +50,10 @@ pub fn caret_encode>(c: C) -> Option { /// ``` pub fn caret_decode>(c: C) -> Option { // The formula is explained in the Wikipedia article. - unsafe { - match c.into() { - b'?'..=b'_' => Some(AsciiChar::from_ascii_unchecked(c.into() ^ 0b0100_0000)), - _ => None, - } + match c.into() { + // SAFETY: All bytes between '?' (0x3F) and '_' (0x5f) after `xoring` with `0b0100_0000` are + // valid bytes, as they represent characters between '␀' (0x0) and '␠' (0x1f) + '␡' (0x7f) + b'?'..=b'_' => Some(unsafe { AsciiChar::from_ascii_unchecked(c.into() ^ 0b0100_0000) }), + _ => None, } }