Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions library/alloctests/tests/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#![feature(allocator_api)]
#![feature(str_first_last_char)]
#![feature(binary_heap_pop_if)]
#![feature(const_heap)]
#![feature(deque_extend_front)]
Expand Down
62 changes: 62 additions & 0 deletions library/alloctests/tests/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2452,3 +2452,65 @@ fn ceil_char_boundary() {
// above len
check_many("hello", 5..=10, 5);
}
// Compile-time checks: this block is the initializer of an unnamed constant,
// so every assertion below is evaluated in a const context and thereby
// exercises the `const fn` form of the four methods.
// NOTE(review): `matches!`/`is_none` are presumably used instead of `assert_eq!`
// because `Option` equality is not callable in const contexts — confirm.
const _: () = {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this done as a const assertion?

assert!(matches!("hello".first_char(), Some('h')));
assert!(matches!("hello".last_char(), Some('o')));
assert!(matches!("🦀rust".first_char(), Some('🦀')));
assert!(matches!("rust🦀".last_char(), Some('🦀')));
assert!("".first_char().is_none());
assert!("".last_char().is_none());
// Only the char half of each split is checked here; the remaining slice is
// ignored via `_`.
assert!(matches!("hello".split_first_char(), Some(('h', _))));
assert!(matches!("hello".split_last_char(), Some(('o', _))));
};

#[test]
fn first_char() {
    // Each entry pairs an input slice with the char expected at its front.
    let cases = [
        ("", None),
        ("x", Some('x')),
        ("hello", Some('h')),
        // 2-byte char
        ("ĵƥ", Some('ĵ')),
        // 3-byte char
        ("日本", Some('日')),
        // 4-byte char
        ("🦀rust", Some('🦀')),
    ];
    for (input, expected) in cases {
        assert_eq!(input.first_char(), expected);
    }
}

#[test]
fn last_char() {
    // Each entry pairs an input slice with the char expected at its end.
    let cases = [
        ("", None),
        ("x", Some('x')),
        ("hello", Some('o')),
        // 2-byte char
        ("ĵƥ", Some('ƥ')),
        // 3-byte char
        ("日本", Some('本')),
        // 4-byte char
        ("rust🦀", Some('🦀')),
    ];
    for (input, expected) in cases {
        assert_eq!(input.last_char(), expected);
    }
}

#[test]
fn split_first_char() {
    // Each entry pairs an input slice with the expected `(first char, remainder)`.
    let cases = [
        ("", None),
        ("x", Some(('x', ""))),
        ("hello", Some(('h', "ello"))),
        // 2-byte char
        ("ĵƥ", Some(('ĵ', "ƥ"))),
        // 3-byte char
        ("日本", Some(('日', "本"))),
        // 4-byte char
        ("🦀rust", Some(('🦀', "rust"))),
    ];
    for (input, expected) in cases {
        assert_eq!(input.split_first_char(), expected);
    }
}

#[test]
fn split_last_char() {
    // Each entry pairs an input slice with the expected `(last char, remainder)`.
    let cases = [
        ("", None),
        ("x", Some(('x', ""))),
        ("hello", Some(('o', "hell"))),
        // 2-byte char
        ("ĵƥ", Some(('ƥ', "ĵ"))),
        // 3-byte char
        ("日本", Some(('本', "日"))),
        // 4-byte char
        ("rust🦀", Some(('🦀', "rust"))),
    ];
    for (input, expected) in cases {
        assert_eq!(input.split_last_char(), expected);
    }
}
81 changes: 81 additions & 0 deletions library/core/src/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1063,6 +1063,87 @@ impl str {
Chars { iter: self.as_bytes().iter() }
}

/// Decodes the single UTF-8 encoded character stored in `bytes`.
///
/// The payload bits of the lead byte and of every continuation byte are masked
/// out and concatenated, most significant bits first, to rebuild the code point.
///
/// # Safety
///
/// `bytes` must be the UTF-8 encoding of exactly one valid Unicode scalar value.
#[inline]
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do agree that is probably best to not be declared #[inline], as it's not something that can trivially be optimized out and is shared between a few methods.

const unsafe fn decode_utf8_char(bytes: &[u8]) -> char {
let ch = match bytes {
// 1 byte (`0xxxxxxx`): ASCII, the byte value is the code point itself.
&[a] => a as u32,
// 2 bytes (`110xxxxx 10xxxxxx`): `0x1F` keeps the 5 payload bits of the lead
// byte; `0x3F` keeps the 6 payload bits of a continuation byte.
&[a, b] => ((a & 0x1F) as u32) << 6 | (b & 0x3F) as u32,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where are these hex numbers coming from? The way UTF-8 is encoded?

// 3 bytes (`1110xxxx` + two continuation bytes): `0x0F` keeps the 4 payload
// bits of the lead byte.
&[a, b, c] => ((a & 0x0F) as u32) << 12 | ((b & 0x3F) as u32) << 6 | (c & 0x3F) as u32,
// 4 bytes (`11110xxx` + three continuation bytes): `0x07` keeps the 3 payload
// bits of the lead byte.
&[a, b, c, d] => {
((a & 0x07) as u32) << 18
| ((b & 0x3F) as u32) << 12
| ((c & 0x3F) as u32) << 6
| (d & 0x3F) as u32
}
// SAFETY: All valid UTF-8 sequences are covered above; this arm is unreachable for valid input.
_ => unsafe { crate::hint::unreachable_unchecked() },
};
// SAFETY: the caller must ensure `bytes` contains a valid UTF-8 sequence.
unsafe { char::from_u32_unchecked(ch) }
}
Comment on lines +1066 to +1086
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be moved to either a standalone method or (ideally) a method on char? It would be pub(crate), but this doesn't feel like the best of places for such a method.


/// Returns the leading [`prim@char`] of this string slice.
///
/// Returns [`None`] if the slice is empty.
///
/// # Examples
///
/// ```
/// #![feature(str_first_last_char)]
///
/// assert_eq!("hello".first_char(), Some('h'));
/// assert_eq!("".first_char(), None);
/// ```
#[must_use]
#[unstable(feature = "str_first_last_char", issue = "154393")]
#[rustc_const_unstable(feature = "str_first_last_char", issue = "154393")]
#[inline]
pub const fn first_char(&self) -> Option<char> {
    // Keep only the char; the remainder of the slice is discarded.
    if let Some((ch, _rest)) = self.split_first_char() { Some(ch) } else { None }
}

/// Returns the trailing [`prim@char`] of this string slice.
///
/// Returns [`None`] if the slice is empty.
///
/// # Examples
///
/// ```
/// #![feature(str_first_last_char)]
///
/// assert_eq!("hello".last_char(), Some('o'));
/// assert_eq!("".last_char(), None);
/// ```
#[must_use]
#[unstable(feature = "str_first_last_char", issue = "154393")]
#[rustc_const_unstable(feature = "str_first_last_char", issue = "154393")]
#[inline]
pub const fn last_char(&self) -> Option<char> {
    // Keep only the char; the remainder of the slice is discarded.
    let Some((ch, _rest)) = self.split_last_char() else {
        return None;
    };
    Some(ch)
}

/// Splits this string slice into its leading [`prim@char`] and the remainder.
///
/// Returns [`None`] if the slice is empty.
///
/// # Examples
///
/// ```
/// #![feature(str_first_last_char)]
///
/// assert_eq!("hello".split_first_char(), Some(('h', "ello")));
/// assert_eq!("".split_first_char(), None);
/// ```
#[must_use]
#[unstable(feature = "str_first_last_char", issue = "154393")]
#[rustc_const_unstable(feature = "str_first_last_char", issue = "154393")]
pub const fn split_first_char(&self) -> Option<(char, &str)> {
    // An empty slice has no first char; otherwise grab the lead byte.
    let lead = match self.as_bytes() {
        [] => return None,
        [lead, ..] => *lead,
    };
    let len = utf8_char_width(lead);
    // SAFETY: `self` is valid UTF-8, so `lead` starts a complete char of `len`
    // bytes and `len` is an in-bounds char boundary.
    let (first, rest) = unsafe { self.split_at_unchecked(len) };
    // SAFETY: `first` spans exactly the bytes of the first char of `self`.
    let ch = unsafe { Self::decode_utf8_char(first.as_bytes()) };
    Some((ch, rest))
}

/// Returns the last [`prim@char`] and the rest of the string slice, or [`None`] if it's empty.
///
/// The char comes first in the returned tuple, followed by the part of the
/// slice that precedes it.
///
/// # Examples
///
/// ```
/// #![feature(str_first_last_char)]
///
/// assert_eq!("hello".split_last_char(), Some(('o', "hell")));
/// assert_eq!("".split_last_char(), None);
/// ```
#[must_use]
#[unstable(feature = "str_first_last_char", issue = "154393")]
#[rustc_const_unstable(feature = "str_first_last_char", issue = "154393")]
pub const fn split_last_char(&self) -> Option<(char, &str)> {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a particular reason for returning (char, &str) instead of (&str, char) here, since the latter would mirror the order in the original string?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the ordering was (char, &str) used to provide symmetry with split_first_char, so both would return the first character followed by the rest of the string...aslo could argue that having (&str, char) for split_last_char is better because it preserves the natural order of a string.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's keep it as-is for now. I agree that it probably should be swapped, but that's not what the ACP was for.

let bytes = self.as_bytes();
// An empty slice has no last char (and `len() - 1` below would underflow).
if bytes.is_empty() {
return None;
}
// Start at the final byte and walk backwards until the start of the last
// char is found; index 0 ends the search unconditionally.
let mut i = bytes.len() - 1;
while i > 0 && !self.is_char_boundary(i) {
i -= 1;
}
// SAFETY: the loop exits either at `i == 0` or at an index for which
// `is_char_boundary` returned true, and `i < len`, so `i` is an in-bounds
// char boundary.
let (head, tail) = unsafe { self.split_at_unchecked(i) };
// SAFETY: `i` is the last char boundary before the end of the non-empty
// slice, so `tail` is valid UTF-8 for exactly one char.
Some((unsafe { Self::decode_utf8_char(tail.as_bytes()) }, head))
}

/// Returns an iterator over the [`char`]s of a string slice, and their
/// positions.
///
Expand Down
Loading